Merge branch 'develop' into risc-v
This commit is contained in:
@@ -41,6 +41,9 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS)
|
||||
foreach (float_type ${FLOAT_TYPES})
|
||||
# a bit of metaprogramming here to pull out the appropriate KERNEL var
|
||||
string(SUBSTRING ${float_type} 0 1 float_char)
|
||||
if (${float_type} STREQUAL "BFLOAT16")
|
||||
set (float_char "SB")
|
||||
endif ()
|
||||
GenerateNamedObjects("${KERNELDIR}/${${float_char}AMAXKERNEL}" "USE_ABS" "amax_k" false "" "" false ${float_type})
|
||||
GenerateNamedObjects("${KERNELDIR}/${${float_char}AMINKERNEL}" "USE_ABS;USE_MIN" "amin_k" false "" "" false ${float_type})
|
||||
if (DEFINED ${float_char}MAXKERNEL)
|
||||
@@ -88,11 +91,67 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS)
|
||||
GenerateNamedObjects("${KERNELDIR}/${DSDOTKERNEL}" "DSDOT" "d*dot_k" false "" "" false "SINGLE")
|
||||
GenerateNamedObjects("${KERNELDIR}/${DSDOTKERNEL}" "DSDOT" "dsdot_k" false "" "" false "SINGLE")
|
||||
|
||||
if ((BUILD_COMPLEX OR BUILD_DOUBLE) AND NOT BUILD_SINGLE)
|
||||
GenerateNamedObjects("${KERNELDIR}/${SAMAXKERNEL}" "USE_ABS" "amax_k" false "" "" false "SINGLE")
|
||||
GenerateNamedObjects("${KERNELDIR}/${SAMINKERNEL}" "USE_ABS;USE_MIN" "amin_k" false "" "" false "SINGLE")
|
||||
GenerateNamedObjects("${KERNELDIR}/${SASUMKERNEL}" "" "asum_k" false "" "" false "SINGLE")
|
||||
if (DEFINED SMAXKERNEL)
|
||||
GenerateNamedObjects("${KERNELDIR}/${SMAXKERNEL}" "" "max_k" false "" "" false "SINGLE")
|
||||
endif ()
|
||||
if (DEFINED SMINKERNEL)
|
||||
GenerateNamedObjects("${KERNELDIR}/${SMINKERNEL}" "USE_MIN" "min_k" false "" "" false "SINGLE")
|
||||
endif ()
|
||||
if (DEFINED ISMINKERNEL)
|
||||
GenerateNamedObjects("${KERNELDIR}/${ISMINKERNEL}" "USE_MIN" "i*min_k" false "" "" false "SINGLE")
|
||||
endif ()
|
||||
if (DEFINED ISMAXKERNEL)
|
||||
GenerateNamedObjects("${KERNELDIR}/${ISMAXKERNEL}" "" "i*max_k" false "" "" false "SINGLE")
|
||||
endif ()
|
||||
GenerateNamedObjects("${KERNELDIR}/${ISAMAXKERNEL}" "USE_ABS" "i*amax_k" false "" "" false "SINGLE")
|
||||
GenerateNamedObjects("${KERNELDIR}/${ISAMINKERNEL}" "USE_ABS;USE_MIN" "i*amin_k" false "" "" false "SINGLE")
|
||||
GenerateNamedObjects("${KERNELDIR}/${SSCALKERNEL}" "" "scal_k" false "" "" false "SINGLE")
|
||||
GenerateNamedObjects("${KERNELDIR}/${SCOPYKERNEL}" "C_INTERFACE" "copy_k" false "" "" false "SINGLE")
|
||||
GenerateNamedObjects("${KERNELDIR}/${SSWAPKERNEL}" "" "swap_k" false "" "" false "SINGLE")
|
||||
GenerateNamedObjects("${KERNELDIR}/${SAXPYKERNEL}" "" "axpy_k" false "" "" false "SINGLE")
|
||||
GenerateNamedObjects("${KERNELDIR}/${SNRM2KERNEL}" "" "nrm2_k" false "" "" false "SINGLE")
|
||||
GenerateNamedObjects("${KERNELDIR}/${SDOTKERNEL}" "" "dot_k" false "" "" false "SINGLE")
|
||||
GenerateNamedObjects("${KERNELDIR}/${SROTKERNEL}" "" "rot_k" false "" "" false "SINGLE")
|
||||
endif ()
|
||||
if (BUILD_COMPLEX16 AND NOT BUILD_DOUBLE)
|
||||
GenerateNamedObjects("${KERNELDIR}/${DAMAXKERNEL}" "USE_ABS" "amax_k" false "" "" false "DOUBLE")
|
||||
GenerateNamedObjects("${KERNELDIR}/${DAMINKERNEL}" "USE_ABS;USE_MIN" "amin_k" false "" "" false "DOUBLE")
|
||||
GenerateNamedObjects("${KERNELDIR}/${DASUMKERNEL}" "" "asum_k" false "" "" false "DOUBLE")
|
||||
if (DEFINED DMAXKERNEL)
|
||||
GenerateNamedObjects("${KERNELDIR}/${DMAXKERNEL}" "" "max_k" false "" "" false "DOUBLE")
|
||||
endif ()
|
||||
if (DEFINED DMINKERNEL)
|
||||
GenerateNamedObjects("${KERNELDIR}/${DMINKERNEL}" "USE_MIN" "min_k" false "" "" false "DOUBLE")
|
||||
endif ()
|
||||
if (DEFINED IDMINKERNEL)
|
||||
GenerateNamedObjects("${KERNELDIR}/${IDMINKERNEL}" "USE_MIN" "i*min_k" false "" "" false "DOUBLE")
|
||||
endif ()
|
||||
if (DEFINED IDMAXKERNEL)
|
||||
GenerateNamedObjects("${KERNELDIR}/${IDMAXKERNEL}" "" "i*max_k" false "" "" false "DOUBLE")
|
||||
endif ()
|
||||
GenerateNamedObjects("${KERNELDIR}/${IDAMAXKERNEL}" "USE_ABS" "i*amax_k" false "" "" false "DOUBLE")
|
||||
GenerateNamedObjects("${KERNELDIR}/${IDAMINKERNEL}" "USE_ABS;USE_MIN" "i*amin_k" false "" "" false "DOUBLE")
|
||||
GenerateNamedObjects("${KERNELDIR}/${DSCALKERNEL}" "" "scal_k" false "" "" false "DOUBLE")
|
||||
GenerateNamedObjects("${KERNELDIR}/${DCOPYKERNEL}" "C_INTERFACE" "copy_k" false "" "" false "DOUBLE")
|
||||
GenerateNamedObjects("${KERNELDIR}/${DNRM2KERNEL}" "" "nrm2_k" false "" "" false "DOUBLE")
|
||||
GenerateNamedObjects("${KERNELDIR}/${DROTKERNEL}" "" "rot_k" false "" "" false "DOUBLE")
|
||||
GenerateNamedObjects("${KERNELDIR}/${DDOTKERNEL}" "" "dot_k" false "" "" false "DOUBLE")
|
||||
GenerateNamedObjects("${KERNELDIR}/${DSWAPKERNEL}" "" "swap_k" false "" "" false "DOUBLE")
|
||||
GenerateNamedObjects("${KERNELDIR}/${DAXPYKERNEL}" "" "axpy_k" false "" "" false "DOUBLE")
|
||||
endif ()
|
||||
|
||||
# Makefile.L2
|
||||
GenerateCombinationObjects("generic/symv_k.c" "LOWER" "U" "" 1 "" "" 3)
|
||||
GenerateNamedObjects("generic/ger.c" "" "ger_k" false "" "" "" 3)
|
||||
foreach (float_type ${FLOAT_TYPES})
|
||||
string(SUBSTRING ${float_type} 0 1 float_char)
|
||||
if (${float_type} STREQUAL "BFLOAT16")
|
||||
set (float_char "SB")
|
||||
endif ()
|
||||
if (${float_type} STREQUAL "COMPLEX" OR ${float_type} STREQUAL "ZCOMPLEX")
|
||||
GenerateNamedObjects("${KERNELDIR}/${${float_char}GERUKERNEL}" "" "geru_k" false "" "" false ${float_type})
|
||||
GenerateNamedObjects("${KERNELDIR}/${${float_char}GERCKERNEL}" "CONJ" "gerc_k" false "" "" false ${float_type})
|
||||
@@ -118,23 +177,86 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS)
|
||||
GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMVTKERNEL}" "TRANS" "gemv_t" false "" "" false ${float_type})
|
||||
endif ()
|
||||
endforeach ()
|
||||
|
||||
if (BUILD_COMPLEX16 AND NOT BUILD_DOUBLE)
|
||||
GenerateNamedObjects("${KERNELDIR}/${DGEMVNKERNEL}" "" "gemv_n" false "" "" false "DOUBLE")
|
||||
GenerateNamedObjects("${KERNELDIR}/${DGEMVTKERNEL}" "TRANS" "gemv_t" false "" "" false "DOUBLE")
|
||||
endif ()
|
||||
if (BUILD_COMPLEX AND NOT BUILD_SINGLE)
|
||||
GenerateNamedObjects("${KERNELDIR}/${SGEMVNKERNEL}" "" "gemv_n" false "" "" false "SINGLE")
|
||||
GenerateNamedObjects("${KERNELDIR}/${SGEMVTKERNEL}" "TRANS" "gemv_t" false "" "" false "SINGLE")
|
||||
endif ()
|
||||
# Makefile.L3
|
||||
set(USE_TRMM false)
|
||||
if (ARM OR ARM64 OR (TARGET_CORE MATCHES LONGSOON3B) OR (TARGET_CORE MATCHES GENERIC) OR (TARGET_CORE MATCHES HASWELL) OR (TARGET_CORE MATCHES ZEN) OR (TARGET_CORE MATCHES SKYLAKEX) )
|
||||
if (ARM OR ARM64 OR (TARGET_CORE MATCHES LONGSOON3B) OR (TARGET_CORE MATCHES GENERIC) OR (TARGET_CORE MATCHES HASWELL) OR (TARGET_CORE MATCHES ZEN) OR (TARGET_CORE MATCHES SKYLAKEX) OR (TARGET_CORE MATCHES COOPERLAKE))
|
||||
set(USE_TRMM true)
|
||||
endif ()
|
||||
if (ZARCH OR (TARGET_CORE MATCHES POWER8) OR (TARGET_CORE MATCHES POWER9))
|
||||
if (ZARCH OR (TARGET_CORE MATCHES POWER8) OR (TARGET_CORE MATCHES POWER9) OR (TARGET_CORE MATCHES POWER10))
|
||||
set(USE_TRMM true)
|
||||
endif ()
|
||||
|
||||
foreach (float_type SINGLE DOUBLE)
|
||||
set(USE_DIRECT_SGEMM false)
|
||||
if (X86_64)
|
||||
set(USE_DIRECT_SGEMM true)
|
||||
endif()
|
||||
|
||||
if (USE_DIRECT_SGEMM)
|
||||
# if (NOT DEFINED SGEMMDIRECTKERNEL)
|
||||
set (SGEMMDIRECTKERNEL sgemm_direct_skylakex.c)
|
||||
set (SGEMMDIRECTPERFORMANT sgemm_direct_performant.c)
|
||||
# endif()
|
||||
GenerateNamedObjects("${KERNELDIR}/${SGEMMDIRECTKERNEL}" "" "gemm_direct" false "" "" false SINGLE)
|
||||
GenerateNamedObjects("${KERNELDIR}/${SGEMMDIRECTPERFORMANT}" "" "gemm_direct_performant" false "" "" false SINGLE)
|
||||
endif()
|
||||
|
||||
foreach (float_type SINGLE DOUBLE BFLOAT16)
|
||||
string(SUBSTRING ${float_type} 0 1 float_char)
|
||||
if (${float_type} STREQUAL "BFLOAT16")
|
||||
if (NOT ${BUILD_BFLOAT16})
|
||||
continue ()
|
||||
else ()
|
||||
set (float_char "SB")
|
||||
endif ()
|
||||
endif ()
|
||||
GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMMKERNEL}" "" "gemm_kernel" false "" "" false ${float_type})
|
||||
endforeach()
|
||||
if (BUILD_COMPLEX16 AND NOT BUILD_DOUBLE)
|
||||
GenerateNamedObjects("${KERNELDIR}/${DGEMMKERNEL}" "" "gemm_kernel" false "" "" false "DOUBLE")
|
||||
if (DGEMMINCOPY)
|
||||
GenerateNamedObjects("${KERNELDIR}/${DGEMMINCOPY}" "DOUBLE" "${DGEMMINCOPYOBJ}" false "" "" true "DOUBLE")
|
||||
endif ()
|
||||
if (DGEMMITCOPY)
|
||||
GenerateNamedObjects("${KERNELDIR}/${DGEMMITCOPY}" "DOUBLE" "${DGEMMITCOPYOBJ}" false "" "" true "DOUBLE")
|
||||
endif ()
|
||||
if (DGEMMONCOPY)
|
||||
GenerateNamedObjects("${KERNELDIR}/${DGEMMONCOPY}" "DOUBLE" "${DGEMMONCOPYOBJ}" false "" "" true "DOUBLE")
|
||||
endif ()
|
||||
if (DGEMMOTCOPY)
|
||||
GenerateNamedObjects("${KERNELDIR}/${DGEMMOTCOPY}" "DOUBLE" "${DGEMMOTCOPYOBJ}" false "" "" true "DOUBLE")
|
||||
endif ()
|
||||
GenerateNamedObjects("${KERNELDIR}/${DGEMM_BETA}" "" "gemm_beta" false "" "" false "DOUBLE")
|
||||
endif ()
|
||||
if ((BUILD_DOUBLE OR BUILD_COMPLEX) AND NOT BUILD_SINGLE)
|
||||
GenerateNamedObjects("${KERNELDIR}/${SGEMMKERNEL}" "" "gemm_kernel" false "" "" false "SINGLE")
|
||||
if (SGEMMINCOPY)
|
||||
GenerateNamedObjects("${KERNELDIR}/${SGEMMINCOPY}" "SINGLE" "${SGEMMINCOPYOBJ}" false "" "" true "SINGLE")
|
||||
endif ()
|
||||
if (SGEMMITCOPY)
|
||||
GenerateNamedObjects("${KERNELDIR}/${SGEMMITCOPY}" "SINGLE" "${SGEMMITCOPYOBJ}" false "" "" true "SINGLE")
|
||||
endif ()
|
||||
if (SGEMMONCOPY)
|
||||
GenerateNamedObjects("${KERNELDIR}/${SGEMMONCOPY}" "SINGLE" "${SGEMMONCOPYOBJ}" false "" "" true "SINGLE")
|
||||
endif ()
|
||||
if (SGEMMOTCOPY)
|
||||
GenerateNamedObjects("${KERNELDIR}/${SGEMMOTCOPY}" "SINGLE" "${SGEMMOTCOPYOBJ}" false "" "" true "SINGLE")
|
||||
endif ()
|
||||
GenerateNamedObjects("${KERNELDIR}/${SGEMM_BETA}" "" "gemm_beta" false "" "" false "SINGLE")
|
||||
endif ()
|
||||
|
||||
foreach (float_type ${FLOAT_TYPES})
|
||||
string(SUBSTRING ${float_type} 0 1 float_char)
|
||||
if (${float_type} STREQUAL "BFLOAT16")
|
||||
set (float_char "SB")
|
||||
endif ()
|
||||
if (${float_char}GEMMINCOPY)
|
||||
GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMMINCOPY}" "${float_type}" "${${float_char}GEMMINCOPYOBJ}" false "" "" true ${float_type})
|
||||
endif ()
|
||||
@@ -469,10 +591,38 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS)
|
||||
#geadd
|
||||
GenerateNamedObjects("${KERNELDIR}/${${float_char}GEADD_KERNEL}" "" "geadd_k" false "" "" false ${float_type})
|
||||
endforeach ()
|
||||
if (BUILD_DOUBLE AND NOT BUILD_SINGLE)
|
||||
GenerateNamedObjects("${KERNELDIR}/${STRSMKERNEL_LN}" "UPPER;LN;TRSMKERNEL" "trsm_kernel_LN" false "" "" false "SINGLE")
|
||||
GenerateNamedObjects("${KERNELDIR}/${STRSMKERNEL_LT}" "LT;TRSMKERNEL" "trsm_kernel_LT" false "" "" false "SINGLE")
|
||||
GenerateNamedObjects("${KERNELDIR}/${STRSMKERNEL_RN}" "UPPER;RN;TRSMKERNEL" "trsm_kernel_RN" false "" "" false "SINGLE")
|
||||
GenerateNamedObjects("${KERNELDIR}/${STRSMKERNEL_RT}" "RT;TRSMKERNEL" "trsm_kernel_RT" false "" "" false "SINGLE")
|
||||
GenerateNamedObjects("generic/trsm_uncopy_${SGEMM_UNROLL_M}.c" "UNIT" "trsm_iunucopy" false "" "" false "SINGLE")
|
||||
GenerateNamedObjects("generic/trsm_uncopy_${SGEMM_UNROLL_M}.c" "" "trsm_iunncopy" false "" "" false "SINGLE")
|
||||
GenerateNamedObjects("generic/trsm_uncopy_${SGEMM_UNROLL_N}.c" "OUTER;UNIT" "trsm_ounucopy" false "" "" false "SINGLE")
|
||||
GenerateNamedObjects("generic/trsm_uncopy_${SGEMM_UNROLL_N}.c" "OUTER" "trsm_ounncopy" false "" "" false "SINGLE")
|
||||
|
||||
GenerateNamedObjects("generic/trsm_lncopy_${SGEMM_UNROLL_M}.c" "LOWER;UNIT" "trsm_ilnucopy" false "" "" false "SINGLE")
|
||||
GenerateNamedObjects("generic/trsm_lncopy_${SGEMM_UNROLL_M}.c" "LOWER" "trsm_ilnncopy" false "" "" false "SINGLE")
|
||||
GenerateNamedObjects("generic/trsm_lncopy_${SGEMM_UNROLL_N}.c" "OUTER;LOWER;UNIT" "trsm_olnucopy" false "" "" false "SINGLE")
|
||||
GenerateNamedObjects("generic/trsm_lncopy_${SGEMM_UNROLL_N}.c" "OUTER;LOWER" "trsm_olnncopy" false "" "" false "SINGLE")
|
||||
|
||||
GenerateNamedObjects("generic/trsm_utcopy_${SGEMM_UNROLL_M}.c" "UNIT" "trsm_iutucopy" false "" "" false "SINGLE")
|
||||
GenerateNamedObjects("generic/trsm_utcopy_${SGEMM_UNROLL_M}.c" "" "trsm_iutncopy" false "" "" false "SINGLE")
|
||||
GenerateNamedObjects("generic/trsm_utcopy_${SGEMM_UNROLL_N}.c" "OUTER;UNIT" "trsm_outucopy" false "" "" false "SINGLE")
|
||||
GenerateNamedObjects("generic/trsm_utcopy_${SGEMM_UNROLL_N}.c" "OUTER" "trsm_outncopy" false "" "" false "SINGLE")
|
||||
|
||||
GenerateNamedObjects("generic/trsm_ltcopy_${SGEMM_UNROLL_M}.c" "LOWER;UNIT" "trsm_iltucopy" false "" "" false "SINGLE")
|
||||
GenerateNamedObjects("generic/trsm_ltcopy_${SGEMM_UNROLL_M}.c" "LOWER" "trsm_iltncopy" false "" "" false "SINGLE")
|
||||
GenerateNamedObjects("generic/trsm_ltcopy_${SGEMM_UNROLL_N}.c" "OUTER;LOWER;UNIT" "trsm_oltucopy" false "" "" false "SINGLE")
|
||||
GenerateNamedObjects("generic/trsm_ltcopy_${SGEMM_UNROLL_N}.c" "OUTER;LOWER" "trsm_oltncopy" false "" "" false "SINGLE")
|
||||
endif ()
|
||||
|
||||
# Makefile.LA
|
||||
if(NOT NO_LAPACK)
|
||||
foreach (float_type ${FLOAT_TYPES})
|
||||
if (${float_type} STREQUAL "BFLOAT16")
|
||||
set (float_char "SB")
|
||||
endif ()
|
||||
if (NOT DEFINED ${float_char}NEG_TCOPY)
|
||||
if (${float_char} STREQUAL "Z" OR ${float_char} STREQUAL "C" OR ${float_char} STREQUAL "X")
|
||||
set(${float_char}NEG_TCOPY ../generic/zneg_tcopy.c)
|
||||
@@ -492,6 +642,28 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS)
|
||||
GenerateNamedObjects("${KERNELDIR}/${${float_char}NEG_TCOPY}_${${float_char}GEMM_UNROLL_M}" "" "neg_tcopy" false "" "" false ${float_type})
|
||||
GenerateNamedObjects("${KERNELDIR}/${${float_char}LASWP_NCOPY}_${${float_char}GEMM_UNROLL_N}" "" "laswp_ncopy" false "" "" false ${float_type})
|
||||
endforeach()
|
||||
if (BUILD_COMPLEX AND NOT BUILD_SINGLE)
|
||||
if (NOT DEFINED SNEG_TCOPY)
|
||||
set(SNEG_TCOPY ../generic/neg_tcopy.c)
|
||||
endif ()
|
||||
|
||||
if (NOT DEFINED SLASWP_NCOPY)
|
||||
set(SLASWP_NCOPY ../generic/laswp_ncopy.c)
|
||||
endif ()
|
||||
GenerateNamedObjects("${KERNELDIR}/${SNEG_TCOPY}_${SGEMM_UNROLL_M}" "" "neg_tcopy" false "" "" false "SINGLE")
|
||||
GenerateNamedObjects("${KERNELDIR}/${SLASWP_NCOPY}_${SGEMM_UNROLL_N}" "" "laswp_ncopy" false "" "" false "SINGLE")
|
||||
endif()
|
||||
if (BUILD_COMPLEX16 AND NOT BUILD_DOUBLE)
|
||||
if (NOT DEFINED DNEG_TCOPY)
|
||||
set(DNEG_TCOPY ../generic/neg_tcopy.c)
|
||||
endif ()
|
||||
|
||||
if (NOT DEFINED DLASWP_NCOPY)
|
||||
set(DLASWP_NCOPY ../generic/laswp_ncopy.c)
|
||||
endif ()
|
||||
GenerateNamedObjects("${KERNELDIR}/${DNEG_TCOPY}_${DGEMM_UNROLL_M}" "" "neg_tcopy" false "" "" false "DOUBLE")
|
||||
GenerateNamedObjects("${KERNELDIR}/${DLASWP_NCOPY}_${DGEMM_UNROLL_N}" "" "laswp_ncopy" false "" "" false "DOUBLE")
|
||||
endif()
|
||||
endif()
|
||||
|
||||
if (${DYNAMIC_ARCH})
|
||||
@@ -516,12 +688,154 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS)
|
||||
foreach (float_type ${FLOAT_TYPES})
|
||||
# a bit of metaprogramming here to pull out the appropriate KERNEL var
|
||||
string(SUBSTRING ${float_type} 0 1 float_char)
|
||||
if (${float_type} STREQUAL "BFLOAT16")
|
||||
set (float_char "SB")
|
||||
endif ()
|
||||
GenerateNamedObjects("generic/neg_tcopy_${${float_char}GEMM_UNROLL_M}.c" "" "neg_tcopy" false "" ${TSUFFIX} false ${float_type})
|
||||
GenerateNamedObjects("generic/laswp_ncopy_${${float_char}GEMM_UNROLL_N}.c" "" "laswp_ncopy" false "" ${TSUFFIX} false ${float_type})
|
||||
endforeach ()
|
||||
|
||||
|
||||
if (BUILD_COMPLEX AND NOT BUILD_SINGLE)
|
||||
GenerateNamedObjects("${KERNELDIR}/${SGEMVNKERNEL}" "" "gemv_n" false "" "" false "SINGLE")
|
||||
GenerateNamedObjects("${KERNELDIR}/${SGEMVTKERNEL}" "TRANS" "gemv_t" false "" "" false "SINGLE")
|
||||
GenerateNamedObjects("generic/neg_tcopy_${SGEMM_UNROLL_M}.c" "" "neg_tcopy" false "" ${TSUFFIX} false "SINGLE")
|
||||
GenerateNamedObjects("generic/laswp_ncopy_${SGEMM_UNROLL_N}.c" "" "laswp_ncopy" false "" ${TSUFFIX} false "SINGLE")
|
||||
endif ()
|
||||
if (BUILD_DOUBLE AND NOT BUILD_SINGLE)
|
||||
GenerateNamedObjects("generic/neg_tcopy_${SGEMM_UNROLL_M}.c" "" "neg_tcopy" false "" ${TSUFFIX} false "SINGLE")
|
||||
GenerateNamedObjects("generic/laswp_ncopy_${SGEMM_UNROLL_N}.c" "" "laswp_ncopy" false "" ${TSUFFIX} false "SINGLE")
|
||||
GenerateNamedObjects("${KERNELDIR}/${STRSMKERNEL_LN}" "UPPER;LN;TRSMKERNEL" "trsm_kernel_LN" false "" ${TSUFFIX} false "SINGLE")
|
||||
GenerateNamedObjects("${KERNELDIR}/${STRSMKERNEL_LT}" "LT;TRSMKERNEL" "trsm_kernel_LT" false "" ${TSUFFIX} false "SINGLE")
|
||||
GenerateNamedObjects("${KERNELDIR}/${STRSMKERNEL_RN}" "UPPER;RN;TRSMKERNEL" "trsm_kernel_RN" false "" ${TSUFFIX} false "SINGLE")
|
||||
GenerateNamedObjects("${KERNELDIR}/${STRSMKERNEL_RT}" "RT;TRSMKERNEL" "trsm_kernel_RT" false "" ${TSUFFIX} false "SINGLE")
|
||||
GenerateNamedObjects("generic/trsm_uncopy_${SGEMM_UNROLL_M}.c" "UNIT" "trsm_iunucopy" false "" ${TSUFFIX} false "SINGLE")
|
||||
GenerateNamedObjects("generic/trsm_uncopy_${SGEMM_UNROLL_M}.c" "" "trsm_iunncopy" false "" ${TSUFFIX} false "SINGLE")
|
||||
GenerateNamedObjects("generic/trsm_uncopy_${SGEMM_UNROLL_N}.c" "OUTER;UNIT" "trsm_ounucopy" false "" ${TSUFFIX} false "SINGLE")
|
||||
GenerateNamedObjects("generic/trsm_uncopy_${SGEMM_UNROLL_N}.c" "OUTER" "trsm_ounncopy" false "" ${TSUFFIX} false "SINGLE")
|
||||
|
||||
GenerateNamedObjects("generic/trsm_lncopy_${SGEMM_UNROLL_M}.c" "LOWER;UNIT" "trsm_ilnucopy" false "" ${TSUFFIX} false "SINGLE")
|
||||
GenerateNamedObjects("generic/trsm_lncopy_${SGEMM_UNROLL_M}.c" "LOWER" "trsm_ilnncopy" false "" ${TSUFFIX} false "SINGLE")
|
||||
GenerateNamedObjects("generic/trsm_lncopy_${SGEMM_UNROLL_N}.c" "OUTER;LOWER;UNIT" "trsm_olnucopy" false "" ${TSUFFIX} false "SINGLE")
|
||||
GenerateNamedObjects("generic/trsm_lncopy_${SGEMM_UNROLL_N}.c" "OUTER;LOWER" "trsm_olnncopy" false "" ${TSUFFIX} false "SINGLE")
|
||||
|
||||
GenerateNamedObjects("generic/trsm_utcopy_${SGEMM_UNROLL_M}.c" "UNIT" "trsm_iutucopy" false "" ${TSUFFIX} false "SINGLE")
|
||||
GenerateNamedObjects("generic/trsm_utcopy_${SGEMM_UNROLL_M}.c" "" "trsm_iutncopy" false "" ${TSUFFIX} false "SINGLE")
|
||||
GenerateNamedObjects("generic/trsm_utcopy_${SGEMM_UNROLL_N}.c" "OUTER;UNIT" "trsm_outucopy" false "" ${TSUFFIX} false "SINGLE")
|
||||
GenerateNamedObjects("generic/trsm_utcopy_${SGEMM_UNROLL_N}.c" "OUTER" "trsm_outncopy" false "" ${TSUFFIX} false "SINGLE")
|
||||
|
||||
GenerateNamedObjects("generic/trsm_ltcopy_${SGEMM_UNROLL_M}.c" "LOWER;UNIT" "trsm_iltucopy" false "" ${TSUFFIX} false "SINGLE")
|
||||
GenerateNamedObjects("generic/trsm_ltcopy_${SGEMM_UNROLL_M}.c" "LOWER" "trsm_iltncopy" false "" ${TSUFFIX} false "SINGLE")
|
||||
GenerateNamedObjects("generic/trsm_ltcopy_${SGEMM_UNROLL_N}.c" "OUTER;LOWER;UNIT" "trsm_oltucopy" false "" ${TSUFFIX} false "SINGLE")
|
||||
GenerateNamedObjects("generic/trsm_ltcopy_${SGEMM_UNROLL_N}.c" "OUTER;LOWER" "trsm_oltncopy" false "" ${TSUFFIX} false "SINGLE")
|
||||
|
||||
if (SGEMMINCOPY)
|
||||
GenerateNamedObjects("${KERNELDIR}/${SGEMMINCOPY}" "SINGLE" "${SGEMMINCOPYOBJ}" false "" "" true "SINGLE")
|
||||
endif ()
|
||||
if (SGEMMITCOPY)
|
||||
GenerateNamedObjects("${KERNELDIR}/${SGEMMITCOPY}" "SINGLE" "${SGEMMITCOPYOBJ}" false "" "" true "SINGLE")
|
||||
endif ()
|
||||
if (SGEMMONCOPY)
|
||||
GenerateNamedObjects("${KERNELDIR}/${SGEMMONCOPY}" "SINGLE" "${SGEMMONCOPYOBJ}" false "" "" true "SINGLE")
|
||||
endif ()
|
||||
if (SGEMMOTCOPY)
|
||||
GenerateNamedObjects("${KERNELDIR}/${SGEMMOTCOPY}" "SINGLE" "${SGEMMOTCOPYOBJ}" false "" "" true "SINGLE")
|
||||
endif ()
|
||||
GenerateNamedObjects("${KERNELDIR}/${SGEMVNKERNEL}" "" "gemv_n" false "" "" false "SINGLE")
|
||||
GenerateNamedObjects("${KERNELDIR}/${SGEMVTKERNEL}" "TRANS" "gemv_t" false "" "" false "SINGLE")
|
||||
endif ()
|
||||
|
||||
if (BUILD_COMPLEX16 AND NOT BUILD_DOUBLE)
|
||||
GenerateNamedObjects("generic/neg_tcopy_${DGEMM_UNROLL_M}.c" "" "neg_tcopy" false "" ${TSUFFIX} false "DOUBLE")
|
||||
GenerateNamedObjects("generic/laswp_ncopy_${DGEMM_UNROLL_N}.c" "" "laswp_ncopy" false "" ${TSUFFIX} false "DOUBLE")
|
||||
endif ()
|
||||
if (BUILD_COMPLEX16 AND NOT BUILD_COMPLEX)
|
||||
GenerateNamedObjects("${KERNELDIR}/${CAMAXKERNEL}" "USE_ABS" "amax_k" false "" "" false "COMPLEX")
|
||||
GenerateNamedObjects("${KERNELDIR}/${CAMINKERNEL}" "USE_ABS;USE_MIN" "amin_k" false "" "" false "COMPLEX")
|
||||
if (DEFINED CMAXKERNEL)
|
||||
GenerateNamedObjects("${KERNELDIR}/${CMAXKERNEL}" "" "max_k" false "" "" false "COMPLEX")
|
||||
endif ()
|
||||
if (DEFINED CMINKERNEL)
|
||||
GenerateNamedObjects("${KERNELDIR}/${CMINKERNEL}" "USE_MIN" "min_k" false "" "" false "COMPLEX")
|
||||
endif ()
|
||||
GenerateNamedObjects("${KERNELDIR}/${ICAMAXKERNEL}" "USE_ABS" "i*amax_k" false "" "" false "COMPLEX")
|
||||
GenerateNamedObjects("${KERNELDIR}/${ICAMINKERNEL}" "USE_ABS;USE_MIN" "i*amin_k" false "" "" false "COMPLEX")
|
||||
if (DEFINED ICMAXKERNEL)
|
||||
GenerateNamedObjects("${KERNELDIR}/${ICMAXKERNEL}" "" "i*max_k" false "" "" false "COMPLEX")
|
||||
endif ()
|
||||
if (DEFINED ICMINKERNEL)
|
||||
GenerateNamedObjects("${KERNELDIR}/${ICMINKERNEL}" "USE_MIN" "i*min_k" false "" "" false "COMPLEX")
|
||||
endif ()
|
||||
GenerateNamedObjects("${KERNELDIR}/${CASUMKERNEL}" "" "asum_k" false "" "" false "COMPLEX")
|
||||
GenerateNamedObjects("${KERNELDIR}/${CAXPYKERNEL}" "" "axpy_k" false "" "" false "COMPLEX")
|
||||
GenerateNamedObjects("${KERNELDIR}/${CCOPYKERNEL}" "C_INTERFACE" "copy_k" false "" "" false "COMPLEX")
|
||||
GenerateNamedObjects("${KERNELDIR}/${CNRM2KERNEL}" "" "nrm2_k" false "" "" false "COMPLEX")
|
||||
GenerateNamedObjects("${KERNELDIR}/${CROTKERNEL}" "" "rot_k" false "" "" false "COMPLEX")
|
||||
GenerateNamedObjects("${KERNELDIR}/${CSCALKERNEL}" "" "scal_k" false "" "" false "COMPLEX")
|
||||
GenerateNamedObjects("${KERNELDIR}/${CSWAPKERNEL}" "" "swap_k" false "" "" false "COMPLEX")
|
||||
GenerateNamedObjects("${KERNELDIR}/${CAXPBYKERNEL}" "" "axpby_k" false "" "" false "COMPLEX")
|
||||
GenerateNamedObjects("${KERNELDIR}/${CSUMKERNEL}" "" "sum_k" false "" "" false "COMPLEX")
|
||||
GenerateNamedObjects("${KERNELDIR}/${CAXPYKERNEL}" "CONJ" "axpyc_k" false "" "" false "COMPLEX")
|
||||
GenerateNamedObjects("${KERNELDIR}/${CDOTKERNEL}" "" "dotu_k" false "" "" false "COMPLEX")
|
||||
GenerateNamedObjects("${KERNELDIR}/${CDOTKERNEL}" "CONJ" "dotc_k" false "" "" false "COMPLEX")
|
||||
GenerateNamedObjects("${KERNELDIR}/${CGEMVNKERNEL}" "" "gemv_n" false "" "" false "COMPLEX")
|
||||
GenerateNamedObjects("${KERNELDIR}/${CGEMVTKERNEL}" "TRANSA" "gemv_t" false "" "" false "COMPLEX")
|
||||
GenerateNamedObjects("${KERNELDIR}/${CGEMVNKERNEL}" "CONJ" "gemv_r" false "" "" false "COMPLEX")
|
||||
GenerateNamedObjects("${KERNELDIR}/${CGEMVTKERNEL}" "CONJ;TRANSA" "gemv_c" false "" "" false "COMPLEX")
|
||||
GenerateNamedObjects("${KERNELDIR}/${CGEMVNKERNEL}" "XCONJ" "gemv_o" false "" "" false "COMPLEX")
|
||||
GenerateNamedObjects("${KERNELDIR}/${CGEMVTKERNEL}" "XCONJ;TRANSA" "gemv_u" false "" "" false "COMPLEX")
|
||||
GenerateNamedObjects("${KERNELDIR}/${CGEMVNKERNEL}" "XCONJ;CONJ" "gemv_s" false "" "" false "COMPLEX")
|
||||
GenerateNamedObjects("${KERNELDIR}/${CGEMVTKERNEL}" "XCONJ;CONJ;TRANSA" "gemv_d" false "" "" false "COMPLEX")
|
||||
GenerateNamedObjects("${KERNELDIR}/${CTRSMKERNEL_LN}" "UPPER;LN;TRSMKERNEL;CONJ" "trsm_kernel_LR" false "" "" false "COMPLEX")
|
||||
GenerateNamedObjects("${KERNELDIR}/${CTRSMKERNEL_LT}" "LT;TRSMKERNEL;CONJ" "trsm_kernel_LC" false "" "" false "COMPLEX")
|
||||
GenerateNamedObjects("${KERNELDIR}/${CTRSMKERNEL_RN}" "UPPER;RN;TRSMKERNEL;CONJ" "trsm_kernel_RR" false "" "" false "COMPLEX")
|
||||
GenerateNamedObjects("${KERNELDIR}/${CTRSMKERNEL_RT}" "RT;TRSMKERNEL;CONJ" "trsm_kernel_RC" false "" "" false "COMPLEX")
|
||||
GenerateNamedObjects("${KERNELDIR}/${CTRSMKERNEL_LN}" "UPPER;LN;TRSMKERNEL" "trsm_kernel_LN" false "" "" false "COMPLEX")
|
||||
GenerateNamedObjects("${KERNELDIR}/${CTRSMKERNEL_LT}" "LT;TRSMKERNEL" "trsm_kernel_LT" false "" "" false "COMPLEX")
|
||||
GenerateNamedObjects("${KERNELDIR}/${CTRSMKERNEL_RN}" "UPPER;RN;TRSMKERNEL" "trsm_kernel_RN" false "" "" false "COMPLEX")
|
||||
GenerateNamedObjects("${KERNELDIR}/${CTRSMKERNEL_RT}" "RT;TRSMKERNEL" "trsm_kernel_RT" false "" "" false "COMPLEX")
|
||||
GenerateNamedObjects("${KERNELDIR}/${CGEMMKERNEL}" "NN" "gemm_kernel_n" false "" "" false "COMPLEX")
|
||||
GenerateNamedObjects("${KERNELDIR}/${CGEMMKERNEL}" "CN" "gemm_kernel_l" false "" "" false "COMPLEX")
|
||||
GenerateNamedObjects("${KERNELDIR}/${CGEMMKERNEL}" "NC" "gemm_kernel_r" false "" "" false "COMPLEX")
|
||||
GenerateNamedObjects("${KERNELDIR}/${CGEMMKERNEL}" "CC" "gemm_kernel_b" false "" "" false "COMPLEX")
|
||||
if (CGEMMINCOPY)
|
||||
GenerateNamedObjects("${KERNELDIR}/${CGEMMINCOPY}" "COMPLEX" "${CGEMMINCOPYOBJ}" false "" "" true "COMPLEX")
|
||||
endif ()
|
||||
|
||||
if (CGEMMITCOPY)
|
||||
GenerateNamedObjects("${KERNELDIR}/${CGEMMITCOPY}" "COMPLEX" "${CGEMMITCOPYOBJ}" false "" "" true "COMPLEX")
|
||||
endif ()
|
||||
|
||||
if (CGEMMONCOPY)
|
||||
GenerateNamedObjects("${KERNELDIR}/${CGEMMONCOPY}" "COMPLEX" "${CGEMMONCOPYOBJ}" false "" "" true "COMPLEX")
|
||||
endif ()
|
||||
|
||||
if (CGEMMOTCOPY)
|
||||
GenerateNamedObjects("${KERNELDIR}/${CGEMMOTCOPY}" "COMPLEX" "${CGEMMOTCOPYOBJ}" false "" "" true "COMPLEX")
|
||||
endif ()
|
||||
GenerateNamedObjects("${KERNELDIR}/${CGEMM_BETA}" "" "gemm_beta" false "" "" false "COMPLEX")
|
||||
GenerateNamedObjects("generic/ztrsm_uncopy_${CGEMM_UNROLL_M}.c" "UNIT" "trsm_iunucopy" false "" ${TSUFFIX} false "COMPLEX")
|
||||
GenerateNamedObjects("generic/ztrsm_uncopy_${CGEMM_UNROLL_M}.c" "" "trsm_iunncopy" false "" ${TSUFFIX} false "COMPLEX")
|
||||
GenerateNamedObjects("generic/ztrsm_uncopy_${CGEMM_UNROLL_N}.c" "OUTER;UNIT" "trsm_ounucopy" false "" ${TSUFFIX} false "COMPLEX")
|
||||
GenerateNamedObjects("generic/ztrsm_uncopy_${CGEMM_UNROLL_N}.c" "OUTER" "trsm_ounncopy" false "" ${TSUFFIX} false "COMPLEX")
|
||||
|
||||
GenerateNamedObjects("generic/ztrsm_lncopy_${CGEMM_UNROLL_M}.c" "LOWER;UNIT" "trsm_ilnucopy" false "" ${TSUFFIX} false "COMPLEX")
|
||||
GenerateNamedObjects("generic/ztrsm_lncopy_${CGEMM_UNROLL_M}.c" "LOWER" "trsm_ilnncopy" false "" ${TSUFFIX} false "COMPLEX")
|
||||
GenerateNamedObjects("generic/ztrsm_lncopy_${CGEMM_UNROLL_N}.c" "OUTER;LOWER;UNIT" "trsm_olnucopy" false "" ${TSUFFIX} false "COMPLEX")
|
||||
GenerateNamedObjects("generic/ztrsm_lncopy_${CGEMM_UNROLL_N}.c" "OUTER;LOWER" "trsm_olnncopy" false "" ${TSUFFIX} false "COMPLEX")
|
||||
|
||||
GenerateNamedObjects("generic/ztrsm_utcopy_${CGEMM_UNROLL_M}.c" "UNIT" "trsm_iutucopy" false "" ${TSUFFIX} false "COMPLEX")
|
||||
GenerateNamedObjects("generic/ztrsm_utcopy_${CGEMM_UNROLL_M}.c" "" "trsm_iutncopy" false "" ${TSUFFIX} false "COMPLEX")
|
||||
GenerateNamedObjects("generic/ztrsm_utcopy_${CGEMM_UNROLL_N}.c" "OUTER;UNIT" "trsm_outucopy" false "" ${TSUFFIX} false "COMPLEX")
|
||||
GenerateNamedObjects("generic/ztrsm_utcopy_${CGEMM_UNROLL_N}.c" "OUTER" "trsm_outncopy" false "" ${TSUFFIX} false "COMPLEX")
|
||||
|
||||
GenerateNamedObjects("generic/ztrsm_ltcopy_${CGEMM_UNROLL_M}.c" "LOWER;UNIT" "trsm_iltucopy" false "" ${TSUFFIX} false "COMPLEX")
|
||||
GenerateNamedObjects("generic/ztrsm_ltcopy_${CGEMM_UNROLL_M}.c" "LOWER" "trsm_iltncopy" false "" ${TSUFFIX} false "COMPLEX")
|
||||
GenerateNamedObjects("generic/ztrsm_ltcopy_${CGEMM_UNROLL_N}.c" "OUTER;LOWER;UNIT" "trsm_oltucopy" false "" ${TSUFFIX} false "COMPLEX")
|
||||
GenerateNamedObjects("generic/ztrsm_ltcopy_${CGEMM_UNROLL_N}.c" "OUTER;LOWER" "trsm_oltncopy" false "" ${TSUFFIX} false "COMPLEX")
|
||||
GenerateNamedObjects("generic/neg_tcopy_${DGEMM_UNROLL_M}.c" "" "neg_tcopy" false "" ${TSUFFIX} false "COMPLEX")
|
||||
GenerateNamedObjects("generic/laswp_ncopy_${DGEMM_UNROLL_N}.c" "" "laswp_ncopy" false "" ${TSUFFIX} false "COMPLEX")
|
||||
endif ()
|
||||
endif ()
|
||||
|
||||
add_library(kernel${TSUFFIX} OBJECT ${OPENBLAS_SRC})
|
||||
set_target_properties(kernel${TSUFFIX} PROPERTIES COMPILE_FLAGS "${KERNEL_DEFINITIONS}")
|
||||
@@ -536,7 +850,7 @@ if (${DYNAMIC_ARCH})
|
||||
set(BUILD_KERNEL 1)
|
||||
set(KDIR "")
|
||||
set(TSUFFIX "_${TARGET_CORE}")
|
||||
set(KERNEL_DEFINITIONS "-DBUILD_KERNEL -DTABLE_NAME=gotoblas_${TARGET_CORE} -DTS=${TSUFFIX}")
|
||||
set(KERNEL_DEFINITIONS "-DBUILD_KERNEL -DTABLE_NAME=gotoblas_${TARGET_CORE} -DTS=${TSUFFIX}")
|
||||
build_core("${TARGET_CORE}" "${KDIR}" "${TSUFFIX}" "${KERNEL_DEFINITIONS}")
|
||||
set(ADD_COMMONOBJS 0)
|
||||
endforeach()
|
||||
|
||||
@@ -5,11 +5,23 @@ endif
|
||||
TOPDIR = ..
|
||||
include $(TOPDIR)/Makefile.system
|
||||
|
||||
ifdef HAVE_SSE3
|
||||
CFLAGS += -msse3
|
||||
endif
|
||||
ifdef HAVE_SSSE3
|
||||
CFLAGS += -mssse3
|
||||
endif
|
||||
|
||||
ifeq ($(C_COMPILER), GCC)
|
||||
GCCVERSIONGTEQ9 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 9)
|
||||
GCCVERSIONGTEQ10 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 10)
|
||||
endif
|
||||
|
||||
ifeq ($(ARCH), power)
|
||||
ifeq ($(C_COMPILER), CLANG)
|
||||
override CFLAGS += -fno-integrated-as
|
||||
endif
|
||||
endif
|
||||
AVX2OPT =
|
||||
ifeq ($(C_COMPILER), GCC)
|
||||
# AVX2 support was added in 4.7.0
|
||||
@@ -32,7 +44,28 @@ ifdef NO_AVX2
|
||||
endif
|
||||
|
||||
ifdef TARGET_CORE
|
||||
ifeq ($(TARGET_CORE), SKYLAKEX)
|
||||
ifeq ($(TARGET_CORE), $(filter $(TARGET_CORE),PRESCOTT CORE2 PENRYN DUNNINGTON ATOM NANO SANDYBRIDGE HASWELL NEHALEM ZEN BARCELONA BOBCAT BULLDOZER PILEDRIVER EXCAVATOR STEAMROLLER OPTERON_SSE3))
|
||||
override CFLAGS += -msse -msse2 -msse3 -mssse3 -msse4.1
|
||||
endif
|
||||
ifeq ($(TARGET_CORE), $(filter $(TARGET_CORE),KATMAI COPPERMINE BANIAS NORTHWOOD ATHLON OPTERON))
|
||||
override CFLAGS += -msse -msse2
|
||||
endif
|
||||
ifeq ($(TARGET_CORE), COOPERLAKE)
|
||||
override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE)
|
||||
ifeq ($(GCCVERSIONGTEQ10), 1)
|
||||
override CFLAGS += -march=cooperlake
|
||||
else
|
||||
override CFLAGS += -march=skylake-avx512
|
||||
endif
|
||||
ifeq ($(OSNAME), CYGWIN_NT)
|
||||
override CFLAGS += -fno-asynchronous-unwind-tables
|
||||
endif
|
||||
ifeq ($(OSNAME), WINNT)
|
||||
ifeq ($(C_COMPILER), GCC)
|
||||
override CFLAGS += -fno-asynchronous-unwind-tables
|
||||
endif
|
||||
endif
|
||||
else ifeq ($(TARGET_CORE), SKYLAKEX)
|
||||
override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) -march=skylake-avx512
|
||||
ifeq ($(OSNAME), CYGWIN_NT)
|
||||
override CFLAGS += -fno-asynchronous-unwind-tables
|
||||
|
||||
@@ -262,6 +262,20 @@ ifndef XDOTKERNEL
|
||||
XDOTKERNEL = zdot.S
|
||||
endif
|
||||
|
||||
ifeq ($(BUILD_BFLOAT16),1)
|
||||
ifndef SBDOTKERNEL
|
||||
SBDOTKERNEL = ../x86_64/sbdot.c
|
||||
endif
|
||||
|
||||
ifndef TOBF16KERNEL
|
||||
TOBF16KERNEL = ../x86_64/tobf16.c
|
||||
endif
|
||||
|
||||
ifndef BF16TOKERNEL
|
||||
BF16TOKERNEL = ../x86_64/bf16to.c
|
||||
endif
|
||||
endif
|
||||
|
||||
### NRM2 ###
|
||||
|
||||
ifndef SNRM2KERNEL
|
||||
@@ -516,6 +530,15 @@ XBLASOBJS += \
|
||||
xdotc_k$(TSUFFIX).$(SUFFIX) xdotu_k$(TSUFFIX).$(SUFFIX) xnrm2_k$(TSUFFIX).$(SUFFIX) xqrot_k$(TSUFFIX).$(SUFFIX) \
|
||||
xscal_k$(TSUFFIX).$(SUFFIX) xswap_k$(TSUFFIX).$(SUFFIX) xsum_k$(TSUFFIX).$(SUFFIX)
|
||||
|
||||
ifeq ($(BUILD_BFLOAT16),1)
|
||||
SBBLASOBJS += \
|
||||
sbdot_k$(TSUFFIX).$(SUFFIX)
|
||||
SBEXTOBJS += \
|
||||
sbstobf16_k$(TSUFFIX).$(SUFFIX) sbdtobf16_k$(TSUFFIX).$(SUFFIX)
|
||||
SBEXTOBJS += \
|
||||
sbf16tos_k$(TSUFFIX).$(SUFFIX) dbf16tod_k$(TSUFFIX).$(SUFFIX)
|
||||
endif
|
||||
|
||||
### AMAX ###
|
||||
|
||||
|
||||
@@ -734,6 +757,19 @@ $(KDIR)ddot_k$(TSUFFIX).$(SUFFIX) $(KDIR)ddot_k$(TPSUFFIX).$(PSUFFIX) : $(KERNEL
|
||||
$(KDIR)qdot_k$(TSUFFIX).$(SUFFIX) $(KDIR)qdot_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QDOTKERNEL)
|
||||
$(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE $< -o $@
|
||||
|
||||
ifeq ($(BUILD_BFLOAT16),1)
|
||||
$(KDIR)sbdot_k$(TSUFFIX).$(SUFFIX) $(KDIR)sbdot_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SBDOTKERNEL)
|
||||
$(CC) -c $(CFLAGS) -UCOMPLEX $< -o $@
|
||||
$(KDIR)sbstobf16_k$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(TOBF16KERNEL)
|
||||
$(CC) -c $(CFLAGS) -UDOUBLE -DSINGLE $< -o $@
|
||||
$(KDIR)sbdtobf16_k$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(TOBF16KERNEL)
|
||||
$(CC) -c $(CFLAGS) -DDOUBLE -USINGLE $< -o $@
|
||||
$(KDIR)sbf16tos_k$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(BF16TOKERNEL)
|
||||
$(CC) -c $(CFLAGS) -UDOUBLE -DSINGLE $< -o $@
|
||||
$(KDIR)dbf16tod_k$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(BF16TOKERNEL)
|
||||
$(CC) -c $(CFLAGS) -DDOUBLE -USINGLE $< -o $@
|
||||
endif
|
||||
|
||||
$(KDIR)sdot_k$(TSUFFIX).$(SUFFIX) $(KDIR)sdot_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SDOTKERNEL)
|
||||
$(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE $< -o $@
|
||||
|
||||
|
||||
@@ -186,31 +186,46 @@ ifndef XHEMV_M_KERNEL
|
||||
XHEMV_M_KERNEL = ../generic/zhemv_k.c
|
||||
endif
|
||||
|
||||
ifneq "$(or $(BUILD_SINGLE), $(BUILD_DOUBLE), $(BUILD_COMPLEX))" ""
|
||||
SBLASOBJS += \
|
||||
sgemv_n$(TSUFFIX).$(SUFFIX) sgemv_t$(TSUFFIX).$(SUFFIX) ssymv_U$(TSUFFIX).$(SUFFIX) ssymv_L$(TSUFFIX).$(SUFFIX) \
|
||||
sgemv_n$(TSUFFIX).$(SUFFIX) sgemv_t$(TSUFFIX).$(SUFFIX)
|
||||
endif
|
||||
ifeq ($(BUILD_SINGLE),1)
|
||||
SBLASOBJS += \
|
||||
ssymv_U$(TSUFFIX).$(SUFFIX) ssymv_L$(TSUFFIX).$(SUFFIX) \
|
||||
sger_k$(TSUFFIX).$(SUFFIX)
|
||||
|
||||
endif
|
||||
ifeq ($(BUILD_DOUBLE),1)
|
||||
DBLASOBJS += \
|
||||
dgemv_n$(TSUFFIX).$(SUFFIX) dgemv_t$(TSUFFIX).$(SUFFIX) dsymv_U$(TSUFFIX).$(SUFFIX) dsymv_L$(TSUFFIX).$(SUFFIX) \
|
||||
dger_k$(TSUFFIX).$(SUFFIX)
|
||||
|
||||
endif
|
||||
QBLASOBJS += \
|
||||
qgemv_n$(TSUFFIX).$(SUFFIX) qgemv_t$(TSUFFIX).$(SUFFIX) qsymv_U$(TSUFFIX).$(SUFFIX) qsymv_L$(TSUFFIX).$(SUFFIX) \
|
||||
qger_k$(TSUFFIX).$(SUFFIX)
|
||||
|
||||
ifeq ($(BUILD_COMPLEX),1)
|
||||
SBLASOBJS += \
|
||||
sgemv_n$(TSUFFIX).$(SUFFIX) sgemv_t$(TSUFFIX).$(SUFFIX)
|
||||
CBLASOBJS += \
|
||||
cgemv_n$(TSUFFIX).$(SUFFIX) cgemv_t$(TSUFFIX).$(SUFFIX) cgemv_r$(TSUFFIX).$(SUFFIX) cgemv_c$(TSUFFIX).$(SUFFIX) \
|
||||
cgemv_o$(TSUFFIX).$(SUFFIX) cgemv_u$(TSUFFIX).$(SUFFIX) cgemv_s$(TSUFFIX).$(SUFFIX) cgemv_d$(TSUFFIX).$(SUFFIX) \
|
||||
csymv_U$(TSUFFIX).$(SUFFIX) csymv_L$(TSUFFIX).$(SUFFIX) \
|
||||
chemv_U$(TSUFFIX).$(SUFFIX) chemv_L$(TSUFFIX).$(SUFFIX) chemv_V$(TSUFFIX).$(SUFFIX) chemv_M$(TSUFFIX).$(SUFFIX) \
|
||||
cgeru_k$(TSUFFIX).$(SUFFIX) cgerc_k$(TSUFFIX).$(SUFFIX) cgerv_k$(TSUFFIX).$(SUFFIX) cgerd_k$(TSUFFIX).$(SUFFIX)
|
||||
|
||||
endif
|
||||
ifeq ($(BUILD_COMPLEX16),1)
|
||||
CBLASOBJS += \
|
||||
cgemv_n$(TSUFFIX).$(SUFFIX) cgemv_t$(TSUFFIX).$(SUFFIX) cgemv_r$(TSUFFIX).$(SUFFIX) cgemv_c$(TSUFFIX).$(SUFFIX) \
|
||||
cgemv_o$(TSUFFIX).$(SUFFIX) cgemv_u$(TSUFFIX).$(SUFFIX) cgemv_s$(TSUFFIX).$(SUFFIX) cgemv_d$(TSUFFIX).$(SUFFIX)
|
||||
DBLASOBJS += \
|
||||
dgemv_n$(TSUFFIX).$(SUFFIX) dgemv_t$(TSUFFIX).$(SUFFIX)
|
||||
ZBLASOBJS += \
|
||||
zgemv_n$(TSUFFIX).$(SUFFIX) zgemv_t$(TSUFFIX).$(SUFFIX) zgemv_r$(TSUFFIX).$(SUFFIX) zgemv_c$(TSUFFIX).$(SUFFIX) \
|
||||
zgemv_o$(TSUFFIX).$(SUFFIX) zgemv_u$(TSUFFIX).$(SUFFIX) zgemv_s$(TSUFFIX).$(SUFFIX) zgemv_d$(TSUFFIX).$(SUFFIX) \
|
||||
zsymv_U$(TSUFFIX).$(SUFFIX) zsymv_L$(TSUFFIX).$(SUFFIX) \
|
||||
zhemv_U$(TSUFFIX).$(SUFFIX) zhemv_L$(TSUFFIX).$(SUFFIX) zhemv_V$(TSUFFIX).$(SUFFIX) zhemv_M$(TSUFFIX).$(SUFFIX) \
|
||||
zgeru_k$(TSUFFIX).$(SUFFIX) zgerc_k$(TSUFFIX).$(SUFFIX) zgerv_k$(TSUFFIX).$(SUFFIX) zgerd_k$(TSUFFIX).$(SUFFIX)
|
||||
endif
|
||||
|
||||
XBLASOBJS += \
|
||||
xgemv_n$(TSUFFIX).$(SUFFIX) xgemv_t$(TSUFFIX).$(SUFFIX) xgemv_r$(TSUFFIX).$(SUFFIX) xgemv_c$(TSUFFIX).$(SUFFIX) \
|
||||
@@ -219,17 +234,21 @@ XBLASOBJS += \
|
||||
xhemv_U$(TSUFFIX).$(SUFFIX) xhemv_L$(TSUFFIX).$(SUFFIX) xhemv_V$(TSUFFIX).$(SUFFIX) xhemv_M$(TSUFFIX).$(SUFFIX) \
|
||||
xgeru_k$(TSUFFIX).$(SUFFIX) xgerc_k$(TSUFFIX).$(SUFFIX) xgerv_k$(TSUFFIX).$(SUFFIX) xgerd_k$(TSUFFIX).$(SUFFIX)
|
||||
|
||||
ifneq "$(or $(BUILD_SINGLE), $(BUILD_DOUBLE), $(BUILD_COMPLEX))" ""
|
||||
$(KDIR)sgemv_n$(TSUFFIX).$(SUFFIX) $(KDIR)sgemv_n$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SGEMVNKERNEL) $(TOPDIR)/common.h $(GEMVDEP)
|
||||
$(CC) -c $(CFLAGS) -UDOUBLE -UCOMPLEX -UTRANS $< -o $@
|
||||
|
||||
$(KDIR)sgemv_t$(TSUFFIX).$(SUFFIX) $(KDIR)sgemv_t$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SGEMVTKERNEL) $(TOPDIR)/common.h $(GEMVDEP)
|
||||
$(CC) -c $(CFLAGS) -UDOUBLE -UCOMPLEX -DTRANS $< -o $@
|
||||
endif
|
||||
|
||||
ifneq "$(or $(BUILD_DOUBLE),$(BUILD_COMPLEX16))" ""
|
||||
$(KDIR)dgemv_n$(TSUFFIX).$(SUFFIX) $(KDIR)dgemv_n$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DGEMVNKERNEL) $(TOPDIR)/common.h $(GEMVDEP)
|
||||
$(CC) -c $(CFLAGS) -DDOUBLE -UCOMPLEX -UTRANS $< -o $@
|
||||
|
||||
$(KDIR)dgemv_t$(TSUFFIX).$(SUFFIX) $(KDIR)dgemv_t$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DGEMVTKERNEL) $(TOPDIR)/common.h $(GEMVDEP)
|
||||
$(CC) -c $(CFLAGS) -DDOUBLE -UCOMPLEX -DTRANS $< -o $@
|
||||
endif
|
||||
|
||||
$(KDIR)qgemv_n$(TSUFFIX).$(SUFFIX) $(KDIR)qgemv_n$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QGEMVNKERNEL)
|
||||
$(CC) -c $(CFLAGS) -DXDOUBLE -UCOMPLEX -UTRANS $< -o $@
|
||||
@@ -237,6 +256,8 @@ $(KDIR)qgemv_n$(TSUFFIX).$(SUFFIX) $(KDIR)qgemv_n$(TSUFFIX).$(PSUFFIX) : $(KER
|
||||
$(KDIR)qgemv_t$(TSUFFIX).$(SUFFIX) $(KDIR)qgemv_t$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QGEMVTKERNEL)
|
||||
$(CC) -c $(CFLAGS) -DXDOUBLE -UCOMPLEX -DTRANS $< -o $@
|
||||
|
||||
|
||||
ifneq "$(or $(BUILD_COMPLEX),$(BUILD_COMPLEX16))" ""
|
||||
$(KDIR)cgemv_n$(TSUFFIX).$(SUFFIX) $(KDIR)cgemv_n$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGEMVNKERNEL) $(TOPDIR)/common.h $(GEMVDEP)
|
||||
$(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -UTRANS -UCONJ -UXCONJ $< -o $@
|
||||
|
||||
@@ -260,6 +281,10 @@ $(KDIR)cgemv_s$(TSUFFIX).$(SUFFIX) $(KDIR)cgemv_s$(TSUFFIX).$(PSUFFIX) : $(KERNE
|
||||
|
||||
$(KDIR)cgemv_d$(TSUFFIX).$(SUFFIX) $(KDIR)cgemv_d$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGEMVTKERNEL) $(TOPDIR)/common.h $(GEMVDEP)
|
||||
$(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -DTRANS -DCONJ -DXCONJ $< -o $@
|
||||
endif
|
||||
|
||||
|
||||
ifeq ($(BUILD_COMPLEX16),1)
|
||||
|
||||
$(KDIR)zgemv_n$(TSUFFIX).$(SUFFIX) $(KDIR)zgemv_n$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZGEMVNKERNEL) $(TOPDIR)/common.h $(GEMVDEP)
|
||||
$(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -UTRANS -UCONJ -UXCONJ $< -o $@
|
||||
@@ -284,6 +309,7 @@ $(KDIR)zgemv_s$(TSUFFIX).$(SUFFIX) $(KDIR)zgemv_s$(TSUFFIX).$(PSUFFIX) : $(KERNE
|
||||
|
||||
$(KDIR)zgemv_d$(TSUFFIX).$(SUFFIX) $(KDIR)zgemv_d$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZGEMVTKERNEL) $(TOPDIR)/common.h $(GEMVDEP)
|
||||
$(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -DTRANS -DCONJ -DXCONJ $< -o $@
|
||||
endif
|
||||
|
||||
$(KDIR)xgemv_n$(TSUFFIX).$(SUFFIX) $(KDIR)xgemv_n$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XGEMVNKERNEL)
|
||||
$(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -UTRANS -UCONJ -UXCONJ $< -o $@
|
||||
@@ -309,17 +335,25 @@ $(KDIR)xgemv_s$(TSUFFIX).$(SUFFIX) $(KDIR)xgemv_s$(TSUFFIX).$(PSUFFIX) : $(KERNE
|
||||
$(KDIR)xgemv_d$(TSUFFIX).$(SUFFIX) $(KDIR)xgemv_d$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XGEMVTKERNEL)
|
||||
$(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DTRANS -DCONJ -DXCONJ $< -o $@
|
||||
|
||||
|
||||
ifeq ($(BUILD_SINGLE),1)
|
||||
|
||||
$(KDIR)ssymv_U$(TSUFFIX).$(SUFFIX) $(KDIR)ssymv_U$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SSYMV_U_KERNEL) $(SSYMV_U_PARAM)
|
||||
$(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -ULOWER $< -o $@
|
||||
|
||||
$(KDIR)ssymv_L$(TSUFFIX).$(SUFFIX) $(KDIR)ssymv_L$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SSYMV_L_KERNEL) $(SSYMV_L_PARAM)
|
||||
$(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DLOWER $< -o $@
|
||||
endif
|
||||
|
||||
|
||||
ifeq ($(BUILD_DOUBLE),1)
|
||||
|
||||
$(KDIR)dsymv_U$(TSUFFIX).$(SUFFIX) $(KDIR)dsymv_U$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DSYMV_U_KERNEL) $(DSYMV_U_PARAM)
|
||||
$(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -ULOWER $< -o $@
|
||||
|
||||
$(KDIR)dsymv_L$(TSUFFIX).$(SUFFIX) $(KDIR)dsymv_L$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DSYMV_L_KERNEL) $(DSYMV_L_PARAM)
|
||||
$(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DLOWER $< -o $@
|
||||
endif
|
||||
|
||||
$(KDIR)qsymv_U$(TSUFFIX).$(SUFFIX) $(KDIR)qsymv_U$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QSYMV_U_KERNEL)
|
||||
$(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -ULOWER $< -o $@
|
||||
@@ -327,17 +361,23 @@ $(KDIR)qsymv_U$(TSUFFIX).$(SUFFIX) $(KDIR)qsymv_U$(TSUFFIX).$(PSUFFIX) : $(KER
|
||||
$(KDIR)qsymv_L$(TSUFFIX).$(SUFFIX) $(KDIR)qsymv_L$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QSYMV_L_KERNEL)
|
||||
$(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DLOWER $< -o $@
|
||||
|
||||
ifeq ($(BUILD_COMPLEX),1)
|
||||
|
||||
$(KDIR)csymv_U$(TSUFFIX).$(SUFFIX) $(KDIR)csymv_U$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CSYMV_U_KERNEL) $(CSYMV_U_PARAM)
|
||||
$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER $< -o $@
|
||||
|
||||
$(KDIR)csymv_L$(TSUFFIX).$(SUFFIX) $(KDIR)csymv_L$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CSYMV_L_KERNEL) $(CSYMV_L_PARAM)
|
||||
$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DLOWER $< -o $@
|
||||
endif
|
||||
|
||||
ifeq ($(BUILD_COMPLEX16),1)
|
||||
|
||||
$(KDIR)zsymv_U$(TSUFFIX).$(SUFFIX) $(KDIR)zsymv_U$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZSYMV_U_KERNEL) $(ZSYMV_U_PARAM)
|
||||
$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -ULOWER $< -o $@
|
||||
|
||||
$(KDIR)zsymv_L$(TSUFFIX).$(SUFFIX) $(KDIR)zsymv_L$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZSYMV_L_KERNEL) $(ZSYMV_L_PARAM)
|
||||
$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DLOWER $< -o $@
|
||||
endif
|
||||
|
||||
$(KDIR)xsymv_U$(TSUFFIX).$(SUFFIX) $(KDIR)xsymv_U$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XSYMV_U_KERNEL)
|
||||
$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -ULOWER $< -o $@
|
||||
@@ -345,15 +385,23 @@ $(KDIR)xsymv_U$(TSUFFIX).$(SUFFIX) $(KDIR)xsymv_U$(TSUFFIX).$(PSUFFIX) : $(KER
|
||||
$(KDIR)xsymv_L$(TSUFFIX).$(SUFFIX) $(KDIR)xsymv_L$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XSYMV_L_KERNEL)
|
||||
$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER $< -o $@
|
||||
|
||||
ifeq ($(BUILD_SINGLE),1)
|
||||
|
||||
$(KDIR)sger_k$(TSUFFIX).$(SUFFIX) $(KDIR)sger_k$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SGERKERNEL) $(SGERPARAM)
|
||||
$(CC) -c $(CFLAGS) -UDOUBLE $< -o $@
|
||||
endif
|
||||
|
||||
ifeq ($(BUILD_DOUBLE),1)
|
||||
|
||||
$(KDIR)dger_k$(TSUFFIX).$(SUFFIX) $(KDIR)dger_k$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DGERKERNEL) $(DGERPARAM)
|
||||
$(CC) -c $(CFLAGS) -DDOUBLE $< -o $@
|
||||
endif
|
||||
|
||||
$(KDIR)qger_k$(TSUFFIX).$(SUFFIX) $(KDIR)qger_k$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QGERKERNEL) $(QGERPARAM)
|
||||
$(CC) -c $(CFLAGS) -DXDOUBLE $< -o $@
|
||||
|
||||
ifeq ($(BUILD_COMPLEX),1)
|
||||
|
||||
$(KDIR)cgeru_k$(TSUFFIX).$(SUFFIX) $(KDIR)cgeru_k$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGERUKERNEL) $(CGERPARAM)
|
||||
$(CC) -c $(CFLAGS) -UDOUBLE -UCONJ $< -o $@
|
||||
|
||||
@@ -365,6 +413,9 @@ $(KDIR)cgerv_k$(TSUFFIX).$(SUFFIX) $(KDIR)cgerv_k$(TSUFFIX).$(PSUFFIX) : $(KER
|
||||
|
||||
$(KDIR)cgerd_k$(TSUFFIX).$(SUFFIX) $(KDIR)cgerd_k$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGERCKERNEL) $(CGERPARAM)
|
||||
$(CC) -c $(CFLAGS) -UDOUBLE -DCONJ -DXCONJ $< -o $@
|
||||
endif
|
||||
|
||||
ifeq ($(BUILD_COMPLEX16),1)
|
||||
|
||||
$(KDIR)zgeru_k$(TSUFFIX).$(SUFFIX) $(KDIR)zgeru_k$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZGERUKERNEL) $(ZGERPARAM)
|
||||
$(CC) -c $(CFLAGS) -DDOUBLE -UCONJ $< -o $@
|
||||
@@ -377,6 +428,7 @@ $(KDIR)zgerv_k$(TSUFFIX).$(SUFFIX) $(KDIR)zgerv_k$(TSUFFIX).$(PSUFFIX) : $(KER
|
||||
|
||||
$(KDIR)zgerd_k$(TSUFFIX).$(SUFFIX) $(KDIR)zgerd_k$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZGERCKERNEL) $(ZGERPARAM)
|
||||
$(CC) -c $(CFLAGS) -DDOUBLE -DCONJ -DXCONJ $< -o $@
|
||||
endif
|
||||
|
||||
$(KDIR)xgeru_k$(TSUFFIX).$(SUFFIX) $(KDIR)xgeru_k$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XGERUKERNEL) $(XGERPARAM)
|
||||
$(CC) -c $(CFLAGS) -DXDOUBLE -UCONJ $< -o $@
|
||||
@@ -390,6 +442,8 @@ $(KDIR)xgerv_k$(TSUFFIX).$(SUFFIX) $(KDIR)xgerv_k$(TSUFFIX).$(PSUFFIX) : $(KER
|
||||
$(KDIR)xgerd_k$(TSUFFIX).$(SUFFIX) $(KDIR)xgerd_k$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XGERCKERNEL) $(XGERPARAM)
|
||||
$(CC) -c $(CFLAGS) -DXDOUBLE -DCONJ -DXCONJ $< -o $@
|
||||
|
||||
ifeq ($(BUILD_COMPLEX),1)
|
||||
|
||||
$(KDIR)chemv_U$(TSUFFIX).$(SUFFIX) $(KDIR)chemv_U$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CHEMV_U_KERNEL) $(CHEMV_U_PARAM)
|
||||
$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER -DHEMV $< -o $@
|
||||
|
||||
@@ -401,6 +455,9 @@ $(KDIR)chemv_V$(TSUFFIX).$(SUFFIX) $(KDIR)chemv_V$(TSUFFIX).$(PSUFFIX) : $(KER
|
||||
|
||||
$(KDIR)chemv_M$(TSUFFIX).$(SUFFIX) $(KDIR)chemv_M$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CHEMV_M_KERNEL) $(CHEMV_L_PARAM) ../symcopy.h
|
||||
$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DLOWER -DHEMV -DHEMVREV $< -o $@
|
||||
endif
|
||||
|
||||
ifeq ($(BUILD_COMPLEX16),1)
|
||||
|
||||
$(KDIR)zhemv_U$(TSUFFIX).$(SUFFIX) $(KDIR)zhemv_U$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZHEMV_U_KERNEL) $(ZHEMV_U_PARAM)
|
||||
$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -ULOWER -DHEMV $< -o $@
|
||||
@@ -413,7 +470,7 @@ $(KDIR)zhemv_V$(TSUFFIX).$(SUFFIX) $(KDIR)zhemv_V$(TSUFFIX).$(PSUFFIX) : $(KER
|
||||
|
||||
$(KDIR)zhemv_M$(TSUFFIX).$(SUFFIX) $(KDIR)zhemv_M$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZHEMV_M_KERNEL) $(ZHEMV_L_PARAM) ../symcopy.h
|
||||
$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DLOWER -DHEMV -DHEMVREV $< -o $@
|
||||
|
||||
endif
|
||||
$(KDIR)xhemv_U$(TSUFFIX).$(SUFFIX) $(KDIR)xhemv_U$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XHEMV_U_KERNEL)
|
||||
$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -ULOWER -DHEMV $< -o $@
|
||||
|
||||
@@ -426,3 +483,4 @@ $(KDIR)xhemv_V$(TSUFFIX).$(SUFFIX) $(KDIR)xhemv_V$(TSUFFIX).$(PSUFFIX) : $(KER
|
||||
$(KDIR)xhemv_M$(TSUFFIX).$(SUFFIX) $(KDIR)xhemv_M$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XHEMV_M_KERNEL) ../symcopy.h
|
||||
$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER -DHEMV -DHEMVREV $< -o $@
|
||||
|
||||
|
||||
|
||||
@@ -9,6 +9,10 @@ ifeq ($(ARCH), x86_64)
|
||||
USE_GEMM3M = 1
|
||||
endif
|
||||
|
||||
ifeq ($(ARCH), x86_64)
|
||||
USE_DIRECT_SGEMM = 1
|
||||
endif
|
||||
|
||||
ifeq ($(ARCH), ia64)
|
||||
USE_GEMM3M = 1
|
||||
endif
|
||||
@@ -43,18 +47,28 @@ ifeq ($(CORE), SKYLAKEX)
|
||||
USE_TRMM = 1
|
||||
endif
|
||||
|
||||
ifeq ($(CORE), COOPERLAKE)
|
||||
USE_TRMM = 1
|
||||
endif
|
||||
|
||||
ifeq ($(CORE), ZEN)
|
||||
USE_TRMM = 1
|
||||
endif
|
||||
|
||||
ifeq ($(CORE), POWER8)
|
||||
ifeq ($(BINARY64),1)
|
||||
USE_TRMM = 1
|
||||
endif
|
||||
endif
|
||||
|
||||
ifeq ($(CORE), POWER9)
|
||||
USE_TRMM = 1
|
||||
endif
|
||||
|
||||
ifeq ($(CORE), POWER10)
|
||||
USE_TRMM = 1
|
||||
endif
|
||||
|
||||
ifeq ($(ARCH), zarch)
|
||||
USE_TRMM = 1
|
||||
endif
|
||||
@@ -63,33 +77,75 @@ ifeq ($(CORE), Z14)
|
||||
USE_TRMM = 1
|
||||
endif
|
||||
|
||||
ifdef USE_DIRECT_SGEMM
|
||||
ifndef SGEMMDIRECTKERNEL
|
||||
SGEMMDIRECTKERNEL = sgemm_direct_skylakex.c
|
||||
SGEMMDIRECTPERFORMANT = sgemm_direct_performant.c
|
||||
endif
|
||||
endif
|
||||
|
||||
ifeq ($(BUILD_BFLOAT16), 1)
|
||||
ifndef SBGEMMKERNEL
|
||||
SBGEMM_BETA = ../generic/gemm_beta.c
|
||||
SBGEMMKERNEL = ../generic/gemmkernel_2x2.c
|
||||
SBGEMMINCOPY = ../generic/gemm_ncopy_2.c
|
||||
SBGEMMITCOPY = ../generic/gemm_tcopy_2.c
|
||||
SBGEMMONCOPY = ../generic/gemm_ncopy_2.c
|
||||
SBGEMMOTCOPY = ../generic/gemm_tcopy_2.c
|
||||
SBGEMMINCOPYOBJ = sbgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||
SBGEMMITCOPYOBJ = sbgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||
SBGEMMONCOPYOBJ = sbgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
SBGEMMOTCOPYOBJ = sbgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
endif
|
||||
|
||||
SBKERNELOBJS += \
|
||||
sbgemm_kernel$(TSUFFIX).$(SUFFIX) \
|
||||
$(SBGEMMINCOPYOBJ) $(SBGEMMITCOPYOBJ) \
|
||||
$(SBGEMMONCOPYOBJ) $(SBGEMMOTCOPYOBJ)
|
||||
endif
|
||||
|
||||
ifneq "$(or $(BUILD_SINGLE),$(BUILD_DOUBLE),$(BUILD_COMPLEX))" ""
|
||||
SKERNELOBJS += \
|
||||
sgemm_kernel$(TSUFFIX).$(SUFFIX) \
|
||||
sgemm_beta$(TSUFFIX).$(SUFFIX) \
|
||||
$(SGEMMINCOPYOBJ) $(SGEMMITCOPYOBJ) \
|
||||
$(SGEMMONCOPYOBJ) $(SGEMMOTCOPYOBJ)
|
||||
|
||||
ifdef USE_DIRECT_SGEMM
|
||||
SKERNELOBJS += \
|
||||
sgemm_direct$(TSUFFIX).$(SUFFIX) \
|
||||
sgemm_direct_performant$(TSUFFIX).$(SUFFIX)
|
||||
endif
|
||||
endif
|
||||
|
||||
ifneq "$(or $(BUILD_DOUBLE),$(BUILD_COMPLEX16))" ""
|
||||
DKERNELOBJS += \
|
||||
dgemm_beta$(TSUFFIX).$(SUFFIX) \
|
||||
dgemm_kernel$(TSUFFIX).$(SUFFIX) \
|
||||
$(DGEMMINCOPYOBJ) $(DGEMMITCOPYOBJ) \
|
||||
$(DGEMMONCOPYOBJ) $(DGEMMOTCOPYOBJ)
|
||||
endif
|
||||
|
||||
QKERNELOBJS += \
|
||||
qgemm_kernel$(TSUFFIX).$(SUFFIX) \
|
||||
$(QGEMMINCOPYOBJ) $(QGEMMITCOPYOBJ) \
|
||||
$(QGEMMONCOPYOBJ) $(QGEMMOTCOPYOBJ)
|
||||
|
||||
ifneq "$(or $(BUILD_COMPLEX),$(BUILD_COMPLEX16))" ""
|
||||
CKERNELOBJS += \
|
||||
cgemm_kernel_n$(TSUFFIX).$(SUFFIX) cgemm_kernel_r$(TSUFFIX).$(SUFFIX) \
|
||||
cgemm_kernel_l$(TSUFFIX).$(SUFFIX) cgemm_kernel_b$(TSUFFIX).$(SUFFIX) \
|
||||
$(CGEMMINCOPYOBJ) $(CGEMMITCOPYOBJ) \
|
||||
$(CGEMMONCOPYOBJ) $(CGEMMOTCOPYOBJ)
|
||||
endif
|
||||
|
||||
ifeq ($(BUILD_COMPLEX16),1)
|
||||
ZKERNELOBJS += \
|
||||
zgemm_kernel_n$(TSUFFIX).$(SUFFIX) zgemm_kernel_r$(TSUFFIX).$(SUFFIX) \
|
||||
zgemm_kernel_l$(TSUFFIX).$(SUFFIX) zgemm_kernel_b$(TSUFFIX).$(SUFFIX) \
|
||||
$(ZGEMMINCOPYOBJ) $(ZGEMMITCOPYOBJ) \
|
||||
$(ZGEMMONCOPYOBJ) $(ZGEMMOTCOPYOBJ)
|
||||
endif
|
||||
|
||||
XKERNELOBJS += \
|
||||
xgemm_kernel_n$(TSUFFIX).$(SUFFIX) xgemm_kernel_r$(TSUFFIX).$(SUFFIX) \
|
||||
@@ -97,6 +153,9 @@ XKERNELOBJS += \
|
||||
$(XGEMMINCOPYOBJ) $(XGEMMITCOPYOBJ) \
|
||||
$(XGEMMONCOPYOBJ) $(XGEMMOTCOPYOBJ)
|
||||
|
||||
ifeq ($(BUILD_BFLOAT16),1)
|
||||
SBBLASOBJS += $(SBKERNELOBJS)
|
||||
endif
|
||||
SBLASOBJS += $(SKERNELOBJS)
|
||||
DBLASOBJS += $(DKERNELOBJS)
|
||||
QBLASOBJS += $(QKERNELOBJS)
|
||||
@@ -104,38 +163,52 @@ CBLASOBJS += $(CKERNELOBJS)
|
||||
ZBLASOBJS += $(ZKERNELOBJS)
|
||||
XBLASOBJS += $(XKERNELOBJS)
|
||||
|
||||
ifeq ($(BUILD_BFLOAT16),1)
|
||||
SBBLASOBJS += sbgemm_beta$(TSUFFIX).$(SUFFIX)
|
||||
endif
|
||||
|
||||
ifneq "$(or $(BUILD_SINGLE),$(BUILD_DOUBLE))" ""
|
||||
SBLASOBJS += \
|
||||
sgemm_beta$(TSUFFIX).$(SUFFIX) \
|
||||
strmm_kernel_LN$(TSUFFIX).$(SUFFIX) strmm_kernel_LT$(TSUFFIX).$(SUFFIX) \
|
||||
strmm_kernel_RN$(TSUFFIX).$(SUFFIX) strmm_kernel_RT$(TSUFFIX).$(SUFFIX) \
|
||||
strsm_kernel_LN$(TSUFFIX).$(SUFFIX) strsm_kernel_LT$(TSUFFIX).$(SUFFIX) \
|
||||
strsm_kernel_RN$(TSUFFIX).$(SUFFIX) strsm_kernel_RT$(TSUFFIX).$(SUFFIX) \
|
||||
strsm_kernel_RN$(TSUFFIX).$(SUFFIX) strsm_kernel_RT$(TSUFFIX).$(SUFFIX)
|
||||
endif
|
||||
|
||||
ifeq ($(BUILD_DOUBLE),1)
|
||||
DBLASOBJS += \
|
||||
dgemm_beta$(TSUFFIX).$(SUFFIX) \
|
||||
dtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) dtrmm_kernel_LT$(TSUFFIX).$(SUFFIX) \
|
||||
dtrmm_kernel_RN$(TSUFFIX).$(SUFFIX) dtrmm_kernel_RT$(TSUFFIX).$(SUFFIX) \
|
||||
dtrsm_kernel_LN$(TSUFFIX).$(SUFFIX) dtrsm_kernel_LT$(TSUFFIX).$(SUFFIX) \
|
||||
dtrsm_kernel_RN$(TSUFFIX).$(SUFFIX) dtrsm_kernel_RT$(TSUFFIX).$(SUFFIX) \
|
||||
dtrsm_kernel_RN$(TSUFFIX).$(SUFFIX) dtrsm_kernel_RT$(TSUFFIX).$(SUFFIX)
|
||||
endif
|
||||
|
||||
QBLASOBJS += \
|
||||
qgemm_beta$(TSUFFIX).$(SUFFIX) \
|
||||
qtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) qtrmm_kernel_LT$(TSUFFIX).$(SUFFIX) \
|
||||
qtrmm_kernel_RN$(TSUFFIX).$(SUFFIX) qtrmm_kernel_RT$(TSUFFIX).$(SUFFIX) \
|
||||
qtrsm_kernel_LN$(TSUFFIX).$(SUFFIX) qtrsm_kernel_LT$(TSUFFIX).$(SUFFIX) \
|
||||
qtrsm_kernel_RN$(TSUFFIX).$(SUFFIX) qtrsm_kernel_RT$(TSUFFIX).$(SUFFIX) \
|
||||
qtrsm_kernel_RN$(TSUFFIX).$(SUFFIX) qtrsm_kernel_RT$(TSUFFIX).$(SUFFIX)
|
||||
|
||||
ifeq ($(BUILD_COMPLEX),1)
|
||||
CBLASOBJS += \
|
||||
cgemm_beta$(TSUFFIX).$(SUFFIX) \
|
||||
ctrmm_kernel_LN$(TSUFFIX).$(SUFFIX) ctrmm_kernel_LT$(TSUFFIX).$(SUFFIX) \
|
||||
ctrmm_kernel_LR$(TSUFFIX).$(SUFFIX) ctrmm_kernel_LC$(TSUFFIX).$(SUFFIX) \
|
||||
ctrmm_kernel_RN$(TSUFFIX).$(SUFFIX) ctrmm_kernel_RT$(TSUFFIX).$(SUFFIX) \
|
||||
ctrmm_kernel_RR$(TSUFFIX).$(SUFFIX) ctrmm_kernel_RC$(TSUFFIX).$(SUFFIX) \
|
||||
ctrmm_kernel_RR$(TSUFFIX).$(SUFFIX) ctrmm_kernel_RC$(TSUFFIX).$(SUFFIX)
|
||||
endif
|
||||
ifneq "$(or $(BUILD_COMPLEX),$(BUILD_COMPLEX16))" ""
|
||||
CBLASOBJS += \
|
||||
cgemm_beta$(TSUFFIX).$(SUFFIX) \
|
||||
ctrsm_kernel_LN$(TSUFFIX).$(SUFFIX) ctrsm_kernel_LT$(TSUFFIX).$(SUFFIX) \
|
||||
ctrsm_kernel_LR$(TSUFFIX).$(SUFFIX) ctrsm_kernel_LC$(TSUFFIX).$(SUFFIX) \
|
||||
ctrsm_kernel_RN$(TSUFFIX).$(SUFFIX) ctrsm_kernel_RT$(TSUFFIX).$(SUFFIX) \
|
||||
ctrsm_kernel_RR$(TSUFFIX).$(SUFFIX) ctrsm_kernel_RC$(TSUFFIX).$(SUFFIX) \
|
||||
ctrsm_kernel_RR$(TSUFFIX).$(SUFFIX) ctrsm_kernel_RC$(TSUFFIX).$(SUFFIX)
|
||||
endif
|
||||
|
||||
ifeq ($(BUILD_COMPLEX16),1)
|
||||
ZBLASOBJS += \
|
||||
zgemm_beta$(TSUFFIX).$(SUFFIX) \
|
||||
ztrmm_kernel_LN$(TSUFFIX).$(SUFFIX) ztrmm_kernel_LT$(TSUFFIX).$(SUFFIX) \
|
||||
@@ -145,7 +218,8 @@ ZBLASOBJS += \
|
||||
ztrsm_kernel_LN$(TSUFFIX).$(SUFFIX) ztrsm_kernel_LT$(TSUFFIX).$(SUFFIX) \
|
||||
ztrsm_kernel_LR$(TSUFFIX).$(SUFFIX) ztrsm_kernel_LC$(TSUFFIX).$(SUFFIX) \
|
||||
ztrsm_kernel_RN$(TSUFFIX).$(SUFFIX) ztrsm_kernel_RT$(TSUFFIX).$(SUFFIX) \
|
||||
ztrsm_kernel_RR$(TSUFFIX).$(SUFFIX) ztrsm_kernel_RC$(TSUFFIX).$(SUFFIX) \
|
||||
ztrsm_kernel_RR$(TSUFFIX).$(SUFFIX) ztrsm_kernel_RC$(TSUFFIX).$(SUFFIX)
|
||||
endif
|
||||
|
||||
XBLASOBJS += \
|
||||
xgemm_beta$(TSUFFIX).$(SUFFIX) \
|
||||
@@ -156,7 +230,7 @@ XBLASOBJS += \
|
||||
xtrsm_kernel_LN$(TSUFFIX).$(SUFFIX) xtrsm_kernel_LT$(TSUFFIX).$(SUFFIX) \
|
||||
xtrsm_kernel_LR$(TSUFFIX).$(SUFFIX) xtrsm_kernel_LC$(TSUFFIX).$(SUFFIX) \
|
||||
xtrsm_kernel_RN$(TSUFFIX).$(SUFFIX) xtrsm_kernel_RT$(TSUFFIX).$(SUFFIX) \
|
||||
xtrsm_kernel_RR$(TSUFFIX).$(SUFFIX) xtrsm_kernel_RC$(TSUFFIX).$(SUFFIX) \
|
||||
xtrsm_kernel_RR$(TSUFFIX).$(SUFFIX) xtrsm_kernel_RC$(TSUFFIX).$(SUFFIX)
|
||||
|
||||
ifeq ($(USE_GEMM3M), 1)
|
||||
|
||||
@@ -166,6 +240,7 @@ XBLASOBJS += xgemm3m_kernel$(TSUFFIX).$(SUFFIX)
|
||||
|
||||
endif
|
||||
|
||||
ifeq ($(BUILD_SINGLE),1)
|
||||
SBLASOBJS += \
|
||||
strmm_iunucopy$(TSUFFIX).$(SUFFIX) strmm_iunncopy$(TSUFFIX).$(SUFFIX) \
|
||||
strmm_ilnucopy$(TSUFFIX).$(SUFFIX) strmm_ilnncopy$(TSUFFIX).$(SUFFIX) \
|
||||
@@ -174,7 +249,10 @@ SBLASOBJS += \
|
||||
strmm_ounucopy$(TSUFFIX).$(SUFFIX) strmm_ounncopy$(TSUFFIX).$(SUFFIX) \
|
||||
strmm_olnucopy$(TSUFFIX).$(SUFFIX) strmm_olnncopy$(TSUFFIX).$(SUFFIX) \
|
||||
strmm_outucopy$(TSUFFIX).$(SUFFIX) strmm_outncopy$(TSUFFIX).$(SUFFIX) \
|
||||
strmm_oltucopy$(TSUFFIX).$(SUFFIX) strmm_oltncopy$(TSUFFIX).$(SUFFIX) \
|
||||
strmm_oltucopy$(TSUFFIX).$(SUFFIX) strmm_oltncopy$(TSUFFIX).$(SUFFIX)
|
||||
endif
|
||||
ifneq "$(or $(BUILD_SINGLE),$(BUILD_DOUBLE))" ""
|
||||
SBLASOBJS += \
|
||||
strsm_iunucopy$(TSUFFIX).$(SUFFIX) strsm_iunncopy$(TSUFFIX).$(SUFFIX) \
|
||||
strsm_ilnucopy$(TSUFFIX).$(SUFFIX) strsm_ilnncopy$(TSUFFIX).$(SUFFIX) \
|
||||
strsm_iutucopy$(TSUFFIX).$(SUFFIX) strsm_iutncopy$(TSUFFIX).$(SUFFIX) \
|
||||
@@ -182,10 +260,15 @@ SBLASOBJS += \
|
||||
strsm_ounucopy$(TSUFFIX).$(SUFFIX) strsm_ounncopy$(TSUFFIX).$(SUFFIX) \
|
||||
strsm_olnucopy$(TSUFFIX).$(SUFFIX) strsm_olnncopy$(TSUFFIX).$(SUFFIX) \
|
||||
strsm_outucopy$(TSUFFIX).$(SUFFIX) strsm_outncopy$(TSUFFIX).$(SUFFIX) \
|
||||
strsm_oltucopy$(TSUFFIX).$(SUFFIX) strsm_oltncopy$(TSUFFIX).$(SUFFIX) \
|
||||
strsm_oltucopy$(TSUFFIX).$(SUFFIX) strsm_oltncopy$(TSUFFIX).$(SUFFIX)
|
||||
endif
|
||||
ifeq ($(BUILD_SINGLE),1)
|
||||
SBLASOBJS += \
|
||||
ssymm_iutcopy$(TSUFFIX).$(SUFFIX) ssymm_iltcopy$(TSUFFIX).$(SUFFIX) \
|
||||
ssymm_outcopy$(TSUFFIX).$(SUFFIX) ssymm_oltcopy$(TSUFFIX).$(SUFFIX)
|
||||
endif
|
||||
|
||||
ifeq ($(BUILD_DOUBLE),1)
|
||||
DBLASOBJS += \
|
||||
dtrmm_iunucopy$(TSUFFIX).$(SUFFIX) dtrmm_iunncopy$(TSUFFIX).$(SUFFIX) \
|
||||
dtrmm_ilnucopy$(TSUFFIX).$(SUFFIX) dtrmm_ilnncopy$(TSUFFIX).$(SUFFIX) \
|
||||
@@ -205,6 +288,7 @@ DBLASOBJS += \
|
||||
dtrsm_oltucopy$(TSUFFIX).$(SUFFIX) dtrsm_oltncopy$(TSUFFIX).$(SUFFIX) \
|
||||
dsymm_iutcopy$(TSUFFIX).$(SUFFIX) dsymm_iltcopy$(TSUFFIX).$(SUFFIX) \
|
||||
dsymm_outcopy$(TSUFFIX).$(SUFFIX) dsymm_oltcopy$(TSUFFIX).$(SUFFIX)
|
||||
endif
|
||||
|
||||
QBLASOBJS += \
|
||||
qtrmm_iunucopy$(TSUFFIX).$(SUFFIX) qtrmm_iunncopy$(TSUFFIX).$(SUFFIX) \
|
||||
@@ -224,8 +308,9 @@ QBLASOBJS += \
|
||||
qtrsm_outucopy$(TSUFFIX).$(SUFFIX) qtrsm_outncopy$(TSUFFIX).$(SUFFIX) \
|
||||
qtrsm_oltucopy$(TSUFFIX).$(SUFFIX) qtrsm_oltncopy$(TSUFFIX).$(SUFFIX) \
|
||||
qsymm_iutcopy$(TSUFFIX).$(SUFFIX) qsymm_iltcopy$(TSUFFIX).$(SUFFIX) \
|
||||
qsymm_outcopy$(TSUFFIX).$(SUFFIX) qsymm_oltcopy$(TSUFFIX).$(SUFFIX) \
|
||||
qsymm_outcopy$(TSUFFIX).$(SUFFIX) qsymm_oltcopy$(TSUFFIX).$(SUFFIX)
|
||||
|
||||
ifeq ($(BUILD_COMPLEX),1)
|
||||
CBLASOBJS += \
|
||||
ctrmm_iunucopy$(TSUFFIX).$(SUFFIX) ctrmm_iunncopy$(TSUFFIX).$(SUFFIX) \
|
||||
ctrmm_ilnucopy$(TSUFFIX).$(SUFFIX) ctrmm_ilnncopy$(TSUFFIX).$(SUFFIX) \
|
||||
@@ -235,6 +320,13 @@ CBLASOBJS += \
|
||||
ctrmm_olnucopy$(TSUFFIX).$(SUFFIX) ctrmm_olnncopy$(TSUFFIX).$(SUFFIX) \
|
||||
ctrmm_outucopy$(TSUFFIX).$(SUFFIX) ctrmm_outncopy$(TSUFFIX).$(SUFFIX) \
|
||||
ctrmm_oltucopy$(TSUFFIX).$(SUFFIX) ctrmm_oltncopy$(TSUFFIX).$(SUFFIX) \
|
||||
csymm_iutcopy$(TSUFFIX).$(SUFFIX) csymm_iltcopy$(TSUFFIX).$(SUFFIX) \
|
||||
csymm_outcopy$(TSUFFIX).$(SUFFIX) csymm_oltcopy$(TSUFFIX).$(SUFFIX) \
|
||||
chemm_iutcopy$(TSUFFIX).$(SUFFIX) chemm_iltcopy$(TSUFFIX).$(SUFFIX) \
|
||||
chemm_outcopy$(TSUFFIX).$(SUFFIX) chemm_oltcopy$(TSUFFIX).$(SUFFIX)
|
||||
endif
|
||||
ifneq "$(or $(BUILD_COMPLEX),$(BUILD_COMPLEX16))" ""
|
||||
CBLASOBJS += \
|
||||
ctrsm_iunucopy$(TSUFFIX).$(SUFFIX) ctrsm_iunncopy$(TSUFFIX).$(SUFFIX) \
|
||||
ctrsm_ilnucopy$(TSUFFIX).$(SUFFIX) ctrsm_ilnncopy$(TSUFFIX).$(SUFFIX) \
|
||||
ctrsm_iutucopy$(TSUFFIX).$(SUFFIX) ctrsm_iutncopy$(TSUFFIX).$(SUFFIX) \
|
||||
@@ -242,12 +334,10 @@ CBLASOBJS += \
|
||||
ctrsm_ounucopy$(TSUFFIX).$(SUFFIX) ctrsm_ounncopy$(TSUFFIX).$(SUFFIX) \
|
||||
ctrsm_olnucopy$(TSUFFIX).$(SUFFIX) ctrsm_olnncopy$(TSUFFIX).$(SUFFIX) \
|
||||
ctrsm_outucopy$(TSUFFIX).$(SUFFIX) ctrsm_outncopy$(TSUFFIX).$(SUFFIX) \
|
||||
ctrsm_oltucopy$(TSUFFIX).$(SUFFIX) ctrsm_oltncopy$(TSUFFIX).$(SUFFIX) \
|
||||
csymm_iutcopy$(TSUFFIX).$(SUFFIX) csymm_iltcopy$(TSUFFIX).$(SUFFIX) \
|
||||
csymm_outcopy$(TSUFFIX).$(SUFFIX) csymm_oltcopy$(TSUFFIX).$(SUFFIX) \
|
||||
chemm_iutcopy$(TSUFFIX).$(SUFFIX) chemm_iltcopy$(TSUFFIX).$(SUFFIX) \
|
||||
chemm_outcopy$(TSUFFIX).$(SUFFIX) chemm_oltcopy$(TSUFFIX).$(SUFFIX)
|
||||
ctrsm_oltucopy$(TSUFFIX).$(SUFFIX) ctrsm_oltncopy$(TSUFFIX).$(SUFFIX)
|
||||
endif
|
||||
|
||||
ifeq ($(BUILD_COMPLEX16),1)
|
||||
ZBLASOBJS += \
|
||||
ztrmm_iunucopy$(TSUFFIX).$(SUFFIX) ztrmm_iunncopy$(TSUFFIX).$(SUFFIX) \
|
||||
ztrmm_ilnucopy$(TSUFFIX).$(SUFFIX) ztrmm_ilnncopy$(TSUFFIX).$(SUFFIX) \
|
||||
@@ -269,6 +359,7 @@ ZBLASOBJS += \
|
||||
zsymm_outcopy$(TSUFFIX).$(SUFFIX) zsymm_oltcopy$(TSUFFIX).$(SUFFIX) \
|
||||
zhemm_iutcopy$(TSUFFIX).$(SUFFIX) zhemm_iltcopy$(TSUFFIX).$(SUFFIX) \
|
||||
zhemm_outcopy$(TSUFFIX).$(SUFFIX) zhemm_oltcopy$(TSUFFIX).$(SUFFIX)
|
||||
endif
|
||||
|
||||
XBLASOBJS += \
|
||||
xtrmm_iunucopy$(TSUFFIX).$(SUFFIX) xtrmm_iunncopy$(TSUFFIX).$(SUFFIX) \
|
||||
@@ -294,6 +385,7 @@ XBLASOBJS += \
|
||||
|
||||
ifeq ($(USE_GEMM3M), 1)
|
||||
|
||||
ifeq ($(BUILD_COMPLEX),1)
|
||||
CBLASOBJS += \
|
||||
cgemm3m_incopyb$(TSUFFIX).$(SUFFIX) cgemm3m_itcopyb$(TSUFFIX).$(SUFFIX) \
|
||||
cgemm3m_incopyr$(TSUFFIX).$(SUFFIX) cgemm3m_itcopyr$(TSUFFIX).$(SUFFIX) \
|
||||
@@ -313,7 +405,9 @@ CBLASOBJS += \
|
||||
chemm3m_ilcopyb$(TSUFFIX).$(SUFFIX) chemm3m_olcopyb$(TSUFFIX).$(SUFFIX) \
|
||||
chemm3m_ilcopyr$(TSUFFIX).$(SUFFIX) chemm3m_olcopyr$(TSUFFIX).$(SUFFIX) \
|
||||
chemm3m_ilcopyi$(TSUFFIX).$(SUFFIX) chemm3m_olcopyi$(TSUFFIX).$(SUFFIX)
|
||||
endif
|
||||
|
||||
ifeq ($(BUILD_COMPLEX16),1)
|
||||
ZBLASOBJS += \
|
||||
zgemm3m_incopyb$(TSUFFIX).$(SUFFIX) zgemm3m_itcopyb$(TSUFFIX).$(SUFFIX) \
|
||||
zgemm3m_incopyr$(TSUFFIX).$(SUFFIX) zgemm3m_itcopyr$(TSUFFIX).$(SUFFIX) \
|
||||
@@ -333,6 +427,7 @@ ZBLASOBJS += \
|
||||
zhemm3m_ilcopyb$(TSUFFIX).$(SUFFIX) zhemm3m_olcopyb$(TSUFFIX).$(SUFFIX) \
|
||||
zhemm3m_ilcopyr$(TSUFFIX).$(SUFFIX) zhemm3m_olcopyr$(TSUFFIX).$(SUFFIX) \
|
||||
zhemm3m_ilcopyi$(TSUFFIX).$(SUFFIX) zhemm3m_olcopyi$(TSUFFIX).$(SUFFIX)
|
||||
endif
|
||||
|
||||
XBLASOBJS += \
|
||||
xgemm3m_incopyb$(TSUFFIX).$(SUFFIX) xgemm3m_itcopyb$(TSUFFIX).$(SUFFIX) \
|
||||
@@ -357,20 +452,25 @@ XBLASOBJS += \
|
||||
endif
|
||||
|
||||
###### BLAS extensions #####
|
||||
|
||||
ifeq ($(BUILD_SINGLE),1)
|
||||
SBLASOBJS += \
|
||||
somatcopy_k_cn$(TSUFFIX).$(SUFFIX) somatcopy_k_rn$(TSUFFIX).$(SUFFIX) \
|
||||
somatcopy_k_ct$(TSUFFIX).$(SUFFIX) somatcopy_k_rt$(TSUFFIX).$(SUFFIX) \
|
||||
simatcopy_k_cn$(TSUFFIX).$(SUFFIX) simatcopy_k_rn$(TSUFFIX).$(SUFFIX) \
|
||||
simatcopy_k_ct$(TSUFFIX).$(SUFFIX) simatcopy_k_rt$(TSUFFIX).$(SUFFIX) \
|
||||
sgeadd_k$(TSUFFIX).$(SUFFIX)
|
||||
|
||||
endif
|
||||
ifeq ($(BUILD_DOUBLE),1)
|
||||
DBLASOBJS += \
|
||||
domatcopy_k_cn$(TSUFFIX).$(SUFFIX) domatcopy_k_rn$(TSUFFIX).$(SUFFIX) \
|
||||
domatcopy_k_ct$(TSUFFIX).$(SUFFIX) domatcopy_k_rt$(TSUFFIX).$(SUFFIX) \
|
||||
dimatcopy_k_cn$(TSUFFIX).$(SUFFIX) dimatcopy_k_rn$(TSUFFIX).$(SUFFIX) \
|
||||
dimatcopy_k_ct$(TSUFFIX).$(SUFFIX) dimatcopy_k_rt$(TSUFFIX).$(SUFFIX) \
|
||||
dgeadd_k$(TSUFFIX).$(SUFFIX)
|
||||
endif
|
||||
|
||||
ifeq ($(BUILD_COMPLEX),1)
|
||||
CBLASOBJS += \
|
||||
comatcopy_k_cn$(TSUFFIX).$(SUFFIX) comatcopy_k_rn$(TSUFFIX).$(SUFFIX) \
|
||||
comatcopy_k_ct$(TSUFFIX).$(SUFFIX) comatcopy_k_rt$(TSUFFIX).$(SUFFIX) \
|
||||
@@ -381,7 +481,9 @@ CBLASOBJS += \
|
||||
cimatcopy_k_cnc$(TSUFFIX).$(SUFFIX) cimatcopy_k_rnc$(TSUFFIX).$(SUFFIX) \
|
||||
cimatcopy_k_ctc$(TSUFFIX).$(SUFFIX) cimatcopy_k_rtc$(TSUFFIX).$(SUFFIX) \
|
||||
cgeadd_k$(TSUFFIX).$(SUFFIX)
|
||||
endif
|
||||
|
||||
ifeq ($(BUILD_COMPLEX16),1)
|
||||
ZBLASOBJS += \
|
||||
zomatcopy_k_cn$(TSUFFIX).$(SUFFIX) zomatcopy_k_rn$(TSUFFIX).$(SUFFIX) \
|
||||
zomatcopy_k_ct$(TSUFFIX).$(SUFFIX) zomatcopy_k_rt$(TSUFFIX).$(SUFFIX) \
|
||||
@@ -392,7 +494,14 @@ ZBLASOBJS += \
|
||||
zimatcopy_k_cnc$(TSUFFIX).$(SUFFIX) zimatcopy_k_rnc$(TSUFFIX).$(SUFFIX) \
|
||||
zimatcopy_k_ctc$(TSUFFIX).$(SUFFIX) zimatcopy_k_rtc$(TSUFFIX).$(SUFFIX) \
|
||||
zgeadd_k$(TSUFFIX).$(SUFFIX)
|
||||
endif
|
||||
|
||||
ifeq ($(BUILD_BFLOAT16), 1)
|
||||
SBGEMMINCOPYOBJ_P = $(SBGEMMINCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX))
|
||||
SBGEMMITCOPYOBJ_P = $(SBGEMMITCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX))
|
||||
SBGEMMONCOPYOBJ_P = $(SBGEMMONCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX))
|
||||
SBGEMMOTCOPYOBJ_P = $(SBGEMMOTCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX))
|
||||
endif
|
||||
|
||||
SGEMMINCOPYOBJ_P = $(SGEMMINCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX))
|
||||
SGEMMITCOPYOBJ_P = $(SGEMMITCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX))
|
||||
@@ -419,6 +528,11 @@ XGEMMITCOPYOBJ_P = $(XGEMMITCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX))
|
||||
XGEMMONCOPYOBJ_P = $(XGEMMONCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX))
|
||||
XGEMMOTCOPYOBJ_P = $(XGEMMOTCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX))
|
||||
|
||||
ifeq ($(BUILD_BFLOAT16),1)
|
||||
$(KDIR)sbgemm_beta$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SBGEMM_BETA)
|
||||
$(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@
|
||||
endif
|
||||
|
||||
$(KDIR)sgemm_beta$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_BETA)
|
||||
$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@
|
||||
|
||||
@@ -437,12 +551,47 @@ $(KDIR)zgemm_beta$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_BETA)
|
||||
$(KDIR)xgemm_beta$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMM_BETA)
|
||||
$(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX $< -o $@
|
||||
|
||||
|
||||
ifeq ($(BUILD_BFLOAT16), 1)
|
||||
|
||||
$(KDIR)$(SBGEMMONCOPYOBJ) : $(KERNELDIR)/$(SBGEMMONCOPY)
|
||||
$(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@
|
||||
|
||||
$(KDIR)$(SBGEMMOTCOPYOBJ) : $(KERNELDIR)/$(SBGEMMOTCOPY)
|
||||
|
||||
ifeq ($(OS), AIX)
|
||||
$(CC) $(CFLAGS) -S -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o - > sbgemmotcopy.s
|
||||
m4 sbgemmotcopy.s > sbgemmotcopy_nomacros.s
|
||||
$(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX sbgemmotcopy_nomacros.s -o $@
|
||||
rm sbgemmotcopy.s sbgemmotcopy_nomacros.s
|
||||
else
|
||||
$(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@
|
||||
endif
|
||||
|
||||
ifneq ($(SBGEMM_UNROLL_M), $(SBGEMM_UNROLL_N))
|
||||
|
||||
$(KDIR)$(SBGEMMINCOPYOBJ) : $(KERNELDIR)/$(SBGEMMINCOPY)
|
||||
$(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@
|
||||
|
||||
$(KDIR)$(SBGEMMITCOPYOBJ) : $(KERNELDIR)/$(SBGEMMITCOPY)
|
||||
ifeq ($(OS), AIX)
|
||||
$(CC) $(CFLAGS) -S -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o - > sbgemmitcopy.s
|
||||
m4 sbgemmitcopy.s > sbgemmitcopy_nomacros.s
|
||||
$(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX sbgemmitcopy_nomacros.s -o $@
|
||||
rm sbgemmitcopy.s sbgemmitcopy_nomacros.s
|
||||
else
|
||||
$(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@
|
||||
endif
|
||||
|
||||
endif
|
||||
endif
|
||||
|
||||
$(KDIR)$(SGEMMONCOPYOBJ) : $(KERNELDIR)/$(SGEMMONCOPY)
|
||||
$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@
|
||||
|
||||
$(KDIR)$(SGEMMOTCOPYOBJ) : $(KERNELDIR)/$(SGEMMOTCOPY)
|
||||
ifeq ($(OS), AIX)
|
||||
$(CC) $(CFLAGS) -E -UDOUBLE -UCOMPLEX $< -o sgemmotcopy.s
|
||||
$(CC) $(CFLAGS) -S -UDOUBLE -UCOMPLEX $< -o - > sgemmotcopy.s
|
||||
m4 sgemmotcopy.s > sgemmotcopy_nomacros.s
|
||||
$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX sgemmotcopy_nomacros.s -o $@
|
||||
rm sgemmotcopy.s sgemmotcopy_nomacros.s
|
||||
@@ -458,7 +607,7 @@ $(KDIR)$(SGEMMINCOPYOBJ) : $(KERNELDIR)/$(SGEMMINCOPY)
|
||||
|
||||
$(KDIR)$(SGEMMITCOPYOBJ) : $(KERNELDIR)/$(SGEMMITCOPY)
|
||||
ifeq ($(OS), AIX)
|
||||
$(CC) $(CFLAGS) -E -UDOUBLE -UCOMPLEX $< -o sgemmitcopy.s
|
||||
$(CC) $(CFLAGS) -S -UDOUBLE -UCOMPLEX $< -o - > sgemmitcopy.s
|
||||
m4 sgemmitcopy.s > sgemmitcopy_nomacros.s
|
||||
$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX sgemmitcopy_nomacros.s -o $@
|
||||
rm sgemmitcopy.s sgemmitcopy_nomacros.s
|
||||
@@ -470,7 +619,7 @@ endif
|
||||
|
||||
$(KDIR)$(DGEMMONCOPYOBJ) : $(KERNELDIR)/$(DGEMMONCOPY)
|
||||
ifeq ($(OS), AIX)
|
||||
$(CC) $(CFLAGS) -E -DDOUBLE -UCOMPLEX $< -o dgemm_ncopy.s
|
||||
$(CC) $(CFLAGS) -S -DDOUBLE -UCOMPLEX $< -o - > dgemm_ncopy.s
|
||||
m4 dgemm_ncopy.s > dgemm_ncopy_nomacros.s
|
||||
$(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX dgemm_ncopy_nomacros.s -o $@
|
||||
rm dgemm_ncopy.s dgemm_ncopy_nomacros.s
|
||||
@@ -488,7 +637,7 @@ $(KDIR)$(DGEMMINCOPYOBJ) : $(KERNELDIR)/$(DGEMMINCOPY)
|
||||
|
||||
$(KDIR)$(DGEMMITCOPYOBJ) : $(KERNELDIR)/$(DGEMMITCOPY)
|
||||
ifeq ($(OS), AIX)
|
||||
$(CC) $(CFLAGS) -E -DDOUBLE -UCOMPLEX $< -o dgemm_itcopy.s
|
||||
$(CC) $(CFLAGS) -S -DDOUBLE -UCOMPLEX $< -o - > dgemm_itcopy.s
|
||||
m4 dgemm_itcopy.s > dgemm_itcopy_nomacros.s
|
||||
$(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX dgemm_itcopy_nomacros.s -o $@
|
||||
rm dgemm_itcopy.s dgemm_itcopy_nomacros.s
|
||||
@@ -531,7 +680,7 @@ $(KDIR)$(CGEMMINCOPYOBJ) : $(KERNELDIR)/$(CGEMMINCOPY)
|
||||
|
||||
$(KDIR)$(CGEMMITCOPYOBJ) : $(KERNELDIR)/$(CGEMMITCOPY)
|
||||
ifeq ($(OS), AIX)
|
||||
$(CC) $(CFLAGS) -UDOUBLE -UCOMPLEX -E $< -o cgemm_itcopy.s
|
||||
$(CC) $(CFLAGS) -UDOUBLE -UCOMPLEX -S $< -o - > cgemm_itcopy.s
|
||||
m4 cgemm_itcopy.s > cgemm_itcopy_nomacros.s
|
||||
$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX cgemm_itcopy_nomacros.s -o $@
|
||||
rm cgemm_itcopy.s cgemm_itcopy_nomacros.s
|
||||
@@ -554,7 +703,7 @@ $(KDIR)$(ZGEMMINCOPYOBJ) : $(KERNELDIR)/$(ZGEMMINCOPY)
|
||||
|
||||
$(KDIR)$(ZGEMMITCOPYOBJ) : $(KERNELDIR)/$(ZGEMMITCOPY)
|
||||
ifeq ($(OS), AIX)
|
||||
$(CC) $(CFLAGS) -E -DDOUBLE -UCOMPLEX $< -o zgemm_itcopy.s
|
||||
$(CC) $(CFLAGS) -S -DDOUBLE -UCOMPLEX $< -o - > zgemm_itcopy.s
|
||||
m4 zgemm_itcopy.s > zgemm_itcopy_nomacros.s
|
||||
$(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX zgemm_itcopy_nomacros.s -o $@
|
||||
rm zgemm_itcopy.s zgemm_itcopy_nomacros.s
|
||||
@@ -586,7 +735,7 @@ endif
|
||||
|
||||
$(KDIR)sgemm_kernel$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL) $(SGEMMDEPEND)
|
||||
ifeq ($(OS), AIX)
|
||||
$(CC) $(CFLAGS) -E -UDOUBLE -UCOMPLEX $< -o sgemm_kernel$(TSUFFIX).s
|
||||
$(CC) $(CFLAGS) -S -UDOUBLE -UCOMPLEX $< -o - > sgemm_kernel$(TSUFFIX).s
|
||||
m4 sgemm_kernel$(TSUFFIX).s > sgemm_kernel$(TSUFFIX)_nomacros.s
|
||||
$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX sgemm_kernel$(TSUFFIX)_nomacros.s -o $@
|
||||
rm sgemm_kernel$(TSUFFIX).s sgemm_kernel$(TSUFFIX)_nomacros.s
|
||||
@@ -594,9 +743,29 @@ else
|
||||
$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@
|
||||
endif
|
||||
|
||||
ifdef USE_DIRECT_SGEMM
|
||||
$(KDIR)sgemm_direct_performant$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMDIRECTPERFORMANT)
|
||||
$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@
|
||||
$(KDIR)sgemm_direct$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMDIRECTKERNEL)
|
||||
$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@
|
||||
endif
|
||||
|
||||
ifeq ($(BUILD_BFLOAT16), 1)
|
||||
|
||||
$(KDIR)sbgemm_kernel$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SBGEMMKERNEL) $(SBGEMMDEPEND)
|
||||
ifeq ($(OS), AIX)
|
||||
$(CC) $(CFLAGS) -S -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o - > sbgemm_kernel$(TSUFFIX).s
|
||||
m4 sbgemm_kernel$(TSUFFIX).s > sbgemm_kernel$(TSUFFIX)_nomacros.s
|
||||
$(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX sbgemm_kernel$(TSUFFIX)_nomacros.s -o $@
|
||||
rm sbgemm_kernel$(TSUFFIX).s sbgemm_kernel$(TSUFFIX)_nomacros.s
|
||||
else
|
||||
$(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@
|
||||
endif
|
||||
endif
|
||||
|
||||
$(KDIR)dgemm_kernel$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMMKERNEL) $(DGEMMDEPEND)
|
||||
ifeq ($(OS), AIX)
|
||||
$(CC) $(CFLAGS) -E -DDOUBLE -UCOMPLEX $< -o dgemm_kernel$(TSUFFIX).s
|
||||
$(CC) $(CFLAGS) -S -DDOUBLE -UCOMPLEX $< -o - > dgemm_kernel$(TSUFFIX).s
|
||||
m4 dgemm_kernel$(TSUFFIX).s > dgemm_kernel$(TSUFFIX)_nomacros.s
|
||||
$(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX dgemm_kernel$(TSUFFIX)_nomacros.s -o $@
|
||||
rm dgemm_kernel$(TSUFFIX).s dgemm_kernel$(TSUFFIX)_nomacros.s
|
||||
@@ -609,7 +778,7 @@ $(KDIR)qgemm_kernel$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL) $(QGEMMDEP
|
||||
|
||||
$(KDIR)cgemm_kernel_n$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMMDEPEND)
|
||||
ifeq ($(OS), AIX)
|
||||
$(CC) $(CFLAGS) -E -UDOUBLE -DCOMPLEX -DNN $< -o cgemm_kernel_n.s
|
||||
$(CC) $(CFLAGS) -S -UDOUBLE -DCOMPLEX -DNN $< -o - > cgemm_kernel_n.s
|
||||
m4 cgemm_kernel_n.s > cgemm_kernel_n_nomacros.s
|
||||
$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNN cgemm_kernel_n_nomacros.s -o $@
|
||||
rm cgemm_kernel_n.s cgemm_kernel_n_nomacros.s
|
||||
@@ -619,7 +788,7 @@ endif
|
||||
|
||||
$(KDIR)cgemm_kernel_l$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMMDEPEND)
|
||||
ifeq ($(OS), AIX)
|
||||
$(CC) $(CFLAGS) -E -UDOUBLE -DCOMPLEX -DCN $< -o cgemm_kernel_l.s
|
||||
$(CC) $(CFLAGS) -S -UDOUBLE -DCOMPLEX -DCN $< -o - > cgemm_kernel_l.s
|
||||
m4 cgemm_kernel_l.s > cgemm_kernel_l_nomacros.s
|
||||
$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCN cgemm_kernel_l_nomacros.s -o $@
|
||||
rm cgemm_kernel_l.s cgemm_kernel_l_nomacros.s
|
||||
@@ -629,7 +798,7 @@ endif
|
||||
|
||||
$(KDIR)cgemm_kernel_r$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMMDEPEND)
|
||||
ifeq ($(OS), AIX)
|
||||
$(CC) $(CFLAGS) -E -UDOUBLE -DCOMPLEX -DNC $< -o cgemm_kernel_r.s
|
||||
$(CC) $(CFLAGS) -S -UDOUBLE -DCOMPLEX -DNC $< -o - > cgemm_kernel_r.s
|
||||
m4 cgemm_kernel_r.s > cgemm_kernel_r_nomacros.s
|
||||
$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNC cgemm_kernel_r_nomacros.s -o $@
|
||||
rm cgemm_kernel_r.s cgemm_kernel_r_nomacros.s
|
||||
@@ -639,7 +808,7 @@ endif
|
||||
|
||||
$(KDIR)cgemm_kernel_b$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMMDEPEND)
|
||||
ifeq ($(OS), AIX)
|
||||
$(CC) $(CFLAGS) -E -UDOUBLE -DCOMPLEX -DCC $< -o cgemm_kernel_b.s
|
||||
$(CC) $(CFLAGS) -S -UDOUBLE -DCOMPLEX -DCC $< -o - > cgemm_kernel_b.s
|
||||
m4 cgemm_kernel_b.s > cgemm_kernel_b_nomacros.s
|
||||
$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCC cgemm_kernel_b_nomacros.s -o $@
|
||||
rm cgemm_kernel_b.s cgemm_kernel_b_nomacros.s
|
||||
@@ -649,7 +818,7 @@ endif
|
||||
|
||||
$(KDIR)zgemm_kernel_n$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) $(ZGEMMDEPEND)
|
||||
ifeq ($(OS), AIX)
|
||||
$(CC) $(CFLAGS) -E -DDOUBLE -DCOMPLEX -DNN $< -o zgemm_kernel_n.s
|
||||
$(CC) $(CFLAGS) -S -DDOUBLE -DCOMPLEX -DNN $< -o - > zgemm_kernel_n.s
|
||||
m4 zgemm_kernel_n.s > zgemm_kernel_n_nomacros.s
|
||||
$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNN zgemm_kernel_n_nomacros.s -o $@
|
||||
rm zgemm_kernel_n.s zgemm_kernel_n_nomacros.s
|
||||
@@ -659,7 +828,7 @@ endif
|
||||
|
||||
$(KDIR)zgemm_kernel_l$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) $(ZGEMMDEPEND)
|
||||
ifeq ($(OS), AIX)
|
||||
$(CC) $(CFLAGS) -E -DDOUBLE -DCOMPLEX -DCN $< -o zgemm_kernel_l.s
|
||||
$(CC) $(CFLAGS) -S -DDOUBLE -DCOMPLEX -DCN $< -o - > zgemm_kernel_l.s
|
||||
m4 zgemm_kernel_l.s > zgemm_kernel_l_nomacros.s
|
||||
$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCN zgemm_kernel_l_nomacros.s -o $@
|
||||
rm zgemm_kernel_l.s zgemm_kernel_l_nomacros.s
|
||||
@@ -669,7 +838,7 @@ endif
|
||||
|
||||
$(KDIR)zgemm_kernel_r$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) $(ZGEMMDEPEND)
|
||||
ifeq ($(OS), AIX)
|
||||
$(CC) $(CFLAGS) -E -DDOUBLE -DCOMPLEX -DNC $< -o zgemm_kernel_r.s
|
||||
$(CC) $(CFLAGS) -S -DDOUBLE -DCOMPLEX -DNC $< -o - > zgemm_kernel_r.s
|
||||
m4 zgemm_kernel_r.s > zgemm_kernel_r_nomacros.s
|
||||
$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNC zgemm_kernel_r_nomacros.s -o $@
|
||||
rm zgemm_kernel_r.s zgemm_kernel_r_nomacros.s
|
||||
@@ -679,7 +848,7 @@ endif
|
||||
|
||||
$(KDIR)zgemm_kernel_b$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) $(ZGEMMDEPEND)
|
||||
ifeq ($(OS), AIX)
|
||||
$(CC) $(CFLAGS) -E -DDOUBLE -DCOMPLEX -DCC $< -o zgemm_kernel_b.s
|
||||
$(CC) $(CFLAGS) -S -DDOUBLE -DCOMPLEX -DCC $< -o - > zgemm_kernel_b.s
|
||||
m4 zgemm_kernel_b.s > zgemm_kernel_b_nomacros.s
|
||||
$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCC zgemm_kernel_b_nomacros.s -o $@
|
||||
rm zgemm_kernel_b.s zgemm_kernel_b_nomacros.s
|
||||
@@ -703,7 +872,7 @@ $(KDIR)xgemm_kernel_b$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) $(XGEMMD
|
||||
ifdef USE_TRMM
|
||||
$(KDIR)strmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMKERNEL)
|
||||
ifeq ($(OS), AIX)
|
||||
$(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o strmmkernel_ln.s
|
||||
$(CC) $(CFLAGS) -S -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o - > strmmkernel_ln.s
|
||||
m4 strmmkernel_ln.s > strmmkernel_ln_nomacros.s
|
||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -UTRANSA strmmkernel_ln_nomacros.s -o $@
|
||||
rm strmmkernel_ln.s strmmkernel_ln_nomacros.s
|
||||
@@ -713,7 +882,7 @@ endif
|
||||
|
||||
$(KDIR)strmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMKERNEL)
|
||||
ifeq ($(OS), AIX)
|
||||
$(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o strmmkernel_lt.s
|
||||
$(CC) $(CFLAGS) -S -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o - > strmmkernel_lt.s
|
||||
m4 strmmkernel_lt.s > strmmkernel_lt_nomacros.s
|
||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -DTRANSA strmmkernel_lt_nomacros.s -o $@
|
||||
rm strmmkernel_lt.s strmmkernel_lt_nomacros.s
|
||||
@@ -723,7 +892,7 @@ endif
|
||||
|
||||
$(KDIR)strmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMKERNEL)
|
||||
ifeq ($(OS), AIX)
|
||||
$(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o strmmkernel_rn.s
|
||||
$(CC) $(CFLAGS) -S -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o - > strmmkernel_rn.s
|
||||
m4 strmmkernel_rn.s > strmmkernel_rn_nomacros.s
|
||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -UTRANSA strmmkernel_rn_nomacros.s -o $@
|
||||
rm strmmkernel_rn.s strmmkernel_rn_nomacros.s
|
||||
@@ -733,7 +902,7 @@ endif
|
||||
|
||||
$(KDIR)strmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMKERNEL)
|
||||
ifeq ($(OS), AIX)
|
||||
$(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o strmm_kernel_rt.s
|
||||
$(CC) $(CFLAGS) -S -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o - > strmm_kernel_rt.s
|
||||
m4 strmm_kernel_rt.s > strmm_kernel_rt_nomacros.s
|
||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA strmm_kernel_rt_nomacros.s -o $@
|
||||
rm strmm_kernel_rt.s strmm_kernel_rt_nomacros.s
|
||||
@@ -743,7 +912,7 @@ endif
|
||||
|
||||
$(KDIR)dtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL)
|
||||
ifeq ($(OS), AIX)
|
||||
$(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o dtrmm_kernel_ln.s
|
||||
$(CC) $(CFLAGS) -S -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o - > dtrmm_kernel_ln.s
|
||||
m4 dtrmm_kernel_ln.s > dtrmm_kernel_ln_nomacros.s
|
||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -UTRANSA dtrmm_kernel_ln_nomacros.s -o $@
|
||||
rm dtrmm_kernel_ln.s dtrmm_kernel_ln_nomacros.s
|
||||
@@ -753,7 +922,7 @@ endif
|
||||
|
||||
$(KDIR)dtrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL)
|
||||
ifeq ($(OS), AIX)
|
||||
$(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o dtrmm_kernel_lt.s
|
||||
$(CC) $(CFLAGS) -S -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o - > dtrmm_kernel_lt.s
|
||||
m4 dtrmm_kernel_lt.s > dtrmm_kernel_lt_nomacros.s
|
||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -DTRANSA dtrmm_kernel_lt_nomacros.s -o $@
|
||||
rm dtrmm_kernel_lt.s dtrmm_kernel_lt_nomacros.s
|
||||
@@ -763,7 +932,7 @@ endif
|
||||
|
||||
$(KDIR)dtrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL)
|
||||
ifeq ($(OS), AIX)
|
||||
$(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o dtrmm_kernel_rn.s
|
||||
$(CC) $(CFLAGS) -S -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o - > dtrmm_kernel_rn.s
|
||||
m4 dtrmm_kernel_rn.s > dtrmm_kernel_rn_nomacros.s
|
||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -UTRANSA dtrmm_kernel_rn_nomacros.s -o $@
|
||||
rm dtrmm_kernel_rn.s dtrmm_kernel_rn_nomacros.s
|
||||
@@ -773,7 +942,7 @@ endif
|
||||
|
||||
$(KDIR)dtrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL)
|
||||
ifeq ($(OS), AIX)
|
||||
$(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o dtrmm_kernel_rt.s
|
||||
$(CC) $(CFLAGS) -S -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o - > dtrmm_kernel_rt.s
|
||||
m4 dtrmm_kernel_rt.s > dtrmm_kernel_rt_nomacros.s
|
||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -DTRANSA dtrmm_kernel_rt_nomacros.s -o $@
|
||||
rm dtrmm_kernel_rt.s dtrmm_kernel_rt_nomacros.s
|
||||
@@ -795,7 +964,7 @@ $(KDIR)qtrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL)
|
||||
|
||||
$(KDIR)ctrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL)
|
||||
ifeq ($(OS), AIX)
|
||||
$(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o ctrmm_kernel_ln.s
|
||||
$(CC) $(CFLAGS) -S -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o - > ctrmm_kernel_ln.s
|
||||
m4 ctrmm_kernel_ln.s > ctrmm_kernel_ln_nomacros.s
|
||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN ctrmm_kernel_ln_nomacros.s -o $@
|
||||
rm ctrmm_kernel_ln.s ctrmm_kernel_ln_nomacros.s
|
||||
@@ -805,7 +974,7 @@ endif
|
||||
|
||||
$(KDIR)ctrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL)
|
||||
ifeq ($(OS), AIX)
|
||||
$(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o ctrmm_kernel_lt.s
|
||||
$(CC) $(CFLAGS) -S -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o - > ctrmm_kernel_lt.s
|
||||
m4 ctrmm_kernel_lt.s > ctrmm_kernel_lt_nomacros.s
|
||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN ctrmm_kernel_lt_nomacros.s -o $@
|
||||
rm ctrmm_kernel_lt.s ctrmm_kernel_lt_nomacros.s
|
||||
@@ -815,7 +984,7 @@ endif
|
||||
|
||||
$(KDIR)ctrmm_kernel_LR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL)
|
||||
ifeq ($(OS), AIX)
|
||||
$(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o ctrmm_kernel_lr.s
|
||||
$(CC) $(CFLAGS) -S -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o - > ctrmm_kernel_lr.s
|
||||
m4 ctrmm_kernel_lr.s > ctrmm_kernel_lr_nomacros.s
|
||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN ctrmm_kernel_lr_nomacros.s -o $@
|
||||
rm ctrmm_kernel_lr.s ctrmm_kernel_lr_nomacros.s
|
||||
@@ -825,7 +994,7 @@ endif
|
||||
|
||||
$(KDIR)ctrmm_kernel_LC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL)
|
||||
ifeq ($(OS), AIX)
|
||||
$(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o ctrmm_kernel_lc.s
|
||||
$(CC) $(CFLAGS) -S -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o - > ctrmm_kernel_lc.s
|
||||
m4 ctrmm_kernel_lc.s > ctrmm_kernel_lc_nomacros.s
|
||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN ctrmm_kernel_lc_nomacros.s -o $@
|
||||
rm ctrmm_kernel_lc_nomacros.s ctrmm_kernel_lc.s
|
||||
@@ -835,7 +1004,7 @@ endif
|
||||
|
||||
$(KDIR)ctrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL)
|
||||
ifeq ($(OS), AIX)
|
||||
$(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o ctrmm_kernel_rn.s
|
||||
$(CC) $(CFLAGS) -S -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o - > ctrmm_kernel_rn.s
|
||||
m4 ctrmm_kernel_rn.s > ctrmm_kernel_rn_nomacros.s
|
||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN ctrmm_kernel_rn_nomacros.s -o $@
|
||||
rm ctrmm_kernel_rn.s ctrmm_kernel_rn_nomacros.s
|
||||
@@ -845,7 +1014,7 @@ endif
|
||||
|
||||
$(KDIR)ctrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL)
|
||||
ifeq ($(OS), AIX)
|
||||
$(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o ctrmm_kernel_rt.s
|
||||
$(CC) $(CFLAGS) -S -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o - > ctrmm_kernel_rt.s
|
||||
m4 ctrmm_kernel_rt.s > ctrmm_kernel_rt_nomacros.s
|
||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN ctrmm_kernel_rt_nomacros.s -o $@
|
||||
rm ctrmm_kernel_rt.s ctrmm_kernel_rt_nomacros.s
|
||||
@@ -855,7 +1024,7 @@ endif
|
||||
|
||||
$(KDIR)ctrmm_kernel_RR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL)
|
||||
ifeq ($(OS), AIX)
|
||||
$(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o ctrmm_kernel_rr.s
|
||||
$(CC) $(CFLAGS) -S -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o - > ctrmm_kernel_rr.s
|
||||
m4 ctrmm_kernel_rr.s > ctrmm_kernel_rr_nomacros.s
|
||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC ctrmm_kernel_rr_nomacros.s -o $@
|
||||
rm ctrmm_kernel_rr.s ctrmm_kernel_rr_nomacros.s
|
||||
@@ -865,7 +1034,7 @@ endif
|
||||
|
||||
$(KDIR)ctrmm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL)
|
||||
ifeq ($(OS), AIX)
|
||||
$(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o ctrmm_kernel_RC.s
|
||||
$(CC) $(CFLAGS) -S -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o - > ctrmm_kernel_RC.s
|
||||
m4 ctrmm_kernel_RC.s > ctrmm_kernel_RC_nomacros.s
|
||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC ctrmm_kernel_RC_nomacros.s -o $@
|
||||
rm ctrmm_kernel_RC.s ctrmm_kernel_RC_nomacros.s
|
||||
@@ -875,7 +1044,7 @@ endif
|
||||
|
||||
$(KDIR)ztrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL)
|
||||
ifeq ($(OS), AIX)
|
||||
$(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o ztrmm_kernel_ln.s
|
||||
$(CC) $(CFLAGS) -S -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o - > ztrmm_kernel_ln.s
|
||||
m4 ztrmm_kernel_ln.s > ztrmm_kernel_ln_nomacros.s
|
||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN ztrmm_kernel_ln_nomacros.s -o $@
|
||||
rm ztrmm_kernel_ln.s ztrmm_kernel_ln_nomacros.s
|
||||
@@ -885,7 +1054,7 @@ endif
|
||||
|
||||
$(KDIR)ztrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL)
|
||||
ifeq ($(OS), AIX)
|
||||
$(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o ztrmm_kernel_lt.s
|
||||
$(CC) $(CFLAGS) -S -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o - > ztrmm_kernel_lt.s
|
||||
m4 ztrmm_kernel_lt.s > ztrmm_kernel_lt_nomacros.s
|
||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN ztrmm_kernel_lt_nomacros.s -o $@
|
||||
rm ztrmm_kernel_lt.s ztrmm_kernel_lt_nomacros.s
|
||||
@@ -895,7 +1064,7 @@ endif
|
||||
|
||||
$(KDIR)ztrmm_kernel_LR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL)
|
||||
ifeq ($(OS), AIX)
|
||||
$(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o ztrmm_kernel_lr.s
|
||||
$(CC) $(CFLAGS) -S -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o - > ztrmm_kernel_lr.s
|
||||
m4 ztrmm_kernel_lr.s > ztrmm_kernel_lr_nomacros.s
|
||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN ztrmm_kernel_lr_nomacros.s -o $@
|
||||
rm ztrmm_kernel_lr.s ztrmm_kernel_lr_nomacros.s
|
||||
@@ -905,7 +1074,7 @@ endif
|
||||
|
||||
$(KDIR)ztrmm_kernel_LC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL)
|
||||
ifeq ($(OS), AIX)
|
||||
$(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o ztrmm_kernel_lc.s
|
||||
$(CC) $(CFLAGS) -S -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o - > ztrmm_kernel_lc.s
|
||||
m4 ztrmm_kernel_lc.s >ztrmm_kernel_lc_nomacros.s
|
||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN ztrmm_kernel_lc_nomacros.s -o $@
|
||||
rm ztrmm_kernel_lc.s ztrmm_kernel_lc_nomacros.s
|
||||
@@ -915,7 +1084,7 @@ endif
|
||||
|
||||
$(KDIR)ztrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL)
|
||||
ifeq ($(OS), AIX)
|
||||
$(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o ztrmm_kernel_rn.s
|
||||
$(CC) $(CFLAGS) -S -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o - > ztrmm_kernel_rn.s
|
||||
m4 ztrmm_kernel_rn.s > ztrmm_kernel_rn_nomacros.s
|
||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN ztrmm_kernel_rn_nomacros.s -o $@
|
||||
rm ztrmm_kernel_rn.s ztrmm_kernel_rn_nomacros.s
|
||||
@@ -925,7 +1094,7 @@ endif
|
||||
|
||||
$(KDIR)ztrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL)
|
||||
ifeq ($(OS), AIX)
|
||||
$(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o ztrmm_kernel_rt.s
|
||||
$(CC) $(CFLAGS) -S -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o - > ztrmm_kernel_rt.s
|
||||
m4 ztrmm_kernel_rt.s > ztrmm_kernel_rt_nomacros.s
|
||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN ztrmm_kernel_rt_nomacros.s -o $@
|
||||
rm ztrmm_kernel_rt.s ztrmm_kernel_rt_nomacros.s
|
||||
@@ -935,7 +1104,7 @@ endif
|
||||
|
||||
$(KDIR)ztrmm_kernel_RR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL)
|
||||
ifeq ($(OS), AIX)
|
||||
$(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o ztrmm_kernel_rr.s
|
||||
$(CC) $(CFLAGS) -S -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o - > ztrmm_kernel_rr.s
|
||||
m4 ztrmm_kernel_rr.s > ztrmm_kernel_rr_nomacros.s
|
||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC ztrmm_kernel_rr_nomacros.s -o $@
|
||||
rm ztrmm_kernel_rr.s ztrmm_kernel_rr_nomacros.s
|
||||
@@ -945,7 +1114,7 @@ endif
|
||||
|
||||
$(KDIR)ztrmm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL)
|
||||
ifeq ($(OS), AIX)
|
||||
$(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o ztrmm_kernel_rc.s
|
||||
$(CC) $(CFLAGS) -S -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o - > ztrmm_kernel_rc.s
|
||||
m4 ztrmm_kernel_rc.s > ztrmm_kernel_rc_nomacros.s
|
||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC ztrmm_kernel_rc_nomacros.s -o $@
|
||||
rm ztrmm_kernel_rc.s ztrmm_kernel_rc_nomacros.s
|
||||
@@ -965,7 +1134,7 @@ $(KDIR)strmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL)
|
||||
|
||||
$(KDIR)strmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL)
|
||||
ifeq ($(OS), AIX)
|
||||
$(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o strmm_kernel_rt.s
|
||||
$(CC) $(CFLAGS) -S -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o - > strmm_kernel_rt.s
|
||||
m4 strmm_kernel_rt.s > strmm_kernel_rt_nomacros.s
|
||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA strmm_kernel_rt_nomacros.s -o $@
|
||||
rm strmm_kernel_rt.s strmm_kernel_rt_nomacros.s
|
||||
@@ -1099,7 +1268,7 @@ $(KDIR)dtrsm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRSMKERNEL_LN) $(DT
|
||||
|
||||
$(KDIR)dtrsm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRSMKERNEL_LT) $(DTRSMDEPEND)
|
||||
ifeq ($(OS), AIX)
|
||||
$(CC) $(CFLAGS) -E -DTRSMKERNEL -UCOMPLEX -DDOUBLE -UUPPER -DLT -UCONJ $< -o dtrsm_kernel_lt.s
|
||||
$(CC) $(CFLAGS) -S -DTRSMKERNEL -UCOMPLEX -DDOUBLE -UUPPER -DLT -UCONJ $< -o - > dtrsm_kernel_lt.s
|
||||
m4 dtrsm_kernel_lt.s > dtrsm_kernel_lt_nomacros.s
|
||||
$(CC) -c $(CFLAGS) -DTRSMKERNEL -UCOMPLEX -DDOUBLE -UUPPER -DLT -UCONJ dtrsm_kernel_lt_nomacros.s -o $@
|
||||
rm dtrsm_kernel_lt.s dtrsm_kernel_lt_nomacros.s
|
||||
@@ -2210,6 +2379,11 @@ $(KDIR)xtrsm_oltncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_ltcopy_$(XGEMM_UNROLL_
|
||||
$(KDIR)sgemm_beta$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SGEMM_BETA)
|
||||
$(CC) $(PFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@
|
||||
|
||||
ifeq ($(BUILD_BFLOAT16),1)
|
||||
$(KDIR)sbgemm_beta$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SBGEMM_BETA)
|
||||
$(CC) $(PFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@
|
||||
endif
|
||||
|
||||
$(KDIR)dgemm_beta$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DGEMM_BETA)
|
||||
$(CC) $(PFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@
|
||||
|
||||
@@ -2225,6 +2399,24 @@ $(KDIR)zgemm_beta$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZGEMM_BETA)
|
||||
$(KDIR)xgemm_beta$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XGEMM_BETA)
|
||||
$(CC) $(PFLAGS) -c -DXDOUBLE -DCOMPLEX $< -o $@
|
||||
|
||||
|
||||
ifeq ($(BUILD_BFLOAT16), 1)
|
||||
$(SBGEMMONCOPYOBJ_P) : $(KERNELDIR)/$(SBGEMMONCOPY)
|
||||
$(CC) $(PFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@
|
||||
|
||||
$(SBGEMMOTCOPYOBJ_P) : $(KERNELDIR)/$(SBGEMMOTCOPY)
|
||||
$(CC) $(PFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@
|
||||
|
||||
ifneq ($(SBGEMM_UNROLL_M), $(SBGEMM_UNROLL_N))
|
||||
$(SBGEMMINCOPYOBJ_P) : $(KERNELDIR)/$(SBGEMMINCOPY)
|
||||
$(CC) $(PFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@
|
||||
|
||||
$(SBGEMMITCOPYOBJ_P) : $(KERNELDIR)/$(SBGEMMITCOPY)
|
||||
$(CC) $(PFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@
|
||||
|
||||
endif
|
||||
endif
|
||||
|
||||
$(SGEMMONCOPYOBJ_P) : $(KERNELDIR)/$(SGEMMONCOPY)
|
||||
$(CC) $(PFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@
|
||||
|
||||
@@ -2241,7 +2433,7 @@ $(SGEMMITCOPYOBJ_P) : $(KERNELDIR)/$(SGEMMITCOPY)
|
||||
|
||||
endif
|
||||
|
||||
$(D<GEMMONCOPYOBJ_P) : $(KERNELDIR)/$(DGEMMONCOPY)
|
||||
$(DGEMMONCOPYOBJ_P) : $(KERNELDIR)/$(DGEMMONCOPY)
|
||||
$(CC) $(PFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@
|
||||
|
||||
$(DGEMMOTCOPYOBJ_P) : $(KERNELDIR)/$(DGEMMOTCOPY)
|
||||
@@ -2329,6 +2521,12 @@ endif
|
||||
|
||||
endif
|
||||
|
||||
|
||||
ifeq ($(BUILD_BFLOAT16), 1)
|
||||
$(KDIR)sbgemm_kernel$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SBGEMMKERNEL) $(SBGEMMDEPEND)
|
||||
$(CC) $(PFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@
|
||||
endif
|
||||
|
||||
$(KDIR)sgemm_kernel$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL) $(SGEMMDEPEND)
|
||||
$(CC) $(PFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@
|
||||
|
||||
@@ -2346,7 +2544,7 @@ $(KDIR)cgemm_kernel_l$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMM
|
||||
|
||||
$(KDIR)cgemm_kernel_r$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMMDEPEND)
|
||||
ifeq ($(OS), AIX)
|
||||
$(CC) $(PFLAGS) -E -UDOUBLE -DCOMPLEX -DNC $< -o cgemm_kernel_r.s
|
||||
$(CC) $(PFLAGS) -S -UDOUBLE -DCOMPLEX -DNC $< -o - > cgemm_kernel_r.s
|
||||
m4 cgemm_kernel_r.s > cgemm_kernel_r_nomacros.s
|
||||
$(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DNC cgemm_kernel_r_nomacros.s -o $@
|
||||
rm cgemm_kernel_r.s cgemm_kernel_r_nomacros.s
|
||||
@@ -2392,7 +2590,7 @@ $(KDIR)strmm_kernel_RN$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL)
|
||||
|
||||
$(KDIR)strmm_kernel_RT$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL)
|
||||
ifeq ($(OS), AIX)
|
||||
$(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o strmm_kernel_rt.s
|
||||
$(CC) $(CFLAGS) -S -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o - > strmm_kernel_rt.s
|
||||
m4 strmmkernel_rn.s > strmm_kernel_rt_nomacros.s
|
||||
$(CC) $(PFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA strmm_kernel_rt_nomacros.s -o $@
|
||||
rm strmm_kernel_rt.s strmm_kernel_rt_nomacros.s
|
||||
|
||||
@@ -61,20 +61,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
vldmia.f64 X!, { d4 }
|
||||
vcmpe.f64 d4, d6 // compare with 0.0
|
||||
vmrs APSR_nzcv, fpscr
|
||||
beq KERNEL_F1_NEXT_\@
|
||||
beq 1f /* KERNEL_F1_NEXT_\@ */
|
||||
vabs.f64 d4, d4
|
||||
vcmpe.f64 d0, d4 // compare with scale
|
||||
vmrs APSR_nzcv, fpscr
|
||||
vdivge.f64 d2 , d4, d0 // scale >= x ? x / scale
|
||||
vmlage.f64 d1 , d2 , d2 // ssq += ( x/scale ) * ( x/scale )
|
||||
bge KERNEL_F1_NEXT_\@
|
||||
bge 1f /* KERNEL_F1_NEXT_\@ */
|
||||
vdiv.f64 d2 , d0, d4 // scale / x
|
||||
vmul.f64 d2 , d2, d2 // ( scale / x ) * ( scale / x )
|
||||
vmul.f64 d3 , d1, d2 // ssq * ( scale / x ) * ( scale / x )
|
||||
vadd.f64 d1 , d3, d7 // ssq = 1 + ssq * ( scale / x ) * ( scale / x )
|
||||
vmov.f64 d0 , d4 // scale = x
|
||||
|
||||
KERNEL_F1_NEXT_\@:
|
||||
1: /* KERNEL_F1_NEXT_\@: */
|
||||
|
||||
.endm
|
||||
|
||||
@@ -124,20 +124,20 @@ KERNEL_S1_NEXT:
|
||||
vldmia.f32 X!, { s4 }
|
||||
vcmpe.f32 s4, s6 // compare with 0.0
|
||||
vmrs APSR_nzcv, fpscr
|
||||
beq KERNEL_F1_NEXT_\@
|
||||
beq 1f /* KERNEL_F1_NEXT_\@ */
|
||||
vabs.f32 s4, s4
|
||||
vcmpe.f32 s0, s4 // compare with scale
|
||||
vmrs APSR_nzcv, fpscr
|
||||
vdivge.f32 s2 , s4, s0 // scale >= x ? x / scale
|
||||
vmlage.f32 s1 , s2 , s2 // ssq += ( x/scale ) * ( x/scale )
|
||||
bge KERNEL_F1_NEXT_\@
|
||||
bge 1f /* KERNEL_F1_NEXT_\@ */
|
||||
vdiv.f32 s2 , s0, s4 // scale / x
|
||||
vmul.f32 s2 , s2, s2 // ( scale / x ) * ( scale / x )
|
||||
vmul.f32 s3 , s1, s2 // ssq * ( scale / x ) * ( scale / x )
|
||||
vadd.f32 s1 , s3, s7 // ssq = 1 + ssq * ( scale / x ) * ( scale / x )
|
||||
vmov.f32 s0 , s4 // scale = x
|
||||
|
||||
KERNEL_F1_NEXT_\@:
|
||||
1: /* KERNEL_F1_NEXT_\@: */
|
||||
|
||||
.endm
|
||||
|
||||
@@ -195,37 +195,37 @@ KERNEL_S1_NEXT:
|
||||
|
||||
vcmpe.f64 d4, d6 // compare with 0.0
|
||||
vmrs APSR_nzcv, fpscr
|
||||
beq KERNEL_F1_NEXT_\@
|
||||
beq 1f /* KERNEL_F1_NEXT_\@ */
|
||||
vabs.f64 d4, d4
|
||||
vcmpe.f64 d0, d4 // compare with scale
|
||||
vmrs APSR_nzcv, fpscr
|
||||
vdivge.f64 d2 , d4, d0 // scale >= x ? x / scale
|
||||
vmlage.f64 d1 , d2 , d2 // ssq += ( x/scale ) * ( x/scale )
|
||||
bge KERNEL_F1_NEXT_\@
|
||||
bge 1f /* KERNEL_F1_NEXT_\@ */
|
||||
vdiv.f64 d2 , d0, d4 // scale / x
|
||||
vmul.f64 d2 , d2, d2 // ( scale / x ) * ( scale / x )
|
||||
vmul.f64 d3 , d1, d2 // ssq * ( scale / x ) * ( scale / x )
|
||||
vadd.f64 d1 , d3, d7 // ssq = 1 + ssq * ( scale / x ) * ( scale / x )
|
||||
vmov.f64 d0 , d4 // scale = x
|
||||
|
||||
KERNEL_F1_NEXT_\@:
|
||||
1: /* KERNEL_F1_NEXT_\@: */
|
||||
|
||||
vcmpe.f64 d5, d6 // compare with 0.0
|
||||
vmrs APSR_nzcv, fpscr
|
||||
beq KERNEL_F1_END_\@
|
||||
beq 2f /* KERNEL_F1_END_\@ */
|
||||
vabs.f64 d5, d5
|
||||
vcmpe.f64 d0, d5 // compare with scale
|
||||
vmrs APSR_nzcv, fpscr
|
||||
vdivge.f64 d2 , d5, d0 // scale >= x ? x / scale
|
||||
vmlage.f64 d1 , d2 , d2 // ssq += ( x/scale ) * ( x/scale )
|
||||
bge KERNEL_F1_END_\@
|
||||
bge 2f /* KERNEL_F1_END_\@ */
|
||||
vdiv.f64 d2 , d0, d5 // scale / x
|
||||
vmul.f64 d2 , d2, d2 // ( scale / x ) * ( scale / x )
|
||||
vmul.f64 d3 , d1, d2 // ssq * ( scale / x ) * ( scale / x )
|
||||
vadd.f64 d1 , d3, d7 // ssq = 1 + ssq * ( scale / x ) * ( scale / x )
|
||||
vmov.f64 d0 , d5 // scale = x
|
||||
|
||||
KERNEL_F1_END_\@:
|
||||
2: /* KERNEL_F1_END_\@: */
|
||||
|
||||
|
||||
.endm
|
||||
@@ -253,37 +253,37 @@ KERNEL_F1_END_\@:
|
||||
|
||||
vcmpe.f64 d4, d6 // compare with 0.0
|
||||
vmrs APSR_nzcv, fpscr
|
||||
beq KERNEL_S1_NEXT_\@
|
||||
beq 1f /* KERNEL_S1_NEXT_\@ */
|
||||
vabs.f64 d4, d4
|
||||
vcmpe.f64 d0, d4 // compare with scale
|
||||
vmrs APSR_nzcv, fpscr
|
||||
vdivge.f64 d2 , d4, d0 // scale >= x ? x / scale
|
||||
vmlage.f64 d1 , d2 , d2 // ssq += ( x/scale ) * ( x/scale )
|
||||
bge KERNEL_S1_NEXT_\@
|
||||
bge 1f /* KERNEL_S1_NEXT_\@ */
|
||||
vdiv.f64 d2 , d0, d4 // scale / x
|
||||
vmul.f64 d2 , d2, d2 // ( scale / x ) * ( scale / x )
|
||||
vmul.f64 d3 , d1, d2 // ssq * ( scale / x ) * ( scale / x )
|
||||
vadd.f64 d1 , d3, d7 // ssq = 1 + ssq * ( scale / x ) * ( scale / x )
|
||||
vmov.f64 d0 , d4 // scale = x
|
||||
|
||||
KERNEL_S1_NEXT_\@:
|
||||
1: /* KERNEL_S1_NEXT_\@: */
|
||||
|
||||
vcmpe.f64 d5, d6 // compare with 0.0
|
||||
vmrs APSR_nzcv, fpscr
|
||||
beq KERNEL_S1_END_\@
|
||||
beq 2f /* KERNEL_S1_END_\@ */
|
||||
vabs.f64 d5, d5
|
||||
vcmpe.f64 d0, d5 // compare with scale
|
||||
vmrs APSR_nzcv, fpscr
|
||||
vdivge.f64 d2 , d5, d0 // scale >= x ? x / scale
|
||||
vmlage.f64 d1 , d2 , d2 // ssq += ( x/scale ) * ( x/scale )
|
||||
bge KERNEL_S1_END_\@
|
||||
bge 2f /* KERNEL_S1_END_\@ */
|
||||
vdiv.f64 d2 , d0, d5 // scale / x
|
||||
vmul.f64 d2 , d2, d2 // ( scale / x ) * ( scale / x )
|
||||
vmul.f64 d3 , d1, d2 // ssq * ( scale / x ) * ( scale / x )
|
||||
vadd.f64 d1 , d3, d7 // ssq = 1 + ssq * ( scale / x ) * ( scale / x )
|
||||
vmov.f64 d0 , d5 // scale = x
|
||||
|
||||
KERNEL_S1_END_\@:
|
||||
2: /* KERNEL_S1_END_\@: */
|
||||
|
||||
add X, X, INC_X
|
||||
|
||||
@@ -298,37 +298,37 @@ KERNEL_S1_END_\@:
|
||||
|
||||
vcmpe.f32 s4, s6 // compare with 0.0
|
||||
vmrs APSR_nzcv, fpscr
|
||||
beq KERNEL_F1_NEXT_\@
|
||||
beq 1f /* KERNEL_F1_NEXT_\@ */
|
||||
vabs.f32 s4, s4
|
||||
vcmpe.f32 s0, s4 // compare with scale
|
||||
vmrs APSR_nzcv, fpscr
|
||||
vdivge.f32 s2 , s4, s0 // scale >= x ? x / scale
|
||||
vmlage.f32 s1 , s2 , s2 // ssq += ( x/scale ) * ( x/scale )
|
||||
bge KERNEL_F1_NEXT_\@
|
||||
bge 1f /* KERNEL_F1_NEXT_\@ */
|
||||
vdiv.f32 s2 , s0, s4 // scale / x
|
||||
vmul.f32 s2 , s2, s2 // ( scale / x ) * ( scale / x )
|
||||
vmul.f32 s3 , s1, s2 // ssq * ( scale / x ) * ( scale / x )
|
||||
vadd.f32 s1 , s3, s7 // ssq = 1 + ssq * ( scale / x ) * ( scale / x )
|
||||
vmov.f32 s0 , s4 // scale = x
|
||||
|
||||
KERNEL_F1_NEXT_\@:
|
||||
1: /* KERNEL_F1_NEXT_\@: */
|
||||
|
||||
vcmpe.f32 s5, s6 // compare with 0.0
|
||||
vmrs APSR_nzcv, fpscr
|
||||
beq KERNEL_F1_END_\@
|
||||
beq 2f /* KERNEL_F1_END_\@ */
|
||||
vabs.f32 s5, s5
|
||||
vcmpe.f32 s0, s5 // compare with scale
|
||||
vmrs APSR_nzcv, fpscr
|
||||
vdivge.f32 s2 , s5, s0 // scale >= x ? x / scale
|
||||
vmlage.f32 s1 , s2 , s2 // ssq += ( x/scale ) * ( x/scale )
|
||||
bge KERNEL_F1_END_\@
|
||||
bge 2f /* KERNEL_F1_END_\@ */
|
||||
vdiv.f32 s2 , s0, s5 // scale / x
|
||||
vmul.f32 s2 , s2, s2 // ( scale / x ) * ( scale / x )
|
||||
vmul.f32 s3 , s1, s2 // ssq * ( scale / x ) * ( scale / x )
|
||||
vadd.f32 s1 , s3, s7 // ssq = 1 + ssq * ( scale / x ) * ( scale / x )
|
||||
vmov.f32 s0 , s5 // scale = x
|
||||
|
||||
KERNEL_F1_END_\@:
|
||||
2: /* KERNEL_F1_END_\@: */
|
||||
|
||||
|
||||
.endm
|
||||
@@ -354,37 +354,37 @@ KERNEL_F1_END_\@:
|
||||
|
||||
vcmpe.f32 s4, s6 // compare with 0.0
|
||||
vmrs APSR_nzcv, fpscr
|
||||
beq KERNEL_S1_NEXT_\@
|
||||
beq 1f /* KERNEL_S1_NEXT_\@ */
|
||||
vabs.f32 s4, s4
|
||||
vcmpe.f32 s0, s4 // compare with scale
|
||||
vmrs APSR_nzcv, fpscr
|
||||
vdivge.f32 s2 , s4, s0 // scale >= x ? x / scale
|
||||
vmlage.f32 s1 , s2 , s2 // ssq += ( x/scale ) * ( x/scale )
|
||||
bge KERNEL_S1_NEXT_\@
|
||||
bge 1f /* KERNEL_S1_NEXT_\@ */
|
||||
vdiv.f32 s2 , s0, s4 // scale / x
|
||||
vmul.f32 s2 , s2, s2 // ( scale / x ) * ( scale / x )
|
||||
vmul.f32 s3 , s1, s2 // ssq * ( scale / x ) * ( scale / x )
|
||||
vadd.f32 s1 , s3, s7 // ssq = 1 + ssq * ( scale / x ) * ( scale / x )
|
||||
vmov.f32 s0 , s4 // scale = x
|
||||
|
||||
KERNEL_S1_NEXT_\@:
|
||||
1: /* KERNEL_S1_NEXT_\@: */
|
||||
|
||||
vcmpe.f32 s5, s6 // compare with 0.0
|
||||
vmrs APSR_nzcv, fpscr
|
||||
beq KERNEL_S1_END_\@
|
||||
beq 2f /* KERNEL_S1_END_\@ */
|
||||
vabs.f32 s5, s5
|
||||
vcmpe.f32 s0, s5 // compare with scale
|
||||
vmrs APSR_nzcv, fpscr
|
||||
vdivge.f32 s2 , s5, s0 // scale >= x ? x / scale
|
||||
vmlage.f32 s1 , s2 , s2 // ssq += ( x/scale ) * ( x/scale )
|
||||
bge KERNEL_S1_END_\@
|
||||
bge 2f /* KERNEL_S1_END_\@ */
|
||||
vdiv.f32 s2 , s0, s5 // scale / x
|
||||
vmul.f32 s2 , s2, s2 // ( scale / x ) * ( scale / x )
|
||||
vmul.f32 s3 , s1, s2 // ssq * ( scale / x ) * ( scale / x )
|
||||
vadd.f32 s1 , s3, s7 // ssq = 1 + ssq * ( scale / x ) * ( scale / x )
|
||||
vmov.f32 s0 , s5 // scale = x
|
||||
|
||||
KERNEL_S1_END_\@:
|
||||
2: /* KERNEL_S1_END_\@: */
|
||||
|
||||
add X, X, INC_X
|
||||
|
||||
|
||||
@@ -29,23 +29,76 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
* trivial copy of asum.c with the ABS() removed *
|
||||
**************************************************************************************/
|
||||
|
||||
|
||||
#include "common.h"
|
||||
#include "../simd/intrin.h"
|
||||
#include <math.h>
|
||||
|
||||
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
|
||||
{
|
||||
BLASLONG i=0;
|
||||
BLASLONG i = 0;
|
||||
FLOAT sumf = 0.0;
|
||||
if (n <= 0 || inc_x <= 0) return(sumf);
|
||||
|
||||
if (n <= 0 || inc_x <= 0)
|
||||
return (sumf);
|
||||
n *= inc_x;
|
||||
while(i < n)
|
||||
if (inc_x == 1)
|
||||
{
|
||||
#if V_SIMD
|
||||
#ifdef DOUBLE
|
||||
const int vstep = v_nlanes_f64;
|
||||
const int unrollx2 = n & (-vstep * 2);
|
||||
const int unrollx = n & -vstep;
|
||||
v_f64 vsum0 = v_zero_f64();
|
||||
v_f64 vsum1 = v_zero_f64();
|
||||
while (i < unrollx2)
|
||||
{
|
||||
vsum0 = v_add_f64(vsum0, v_loadu_f64(x));
|
||||
vsum1 = v_add_f64(vsum1, v_loadu_f64(x + vstep));
|
||||
i += vstep * 2;
|
||||
}
|
||||
vsum0 = v_add_f64(vsum0, vsum1);
|
||||
while (i < unrollx)
|
||||
{
|
||||
vsum0 = v_add_f64(vsum0, v_loadu_f64(x + i));
|
||||
i += vstep;
|
||||
}
|
||||
sumf = v_sum_f64(vsum0);
|
||||
#else
|
||||
const int vstep = v_nlanes_f32;
|
||||
const int unrollx4 = n & (-vstep * 4);
|
||||
const int unrollx = n & -vstep;
|
||||
v_f32 vsum0 = v_zero_f32();
|
||||
v_f32 vsum1 = v_zero_f32();
|
||||
v_f32 vsum2 = v_zero_f32();
|
||||
v_f32 vsum3 = v_zero_f32();
|
||||
while (i < unrollx4)
|
||||
{
|
||||
vsum0 = v_add_f32(vsum0, v_loadu_f32(x));
|
||||
vsum1 = v_add_f32(vsum1, v_loadu_f32(x + vstep));
|
||||
vsum2 = v_add_f32(vsum2, v_loadu_f32(x + vstep * 2));
|
||||
vsum3 = v_add_f32(vsum3, v_loadu_f32(x + vstep * 3));
|
||||
i += vstep * 4;
|
||||
}
|
||||
vsum0 = v_add_f32(
|
||||
v_add_f32(vsum0, vsum1), v_add_f32(vsum2, vsum3));
|
||||
while (i < unrollx)
|
||||
{
|
||||
vsum0 = v_add_f32(vsum0, v_loadu_f32(x + i));
|
||||
i += vstep;
|
||||
}
|
||||
sumf = v_sum_f32(vsum0);
|
||||
#endif
|
||||
#else
|
||||
int n1 = n & -4;
|
||||
for (; i < n1; i += 4)
|
||||
{
|
||||
sumf += x[i] + x[i + 1] + x[i + 2] + x[i + 3];
|
||||
}
|
||||
#endif
|
||||
}
|
||||
while (i < n)
|
||||
{
|
||||
sumf += x[i];
|
||||
i += inc_x;
|
||||
}
|
||||
return(sumf);
|
||||
return (sumf);
|
||||
}
|
||||
|
||||
|
||||
|
||||
@@ -48,10 +48,12 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLA
|
||||
|
||||
dot[0]=0.0;
|
||||
dot[1]=0.0;
|
||||
|
||||
#if !defined(__PPC__)
|
||||
CREAL(result) = 0.0 ;
|
||||
CIMAG(result) = 0.0 ;
|
||||
|
||||
#else
|
||||
result = OPENBLAS_MAKE_COMPLEX_FLOAT(0.0,0.0);
|
||||
#endif
|
||||
if ( n < 1 ) return(result);
|
||||
|
||||
inc_x2 = 2 * inc_x ;
|
||||
@@ -71,8 +73,12 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLA
|
||||
i++ ;
|
||||
|
||||
}
|
||||
CREAL(result) = dot[0];
|
||||
#if !defined(__POWER__)
|
||||
CREAL(result) = dot[0];
|
||||
CIMAG(result) = dot[1];
|
||||
#else
|
||||
result = OPENBLAS_MAKE_COMPLEX_FLOAT(dot[0],dot[1]);
|
||||
#endif
|
||||
return(result);
|
||||
|
||||
}
|
||||
|
||||
@@ -97,7 +97,7 @@ CNRM2KERNEL = znrm2.S
|
||||
ZNRM2KERNEL = znrm2.S
|
||||
|
||||
DDOTKERNEL = dot.S
|
||||
SDOTKERNEL = dot.S
|
||||
SDOTKERNEL = ../generic/dot.c
|
||||
CDOTKERNEL = zdot.S
|
||||
ZDOTKERNEL = zdot.S
|
||||
DSDOTKERNEL = dot.S
|
||||
|
||||
@@ -1,3 +1,187 @@
|
||||
include $(KERNELDIR)/KERNEL.ARMV8
|
||||
SAMINKERNEL = ../arm/amin.c
|
||||
DAMINKERNEL = ../arm/amin.c
|
||||
CAMINKERNEL = ../arm/zamin.c
|
||||
ZAMINKERNEL = ../arm/zamin.c
|
||||
|
||||
SMAXKERNEL = ../arm/max.c
|
||||
DMAXKERNEL = ../arm/max.c
|
||||
|
||||
SMINKERNEL = ../arm/min.c
|
||||
DMINKERNEL = ../arm/min.c
|
||||
|
||||
ISAMINKERNEL = ../arm/iamin.c
|
||||
IDAMINKERNEL = ../arm/iamin.c
|
||||
ICAMINKERNEL = ../arm/izamin.c
|
||||
IZAMINKERNEL = ../arm/izamin.c
|
||||
|
||||
ISMAXKERNEL = ../arm/imax.c
|
||||
IDMAXKERNEL = ../arm/imax.c
|
||||
|
||||
ISMINKERNEL = ../arm/imin.c
|
||||
IDMINKERNEL = ../arm/imin.c
|
||||
|
||||
STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||
STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||
STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||
STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||
|
||||
DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||
DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||
DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||
DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||
|
||||
CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||
CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||
CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||
CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||
|
||||
ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||
ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||
ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||
ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||
|
||||
SAMAXKERNEL = amax.S
|
||||
DAMAXKERNEL = amax.S
|
||||
CAMAXKERNEL = zamax.S
|
||||
ZAMAXKERNEL = zamax.S
|
||||
|
||||
SAXPYKERNEL = axpy.S
|
||||
DAXPYKERNEL = axpy.S
|
||||
CAXPYKERNEL = zaxpy.S
|
||||
ZAXPYKERNEL = zaxpy.S
|
||||
|
||||
SROTKERNEL = rot.S
|
||||
DROTKERNEL = rot.S
|
||||
CROTKERNEL = zrot.S
|
||||
ZROTKERNEL = zrot.S
|
||||
|
||||
SSCALKERNEL = scal.S
|
||||
DSCALKERNEL = scal.S
|
||||
CSCALKERNEL = zscal.S
|
||||
ZSCALKERNEL = zscal.S
|
||||
|
||||
SGEMVNKERNEL = gemv_n.S
|
||||
DGEMVNKERNEL = gemv_n.S
|
||||
CGEMVNKERNEL = zgemv_n.S
|
||||
ZGEMVNKERNEL = zgemv_n.S
|
||||
|
||||
SGEMVTKERNEL = gemv_t.S
|
||||
DGEMVTKERNEL = gemv_t.S
|
||||
CGEMVTKERNEL = zgemv_t.S
|
||||
ZGEMVTKERNEL = zgemv_t.S
|
||||
|
||||
|
||||
SASUMKERNEL = asum.S
|
||||
DASUMKERNEL = asum.S
|
||||
CASUMKERNEL = casum.S
|
||||
ZASUMKERNEL = zasum.S
|
||||
|
||||
SCOPYKERNEL = copy.S
|
||||
DCOPYKERNEL = copy.S
|
||||
CCOPYKERNEL = copy.S
|
||||
ZCOPYKERNEL = copy.S
|
||||
|
||||
SSWAPKERNEL = swap.S
|
||||
DSWAPKERNEL = swap.S
|
||||
CSWAPKERNEL = swap.S
|
||||
ZSWAPKERNEL = swap.S
|
||||
|
||||
ISAMAXKERNEL = iamax.S
|
||||
IDAMAXKERNEL = iamax.S
|
||||
ICAMAXKERNEL = izamax.S
|
||||
IZAMAXKERNEL = izamax.S
|
||||
|
||||
SNRM2KERNEL = nrm2.S
|
||||
DNRM2KERNEL = nrm2.S
|
||||
CNRM2KERNEL = znrm2.S
|
||||
ZNRM2KERNEL = znrm2.S
|
||||
|
||||
DDOTKERNEL = dot.S
|
||||
SDOTKERNEL = ../generic/dot.c
|
||||
CDOTKERNEL = zdot.S
|
||||
ZDOTKERNEL = zdot.S
|
||||
DSDOTKERNEL = dot.S
|
||||
|
||||
DGEMM_BETA = dgemm_beta.S
|
||||
SGEMM_BETA = sgemm_beta.S
|
||||
|
||||
ifeq ($(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N), 8x8)
|
||||
SGEMMKERNEL = sgemm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N)_cortexa53.S
|
||||
STRMMKERNEL = strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N)_cortexa53.S
|
||||
else
|
||||
SGEMMKERNEL = sgemm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S
|
||||
STRMMKERNEL = strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S
|
||||
endif
|
||||
ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N))
|
||||
ifeq ($(SGEMM_UNROLL_M), 16)
|
||||
SGEMMITCOPY = sgemm_tcopy_$(SGEMM_UNROLL_M).S
|
||||
else
|
||||
SGEMMITCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_M).c
|
||||
endif
|
||||
ifeq ($(SGEMM_UNROLL_M), 4)
|
||||
SGEMMINCOPY = sgemm_ncopy_$(SGEMM_UNROLL_M).S
|
||||
else
|
||||
SGEMMINCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_M).c
|
||||
endif
|
||||
SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||
SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||
endif
|
||||
|
||||
SGEMMOTCOPY = sgemm_tcopy_$(SGEMM_UNROLL_N).S
|
||||
SGEMMONCOPY = sgemm_ncopy_$(SGEMM_UNROLL_N).S
|
||||
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
|
||||
DGEMMKERNEL = dgemm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S
|
||||
DTRMMKERNEL = dtrmm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S
|
||||
|
||||
ifneq ($(DGEMM_UNROLL_M), $(DGEMM_UNROLL_N))
|
||||
|
||||
ifeq ($(DGEMM_UNROLL_M), 8)
|
||||
DGEMMINCOPY = dgemm_ncopy_$(DGEMM_UNROLL_M).S
|
||||
DGEMMITCOPY = dgemm_tcopy_$(DGEMM_UNROLL_M).S
|
||||
else
|
||||
DGEMMINCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_M).c
|
||||
DGEMMITCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_M).c
|
||||
endif
|
||||
|
||||
DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||
DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||
endif
|
||||
|
||||
ifeq ($(DGEMM_UNROLL_N), 4)
|
||||
DGEMMONCOPY = dgemm_ncopy_$(DGEMM_UNROLL_N).S
|
||||
DGEMMOTCOPY = dgemm_tcopy_$(DGEMM_UNROLL_N).S
|
||||
else
|
||||
DGEMMONCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_N).c
|
||||
DGEMMOTCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_N).c
|
||||
endif
|
||||
|
||||
DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
|
||||
CGEMMKERNEL = cgemm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S
|
||||
CTRMMKERNEL = ctrmm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S
|
||||
ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N))
|
||||
CGEMMINCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_M).c
|
||||
CGEMMITCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_M).c
|
||||
CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||
CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||
endif
|
||||
CGEMMONCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_N).c
|
||||
CGEMMOTCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_N).c
|
||||
CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
|
||||
ZGEMMKERNEL = zgemm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S
|
||||
ZTRMMKERNEL = ztrmm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S
|
||||
ifneq ($(ZGEMM_UNROLL_M), $(ZGEMM_UNROLL_N))
|
||||
ZGEMMINCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_M).c
|
||||
ZGEMMITCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_M).c
|
||||
ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||
ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||
endif
|
||||
ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c
|
||||
ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c
|
||||
ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
|
||||
@@ -70,7 +70,7 @@ DCOPYKERNEL = copy.S
|
||||
CCOPYKERNEL = copy.S
|
||||
ZCOPYKERNEL = copy.S
|
||||
|
||||
SDOTKERNEL = dot.S
|
||||
SDOTKERNEL = ../generic/dot.c
|
||||
DDOTKERNEL = dot.S
|
||||
CDOTKERNEL = zdot.S
|
||||
ZDOTKERNEL = zdot.S
|
||||
|
||||
189
kernel/arm64/KERNEL.NEOVERSEN1
Normal file
189
kernel/arm64/KERNEL.NEOVERSEN1
Normal file
@@ -0,0 +1,189 @@
|
||||
SAMINKERNEL = ../arm/amin.c
|
||||
DAMINKERNEL = ../arm/amin.c
|
||||
CAMINKERNEL = ../arm/zamin.c
|
||||
ZAMINKERNEL = ../arm/zamin.c
|
||||
|
||||
SMAXKERNEL = ../arm/max.c
|
||||
DMAXKERNEL = ../arm/max.c
|
||||
|
||||
SMINKERNEL = ../arm/min.c
|
||||
DMINKERNEL = ../arm/min.c
|
||||
|
||||
ISAMINKERNEL = ../arm/iamin.c
|
||||
IDAMINKERNEL = ../arm/iamin.c
|
||||
ICAMINKERNEL = ../arm/izamin.c
|
||||
IZAMINKERNEL = ../arm/izamin.c
|
||||
|
||||
ISMAXKERNEL = ../arm/imax.c
|
||||
IDMAXKERNEL = ../arm/imax.c
|
||||
|
||||
ISMINKERNEL = ../arm/imin.c
|
||||
IDMINKERNEL = ../arm/imin.c
|
||||
|
||||
STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||
STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||
STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||
STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||
|
||||
DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||
DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||
DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||
DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||
|
||||
CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||
CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||
CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||
CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||
|
||||
ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||
ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||
ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||
ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||
|
||||
SAMAXKERNEL = amax.S
|
||||
DAMAXKERNEL = amax.S
|
||||
CAMAXKERNEL = zamax.S
|
||||
ZAMAXKERNEL = zamax.S
|
||||
|
||||
SAXPYKERNEL = axpy.S
|
||||
DAXPYKERNEL = daxpy_thunderx2t99.S
|
||||
CAXPYKERNEL = zaxpy.S
|
||||
ZAXPYKERNEL = zaxpy.S
|
||||
|
||||
SROTKERNEL = rot.S
|
||||
DROTKERNEL = rot.S
|
||||
CROTKERNEL = zrot.S
|
||||
ZROTKERNEL = zrot.S
|
||||
|
||||
SSCALKERNEL = scal.S
|
||||
DSCALKERNEL = scal.S
|
||||
CSCALKERNEL = zscal.S
|
||||
ZSCALKERNEL = zscal.S
|
||||
|
||||
SGEMVNKERNEL = gemv_n.S
|
||||
DGEMVNKERNEL = gemv_n.S
|
||||
CGEMVNKERNEL = zgemv_n.S
|
||||
ZGEMVNKERNEL = zgemv_n.S
|
||||
|
||||
SGEMVTKERNEL = gemv_t.S
|
||||
DGEMVTKERNEL = gemv_t.S
|
||||
CGEMVTKERNEL = zgemv_t.S
|
||||
ZGEMVTKERNEL = zgemv_t.S
|
||||
|
||||
|
||||
SASUMKERNEL = sasum_thunderx2t99.c
|
||||
DASUMKERNEL = dasum_thunderx2t99.c
|
||||
CASUMKERNEL = casum_thunderx2t99.c
|
||||
ZASUMKERNEL = zasum_thunderx2t99.c
|
||||
|
||||
SCOPYKERNEL = copy_thunderx2t99.c
|
||||
DCOPYKERNEL = copy_thunderx2t99.c
|
||||
CCOPYKERNEL = copy_thunderx2t99.c
|
||||
ZCOPYKERNEL = copy_thunderx2t99.c
|
||||
|
||||
SSWAPKERNEL = swap_thunderx2t99.S
|
||||
DSWAPKERNEL = swap_thunderx2t99.S
|
||||
CSWAPKERNEL = swap_thunderx2t99.S
|
||||
ZSWAPKERNEL = swap_thunderx2t99.S
|
||||
|
||||
ISAMAXKERNEL = iamax_thunderx2t99.c
|
||||
IDAMAXKERNEL = iamax_thunderx2t99.c
|
||||
ICAMAXKERNEL = izamax_thunderx2t99.c
|
||||
IZAMAXKERNEL = izamax_thunderx2t99.c
|
||||
|
||||
SNRM2KERNEL = scnrm2_thunderx2t99.c
|
||||
DNRM2KERNEL = dznrm2_thunderx2t99.c
|
||||
CNRM2KERNEL = scnrm2_thunderx2t99.c
|
||||
ZNRM2KERNEL = dznrm2_thunderx2t99.c
|
||||
|
||||
DDOTKERNEL = dot_thunderx2t99.c
|
||||
SDOTKERNEL = dot_thunderx2t99.c
|
||||
CDOTKERNEL = zdot_thunderx2t99.c
|
||||
ZDOTKERNEL = zdot_thunderx2t99.c
|
||||
DSDOTKERNEL = dot.S
|
||||
|
||||
DGEMM_BETA = dgemm_beta.S
|
||||
SGEMM_BETA = sgemm_beta.S
|
||||
|
||||
SGEMMKERNEL = sgemm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S
|
||||
STRMMKERNEL = strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S
|
||||
ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N))
|
||||
ifeq ($(SGEMM_UNROLL_M), 16)
|
||||
SGEMMITCOPY = sgemm_tcopy_$(SGEMM_UNROLL_M).S
|
||||
else
|
||||
SGEMMITCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_M).c
|
||||
endif
|
||||
ifeq ($(SGEMM_UNROLL_M), 4)
|
||||
SGEMMINCOPY = sgemm_ncopy_$(SGEMM_UNROLL_M).S
|
||||
else
|
||||
SGEMMINCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_M).c
|
||||
endif
|
||||
SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||
SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||
endif
|
||||
ifeq ($(SGEMM_UNROLL_N), 16)
|
||||
SGEMMOTCOPY = sgemm_tcopy_$(SGEMM_UNROLL_N).S
|
||||
else
|
||||
SGEMMOTCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_N).c
|
||||
endif
|
||||
ifeq ($(SGEMM_UNROLL_N), 4)
|
||||
SGEMMONCOPY = sgemm_ncopy_$(SGEMM_UNROLL_N).S
|
||||
else
|
||||
SGEMMONCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_N).c
|
||||
endif
|
||||
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
|
||||
DGEMMKERNEL = dgemm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S
|
||||
DTRMMKERNEL = dtrmm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S
|
||||
|
||||
ifneq ($(DGEMM_UNROLL_M), $(DGEMM_UNROLL_N))
|
||||
|
||||
ifeq ($(DGEMM_UNROLL_M), 8)
|
||||
DGEMMINCOPY = dgemm_ncopy_$(DGEMM_UNROLL_M).S
|
||||
DGEMMITCOPY = dgemm_tcopy_$(DGEMM_UNROLL_M).S
|
||||
else
|
||||
DGEMMINCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_M).c
|
||||
DGEMMITCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_M).c
|
||||
endif
|
||||
|
||||
DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||
DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||
endif
|
||||
|
||||
ifeq ($(DGEMM_UNROLL_N), 4)
|
||||
DGEMMONCOPY = dgemm_ncopy_$(DGEMM_UNROLL_N).S
|
||||
DGEMMOTCOPY = dgemm_tcopy_$(DGEMM_UNROLL_N).S
|
||||
else
|
||||
DGEMMONCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_N).c
|
||||
DGEMMOTCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_N).c
|
||||
endif
|
||||
|
||||
DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
|
||||
CGEMMKERNEL = cgemm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S
|
||||
CTRMMKERNEL = ctrmm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S
|
||||
ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N))
|
||||
CGEMMINCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_M).c
|
||||
CGEMMITCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_M).c
|
||||
CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||
CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||
endif
|
||||
CGEMMONCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_N).c
|
||||
CGEMMOTCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_N).c
|
||||
CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
|
||||
ZGEMMKERNEL = zgemm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S
|
||||
ZTRMMKERNEL = ztrmm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S
|
||||
ifneq ($(ZGEMM_UNROLL_M), $(ZGEMM_UNROLL_N))
|
||||
ZGEMMINCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_M).c
|
||||
ZGEMMITCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_M).c
|
||||
ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||
ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||
endif
|
||||
ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c
|
||||
ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c
|
||||
ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
184
kernel/arm64/KERNEL.THUNDERX3T110
Normal file
184
kernel/arm64/KERNEL.THUNDERX3T110
Normal file
@@ -0,0 +1,184 @@
|
||||
SAMINKERNEL = ../arm/amin.c
|
||||
DAMINKERNEL = ../arm/amin.c
|
||||
CAMINKERNEL = ../arm/zamin.c
|
||||
ZAMINKERNEL = ../arm/zamin.c
|
||||
|
||||
SMAXKERNEL = ../arm/max.c
|
||||
DMAXKERNEL = ../arm/max.c
|
||||
|
||||
SMINKERNEL = ../arm/min.c
|
||||
DMINKERNEL = ../arm/min.c
|
||||
|
||||
ISAMINKERNEL = ../arm/iamin.c
|
||||
IDAMINKERNEL = ../arm/iamin.c
|
||||
ICAMINKERNEL = ../arm/izamin.c
|
||||
IZAMINKERNEL = ../arm/izamin.c
|
||||
|
||||
ISMAXKERNEL = ../arm/imax.c
|
||||
IDMAXKERNEL = ../arm/imax.c
|
||||
|
||||
ISMINKERNEL = ../arm/imin.c
|
||||
IDMINKERNEL = ../arm/imin.c
|
||||
|
||||
STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||
STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||
STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||
STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||
|
||||
DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||
DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||
DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||
DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||
|
||||
CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||
CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||
CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||
CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||
|
||||
ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||
ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||
ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||
ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||
|
||||
SAMAXKERNEL = amax.S
|
||||
DAMAXKERNEL = amax.S
|
||||
CAMAXKERNEL = zamax.S
|
||||
ZAMAXKERNEL = zamax.S
|
||||
|
||||
SAXPYKERNEL = axpy.S
|
||||
DAXPYKERNEL = daxpy_thunderx2t99.S
|
||||
CAXPYKERNEL = zaxpy.S
|
||||
ZAXPYKERNEL = zaxpy.S
|
||||
|
||||
SROTKERNEL = rot.S
|
||||
DROTKERNEL = rot.S
|
||||
CROTKERNEL = zrot.S
|
||||
ZROTKERNEL = zrot.S
|
||||
|
||||
SSCALKERNEL = scal.S
|
||||
DSCALKERNEL = scal.S
|
||||
CSCALKERNEL = zscal.S
|
||||
ZSCALKERNEL = zscal.S
|
||||
|
||||
SGEMVNKERNEL = gemv_n.S
|
||||
DGEMVNKERNEL = gemv_n.S
|
||||
CGEMVNKERNEL = zgemv_n.S
|
||||
ZGEMVNKERNEL = zgemv_n.S
|
||||
|
||||
SGEMVTKERNEL = gemv_t.S
|
||||
DGEMVTKERNEL = gemv_t.S
|
||||
CGEMVTKERNEL = zgemv_t.S
|
||||
ZGEMVTKERNEL = zgemv_t.S
|
||||
|
||||
STRMMKERNEL = strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S
|
||||
ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N))
|
||||
SGEMMINCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_M).c
|
||||
SGEMMITCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_M).c
|
||||
SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||
SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||
endif
|
||||
SGEMMONCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_N).c
|
||||
SGEMMOTCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_N).c
|
||||
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
|
||||
DTRMMKERNEL = dtrmm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S
|
||||
|
||||
ifneq ($(DGEMM_UNROLL_M), $(DGEMM_UNROLL_N))
|
||||
|
||||
ifeq ($(DGEMM_UNROLL_M), 8)
|
||||
DGEMMINCOPY = dgemm_ncopy_$(DGEMM_UNROLL_M).S
|
||||
DGEMMITCOPY = dgemm_tcopy_$(DGEMM_UNROLL_M).S
|
||||
else
|
||||
DGEMMINCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_M).c
|
||||
DGEMMITCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_M).c
|
||||
endif
|
||||
|
||||
DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||
DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||
endif
|
||||
|
||||
ifeq ($(DGEMM_UNROLL_N), 4)
|
||||
DGEMMONCOPY = dgemm_ncopy_$(DGEMM_UNROLL_N).S
|
||||
DGEMMOTCOPY = dgemm_tcopy_$(DGEMM_UNROLL_N).S
|
||||
else
|
||||
DGEMMONCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_N).c
|
||||
DGEMMOTCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_N).c
|
||||
endif
|
||||
|
||||
DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
|
||||
CTRMMKERNEL = ctrmm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S
|
||||
ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N))
|
||||
CGEMMINCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_M).c
|
||||
CGEMMITCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_M).c
|
||||
CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||
CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||
endif
|
||||
CGEMMONCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_N).c
|
||||
CGEMMOTCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_N).c
|
||||
CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
|
||||
ZTRMMKERNEL = ztrmm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S
|
||||
ifneq ($(ZGEMM_UNROLL_M), $(ZGEMM_UNROLL_N))
|
||||
ZGEMMINCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_M).c
|
||||
ZGEMMITCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_M).c
|
||||
ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||
ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||
endif
|
||||
ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c
|
||||
ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c
|
||||
ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
|
||||
SASUMKERNEL = sasum_thunderx2t99.c
|
||||
DASUMKERNEL = dasum_thunderx2t99.c
|
||||
CASUMKERNEL = casum_thunderx2t99.c
|
||||
ZASUMKERNEL = zasum_thunderx2t99.c
|
||||
|
||||
SCOPYKERNEL = copy_thunderx2t99.c
|
||||
DCOPYKERNEL = copy_thunderx2t99.c
|
||||
CCOPYKERNEL = copy_thunderx2t99.c
|
||||
ZCOPYKERNEL = copy_thunderx2t99.c
|
||||
|
||||
SSWAPKERNEL = swap_thunderx2t99.S
|
||||
DSWAPKERNEL = swap_thunderx2t99.S
|
||||
CSWAPKERNEL = swap_thunderx2t99.S
|
||||
ZSWAPKERNEL = swap_thunderx2t99.S
|
||||
|
||||
ISAMAXKERNEL = iamax_thunderx2t99.c
|
||||
IDAMAXKERNEL = iamax_thunderx2t99.c
|
||||
ICAMAXKERNEL = izamax_thunderx2t99.c
|
||||
IZAMAXKERNEL = izamax_thunderx2t99.c
|
||||
|
||||
SNRM2KERNEL = scnrm2_thunderx2t99.c
|
||||
CNRM2KERNEL = scnrm2_thunderx2t99.c
|
||||
#DNRM2KERNEL = dznrm2_thunderx2t99_fast.c
|
||||
#ZNRM2KERNEL = dznrm2_thunderx2t99_fast.c
|
||||
DNRM2KERNEL = dznrm2_thunderx2t99.c
|
||||
ZNRM2KERNEL = dznrm2_thunderx2t99.c
|
||||
|
||||
|
||||
DDOTKERNEL = dot_thunderx2t99.c
|
||||
SDOTKERNEL = dot_thunderx2t99.c
|
||||
CDOTKERNEL = zdot_thunderx2t99.c
|
||||
ZDOTKERNEL = zdot_thunderx2t99.c
|
||||
DSDOTKERNEL = dot.S
|
||||
|
||||
ifeq ($(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N), 8x4)
|
||||
DGEMMKERNEL = dgemm_kernel_8x4_thunderx2t99.S
|
||||
endif
|
||||
|
||||
ifeq ($(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N), 16x4)
|
||||
SGEMMKERNEL = sgemm_kernel_16x4_thunderx2t99.S
|
||||
endif
|
||||
|
||||
ifeq ($(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N), 8x4)
|
||||
CGEMMKERNEL = cgemm_kernel_8x4_thunderx2t99.S
|
||||
endif
|
||||
|
||||
ifeq ($(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N), 4x4)
|
||||
ZGEMMKERNEL = zgemm_kernel_4x4_thunderx2t99.S
|
||||
endif
|
||||
1
kernel/arm64/KERNEL.VORTEX
Normal file
1
kernel/arm64/KERNEL.VORTEX
Normal file
@@ -0,0 +1 @@
|
||||
include $(KERNELDIR)/KERNEL.ARMV8
|
||||
@@ -62,7 +62,7 @@ static void daxpy_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
|
||||
y5 = a * x[5] + y[5];
|
||||
y6 = a * x[6] + y[6];
|
||||
y7 = a * x[7] + y[7];
|
||||
asm("":"+w"(y0),"+w"(y1),"+w"(y2),"+w"(y3),"+w"(y4),"+w"(y5),"+w"(y6),"+w"(y7));
|
||||
__asm__("":"+w"(y0),"+w"(y1),"+w"(y2),"+w"(y3),"+w"(y4),"+w"(y5),"+w"(y6),"+w"(y7));
|
||||
y[0] = y0;
|
||||
y[1] = y1;
|
||||
y[2] = y2;
|
||||
@@ -74,7 +74,7 @@ static void daxpy_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
|
||||
|
||||
xx = (x + 4*128/sizeof(*x));
|
||||
yy = (y + 4*128/sizeof(*y));
|
||||
asm("":"+r"(yy)::"memory");
|
||||
__asm__("":"+r"(yy)::"memory");
|
||||
prefetch(xx);
|
||||
prefetch(yy);
|
||||
|
||||
|
||||
@@ -98,11 +98,58 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
add X, X, #128
|
||||
.endm
|
||||
|
||||
/*
|
||||
* No need to do software prefetches if the vector fits
|
||||
* into L1 cache
|
||||
*/
|
||||
.macro KERNEL_F16_L1CACHE
|
||||
ldp q4, q5, [X]
|
||||
ldp q16, q17, [Y]
|
||||
|
||||
ldp q6, q7, [X, #32]
|
||||
ldp q18, q19, [Y, #32]
|
||||
|
||||
fmla v16.2d, v4.2d, v0.d[0]
|
||||
fmla v17.2d, v5.2d, v0.d[0]
|
||||
|
||||
stp q16, q17, [Y]
|
||||
|
||||
ldp q20, q21, [X, #64]
|
||||
ldp q24, q25, [Y, #64]
|
||||
|
||||
fmla v18.2d, v6.2d, v0.d[0]
|
||||
fmla v19.2d, v7.2d, v0.d[0]
|
||||
|
||||
stp q18, q19, [Y, #32]
|
||||
|
||||
ldp q22, q23, [X, #96]
|
||||
ldp q26, q27, [Y, #96]
|
||||
|
||||
fmla v24.2d, v20.2d, v0.d[0]
|
||||
fmla v25.2d, v21.2d, v0.d[0]
|
||||
|
||||
stp q24, q25, [Y, #64]
|
||||
|
||||
fmla v26.2d, v22.2d, v0.d[0]
|
||||
fmla v27.2d, v23.2d, v0.d[0]
|
||||
|
||||
stp q26, q27, [Y, #96]
|
||||
|
||||
add Y, Y, #128
|
||||
add X, X, #128
|
||||
.endm
|
||||
|
||||
.macro KERNEL_F32
|
||||
KERNEL_F16
|
||||
KERNEL_F16
|
||||
.endm
|
||||
|
||||
|
||||
.macro KERNEL_F32_L1CACHE
|
||||
KERNEL_F16_L1CACHE
|
||||
KERNEL_F16_L1CACHE
|
||||
.endm
|
||||
|
||||
.macro INIT_S
|
||||
lsl INC_X, INC_X, #3
|
||||
lsl INC_Y, INC_Y, #3
|
||||
@@ -138,6 +185,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
cmp I, xzr
|
||||
beq .Ldaxpy_kernel_F1
|
||||
|
||||
cmp N, #2048
|
||||
ble .Ldaxpy_kernel_F32_L1CACHE
|
||||
|
||||
.align 5
|
||||
.Ldaxpy_kernel_F32:
|
||||
|
||||
@@ -145,6 +195,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
subs I, I, #1
|
||||
bne .Ldaxpy_kernel_F32
|
||||
b .Ldaxpy_kernel_F1
|
||||
|
||||
.align 5
|
||||
.Ldaxpy_kernel_F32_L1CACHE:
|
||||
|
||||
KERNEL_F32_L1CACHE
|
||||
|
||||
subs I, I, #1
|
||||
bne .Ldaxpy_kernel_F32_L1CACHE
|
||||
|
||||
.Ldaxpy_kernel_F1:
|
||||
|
||||
|
||||
@@ -81,14 +81,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
.endm
|
||||
|
||||
.macro INIT_ZERO
|
||||
fmul v0.2d, v0.2d, betaV0
|
||||
fmul v1.2d, v1.2d, betaV0
|
||||
fmul v2.2d, v2.2d, betaV0
|
||||
fmul v3.2d, v3.2d, betaV0
|
||||
fmul v4.2d, v4.2d, betaV0
|
||||
fmul v5.2d, v5.2d, betaV0
|
||||
fmul v6.2d, v6.2d, betaV0
|
||||
fmul v7.2d, v7.2d, betaV0
|
||||
movi v0.2d, #0000000000000000
|
||||
movi v1.2d, #0000000000000000
|
||||
movi v2.2d, #0000000000000000
|
||||
movi v3.2d, #0000000000000000
|
||||
movi v4.2d, #0000000000000000
|
||||
movi v5.2d, #0000000000000000
|
||||
movi v6.2d, #0000000000000000
|
||||
movi v7.2d, #0000000000000000
|
||||
.endm
|
||||
|
||||
/**************************************************************************************
|
||||
|
||||
@@ -81,14 +81,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
.endm
|
||||
|
||||
.macro INIT_ZERO
|
||||
fmul v0.4s, v0.4s, betaV0
|
||||
fmul v1.4s, v1.4s, betaV0
|
||||
fmul v2.4s, v2.4s, betaV0
|
||||
fmul v3.4s, v3.4s, betaV0
|
||||
fmul v4.4s, v4.4s, betaV0
|
||||
fmul v5.4s, v5.4s, betaV0
|
||||
fmul v6.4s, v6.4s, betaV0
|
||||
fmul v7.4s, v7.4s, betaV0
|
||||
movi v0.4s, #0x0
|
||||
movi v1.4s, #0x0
|
||||
movi v2.4s, #0x0
|
||||
movi v3.4s, #0x0
|
||||
movi v4.4s, #0x0
|
||||
movi v5.4s, #0x0
|
||||
movi v6.4s, #0x0
|
||||
movi v7.4s, #0x0
|
||||
.endm
|
||||
|
||||
/**************************************************************************************
|
||||
|
||||
2299
kernel/arm64/sgemm_kernel_8x8_cortexa53.S
Normal file
2299
kernel/arm64/sgemm_kernel_8x8_cortexa53.S
Normal file
File diff suppressed because it is too large
Load Diff
562
kernel/arm64/sgemm_ncopy_8.S
Normal file
562
kernel/arm64/sgemm_ncopy_8.S
Normal file
@@ -0,0 +1,562 @@
|
||||
/***************************************************************************
|
||||
Copyright (c) 2016, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A00 PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
#define ASSEMBLER
|
||||
#include "common.h"
|
||||
|
||||
#define M x0
|
||||
#define N x1
|
||||
#define A00 x2
|
||||
#define LDA x3
|
||||
#define B00 x4
|
||||
|
||||
#define A01 x5
|
||||
#define A02 x6
|
||||
#define A03 x7
|
||||
#define A04 x8
|
||||
#define A05 x9
|
||||
#define A06 x10
|
||||
#define A07 x11
|
||||
#define A08 x12
|
||||
|
||||
#define I x13
|
||||
#define J x14
|
||||
#define K x15
|
||||
|
||||
#define TEMP1 x16
|
||||
#define TEMP2 x17
|
||||
|
||||
/**************************************************************************************
|
||||
* Macro definitions
|
||||
**************************************************************************************/
|
||||
|
||||
.macro SAVE_REGS
|
||||
add sp, sp, #-(11 * 16)
|
||||
stp d8, d9, [sp, #(0 * 16)]
|
||||
stp d10, d11, [sp, #(1 * 16)]
|
||||
stp d12, d13, [sp, #(2 * 16)]
|
||||
stp d14, d15, [sp, #(3 * 16)]
|
||||
stp d16, d17, [sp, #(4 * 16)]
|
||||
stp x18, x19, [sp, #(5 * 16)]
|
||||
stp x20, x21, [sp, #(6 * 16)]
|
||||
stp x22, x23, [sp, #(7 * 16)]
|
||||
stp x24, x25, [sp, #(8 * 16)]
|
||||
stp x26, x27, [sp, #(9 * 16)]
|
||||
str x28, [sp, #(10 * 16)]
|
||||
.endm
|
||||
|
||||
.macro RESTORE_REGS
|
||||
ldp d8, d9, [sp, #(0 * 16)]
|
||||
ldp d10, d11, [sp, #(1 * 16)]
|
||||
ldp d12, d13, [sp, #(2 * 16)]
|
||||
ldp d14, d15, [sp, #(3 * 16)]
|
||||
ldp d16, d17, [sp, #(4 * 16)]
|
||||
ldp x18, x19, [sp, #(5 * 16)]
|
||||
ldp x20, x21, [sp, #(6 * 16)]
|
||||
ldp x22, x23, [sp, #(7 * 16)]
|
||||
ldp x24, x25, [sp, #(8 * 16)]
|
||||
ldp x26, x27, [sp, #(9 * 16)]
|
||||
ldr x28, [sp, #(10 * 16)]
|
||||
add sp, sp, #(11*16)
|
||||
.endm
|
||||
|
||||
.macro COPY4x8
|
||||
ldr q0, [A01], #16
|
||||
ldr q1, [A02], #16
|
||||
ins v8.s[0], v0.s[0]
|
||||
ins v10.s[0], v0.s[1]
|
||||
ins v12.s[0], v0.s[2]
|
||||
ins v14.s[0], v0.s[3]
|
||||
ins v8.s[1], v1.s[0]
|
||||
ins v10.s[1], v1.s[1]
|
||||
ins v12.s[1], v1.s[2]
|
||||
ins v14.s[1], v1.s[3]
|
||||
|
||||
ldr q2, [A03], #16
|
||||
ldr q3, [A04], #16
|
||||
ins v8.s[2], v2.s[0]
|
||||
ins v10.s[2], v2.s[1]
|
||||
ins v12.s[2], v2.s[2]
|
||||
ins v14.s[2], v2.s[3]
|
||||
ins v8.s[3], v3.s[0]
|
||||
ins v10.s[3], v3.s[1]
|
||||
ins v12.s[3], v3.s[2]
|
||||
ins v14.s[3], v3.s[3]
|
||||
|
||||
ldr q4, [A05], #16
|
||||
ldr q5, [A06], #16
|
||||
ins v9.s[0], v4.s[0]
|
||||
ins v11.s[0], v4.s[1]
|
||||
ins v13.s[0], v4.s[2]
|
||||
ins v15.s[0], v4.s[3]
|
||||
ins v9.s[1], v5.s[0]
|
||||
ins v11.s[1], v5.s[1]
|
||||
ins v13.s[1], v5.s[2]
|
||||
ins v15.s[1], v5.s[3]
|
||||
|
||||
ldr q6, [A07], #16
|
||||
ldr q7, [A08], #16
|
||||
ins v9.s[2], v6.s[0]
|
||||
ins v11.s[2], v6.s[1]
|
||||
ins v13.s[2], v6.s[2]
|
||||
ins v15.s[2], v6.s[3]
|
||||
ins v9.s[3], v7.s[0]
|
||||
ins v11.s[3], v7.s[1]
|
||||
ins v13.s[3], v7.s[2]
|
||||
ins v15.s[3], v7.s[3]
|
||||
|
||||
st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [B00], #64
|
||||
st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [B00], #64
|
||||
.endm
|
||||
|
||||
.macro COPY2x8
|
||||
ldr d0, [A01], #8
|
||||
ldr d1, [A02], #8
|
||||
ins v8.s[0], v0.s[0]
|
||||
ins v10.s[0], v0.s[1]
|
||||
ins v8.s[1], v1.s[0]
|
||||
ins v10.s[1], v1.s[1]
|
||||
|
||||
ldr d2, [A03], #8
|
||||
ldr d3, [A04], #8
|
||||
ins v8.s[2], v2.s[0]
|
||||
ins v10.s[2], v2.s[1]
|
||||
ins v8.s[3], v3.s[0]
|
||||
ins v10.s[3], v3.s[1]
|
||||
|
||||
ldr d4, [A05], #8
|
||||
ldr d5, [A06], #8
|
||||
ins v9.s[0], v4.s[0]
|
||||
ins v11.s[0], v4.s[1]
|
||||
ins v9.s[1], v5.s[0]
|
||||
ins v11.s[1], v5.s[1]
|
||||
|
||||
ldr d6, [A07], #8
|
||||
ldr d7, [A08], #8
|
||||
ins v9.s[2], v6.s[0]
|
||||
ins v11.s[2], v6.s[1]
|
||||
ins v9.s[3], v7.s[0]
|
||||
ins v11.s[3], v7.s[1]
|
||||
|
||||
st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [B00], #64
|
||||
.endm
|
||||
|
||||
.macro COPY1x8
|
||||
ldr s0, [A01], #4
|
||||
ldr s1, [A02], #4
|
||||
ins v8.s[0], v0.s[0]
|
||||
ins v8.s[1], v1.s[0]
|
||||
|
||||
ldr s2, [A03], #4
|
||||
ldr s3, [A04], #4
|
||||
ins v8.s[2], v2.s[0]
|
||||
ins v8.s[3], v3.s[0]
|
||||
|
||||
ldr s4, [A05], #4
|
||||
ldr s5, [A06], #4
|
||||
ins v9.s[0], v4.s[0]
|
||||
ins v9.s[1], v5.s[0]
|
||||
|
||||
ldr s6, [A07], #4
|
||||
ldr s7, [A08], #4
|
||||
ins v9.s[2], v6.s[0]
|
||||
ins v9.s[3], v7.s[0]
|
||||
|
||||
st1 {v8.4s, v9.4s}, [B00], #32
|
||||
.endm
|
||||
|
||||
.macro COPY4x4
|
||||
ldr q0, [A01], #16
|
||||
ldr q1, [A02], #16
|
||||
ins v8.s[0], v0.s[0]
|
||||
ins v9.s[0], v0.s[1]
|
||||
ins v10.s[0], v0.s[2]
|
||||
ins v11.s[0], v0.s[3]
|
||||
ins v8.s[1], v1.s[0]
|
||||
ins v9.s[1], v1.s[1]
|
||||
ins v10.s[1], v1.s[2]
|
||||
ins v11.s[1], v1.s[3]
|
||||
|
||||
ldr q2, [A03], #16
|
||||
ldr q3, [A04], #16
|
||||
ins v8.s[2], v2.s[0]
|
||||
ins v9.s[2], v2.s[1]
|
||||
ins v10.s[2], v2.s[2]
|
||||
ins v11.s[2], v2.s[3]
|
||||
ins v8.s[3], v3.s[0]
|
||||
ins v9.s[3], v3.s[1]
|
||||
ins v10.s[3], v3.s[2]
|
||||
ins v11.s[3], v3.s[3]
|
||||
|
||||
st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [B00], #64
|
||||
.endm
|
||||
|
||||
.macro COPY2x4
|
||||
ldr d0, [A01], #8
|
||||
ldr d1, [A02], #8
|
||||
ins v8.s[0], v0.s[0]
|
||||
ins v9.s[0], v0.s[1]
|
||||
ins v8.s[1], v1.s[0]
|
||||
ins v9.s[1], v1.s[1]
|
||||
|
||||
ldr d2, [A03], #8
|
||||
ldr d3, [A04], #8
|
||||
ins v8.s[2], v2.s[0]
|
||||
ins v9.s[2], v2.s[1]
|
||||
ins v8.s[3], v3.s[0]
|
||||
ins v9.s[3], v3.s[1]
|
||||
|
||||
st1 {v8.4s, v9.4s}, [B00], #32
|
||||
.endm
|
||||
|
||||
.macro COPY1x4
|
||||
ldr s0, [A01], #4
|
||||
ldr s1, [A02], #4
|
||||
ins v8.s[0], v0.s[0]
|
||||
ins v8.s[1], v1.s[0]
|
||||
|
||||
ldr s2, [A03], #4
|
||||
ldr s3, [A04], #4
|
||||
ins v8.s[2], v2.s[0]
|
||||
ins v8.s[3], v3.s[0]
|
||||
|
||||
st1 {v8.4s}, [B00], #16
|
||||
.endm
|
||||
|
||||
.macro COPY4x2
|
||||
ldr q0, [A01], #16
|
||||
ldr q1, [A02], #16
|
||||
ins v8.s[0], v0.s[0]
|
||||
ins v9.s[0], v0.s[1]
|
||||
ins v10.s[0], v0.s[2]
|
||||
ins v11.s[0], v0.s[3]
|
||||
ins v8.s[1], v1.s[0]
|
||||
ins v9.s[1], v1.s[1]
|
||||
ins v10.s[1], v1.s[2]
|
||||
ins v11.s[1], v1.s[3]
|
||||
|
||||
st1 {v8.2s, v9.2s, v10.2s, v11.2s}, [B00], #32
|
||||
.endm
|
||||
|
||||
.macro COPY2x2
|
||||
ldr d0, [A01], #8
|
||||
ldr d1, [A02], #8
|
||||
ins v8.s[0], v0.s[0]
|
||||
ins v9.s[0], v0.s[1]
|
||||
ins v8.s[1], v1.s[0]
|
||||
ins v9.s[1], v1.s[1]
|
||||
|
||||
st1 {v8.2s, v9.2s}, [B00], #16
|
||||
.endm
|
||||
|
||||
.macro COPY1x2
|
||||
ldr s0, [A01], #4
|
||||
ldr s1, [A02], #4
|
||||
ins v8.s[0], v0.s[0]
|
||||
ins v8.s[1], v1.s[0]
|
||||
|
||||
st1 {v8.2s}, [B00], #8
|
||||
.endm
|
||||
|
||||
.macro COPY1x1
|
||||
ldr s0, [A01], #4
|
||||
str s0, [B00], #4
|
||||
.endm
|
||||
|
||||
/**************************************************************************************
|
||||
* End of macro definitions
|
||||
**************************************************************************************/
|
||||
|
||||
PROLOGUE
|
||||
|
||||
.align 5
|
||||
|
||||
SAVE_REGS
|
||||
|
||||
lsl LDA, LDA, #2 // LDA = LDA * SIZE
|
||||
|
||||
.Lsgemm_ncopy_L8_BEGIN:
|
||||
|
||||
asr J, N, #3 // J = N / 8
|
||||
cmp J, #0
|
||||
ble .Lsgemm_ncopy_L4_BEGIN
|
||||
|
||||
.align 5
|
||||
.Lsgemm_ncopy_L8_M4_BEGIN:
|
||||
|
||||
mov A01, A00
|
||||
add A02, A01, LDA
|
||||
add A03, A02, LDA
|
||||
add A04, A03, LDA
|
||||
add A05, A04, LDA
|
||||
add A06, A05, LDA
|
||||
add A07, A06, LDA
|
||||
add A08, A07, LDA
|
||||
add A00, A08, LDA
|
||||
|
||||
asr I, M, #2 // I = M / 4
|
||||
cmp I, #0
|
||||
ble .Lsgemm_ncopy_L8_M4_40
|
||||
|
||||
asr K, M, #4 // K = M / 16(cacheline)
|
||||
mov TEMP1, A01
|
||||
|
||||
.align 5
|
||||
.Lsgemm_tcopy_L8_warnup_1:
|
||||
|
||||
ldr s0, [TEMP1], #64
|
||||
|
||||
subs K, K, #1
|
||||
bgt .Lsgemm_tcopy_L8_warnup_1
|
||||
|
||||
asr K, M, #4 // K = M / 16(cacheline)
|
||||
mov TEMP1, A02
|
||||
|
||||
.align 5
|
||||
.Lsgemm_tcopy_L8_warnup_2:
|
||||
|
||||
ldr s0, [TEMP1], #64
|
||||
|
||||
subs K, K, #1
|
||||
bgt .Lsgemm_tcopy_L8_warnup_2
|
||||
|
||||
asr K, M, #4 // K = M / 16(cacheline)
|
||||
mov TEMP1, A03
|
||||
|
||||
.align 5
|
||||
.Lsgemm_tcopy_L8_warnup_3:
|
||||
|
||||
ldr s0, [TEMP1], #64
|
||||
|
||||
subs K, K, #1
|
||||
bgt .Lsgemm_tcopy_L8_warnup_3
|
||||
|
||||
asr K, M, #4 // K = M / 16(cacheline)
|
||||
mov TEMP1, A04
|
||||
|
||||
.align 5
|
||||
.Lsgemm_tcopy_L8_warnup_4:
|
||||
|
||||
ldr s0, [TEMP1], #64
|
||||
|
||||
subs K, K, #1
|
||||
bgt .Lsgemm_tcopy_L8_warnup_4
|
||||
|
||||
asr K, M, #4 // K = M / 16(cacheline)
|
||||
mov TEMP1, A05
|
||||
|
||||
.align 5
|
||||
.Lsgemm_tcopy_L8_warnup_5:
|
||||
|
||||
ldr s0, [TEMP1], #64
|
||||
|
||||
subs K, K, #1
|
||||
bgt .Lsgemm_tcopy_L8_warnup_5
|
||||
|
||||
asr K, M, #4 // K = M / 16(cacheline)
|
||||
mov TEMP1, A06
|
||||
|
||||
.align 5
|
||||
.Lsgemm_tcopy_L8_warnup_6:
|
||||
|
||||
ldr s0, [TEMP1], #64
|
||||
|
||||
subs K, K, #1
|
||||
bgt .Lsgemm_tcopy_L8_warnup_6
|
||||
|
||||
asr K, M, #4 // K = M / 16(cacheline)
|
||||
mov TEMP1, A07
|
||||
|
||||
.align 5
|
||||
.Lsgemm_tcopy_L8_warnup_7:
|
||||
|
||||
ldr s0, [TEMP1], #64
|
||||
|
||||
subs K, K, #1
|
||||
bgt .Lsgemm_tcopy_L8_warnup_7
|
||||
|
||||
asr K, M, #4 // K = M / 16(cacheline)
|
||||
mov TEMP1, A08
|
||||
|
||||
.align 5
|
||||
.Lsgemm_tcopy_L8_warnup_8:
|
||||
|
||||
ldr s0, [TEMP1], #64
|
||||
|
||||
subs K, K, #1
|
||||
bgt .Lsgemm_tcopy_L8_warnup_8
|
||||
|
||||
.align 5
|
||||
.Lsgemm_ncopy_L8_M4_20:
|
||||
|
||||
COPY4x8
|
||||
|
||||
subs I, I, #1
|
||||
bne .Lsgemm_ncopy_L8_M4_20
|
||||
|
||||
.Lsgemm_ncopy_L8_M4_40:
|
||||
|
||||
and I, M, #2
|
||||
cmp I, #0
|
||||
ble .Lsgemm_ncopy_L8_M4_60
|
||||
|
||||
COPY2x8
|
||||
|
||||
.Lsgemm_ncopy_L8_M4_60:
|
||||
|
||||
and I, M, #1
|
||||
cmp I, #0
|
||||
ble .Lsgemm_ncopy_L8_M4_END
|
||||
|
||||
COPY1x8
|
||||
|
||||
.Lsgemm_ncopy_L8_M4_END:
|
||||
|
||||
subs J , J, #1 // j--
|
||||
bne .Lsgemm_ncopy_L8_M4_BEGIN
|
||||
|
||||
/*********************************************************************************************/
|
||||
|
||||
.Lsgemm_ncopy_L4_BEGIN:
|
||||
|
||||
tst N, #7
|
||||
ble .Lsgemm_ncopy_L999
|
||||
|
||||
tst N, #4
|
||||
ble .Lsgemm_ncopy_L2_BEGIN
|
||||
|
||||
.Lsgemm_ncopy_L4_M4_BEGIN:
|
||||
mov A01, A00
|
||||
add A02, A01, LDA
|
||||
add A03, A02, LDA
|
||||
add A04, A03, LDA
|
||||
add A00, A04, LDA
|
||||
|
||||
asr I, M, #2 // I = M / 4
|
||||
cmp I, #0
|
||||
ble .Lsgemm_ncopy_L4_M4_40
|
||||
|
||||
.align 5
|
||||
.Lsgemm_ncopy_L4_M4_20:
|
||||
|
||||
COPY4x4
|
||||
|
||||
subs I, I, #1
|
||||
bne .Lsgemm_ncopy_L4_M4_20
|
||||
|
||||
.Lsgemm_ncopy_L4_M4_40:
|
||||
|
||||
and I, M, #2
|
||||
cmp I, #0
|
||||
ble .Lsgemm_ncopy_L4_M4_60
|
||||
|
||||
COPY2x4
|
||||
|
||||
.Lsgemm_ncopy_L4_M4_60:
|
||||
|
||||
and I, M, #1
|
||||
cmp I, #0
|
||||
ble .Lsgemm_ncopy_L4_M4_END
|
||||
|
||||
COPY1x4
|
||||
|
||||
.Lsgemm_ncopy_L4_M4_END:
|
||||
|
||||
|
||||
/*********************************************************************************************/
|
||||
|
||||
.Lsgemm_ncopy_L2_BEGIN:
|
||||
|
||||
tst N, #2
|
||||
ble .Lsgemm_ncopy_L1_BEGIN
|
||||
|
||||
.Lsgemm_ncopy_L2_M4_BEGIN:
|
||||
|
||||
mov A01, A00
|
||||
add A02, A01, LDA
|
||||
add A00, A02, LDA
|
||||
|
||||
asr I, M, #2 // I = M / 4
|
||||
cmp I, #0
|
||||
ble .Lsgemm_ncopy_L2_M4_40
|
||||
|
||||
.align 5
|
||||
.Lsgemm_ncopy_L2_M4_20:
|
||||
|
||||
COPY4x2
|
||||
|
||||
subs I , I , #1
|
||||
bne .Lsgemm_ncopy_L2_M4_20
|
||||
|
||||
|
||||
.Lsgemm_ncopy_L2_M4_40:
|
||||
|
||||
and I, M, #2
|
||||
cmp I, #0
|
||||
ble .Lsgemm_ncopy_L2_M4_60
|
||||
|
||||
COPY2x2
|
||||
|
||||
.Lsgemm_ncopy_L2_M4_60:
|
||||
|
||||
and I, M, #1
|
||||
cmp I, #0
|
||||
ble .Lsgemm_ncopy_L2_M4_END
|
||||
|
||||
COPY1x2
|
||||
|
||||
.Lsgemm_ncopy_L2_M4_END:
|
||||
|
||||
.Lsgemm_ncopy_L1_BEGIN:
|
||||
|
||||
tst N, #1
|
||||
ble .Lsgemm_ncopy_L999
|
||||
|
||||
.Lsgemm_ncopy_L1_M1_BEGIN:
|
||||
|
||||
mov A01, A00
|
||||
|
||||
mov I, M
|
||||
cmp I, #0
|
||||
ble .Lsgemm_ncopy_L1_M1_END
|
||||
|
||||
.align 5
|
||||
.Lsgemm_ncopy_L1_M1_20:
|
||||
|
||||
COPY1x1
|
||||
|
||||
subs I, I, #1
|
||||
bne .Lsgemm_ncopy_L1_M1_20
|
||||
|
||||
.Lsgemm_ncopy_L1_M1_END:
|
||||
|
||||
.Lsgemm_ncopy_L999:
|
||||
|
||||
mov x0, #0
|
||||
RESTORE_REGS
|
||||
ret
|
||||
|
||||
EPILOGUE
|
||||
707
kernel/arm64/sgemm_tcopy_8.S
Normal file
707
kernel/arm64/sgemm_tcopy_8.S
Normal file
@@ -0,0 +1,707 @@
|
||||
/***************************************************************************
|
||||
Copyright (c) 2016, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A00 PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
#define ASSEMBLER
|
||||
#include "common.h"
|
||||
|
||||
#define M x0
|
||||
#define N x1
|
||||
#define A x2
|
||||
#define LDA x3
|
||||
#define B x4
|
||||
|
||||
#define M8 x5
|
||||
|
||||
#define A01 x6
|
||||
#define A02 x7
|
||||
#define A03 x8
|
||||
#define A04 x9
|
||||
#define A05 x10
|
||||
#define A06 x11
|
||||
#define A07 x12
|
||||
#define A08 x13
|
||||
|
||||
#define B01 x14
|
||||
#define B02 x15
|
||||
#define B03 x16
|
||||
#define B04 x17
|
||||
#define B00 x22
|
||||
|
||||
|
||||
#define I x18
|
||||
#define J x19
|
||||
|
||||
#define TEMP1 x20
|
||||
|
||||
#define A_PREFETCH 256
|
||||
|
||||
/**************************************************************************************
|
||||
* Macro definitions
|
||||
**************************************************************************************/
|
||||
.macro SAVE_REGS
|
||||
add sp, sp, #-(11 * 16)
|
||||
stp d8, d9, [sp, #(0 * 16)]
|
||||
stp d10, d11, [sp, #(1 * 16)]
|
||||
stp d12, d13, [sp, #(2 * 16)]
|
||||
stp d14, d15, [sp, #(3 * 16)]
|
||||
stp d16, d17, [sp, #(4 * 16)]
|
||||
stp x18, x19, [sp, #(5 * 16)]
|
||||
stp x20, x21, [sp, #(6 * 16)]
|
||||
stp x22, x23, [sp, #(7 * 16)]
|
||||
stp x24, x25, [sp, #(8 * 16)]
|
||||
stp x26, x27, [sp, #(9 * 16)]
|
||||
str x28, [sp, #(10 * 16)]
|
||||
.endm
|
||||
|
||||
.macro RESTORE_REGS
|
||||
ldp d8, d9, [sp, #(0 * 16)]
|
||||
ldp d10, d11, [sp, #(1 * 16)]
|
||||
ldp d12, d13, [sp, #(2 * 16)]
|
||||
ldp d14, d15, [sp, #(3 * 16)]
|
||||
ldp d16, d17, [sp, #(4 * 16)]
|
||||
ldp x18, x19, [sp, #(5 * 16)]
|
||||
ldp x20, x21, [sp, #(6 * 16)]
|
||||
ldp x22, x23, [sp, #(7 * 16)]
|
||||
ldp x24, x25, [sp, #(8 * 16)]
|
||||
ldp x26, x27, [sp, #(9 * 16)]
|
||||
ldr x28, [sp, #(10 * 16)]
|
||||
add sp, sp, #(11*16)
|
||||
.endm
|
||||
|
||||
/*************************************************************************************************************************/
|
||||
|
||||
.macro COPY8x8
|
||||
prfm PLDL1KEEP, [A01, #A_PREFETCH]
|
||||
prfm PLDL1KEEP, [A02, #A_PREFETCH]
|
||||
prfm PLDL1KEEP, [A03, #A_PREFETCH]
|
||||
prfm PLDL1KEEP, [A04, #A_PREFETCH]
|
||||
prfm PLDL1KEEP, [A05, #A_PREFETCH]
|
||||
prfm PLDL1KEEP, [A06, #A_PREFETCH]
|
||||
prfm PLDL1KEEP, [A07, #A_PREFETCH]
|
||||
prfm PLDL1KEEP, [A08, #A_PREFETCH]
|
||||
|
||||
ldp q0, q1, [A01]
|
||||
ldp q2, q3, [A02]
|
||||
add A01, A01, #32
|
||||
add A02, A02, #32
|
||||
|
||||
st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [B00]
|
||||
add TEMP1, B00, #64
|
||||
|
||||
ldp q4, q5, [A03]
|
||||
ldp q6, q7, [A04]
|
||||
add A03, A03, #32
|
||||
add A04, A04, #32
|
||||
|
||||
st1 {v4.4s, v5.4s, v6.4s, v7.4s}, [TEMP1]
|
||||
add TEMP1, TEMP1, #64
|
||||
|
||||
ldp q8, q9, [A05]
|
||||
ldp q10, q11, [A06]
|
||||
add A05, A05, #32
|
||||
add A06, A06, #32
|
||||
|
||||
st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [TEMP1]
|
||||
add TEMP1, TEMP1, #64
|
||||
|
||||
ldp q12, q13, [A07]
|
||||
ldp q14, q15, [A08]
|
||||
add A07, A07, #32
|
||||
add A08, A08, #32
|
||||
|
||||
st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [TEMP1]
|
||||
add TEMP1, TEMP1, #64
|
||||
|
||||
add B00, B00, M8
|
||||
.endm
|
||||
|
||||
.macro COPY4x8
|
||||
//prfm PLDL1KEEP, [A01, #A_PREFETCH]
|
||||
//prfm PLDL1KEEP, [A02, #A_PREFETCH]
|
||||
//prfm PLDL1KEEP, [A03, #A_PREFETCH]
|
||||
//prfm PLDL1KEEP, [A04, #A_PREFETCH]
|
||||
//prfm PLDL1KEEP, [A05, #A_PREFETCH]
|
||||
//prfm PLDL1KEEP, [A06, #A_PREFETCH]
|
||||
//prfm PLDL1KEEP, [A07, #A_PREFETCH]
|
||||
//prfm PLDL1KEEP, [A08, #A_PREFETCH]
|
||||
|
||||
ldr q0, [A01]
|
||||
ldr q1, [A02]
|
||||
ldr q2, [A03]
|
||||
ldr q3, [A04]
|
||||
add A01, A01, #16
|
||||
add A02, A02, #16
|
||||
add A03, A03, #16
|
||||
add A04, A04, #16
|
||||
|
||||
st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [B01]
|
||||
add B01, B01, #64
|
||||
|
||||
ldr q4, [A05]
|
||||
ldr q5, [A06]
|
||||
ldr q6, [A07]
|
||||
ldr q7, [A08]
|
||||
|
||||
add A05, A05, #16
|
||||
add A06, A06, #16
|
||||
add A07, A07, #16
|
||||
add A08, A08, #16
|
||||
|
||||
st1 {v4.4s, v5.4s, v6.4s, v7.4s}, [B01]
|
||||
add B01, B01, #64
|
||||
.endm
|
||||
|
||||
.macro COPY2x8
|
||||
//prfm PLDL1KEEP, [A01, #A_PREFETCH]
|
||||
//prfm PLDL1KEEP, [A02, #A_PREFETCH]
|
||||
//prfm PLDL1KEEP, [A03, #A_PREFETCH]
|
||||
//prfm PLDL1KEEP, [A04, #A_PREFETCH]
|
||||
//prfm PLDL1KEEP, [A05, #A_PREFETCH]
|
||||
//prfm PLDL1KEEP, [A06, #A_PREFETCH]
|
||||
//prfm PLDL1KEEP, [A07, #A_PREFETCH]
|
||||
//prfm PLDL1KEEP, [A08, #A_PREFETCH]
|
||||
|
||||
ldr d0, [A01]
|
||||
ldr d1, [A02]
|
||||
ldr d2, [A03]
|
||||
ldr d3, [A04]
|
||||
|
||||
add A01, A01, #8
|
||||
add A02, A02, #8
|
||||
add A03, A03, #8
|
||||
add A04, A04, #8
|
||||
|
||||
stp d0, d1, [B02]
|
||||
add B02, B02, #16
|
||||
stp d2, d3, [B02]
|
||||
add B02, B02, #16
|
||||
|
||||
ldr d4, [A05]
|
||||
ldr d5, [A06]
|
||||
ldr d6, [A07]
|
||||
ldr d7, [A08]
|
||||
|
||||
add A05, A05, #8
|
||||
add A06, A06, #8
|
||||
add A07, A07, #8
|
||||
add A08, A08, #8
|
||||
|
||||
stp d4, d5, [B02]
|
||||
add B02, B02, #16
|
||||
stp d6, d7, [B02]
|
||||
add B02, B02, #16
|
||||
|
||||
.endm
|
||||
|
||||
.macro COPY1x8
|
||||
//prfm PLDL1KEEP, [A01, #A_PREFETCH]
|
||||
//prfm PLDL1KEEP, [A02, #A_PREFETCH]
|
||||
//prfm PLDL1KEEP, [A03, #A_PREFETCH]
|
||||
//prfm PLDL1KEEP, [A04, #A_PREFETCH]
|
||||
//prfm PLDL1KEEP, [A05, #A_PREFETCH]
|
||||
//prfm PLDL1KEEP, [A06, #A_PREFETCH]
|
||||
//prfm PLDL1KEEP, [A07, #A_PREFETCH]
|
||||
//prfm PLDL1KEEP, [A08, #A_PREFETCH]
|
||||
|
||||
ldr s0, [A01]
|
||||
ldr s1, [A02]
|
||||
ldr s2, [A03]
|
||||
ldr s3, [A04]
|
||||
|
||||
add A01, A01, #4
|
||||
add A02, A02, #4
|
||||
add A03, A03, #4
|
||||
add A04, A04, #4
|
||||
|
||||
stp s0, s1, [B03]
|
||||
add B03, B03, #8
|
||||
stp s2, s3, [B03]
|
||||
add B03, B03, #8
|
||||
|
||||
ldr s4, [A05]
|
||||
ldr s5, [A06]
|
||||
ldr s6, [A07]
|
||||
ldr s7, [A08]
|
||||
|
||||
ldr d4, [A05], #8
|
||||
ldr d5, [A06], #8
|
||||
ldr d6, [A07], #8
|
||||
ldr d7, [A08], #8
|
||||
|
||||
stp s4, s5, [B03]
|
||||
add B03, B03, #8
|
||||
stp s6, s7, [B03]
|
||||
add B03, B03, #8
|
||||
|
||||
.endm
|
||||
|
||||
/*************************************************************************************************************************/
|
||||
|
||||
.macro COPY8x4
|
||||
prfm PLDL1KEEP, [A01, #A_PREFETCH]
|
||||
prfm PLDL1KEEP, [A02, #A_PREFETCH]
|
||||
prfm PLDL1KEEP, [A03, #A_PREFETCH]
|
||||
prfm PLDL1KEEP, [A04, #A_PREFETCH]
|
||||
|
||||
ldp q0, q1, [A01]
|
||||
ldp q2, q3, [A02]
|
||||
add A01, A01, #32
|
||||
add A02, A02, #32
|
||||
|
||||
st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [B00]
|
||||
add TEMP1, B00, #64
|
||||
|
||||
ldp q4, q5, [A03]
|
||||
ldp q6, q7, [A04]
|
||||
add A03, A03, #32
|
||||
add A04, A04, #32
|
||||
|
||||
st1 {v4.4s, v5.4s, v6.4s, v7.4s}, [TEMP1]
|
||||
add TEMP1, TEMP1, #64
|
||||
|
||||
add B00, B00, M8
|
||||
.endm
|
||||
|
||||
.macro COPY4x4
|
||||
//prfm PLDL1KEEP, [A01, #A_PREFETCH]
|
||||
//prfm PLDL1KEEP, [A02, #A_PREFETCH]
|
||||
//prfm PLDL1KEEP, [A03, #A_PREFETCH]
|
||||
//prfm PLDL1KEEP, [A04, #A_PREFETCH]
|
||||
|
||||
ldr q0, [A01]
|
||||
ldr q1, [A02]
|
||||
ldr q2, [A03]
|
||||
ldr q3, [A04]
|
||||
add A01, A01, #16
|
||||
add A02, A02, #16
|
||||
add A03, A03, #16
|
||||
add A04, A04, #16
|
||||
|
||||
st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [B01]
|
||||
|
||||
add B01, B01, #64
|
||||
.endm
|
||||
|
||||
.macro COPY2x4
|
||||
//prfm PLDL1KEEP, [A01, #A_PREFETCH]
|
||||
//prfm PLDL1KEEP, [A02, #A_PREFETCH]
|
||||
//prfm PLDL1KEEP, [A03, #A_PREFETCH]
|
||||
//prfm PLDL1KEEP, [A04, #A_PREFETCH]
|
||||
|
||||
ldr d0, [A01]
|
||||
ldr d1, [A02]
|
||||
ldr d2, [A03]
|
||||
ldr d3, [A04]
|
||||
|
||||
add A01, A01, #8
|
||||
add A02, A02, #8
|
||||
add A03, A03, #8
|
||||
add A04, A04, #8
|
||||
|
||||
stp d0, d1, [B02]
|
||||
add B02, B02, #16
|
||||
stp d2, d3, [B02]
|
||||
|
||||
add B02, B02, #16
|
||||
.endm
|
||||
|
||||
.macro COPY1x4
|
||||
//prfm PLDL1KEEP, [A01, #A_PREFETCH]
|
||||
//prfm PLDL1KEEP, [A02, #A_PREFETCH]
|
||||
//prfm PLDL1KEEP, [A03, #A_PREFETCH]
|
||||
//prfm PLDL1KEEP, [A04, #A_PREFETCH]
|
||||
|
||||
ldr s0, [A01]
|
||||
ldr s1, [A02]
|
||||
ldr s2, [A03]
|
||||
ldr s3, [A04]
|
||||
|
||||
add A01, A01, #4
|
||||
add A02, A02, #4
|
||||
add A03, A03, #4
|
||||
add A04, A04, #4
|
||||
|
||||
stp s0, s1, [B03]
|
||||
add B03, B03, #8
|
||||
stp s2, s3, [B03]
|
||||
add B03, B03, #8
|
||||
|
||||
.endm
|
||||
|
||||
/*************************************************************************************************************************/
|
||||
|
||||
.macro COPY8x2
|
||||
prfm PLDL1KEEP, [A01, #A_PREFETCH]
|
||||
prfm PLDL1KEEP, [A02, #A_PREFETCH]
|
||||
|
||||
ld1 {v0.4s, v1.4s}, [A01]
|
||||
ld1 {v2.4s, v3.4s}, [A02]
|
||||
add A01, A01, #32
|
||||
add A02, A02, #32
|
||||
|
||||
st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [B00]
|
||||
add B00, B00, M8
|
||||
.endm
|
||||
|
||||
.macro COPY4x2
|
||||
//prfm PLDL1KEEP, [A01, #A_PREFETCH]
|
||||
//prfm PLDL1KEEP, [A02, #A_PREFETCH]
|
||||
|
||||
ldr q0, [A01]
|
||||
ldr q1, [A02]
|
||||
add A01, A01, #16
|
||||
add A02, A02, #16
|
||||
|
||||
stp q0, q1, [B01]
|
||||
add B01, B01, #32
|
||||
.endm
|
||||
|
||||
.macro COPY2x2
|
||||
//prfm PLDL1KEEP, [A01, #A_PREFETCH]
|
||||
//prfm PLDL1KEEP, [A02, #A_PREFETCH]
|
||||
|
||||
ldr d0, [A01]
|
||||
ldr d1, [A02]
|
||||
|
||||
add A01, A01, #8
|
||||
add A02, A02, #8
|
||||
|
||||
stp d0, d1, [B02]
|
||||
add B02, B02, #16
|
||||
.endm
|
||||
|
||||
.macro COPY1x2
|
||||
//prfm PLDL1KEEP, [A01, #A_PREFETCH]
|
||||
//prfm PLDL1KEEP, [A02, #A_PREFETCH]
|
||||
|
||||
ldr s0, [A01]
|
||||
ldr s1, [A02]
|
||||
|
||||
add A01, A01, #4
|
||||
add A02, A02, #4
|
||||
|
||||
stp s0, s1, [B03]
|
||||
|
||||
add B03, B03, #8
|
||||
.endm
|
||||
|
||||
/*************************************************************************************************************************/
|
||||
|
||||
.macro COPY8x1
|
||||
prfm PLDL1KEEP, [A01, #A_PREFETCH]
|
||||
|
||||
ldp q0, q1, [A01]
|
||||
add A01, A01, #32
|
||||
stp q0, q1, [B00]
|
||||
|
||||
add B00, B00, M8
|
||||
.endm
|
||||
|
||||
.macro COPY4x1
|
||||
//prfm PLDL1KEEP, [A01, #A_PREFETCH]
|
||||
|
||||
ldr q0, [A01]
|
||||
add A01, A01, #16
|
||||
str q0, [B01]
|
||||
|
||||
add B01, B01, #16
|
||||
.endm
|
||||
|
||||
.macro COPY2x1
|
||||
//prfm PLDL1KEEP, [A01, #A_PREFETCH]
|
||||
|
||||
ldr d0, [A01]
|
||||
add A01, A01, #8
|
||||
str d0, [B02]
|
||||
|
||||
add B02, B02, #8
|
||||
.endm
|
||||
|
||||
.macro COPY1x1
|
||||
//prfm PLDL1KEEP, [A01, #A_PREFETCH]
|
||||
|
||||
ldr s0, [A01]
|
||||
add A01, A01, #4
|
||||
str s0, [B03]
|
||||
|
||||
add B03, B03, #4
|
||||
.endm
|
||||
|
||||
/**************************************************************************************
|
||||
* End of macro definitions
|
||||
**************************************************************************************/
|
||||
|
||||
PROLOGUE
|
||||
|
||||
.align 5
|
||||
|
||||
SAVE_REGS
|
||||
|
||||
lsl LDA, LDA, #2 // LDA = LDA * SIZE
|
||||
|
||||
lsl TEMP1, M, #2 // TEMP1 = M * SIZE
|
||||
|
||||
and B01 , N , #-8
|
||||
and B02 , N , #-4
|
||||
and B03 , N , #-2
|
||||
|
||||
mul B01, B01, TEMP1
|
||||
mul B02, B02, TEMP1
|
||||
mul B03, B03, TEMP1
|
||||
|
||||
add B01 , B01, B
|
||||
add B02 , B02, B
|
||||
add B03 , B03, B
|
||||
|
||||
lsl M8, M, #5 // M8 = M * 8 * SIZE
|
||||
|
||||
.Lsgemm_tcopy_L8_BEGIN:
|
||||
|
||||
asr J, M, #3 // J = M / 8
|
||||
cmp J, #0
|
||||
ble .Lsgemm_tcopy_L4_BEGIN
|
||||
|
||||
.align 5
|
||||
.Lsgemm_tcopy_L8_M8_BEGIN:
|
||||
|
||||
mov A01, A
|
||||
add A02, A01, LDA
|
||||
add A03, A02, LDA
|
||||
add A04, A03, LDA
|
||||
add A05, A04, LDA
|
||||
add A06, A05, LDA
|
||||
add A07, A06, LDA
|
||||
add A08, A07, LDA
|
||||
add A, A08, LDA
|
||||
|
||||
mov B00, B
|
||||
add B, B00, #256 // B = B + 8 * 8 * SIZE
|
||||
|
||||
asr I, N, #3 // I = N / 8
|
||||
cmp I, #0
|
||||
ble .Lsgemm_tcopy_L8_M8_40
|
||||
|
||||
.align 5
|
||||
.Lsgemm_tcopy_L8_M8_20:
|
||||
|
||||
COPY8x8
|
||||
|
||||
subs I , I , #1
|
||||
bne .Lsgemm_tcopy_L8_M8_20
|
||||
|
||||
.Lsgemm_tcopy_L8_M8_40:
|
||||
|
||||
tst N , #4
|
||||
ble .Lsgemm_tcopy_L8_M8_60
|
||||
|
||||
COPY4x8
|
||||
|
||||
.Lsgemm_tcopy_L8_M8_60:
|
||||
|
||||
tst N , #2
|
||||
ble .Lsgemm_tcopy_L8_M8_80
|
||||
|
||||
COPY2x8
|
||||
|
||||
.Lsgemm_tcopy_L8_M8_80:
|
||||
|
||||
tst N, #1
|
||||
ble .Lsgemm_tcopy_L8_M8_END
|
||||
|
||||
COPY1x8
|
||||
|
||||
.Lsgemm_tcopy_L8_M8_END:
|
||||
|
||||
subs J, J, #1 // j--
|
||||
bne .Lsgemm_tcopy_L8_M8_BEGIN
|
||||
|
||||
/*********************************************************************************************/
|
||||
|
||||
.Lsgemm_tcopy_L4_BEGIN:
|
||||
|
||||
tst M, #7
|
||||
ble .Lsgemm_tcopy_L999
|
||||
|
||||
tst M, #4
|
||||
ble .Lsgemm_tcopy_L2_BEGIN
|
||||
|
||||
.Lsgemm_tcopy_L4_M8_BEGIN:
|
||||
|
||||
mov A01, A
|
||||
add A02, A01, LDA
|
||||
add A03, A02, LDA
|
||||
add A04, A03, LDA
|
||||
add A, A04, LDA
|
||||
|
||||
mov B00, B
|
||||
add B, B00, #128 // B = B + 4 * 8 * SIZE
|
||||
|
||||
asr I, N, #3 // I = N / 8
|
||||
cmp I, #0
|
||||
ble .Lsgemm_tcopy_L4_M8_40
|
||||
|
||||
.align 5
|
||||
.Lsgemm_tcopy_L4_M8_20:
|
||||
|
||||
COPY8x4
|
||||
|
||||
subs I , I , #1
|
||||
bne .Lsgemm_tcopy_L4_M8_20
|
||||
|
||||
.Lsgemm_tcopy_L4_M8_40:
|
||||
|
||||
tst N , #4
|
||||
ble .Lsgemm_tcopy_L4_M8_60
|
||||
|
||||
COPY4x4
|
||||
|
||||
.Lsgemm_tcopy_L4_M8_60:
|
||||
|
||||
tst N , #2
|
||||
ble .Lsgemm_tcopy_L4_M8_80
|
||||
|
||||
COPY2x4
|
||||
|
||||
.Lsgemm_tcopy_L4_M8_80:
|
||||
|
||||
tst N , #1
|
||||
ble .Lsgemm_tcopy_L4_M8_END
|
||||
|
||||
COPY1x4
|
||||
|
||||
|
||||
.Lsgemm_tcopy_L4_M8_END:
|
||||
|
||||
/*********************************************************************************************/
|
||||
|
||||
.Lsgemm_tcopy_L2_BEGIN:
|
||||
|
||||
tst M, #3
|
||||
ble .Lsgemm_tcopy_L999
|
||||
|
||||
tst M, #2
|
||||
ble .Lsgemm_tcopy_L1_BEGIN
|
||||
|
||||
.Lsgemm_tcopy_L2_M16_BEGIN:
|
||||
|
||||
mov A01, A
|
||||
add A02, A01, LDA
|
||||
add A, A02, LDA
|
||||
|
||||
mov B00, B
|
||||
add B, B00, #64 // B = B + 2 * 8 * SIZE
|
||||
|
||||
asr I, N, #3 // I = N / 8
|
||||
cmp I, #0
|
||||
ble .Lsgemm_tcopy_L2_M8_40
|
||||
|
||||
.align 5
|
||||
.Lsgemm_tcopy_L2_M8_20:
|
||||
|
||||
COPY8x2
|
||||
|
||||
subs I , I , #1
|
||||
bne .Lsgemm_tcopy_L2_M8_20
|
||||
|
||||
.Lsgemm_tcopy_L2_M8_40:
|
||||
|
||||
tst N , #4
|
||||
ble .Lsgemm_tcopy_L2_M8_60
|
||||
|
||||
COPY4x2
|
||||
|
||||
.Lsgemm_tcopy_L2_M8_60:
|
||||
|
||||
tst N , #2
|
||||
ble .Lsgemm_tcopy_L2_M8_80
|
||||
|
||||
COPY2x2
|
||||
|
||||
.Lsgemm_tcopy_L2_M8_80:
|
||||
|
||||
tst N , #1
|
||||
ble .Lsgemm_tcopy_L2_M8_END
|
||||
|
||||
COPY1x2
|
||||
|
||||
.Lsgemm_tcopy_L2_M8_END:
|
||||
|
||||
/*********************************************************************************************/
|
||||
|
||||
.Lsgemm_tcopy_L1_BEGIN:
|
||||
|
||||
tst M, #1
|
||||
ble .Lsgemm_tcopy_L999
|
||||
|
||||
|
||||
.Lsgemm_tcopy_L1_M16_BEGIN:
|
||||
|
||||
mov A01, A // A01 = A
|
||||
mov B00, B
|
||||
|
||||
asr I, N, #3 // I = M / 8
|
||||
cmp I, #0
|
||||
ble .Lsgemm_tcopy_L1_M8_40
|
||||
|
||||
.align 5
|
||||
.Lsgemm_tcopy_L1_M8_20:
|
||||
|
||||
COPY8x1
|
||||
|
||||
subs I , I , #1
|
||||
bne .Lsgemm_tcopy_L1_M8_20
|
||||
|
||||
.Lsgemm_tcopy_L1_M8_40:
|
||||
|
||||
tst N , #4
|
||||
ble .Lsgemm_tcopy_L1_M8_60
|
||||
|
||||
COPY4x1
|
||||
|
||||
.Lsgemm_tcopy_L1_M8_60:
|
||||
|
||||
tst N , #2
|
||||
ble .Lsgemm_tcopy_L1_M8_80
|
||||
|
||||
COPY2x1
|
||||
|
||||
.Lsgemm_tcopy_L1_M8_80:
|
||||
|
||||
tst N , #1
|
||||
ble .Lsgemm_tcopy_L1_M8_END
|
||||
|
||||
COPY1x1
|
||||
|
||||
|
||||
.Lsgemm_tcopy_L1_M8_END:
|
||||
|
||||
.Lsgemm_tcopy_L999:
|
||||
|
||||
mov x0, #0 // set return value
|
||||
RESTORE_REGS
|
||||
ret
|
||||
|
||||
EPILOGUE
|
||||
2823
kernel/arm64/strmm_kernel_8x8_cortexa53.S
Normal file
2823
kernel/arm64/strmm_kernel_8x8_cortexa53.S
Normal file
File diff suppressed because it is too large
Load Diff
@@ -137,10 +137,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
fadd v3.2d, v3.2d, v4.2d
|
||||
#if defined(USE_MIN)
|
||||
fmin v1.2d, v1.2d, v3.2d
|
||||
fminp MAXF, v1.2d
|
||||
fminp TMPF, v1.2d
|
||||
#else
|
||||
fmax v1.2d, v1.2d, v3.2d
|
||||
fmaxp MAXF, v1.2d
|
||||
fmaxp TMPF, v1.2d
|
||||
#endif
|
||||
#endif
|
||||
fcmp MAXF, TMPF
|
||||
|
||||
@@ -27,7 +27,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
|
||||
#include "common.h"
|
||||
|
||||
#include "../simd/intrin.h"
|
||||
#if defined(DSDOT)
|
||||
double CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
|
||||
#else
|
||||
@@ -47,27 +47,59 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
|
||||
|
||||
if ( (inc_x == 1) && (inc_y == 1) )
|
||||
{
|
||||
|
||||
int n1 = n & -4;
|
||||
|
||||
while(i < n1)
|
||||
int n1 = n & -4;
|
||||
#if V_SIMD && !defined(DSDOT)
|
||||
const int vstep = v_nlanes_f32;
|
||||
const int unrollx4 = n & (-vstep * 4);
|
||||
const int unrollx = n & -vstep;
|
||||
v_f32 vsum0 = v_zero_f32();
|
||||
v_f32 vsum1 = v_zero_f32();
|
||||
v_f32 vsum2 = v_zero_f32();
|
||||
v_f32 vsum3 = v_zero_f32();
|
||||
while(i < unrollx4)
|
||||
{
|
||||
vsum0 = v_muladd_f32(
|
||||
v_loadu_f32(x + i), v_loadu_f32(y + i), vsum0
|
||||
);
|
||||
vsum1 = v_muladd_f32(
|
||||
v_loadu_f32(x + i + vstep), v_loadu_f32(y + i + vstep), vsum1
|
||||
);
|
||||
vsum2 = v_muladd_f32(
|
||||
v_loadu_f32(x + i + vstep*2), v_loadu_f32(y + i + vstep*2), vsum2
|
||||
);
|
||||
vsum3 = v_muladd_f32(
|
||||
v_loadu_f32(x + i + vstep*3), v_loadu_f32(y + i + vstep*3), vsum3
|
||||
);
|
||||
i += vstep*4;
|
||||
}
|
||||
vsum0 = v_add_f32(
|
||||
v_add_f32(vsum0, vsum1), v_add_f32(vsum2 , vsum3)
|
||||
);
|
||||
while(i < unrollx)
|
||||
{
|
||||
vsum0 = v_muladd_f32(
|
||||
v_loadu_f32(x + i), v_loadu_f32(y + i), vsum0
|
||||
);
|
||||
i += vstep;
|
||||
}
|
||||
dot = v_sum_f32(vsum0);
|
||||
#elif defined(DSDOT)
|
||||
for (; i < n1; i += 4)
|
||||
{
|
||||
|
||||
#if defined(DSDOT)
|
||||
dot += (double) y[i] * (double) x[i]
|
||||
+ (double) y[i+1] * (double) x[i+1]
|
||||
+ (double) y[i+2] * (double) x[i+2]
|
||||
+ (double) y[i+3] * (double) x[i+3] ;
|
||||
}
|
||||
#else
|
||||
for (; i < n1; i += 4)
|
||||
{
|
||||
dot += y[i] * x[i]
|
||||
+ y[i+1] * x[i+1]
|
||||
+ y[i+2] * x[i+2]
|
||||
+ y[i+3] * x[i+3] ;
|
||||
#endif
|
||||
i+=4 ;
|
||||
|
||||
}
|
||||
|
||||
#endif
|
||||
while(i < n)
|
||||
{
|
||||
|
||||
|
||||
@@ -39,7 +39,7 @@
|
||||
#include "common.h"
|
||||
|
||||
int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT beta,
|
||||
FLOAT *dummy2, BLASLONG dummy3, FLOAT *dummy4, BLASLONG dummy5,
|
||||
IFLOAT *dummy2, BLASLONG dummy3, IFLOAT *dummy4, BLASLONG dummy5,
|
||||
FLOAT *c, BLASLONG ldc){
|
||||
|
||||
|
||||
|
||||
@@ -39,24 +39,24 @@
|
||||
#include <stdio.h>
|
||||
#include "common.h"
|
||||
|
||||
int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
|
||||
int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){
|
||||
BLASLONG i, j;
|
||||
|
||||
FLOAT *aoffset;
|
||||
FLOAT *aoffset1, *aoffset2, *aoffset3, *aoffset4;
|
||||
FLOAT *aoffset5, *aoffset6, *aoffset7, *aoffset8;
|
||||
FLOAT *aoffset9, *aoffset10, *aoffset11, *aoffset12;
|
||||
FLOAT *aoffset13, *aoffset14, *aoffset15, *aoffset16;
|
||||
IFLOAT *aoffset;
|
||||
IFLOAT *aoffset1, *aoffset2, *aoffset3, *aoffset4;
|
||||
IFLOAT *aoffset5, *aoffset6, *aoffset7, *aoffset8;
|
||||
IFLOAT *aoffset9, *aoffset10, *aoffset11, *aoffset12;
|
||||
IFLOAT *aoffset13, *aoffset14, *aoffset15, *aoffset16;
|
||||
|
||||
FLOAT *boffset;
|
||||
FLOAT ctemp01, ctemp02, ctemp03, ctemp04;
|
||||
FLOAT ctemp05, ctemp06, ctemp07, ctemp08;
|
||||
FLOAT ctemp09, ctemp10, ctemp11, ctemp12;
|
||||
FLOAT ctemp13, ctemp14, ctemp15, ctemp16;
|
||||
FLOAT ctemp17, ctemp18, ctemp19, ctemp20;
|
||||
FLOAT ctemp21, ctemp22, ctemp23, ctemp24;
|
||||
FLOAT ctemp25, ctemp26, ctemp27, ctemp28;
|
||||
FLOAT ctemp29, ctemp30, ctemp31, ctemp32;
|
||||
IFLOAT *boffset;
|
||||
IFLOAT ctemp01, ctemp02, ctemp03, ctemp04;
|
||||
IFLOAT ctemp05, ctemp06, ctemp07, ctemp08;
|
||||
IFLOAT ctemp09, ctemp10, ctemp11, ctemp12;
|
||||
IFLOAT ctemp13, ctemp14, ctemp15, ctemp16;
|
||||
IFLOAT ctemp17, ctemp18, ctemp19, ctemp20;
|
||||
IFLOAT ctemp21, ctemp22, ctemp23, ctemp24;
|
||||
IFLOAT ctemp25, ctemp26, ctemp27, ctemp28;
|
||||
IFLOAT ctemp29, ctemp30, ctemp31, ctemp32;
|
||||
|
||||
aoffset = a;
|
||||
boffset = b;
|
||||
|
||||
@@ -39,10 +39,10 @@
|
||||
#include <stdio.h>
|
||||
#include "common.h"
|
||||
|
||||
int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
|
||||
int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){
|
||||
BLASLONG i, j;
|
||||
FLOAT *a_offset, *a_offset1, *a_offset2;
|
||||
FLOAT *b_offset;
|
||||
IFLOAT *a_offset, *a_offset1, *a_offset2;
|
||||
IFLOAT *b_offset;
|
||||
|
||||
a_offset = a;
|
||||
b_offset = b;
|
||||
|
||||
@@ -39,30 +39,30 @@
|
||||
#include <stdio.h>
|
||||
#include "common.h"
|
||||
|
||||
int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
|
||||
int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){
|
||||
BLASLONG i, j;
|
||||
|
||||
FLOAT *aoffset;
|
||||
FLOAT *aoffset1, *aoffset2, *aoffset3, *aoffset4;
|
||||
FLOAT *aoffset5, *aoffset6, *aoffset7, *aoffset8;
|
||||
IFLOAT *aoffset;
|
||||
IFLOAT *aoffset1, *aoffset2, *aoffset3, *aoffset4;
|
||||
IFLOAT *aoffset5, *aoffset6, *aoffset7, *aoffset8;
|
||||
|
||||
FLOAT *boffset;
|
||||
FLOAT ctemp01, ctemp02, ctemp03, ctemp04;
|
||||
FLOAT ctemp05, ctemp06, ctemp07, ctemp08;
|
||||
FLOAT ctemp09, ctemp10, ctemp11, ctemp12;
|
||||
FLOAT ctemp13, ctemp14, ctemp15, ctemp16;
|
||||
FLOAT ctemp17, ctemp18, ctemp19, ctemp20;
|
||||
FLOAT ctemp21, ctemp22, ctemp23, ctemp24;
|
||||
FLOAT ctemp25, ctemp26, ctemp27, ctemp28;
|
||||
FLOAT ctemp29, ctemp30, ctemp31, ctemp32;
|
||||
FLOAT ctemp33, ctemp34, ctemp35, ctemp36;
|
||||
FLOAT ctemp37, ctemp38, ctemp39, ctemp40;
|
||||
FLOAT ctemp41, ctemp42, ctemp43, ctemp44;
|
||||
FLOAT ctemp45, ctemp46, ctemp47, ctemp48;
|
||||
FLOAT ctemp49, ctemp50, ctemp51, ctemp52;
|
||||
FLOAT ctemp53, ctemp54, ctemp55, ctemp56;
|
||||
FLOAT ctemp57, ctemp58, ctemp59, ctemp60;
|
||||
FLOAT ctemp61, ctemp62, ctemp63, ctemp64;
|
||||
IFLOAT *boffset;
|
||||
IFLOAT ctemp01, ctemp02, ctemp03, ctemp04;
|
||||
IFLOAT ctemp05, ctemp06, ctemp07, ctemp08;
|
||||
IFLOAT ctemp09, ctemp10, ctemp11, ctemp12;
|
||||
IFLOAT ctemp13, ctemp14, ctemp15, ctemp16;
|
||||
IFLOAT ctemp17, ctemp18, ctemp19, ctemp20;
|
||||
IFLOAT ctemp21, ctemp22, ctemp23, ctemp24;
|
||||
IFLOAT ctemp25, ctemp26, ctemp27, ctemp28;
|
||||
IFLOAT ctemp29, ctemp30, ctemp31, ctemp32;
|
||||
IFLOAT ctemp33, ctemp34, ctemp35, ctemp36;
|
||||
IFLOAT ctemp37, ctemp38, ctemp39, ctemp40;
|
||||
IFLOAT ctemp41, ctemp42, ctemp43, ctemp44;
|
||||
IFLOAT ctemp45, ctemp46, ctemp47, ctemp48;
|
||||
IFLOAT ctemp49, ctemp50, ctemp51, ctemp52;
|
||||
IFLOAT ctemp53, ctemp54, ctemp55, ctemp56;
|
||||
IFLOAT ctemp57, ctemp58, ctemp59, ctemp60;
|
||||
IFLOAT ctemp61, ctemp62, ctemp63, ctemp64;
|
||||
|
||||
|
||||
aoffset = a;
|
||||
|
||||
@@ -39,22 +39,22 @@
|
||||
#include <stdio.h>
|
||||
#include "common.h"
|
||||
|
||||
int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
|
||||
int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){
|
||||
|
||||
BLASLONG i, j;
|
||||
|
||||
FLOAT *aoffset;
|
||||
FLOAT *aoffset1, *aoffset2;
|
||||
FLOAT *boffset;
|
||||
IFLOAT *aoffset;
|
||||
IFLOAT *aoffset1, *aoffset2;
|
||||
IFLOAT *boffset;
|
||||
|
||||
FLOAT ctemp01, ctemp02, ctemp03, ctemp04;
|
||||
FLOAT ctemp05, ctemp06, ctemp07, ctemp08;
|
||||
FLOAT ctemp09, ctemp10, ctemp11, ctemp12;
|
||||
FLOAT ctemp13, ctemp14, ctemp15, ctemp16;
|
||||
FLOAT ctemp17, ctemp18, ctemp19, ctemp20;
|
||||
FLOAT ctemp21, ctemp22, ctemp23, ctemp24;
|
||||
FLOAT ctemp25, ctemp26, ctemp27, ctemp28;
|
||||
FLOAT ctemp29, ctemp30, ctemp31, ctemp32;
|
||||
IFLOAT ctemp01, ctemp02, ctemp03, ctemp04;
|
||||
IFLOAT ctemp05, ctemp06, ctemp07, ctemp08;
|
||||
IFLOAT ctemp09, ctemp10, ctemp11, ctemp12;
|
||||
IFLOAT ctemp13, ctemp14, ctemp15, ctemp16;
|
||||
IFLOAT ctemp17, ctemp18, ctemp19, ctemp20;
|
||||
IFLOAT ctemp21, ctemp22, ctemp23, ctemp24;
|
||||
IFLOAT ctemp25, ctemp26, ctemp27, ctemp28;
|
||||
IFLOAT ctemp29, ctemp30, ctemp31, ctemp32;
|
||||
|
||||
aoffset = a;
|
||||
boffset = b;
|
||||
|
||||
@@ -39,11 +39,11 @@
|
||||
#include <stdio.h>
|
||||
#include "common.h"
|
||||
|
||||
int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
|
||||
int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){
|
||||
BLASLONG i, j;
|
||||
|
||||
FLOAT *a_offset, *a_offset1, *a_offset2;
|
||||
FLOAT *b_offset, *b_offset1, *b_offset2;
|
||||
IFLOAT *a_offset, *a_offset1, *a_offset2;
|
||||
IFLOAT *b_offset, *b_offset1, *b_offset2;
|
||||
|
||||
a_offset = a;
|
||||
b_offset = b;
|
||||
|
||||
@@ -39,32 +39,32 @@
|
||||
#include <stdio.h>
|
||||
#include "common.h"
|
||||
|
||||
int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
|
||||
int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){
|
||||
|
||||
BLASLONG i, j;
|
||||
|
||||
FLOAT *aoffset;
|
||||
FLOAT *aoffset1, *aoffset2, *aoffset3, *aoffset4;
|
||||
FLOAT *aoffset5, *aoffset6, *aoffset7, *aoffset8;
|
||||
IFLOAT *aoffset;
|
||||
IFLOAT *aoffset1, *aoffset2, *aoffset3, *aoffset4;
|
||||
IFLOAT *aoffset5, *aoffset6, *aoffset7, *aoffset8;
|
||||
|
||||
FLOAT *boffset, *boffset1, *boffset2, *boffset3, *boffset4;
|
||||
IFLOAT *boffset, *boffset1, *boffset2, *boffset3, *boffset4;
|
||||
|
||||
FLOAT ctemp01, ctemp02, ctemp03, ctemp04;
|
||||
FLOAT ctemp05, ctemp06, ctemp07, ctemp08;
|
||||
FLOAT ctemp09, ctemp10, ctemp11, ctemp12;
|
||||
FLOAT ctemp13, ctemp14, ctemp15, ctemp16;
|
||||
FLOAT ctemp17, ctemp18, ctemp19, ctemp20;
|
||||
FLOAT ctemp21, ctemp22, ctemp23, ctemp24;
|
||||
FLOAT ctemp25, ctemp26, ctemp27, ctemp28;
|
||||
FLOAT ctemp29, ctemp30, ctemp31, ctemp32;
|
||||
FLOAT ctemp33, ctemp34, ctemp35, ctemp36;
|
||||
FLOAT ctemp37, ctemp38, ctemp39, ctemp40;
|
||||
FLOAT ctemp41, ctemp42, ctemp43, ctemp44;
|
||||
FLOAT ctemp45, ctemp46, ctemp47, ctemp48;
|
||||
FLOAT ctemp49, ctemp50, ctemp51, ctemp52;
|
||||
FLOAT ctemp53, ctemp54, ctemp55, ctemp56;
|
||||
FLOAT ctemp57, ctemp58, ctemp59, ctemp60;
|
||||
FLOAT ctemp61, ctemp62, ctemp63, ctemp64;
|
||||
IFLOAT ctemp01, ctemp02, ctemp03, ctemp04;
|
||||
IFLOAT ctemp05, ctemp06, ctemp07, ctemp08;
|
||||
IFLOAT ctemp09, ctemp10, ctemp11, ctemp12;
|
||||
IFLOAT ctemp13, ctemp14, ctemp15, ctemp16;
|
||||
IFLOAT ctemp17, ctemp18, ctemp19, ctemp20;
|
||||
IFLOAT ctemp21, ctemp22, ctemp23, ctemp24;
|
||||
IFLOAT ctemp25, ctemp26, ctemp27, ctemp28;
|
||||
IFLOAT ctemp29, ctemp30, ctemp31, ctemp32;
|
||||
IFLOAT ctemp33, ctemp34, ctemp35, ctemp36;
|
||||
IFLOAT ctemp37, ctemp38, ctemp39, ctemp40;
|
||||
IFLOAT ctemp41, ctemp42, ctemp43, ctemp44;
|
||||
IFLOAT ctemp45, ctemp46, ctemp47, ctemp48;
|
||||
IFLOAT ctemp49, ctemp50, ctemp51, ctemp52;
|
||||
IFLOAT ctemp53, ctemp54, ctemp55, ctemp56;
|
||||
IFLOAT ctemp57, ctemp58, ctemp59, ctemp60;
|
||||
IFLOAT ctemp61, ctemp62, ctemp63, ctemp64;
|
||||
|
||||
aoffset = a;
|
||||
boffset = b;
|
||||
|
||||
@@ -1,13 +1,32 @@
|
||||
#include "common.h"
|
||||
int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc
|
||||
#if defined(BFLOAT16) && defined(BFLOAT16CONVERSION)
|
||||
static float
|
||||
bfloat16tof32 (bfloat16 f16)
|
||||
{
|
||||
float result = 0;
|
||||
unsigned short* q = (unsigned short*)(&result);
|
||||
#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
|
||||
q[0] = f16;
|
||||
#else
|
||||
q[1] = f16;
|
||||
#endif
|
||||
return result;
|
||||
}
|
||||
#define BF16TOF32(x) (bfloat16tof32(x))
|
||||
#else
|
||||
#define BF16TOF32(x) x
|
||||
#endif
|
||||
int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,IFLOAT* ba,IFLOAT* bb,FLOAT* C,BLASLONG ldc
|
||||
#ifdef TRMMKERNEL
|
||||
,BLASLONG offset
|
||||
#endif
|
||||
)
|
||||
{
|
||||
BLASLONG i,j,k;
|
||||
FLOAT *C0,*C1,*ptrba,*ptrbb;
|
||||
FLOAT res0,res1,res2,res3,load0,load1,load2,load3,load4,load5,load6,load7;
|
||||
FLOAT *C0,*C1;
|
||||
IFLOAT *ptrba,*ptrbb;
|
||||
FLOAT res0,res1,res2,res3;
|
||||
IFLOAT load0,load1,load2,load3,load4,load5,load6,load7;
|
||||
for (j=0; j<bn/2; j+=1)
|
||||
{
|
||||
C0 = C;
|
||||
@@ -24,36 +43,36 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FL
|
||||
{
|
||||
load0 = ptrba[2*0+0];
|
||||
load1 = ptrbb[2*0+0];
|
||||
res0 = res0+load0*load1;
|
||||
res0 = res0+BF16TOF32(load0)*BF16TOF32(load1);
|
||||
load2 = ptrba[2*0+1];
|
||||
res1 = res1+load2*load1;
|
||||
res1 = res1+BF16TOF32(load2)*BF16TOF32(load1);
|
||||
load3 = ptrbb[2*0+1];
|
||||
res2 = res2+load0*load3;
|
||||
res3 = res3+load2*load3;
|
||||
res2 = res2+BF16TOF32(load0)*BF16TOF32(load3);
|
||||
res3 = res3+BF16TOF32(load2)*BF16TOF32(load3);
|
||||
load4 = ptrba[2*1+0];
|
||||
load5 = ptrbb[2*1+0];
|
||||
res0 = res0+load4*load5;
|
||||
res0 = res0+BF16TOF32(load4)*BF16TOF32(load5);
|
||||
load6 = ptrba[2*1+1];
|
||||
res1 = res1+load6*load5;
|
||||
res1 = res1+BF16TOF32(load6)*BF16TOF32(load5);
|
||||
load7 = ptrbb[2*1+1];
|
||||
res2 = res2+load4*load7;
|
||||
res3 = res3+load6*load7;
|
||||
res2 = res2+BF16TOF32(load4)*BF16TOF32(load7);
|
||||
res3 = res3+BF16TOF32(load6)*BF16TOF32(load7);
|
||||
load0 = ptrba[2*2+0];
|
||||
load1 = ptrbb[2*2+0];
|
||||
res0 = res0+load0*load1;
|
||||
res0 = res0+BF16TOF32(load0)*BF16TOF32(load1);
|
||||
load2 = ptrba[2*2+1];
|
||||
res1 = res1+load2*load1;
|
||||
res1 = res1+BF16TOF32(load2)*BF16TOF32(load1);
|
||||
load3 = ptrbb[2*2+1];
|
||||
res2 = res2+load0*load3;
|
||||
res3 = res3+load2*load3;
|
||||
res2 = res2+BF16TOF32(load0)*BF16TOF32(load3);
|
||||
res3 = res3+BF16TOF32(load2)*BF16TOF32(load3);
|
||||
load4 = ptrba[2*3+0];
|
||||
load5 = ptrbb[2*3+0];
|
||||
res0 = res0+load4*load5;
|
||||
res0 = res0+BF16TOF32(load4)*BF16TOF32(load5);
|
||||
load6 = ptrba[2*3+1];
|
||||
res1 = res1+load6*load5;
|
||||
res1 = res1+BF16TOF32(load6)*BF16TOF32(load5);
|
||||
load7 = ptrbb[2*3+1];
|
||||
res2 = res2+load4*load7;
|
||||
res3 = res3+load6*load7;
|
||||
res2 = res2+BF16TOF32(load4)*BF16TOF32(load7);
|
||||
res3 = res3+BF16TOF32(load6)*BF16TOF32(load7);
|
||||
ptrba = ptrba+8;
|
||||
ptrbb = ptrbb+8;
|
||||
}
|
||||
@@ -61,12 +80,12 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FL
|
||||
{
|
||||
load0 = ptrba[2*0+0];
|
||||
load1 = ptrbb[2*0+0];
|
||||
res0 = res0+load0*load1;
|
||||
res0 = res0+BF16TOF32(load0)*BF16TOF32(load1);
|
||||
load2 = ptrba[2*0+1];
|
||||
res1 = res1+load2*load1;
|
||||
res1 = res1+BF16TOF32(load2)*BF16TOF32(load1);
|
||||
load3 = ptrbb[2*0+1];
|
||||
res2 = res2+load0*load3;
|
||||
res3 = res3+load2*load3;
|
||||
res2 = res2+BF16TOF32(load0)*BF16TOF32(load3);
|
||||
res3 = res3+BF16TOF32(load2)*BF16TOF32(load3);
|
||||
ptrba = ptrba+2;
|
||||
ptrbb = ptrbb+2;
|
||||
}
|
||||
@@ -90,9 +109,9 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FL
|
||||
{
|
||||
load0 = ptrba[0+0];
|
||||
load1 = ptrbb[2*0+0];
|
||||
res0 = res0+load0*load1;
|
||||
res0 = res0+BF16TOF32(load0)*BF16TOF32(load1);
|
||||
load2 = ptrbb[2*0+1];
|
||||
res1 = res1+load0*load2;
|
||||
res1 = res1+BF16TOF32(load0)*BF16TOF32(load2);
|
||||
ptrba = ptrba+1;
|
||||
ptrbb = ptrbb+2;
|
||||
}
|
||||
@@ -121,9 +140,9 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FL
|
||||
{
|
||||
load0 = ptrba[2*0+0];
|
||||
load1 = ptrbb[0+0];
|
||||
res0 = res0+load0*load1;
|
||||
res0 = res0+BF16TOF32(load0)*BF16TOF32(load1);
|
||||
load2 = ptrba[2*0+1];
|
||||
res1 = res1+load2*load1;
|
||||
res1 = res1+BF16TOF32(load2)*BF16TOF32(load1);
|
||||
ptrba = ptrba+2;
|
||||
ptrbb = ptrbb+1;
|
||||
}
|
||||
@@ -141,7 +160,7 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FL
|
||||
{
|
||||
load0 = ptrba[0+0];
|
||||
load1 = ptrbb[0+0];
|
||||
res0 = res0+load0*load1;
|
||||
res0 = res0+BF16TOF32(load0)*BF16TOF32(load1);
|
||||
ptrba = ptrba+1;
|
||||
ptrbb = ptrbb+1;
|
||||
}
|
||||
|
||||
1
kernel/mips/KERNEL.MIPS24K
Normal file
1
kernel/mips/KERNEL.MIPS24K
Normal file
@@ -0,0 +1 @@
|
||||
include $(KERNELDIR)/KERNEL.P5600
|
||||
160
kernel/mips64/KERNEL.generic
Normal file
160
kernel/mips64/KERNEL.generic
Normal file
@@ -0,0 +1,160 @@
|
||||
SGEMM_BETA = ../generic/gemm_beta.c
|
||||
DGEMM_BETA = ../generic/gemm_beta.c
|
||||
CGEMM_BETA = ../generic/zgemm_beta.c
|
||||
ZGEMM_BETA = ../generic/zgemm_beta.c
|
||||
|
||||
STRMMKERNEL = ../generic/trmmkernel_2x2.c
|
||||
DTRMMKERNEL = ../generic/trmmkernel_2x2.c
|
||||
CTRMMKERNEL = ../generic/ztrmmkernel_2x2.c
|
||||
ZTRMMKERNEL = ../generic/ztrmmkernel_2x2.c
|
||||
|
||||
SGEMMKERNEL = ../generic/gemmkernel_2x2.c
|
||||
SGEMMONCOPY = ../generic/gemm_ncopy_2.c
|
||||
SGEMMOTCOPY = ../generic/gemm_tcopy_2.c
|
||||
SGEMMONCOPYOBJ = sgemm_oncopy.o
|
||||
SGEMMOTCOPYOBJ = sgemm_otcopy.o
|
||||
|
||||
DGEMMKERNEL = ../generic/gemmkernel_2x2.c
|
||||
DGEMMONCOPY = ../generic/gemm_ncopy_2.c
|
||||
DGEMMOTCOPY = ../generic/gemm_tcopy_2.c
|
||||
DGEMMONCOPYOBJ = dgemm_oncopy.o
|
||||
DGEMMOTCOPYOBJ = dgemm_otcopy.o
|
||||
|
||||
CGEMMKERNEL = ../generic/zgemmkernel_2x2.c
|
||||
CGEMMONCOPY = ../generic/zgemm_ncopy_2.c
|
||||
CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
|
||||
CGEMMONCOPYOBJ = cgemm_oncopy.o
|
||||
CGEMMOTCOPYOBJ = cgemm_otcopy.o
|
||||
|
||||
ZGEMMKERNEL = ../generic/zgemmkernel_2x2.c
|
||||
ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c
|
||||
ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
|
||||
ZGEMMONCOPYOBJ = zgemm_oncopy.o
|
||||
ZGEMMOTCOPYOBJ = zgemm_otcopy.o
|
||||
|
||||
STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||
STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||
STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||
STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||
|
||||
DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||
DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||
DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||
DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||
|
||||
CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||
CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||
CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||
CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||
|
||||
ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||
ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||
ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||
ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||
|
||||
#Pure C for other kernels
|
||||
SAMAXKERNEL = ../mips/amax.c
|
||||
DAMAXKERNEL = ../mips/amax.c
|
||||
CAMAXKERNEL = ../mips/zamax.c
|
||||
ZAMAXKERNEL = ../mips/zamax.c
|
||||
|
||||
SAMINKERNEL = ../mips/amin.c
|
||||
DAMINKERNEL = ../mips/amin.c
|
||||
CAMINKERNEL = ../mips/zamin.c
|
||||
ZAMINKERNEL = ../mips/zamin.c
|
||||
|
||||
SMAXKERNEL = ../mips/max.c
|
||||
DMAXKERNEL = ../mips/max.c
|
||||
|
||||
SMINKERNEL = ../mips/min.c
|
||||
DMINKERNEL = ../mips/min.c
|
||||
|
||||
ISAMAXKERNEL = ../mips/iamax.c
|
||||
IDAMAXKERNEL = ../mips/iamax.c
|
||||
ICAMAXKERNEL = ../mips/izamax.c
|
||||
IZAMAXKERNEL = ../mips/izamax.c
|
||||
|
||||
ISAMINKERNEL = ../mips/iamin.c
|
||||
IDAMINKERNEL = ../mips/iamin.c
|
||||
ICAMINKERNEL = ../mips/izamin.c
|
||||
IZAMINKERNEL = ../mips/izamin.c
|
||||
|
||||
ISMAXKERNEL = ../mips/imax.c
|
||||
IDMAXKERNEL = ../mips/imax.c
|
||||
|
||||
ISMINKERNEL = ../mips/imin.c
|
||||
IDMINKERNEL = ../mips/imin.c
|
||||
|
||||
SASUMKERNEL = ../mips/asum.c
|
||||
DASUMKERNEL = ../mips/asum.c
|
||||
CASUMKERNEL = ../mips/zasum.c
|
||||
ZASUMKERNEL = ../mips/zasum.c
|
||||
|
||||
SSUMKERNEL = ../mips/sum.c
|
||||
DSUMKERNEL = ../mips/sum.c
|
||||
CSUMKERNEL = ../mips/zsum.c
|
||||
ZSUMKERNEL = ../mips/zsum.c
|
||||
|
||||
SAXPYKERNEL = ../mips/axpy.c
|
||||
DAXPYKERNEL = ../mips/axpy.c
|
||||
CAXPYKERNEL = ../mips/zaxpy.c
|
||||
ZAXPYKERNEL = ../mips/zaxpy.c
|
||||
|
||||
SCOPYKERNEL = ../mips/copy.c
|
||||
DCOPYKERNEL = ../mips/copy.c
|
||||
CCOPYKERNEL = ../mips/zcopy.c
|
||||
ZCOPYKERNEL = ../mips/zcopy.c
|
||||
|
||||
SDOTKERNEL = ../mips/dot.c
|
||||
DDOTKERNEL = ../mips/dot.c
|
||||
CDOTKERNEL = ../mips/zdot.c
|
||||
ZDOTKERNEL = ../mips/zdot.c
|
||||
|
||||
SNRM2KERNEL = ../mips/nrm2.c
|
||||
DNRM2KERNEL = ../mips/nrm2.c
|
||||
CNRM2KERNEL = ../mips/znrm2.c
|
||||
ZNRM2KERNEL = ../mips/znrm2.c
|
||||
|
||||
SROTKERNEL = ../mips/rot.c
|
||||
DROTKERNEL = ../mips/rot.c
|
||||
CROTKERNEL = ../mips/zrot.c
|
||||
ZROTKERNEL = ../mips/zrot.c
|
||||
|
||||
SSCALKERNEL = ../mips/scal.c
|
||||
DSCALKERNEL = ../mips/scal.c
|
||||
CSCALKERNEL = ../mips/zscal.c
|
||||
ZSCALKERNEL = ../mips/zscal.c
|
||||
|
||||
SSWAPKERNEL = ../mips/swap.c
|
||||
DSWAPKERNEL = ../mips/swap.c
|
||||
CSWAPKERNEL = ../mips/zswap.c
|
||||
ZSWAPKERNEL = ../mips/zswap.c
|
||||
|
||||
SGEMVNKERNEL = ../mips/gemv_n.c
|
||||
DGEMVNKERNEL = ../mips/gemv_n.c
|
||||
CGEMVNKERNEL = ../mips/zgemv_n.c
|
||||
ZGEMVNKERNEL = ../mips/zgemv_n.c
|
||||
|
||||
SGEMVTKERNEL = ../mips/gemv_t.c
|
||||
DGEMVTKERNEL = ../mips/gemv_t.c
|
||||
CGEMVTKERNEL = ../mips/zgemv_t.c
|
||||
ZGEMVTKERNEL = ../mips/zgemv_t.c
|
||||
|
||||
SSYMV_U_KERNEL = ../generic/symv_k.c
|
||||
SSYMV_L_KERNEL = ../generic/symv_k.c
|
||||
DSYMV_U_KERNEL = ../generic/symv_k.c
|
||||
DSYMV_L_KERNEL = ../generic/symv_k.c
|
||||
QSYMV_U_KERNEL = ../generic/symv_k.c
|
||||
QSYMV_L_KERNEL = ../generic/symv_k.c
|
||||
CSYMV_U_KERNEL = ../generic/zsymv_k.c
|
||||
CSYMV_L_KERNEL = ../generic/zsymv_k.c
|
||||
ZSYMV_U_KERNEL = ../generic/zsymv_k.c
|
||||
ZSYMV_L_KERNEL = ../generic/zsymv_k.c
|
||||
XSYMV_U_KERNEL = ../generic/zsymv_k.c
|
||||
XSYMV_L_KERNEL = ../generic/zsymv_k.c
|
||||
|
||||
ZHEMV_U_KERNEL = ../generic/zhemv_k.c
|
||||
ZHEMV_L_KERNEL = ../generic/zhemv_k.c
|
||||
|
||||
CGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c
|
||||
ZGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c
|
||||
225
kernel/power/KERNEL.POWER10
Normal file
225
kernel/power/KERNEL.POWER10
Normal file
@@ -0,0 +1,225 @@
|
||||
ifeq ($(__BYTE_ORDER__),__ORDER_BIG_ENDIAN__)
|
||||
include $(KERNELDIR)/KERNEL.POWER8
|
||||
else
|
||||
|
||||
#SGEMM_BETA = ../generic/gemm_beta.c
|
||||
#DGEMM_BETA = ../generic/gemm_beta.c
|
||||
#CGEMM_BETA = ../generic/zgemm_beta.c
|
||||
#ZGEMM_BETA = ../generic/zgemm_beta.c
|
||||
|
||||
SBGEMM_BETA = ../generic/gemm_beta.c
|
||||
SBGEMMKERNEL = sbgemm_kernel_power10.c
|
||||
SBGEMMINCOPY = sbgemm_ncopy_16_power10.c
|
||||
SBGEMMITCOPY = sbgemm_tcopy_16_power10.c
|
||||
SBGEMMONCOPY = sbgemm_ncopy_8_power10.c
|
||||
SBGEMMOTCOPY = sbgemm_tcopy_8_power10.c
|
||||
SBGEMMINCOPYOBJ = sbgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||
SBGEMMITCOPYOBJ = sbgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||
SBGEMMONCOPYOBJ = sbgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
SBGEMMOTCOPYOBJ = sbgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
|
||||
STRMMKERNEL = sgemm_kernel_power10.c
|
||||
DTRMMKERNEL = dgemm_kernel_power10.c
|
||||
CTRMMKERNEL = cgemm_kernel_power10.S
|
||||
ZTRMMKERNEL = zgemm_kernel_power10.S
|
||||
|
||||
SGEMMKERNEL = sgemm_kernel_power10.c
|
||||
SGEMMINCOPY = ../generic/gemm_ncopy_16.c
|
||||
SGEMMITCOPY = sgemm_tcopy_16_power8.S
|
||||
SGEMMONCOPY = ../generic/gemm_ncopy_8.c
|
||||
SGEMMOTCOPY = sgemm_tcopy_8_power8.S
|
||||
SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||
SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
|
||||
DGEMMKERNEL = dgemm_kernel_power10.c
|
||||
DGEMMINCOPY = ../generic/gemm_ncopy_16.c
|
||||
DGEMMITCOPY = dgemm_tcopy_16_power8.S
|
||||
DGEMMONCOPY = dgemm_ncopy_4_power8.S
|
||||
DGEMMOTCOPY = ../generic/gemm_tcopy_4.c
|
||||
DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||
DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||
DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
|
||||
CGEMMKERNEL = cgemm_kernel_power10.S
|
||||
CGEMMINCOPY = ../generic/zgemm_ncopy_8.c
|
||||
CGEMMITCOPY = ../generic/zgemm_tcopy_8.c
|
||||
CGEMMONCOPY = ../generic/zgemm_ncopy_4.c
|
||||
CGEMMOTCOPY = ../generic/zgemm_tcopy_4.c
|
||||
CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||
CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||
|
||||
ZGEMMKERNEL = zgemm_kernel_power10.S
|
||||
ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c
|
||||
ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
|
||||
ZGEMMINCOPY = ../generic/zgemm_ncopy_8.c
|
||||
ZGEMMITCOPY = zgemm_tcopy_8_power8.S
|
||||
ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||
ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||
|
||||
STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||
STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||
STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||
STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||
|
||||
DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||
DTRSMKERNEL_LT = dtrsm_kernel_LT_16x4_power8.S
|
||||
DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||
DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||
|
||||
CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||
CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||
CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||
CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||
|
||||
ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||
ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||
ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||
ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||
|
||||
#Todo: CGEMM3MKERNEL should be 4x4 blocksizes.
|
||||
#CGEMM3MKERNEL = zgemm3m_kernel_8x4_sse3.S
|
||||
#ZGEMM3MKERNEL = zgemm3m_kernel_4x4_sse3.S
|
||||
|
||||
#Pure C for other kernels
|
||||
#SAMAXKERNEL = ../arm/amax.c
|
||||
#DAMAXKERNEL = ../arm/amax.c
|
||||
#CAMAXKERNEL = ../arm/zamax.c
|
||||
#ZAMAXKERNEL = ../arm/zamax.c
|
||||
#
|
||||
#SAMINKERNEL = ../arm/amin.c
|
||||
#DAMINKERNEL = ../arm/amin.c
|
||||
#CAMINKERNEL = ../arm/zamin.c
|
||||
#ZAMINKERNEL = ../arm/zamin.c
|
||||
#
|
||||
#SMAXKERNEL = ../arm/max.c
|
||||
#DMAXKERNEL = ../arm/max.c
|
||||
#
|
||||
#SMINKERNEL = ../arm/min.c
|
||||
#DMINKERNEL = ../arm/min.c
|
||||
#
|
||||
ifneq ($(GCCVERSIONGTEQ9),1)
|
||||
ISAMAXKERNEL = isamax_power9.S
|
||||
else
|
||||
ISAMAXKERNEL = isamax.c
|
||||
endif
|
||||
IDAMAXKERNEL = idamax.c
|
||||
ifneq ($(GCCVERSIONGTEQ9),1)
|
||||
ICAMAXKERNEL = icamax_power9.S
|
||||
else
|
||||
ICAMAXKERNEL = icamax.c
|
||||
endif
|
||||
IZAMAXKERNEL = izamax.c
|
||||
#
|
||||
ifneq ($(GCCVERSIONGTEQ9),1)
|
||||
ISAMINKERNEL = isamin_power9.S
|
||||
else
|
||||
ISAMINKERNEL = isamin.c
|
||||
endif
|
||||
IDAMINKERNEL = idamin.c
|
||||
ifneq ($(GCCVERSIONGTEQ9),1)
|
||||
ICAMINKERNEL = icamin_power9.S
|
||||
else
|
||||
ICAMINKERNEL = icamin.c
|
||||
endif
|
||||
IZAMINKERNEL = izamin.c
|
||||
#
|
||||
#ISMAXKERNEL = ../arm/imax.c
|
||||
#IDMAXKERNEL = ../arm/imax.c
|
||||
#
|
||||
#ISMINKERNEL = ../arm/imin.c
|
||||
#IDMINKERNEL = ../arm/imin.c
|
||||
#
|
||||
SASUMKERNEL = sasum.c
|
||||
DASUMKERNEL = dasum.c
|
||||
CASUMKERNEL = casum.c
|
||||
ZASUMKERNEL = zasum.c
|
||||
#
|
||||
SAXPYKERNEL = saxpy.c
|
||||
DAXPYKERNEL = daxpy_power10.c
|
||||
ifneq ($(GCCVERSIONGTEQ9),1)
|
||||
CAXPYKERNEL = caxpy_power9.S
|
||||
else
|
||||
CAXPYKERNEL = caxpy.c
|
||||
endif
|
||||
ZAXPYKERNEL = zaxpy_power10.c
|
||||
#
|
||||
SCOPYKERNEL = scopy.c
|
||||
DCOPYKERNEL = dcopy_power10.c
|
||||
CCOPYKERNEL = ccopy.c
|
||||
ZCOPYKERNEL = zcopy_power10.c
|
||||
#
|
||||
SDOTKERNEL = sdot.c
|
||||
DDOTKERNEL = ddot.c
|
||||
DSDOTKERNEL = sdot.c
|
||||
ifneq ($(GCCVERSIONGTEQ9),1)
|
||||
CDOTKERNEL = cdot_power9.S
|
||||
else
|
||||
CDOTKERNEL = cdot.c
|
||||
endif
|
||||
ZDOTKERNEL = zdot.c
|
||||
#
|
||||
SNRM2KERNEL = ../arm/nrm2.c
|
||||
DNRM2KERNEL = ../arm/nrm2.c
|
||||
CNRM2KERNEL = ../arm/znrm2.c
|
||||
ZNRM2KERNEL = ../arm/znrm2.c
|
||||
#
|
||||
SROTKERNEL = srot.c
|
||||
DROTKERNEL = drot.c
|
||||
CROTKERNEL = crot.c
|
||||
ZROTKERNEL = zrot.c
|
||||
#
|
||||
SSCALKERNEL = sscal.c
|
||||
DSCALKERNEL = dscal.c
|
||||
CSCALKERNEL = zscal.c
|
||||
ZSCALKERNEL = zscal.c
|
||||
#
|
||||
SSWAPKERNEL = sswap.c
|
||||
DSWAPKERNEL = dswap.c
|
||||
CSWAPKERNEL = cswap.c
|
||||
ZSWAPKERNEL = zswap.c
|
||||
#
|
||||
|
||||
SGEMVNKERNEL = sgemv_n.c
|
||||
DGEMVNKERNEL = dgemv_n_power10.c
|
||||
CGEMVNKERNEL = cgemv_n.c
|
||||
ZGEMVNKERNEL = zgemv_n_4.c
|
||||
#
|
||||
SGEMVTKERNEL = sgemv_t.c
|
||||
DGEMVTKERNEL = dgemv_t_power10.c
|
||||
CGEMVTKERNEL = cgemv_t.c
|
||||
ZGEMVTKERNEL = zgemv_t_4.c
|
||||
|
||||
|
||||
#SSYMV_U_KERNEL = ../generic/symv_k.c
|
||||
#SSYMV_L_KERNEL = ../generic/symv_k.c
|
||||
#DSYMV_U_KERNEL = ../generic/symv_k.c
|
||||
#DSYMV_L_KERNEL = ../generic/symv_k.c
|
||||
#QSYMV_U_KERNEL = ../generic/symv_k.c
|
||||
#QSYMV_L_KERNEL = ../generic/symv_k.c
|
||||
#CSYMV_U_KERNEL = ../generic/zsymv_k.c
|
||||
#CSYMV_L_KERNEL = ../generic/zsymv_k.c
|
||||
#ZSYMV_U_KERNEL = ../generic/zsymv_k.c
|
||||
#ZSYMV_L_KERNEL = ../generic/zsymv_k.c
|
||||
#XSYMV_U_KERNEL = ../generic/zsymv_k.c
|
||||
#XSYMV_L_KERNEL = ../generic/zsymv_k.c
|
||||
|
||||
#ZHEMV_U_KERNEL = ../generic/zhemv_k.c
|
||||
#ZHEMV_L_KERNEL = ../generic/zhemv_k.c
|
||||
|
||||
LSAME_KERNEL = ../generic/lsame.c
|
||||
SCABS_KERNEL = ../generic/cabs.c
|
||||
DCABS_KERNEL = ../generic/cabs.c
|
||||
QCABS_KERNEL = ../generic/cabs.c
|
||||
|
||||
#Dump kernel
|
||||
CGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c
|
||||
ZGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c
|
||||
|
||||
endif
|
||||
@@ -1,3 +1,44 @@
|
||||
# Big-endian 32bit (AIX) is supported through the POWER6 GEMM kernels, no separate TRMM
|
||||
ifeq ($(__BYTE_ORDER__)$(BINARY32),__ORDER_BIG_ENDIAN__1)
|
||||
SGEMMKERNEL = gemm_kernel_power6.S
|
||||
SGEMMINCOPY =
|
||||
SGEMMITCOPY =
|
||||
SGEMMONCOPY = gemm_ncopy_4.S
|
||||
SGEMMOTCOPY = gemm_tcopy_4.S
|
||||
SGEMMINCOPYOBJ =
|
||||
SGEMMITCOPYOBJ =
|
||||
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
DGEMMKERNEL = gemm_kernel_power6.S
|
||||
DGEMMINCOPY =
|
||||
DGEMMITCOPY =
|
||||
DGEMMONCOPY = gemm_ncopy_4.S
|
||||
DGEMMOTCOPY = gemm_tcopy_4.S
|
||||
DGEMMINCOPYOBJ =
|
||||
DGEMMITCOPYOBJ =
|
||||
DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
CGEMMKERNEL = zgemm_kernel_power6.S
|
||||
CGEMMINCOPY = ../generic/zgemm_ncopy_2.c
|
||||
CGEMMITCOPY = ../generic/zgemm_tcopy_2.c
|
||||
CGEMMONCOPY = ../generic/zgemm_ncopy_4.c
|
||||
CGEMMOTCOPY = ../generic/zgemm_tcopy_4.c
|
||||
CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||
CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||
CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
ZGEMMKERNEL = zgemm_kernel_power6.S
|
||||
ZGEMMINCOPY = ../generic/zgemm_ncopy_2.c
|
||||
ZGEMMITCOPY = ../generic/zgemm_tcopy_2.c
|
||||
ZGEMMONCOPY = ../generic/zgemm_ncopy_4.c
|
||||
ZGEMMOTCOPY = ../generic/zgemm_tcopy_4.c
|
||||
ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||
ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||
ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
|
||||
else
|
||||
|
||||
#SGEMM_BETA = ../generic/gemm_beta.c
|
||||
#DGEMM_BETA = ../generic/gemm_beta.c
|
||||
#CGEMM_BETA = ../generic/zgemm_beta.c
|
||||
@@ -12,7 +53,7 @@ SGEMMKERNEL = sgemm_kernel_16x8_power8.S
|
||||
SGEMMINCOPY = ../generic/gemm_ncopy_16.c
|
||||
SGEMMITCOPY = sgemm_tcopy_16_power8.S
|
||||
SGEMMONCOPY = ../generic/gemm_ncopy_8.c
|
||||
SGEMMOTCOPY = sgemm_tcopy_8_power8.S
|
||||
SGEMMOTCOPY = sgemm_tcopy_8_power8.S
|
||||
SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||
SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
@@ -47,16 +88,24 @@ ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||
ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||
endif
|
||||
|
||||
STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||
STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||
STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||
STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||
|
||||
ifeq ($(__BYTE_ORDER__)$(BINARY32),__ORDER_BIG_ENDIAN__1)
|
||||
DTRSMKERNEL_LN = trsm_kernel_power6_LN.S
|
||||
DTRSMKERNEL_LT = trsm_kernel_power6_LT.S
|
||||
DTRSMKERNEL_RN = trsm_kernel_power6_LT.S
|
||||
DTRSMKERNEL_RT = trsm_kernel_power6_RT.S
|
||||
else
|
||||
DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||
DTRSMKERNEL_LT = dtrsm_kernel_LT_16x4_power8.S
|
||||
DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||
DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||
endif
|
||||
|
||||
CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||
CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||
@@ -153,6 +202,10 @@ ZASUMKERNEL = zasum.c
|
||||
#
|
||||
SAXPYKERNEL = saxpy.c
|
||||
DAXPYKERNEL = daxpy.c
|
||||
#
|
||||
ifeq ($(__BYTE_ORDER__)$(BINARY32),__ORDER_BIG_ENDIAN__1)
|
||||
CAXPYKERNEL = zaxpy.S
|
||||
else
|
||||
ifneq ($(__BYTE_ORDER__),__ORDER_BIG_ENDIAN__)
|
||||
ifneq ($(GCCVERSIONGTEQ9),1)
|
||||
CAXPYKERNEL = caxpy_power8.S
|
||||
@@ -162,6 +215,7 @@ endif
|
||||
else
|
||||
CAXPYKERNEL = caxpy.c
|
||||
endif
|
||||
endif
|
||||
#
|
||||
ZAXPYKERNEL = zaxpy.c
|
||||
#
|
||||
@@ -232,3 +286,10 @@ QCABS_KERNEL = ../generic/cabs.c
|
||||
#Dump kernel
|
||||
CGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c
|
||||
ZGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c
|
||||
|
||||
ifeq ($(__BYTE_ORDER__)$(ELF_VERSION),__ORDER_BIG_ENDIAN__2)
|
||||
IDAMAXKERNEL = ../arm/iamax.c
|
||||
IDAMINKERNEL = ../arm/iamin.c
|
||||
IZAMAXKERNEL = ../arm/izamax.c
|
||||
IZAMINKERNEL = ../arm/izamin.c
|
||||
endif
|
||||
|
||||
@@ -1,3 +1,7 @@
|
||||
ifeq ($(__BYTE_ORDER__),__ORDER_BIG_ENDIAN__)
|
||||
include $(KERNELDIR)/KERNEL.POWER8
|
||||
else
|
||||
|
||||
#SGEMM_BETA = ../generic/gemm_beta.c
|
||||
#DGEMM_BETA = ../generic/gemm_beta.c
|
||||
#CGEMM_BETA = ../generic/zgemm_beta.c
|
||||
@@ -12,7 +16,7 @@ SGEMMKERNEL = sgemm_kernel_power9.S
|
||||
SGEMMINCOPY = ../generic/gemm_ncopy_16.c
|
||||
SGEMMITCOPY = sgemm_tcopy_16_power8.S
|
||||
SGEMMONCOPY = ../generic/gemm_ncopy_8.c
|
||||
SGEMMOTCOPY = sgemm_tcopy_8_power8.S
|
||||
SGEMMOTCOPY = sgemm_tcopy_8_power8.S
|
||||
SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||
SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
@@ -206,3 +210,5 @@ QCABS_KERNEL = ../generic/cabs.c
|
||||
#Dump kernel
|
||||
CGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c
|
||||
ZGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c
|
||||
|
||||
endif
|
||||
|
||||
@@ -20,8 +20,10 @@ ZAXPYKERNEL = zaxpy_ppc440.S
|
||||
|
||||
SDOTKERNEL = dot_ppc440.S
|
||||
DDOTKERNEL = dot_ppc440.S
|
||||
CDOTKERNEL = zdot_ppc440.S
|
||||
ZDOTKERNEL = zdot_ppc440.S
|
||||
#CDOTKERNEL = zdot_ppc440.S
|
||||
#ZDOTKERNEL = zdot_ppc440.S
|
||||
CDOTKERNEL = ../arm/zdot.c
|
||||
ZDOTKERNEL = ../arm/zdot.c
|
||||
|
||||
ISAMAXKERNEL = iamax_ppc440.S
|
||||
IDAMAXKERNEL = iamax_ppc440.S
|
||||
@@ -52,8 +54,11 @@ ZNRM2KERNEL = znrm2_ppc440.S
|
||||
|
||||
SROTKERNEL = rot_ppc440.S
|
||||
DROTKERNEL = rot_ppc440.S
|
||||
CROTKERNEL = zrot_ppc440.S
|
||||
ZROTKERNEL = zrot_ppc440.S
|
||||
#CROTKERNEL = zrot_ppc440.S
|
||||
#ZROTKERNEL = zrot_ppc440.S
|
||||
CROTKERNEL = ../arm/zrot.c
|
||||
ZROTKERNEL = ../arm/zrot.c
|
||||
|
||||
|
||||
SSCALKERNEL = scal_ppc440.S
|
||||
DSCALKERNEL = scal_ppc440.S
|
||||
@@ -78,13 +83,18 @@ DGEMMINCOPYOBJ =
|
||||
DGEMMITCOPYOBJ =
|
||||
DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
CGEMMKERNEL = zgemm_kernel_altivec_g4.S
|
||||
CGEMMINCOPY = ../generic/zgemm_ncopy_8.c
|
||||
CGEMMITCOPY = ../generic/zgemm_tcopy_8.c
|
||||
#CGEMMKERNEL = zgemm_kernel_altivec_g4.S
|
||||
#CGEMMINCOPY = ../generic/zgemm_ncopy_8.c
|
||||
#CGEMMITCOPY = ../generic/zgemm_tcopy_8.c
|
||||
CGEMMKERNEL = zgemm_kernel.S
|
||||
CGEMMINCOPY =
|
||||
CGEMMONCOPY =
|
||||
CGEMMONCOPY = ../generic/zgemm_ncopy_2.c
|
||||
CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
|
||||
CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||
CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||
CGEMMINCOPYOBJ =
|
||||
#cgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||
CGEMMITCOPYOBJ =
|
||||
#cgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||
CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
ZGEMMKERNEL = zgemm_kernel_g4.S
|
||||
|
||||
@@ -46,9 +46,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
#endif
|
||||
|
||||
#if defined(POWER8) || defined(POWER9)
|
||||
#if defined(POWER8) || defined(POWER9) || defined(POWER10)
|
||||
#if defined(__VEC__) || defined(__ALTIVEC__)
|
||||
#include "casum_microk_power8.c"
|
||||
#endif
|
||||
#endif
|
||||
|
||||
|
||||
#ifndef HAVE_KERNEL_16
|
||||
|
||||
@@ -13,7 +13,11 @@
|
||||
|
||||
PROLOGUE
|
||||
|
||||
#ifdef CONJ
|
||||
caxpyc_k:
|
||||
#else
|
||||
caxpy_k:
|
||||
#endif
|
||||
.LCF0:
|
||||
0: addis 2,12,.TOC.-.LCF0@ha
|
||||
addi 2,2,.TOC.-.LCF0@l
|
||||
|
||||
@@ -35,9 +35,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
#include "common.h"
|
||||
|
||||
#if defined(POWER8) || defined(POWER9)
|
||||
#if defined(POWER8) || defined(POWER9) || defined(POWER10)
|
||||
#if defined(__VEC__) || defined(__ALTIVEC__)
|
||||
#include "ccopy_microk_power8.c"
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#ifndef HAVE_KERNEL_32
|
||||
|
||||
|
||||
@@ -23,6 +23,9 @@ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
#if !defined(__VEC__) || !defined(__ALTIVEC__)
|
||||
#include "../arm/zdot.c"
|
||||
#else
|
||||
|
||||
#include "common.h"
|
||||
#ifndef HAVE_KERNEL_8
|
||||
@@ -168,3 +171,4 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLA
|
||||
return (result);
|
||||
|
||||
}
|
||||
#endif
|
||||
|
||||
@@ -424,7 +424,7 @@ L999:
|
||||
lwz r16, 204(SP)
|
||||
lwz r15, 208(SP)
|
||||
lwz r14, 212(SP)
|
||||
addi r11, 224
|
||||
addi r11, SP, 224
|
||||
#endif
|
||||
lvx v20, r11, r0
|
||||
addi r11, r11, 16
|
||||
@@ -459,4 +459,4 @@ L999:
|
||||
blr
|
||||
|
||||
EPILOGUE
|
||||
#endif^
|
||||
#endif
|
||||
|
||||
286
kernel/power/cgemm_kernel_power10.S
Normal file
286
kernel/power/cgemm_kernel_power10.S
Normal file
@@ -0,0 +1,286 @@
|
||||
/***************************************************************************
|
||||
Copyright (c) 2013-2020, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#define ASSEMBLER
|
||||
#include "common.h"
|
||||
#include "def_vsx.h"
|
||||
|
||||
|
||||
#define LOAD ld
|
||||
#define STACKSIZE (512 )
|
||||
#define FLINK_SAVE (STACKSIZE+16) /* 16($r12) */
|
||||
#define M r3
|
||||
#define N r4
|
||||
#define K r5
|
||||
|
||||
|
||||
#define A r8
|
||||
#define B r9
|
||||
#define C r10
|
||||
#define LDC r6
|
||||
#define OFFSET r7
|
||||
|
||||
|
||||
#define alpha_r vs51
|
||||
#define alpha_i vs55
|
||||
#define save_permute_1 vs59
|
||||
#define permute_mask vs63
|
||||
#define o0 0
|
||||
|
||||
|
||||
#define T1 r11
|
||||
#define T2 r12
|
||||
#define T3 r14
|
||||
#define T4 r15
|
||||
#define T5 r16
|
||||
#define T6 r17
|
||||
#define L r18
|
||||
#define T7 r19
|
||||
#define T8 r20
|
||||
#define TEMP_REG r21
|
||||
#define I r22
|
||||
#define J r23
|
||||
#define AO r24
|
||||
#define BO r25
|
||||
#define CO r26
|
||||
#define T9 r27
|
||||
#define T10 r28
|
||||
#define PRE r29
|
||||
|
||||
#define T12 r30
|
||||
#define T13 r31
|
||||
|
||||
#include "cgemm_macros_power10.S"
|
||||
|
||||
.equ perm_const1, 0x0405060700010203
|
||||
.equ perm_const2, 0x0c0d0e0f08090a0b
|
||||
.equ save_permute_12, 0x0c0d0e0f1c1d1e1f
|
||||
.equ save_permute_11, 0x0405060714151617
|
||||
|
||||
|
||||
|
||||
#ifndef NEEDPARAM
|
||||
|
||||
PROLOGUE
|
||||
PROFCODE
|
||||
|
||||
|
||||
addi SP, SP, -STACKSIZE
|
||||
mflr r0
|
||||
|
||||
|
||||
stfd f14, 0(SP)
|
||||
stfd f15, 8(SP)
|
||||
stfd f16, 16(SP)
|
||||
stfd f17, 24(SP)
|
||||
|
||||
stfd f18, 32(SP)
|
||||
stfd f19, 40(SP)
|
||||
stfd f20, 48(SP)
|
||||
stfd f21, 56(SP)
|
||||
|
||||
stfd f22, 64(SP)
|
||||
stfd f23, 72(SP)
|
||||
stfd f24, 80(SP)
|
||||
stfd f25, 88(SP)
|
||||
|
||||
stfd f26, 96(SP)
|
||||
stfd f27, 104(SP)
|
||||
stfd f28, 112(SP)
|
||||
stfd f29, 120(SP)
|
||||
|
||||
stfd f30, 128(SP)
|
||||
stfd f31, 136(SP)
|
||||
|
||||
|
||||
std r31, 144(SP)
|
||||
std r30, 152(SP)
|
||||
std r29, 160(SP)
|
||||
std r28, 168(SP)
|
||||
std r27, 176(SP)
|
||||
std r26, 184(SP)
|
||||
std r25, 192(SP)
|
||||
std r24, 200(SP)
|
||||
std r23, 208(SP)
|
||||
std r22, 216(SP)
|
||||
std r21, 224(SP)
|
||||
std r20, 232(SP)
|
||||
std r19, 240(SP)
|
||||
std r18, 248(SP)
|
||||
std r17, 256(SP)
|
||||
std r16, 264(SP)
|
||||
std r15, 272(SP)
|
||||
std r14, 280(SP)
|
||||
|
||||
|
||||
stxv vs52, 288(SP)
|
||||
stxv vs53, 304(SP)
|
||||
stxv vs54, 320(SP)
|
||||
stxv vs55, 336(SP)
|
||||
stxv vs56, 352(SP)
|
||||
stxv vs57, 368(SP)
|
||||
stxv vs58, 384(SP)
|
||||
stxv vs59, 400(SP)
|
||||
stxv vs60, 416(SP)
|
||||
stxv vs61, 432(SP)
|
||||
stxv vs62, 448(SP)
|
||||
stxv vs63, 464(SP)
|
||||
std r0, FLINK_SAVE(SP)
|
||||
|
||||
|
||||
|
||||
ld LDC, FRAMESLOT(0) + STACKSIZE(SP)
|
||||
|
||||
|
||||
|
||||
#ifdef TRMMKERNEL
|
||||
ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP)
|
||||
#endif
|
||||
slwi LDC, LDC, ZBASE_SHIFT
|
||||
|
||||
|
||||
|
||||
/*alpha is stored in f1. convert to single and splat*/
|
||||
xscvdpspn alpha_r,vs1
|
||||
xscvdpspn alpha_i,vs2
|
||||
xxspltw alpha_r,alpha_r,0
|
||||
xxspltw alpha_i,alpha_i,0
|
||||
/*load reverse permute mask for big endian
|
||||
uint128 = 0xc0d0e0f08090a0b0405060700010203
|
||||
*/
|
||||
|
||||
lis T2, perm_const2@highest
|
||||
lis T1, perm_const1@highest
|
||||
lis T3, save_permute_12@highest
|
||||
lis T4, save_permute_11@highest
|
||||
|
||||
|
||||
ori T2, T2, perm_const2@higher
|
||||
ori T1, T1, perm_const1@higher
|
||||
ori T3, T3, save_permute_12@higher
|
||||
ori T4, T4, save_permute_11@higher
|
||||
|
||||
|
||||
rldicr T2, T2, 32, 31
|
||||
rldicr T1, T1, 32, 31
|
||||
rldicr T3, T3, 32, 31
|
||||
rldicr T4, T4, 32, 31
|
||||
|
||||
oris T2, T2, perm_const2@h
|
||||
oris T1, T1, perm_const1@h
|
||||
oris T3, T3, save_permute_12@h
|
||||
oris T4, T4, save_permute_11@h
|
||||
|
||||
|
||||
ori T2, T2, perm_const2@l
|
||||
ori T1, T1, perm_const1@l
|
||||
ori T3, T3, save_permute_12@l
|
||||
ori T4, T4, save_permute_11@l
|
||||
|
||||
|
||||
li r0,0
|
||||
li PRE,512
|
||||
|
||||
#if defined(CC) || defined(CR) || defined(RC) || defined(RR)
|
||||
/*negate for this case as we will use addition -1*(a+b) */
|
||||
xvnegsp alpha_r,alpha_r
|
||||
xvnegsp alpha_i,alpha_i
|
||||
#endif
|
||||
|
||||
mtvsrdd permute_mask,T2,T1
|
||||
mtvsrdd save_permute_1,T3,T4
|
||||
|
||||
/*mask is reverse permute so we have to make it inner permute */
|
||||
xxpermdi permute_mask, permute_mask, permute_mask,2
|
||||
|
||||
#include "cgemm_logic_power10.S"
|
||||
|
||||
.L999:
|
||||
lfd f14, 0(SP)
|
||||
lfd f15, 8(SP)
|
||||
lfd f16, 16(SP)
|
||||
lfd f17, 24(SP)
|
||||
|
||||
lfd f18, 32(SP)
|
||||
lfd f19, 40(SP)
|
||||
lfd f20, 48(SP)
|
||||
lfd f21, 56(SP)
|
||||
|
||||
lfd f22, 64(SP)
|
||||
lfd f23, 72(SP)
|
||||
lfd f24, 80(SP)
|
||||
lfd f25, 88(SP)
|
||||
|
||||
lfd f26, 96(SP)
|
||||
lfd f27, 104(SP)
|
||||
lfd f28, 112(SP)
|
||||
lfd f29, 120(SP)
|
||||
|
||||
lfd f30, 128(SP)
|
||||
lfd f31, 136(SP)
|
||||
|
||||
ld r31, 144(SP)
|
||||
ld r30, 152(SP)
|
||||
ld r29, 160(SP)
|
||||
ld r28, 168(SP)
|
||||
ld r27, 176(SP)
|
||||
ld r26, 184(SP)
|
||||
ld r25, 192(SP)
|
||||
ld r24, 200(SP)
|
||||
ld r23, 208(SP)
|
||||
ld r22, 216(SP)
|
||||
ld r21, 224(SP)
|
||||
ld r20, 232(SP)
|
||||
ld r19, 240(SP)
|
||||
ld r18, 248(SP)
|
||||
ld r17, 256(SP)
|
||||
ld r16, 264(SP)
|
||||
ld r15, 272(SP)
|
||||
ld r14, 280(SP)
|
||||
|
||||
ld r0, FLINK_SAVE(SP)
|
||||
|
||||
lxv vs52, 288(SP)
|
||||
lxv vs53, 304(SP)
|
||||
lxv vs54, 320(SP)
|
||||
lxv vs55, 336(SP)
|
||||
lxv vs56, 352(SP)
|
||||
lxv vs57, 368(SP)
|
||||
lxv vs58, 384(SP)
|
||||
lxv vs59, 400(SP)
|
||||
mtlr r0
|
||||
lxv vs60, 416(SP)
|
||||
lxv vs61, 432(SP)
|
||||
lxv vs62, 448(SP)
|
||||
lxv vs63, 464(SP)
|
||||
|
||||
addi SP, SP, STACKSIZE
|
||||
blr
|
||||
|
||||
|
||||
EPILOGUE
|
||||
#endif
|
||||
2814
kernel/power/cgemm_logic_power10.S
Normal file
2814
kernel/power/cgemm_logic_power10.S
Normal file
File diff suppressed because it is too large
Load Diff
2131
kernel/power/cgemm_macros_power10.S
Normal file
2131
kernel/power/cgemm_macros_power10.S
Normal file
File diff suppressed because it is too large
Load Diff
@@ -23,7 +23,10 @@ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
*****************************************************************************/
|
||||
#if !defined(__VEC__) || !defined(__ALTIVEC__)
|
||||
#include "../arm/zgemv_n.c"
|
||||
#else
|
||||
|
||||
#include <stdlib.h>
|
||||
#include <stdio.h>
|
||||
@@ -591,4 +594,4 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i,
|
||||
|
||||
return (0);
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
@@ -23,7 +23,10 @@ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
*****************************************************************************/
|
||||
#if !defined(__VEC__) || !defined(__ALTIVEC__)
|
||||
#include "../arm/zgemv_t.c"
|
||||
#else
|
||||
|
||||
#include "common.h"
|
||||
|
||||
@@ -595,4 +598,4 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i,
|
||||
return (0);
|
||||
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
@@ -27,7 +27,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
#include "common.h"
|
||||
|
||||
#if defined(POWER8) || defined(POWER9)
|
||||
#if defined(POWER8) || defined(POWER9) || defined(POWER10)
|
||||
#if defined(__VEC__) || defined(__ALTIVEC__)
|
||||
|
||||
static void crot_kernel_8 (long n, float *x, float *y, float c, float s)
|
||||
{
|
||||
@@ -169,6 +170,7 @@ static void crot_kernel_8 (long n, float *x, float *y, float c, float s)
|
||||
}
|
||||
|
||||
#endif
|
||||
#endif
|
||||
|
||||
|
||||
int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s)
|
||||
@@ -183,7 +185,7 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT
|
||||
|
||||
if ( (inc_x == 1) && (inc_y == 1) )
|
||||
{
|
||||
|
||||
#if defined(__VEC__) || defined(__ALTIVEC__)
|
||||
BLASLONG n1 = n & -8;
|
||||
if ( n1 > 0 )
|
||||
{
|
||||
@@ -191,7 +193,7 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT
|
||||
i=n1;
|
||||
ix=2*n1;
|
||||
}
|
||||
|
||||
#endif
|
||||
while(i < n)
|
||||
{
|
||||
temp[0] = c*x[ix] + s*y[ix] ;
|
||||
|
||||
@@ -36,9 +36,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
#include "common.h"
|
||||
|
||||
|
||||
#if defined(POWER8) || defined(POWER9)
|
||||
#if defined(POWER8) || defined(POWER9) || defined(POWER10)
|
||||
#if defined(__VEC__) || defined(__ALTIVEC__)
|
||||
#include "cswap_microk_power8.c"
|
||||
#endif
|
||||
#endif
|
||||
|
||||
|
||||
#ifndef HAVE_KERNEL_32
|
||||
|
||||
@@ -46,9 +46,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
#endif
|
||||
|
||||
#if defined(POWER8) || defined(POWER9)
|
||||
#if defined(POWER8) || defined(POWER9) || defined(POWER10)
|
||||
#if defined(__VEC__) || defined(__ALTIVEC__)
|
||||
#include "dasum_microk_power8.c"
|
||||
#endif
|
||||
#endif
|
||||
|
||||
|
||||
#ifndef HAVE_KERNEL_16
|
||||
|
||||
@@ -36,9 +36,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
#include "common.h"
|
||||
|
||||
|
||||
#if defined(POWER8) || defined(POWER9)
|
||||
#if defined(POWER8) || defined(POWER9) || defined(POWER10)
|
||||
#if defined(__VEC__) || defined(__ALTIVEC__)
|
||||
#include "daxpy_microk_power8.c"
|
||||
#endif
|
||||
#endif
|
||||
|
||||
|
||||
#ifndef HAVE_KERNEL_8
|
||||
|
||||
131
kernel/power/daxpy_microk_power10.c
Normal file
131
kernel/power/daxpy_microk_power10.c
Normal file
@@ -0,0 +1,131 @@
|
||||
/***************************************************************************
|
||||
Copyright (c) 2020, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#define HAVE_KERNEL_8 1
|
||||
|
||||
static void daxpy_kernel_8 (long n, double *x, double *y, double alpha)
|
||||
{
|
||||
__vector double t0;
|
||||
|
||||
__asm__
|
||||
(
|
||||
XXSPLTD_S(%x4,%x6,0)
|
||||
|
||||
"dcbt 0, %2 \n\t"
|
||||
"dcbt 0, %3 \n\t"
|
||||
|
||||
"lxvp 32, 0(%2) \n\t"
|
||||
"lxvp 34, 32(%2) \n\t"
|
||||
"lxvp 40, 64(%2) \n\t"
|
||||
"lxvp 42, 96(%2) \n\t"
|
||||
|
||||
"lxvp 36, 0(%3) \n\t"
|
||||
"lxvp 38, 32(%3) \n\t"
|
||||
"lxvp 44, 64(%3) \n\t"
|
||||
"lxvp 46, 96(%3) \n\t"
|
||||
|
||||
"addi %2, %2, 128 \n\t"
|
||||
|
||||
"addic. %1, %1, -16 \n\t"
|
||||
"ble two%= \n\t"
|
||||
|
||||
".align 5 \n"
|
||||
"one%=: \n\t"
|
||||
|
||||
"xvmaddadp 36, 32, %x4 \n\t"
|
||||
"xvmaddadp 37, 33, %x4 \n\t"
|
||||
|
||||
"lxvp 32, 0(%2) \n\t"
|
||||
"stxvp 36, 0(%3) \n\t"
|
||||
|
||||
"xvmaddadp 38, 34, %x4 \n\t"
|
||||
"xvmaddadp 39, 35, %x4 \n\t"
|
||||
|
||||
"lxvp 34, 32(%2) \n\t"
|
||||
"stxvp 38, 32(%3) \n\t"
|
||||
|
||||
|
||||
"lxvp 36, 128(%3) \n\t"
|
||||
"lxvp 38, 160(%3) \n\t"
|
||||
|
||||
"xvmaddadp 44, 40, %x4 \n\t"
|
||||
"xvmaddadp 45, 41, %x4 \n\t"
|
||||
|
||||
"lxvp 40, 64(%2) \n\t"
|
||||
"stxvp 44, 64(%3) \n\t"
|
||||
|
||||
"xvmaddadp 46, 42, %x4 \n\t"
|
||||
"xvmaddadp 47, 43, %x4 \n\t"
|
||||
|
||||
"lxvp 42, 96(%2) \n\t"
|
||||
"stxvp 46, 96(%3) \n\t"
|
||||
|
||||
"addi %2, %2, 128 \n\t"
|
||||
"addi %3, %3, 128 \n\t"
|
||||
|
||||
"lxvp 44, 64(%3) \n\t"
|
||||
"lxvp 46, 96(%3) \n\t"
|
||||
|
||||
"addic. %1, %1, -16 \n\t"
|
||||
"bgt one%= \n"
|
||||
|
||||
"two%=: \n\t"
|
||||
|
||||
"xvmaddadp 36, 32, %x4 \n\t"
|
||||
"xvmaddadp 37, 33, %x4 \n\t"
|
||||
"xvmaddadp 38, 34, %x4 \n\t"
|
||||
"xvmaddadp 39, 35, %x4 \n\t"
|
||||
|
||||
"xvmaddadp 44, 40, %x4 \n\t"
|
||||
"xvmaddadp 45, 41, %x4 \n\t"
|
||||
"xvmaddadp 46, 42, %x4 \n\t"
|
||||
"xvmaddadp 47, 43, %x4 \n\t"
|
||||
|
||||
"stxvp 36, 0(%3) \n\t"
|
||||
"stxvp 38, 32(%3) \n\t"
|
||||
"stxvp 44, 64(%3) \n\t"
|
||||
"stxvp 46, 96(%3) \n\t"
|
||||
|
||||
"#n=%1 x=%5=%2 y=%0=%3 alpha=%6 t0=%x4\n"
|
||||
:
|
||||
"+m" (*y),
|
||||
"+r" (n), // 1
|
||||
"+b" (x), // 2
|
||||
"+b" (y), // 3
|
||||
"=wa" (t0) // 4
|
||||
:
|
||||
"m" (*x),
|
||||
"d" (alpha) // 6
|
||||
:
|
||||
"cr0",
|
||||
"vs32","vs33","vs34","vs35","vs36","vs37", "vs38", "vs39",
|
||||
"vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47"
|
||||
);
|
||||
|
||||
}
|
||||
|
||||
|
||||
121
kernel/power/daxpy_power10.c
Normal file
121
kernel/power/daxpy_power10.c
Normal file
@@ -0,0 +1,121 @@
|
||||
/***************************************************************************
|
||||
Copyright (c) 2020, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
|
||||
#if defined(__VEC__) || defined(__ALTIVEC__)
|
||||
#include "daxpy_microk_power10.c"
|
||||
#endif
|
||||
|
||||
|
||||
#ifndef HAVE_KERNEL_8
|
||||
|
||||
static void daxpy_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT alpha)
|
||||
{
|
||||
BLASLONG register i = 0;
|
||||
|
||||
while(i < n)
|
||||
{
|
||||
y[i] += alpha * x[i];
|
||||
y[i+1] += alpha * x[i+1];
|
||||
y[i+2] += alpha * x[i+2];
|
||||
y[i+3] += alpha * x[i+3];
|
||||
y[i+4] += alpha * x[i+4];
|
||||
y[i+5] += alpha * x[i+5];
|
||||
y[i+6] += alpha * x[i+6];
|
||||
y[i+7] += alpha * x[i+7];
|
||||
i+=8 ;
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
|
||||
{
|
||||
BLASLONG i=0;
|
||||
BLASLONG ix=0,iy=0;
|
||||
|
||||
if ( n <= 0 ) return(0);
|
||||
|
||||
if ( (inc_x == 1) && (inc_y == 1) )
|
||||
{
|
||||
|
||||
BLASLONG n1 = n & -16;
|
||||
|
||||
if ( n1 )
|
||||
daxpy_kernel_8(n1, x, y, da);
|
||||
|
||||
i = n1;
|
||||
while(i < n)
|
||||
{
|
||||
|
||||
y[i] += da * x[i] ;
|
||||
i++ ;
|
||||
|
||||
}
|
||||
return(0);
|
||||
|
||||
|
||||
}
|
||||
|
||||
BLASLONG n1 = n & -4;
|
||||
|
||||
while(i < n1)
|
||||
{
|
||||
|
||||
FLOAT m1 = da * x[ix] ;
|
||||
FLOAT m2 = da * x[ix+inc_x] ;
|
||||
FLOAT m3 = da * x[ix+2*inc_x] ;
|
||||
FLOAT m4 = da * x[ix+3*inc_x] ;
|
||||
|
||||
y[iy] += m1 ;
|
||||
y[iy+inc_y] += m2 ;
|
||||
y[iy+2*inc_y] += m3 ;
|
||||
y[iy+3*inc_y] += m4 ;
|
||||
|
||||
ix += inc_x*4 ;
|
||||
iy += inc_y*4 ;
|
||||
i+=4 ;
|
||||
|
||||
}
|
||||
|
||||
while(i < n)
|
||||
{
|
||||
|
||||
y[iy] += da * x[ix] ;
|
||||
ix += inc_x ;
|
||||
iy += inc_y ;
|
||||
i++ ;
|
||||
|
||||
}
|
||||
return(0);
|
||||
|
||||
}
|
||||
|
||||
|
||||
@@ -35,9 +35,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
#include "common.h"
|
||||
|
||||
#if defined(POWER8) || defined(POWER9)
|
||||
#if defined(POWER8) || defined(POWER9) || defined(POWER10)
|
||||
#if defined(__VEC__) || defined(__ALTIVEC__)
|
||||
#include "dcopy_microk_power8.c"
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#ifndef HAVE_KERNEL_32
|
||||
|
||||
|
||||
134
kernel/power/dcopy_microk_power10.c
Normal file
134
kernel/power/dcopy_microk_power10.c
Normal file
@@ -0,0 +1,134 @@
|
||||
/***************************************************************************
|
||||
Copyright (c) 2020, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#define HAVE_KERNEL_64 1
|
||||
|
||||
static void dcopy_kernel_64 (long n, double *x, double *y)
|
||||
{
|
||||
__asm__
|
||||
(
|
||||
"lxvp 32, 0(%2) \n\t"
|
||||
"lxvp 34, 32(%2) \n\t"
|
||||
"lxvp 36, 64(%2) \n\t"
|
||||
"lxvp 38, 96(%2) \n\t"
|
||||
"lxvp 40, 128(%2) \n\t"
|
||||
"lxvp 42, 160(%2) \n\t"
|
||||
"lxvp 44, 192(%2) \n\t"
|
||||
"lxvp 46, 224(%2) \n\t"
|
||||
|
||||
"lxvp 48, 256(%2) \n\t"
|
||||
"lxvp 50, 288(%2) \n\t"
|
||||
"lxvp 52, 320(%2) \n\t"
|
||||
"lxvp 54, 352(%2) \n\t"
|
||||
"lxvp 56, 384(%2) \n\t"
|
||||
"lxvp 58, 416(%2) \n\t"
|
||||
"lxvp 60, 448(%2) \n\t"
|
||||
"lxvp 62, 480(%2) \n\t"
|
||||
"addi %2, %2, 512 \n\t"
|
||||
|
||||
"addic. %1, %1, -64 \n\t"
|
||||
"ble two%= \n\t"
|
||||
|
||||
".align 5 \n"
|
||||
"one%=: \n\t"
|
||||
|
||||
"stxvp 32, 0(%3) \n\t"
|
||||
"lxvp 32, 0(%2) \n\t"
|
||||
"stxvp 34, 32(%3) \n\t"
|
||||
"lxvp 34, 32(%2) \n\t"
|
||||
"stxvp 36, 64(%3) \n\t"
|
||||
"lxvp 36, 64(%2) \n\t"
|
||||
"stxvp 38, 96(%3) \n\t"
|
||||
"lxvp 38, 96(%2) \n\t"
|
||||
|
||||
"stxvp 40, 128(%3) \n\t"
|
||||
"lxvp 40, 128(%2) \n\t"
|
||||
"stxvp 42, 160(%3) \n\t"
|
||||
"lxvp 42, 160(%2) \n\t"
|
||||
"stxvp 44, 192(%3) \n\t"
|
||||
"lxvp 44, 192(%2) \n\t"
|
||||
"stxvp 46, 224(%3) \n\t"
|
||||
"lxvp 46, 224(%2) \n\t"
|
||||
|
||||
"stxvp 48, 256(%3) \n\t"
|
||||
"lxvp 48, 256(%2) \n\t"
|
||||
"stxvp 50, 288(%3) \n\t"
|
||||
"lxvp 50, 288(%2) \n\t"
|
||||
"stxvp 52, 320(%3) \n\t"
|
||||
"lxvp 52, 320(%2) \n\t"
|
||||
"stxvp 54, 352(%3) \n\t"
|
||||
"lxvp 54, 352(%2) \n\t"
|
||||
"stxvp 56, 384(%3) \n\t"
|
||||
"lxvp 56, 384(%2) \n\t"
|
||||
"stxvp 58, 416(%3) \n\t"
|
||||
"lxvp 58, 416(%2) \n\t"
|
||||
"stxvp 60, 448(%3) \n\t"
|
||||
"lxvp 60, 448(%2) \n\t"
|
||||
"stxvp 62, 480(%3) \n\t"
|
||||
"lxvp 62, 480(%2) \n\t"
|
||||
|
||||
"addi %3, %3, 512 \n\t"
|
||||
"addi %2, %2, 512 \n\t"
|
||||
|
||||
"addic. %1, %1, -64 \n\t"
|
||||
"bgt one%= \n"
|
||||
|
||||
"two%=: \n\t"
|
||||
|
||||
"stxvp 32, 0(%3) \n\t"
|
||||
"stxvp 34, 32(%3) \n\t"
|
||||
"stxvp 36, 64(%3) \n\t"
|
||||
"stxvp 38, 96(%3) \n\t"
|
||||
"stxvp 40, 128(%3) \n\t"
|
||||
"stxvp 42, 160(%3) \n\t"
|
||||
"stxvp 44, 192(%3) \n\t"
|
||||
"stxvp 46, 224(%3) \n\t"
|
||||
"stxvp 48, 256(%3) \n\t"
|
||||
"stxvp 50, 288(%3) \n\t"
|
||||
"stxvp 52, 320(%3) \n\t"
|
||||
"stxvp 54, 352(%3) \n\t"
|
||||
"stxvp 56, 384(%3) \n\t"
|
||||
"stxvp 58, 416(%3) \n\t"
|
||||
"stxvp 60, 448(%3) \n\t"
|
||||
"stxvp 62, 480(%3) \n\t"
|
||||
|
||||
"#n=%1 x=%4=%2 y=%0=%3"
|
||||
:
|
||||
"=m" (*y),
|
||||
"+r" (n), // 1
|
||||
"+b" (x), // 2
|
||||
"+b" (y) // 3
|
||||
:
|
||||
"m" (*x)
|
||||
:
|
||||
"cr0",
|
||||
"vs32","vs33","vs34","vs35","vs36","vs37","vs38","vs39",
|
||||
"vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47",
|
||||
"vs48","vs49","vs50","vs51","vs52","vs53","vs54","vs55",
|
||||
"vs56","vs57","vs58","vs59","vs60","vs61","vs62","vs63"
|
||||
);
|
||||
}
|
||||
123
kernel/power/dcopy_power10.c
Normal file
123
kernel/power/dcopy_power10.c
Normal file
@@ -0,0 +1,123 @@
|
||||
/***************************************************************************
|
||||
Copyright (c) 2020, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
|
||||
#if defined(__VEC__) || defined(__ALTIVEC__)
|
||||
#include "dcopy_microk_power10.c"
|
||||
#endif
|
||||
|
||||
#ifndef HAVE_KERNEL_64
|
||||
|
||||
static void dcopy_kernel_64(BLASLONG n, FLOAT *x, FLOAT *y)
|
||||
{
|
||||
|
||||
BLASLONG i=0;
|
||||
FLOAT f0, f1, f2, f3, f4, f5, f6, f7;
|
||||
FLOAT *x1=x;
|
||||
FLOAT *y1=y;
|
||||
|
||||
while ( i<n )
|
||||
{
|
||||
|
||||
f0 = x1[0];
|
||||
f1 = x1[1];
|
||||
f2 = x1[2];
|
||||
f3 = x1[3];
|
||||
f4 = x1[4];
|
||||
f5 = x1[5];
|
||||
f6 = x1[6];
|
||||
f7 = x1[7];
|
||||
|
||||
y1[0] = f0;
|
||||
y1[1] = f1;
|
||||
y1[2] = f2;
|
||||
y1[3] = f3;
|
||||
y1[4] = f4;
|
||||
y1[5] = f5;
|
||||
y1[6] = f6;
|
||||
y1[7] = f7;
|
||||
|
||||
x1 += 8;
|
||||
y1 += 8;
|
||||
|
||||
i+=8;
|
||||
}
|
||||
return;
|
||||
|
||||
}
|
||||
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
|
||||
int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
|
||||
{
|
||||
BLASLONG i=0;
|
||||
BLASLONG ix=0,iy=0;
|
||||
|
||||
if ( n <= 0 ) return(0);
|
||||
|
||||
if ( (inc_x == 1) && (inc_y == 1 ))
|
||||
{
|
||||
|
||||
BLASLONG n1 = n & -64;
|
||||
if ( n1 > 0 )
|
||||
{
|
||||
dcopy_kernel_64(n1, x, y);
|
||||
i=n1;
|
||||
}
|
||||
|
||||
while(i < n)
|
||||
{
|
||||
y[i] = x[i] ;
|
||||
i++ ;
|
||||
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
else
|
||||
{
|
||||
|
||||
while(i < n)
|
||||
{
|
||||
y[iy] = x[ix] ;
|
||||
ix += inc_x ;
|
||||
iy += inc_y ;
|
||||
i++ ;
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
return(0);
|
||||
|
||||
|
||||
}
|
||||
|
||||
|
||||
@@ -36,9 +36,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
#include "common.h"
|
||||
|
||||
|
||||
#if defined(POWER8) || defined(POWER9)
|
||||
#if defined(POWER8) || defined(POWER9) || defined(POWER10)
|
||||
#if defined(__VEC__) || defined(__ALTIVEC__)
|
||||
#include "ddot_microk_power8.c"
|
||||
#endif
|
||||
#endif
|
||||
|
||||
|
||||
#ifndef HAVE_KERNEL_8
|
||||
|
||||
896
kernel/power/dgemm_kernel_power10.c
Normal file
896
kernel/power/dgemm_kernel_power10.c
Normal file
@@ -0,0 +1,896 @@
|
||||
/*********************************************************************************
|
||||
Copyright (c) 2020, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
**********************************************************************************/
|
||||
#include "common.h"
|
||||
#include <altivec.h>
|
||||
|
||||
typedef __vector unsigned char vec_t;
|
||||
typedef FLOAT v4sf_t __attribute__ ((vector_size (16)));
|
||||
typedef FLOAT v2sf_t __attribute__ ((vector_size (8)));
|
||||
|
||||
#ifdef TRMMKERNEL
|
||||
#define SAVE_ACC(ACC, J) \
|
||||
__builtin_mma_disassemble_acc ((void *)result, ACC); \
|
||||
rowC = (v4sf_t *) &CO[0* ldc+J]; \
|
||||
rowC[0] = result[0] * alpha; \
|
||||
rowC = (v4sf_t *) &CO[1*ldc+J]; \
|
||||
rowC[0] = result[1] * alpha; \
|
||||
rowC = (v4sf_t *) &CO[2*ldc+J]; \
|
||||
rowC[0] = result[2] * alpha; \
|
||||
rowC = (v4sf_t *) &CO[3*ldc+J]; \
|
||||
rowC[0] = result[3] * alpha;
|
||||
#define SAVE_ACC1(ACC, J) \
|
||||
__builtin_mma_disassemble_acc ((void *)result, ACC); \
|
||||
rowC = (v4sf_t *) &CO[4* ldc+J]; \
|
||||
rowC[0] = result[0] * alpha; \
|
||||
rowC = (v4sf_t *) &CO[5*ldc+J]; \
|
||||
rowC[0] = result[1] * alpha; \
|
||||
rowC = (v4sf_t *) &CO[6*ldc+J]; \
|
||||
rowC[0] = result[2] * alpha; \
|
||||
rowC = (v4sf_t *) &CO[7*ldc+J]; \
|
||||
rowC[0] = result[3] * alpha;
|
||||
#define SAVE2x4_ACC(ACC, J) \
|
||||
__builtin_mma_disassemble_acc ((void *)result, ACC); \
|
||||
rowC = (v4sf_t *) &CO[0* ldc+J]; \
|
||||
rowC[0] = result[0] * alpha; \
|
||||
rowC = (v4sf_t *) &CO[1* ldc+J]; \
|
||||
rowC[0] = result[1] * alpha;
|
||||
#else
|
||||
#define SAVE_ACC(ACC, J) \
|
||||
__builtin_mma_disassemble_acc ((void *)result, ACC); \
|
||||
rowC = (v4sf_t *) &CO[0* ldc+J]; \
|
||||
rowC[0] += result[0] * alpha; \
|
||||
rowC = (v4sf_t *) &CO[1*ldc+J]; \
|
||||
rowC[0] += result[1] * alpha; \
|
||||
rowC = (v4sf_t *) &CO[2*ldc+J]; \
|
||||
rowC[0] += result[2] * alpha; \
|
||||
rowC = (v4sf_t *) &CO[3*ldc+J]; \
|
||||
rowC[0] += result[3] * alpha;
|
||||
#define SAVE_ACC1(ACC, J) \
|
||||
__builtin_mma_disassemble_acc ((void *)result, ACC); \
|
||||
rowC = (v4sf_t *) &CO[4* ldc+J]; \
|
||||
rowC[0] += result[0] * alpha; \
|
||||
rowC = (v4sf_t *) &CO[5*ldc+J]; \
|
||||
rowC[0] += result[1] * alpha; \
|
||||
rowC = (v4sf_t *) &CO[6*ldc+J]; \
|
||||
rowC[0] += result[2] * alpha; \
|
||||
rowC = (v4sf_t *) &CO[7*ldc+J]; \
|
||||
rowC[0] += result[3] * alpha;
|
||||
#define SAVE2x4_ACC(ACC, J) \
|
||||
__builtin_mma_disassemble_acc ((void *)result, ACC); \
|
||||
rowC = (v4sf_t *) &CO[0* ldc+J]; \
|
||||
rowC[0] += result[0] * alpha; \
|
||||
rowC = (v4sf_t *) &CO[1* ldc+J]; \
|
||||
rowC[0] += result[1] * alpha;
|
||||
#endif
|
||||
|
||||
#define PREFETCH1(x, y) asm volatile ("dcbt %0, %1" : : "r" (x), "b" (y) : "memory");
|
||||
|
||||
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
|
||||
#define REFRESH_TEMP_BK(x, y) \
|
||||
temp = k - off;
|
||||
#elif defined(LEFT)
|
||||
#define REFRESH_TEMP_BK(x, y) \
|
||||
temp = off + x;
|
||||
#else
|
||||
#define REFRESH_TEMP_BK(x, y) \
|
||||
temp = off + y;
|
||||
#endif
|
||||
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
|
||||
#define REFRESH_POINTERS(x, y) \
|
||||
BO = B; \
|
||||
REFRESH_TEMP_BK(x, y)
|
||||
#else
|
||||
#define REFRESH_POINTERS(x, y) \
|
||||
AO += off * x; \
|
||||
BO = B + off * y; \
|
||||
REFRESH_TEMP_BK(x, y)
|
||||
#endif
|
||||
|
||||
#ifdef LEFT
|
||||
#define REFRESH_OFF(x) \
|
||||
off += x;
|
||||
#else
|
||||
#define REFRESH_OFF(x)
|
||||
#endif
|
||||
|
||||
#ifdef LEFT
|
||||
#define UPDATE_TEMP(x, y) \
|
||||
temp -= x;
|
||||
#else
|
||||
#define UPDATE_TEMP(x, y) \
|
||||
temp -= y;
|
||||
#endif
|
||||
|
||||
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
|
||||
#define REFRESH_TMP_AFTER_SAVE(x, y) \
|
||||
temp = k - off; \
|
||||
UPDATE_TEMP(x, y) \
|
||||
AO += temp * x; \
|
||||
BO += temp * y;
|
||||
#else
|
||||
#define REFRESH_TMP_AFTER_SAVE(x, y)
|
||||
#endif
|
||||
|
||||
#define REFRESH_AFTER_SAVE(x,y) \
|
||||
REFRESH_TMP_AFTER_SAVE(x, y) \
|
||||
REFRESH_OFF(x)
|
||||
/*************************************************************************************
|
||||
* GEMM Kernel
|
||||
*************************************************************************************/
|
||||
int
|
||||
CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
|
||||
FLOAT * C, BLASLONG ldc
|
||||
#ifdef TRMMKERNEL
|
||||
, BLASLONG offset
|
||||
#endif
|
||||
)
|
||||
{
|
||||
BLASLONG N = n;
|
||||
BLASLONG i1;
|
||||
#if defined(TRMMKERNEL)
|
||||
BLASLONG off;
|
||||
#endif
|
||||
#if defined(TRMMKERNEL) && !defined(LEFT)
|
||||
off = -offset;
|
||||
#endif
|
||||
v4sf_t valpha = { alpha, alpha };
|
||||
N = n >> 2;
|
||||
for (i1 = 0; i1 < N; i1++)
|
||||
{
|
||||
BLASLONG i, j, temp;
|
||||
FLOAT *CO;
|
||||
FLOAT *AO;
|
||||
#if defined(TRMMKERNEL) && defined(LEFT)
|
||||
off = offset;
|
||||
#endif
|
||||
CO = C;
|
||||
C += ldc << 2;
|
||||
AO = A;
|
||||
PREFETCH1 (A, 128);
|
||||
PREFETCH1 (A, 256);
|
||||
i = m >> 4;
|
||||
for (j = 0; j < i; j++)
|
||||
{
|
||||
FLOAT *BO;
|
||||
#if defined(TRMMKERNEL)
|
||||
REFRESH_POINTERS (16, 4);
|
||||
#else
|
||||
BO = B;
|
||||
temp = k;
|
||||
#endif
|
||||
v4sf_t *rowC;
|
||||
v4sf_t result[4];
|
||||
BLASLONG l = 0;
|
||||
PREFETCH1 (CO, 0);
|
||||
PREFETCH1 (CO + ldc, 0);
|
||||
PREFETCH1 (CO + ldc + ldc, 0);
|
||||
PREFETCH1 (CO + ldc + ldc + ldc, 0);
|
||||
PREFETCH1 (CO, 128);
|
||||
PREFETCH1 (CO + ldc, 128);
|
||||
PREFETCH1 (CO + ldc + ldc, 128);
|
||||
PREFETCH1 (CO + ldc + ldc + ldc, 128);
|
||||
__vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7;
|
||||
vec_t *rowA = (vec_t *) & AO[0];
|
||||
__vector_pair rowB;
|
||||
vec_t *rb = (vec_t *) & BO[0];
|
||||
__builtin_mma_assemble_pair (&rowB, rb[1], rb[0]);
|
||||
__builtin_mma_xvf64ger (&acc0, rowB, rowA[0]);
|
||||
__builtin_mma_xvf64ger (&acc1, rowB, rowA[1]);
|
||||
__builtin_mma_xvf64ger (&acc2, rowB, rowA[2]);
|
||||
__builtin_mma_xvf64ger (&acc3, rowB, rowA[3]);
|
||||
__builtin_mma_xvf64ger (&acc4, rowB, rowA[4]);
|
||||
__builtin_mma_xvf64ger (&acc5, rowB, rowA[5]);
|
||||
__builtin_mma_xvf64ger (&acc6, rowB, rowA[6]);
|
||||
__builtin_mma_xvf64ger (&acc7, rowB, rowA[7]);
|
||||
for (l = 1; l < temp; l++)
|
||||
{
|
||||
rowA = (vec_t *) & AO[l << 4];
|
||||
rb = (vec_t *) & BO[l << 2];
|
||||
__builtin_mma_assemble_pair (&rowB, rb[1], rb[0]);
|
||||
__builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]);
|
||||
__builtin_mma_xvf64gerpp (&acc1, rowB, rowA[1]);
|
||||
__builtin_mma_xvf64gerpp (&acc2, rowB, rowA[2]);
|
||||
__builtin_mma_xvf64gerpp (&acc3, rowB, rowA[3]);
|
||||
__builtin_mma_xvf64gerpp (&acc4, rowB, rowA[4]);
|
||||
__builtin_mma_xvf64gerpp (&acc5, rowB, rowA[5]);
|
||||
__builtin_mma_xvf64gerpp (&acc6, rowB, rowA[6]);
|
||||
__builtin_mma_xvf64gerpp (&acc7, rowB, rowA[7]);
|
||||
}
|
||||
SAVE_ACC (&acc0, 0);
|
||||
SAVE_ACC (&acc2, 4);
|
||||
SAVE_ACC (&acc1, 2);
|
||||
SAVE_ACC (&acc3, 6);
|
||||
SAVE_ACC (&acc4, 8);
|
||||
SAVE_ACC (&acc6, 12);
|
||||
SAVE_ACC (&acc5, 10);
|
||||
SAVE_ACC (&acc7, 14);
|
||||
AO += temp << 4;
|
||||
BO += temp << 2;
|
||||
#if defined(TRMMKERNEL)
|
||||
REFRESH_AFTER_SAVE (16, 4)
|
||||
#endif
|
||||
CO += 16;
|
||||
}
|
||||
i = (m & 15) >> 3;
|
||||
for (j = 0; j < i; j++)
|
||||
{
|
||||
FLOAT *BO;
|
||||
#if defined(TRMMKERNEL)
|
||||
REFRESH_POINTERS (8, 4);
|
||||
#else
|
||||
BO = B;
|
||||
temp = k;
|
||||
#endif
|
||||
v4sf_t *rowC;
|
||||
v4sf_t result[4];
|
||||
__vector_quad acc0, acc1, acc2, acc3;
|
||||
BLASLONG l = 0;
|
||||
vec_t *rowA = (vec_t *) & AO[0];
|
||||
__vector_pair rowB;
|
||||
vec_t *rb = (vec_t *) & BO[0];
|
||||
__builtin_mma_assemble_pair (&rowB, rb[1], rb[0]);
|
||||
__builtin_mma_xvf64ger (&acc0, rowB, rowA[0]);
|
||||
__builtin_mma_xvf64ger (&acc1, rowB, rowA[1]);
|
||||
__builtin_mma_xvf64ger (&acc2, rowB, rowA[2]);
|
||||
__builtin_mma_xvf64ger (&acc3, rowB, rowA[3]);
|
||||
for (l = 1; l < temp; l++)
|
||||
{
|
||||
rowA = (vec_t *) & AO[l << 3];
|
||||
rb = (vec_t *) & BO[l << 2];
|
||||
__builtin_mma_assemble_pair (&rowB, rb[1], rb[0]);
|
||||
__builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]);
|
||||
__builtin_mma_xvf64gerpp (&acc1, rowB, rowA[1]);
|
||||
__builtin_mma_xvf64gerpp (&acc2, rowB, rowA[2]);
|
||||
__builtin_mma_xvf64gerpp (&acc3, rowB, rowA[3]);
|
||||
}
|
||||
SAVE_ACC (&acc0, 0);
|
||||
SAVE_ACC (&acc2, 4);
|
||||
SAVE_ACC (&acc1, 2);
|
||||
SAVE_ACC (&acc3, 6);
|
||||
CO += 8;
|
||||
AO += temp << 3;
|
||||
BO += temp << 2;
|
||||
#if defined(TRMMKERNEL)
|
||||
REFRESH_AFTER_SAVE (8, 4)
|
||||
#endif
|
||||
}
|
||||
i = (m & 7) >> 2;
|
||||
for (j = 0; j < i; j++)
|
||||
{
|
||||
FLOAT *BO;
|
||||
#if defined(TRMMKERNEL)
|
||||
REFRESH_POINTERS (4, 4);
|
||||
#else
|
||||
BO = B;
|
||||
temp = k;
|
||||
#endif
|
||||
v4sf_t *rowC;
|
||||
v4sf_t result[4];
|
||||
__vector_quad acc0, acc1;
|
||||
BLASLONG l = 0;
|
||||
vec_t *rowA = (vec_t *) & AO[0];
|
||||
__vector_pair rowB;
|
||||
vec_t *rb = (vec_t *) & BO[0];
|
||||
__builtin_mma_assemble_pair (&rowB, rb[1], rb[0]);
|
||||
__builtin_mma_xvf64ger (&acc0, rowB, rowA[0]);
|
||||
__builtin_mma_xvf64ger (&acc1, rowB, rowA[1]);
|
||||
for (l = 1; l < temp; l++)
|
||||
{
|
||||
rowA = (vec_t *) & AO[l << 2];
|
||||
rb = (vec_t *) & BO[l << 2];
|
||||
__builtin_mma_assemble_pair (&rowB, rb[1], rb[0]);
|
||||
__builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]);
|
||||
__builtin_mma_xvf64gerpp (&acc1, rowB, rowA[1]);
|
||||
}
|
||||
SAVE_ACC (&acc0, 0);
|
||||
SAVE_ACC (&acc1, 2);
|
||||
CO += 4;
|
||||
AO += temp << 2;
|
||||
BO += temp << 2;
|
||||
#if defined(TRMMKERNEL)
|
||||
REFRESH_AFTER_SAVE (4, 4)
|
||||
#endif
|
||||
}
|
||||
i = (m & 3) >> 1;
|
||||
for (j = 0; j < i; j++)
|
||||
{
|
||||
FLOAT *BO;
|
||||
#if defined(TRMMKERNEL)
|
||||
REFRESH_POINTERS (2, 4);
|
||||
#else
|
||||
BO = B;
|
||||
temp = k;
|
||||
#endif
|
||||
v4sf_t *rowC;
|
||||
v4sf_t result[4];
|
||||
__vector_quad acc0;
|
||||
BLASLONG l = 0;
|
||||
vec_t *rowA = (vec_t *) & AO[0];
|
||||
__vector_pair rowB;
|
||||
vec_t *rb = (vec_t *) & BO[0];
|
||||
__builtin_mma_assemble_pair (&rowB, rb[1], rb[0]);
|
||||
__builtin_mma_xvf64ger (&acc0, rowB, rowA[0]);
|
||||
for (l = 1; l < temp; l++)
|
||||
{
|
||||
rowA = (vec_t *) & AO[l << 1];
|
||||
rb = (vec_t *) & BO[l << 2];
|
||||
__builtin_mma_assemble_pair (&rowB, rb[1], rb[0]);
|
||||
__builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]);
|
||||
}
|
||||
SAVE_ACC (&acc0, 0);
|
||||
CO += 2;
|
||||
AO += temp << 1;
|
||||
BO += temp << 2;
|
||||
#if defined(TRMMKERNEL)
|
||||
REFRESH_AFTER_SAVE (2, 4)
|
||||
#endif
|
||||
}
|
||||
i = (m & 1) >> 0;
|
||||
for (j = 0; j < i; j++)
|
||||
{
|
||||
FLOAT *BO;
|
||||
#if defined(TRMMKERNEL)
|
||||
REFRESH_POINTERS (1, 4);
|
||||
#else
|
||||
BO = B;
|
||||
temp = k;
|
||||
#endif
|
||||
BLASLONG l = 0;
|
||||
v4sf_t t = { 0, 0 };
|
||||
v4sf_t t1 = { 0, 0 };
|
||||
for (l = 0; l < temp; l++)
|
||||
{
|
||||
v4sf_t rowA = { AO[l], AO[l] };
|
||||
v4sf_t rowB = { BO[l << 2], BO[(l << 2) + 1] };
|
||||
v4sf_t rowB1 = { BO[(l << 2) + 2], BO[(l << 2) + 3] };
|
||||
t += rowA * rowB;
|
||||
t1 += rowA * rowB1;
|
||||
}
|
||||
t = t * valpha;
|
||||
t1 = t1 * valpha;
|
||||
#if defined(TRMMKERNEL)
|
||||
CO[0 * ldc] = t[0];
|
||||
CO[1 * ldc] = t[1];
|
||||
CO[2 * ldc] = t1[0];
|
||||
CO[3 * ldc] = t1[1];
|
||||
#else
|
||||
CO[0 * ldc] += t[0];
|
||||
CO[1 * ldc] += t[1];
|
||||
CO[2 * ldc] += t1[0];
|
||||
CO[3 * ldc] += t1[1];
|
||||
#endif
|
||||
CO += 1;
|
||||
AO += temp;
|
||||
BO += temp << 2;
|
||||
#if defined(TRMMKERNEL)
|
||||
REFRESH_AFTER_SAVE (1, 4)
|
||||
#endif
|
||||
}
|
||||
#if defined(TRMMKERNEL) && !defined(LEFT)
|
||||
off += 4; // number of values in A
|
||||
#endif
|
||||
B += k << 2;
|
||||
}
|
||||
N = (n & 3) >> 1;
|
||||
for (i1 = 0; i1 < N; i1++)
|
||||
{
|
||||
BLASLONG i, j, temp;
|
||||
#if defined(TRMMKERNEL) && defined(LEFT)
|
||||
off = offset;
|
||||
#endif
|
||||
FLOAT *CO;
|
||||
FLOAT *AO;
|
||||
CO = C;
|
||||
C += ldc << 1;
|
||||
AO = A;
|
||||
i = m >> 4;
|
||||
for (j = 0; j < i; j++)
|
||||
{
|
||||
FLOAT *BO;
|
||||
#if defined(TRMMKERNEL)
|
||||
REFRESH_POINTERS (16, 2);
|
||||
#else
|
||||
BO = B;
|
||||
temp = k;
|
||||
#endif
|
||||
v4sf_t *rowC;
|
||||
v4sf_t result[4];
|
||||
__vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7;
|
||||
BLASLONG l = 0;
|
||||
FLOAT t[4] = { 0, 0, 0, 0 };
|
||||
t[0] = BO[0], t[1] = BO[1];
|
||||
__vector_pair rowB;
|
||||
vec_t *rb = (vec_t *) & t[0];
|
||||
__builtin_mma_assemble_pair (&rowB, rb[1], rb[0]);
|
||||
vec_t *rowA = (vec_t *) & AO[0];
|
||||
__builtin_mma_xvf64ger (&acc0, rowB, rowA[0]);
|
||||
__builtin_mma_xvf64ger (&acc1, rowB, rowA[1]);
|
||||
__builtin_mma_xvf64ger (&acc2, rowB, rowA[2]);
|
||||
__builtin_mma_xvf64ger (&acc3, rowB, rowA[3]);
|
||||
__builtin_mma_xvf64ger (&acc4, rowB, rowA[4]);
|
||||
__builtin_mma_xvf64ger (&acc5, rowB, rowA[5]);
|
||||
__builtin_mma_xvf64ger (&acc6, rowB, rowA[6]);
|
||||
__builtin_mma_xvf64ger (&acc7, rowB, rowA[7]);
|
||||
for (l = 1; l < temp; l++)
|
||||
{
|
||||
t[0] = BO[l << 1], t[1] = BO[(l << 1) + 1];
|
||||
rb = (vec_t *) & t[0];
|
||||
__builtin_mma_assemble_pair (&rowB, rb[1], rb[0]);
|
||||
rowA = (vec_t *) & AO[l << 4];
|
||||
__builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]);
|
||||
__builtin_mma_xvf64gerpp (&acc1, rowB, rowA[1]);
|
||||
__builtin_mma_xvf64gerpp (&acc2, rowB, rowA[2]);
|
||||
__builtin_mma_xvf64gerpp (&acc3, rowB, rowA[3]);
|
||||
__builtin_mma_xvf64gerpp (&acc4, rowB, rowA[4]);
|
||||
__builtin_mma_xvf64gerpp (&acc5, rowB, rowA[5]);
|
||||
__builtin_mma_xvf64gerpp (&acc6, rowB, rowA[6]);
|
||||
__builtin_mma_xvf64gerpp (&acc7, rowB, rowA[7]);
|
||||
}
|
||||
SAVE2x4_ACC (&acc0, 0);
|
||||
SAVE2x4_ACC (&acc1, 2);
|
||||
SAVE2x4_ACC (&acc2, 4);
|
||||
SAVE2x4_ACC (&acc3, 6);
|
||||
SAVE2x4_ACC (&acc4, 8);
|
||||
SAVE2x4_ACC (&acc5, 10);
|
||||
SAVE2x4_ACC (&acc6, 12);
|
||||
SAVE2x4_ACC (&acc7, 14);
|
||||
CO += 16;
|
||||
AO += temp << 4;
|
||||
BO += temp << 1;
|
||||
#if defined(TRMMKERNEL)
|
||||
REFRESH_AFTER_SAVE (16, 2)
|
||||
#endif
|
||||
}
|
||||
i = (m & 15) >> 3;
|
||||
for (j = 0; j < i; j++)
|
||||
{
|
||||
FLOAT *BO;
|
||||
#if defined(TRMMKERNEL)
|
||||
REFRESH_POINTERS (8, 2);
|
||||
#else
|
||||
BO = B;
|
||||
temp = k;
|
||||
#endif
|
||||
v4sf_t *rowC;
|
||||
v4sf_t result[4];
|
||||
__vector_quad acc0, acc1, acc2, acc3;
|
||||
BLASLONG l = 0;
|
||||
FLOAT t[4] = { 0, 0, 0, 0 };
|
||||
t[0] = BO[0], t[1] = BO[1];
|
||||
__vector_pair rowB;
|
||||
vec_t *rb = (vec_t *) & t[0];
|
||||
__builtin_mma_assemble_pair (&rowB, rb[1], rb[0]);
|
||||
vec_t *rowA = (vec_t *) & AO[0];
|
||||
__builtin_mma_xvf64ger (&acc0, rowB, rowA[0]);
|
||||
__builtin_mma_xvf64ger (&acc1, rowB, rowA[1]);
|
||||
__builtin_mma_xvf64ger (&acc2, rowB, rowA[2]);
|
||||
__builtin_mma_xvf64ger (&acc3, rowB, rowA[3]);
|
||||
for (l = 1; l < temp; l++)
|
||||
{
|
||||
t[0] = BO[l << 1], t[1] = BO[(l << 1) + 1];
|
||||
rb = (vec_t *) & t[0];
|
||||
__builtin_mma_assemble_pair (&rowB, rb[1], rb[0]);
|
||||
rowA = (vec_t *) & AO[l << 3];
|
||||
__builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]);
|
||||
__builtin_mma_xvf64gerpp (&acc1, rowB, rowA[1]);
|
||||
__builtin_mma_xvf64gerpp (&acc2, rowB, rowA[2]);
|
||||
__builtin_mma_xvf64gerpp (&acc3, rowB, rowA[3]);
|
||||
}
|
||||
SAVE2x4_ACC (&acc0, 0);
|
||||
SAVE2x4_ACC (&acc1, 2);
|
||||
SAVE2x4_ACC (&acc2, 4);
|
||||
SAVE2x4_ACC (&acc3, 6);
|
||||
CO += 8;
|
||||
AO += temp << 3;
|
||||
BO += temp << 1;
|
||||
#if defined(TRMMKERNEL)
|
||||
REFRESH_AFTER_SAVE (8, 2)
|
||||
#endif
|
||||
}
|
||||
i = (m & 7) >> 2;
|
||||
for (j = 0; j < i; j++)
|
||||
{
|
||||
FLOAT *BO;
|
||||
#if defined(TRMMKERNEL)
|
||||
REFRESH_POINTERS (4, 2);
|
||||
#else
|
||||
BO = B;
|
||||
temp = k;
|
||||
#endif
|
||||
v4sf_t *rowC;
|
||||
v4sf_t result[4];
|
||||
__vector_quad acc0, acc1;
|
||||
BLASLONG l = 0;
|
||||
FLOAT t[4] = { 0, 0, 0, 0 };
|
||||
t[0] = BO[0], t[1] = BO[1];
|
||||
__vector_pair rowB;
|
||||
vec_t *rb = (vec_t *) & t[0];
|
||||
__builtin_mma_assemble_pair (&rowB, rb[1], rb[0]);
|
||||
vec_t *rowA = (vec_t *) & AO[0];
|
||||
__builtin_mma_xvf64ger (&acc0, rowB, rowA[0]);
|
||||
__builtin_mma_xvf64ger (&acc1, rowB, rowA[1]);
|
||||
for (l = 1; l < temp; l++)
|
||||
{
|
||||
t[0] = BO[l << 1], t[1] = BO[(l << 1) + 1];
|
||||
rb = (vec_t *) & t[0];
|
||||
__builtin_mma_assemble_pair (&rowB, rb[1], rb[0]);
|
||||
rowA = (vec_t *) & AO[l << 2];
|
||||
__builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]);
|
||||
__builtin_mma_xvf64gerpp (&acc1, rowB, rowA[1]);
|
||||
}
|
||||
SAVE2x4_ACC (&acc0, 0);
|
||||
SAVE2x4_ACC (&acc1, 2);
|
||||
CO += 4;
|
||||
AO += temp << 2;
|
||||
BO += temp << 1;
|
||||
#if defined(TRMMKERNEL)
|
||||
REFRESH_AFTER_SAVE (4, 2)
|
||||
#endif
|
||||
}
|
||||
i = (m & 3) >> 1;
|
||||
for (j = 0; j < i; j++)
|
||||
{
|
||||
FLOAT *BO;
|
||||
#if defined(TRMMKERNEL)
|
||||
REFRESH_POINTERS (2, 2);
|
||||
#else
|
||||
BO = B;
|
||||
temp = k;
|
||||
#endif
|
||||
v4sf_t *rowC;
|
||||
v4sf_t result[4];
|
||||
__vector_quad acc0;
|
||||
BLASLONG l = 0;
|
||||
FLOAT t[4] = { 0, 0, 0, 0 };
|
||||
t[0] = BO[0], t[1] = BO[1];
|
||||
__vector_pair rowB;
|
||||
vec_t *rb = (vec_t *) & t[0];
|
||||
__builtin_mma_assemble_pair (&rowB, rb[1], rb[0]);
|
||||
vec_t *rowA = (vec_t *) & AO[0];
|
||||
__builtin_mma_xvf64ger (&acc0, rowB, rowA[0]);
|
||||
for (l = 1; l < temp; l++)
|
||||
{
|
||||
t[0] = BO[l << 1], t[1] = BO[(l << 1) + 1];
|
||||
rb = (vec_t *) & t[0];
|
||||
__builtin_mma_assemble_pair (&rowB, rb[1], rb[0]);
|
||||
rowA = (vec_t *) & AO[l << 1];
|
||||
__builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]);
|
||||
}
|
||||
SAVE2x4_ACC (&acc0, 0);
|
||||
CO += 2;
|
||||
AO += temp << 1;
|
||||
BO += temp << 1;
|
||||
#if defined(TRMMKERNEL)
|
||||
REFRESH_AFTER_SAVE (2, 2)
|
||||
#endif
|
||||
}
|
||||
i = (m & 1) >> 0;
|
||||
for (j = 0; j < i; j++)
|
||||
{
|
||||
FLOAT *BO;
|
||||
#if defined(TRMMKERNEL)
|
||||
REFRESH_POINTERS (1, 2);
|
||||
#else
|
||||
BO = B;
|
||||
temp = k;
|
||||
#endif
|
||||
BLASLONG l = 0;
|
||||
v4sf_t t = { 0, 0 };
|
||||
for (l = 0; l < temp; l++)
|
||||
{
|
||||
v4sf_t rowA = { AO[l], AO[l] };
|
||||
v4sf_t rowB = { BO[l << 1], BO[(l << 1) + 1] };
|
||||
t += rowA * rowB;
|
||||
}
|
||||
t = t * valpha;
|
||||
#if defined(TRMMKERNEL)
|
||||
CO[0 * ldc] = t[0];
|
||||
CO[1 * ldc] = t[1];
|
||||
#else
|
||||
CO[0 * ldc] += t[0];
|
||||
CO[1 * ldc] += t[1];
|
||||
#endif
|
||||
CO += 1;
|
||||
AO += temp;
|
||||
BO += temp << 1;
|
||||
#if defined(TRMMKERNEL)
|
||||
REFRESH_AFTER_SAVE (1, 2)
|
||||
#endif
|
||||
}
|
||||
#if defined(TRMMKERNEL) && !defined(LEFT)
|
||||
off += 2; // number of values in A
|
||||
#endif
|
||||
B += k << 1;
|
||||
}
|
||||
N = (n & 1) >> 0;
|
||||
for (i1 = 0; i1 < N; i1++)
|
||||
{
|
||||
BLASLONG i, temp;
|
||||
#if defined(TRMMKERNEL) && defined(LEFT)
|
||||
off = offset;
|
||||
#endif
|
||||
FLOAT *CO;
|
||||
FLOAT *AO;
|
||||
CO = C;
|
||||
C += ldc;
|
||||
AO = A;
|
||||
i = m;
|
||||
while (i >= 16)
|
||||
{
|
||||
FLOAT *BO;
|
||||
#if defined(TRMMKERNEL)
|
||||
REFRESH_POINTERS (16, 1)
|
||||
#else
|
||||
BO = B;
|
||||
temp = k;
|
||||
#endif
|
||||
BLASLONG l = 0;
|
||||
v4sf_t t = { 0, 0 };
|
||||
v4sf_t t1 = { 0, 0 };
|
||||
v4sf_t t2 = { 0, 0 };
|
||||
v4sf_t t3 = { 0, 0 };
|
||||
v4sf_t t4 = { 0, 0 };
|
||||
v4sf_t t5 = { 0, 0 };
|
||||
v4sf_t t6 = { 0, 0 };
|
||||
v4sf_t t7 = { 0, 0 };
|
||||
for (l = 0; l < temp; l++)
|
||||
{
|
||||
v4sf_t rowB = { BO[l], BO[l] };
|
||||
v4sf_t rowA = { AO[l << 4], AO[(l << 4) + 1] };
|
||||
v4sf_t rowA1 = { AO[(l << 4) + 2], AO[(l << 4) + 3] };
|
||||
v4sf_t rowA2 = { AO[(l << 4) + 4], AO[(l << 4) + 5] };
|
||||
v4sf_t rowA3 = { AO[(l << 4) + 6], AO[(l << 4) + 7] };
|
||||
v4sf_t rowA4 = { AO[(l << 4) + 8], AO[(l << 4) + 9] };
|
||||
v4sf_t rowA5 = { AO[(l << 4) + 10], AO[(l << 4) + 11] };
|
||||
v4sf_t rowA6 = { AO[(l << 4) + 12], AO[(l << 4) + 13] };
|
||||
v4sf_t rowA7 = { AO[(l << 4) + 14], AO[(l << 4) + 15] };
|
||||
t += rowA * rowB;
|
||||
t1 += rowA1 * rowB;
|
||||
t2 += rowA2 * rowB;
|
||||
t3 += rowA3 * rowB;
|
||||
t4 += rowA4 * rowB;
|
||||
t5 += rowA5 * rowB;
|
||||
t6 += rowA6 * rowB;
|
||||
t7 += rowA7 * rowB;
|
||||
}
|
||||
t = t * valpha;
|
||||
t1 = t1 * valpha;
|
||||
t2 = t2 * valpha;
|
||||
t3 = t3 * valpha;
|
||||
t4 = t4 * valpha;
|
||||
t5 = t5 * valpha;
|
||||
t6 = t6 * valpha;
|
||||
t7 = t7 * valpha;
|
||||
#if defined(TRMMKERNEL)
|
||||
CO[0] = t[0];
|
||||
CO[1] = t[1];
|
||||
CO[2] = t1[0];
|
||||
CO[3] = t1[1];
|
||||
CO[4] = t2[0];
|
||||
CO[5] = t2[1];
|
||||
CO[6] = t3[0];
|
||||
CO[7] = t3[1];
|
||||
CO[8] = t4[0];
|
||||
CO[9] = t4[1];
|
||||
CO[10] = t5[0];
|
||||
CO[11] = t5[1];
|
||||
CO[12] = t6[0];
|
||||
CO[13] = t6[1];
|
||||
CO[14] = t7[0];
|
||||
CO[15] = t7[1];
|
||||
#else
|
||||
CO[0] += t[0];
|
||||
CO[1] += t[1];
|
||||
CO[2] += t1[0];
|
||||
CO[3] += t1[1];
|
||||
CO[4] += t2[0];
|
||||
CO[5] += t2[1];
|
||||
CO[6] += t3[0];
|
||||
CO[7] += t3[1];
|
||||
CO[8] += t4[0];
|
||||
CO[9] += t4[1];
|
||||
CO[10] += t5[0];
|
||||
CO[11] += t5[1];
|
||||
CO[12] += t6[0];
|
||||
CO[13] += t6[1];
|
||||
CO[14] += t7[0];
|
||||
CO[15] += t7[1];
|
||||
#endif
|
||||
AO += temp << 4;
|
||||
BO += temp;
|
||||
CO += 16;
|
||||
i -= 16;
|
||||
#if defined(TRMMKERNEL)
|
||||
REFRESH_AFTER_SAVE (16, 1)
|
||||
#endif
|
||||
}
|
||||
while (i >= 8)
|
||||
{
|
||||
FLOAT *BO;
|
||||
#if defined(TRMMKERNEL)
|
||||
REFRESH_POINTERS (8, 1)
|
||||
#else
|
||||
BO = B;
|
||||
temp = k;
|
||||
#endif
|
||||
BLASLONG l = 0;
|
||||
v4sf_t t = { 0, 0 };
|
||||
v4sf_t t1 = { 0, 0 };
|
||||
v4sf_t t2 = { 0, 0 };
|
||||
v4sf_t t3 = { 0, 0 };
|
||||
for (l = 0; l < temp; l++)
|
||||
{
|
||||
v4sf_t rowB = { BO[l], BO[l] };
|
||||
v4sf_t rowA = { AO[l << 3], AO[(l << 3) + 1] };
|
||||
v4sf_t rowA1 = { AO[(l << 3) + 2], AO[(l << 3) + 3] };
|
||||
v4sf_t rowA2 = { AO[(l << 3) + 4], AO[(l << 3) + 5] };
|
||||
v4sf_t rowA3 = { AO[(l << 3) + 6], AO[(l << 3) + 7] };
|
||||
t += rowA * rowB;
|
||||
t1 += rowA1 * rowB;
|
||||
t2 += rowA2 * rowB;
|
||||
t3 += rowA3 * rowB;
|
||||
}
|
||||
t = t * valpha;
|
||||
t1 = t1 * valpha;
|
||||
t2 = t2 * valpha;
|
||||
t3 = t3 * valpha;
|
||||
#if defined(TRMMKERNEL)
|
||||
CO[0] = t[0];
|
||||
CO[1] = t[1];
|
||||
CO[2] = t1[0];
|
||||
CO[3] = t1[1];
|
||||
CO[4] = t2[0];
|
||||
CO[5] = t2[1];
|
||||
CO[6] = t3[0];
|
||||
CO[7] = t3[1];
|
||||
#else
|
||||
CO[0] += t[0];
|
||||
CO[1] += t[1];
|
||||
CO[2] += t1[0];
|
||||
CO[3] += t1[1];
|
||||
CO[4] += t2[0];
|
||||
CO[5] += t2[1];
|
||||
CO[6] += t3[0];
|
||||
CO[7] += t3[1];
|
||||
#endif
|
||||
AO += temp << 3;
|
||||
BO += temp;
|
||||
CO += 8;
|
||||
i -= 8;
|
||||
#if defined(TRMMKERNEL)
|
||||
REFRESH_AFTER_SAVE (8, 1)
|
||||
#endif
|
||||
}
|
||||
while (i >= 4)
|
||||
{
|
||||
FLOAT *BO;
|
||||
#if defined(TRMMKERNEL)
|
||||
REFRESH_POINTERS (4, 1)
|
||||
#else
|
||||
BO = B;
|
||||
temp = k;
|
||||
#endif
|
||||
BLASLONG l = 0;
|
||||
v4sf_t t = { 0, 0 };
|
||||
v4sf_t t1 = { 0, 0 };
|
||||
for (l = 0; l < temp; l++)
|
||||
{
|
||||
v4sf_t rowB = { BO[l], BO[l] };
|
||||
v4sf_t rowA = { AO[l << 2], AO[(l << 2) + 1] };
|
||||
v4sf_t rowA1 = { AO[(l << 2) + 2], AO[(l << 2) + 3] };
|
||||
t += rowA * rowB;
|
||||
t1 += rowA1 * rowB;
|
||||
}
|
||||
t = t * valpha;
|
||||
t1 = t1 * valpha;
|
||||
#if defined(TRMMKERNEL)
|
||||
CO[0] = t[0];
|
||||
CO[1] = t[1];
|
||||
CO[2] = t1[0];
|
||||
CO[3] = t1[1];
|
||||
#else
|
||||
CO[0] += t[0];
|
||||
CO[1] += t[1];
|
||||
CO[2] += t1[0];
|
||||
CO[3] += t1[1];
|
||||
#endif
|
||||
AO += temp << 2;
|
||||
BO += temp;
|
||||
CO += 4;
|
||||
i -= 4;
|
||||
#if defined(TRMMKERNEL)
|
||||
REFRESH_AFTER_SAVE (4, 1)
|
||||
#endif
|
||||
}
|
||||
while (i >= 2)
|
||||
{
|
||||
FLOAT *BO;
|
||||
#if defined(TRMMKERNEL)
|
||||
REFRESH_POINTERS (2, 1)
|
||||
#else
|
||||
BO = B;
|
||||
temp = k;
|
||||
#endif
|
||||
BLASLONG l = 0;
|
||||
v4sf_t t = { 0, 0 };
|
||||
for (l = 0; l < temp; l++)
|
||||
{
|
||||
v4sf_t rowB = { BO[l], BO[l] };
|
||||
v4sf_t rowA = { AO[l << 1], AO[(l << 1) + 1] };
|
||||
t += rowA * rowB;
|
||||
}
|
||||
t = t * valpha;
|
||||
#if defined(TRMMKERNEL)
|
||||
CO[0] = t[0];
|
||||
CO[1] = t[1];
|
||||
#else
|
||||
CO[0] += t[0];
|
||||
CO[1] += t[1];
|
||||
#endif
|
||||
AO += temp << 1;
|
||||
BO += temp;
|
||||
CO += 2;
|
||||
i -= 2;
|
||||
#if defined(TRMMKERNEL)
|
||||
REFRESH_AFTER_SAVE (2, 1)
|
||||
#endif
|
||||
}
|
||||
while (i >= 1)
|
||||
{
|
||||
FLOAT *BO;
|
||||
#if defined(TRMMKERNEL)
|
||||
REFRESH_POINTERS (1, 1)
|
||||
#else
|
||||
BO = B;
|
||||
temp = k;
|
||||
#endif
|
||||
BLASLONG l = 0;
|
||||
FLOAT t = 0;
|
||||
for (l = 0; l < temp; l++)
|
||||
{
|
||||
t += AO[l] * BO[l];
|
||||
}
|
||||
AO += temp;
|
||||
BO += temp;
|
||||
#if defined(TRMMKERNEL)
|
||||
CO[0] = t * alpha;
|
||||
#else
|
||||
CO[0] += t * alpha;
|
||||
#endif
|
||||
CO += 1;
|
||||
i -= 1;
|
||||
#if defined(TRMMKERNEL)
|
||||
REFRESH_AFTER_SAVE (1, 1)
|
||||
#endif
|
||||
}
|
||||
#if defined(TRMMKERNEL) && !defined(LEFT)
|
||||
off += 1; // number of values in A
|
||||
#endif
|
||||
B += k;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
@@ -38,9 +38,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
#include "common.h"
|
||||
|
||||
|
||||
#if defined(POWER8) || defined(POWER9)
|
||||
#if defined(POWER8) || defined(POWER9) || defined(POWER10)
|
||||
#if defined(__VEC__) || defined(__ALTIVEC__)
|
||||
#include "dgemv_n_microk_power8.c"
|
||||
#endif
|
||||
#endif
|
||||
|
||||
|
||||
#define NBMAX 4096
|
||||
|
||||
268
kernel/power/dgemv_n_microk_power10.c
Normal file
268
kernel/power/dgemv_n_microk_power10.c
Normal file
@@ -0,0 +1,268 @@
|
||||
/***************************************************************************
|
||||
Copyright (c) 2013-2016, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
/**************************************************************************************
|
||||
* 2016/03/30 Werner Saar (wernsaar@googlemail.com)
|
||||
* BLASTEST : OK
|
||||
* CTEST : OK
|
||||
* TEST : OK
|
||||
* LAPACK-TEST : OK
|
||||
**************************************************************************************/
|
||||
|
||||
#define HAVE_KERNEL_4x4 1
|
||||
|
||||
static void dgemv_kernel_4x4 (long n, double *ap, long lda, double *x, double *y, double alpha)
|
||||
{
|
||||
double *a0;
|
||||
double *a1;
|
||||
double *a2;
|
||||
double *a3;
|
||||
|
||||
__asm__
|
||||
(
|
||||
"lxvp 40, 0(%10) \n\t" // x0, x1
|
||||
XXSPLTD_S(32,%x9,0) // alpha, alpha
|
||||
|
||||
"sldi %6, %13, 3 \n\t" // lda * sizeof (double)
|
||||
|
||||
"xvmuldp 34, 41, 32 \n\t" // x0 * alpha, x1 * alpha
|
||||
"xvmuldp 35, 40, 32 \n\t" // x2 * alpha, x3 * alpha
|
||||
|
||||
"add %4, %3, %6 \n\t" // a0 = ap, a1 = a0 + lda
|
||||
"add %6, %6, %6 \n\t" // 2 * lda
|
||||
|
||||
XXSPLTD_S(32,34,1) // x0 * alpha, x0 * alpha
|
||||
XXSPLTD_S(33,34,0) // x1 * alpha, x1 * alpha
|
||||
XXSPLTD_S(34,35,1) // x2 * alpha, x2 * alpha
|
||||
XXSPLTD_S(35,35,0) // x3 * alpha, x3 * alpha
|
||||
|
||||
"add %5, %3, %6 \n\t" // a2 = a0 + 2 * lda
|
||||
"add %6, %4, %6 \n\t" // a3 = a1 + 2 * lda
|
||||
|
||||
"dcbt 0, %3 \n\t"
|
||||
"dcbt 0, %4 \n\t"
|
||||
"dcbt 0, %5 \n\t"
|
||||
"dcbt 0, %6 \n\t"
|
||||
|
||||
"lxvp 40, 0(%3) \n\t" // a0[0], a0[1]
|
||||
|
||||
"lxvp 42, 0(%4) \n\t" // a1[0], a1[1]
|
||||
|
||||
"lxvp 44, 0(%5) \n\t" // a2[0], a2[1]
|
||||
|
||||
"lxvp 46, 0(%6) \n\t" // a3[0], a3[1]
|
||||
|
||||
"dcbt 0, %2 \n\t"
|
||||
|
||||
"addi %3, %3, 32 \n\t"
|
||||
"addi %4, %4, 32 \n\t"
|
||||
"addi %5, %5, 32 \n\t"
|
||||
"addi %6, %6, 32 \n\t"
|
||||
|
||||
"addic. %1, %1, -4 \n\t"
|
||||
"ble two%= \n\t"
|
||||
|
||||
".align 5 \n"
|
||||
"one%=: \n\t"
|
||||
|
||||
"lxvp 36, 0(%2) \n\t" // y0, y1
|
||||
|
||||
"xvmaddadp 36, 40, 32 \n\t"
|
||||
"xvmaddadp 37, 41, 32 \n\t"
|
||||
|
||||
"lxvp 40, 0(%3) \n\t" // a0[0], a0[1]
|
||||
|
||||
"xvmaddadp 36, 42, 33 \n\t"
|
||||
"addi %3, %3, 32 \n\t"
|
||||
"xvmaddadp 37, 43, 33 \n\t"
|
||||
|
||||
"lxvp 42, 0(%4) \n\t" // a1[0], a1[1]
|
||||
|
||||
"xvmaddadp 36, 44, 34 \n\t"
|
||||
"addi %4, %4, 32 \n\t"
|
||||
"xvmaddadp 37, 45, 34 \n\t"
|
||||
|
||||
"lxvp 44, 0(%5) \n\t" // a2[0], a2[1]
|
||||
|
||||
"xvmaddadp 36, 46, 35 \n\t"
|
||||
"addi %5, %5, 32 \n\t"
|
||||
"xvmaddadp 37, 47, 35 \n\t"
|
||||
|
||||
"stxvp 36, 0(%2) \n\t" // y0, y1
|
||||
|
||||
"lxvp 46, 0(%6) \n\t" // a3[0], a3[1]
|
||||
|
||||
"addi %6, %6, 32 \n\t"
|
||||
"addi %2, %2, 32 \n\t"
|
||||
|
||||
"addic. %1, %1, -4 \n\t"
|
||||
"ble two%= \n\t"
|
||||
|
||||
|
||||
"lxvp 36, 0(%2) \n\t" // y0, y1
|
||||
|
||||
"xvmaddadp 36, 40, 32 \n\t"
|
||||
"xvmaddadp 37, 41, 32 \n\t"
|
||||
|
||||
"lxvp 40, 0(%3) \n\t" // a0[0], a0[1]
|
||||
|
||||
"xvmaddadp 36, 42, 33 \n\t"
|
||||
"addi %3, %3, 32 \n\t"
|
||||
"xvmaddadp 37, 43, 33 \n\t"
|
||||
|
||||
"lxvp 42, 0(%4) \n\t" // a1[0], a1[1]
|
||||
|
||||
"xvmaddadp 36, 44, 34 \n\t"
|
||||
"addi %4, %4, 32 \n\t"
|
||||
"xvmaddadp 37, 45, 34 \n\t"
|
||||
|
||||
"lxvp 44, 0(%5) \n\t" // a2[0], a2[1]
|
||||
|
||||
"xvmaddadp 36, 46, 35 \n\t"
|
||||
"addi %5, %5, 32 \n\t"
|
||||
"xvmaddadp 37, 47, 35 \n\t"
|
||||
|
||||
"stxvp 36, 0(%2) \n\t" // y0, y1
|
||||
|
||||
"lxvp 46, 0(%6) \n\t" // a3[0], a3[1]
|
||||
|
||||
"addi %6, %6, 32 \n\t"
|
||||
"addi %2, %2, 32 \n\t"
|
||||
|
||||
"addic. %1, %1, -4 \n\t"
|
||||
"ble two%= \n\t"
|
||||
|
||||
|
||||
"lxvp 36, 0(%2) \n\t" // y0, y1
|
||||
|
||||
"xvmaddadp 36, 40, 32 \n\t"
|
||||
"xvmaddadp 37, 41, 32 \n\t"
|
||||
|
||||
"lxvp 40, 0(%3) \n\t" // a0[0], a0[1]
|
||||
|
||||
"xvmaddadp 36, 42, 33 \n\t"
|
||||
"addi %3, %3, 32 \n\t"
|
||||
"xvmaddadp 37, 43, 33 \n\t"
|
||||
|
||||
"lxvp 42, 0(%4) \n\t" // a1[0], a1[1]
|
||||
|
||||
"xvmaddadp 36, 44, 34 \n\t"
|
||||
"addi %4, %4, 32 \n\t"
|
||||
"xvmaddadp 37, 45, 34 \n\t"
|
||||
|
||||
"lxvp 44, 0(%5) \n\t" // a2[0], a2[1]
|
||||
|
||||
"xvmaddadp 36, 46, 35 \n\t"
|
||||
"addi %5, %5, 32 \n\t"
|
||||
"xvmaddadp 37, 47, 35 \n\t"
|
||||
|
||||
"stxvp 36, 0(%2) \n\t" // y0, y1
|
||||
|
||||
"lxvp 46, 0(%6) \n\t" // a3[0], a3[1]
|
||||
|
||||
"addi %6, %6, 32 \n\t"
|
||||
"addi %2, %2, 32 \n\t"
|
||||
|
||||
"addic. %1, %1, -4 \n\t"
|
||||
"ble two%= \n\t"
|
||||
|
||||
|
||||
"lxvp 36, 0(%2) \n\t" // y0, y1
|
||||
|
||||
"xvmaddadp 36, 40, 32 \n\t"
|
||||
"xvmaddadp 37, 41, 32 \n\t"
|
||||
|
||||
"lxvp 40, 0(%3) \n\t" // a0[0], a0[1]
|
||||
|
||||
"xvmaddadp 36, 42, 33 \n\t"
|
||||
"addi %3, %3, 32 \n\t"
|
||||
"xvmaddadp 37, 43, 33 \n\t"
|
||||
|
||||
"lxvp 42, 0(%4) \n\t" // a1[0], a1[1]
|
||||
|
||||
"xvmaddadp 36, 44, 34 \n\t"
|
||||
"addi %4, %4, 32 \n\t"
|
||||
"xvmaddadp 37, 45, 34 \n\t"
|
||||
|
||||
"lxvp 44, 0(%5) \n\t" // a2[0], a2[1]
|
||||
|
||||
"xvmaddadp 36, 46, 35 \n\t"
|
||||
"addi %5, %5, 32 \n\t"
|
||||
"xvmaddadp 37, 47, 35 \n\t"
|
||||
|
||||
"stxvp 36, 0(%2) \n\t" // y0, y1
|
||||
|
||||
"lxvp 46, 0(%6) \n\t" // a3[0], a3[1]
|
||||
|
||||
"addi %6, %6, 32 \n\t"
|
||||
"addi %2, %2, 32 \n\t"
|
||||
|
||||
"addic. %1, %1, -4 \n\t"
|
||||
"bgt one%= \n"
|
||||
|
||||
"two%=: \n\t"
|
||||
|
||||
"lxvp 36, 0(%2) \n\t" // y0, y1
|
||||
|
||||
"xvmaddadp 36, 40, 32 \n\t"
|
||||
"xvmaddadp 37, 41, 32 \n\t"
|
||||
|
||||
"xvmaddadp 36, 42, 33 \n\t"
|
||||
"xvmaddadp 37, 43, 33 \n\t"
|
||||
|
||||
"xvmaddadp 36, 44, 34 \n\t"
|
||||
"xvmaddadp 37, 45, 34 \n\t"
|
||||
|
||||
"xvmaddadp 36, 46, 35 \n\t"
|
||||
"xvmaddadp 37, 47, 35 \n\t"
|
||||
|
||||
"stxvp 36, 0(%2) \n\t" // y0, y1
|
||||
|
||||
"#n=%1 ap=%8=%12 lda=%13 x=%7=%10 y=%0=%2 alpha=%9 o16=%11\n"
|
||||
"#a0=%3 a1=%4 a2=%5 a3=%6"
|
||||
:
|
||||
"+m" (*y),
|
||||
"+r" (n), // 1
|
||||
"+b" (y), // 2
|
||||
"=b" (a0), // 3
|
||||
"=b" (a1), // 4
|
||||
"=&b" (a2), // 5
|
||||
"=&b" (a3) // 6
|
||||
:
|
||||
"m" (*x),
|
||||
"m" (*ap),
|
||||
"d" (alpha), // 9
|
||||
"r" (x), // 10
|
||||
"b" (16), // 11
|
||||
"3" (ap), // 12
|
||||
"4" (lda) // 13
|
||||
:
|
||||
"cr0",
|
||||
"vs32","vs33","vs34","vs35","vs36","vs37",
|
||||
"vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47"
|
||||
);
|
||||
}
|
||||
565
kernel/power/dgemv_n_power10.c
Normal file
565
kernel/power/dgemv_n_power10.c
Normal file
@@ -0,0 +1,565 @@
|
||||
/***************************************************************************
|
||||
Copyright (c) 2020, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
#include <altivec.h>
|
||||
|
||||
typedef __vector unsigned char vec_t;
|
||||
typedef FLOAT v4sf_t __attribute__ ((vector_size (16)));
|
||||
typedef __vector_pair __attribute__((aligned(8))) vecp_t;
|
||||
|
||||
#include "dgemv_n_microk_power10.c"
|
||||
|
||||
#define MMA(X, APTR, ACC) \
|
||||
rX = (vec_t *) & X; \
|
||||
rowA = *((vecp_t*)((void*)&APTR)); \
|
||||
__builtin_mma_xvf64gerpp (ACC, rowA, rX[0]);
|
||||
|
||||
#define SAVE(ACC, Z) \
|
||||
rowC = (v4sf_t *) &y[Z]; \
|
||||
__builtin_mma_disassemble_acc ((void *)result, ACC); \
|
||||
result[0][1] = result[1][0]; \
|
||||
result[2][1] = result[3][0]; \
|
||||
rowC[0] += valpha * result[0]; \
|
||||
rowC[1] += valpha * result[2];
|
||||
|
||||
void
|
||||
dgemv_kernel_4x128 (BLASLONG n, FLOAT * a_ptr, BLASLONG lda, FLOAT * xo,
|
||||
FLOAT * y, FLOAT alpha)
|
||||
{
|
||||
BLASLONG i, j, tmp;
|
||||
FLOAT *a0 = a_ptr;
|
||||
FLOAT *x1 = xo;
|
||||
vector double valpha = { alpha, alpha };
|
||||
v4sf_t *rowC;
|
||||
__vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7;
|
||||
v4sf_t result[4];
|
||||
vecp_t rowA;
|
||||
vec_t *rX;
|
||||
tmp = (n / 32) * 32;
|
||||
for (i = 0; i < tmp; i += 32)
|
||||
{
|
||||
xo = x1;
|
||||
a0 = a_ptr;
|
||||
__builtin_mma_xxsetaccz (&acc0);
|
||||
__builtin_mma_xxsetaccz (&acc1);
|
||||
__builtin_mma_xxsetaccz (&acc2);
|
||||
__builtin_mma_xxsetaccz (&acc3);
|
||||
__builtin_mma_xxsetaccz (&acc4);
|
||||
__builtin_mma_xxsetaccz (&acc5);
|
||||
__builtin_mma_xxsetaccz (&acc6);
|
||||
__builtin_mma_xxsetaccz (&acc7);
|
||||
for (j = 0; j < 32; j++)
|
||||
{
|
||||
__builtin_prefetch (xo+j);
|
||||
__builtin_prefetch (a0+i+j+lda);
|
||||
MMA (xo[j], a0[i + 0 + j * lda], &acc0);
|
||||
MMA (xo[j], a0[i + 4 + j * lda], &acc1);
|
||||
MMA (xo[j], a0[i + 8 + j * lda], &acc2);
|
||||
MMA (xo[j], a0[i + 12 + j * lda], &acc3);
|
||||
MMA (xo[j], a0[i + 16 + j * lda], &acc4);
|
||||
MMA (xo[j], a0[i + 20 + j * lda], &acc5);
|
||||
MMA (xo[j], a0[i + 24 + j * lda], &acc6);
|
||||
MMA (xo[j], a0[i + 28 + j * lda], &acc7);
|
||||
}
|
||||
xo += 32;
|
||||
a0 += lda << 5;
|
||||
for (j = 0; j < 32; j++)
|
||||
{
|
||||
__builtin_prefetch (xo+j);
|
||||
__builtin_prefetch (a0+i+j+lda);
|
||||
MMA (xo[j], a0[i + 0 + j * lda], &acc0);
|
||||
MMA (xo[j], a0[i + 4 + j * lda], &acc1);
|
||||
MMA (xo[j], a0[i + 8 + j * lda], &acc2);
|
||||
MMA (xo[j], a0[i + 12 + j * lda], &acc3);
|
||||
MMA (xo[j], a0[i + 16 + j * lda], &acc4);
|
||||
MMA (xo[j], a0[i + 20 + j * lda], &acc5);
|
||||
MMA (xo[j], a0[i + 24 + j * lda], &acc6);
|
||||
MMA (xo[j], a0[i + 28 + j * lda], &acc7);
|
||||
}
|
||||
xo += 32;
|
||||
a0 += lda << 5;
|
||||
for (j = 0; j < 32; j++)
|
||||
{
|
||||
__builtin_prefetch (xo+j);
|
||||
__builtin_prefetch (a0+i+j+lda);
|
||||
MMA (xo[j], a0[i + 0 + j * lda], &acc0);
|
||||
MMA (xo[j], a0[i + 4 + j * lda], &acc1);
|
||||
MMA (xo[j], a0[i + 8 + j * lda], &acc2);
|
||||
MMA (xo[j], a0[i + 12 + j * lda], &acc3);
|
||||
MMA (xo[j], a0[i + 16 + j * lda], &acc4);
|
||||
MMA (xo[j], a0[i + 20 + j * lda], &acc5);
|
||||
MMA (xo[j], a0[i + 24 + j * lda], &acc6);
|
||||
MMA (xo[j], a0[i + 28 + j * lda], &acc7);
|
||||
}
|
||||
xo += 32;
|
||||
a0 += lda << 5;
|
||||
for (j = 0; j < 32; j++)
|
||||
{
|
||||
__builtin_prefetch (xo+j);
|
||||
__builtin_prefetch (a0+i+j+lda);
|
||||
MMA (xo[j], a0[i + 0 + j * lda], &acc0);
|
||||
MMA (xo[j], a0[i + 4 + j * lda], &acc1);
|
||||
MMA (xo[j], a0[i + 8 + j * lda], &acc2);
|
||||
MMA (xo[j], a0[i + 12 + j * lda], &acc3);
|
||||
MMA (xo[j], a0[i + 16 + j * lda], &acc4);
|
||||
MMA (xo[j], a0[i + 20 + j * lda], &acc5);
|
||||
MMA (xo[j], a0[i + 24 + j * lda], &acc6);
|
||||
MMA (xo[j], a0[i + 28 + j * lda], &acc7);
|
||||
}
|
||||
xo += 32;
|
||||
a0 += lda << 5;
|
||||
SAVE (&acc0, i + 0);
|
||||
SAVE (&acc1, i + 4);
|
||||
SAVE (&acc2, i + 8);
|
||||
SAVE (&acc3, i + 12);
|
||||
SAVE (&acc4, i + 16);
|
||||
SAVE (&acc5, i + 20);
|
||||
SAVE (&acc6, i + 24);
|
||||
SAVE (&acc7, i + 28);
|
||||
|
||||
}
|
||||
for (i = tmp; i < n; i += 4)
|
||||
{
|
||||
xo = x1;
|
||||
a0 = a_ptr;
|
||||
__builtin_mma_xxsetaccz (&acc0);
|
||||
for (j = 0; j < 32; j++)
|
||||
{
|
||||
__builtin_prefetch (xo+j);
|
||||
__builtin_prefetch (a0+i+j+lda);
|
||||
MMA (xo[j], a0[i + j * lda], &acc0);
|
||||
}
|
||||
xo += 32;
|
||||
a0 += lda << 5;
|
||||
for (j = 0; j < 32; j++)
|
||||
{
|
||||
__builtin_prefetch (xo+j);
|
||||
__builtin_prefetch (a0+i+j+lda);
|
||||
MMA (xo[j], a0[i + j * lda], &acc0);
|
||||
}
|
||||
xo += 32;
|
||||
a0 += lda << 5;
|
||||
for (j = 0; j < 32; j++)
|
||||
{
|
||||
__builtin_prefetch (xo+j);
|
||||
__builtin_prefetch (a0+i+j+lda);
|
||||
MMA (xo[j], a0[i + j * lda], &acc0);
|
||||
}
|
||||
xo += 32;
|
||||
a0 += lda << 5;
|
||||
for (j = 0; j < 32; j++)
|
||||
{
|
||||
__builtin_prefetch (xo+j);
|
||||
__builtin_prefetch (a0+i+j+lda);
|
||||
MMA (xo[j], a0[i + j * lda], &acc0);
|
||||
}
|
||||
xo += 32;
|
||||
a0 += lda << 5;
|
||||
SAVE (&acc0, i);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
#define NBMAX 4096
|
||||
|
||||
#ifndef HAVE_KERNEL_4x4
|
||||
|
||||
static void dgemv_kernel_4x4(BLASLONG n, FLOAT *a_ptr, BLASLONG lda, FLOAT *xo, FLOAT *y, FLOAT alpha)
|
||||
{
|
||||
BLASLONG i;
|
||||
FLOAT x[4] __attribute__ ((aligned (16)));;
|
||||
FLOAT *a0 = a_ptr;
|
||||
FLOAT *a1 = a0 + lda;
|
||||
FLOAT *a2 = a1 + lda;
|
||||
FLOAT *a3 = a2 + lda;
|
||||
|
||||
|
||||
for ( i=0; i<4; i++)
|
||||
x[i] = xo[i] * alpha;
|
||||
|
||||
for ( i=0; i< n; i+=4 )
|
||||
{
|
||||
y[i] += a0[i]*x[0] + a1[i]*x[1] + a2[i]*x[2] + a3[i]*x[3];
|
||||
y[i+1] += a0[i+1]*x[0] + a1[i+1]*x[1] + a2[i+1]*x[2] + a3[i+1]*x[3];
|
||||
y[i+2] += a0[i+2]*x[0] + a1[i+2]*x[1] + a2[i+2]*x[2] + a3[i+2]*x[3];
|
||||
y[i+3] += a0[i+3]*x[0] + a1[i+3]*x[1] + a2[i+3]*x[2] + a3[i+3]*x[3];
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
#ifndef HAVE_KERNEL_4x2
|
||||
|
||||
static void dgemv_kernel_4x2(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *xo, FLOAT *y, FLOAT alpha)
|
||||
{
|
||||
BLASLONG i;
|
||||
FLOAT x[4] __attribute__ ((aligned (16)));;
|
||||
|
||||
for ( i=0; i<2; i++)
|
||||
x[i] = xo[i] * alpha;
|
||||
|
||||
for ( i=0; i< n; i+=4 )
|
||||
{
|
||||
y[i] += a0[i]*x[0] + a1[i]*x[1];
|
||||
y[i+1] += a0[i+1]*x[0] + a1[i+1]*x[1];
|
||||
y[i+2] += a0[i+2]*x[0] + a1[i+2]*x[1];
|
||||
y[i+3] += a0[i+3]*x[0] + a1[i+3]*x[1];
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
#endif
|
||||
|
||||
#ifndef HAVE_KERNEL_4x1
|
||||
|
||||
static void dgemv_kernel_4x1(BLASLONG n, FLOAT *a0, FLOAT *xo, FLOAT *y, FLOAT alpha)
|
||||
{
|
||||
BLASLONG i;
|
||||
FLOAT x[4] __attribute__ ((aligned (16)));;
|
||||
|
||||
for ( i=0; i<1; i++)
|
||||
x[i] = xo[i] * alpha;
|
||||
|
||||
for ( i=0; i< n; i+=4 )
|
||||
{
|
||||
y[i] += a0[i]*x[0];
|
||||
y[i+1] += a0[i+1]*x[0];
|
||||
y[i+2] += a0[i+2]*x[0];
|
||||
y[i+3] += a0[i+3]*x[0];
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest)
|
||||
{
|
||||
BLASLONG i;
|
||||
if ( inc_dest != 1 )
|
||||
{
|
||||
for ( i=0; i<n; i++ )
|
||||
{
|
||||
*dest += *src;
|
||||
src++;
|
||||
dest += inc_dest;
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
|
||||
{
|
||||
|
||||
BLASLONG i;
|
||||
FLOAT *a_ptr;
|
||||
FLOAT *x_ptr;
|
||||
FLOAT *y_ptr;
|
||||
BLASLONG n1;
|
||||
BLASLONG m1;
|
||||
BLASLONG m2;
|
||||
BLASLONG m3;
|
||||
BLASLONG n2;
|
||||
BLASLONG lda4 = lda << 2;
|
||||
BLASLONG lda128 = lda << 7;
|
||||
|
||||
FLOAT xbuffer[8] __attribute__ ((aligned (16)));
|
||||
FLOAT *ybuffer;
|
||||
|
||||
if ( m < 1 ) return(0);
|
||||
if ( n < 1 ) return(0);
|
||||
|
||||
ybuffer = buffer;
|
||||
BLASLONG n128 = n >> 7;
|
||||
n1 = (n - (n128 * 128)) >> 2;
|
||||
n2 = (n - (n128 * 128)) & 3;
|
||||
|
||||
m3 = m & 3 ;
|
||||
m1 = m & -4 ;
|
||||
m2 = (m & (NBMAX-1)) - m3 ;
|
||||
|
||||
y_ptr = y;
|
||||
|
||||
BLASLONG NB = NBMAX;
|
||||
|
||||
while ( NB == NBMAX )
|
||||
{
|
||||
|
||||
m1 -= NB;
|
||||
if ( m1 < 0)
|
||||
{
|
||||
if ( m2 == 0 ) break;
|
||||
NB = m2;
|
||||
}
|
||||
|
||||
a_ptr = a;
|
||||
x_ptr = x;
|
||||
|
||||
if ( inc_y != 1 )
|
||||
memset(ybuffer,0,NB*8);
|
||||
else
|
||||
ybuffer = y_ptr;
|
||||
|
||||
if ( inc_x == 1 )
|
||||
{
|
||||
|
||||
for( i = 0; i < n128 ; i++)
|
||||
{
|
||||
dgemv_kernel_4x128(NB,a_ptr,lda,x_ptr,ybuffer,alpha);
|
||||
a_ptr += lda128;
|
||||
x_ptr += 128;
|
||||
}
|
||||
|
||||
for( i = 0; i < n1 ; i++)
|
||||
{
|
||||
dgemv_kernel_4x4(NB,a_ptr,lda,x_ptr,ybuffer,alpha);
|
||||
a_ptr += lda4;
|
||||
x_ptr += 4;
|
||||
}
|
||||
|
||||
if ( n2 & 2 )
|
||||
{
|
||||
dgemv_kernel_4x2(NB,a_ptr,a_ptr+lda,x_ptr,ybuffer,alpha);
|
||||
a_ptr += lda*2;
|
||||
x_ptr += 2;
|
||||
}
|
||||
|
||||
|
||||
if ( n2 & 1 )
|
||||
{
|
||||
dgemv_kernel_4x1(NB,a_ptr,x_ptr,ybuffer,alpha);
|
||||
a_ptr += lda;
|
||||
x_ptr += 1;
|
||||
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
else
|
||||
{
|
||||
for( i = 0; i < n128 ; i++)
|
||||
{
|
||||
FLOAT xbuffer[128] __attribute__ ((aligned (16)));
|
||||
BLASLONG j;
|
||||
for ( j = 0; j < 128 ; j++)
|
||||
{
|
||||
xbuffer[j] = x_ptr[0];
|
||||
x_ptr += inc_x;
|
||||
}
|
||||
dgemv_kernel_4x128(NB,a_ptr,lda,xbuffer,ybuffer,alpha);
|
||||
a_ptr += lda128;
|
||||
}
|
||||
|
||||
for( i = 0; i < n1 ; i++)
|
||||
{
|
||||
xbuffer[0] = x_ptr[0];
|
||||
x_ptr += inc_x;
|
||||
xbuffer[1] = x_ptr[0];
|
||||
x_ptr += inc_x;
|
||||
xbuffer[2] = x_ptr[0];
|
||||
x_ptr += inc_x;
|
||||
xbuffer[3] = x_ptr[0];
|
||||
x_ptr += inc_x;
|
||||
dgemv_kernel_4x4(NB,a_ptr,lda,xbuffer,ybuffer,alpha);
|
||||
a_ptr += lda4;
|
||||
}
|
||||
|
||||
for( i = 0; i < n2 ; i++)
|
||||
{
|
||||
xbuffer[0] = x_ptr[0];
|
||||
x_ptr += inc_x;
|
||||
dgemv_kernel_4x1(NB,a_ptr,xbuffer,ybuffer,alpha);
|
||||
a_ptr += lda;
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
a += NB;
|
||||
if ( inc_y != 1 )
|
||||
{
|
||||
add_y(NB,ybuffer,y_ptr,inc_y);
|
||||
y_ptr += NB * inc_y;
|
||||
}
|
||||
else
|
||||
y_ptr += NB ;
|
||||
|
||||
}
|
||||
|
||||
if ( m3 == 0 ) return(0);
|
||||
|
||||
if ( m3 == 3 )
|
||||
{
|
||||
a_ptr = a;
|
||||
x_ptr = x;
|
||||
FLOAT temp0 = 0.0;
|
||||
FLOAT temp1 = 0.0;
|
||||
FLOAT temp2 = 0.0;
|
||||
if ( lda == 3 && inc_x ==1 )
|
||||
{
|
||||
|
||||
for( i = 0; i < ( n & -4 ); i+=4 )
|
||||
{
|
||||
|
||||
temp0 += a_ptr[0] * x_ptr[0] + a_ptr[3] * x_ptr[1];
|
||||
temp1 += a_ptr[1] * x_ptr[0] + a_ptr[4] * x_ptr[1];
|
||||
temp2 += a_ptr[2] * x_ptr[0] + a_ptr[5] * x_ptr[1];
|
||||
|
||||
temp0 += a_ptr[6] * x_ptr[2] + a_ptr[9] * x_ptr[3];
|
||||
temp1 += a_ptr[7] * x_ptr[2] + a_ptr[10] * x_ptr[3];
|
||||
temp2 += a_ptr[8] * x_ptr[2] + a_ptr[11] * x_ptr[3];
|
||||
|
||||
a_ptr += 12;
|
||||
x_ptr += 4;
|
||||
}
|
||||
|
||||
for( ; i < n; i++ )
|
||||
{
|
||||
temp0 += a_ptr[0] * x_ptr[0];
|
||||
temp1 += a_ptr[1] * x_ptr[0];
|
||||
temp2 += a_ptr[2] * x_ptr[0];
|
||||
a_ptr += 3;
|
||||
x_ptr ++;
|
||||
}
|
||||
|
||||
}
|
||||
else
|
||||
{
|
||||
|
||||
for( i = 0; i < n; i++ )
|
||||
{
|
||||
temp0 += a_ptr[0] * x_ptr[0];
|
||||
temp1 += a_ptr[1] * x_ptr[0];
|
||||
temp2 += a_ptr[2] * x_ptr[0];
|
||||
a_ptr += lda;
|
||||
x_ptr += inc_x;
|
||||
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
y_ptr[0] += alpha * temp0;
|
||||
y_ptr += inc_y;
|
||||
y_ptr[0] += alpha * temp1;
|
||||
y_ptr += inc_y;
|
||||
y_ptr[0] += alpha * temp2;
|
||||
return(0);
|
||||
}
|
||||
|
||||
|
||||
if ( m3 == 2 )
|
||||
{
|
||||
a_ptr = a;
|
||||
x_ptr = x;
|
||||
FLOAT temp0 = 0.0;
|
||||
FLOAT temp1 = 0.0;
|
||||
if ( lda == 2 && inc_x ==1 )
|
||||
{
|
||||
|
||||
for( i = 0; i < (n & -4) ; i+=4 )
|
||||
{
|
||||
temp0 += a_ptr[0] * x_ptr[0] + a_ptr[2] * x_ptr[1];
|
||||
temp1 += a_ptr[1] * x_ptr[0] + a_ptr[3] * x_ptr[1];
|
||||
temp0 += a_ptr[4] * x_ptr[2] + a_ptr[6] * x_ptr[3];
|
||||
temp1 += a_ptr[5] * x_ptr[2] + a_ptr[7] * x_ptr[3];
|
||||
a_ptr += 8;
|
||||
x_ptr += 4;
|
||||
|
||||
}
|
||||
|
||||
|
||||
for( ; i < n; i++ )
|
||||
{
|
||||
temp0 += a_ptr[0] * x_ptr[0];
|
||||
temp1 += a_ptr[1] * x_ptr[0];
|
||||
a_ptr += 2;
|
||||
x_ptr ++;
|
||||
}
|
||||
|
||||
}
|
||||
else
|
||||
{
|
||||
|
||||
for( i = 0; i < n; i++ )
|
||||
{
|
||||
temp0 += a_ptr[0] * x_ptr[0];
|
||||
temp1 += a_ptr[1] * x_ptr[0];
|
||||
a_ptr += lda;
|
||||
x_ptr += inc_x;
|
||||
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
y_ptr[0] += alpha * temp0;
|
||||
y_ptr += inc_y;
|
||||
y_ptr[0] += alpha * temp1;
|
||||
return(0);
|
||||
}
|
||||
|
||||
if ( m3 == 1 )
|
||||
{
|
||||
a_ptr = a;
|
||||
x_ptr = x;
|
||||
FLOAT temp = 0.0;
|
||||
if ( lda == 1 && inc_x ==1 )
|
||||
{
|
||||
|
||||
for( i = 0; i < (n & -4); i+=4 )
|
||||
{
|
||||
temp += a_ptr[i] * x_ptr[i] + a_ptr[i+1] * x_ptr[i+1] + a_ptr[i+2] * x_ptr[i+2] + a_ptr[i+3] * x_ptr[i+3];
|
||||
|
||||
}
|
||||
|
||||
for( ; i < n; i++ )
|
||||
{
|
||||
temp += a_ptr[i] * x_ptr[i];
|
||||
}
|
||||
|
||||
}
|
||||
else
|
||||
{
|
||||
|
||||
for( i = 0; i < n; i++ )
|
||||
{
|
||||
temp += a_ptr[0] * x_ptr[0];
|
||||
a_ptr += lda;
|
||||
x_ptr += inc_x;
|
||||
}
|
||||
|
||||
}
|
||||
y_ptr[0] += alpha * temp;
|
||||
return(0);
|
||||
}
|
||||
|
||||
|
||||
return(0);
|
||||
}
|
||||
|
||||
|
||||
@@ -25,15 +25,19 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#if !defined(__VEC__) || !defined(__ALTIVEC__)
|
||||
#include "../arm/gemv_t.c"
|
||||
#else
|
||||
|
||||
#include "common.h"
|
||||
|
||||
#define NBMAX 1024
|
||||
//#define PREFETCH 1
|
||||
|
||||
#include <altivec.h>
|
||||
|
||||
#define HAVE_KERNEL4x8_ASM 1
|
||||
|
||||
|
||||
#if defined(HAVE_KERNEL4x8_ASM)
|
||||
static void dgemv_kernel_4x8(BLASLONG n, BLASLONG lda, double *ap, double *x, double *y, double alpha) {
|
||||
|
||||
@@ -355,7 +359,7 @@ static void dgemv_kernel_4x8(BLASLONG n, BLASLONG lda, double *ap, double *x, do
|
||||
"stxvd2x 39, %[off], %[y] \n\t"
|
||||
"stxvd2x 40, %[off2], %[y] \n\t"
|
||||
|
||||
: [memy] "+m" (*(const double (*)[8])y),
|
||||
: [memy] "+m" (*(double (*)[8])y),
|
||||
[n] "+&r" (n),
|
||||
[a0] "=b" (a0),
|
||||
[a1] "=&b" (a1),
|
||||
@@ -369,7 +373,7 @@ static void dgemv_kernel_4x8(BLASLONG n, BLASLONG lda, double *ap, double *x, do
|
||||
[off2]"=&b" (off2),
|
||||
[temp] "=&b" (tempR)
|
||||
: [memx] "m" (*(const double (*)[n])x),
|
||||
[mem_ap] "m" (*(const double (*)[]) ap),
|
||||
[mem_ap] "m" (*(const double (*)[n*8]) ap),
|
||||
[alpha] "d" (alpha),
|
||||
"[a0]" (ap),
|
||||
[x] "b" (x),
|
||||
@@ -883,4 +887,5 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO
|
||||
return (0);
|
||||
|
||||
}
|
||||
#endif
|
||||
|
||||
|
||||
840
kernel/power/dgemv_t_power10.c
Normal file
840
kernel/power/dgemv_t_power10.c
Normal file
@@ -0,0 +1,840 @@
|
||||
/***************************************************************************
|
||||
Copyright (c) 2018, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
|
||||
#define NBMAX 1024
|
||||
//#define PREFETCH 1
|
||||
#include <altivec.h>
|
||||
|
||||
#define HAVE_KERNEL4x8_ASM 1
|
||||
|
||||
|
||||
#if defined(HAVE_KERNEL4x8_ASM)
|
||||
static void dgemv_kernel_4x8(BLASLONG n, BLASLONG lda, double *ap, double *x, double *y, double alpha) {
|
||||
|
||||
FLOAT *a0, *a1, *a2, *a3, *a4, *a5, *a6, *a7;
|
||||
BLASLONG off2;
|
||||
BLASLONG tempR;
|
||||
__asm__(
|
||||
|
||||
"sldi %[temp],%[off], 4 \n\t" // lda * sizeof (double) *2
|
||||
"sldi %[off], %[off], 3 \n\t" // lda * sizeof (double)
|
||||
"xxlxor 34,34,34 \n\t"
|
||||
"xxlxor 35,34,34 \n\t"
|
||||
"add %[a2], %[a0], %[temp] \n\t"
|
||||
"add %[a1], %[a0], %[off] \n\t"
|
||||
"xxlxor 4,34,34 \n\t"
|
||||
"xxlxor 5,34,34 \n\t"
|
||||
"xxlxor 6,34,34 \n\t"
|
||||
"xxlxor 7,34,34 \n\t"
|
||||
"add %[a3], %[a2], %[off] \n\t"
|
||||
"add %[a4], %[a2], %[temp] \n\t"
|
||||
|
||||
"xxlxor 8,34,34 \n\t"
|
||||
"xxlxor 9,34,34 \n\t"
|
||||
"add %[a5], %[a3], %[temp] \n\t"
|
||||
"li %[off],0 \n\t"
|
||||
"li %[off2],16 \n\t"
|
||||
|
||||
"add %[a6], %[a4], %[temp] \n\t"
|
||||
"add %[a7], %[a5], %[temp] \n\t"
|
||||
|
||||
|
||||
|
||||
|
||||
"lxvp 32, 0(%[x]) \n\t"
|
||||
"lxvp 36, 0(%[a0]) \n\t"
|
||||
"lxvp 38, 0(%[a1]) \n\t"
|
||||
"lxvp 40, 0(%[a2]) \n\t"
|
||||
"lxvp 42, 0(%[a3]) \n\t"
|
||||
"lxvp 44, 0(%[a4]) \n\t"
|
||||
"lxvp 46, 0(%[a5]) \n\t"
|
||||
"lxvp 48, 0(%[a6]) \n\t"
|
||||
"lxvp 50, 0(%[a7]) \n\t"
|
||||
#if defined(PREFETCH)
|
||||
"li %[temp],896 \n\t"
|
||||
#endif
|
||||
"addic. %[n],%[n],-4 \n\t"
|
||||
|
||||
"li %[off],32 \n\t"
|
||||
|
||||
|
||||
"ble- two%= \n\t"
|
||||
|
||||
//--------------------------------------------------
|
||||
".align 5 \n\t"
|
||||
"one%=: \n\t"
|
||||
"xvmaddadp 34,36,32 \n\t"
|
||||
"xvmaddadp 35,38,32 \n\t"
|
||||
"addi %[off2], %[off2],32 \n\t"
|
||||
"xvmaddadp 4,40,32 \n\t"
|
||||
"xvmaddadp 5,42,32 \n\t"
|
||||
"xvmaddadp 6,44,32 \n\t"
|
||||
"xvmaddadp 7,46,32 \n\t"
|
||||
"xvmaddadp 8,48,32 \n\t"
|
||||
"xvmaddadp 9,50,32 \n\t"
|
||||
|
||||
"xvmaddadp 34,37,33 \n\t"
|
||||
"xvmaddadp 35,39,33 \n\t"
|
||||
"lxvp 36, 32(%[a0]) \n\t"
|
||||
"lxvp 38, 32(%[a1]) \n\t"
|
||||
"xvmaddadp 4,41,33 \n\t"
|
||||
"xvmaddadp 5,43,33 \n\t"
|
||||
"addi %[off], %[off],32 \n\t"
|
||||
"lxvp 40, 32(%[a2]) \n\t"
|
||||
"lxvp 42, 32(%[a3]) \n\t"
|
||||
"xvmaddadp 6,45,33 \n\t"
|
||||
"xvmaddadp 7,47,33 \n\t"
|
||||
"lxvp 44, 32(%[a4]) \n\t"
|
||||
"lxvp 46, 32(%[a5]) \n\t"
|
||||
"xvmaddadp 8,49,33 \n\t"
|
||||
"xvmaddadp 9,51,33 \n\t"
|
||||
|
||||
"addic. %[n],%[n],-4 \n\t"
|
||||
"lxvp 48, 32(%[a6]) \n\t"
|
||||
"lxvp 50, 32(%[a7]) \n\t"
|
||||
"lxvp 32, 32(%[x]) \n\t"
|
||||
"ble- two%= \n\t"
|
||||
"xvmaddadp 34,36,32 \n\t"
|
||||
"xvmaddadp 35,38,32 \n\t"
|
||||
"addi %[off2], %[off2],32 \n\t"
|
||||
"xvmaddadp 4,40,32 \n\t"
|
||||
"xvmaddadp 5,42,32 \n\t"
|
||||
"xvmaddadp 6,44,32 \n\t"
|
||||
"xvmaddadp 7,46,32 \n\t"
|
||||
"xvmaddadp 8,48,32 \n\t"
|
||||
"xvmaddadp 9,50,32 \n\t"
|
||||
|
||||
"xvmaddadp 34,37,33 \n\t"
|
||||
"xvmaddadp 35,39,33 \n\t"
|
||||
"lxvp 36, 64(%[a0]) \n\t"
|
||||
"lxvp 38, 64(%[a1]) \n\t"
|
||||
"xvmaddadp 4,41,33 \n\t"
|
||||
"xvmaddadp 5,43,33 \n\t"
|
||||
"addi %[off], %[off],32 \n\t"
|
||||
"lxvp 40, 64(%[a2]) \n\t"
|
||||
"lxvp 42, 64(%[a3]) \n\t"
|
||||
"xvmaddadp 6,45,33 \n\t"
|
||||
"xvmaddadp 7,47,33 \n\t"
|
||||
"lxvp 44, 64(%[a4]) \n\t"
|
||||
"lxvp 46, 64(%[a5]) \n\t"
|
||||
"xvmaddadp 8,49,33 \n\t"
|
||||
"xvmaddadp 9,51,33 \n\t"
|
||||
|
||||
"addic. %[n],%[n],-4 \n\t"
|
||||
"lxvp 48, 64(%[a6]) \n\t"
|
||||
"lxvp 50, 64(%[a7]) \n\t"
|
||||
"lxvp 32, 64(%[x]) \n\t"
|
||||
"ble- two%= \n\t"
|
||||
"xvmaddadp 34,36,32 \n\t"
|
||||
"xvmaddadp 35,38,32 \n\t"
|
||||
#if defined(PREFETCH)
|
||||
"addi %[temp],%[temp],128 \n\t"
|
||||
#endif
|
||||
"addi %[off2], %[off2],32 \n\t"
|
||||
"xvmaddadp 4,40,32 \n\t"
|
||||
"xvmaddadp 5,42,32 \n\t"
|
||||
"xvmaddadp 6,44,32 \n\t"
|
||||
"xvmaddadp 7,46,32 \n\t"
|
||||
"xvmaddadp 8,48,32 \n\t"
|
||||
"xvmaddadp 9,50,32 \n\t"
|
||||
#if defined(PREFETCH)
|
||||
"dcbt %[temp],%[a0] \n\t"
|
||||
#endif
|
||||
|
||||
"xvmaddadp 34,37,33 \n\t"
|
||||
"xvmaddadp 35,39,33 \n\t"
|
||||
"lxvp 36, 96(%[a0]) \n\t"
|
||||
"lxvp 38, 96(%[a1]) \n\t"
|
||||
"xvmaddadp 4,41,33 \n\t"
|
||||
"xvmaddadp 5,43,33 \n\t"
|
||||
#if defined(PREFETCH)
|
||||
"dcbt %[temp],%[a1] \n\t"
|
||||
#endif
|
||||
"lxvp 40, 96(%[a2]) \n\t"
|
||||
"lxvp 42, 96(%[a3]) \n\t"
|
||||
"addi %[off], %[off],32 \n\t"
|
||||
"xvmaddadp 6,45,33 \n\t"
|
||||
"xvmaddadp 7,47,33 \n\t"
|
||||
"lxvp 44, 96(%[a4]) \n\t"
|
||||
"lxvp 46, 96(%[a5]) \n\t"
|
||||
"xvmaddadp 8,49,33 \n\t"
|
||||
"xvmaddadp 9,51,33 \n\t"
|
||||
#if defined(PREFETCH)
|
||||
"dcbt %[temp],%[a3] \n\t"
|
||||
#endif
|
||||
"lxvp 48, 96(%[a6]) \n\t"
|
||||
"lxvp 50, 96(%[a7]) \n\t"
|
||||
"lxvp 32, 96(%[x]) \n\t"
|
||||
|
||||
"addic. %[n],%[n],-4 \n\t"
|
||||
"ble- two%= \n\t"
|
||||
|
||||
"addi %[off2], %[off2],32 \n\t"
|
||||
#if defined(PREFETCH)
|
||||
"dcbt %[temp],%[a2] \n\t"
|
||||
#endif
|
||||
"xvmaddadp 34,36,32 \n\t"
|
||||
"xvmaddadp 35,38,32 \n\t"
|
||||
"xvmaddadp 4,40,32 \n\t"
|
||||
"xvmaddadp 5,42,32 \n\t"
|
||||
#if defined(PREFETCH)
|
||||
"dcbt %[temp],%[a4] \n\t"
|
||||
#endif
|
||||
"xvmaddadp 6,44,32 \n\t"
|
||||
"xvmaddadp 7,46,32 \n\t"
|
||||
"xvmaddadp 8,48,32 \n\t"
|
||||
"xvmaddadp 9,50,32 \n\t"
|
||||
|
||||
#if defined(PREFETCH)
|
||||
"dcbt %[temp],%[a5] \n\t"
|
||||
#endif
|
||||
"xvmaddadp 34,37,33 \n\t"
|
||||
"xvmaddadp 35,39,33 \n\t"
|
||||
"lxvp 36, 128(%[a0]) \n\t"
|
||||
"lxvp 38, 128(%[a1]) \n\t"
|
||||
"xvmaddadp 4,41,33 \n\t"
|
||||
"xvmaddadp 5,43,33 \n\t"
|
||||
"addi %[off], %[off],32 \n\t"
|
||||
"lxvp 40, 128(%[a2]) \n\t"
|
||||
"lxvp 42, 128(%[a3]) \n\t"
|
||||
#if defined(PREFETCH)
|
||||
"dcbt %[temp],%[a6] \n\t"
|
||||
#endif
|
||||
"xvmaddadp 6,45,33 \n\t"
|
||||
"xvmaddadp 7,47,33 \n\t"
|
||||
"lxvp 44, 128(%[a4]) \n\t"
|
||||
"lxvp 46, 128(%[a5]) \n\t"
|
||||
"xvmaddadp 8,49,33 \n\t"
|
||||
"xvmaddadp 9,51,33 \n\t"
|
||||
|
||||
#if defined(PREFETCH)
|
||||
"dcbt %[temp],%[a7] \n\t"
|
||||
#endif
|
||||
"addic. %[n],%[n],-4 \n\t"
|
||||
"lxvp 48, 128(%[a6]) \n\t"
|
||||
"lxvp 50, 128(%[a7]) \n\t"
|
||||
"lxvp 32, 128(%[x]) \n\t"
|
||||
#if defined(PREFETCH)
|
||||
"dcbt %[temp],%[x] \n\t"
|
||||
#endif
|
||||
"addi %[a0], %[a0], 128 \n\t"
|
||||
"addi %[a1], %[a1], 128 \n\t"
|
||||
"addi %[a2], %[a2], 128 \n\t"
|
||||
"addi %[a3], %[a3], 128 \n\t"
|
||||
"addi %[a4], %[a4], 128 \n\t"
|
||||
"addi %[a5], %[a5], 128 \n\t"
|
||||
"addi %[a6], %[a6], 128 \n\t"
|
||||
"addi %[a7], %[a7], 128 \n\t"
|
||||
"addi %[x], %[x], 128 \n\t"
|
||||
"bgt+ one%= \n\t"
|
||||
".align 5 \n\t"
|
||||
"two%=: \n\t"
|
||||
//--------------------------------------------
|
||||
|
||||
"xvmaddadp 34,36,32 \n\t"
|
||||
"xvmaddadp 35,38,32 \n\t"
|
||||
"xvmaddadp 4,40,32 \n\t"
|
||||
"xvmaddadp 5,42,32 \n\t"
|
||||
"xvmaddadp 6,44,32 \n\t"
|
||||
"xvmaddadp 7,46,32 \n\t"
|
||||
"xvmaddadp 8,48,32 \n\t"
|
||||
"xvmaddadp 9,50,32 \n\t"
|
||||
XXSPLTD_S(36,%x[alpha],0)
|
||||
"xvmaddadp 34,37,33 \n\t"
|
||||
"xvmaddadp 35,39,33 \n\t"
|
||||
"xvmaddadp 4,41,33 \n\t"
|
||||
"xvmaddadp 5,43,33 \n\t"
|
||||
"xvmaddadp 6,45,33 \n\t"
|
||||
"xvmaddadp 7,47,33 \n\t"
|
||||
"xvmaddadp 8,49,33 \n\t"
|
||||
"xvmaddadp 9,51,33 \n\t"
|
||||
|
||||
"lxvp 38, 0(%[y]) \n\t"
|
||||
"lxvp 40, 32(%[y]) \n\t"
|
||||
|
||||
|
||||
|
||||
XXMRGLD_S(42,35,34)
|
||||
XXMRGHD_S(43,35,34)
|
||||
|
||||
XXMRGLD_S(44,5,4)
|
||||
XXMRGHD_S(45,5,4)
|
||||
|
||||
"xvadddp 42,42,43 \n\t"
|
||||
|
||||
XXMRGLD_S(46,7,6)
|
||||
XXMRGHD_S(47,7,6)
|
||||
|
||||
"xvadddp 44,44,45 \n\t"
|
||||
|
||||
XXMRGLD_S(48,9,8)
|
||||
XXMRGHD_S(49,9,8)
|
||||
|
||||
"xvadddp 46,46,47 \n\t"
|
||||
|
||||
"xvmaddadp 39,42,36 \n\t"
|
||||
"xvmaddadp 38,44,36 \n\t"
|
||||
|
||||
"xvadddp 48,48,49 \n\t"
|
||||
|
||||
"xvmaddadp 41,46,36 \n\t"
|
||||
|
||||
"stxvp 38, 0(%[y]) \n\t"
|
||||
"xvmaddadp 40,48,36 \n\t"
|
||||
"stxvp 40, 32(%[y]) \n\t"
|
||||
|
||||
: [memy] "+m" (*(double (*)[8])y),
|
||||
[n] "+&r" (n),
|
||||
[a0] "=b" (a0),
|
||||
[a1] "=&b" (a1),
|
||||
[a2] "=&b" (a2),
|
||||
[a3] "=&b" (a3),
|
||||
[a4] "=&b" (a4),
|
||||
[a5] "=&b" (a5),
|
||||
[a6] "=&b" (a6),
|
||||
[a7] "=&b" (a7),
|
||||
[off] "+&b" (lda),
|
||||
[off2]"=&b" (off2),
|
||||
[temp] "=&b" (tempR)
|
||||
: [memx] "m" (*(const double (*)[n])x),
|
||||
[mem_ap] "m" (*(const double (*)[n*8]) ap),
|
||||
[alpha] "d" (alpha),
|
||||
"[a0]" (ap),
|
||||
[x] "b" (x),
|
||||
[y] "b" (y)
|
||||
: "cc","vs4","vs5","vs6","vs7","vs8","vs9" ,"vs32","vs33","vs34","vs35", "vs36", "vs37", "vs38", "vs39",
|
||||
"vs40", "vs41", "vs42", "vs43", "vs44", "vs45", "vs46", "vs47", "vs48", "vs49", "vs50", "vs51"
|
||||
);
|
||||
return;
|
||||
}
|
||||
#else
|
||||
static void dgemv_kernel_4x8(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha) {
|
||||
BLASLONG i;
|
||||
#if defined(PREFETCH)
|
||||
BLASLONG j, c, k;
|
||||
#endif
|
||||
FLOAT *a0, *a1, *a2, *a3, *a4, *a5, *a6, *a7;
|
||||
__vector double *va0, *va1, *va2, *va3, *va4, *va5, *va6, *va7, *v_x;
|
||||
register __vector double temp0 = {0, 0};
|
||||
register __vector double temp1 = {0, 0};
|
||||
register __vector double temp2 = {0, 0};
|
||||
register __vector double temp3 = {0, 0};
|
||||
register __vector double temp4 = {0, 0};
|
||||
register __vector double temp5 = {0, 0};
|
||||
register __vector double temp6 = {0, 0};
|
||||
register __vector double temp7 = {0, 0};
|
||||
|
||||
a0 = ap;
|
||||
a1 = ap + lda;
|
||||
a2 = a1 + lda;
|
||||
a3 = a2 + lda;
|
||||
a4 = a3 + lda;
|
||||
a5 = a4 + lda;
|
||||
a6 = a5 + lda;
|
||||
a7 = a6 + lda;
|
||||
va0 = (__vector double*) a0;
|
||||
va1 = (__vector double*) a1;
|
||||
va2 = (__vector double*) a2;
|
||||
va3 = (__vector double*) a3;
|
||||
va4 = (__vector double*) a4;
|
||||
va5 = (__vector double*) a5;
|
||||
va6 = (__vector double*) a6;
|
||||
va7 = (__vector double*) a7;
|
||||
v_x = (__vector double*) x;
|
||||
|
||||
#if defined(PREFETCH)
|
||||
|
||||
c = n >> 1;
|
||||
|
||||
for (j = 0; j < c; j += 64) {
|
||||
k = (c - j) > 64 ? 64 : (c - j);
|
||||
__builtin_prefetch(v_x + 64);
|
||||
__builtin_prefetch(va0 + 64);
|
||||
__builtin_prefetch(va1 + 64);
|
||||
__builtin_prefetch(va2 + 64);
|
||||
__builtin_prefetch(va3 + 64);
|
||||
__builtin_prefetch(va4 + 64);
|
||||
__builtin_prefetch(va5 + 64);
|
||||
__builtin_prefetch(va6 + 64);
|
||||
__builtin_prefetch(va7 + 64);
|
||||
for (i = 0; i < k; i += 2) {
|
||||
#else
|
||||
|
||||
for (i = 0; i < n/2; i += 2) {
|
||||
#endif
|
||||
temp0 += v_x[i] * va0[i];
|
||||
temp1 += v_x[i] * va1[i];
|
||||
temp2 += v_x[i] * va2[i];
|
||||
temp3 += v_x[i] * va3[i];
|
||||
temp4 += v_x[i] * va4[i];
|
||||
temp5 += v_x[i] * va5[i];
|
||||
temp6 += v_x[i] * va6[i];
|
||||
temp7 += v_x[i] * va7[i];
|
||||
temp0 += v_x[i + 1] * va0[i + 1];
|
||||
temp1 += v_x[i + 1] * va1[i + 1];
|
||||
temp2 += v_x[i + 1] * va2[i + 1];
|
||||
temp3 += v_x[i + 1] * va3[i + 1];
|
||||
|
||||
temp4 += v_x[i + 1] * va4[i + 1];
|
||||
temp5 += v_x[i + 1] * va5[i + 1];
|
||||
temp6 += v_x[i + 1] * va6[i + 1];
|
||||
temp7 += v_x[i + 1] * va7[i + 1];
|
||||
}
|
||||
#if defined(PREFETCH)
|
||||
va0 += 64;
|
||||
va1 += 64;
|
||||
va2 += 64;
|
||||
va3 += 64;
|
||||
va4 += 64;
|
||||
va5 += 64;
|
||||
va6 += 64;
|
||||
va7 += 64;
|
||||
v_x += 64;
|
||||
|
||||
}
|
||||
#endif
|
||||
y[0] += alpha * (temp0[0] + temp0[1]);
|
||||
y[1] += alpha * (temp1[0] + temp1[1]);
|
||||
y[2] += alpha * (temp2[0] + temp2[1]);
|
||||
y[3] += alpha * (temp3[0] + temp3[1]);
|
||||
|
||||
y[4] += alpha * (temp4[0] + temp4[1]);
|
||||
y[5] += alpha * (temp5[0] + temp5[1]);
|
||||
y[6] += alpha * (temp6[0] + temp6[1]);
|
||||
y[7] += alpha * (temp7[0] + temp7[1]);
|
||||
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
static void dgemv_kernel_4x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha) {
|
||||
BLASLONG i = 0;
|
||||
FLOAT *a0, *a1, *a2, *a3;
|
||||
a0 = ap;
|
||||
a1 = ap + lda;
|
||||
a2 = a1 + lda;
|
||||
a3 = a2 + lda;
|
||||
__vector double* va0 = (__vector double*) a0;
|
||||
__vector double* va1 = (__vector double*) a1;
|
||||
__vector double* va2 = (__vector double*) a2;
|
||||
__vector double* va3 = (__vector double*) a3;
|
||||
__vector double* v_x = (__vector double*) x;
|
||||
register __vector double temp0 = {0, 0};
|
||||
register __vector double temp1 = {0, 0};
|
||||
register __vector double temp2 = {0, 0};
|
||||
register __vector double temp3 = {0, 0};
|
||||
register __vector double temp4 = {0, 0};
|
||||
register __vector double temp5 = {0, 0};
|
||||
register __vector double temp6 = {0, 0};
|
||||
register __vector double temp7 = {0, 0};
|
||||
|
||||
for (i = 0; i < n / 2; i += 2) {
|
||||
temp0 += v_x[i] * va0[i];
|
||||
temp1 += v_x[i] * va1[i];
|
||||
temp2 += v_x[i] * va2[i];
|
||||
temp3 += v_x[i] * va3[i];
|
||||
temp4 += v_x[i + 1] * va0[i + 1];
|
||||
temp5 += v_x[i + 1] * va1[i + 1];
|
||||
temp6 += v_x[i + 1] * va2[i + 1];
|
||||
temp7 += v_x[i + 1] * va3[i + 1];
|
||||
}
|
||||
|
||||
temp0 += temp4;
|
||||
temp1 += temp5;
|
||||
temp2 += temp6;
|
||||
temp3 += temp7;
|
||||
y[0] += alpha * (temp0[0] + temp0[1]);
|
||||
y[1] += alpha * (temp1[0] + temp1[1]);
|
||||
y[2] += alpha * (temp2[0] + temp2[1]);
|
||||
y[3] += alpha * (temp3[0] + temp3[1]);
|
||||
|
||||
}
|
||||
|
||||
|
||||
static void dgemv_kernel_4x2(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha, BLASLONG inc_y) {
|
||||
|
||||
BLASLONG i;
|
||||
FLOAT *a0, *a1;
|
||||
a0 = ap;
|
||||
a1 = ap + lda;
|
||||
__vector double* va0 = (__vector double*) a0;
|
||||
__vector double* va1 = (__vector double*) a1;
|
||||
__vector double* v_x = (__vector double*) x;
|
||||
__vector double temp0 = {0, 0};
|
||||
__vector double temp1 = {0, 0};
|
||||
for (i = 0; i < n / 2; i += 2) {
|
||||
temp0 += v_x[i] * va0[i] + v_x[i + 1] * va0[i + 1];
|
||||
temp1 += v_x[i] * va1[i] + v_x[i + 1] * va1[i + 1];
|
||||
}
|
||||
|
||||
|
||||
|
||||
y[0] += alpha * (temp0[0] + temp0[1]);
|
||||
y[inc_y] += alpha * (temp1[0] + temp1[1]);
|
||||
}
|
||||
|
||||
static void dgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha) {
|
||||
|
||||
BLASLONG i;
|
||||
FLOAT *a0;
|
||||
a0 = ap;
|
||||
__vector double* va0 = (__vector double*) a0;
|
||||
__vector double* v_x = (__vector double*) x;
|
||||
__vector double temp0 = {0, 0};
|
||||
for (i = 0; i < n / 2; i += 2) {
|
||||
temp0 += v_x[i] * va0[i] + v_x[i + 1] * va0[i + 1];
|
||||
}
|
||||
|
||||
*y += alpha * (temp0[0] + temp0[1]);
|
||||
|
||||
}
|
||||
|
||||
static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src) {
|
||||
BLASLONG i;
|
||||
for (i = 0; i < n; i++) {
|
||||
*dest++ = *src;
|
||||
src += inc_src;
|
||||
}
|
||||
}
|
||||
|
||||
int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) {
|
||||
BLASLONG i;
|
||||
BLASLONG j;
|
||||
FLOAT *a_ptr;
|
||||
FLOAT *x_ptr;
|
||||
FLOAT *y_ptr;
|
||||
|
||||
BLASLONG n1;
|
||||
BLASLONG m1;
|
||||
BLASLONG m2;
|
||||
BLASLONG m3;
|
||||
BLASLONG n2;
|
||||
FLOAT ybuffer[8] __attribute__((aligned(16)));
|
||||
FLOAT *xbuffer;
|
||||
|
||||
if (m < 1) return (0);
|
||||
if (n < 1) return (0);
|
||||
|
||||
xbuffer = buffer;
|
||||
|
||||
n1 = n >> 3;
|
||||
n2 = n & 7;
|
||||
|
||||
m3 = m & 3;
|
||||
m1 = m - m3;
|
||||
m2 = (m & (NBMAX - 1)) - m3;
|
||||
|
||||
BLASLONG NB = NBMAX;
|
||||
|
||||
while (NB == NBMAX) {
|
||||
|
||||
m1 -= NB;
|
||||
if (m1 < 0) {
|
||||
if (m2 == 0) break;
|
||||
NB = m2;
|
||||
}
|
||||
|
||||
y_ptr = y;
|
||||
a_ptr = a;
|
||||
x_ptr = x;
|
||||
|
||||
if (inc_x != 1)
|
||||
copy_x(NB, x_ptr, xbuffer, inc_x);
|
||||
else
|
||||
xbuffer = x_ptr;
|
||||
|
||||
BLASLONG lda8 = lda << 3;
|
||||
|
||||
|
||||
if (inc_y == 1) {
|
||||
|
||||
for (i = 0; i < n1; i++) {
|
||||
|
||||
dgemv_kernel_4x8(NB, lda, a_ptr, xbuffer, y_ptr, alpha);
|
||||
|
||||
y_ptr += 8;
|
||||
a_ptr += lda8;
|
||||
#if defined(PREFETCH)
|
||||
__builtin_prefetch(y_ptr+64);
|
||||
#endif
|
||||
}
|
||||
|
||||
} else {
|
||||
|
||||
for (i = 0; i < n1; i++) {
|
||||
ybuffer[0] = 0;
|
||||
ybuffer[1] = 0;
|
||||
ybuffer[2] = 0;
|
||||
ybuffer[3] = 0;
|
||||
ybuffer[4] = 0;
|
||||
ybuffer[5] = 0;
|
||||
ybuffer[6] = 0;
|
||||
ybuffer[7] = 0;
|
||||
dgemv_kernel_4x8(NB, lda, a_ptr, xbuffer, ybuffer, alpha);
|
||||
|
||||
|
||||
|
||||
*y_ptr += ybuffer[0];
|
||||
y_ptr += inc_y;
|
||||
*y_ptr += ybuffer[1];
|
||||
y_ptr += inc_y;
|
||||
*y_ptr += ybuffer[2];
|
||||
y_ptr += inc_y;
|
||||
*y_ptr += ybuffer[3];
|
||||
y_ptr += inc_y;
|
||||
|
||||
*y_ptr += ybuffer[4];
|
||||
y_ptr += inc_y;
|
||||
*y_ptr += ybuffer[5];
|
||||
y_ptr += inc_y;
|
||||
*y_ptr += ybuffer[6];
|
||||
y_ptr += inc_y;
|
||||
*y_ptr += ybuffer[7];
|
||||
y_ptr += inc_y;
|
||||
|
||||
a_ptr += lda8;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
if (n2 & 4) {
|
||||
ybuffer[0] = 0;
|
||||
ybuffer[1] = 0;
|
||||
ybuffer[2] = 0;
|
||||
ybuffer[3] = 0;
|
||||
dgemv_kernel_4x4(NB, lda, a_ptr, xbuffer, ybuffer, alpha);
|
||||
|
||||
a_ptr += lda<<2;
|
||||
|
||||
*y_ptr += ybuffer[0];
|
||||
y_ptr += inc_y;
|
||||
*y_ptr += ybuffer[1];
|
||||
y_ptr += inc_y;
|
||||
*y_ptr += ybuffer[2];
|
||||
y_ptr += inc_y;
|
||||
*y_ptr += ybuffer[3];
|
||||
y_ptr += inc_y;
|
||||
}
|
||||
|
||||
if (n2 & 2) {
|
||||
dgemv_kernel_4x2(NB, lda, a_ptr, xbuffer, y_ptr, alpha, inc_y);
|
||||
a_ptr += lda << 1;
|
||||
y_ptr += 2 * inc_y;
|
||||
|
||||
}
|
||||
|
||||
if (n2 & 1) {
|
||||
dgemv_kernel_4x1(NB, a_ptr, xbuffer, y_ptr, alpha);
|
||||
a_ptr += lda;
|
||||
y_ptr += inc_y;
|
||||
|
||||
}
|
||||
|
||||
a += NB;
|
||||
x += NB * inc_x;
|
||||
|
||||
|
||||
}
|
||||
|
||||
if (m3 == 0) return (0);
|
||||
|
||||
x_ptr = x;
|
||||
a_ptr = a;
|
||||
if (m3 == 3) {
|
||||
FLOAT xtemp0 = *x_ptr * alpha;
|
||||
x_ptr += inc_x;
|
||||
FLOAT xtemp1 = *x_ptr * alpha;
|
||||
x_ptr += inc_x;
|
||||
FLOAT xtemp2 = *x_ptr * alpha;
|
||||
|
||||
FLOAT *aj = a_ptr;
|
||||
y_ptr = y;
|
||||
|
||||
if (lda == 3 && inc_y == 1) {
|
||||
|
||||
for (j = 0; j < (n & -4); j += 4) {
|
||||
|
||||
y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1 + aj[2] * xtemp2;
|
||||
y_ptr[j + 1] += aj[3] * xtemp0 + aj[4] * xtemp1 + aj[5] * xtemp2;
|
||||
y_ptr[j + 2] += aj[6] * xtemp0 + aj[7] * xtemp1 + aj[8] * xtemp2;
|
||||
y_ptr[j + 3] += aj[9] * xtemp0 + aj[10] * xtemp1 + aj[11] * xtemp2;
|
||||
aj += 12;
|
||||
}
|
||||
|
||||
for (; j < n; j++) {
|
||||
y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1 + aj[2] * xtemp2;
|
||||
aj += 3;
|
||||
}
|
||||
|
||||
} else {
|
||||
|
||||
if (inc_y == 1) {
|
||||
|
||||
BLASLONG register lda2 = lda << 1;
|
||||
BLASLONG register lda4 = lda << 2;
|
||||
BLASLONG register lda3 = lda2 + lda;
|
||||
|
||||
for (j = 0; j < (n & -4); j += 4) {
|
||||
|
||||
y_ptr[j] += *aj * xtemp0 + *(aj + 1) * xtemp1 + *(aj + 2) * xtemp2;
|
||||
y_ptr[j + 1] += *(aj + lda) * xtemp0 + *(aj + lda + 1) * xtemp1 + *(aj + lda + 2) * xtemp2;
|
||||
y_ptr[j + 2] += *(aj + lda2) * xtemp0 + *(aj + lda2 + 1) * xtemp1 + *(aj + lda2 + 2) * xtemp2;
|
||||
y_ptr[j + 3] += *(aj + lda3) * xtemp0 + *(aj + lda3 + 1) * xtemp1 + *(aj + lda3 + 2) * xtemp2;
|
||||
aj += lda4;
|
||||
}
|
||||
|
||||
for (; j < n; j++) {
|
||||
|
||||
y_ptr[j] += *aj * xtemp0 + *(aj + 1) * xtemp1 + *(aj + 2) * xtemp2;
|
||||
aj += lda;
|
||||
}
|
||||
|
||||
} else {
|
||||
|
||||
for (j = 0; j < n; j++) {
|
||||
*y_ptr += *aj * xtemp0 + *(aj + 1) * xtemp1 + *(aj + 2) * xtemp2;
|
||||
y_ptr += inc_y;
|
||||
aj += lda;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
return (0);
|
||||
}
|
||||
|
||||
if (m3 == 2) {
|
||||
FLOAT xtemp0 = *x_ptr * alpha;
|
||||
x_ptr += inc_x;
|
||||
FLOAT xtemp1 = *x_ptr * alpha;
|
||||
|
||||
FLOAT *aj = a_ptr;
|
||||
y_ptr = y;
|
||||
|
||||
if (lda == 2 && inc_y == 1) {
|
||||
|
||||
for (j = 0; j < (n & -4); j += 4) {
|
||||
y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1;
|
||||
y_ptr[j + 1] += aj[2] * xtemp0 + aj[3] * xtemp1;
|
||||
y_ptr[j + 2] += aj[4] * xtemp0 + aj[5] * xtemp1;
|
||||
y_ptr[j + 3] += aj[6] * xtemp0 + aj[7] * xtemp1;
|
||||
aj += 8;
|
||||
|
||||
}
|
||||
|
||||
for (; j < n; j++) {
|
||||
y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1;
|
||||
aj += 2;
|
||||
}
|
||||
|
||||
} else {
|
||||
if (inc_y == 1) {
|
||||
|
||||
BLASLONG register lda2 = lda << 1;
|
||||
BLASLONG register lda4 = lda << 2;
|
||||
BLASLONG register lda3 = lda2 + lda;
|
||||
|
||||
for (j = 0; j < (n & -4); j += 4) {
|
||||
|
||||
y_ptr[j] += *aj * xtemp0 + *(aj + 1) * xtemp1;
|
||||
y_ptr[j + 1] += *(aj + lda) * xtemp0 + *(aj + lda + 1) * xtemp1;
|
||||
y_ptr[j + 2] += *(aj + lda2) * xtemp0 + *(aj + lda2 + 1) * xtemp1;
|
||||
y_ptr[j + 3] += *(aj + lda3) * xtemp0 + *(aj + lda3 + 1) * xtemp1;
|
||||
aj += lda4;
|
||||
}
|
||||
|
||||
for (; j < n; j++) {
|
||||
|
||||
y_ptr[j] += *aj * xtemp0 + *(aj + 1) * xtemp1;
|
||||
aj += lda;
|
||||
}
|
||||
|
||||
} else {
|
||||
for (j = 0; j < n; j++) {
|
||||
*y_ptr += *aj * xtemp0 + *(aj + 1) * xtemp1;
|
||||
y_ptr += inc_y;
|
||||
aj += lda;
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
return (0);
|
||||
|
||||
}
|
||||
|
||||
FLOAT xtemp = *x_ptr * alpha;
|
||||
FLOAT *aj = a_ptr;
|
||||
y_ptr = y;
|
||||
if (lda == 1 && inc_y == 1) {
|
||||
for (j = 0; j < (n & -4); j += 4) {
|
||||
y_ptr[j] += aj[j] * xtemp;
|
||||
y_ptr[j + 1] += aj[j + 1] * xtemp;
|
||||
y_ptr[j + 2] += aj[j + 2] * xtemp;
|
||||
y_ptr[j + 3] += aj[j + 3] * xtemp;
|
||||
}
|
||||
for (; j < n; j++) {
|
||||
y_ptr[j] += aj[j] * xtemp;
|
||||
}
|
||||
|
||||
|
||||
} else {
|
||||
if (inc_y == 1) {
|
||||
|
||||
BLASLONG register lda2 = lda << 1;
|
||||
BLASLONG register lda4 = lda << 2;
|
||||
BLASLONG register lda3 = lda2 + lda;
|
||||
for (j = 0; j < (n & -4); j += 4) {
|
||||
y_ptr[j] += *aj * xtemp;
|
||||
y_ptr[j + 1] += *(aj + lda) * xtemp;
|
||||
y_ptr[j + 2] += *(aj + lda2) * xtemp;
|
||||
y_ptr[j + 3] += *(aj + lda3) * xtemp;
|
||||
aj += lda4;
|
||||
}
|
||||
|
||||
for (; j < n; j++) {
|
||||
y_ptr[j] += *aj * xtemp;
|
||||
aj += lda;
|
||||
}
|
||||
|
||||
} else {
|
||||
for (j = 0; j < n; j++) {
|
||||
*y_ptr += *aj * xtemp;
|
||||
y_ptr += inc_y;
|
||||
aj += lda;
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
return (0);
|
||||
|
||||
}
|
||||
|
||||
@@ -39,9 +39,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
#pragma GCC optimize "O1"
|
||||
|
||||
#if defined(POWER8) || defined(POWER9)
|
||||
#if defined(POWER8) || defined(POWER9) || defined(POWER10)
|
||||
#if defined(__VEC__) || defined(__ALTIVEC__)
|
||||
#include "drot_microk_power8.c"
|
||||
#endif
|
||||
#endif
|
||||
|
||||
|
||||
#ifndef HAVE_KERNEL_16
|
||||
|
||||
@@ -35,9 +35,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
#include "common.h"
|
||||
|
||||
#if defined(POWER8) || defined(POWER9)
|
||||
#if defined(POWER8) || defined(POWER9) || defined(POWER10)
|
||||
#if defined(__VEC__) || defined(__ALTIVEC__)
|
||||
#include "dscal_microk_power8.c"
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#if !defined(HAVE_KERNEL_8)
|
||||
|
||||
|
||||
@@ -35,9 +35,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
#include "common.h"
|
||||
|
||||
#if defined(POWER8) || defined(POWER9)
|
||||
#if defined(POWER8) || defined(POWER9) || defined(POWER10)
|
||||
#if defined(__VEC__) || defined(__ALTIVEC__)
|
||||
#include "dswap_microk_power8.c"
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#ifndef HAVE_KERNEL_32
|
||||
|
||||
|
||||
@@ -1,3 +1,4 @@
|
||||
/*
|
||||
.file "icamax.c"
|
||||
.abiversion 2
|
||||
.section ".text"
|
||||
@@ -5,6 +6,12 @@
|
||||
.p2align 4,,15
|
||||
.globl icamax_k
|
||||
.type icamax_k, @function
|
||||
*/
|
||||
#define ASSEMBLER
|
||||
#include "common.h"
|
||||
|
||||
PROLOGUE
|
||||
|
||||
icamax_k:
|
||||
.LCF0:
|
||||
0: addis 2,12,.TOC.-.LCF0@ha
|
||||
|
||||
@@ -1,3 +1,4 @@
|
||||
/*
|
||||
.file "icamin.c"
|
||||
.abiversion 2
|
||||
.section ".text"
|
||||
@@ -5,6 +6,12 @@
|
||||
.p2align 4,,15
|
||||
.globl icamin_k
|
||||
.type icamin_k, @function
|
||||
*/
|
||||
#define ASSEMBLER
|
||||
#include "common.h"
|
||||
|
||||
PROLOGUE
|
||||
|
||||
icamin_k:
|
||||
.LCF0:
|
||||
0: addis 2,12,.TOC.-.LCF0@ha
|
||||
|
||||
@@ -26,7 +26,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
#include "common.h"
|
||||
#include <math.h>
|
||||
#if defined(__VEC__) || defined(__ALTIVEC__)
|
||||
#include <altivec.h>
|
||||
#endif
|
||||
|
||||
#if defined(DOUBLE)
|
||||
|
||||
#define ABS fabs
|
||||
@@ -37,6 +40,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
#endif
|
||||
|
||||
#if defined(__VEC__) || defined(__ALTIVEC__)
|
||||
|
||||
/**
|
||||
* Find maximum index
|
||||
* Warning: requirements n>0 and n % 32 == 0
|
||||
@@ -313,6 +318,7 @@ static BLASLONG diamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *maxf) {
|
||||
return index;
|
||||
|
||||
}
|
||||
#endif
|
||||
|
||||
BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
|
||||
BLASLONG i = 0;
|
||||
@@ -326,12 +332,15 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
|
||||
|
||||
BLASLONG n1 = n & -32;
|
||||
#if defined(_CALL_ELF) && (_CALL_ELF == 2)
|
||||
#if defined(__VEC__) || defined(__ALTIVEC__)
|
||||
|
||||
if (n1 > 0) {
|
||||
|
||||
max = diamax_kernel_32(n1, x, &maxf);
|
||||
|
||||
i = n1;
|
||||
}
|
||||
#endif
|
||||
#endif
|
||||
while (i < n) {
|
||||
if (ABS(x[i]) > maxf) {
|
||||
|
||||
@@ -37,6 +37,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
#endif
|
||||
|
||||
#if defined(__VEC__) || defined(__ALTIVEC__)
|
||||
|
||||
/**
|
||||
* Find minimum index
|
||||
* Warning: requirements n>0 and n % 32 == 0
|
||||
@@ -313,7 +315,7 @@ static BLASLONG diamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *minf) {
|
||||
return index;
|
||||
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
|
||||
@@ -327,12 +329,15 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
|
||||
if (inc_x == 1) {
|
||||
|
||||
#if defined(_CALL_ELF) && (_CALL_ELF == 2)
|
||||
#if defined(__VEC__) || defined(__ALTIVEC__)
|
||||
|
||||
BLASLONG n1 = n & -32;
|
||||
if (n1 > 0) {
|
||||
|
||||
min = diamin_kernel_32(n1, x, &minf);
|
||||
i = n1;
|
||||
}
|
||||
#endif
|
||||
#endif
|
||||
while (i < n) {
|
||||
if (ABS(x[i]) < minf) {
|
||||
|
||||
@@ -1,3 +1,4 @@
|
||||
/*
|
||||
.file "isamax.c"
|
||||
.abiversion 2
|
||||
.section ".text"
|
||||
@@ -5,6 +6,12 @@
|
||||
.p2align 4,,15
|
||||
.globl isamax_k
|
||||
.type isamax_k, @function
|
||||
*/
|
||||
#define ASSEMBLER
|
||||
#include "common.h"
|
||||
|
||||
PROLOGUE
|
||||
|
||||
isamax_k:
|
||||
.LCF0:
|
||||
0: addis 2,12,.TOC.-.LCF0@ha
|
||||
|
||||
@@ -1,3 +1,4 @@
|
||||
/*
|
||||
.file "isamin.c"
|
||||
.abiversion 2
|
||||
.section ".text"
|
||||
@@ -5,6 +6,12 @@
|
||||
.p2align 4,,15
|
||||
.globl isamin_k
|
||||
.type isamin_k, @function
|
||||
*/
|
||||
#define ASSEMBLER
|
||||
#include "common.h"
|
||||
|
||||
PROLOGUE
|
||||
|
||||
isamin_k:
|
||||
.LCF0:
|
||||
0: addis 2,12,.TOC.-.LCF0@ha
|
||||
|
||||
@@ -34,6 +34,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
|
||||
|
||||
#if defined(__VEC__) || defined(__ALTIVEC__)
|
||||
|
||||
/**
|
||||
* Find maximum index
|
||||
@@ -299,7 +300,7 @@ static BLASLONG ziamax_kernel_16(BLASLONG n, FLOAT *x, FLOAT *maxf) {
|
||||
|
||||
}
|
||||
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
|
||||
@@ -317,6 +318,8 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
|
||||
if (inc_x == 1) {
|
||||
|
||||
#if defined(_CALL_ELF) && (_CALL_ELF == 2)
|
||||
#if defined(__VEC__) || defined(__ALTIVEC__)
|
||||
|
||||
BLASLONG n1 = n & -16;
|
||||
if (n1 > 0) {
|
||||
|
||||
@@ -324,6 +327,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
|
||||
i = n1;
|
||||
ix = n1 << 1;
|
||||
}
|
||||
#endif
|
||||
#endif
|
||||
|
||||
while(i < n)
|
||||
|
||||
@@ -24,7 +24,6 @@ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
|
||||
#include "common.h"
|
||||
#include <math.h>
|
||||
@@ -32,6 +31,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
#define ABS fabs
|
||||
#define CABS1(x,i) ABS(x[i])+ABS(x[i+1])
|
||||
|
||||
#if defined(__VEC__) || defined(__ALTIVEC__)
|
||||
|
||||
/**
|
||||
* Find minimum index
|
||||
@@ -296,6 +296,7 @@ static BLASLONG ziamin_kernel_16_TUNED(BLASLONG n, FLOAT *x, FLOAT *minf) {
|
||||
return index;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
|
||||
@@ -316,6 +317,8 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
|
||||
minf = CABS1(x,0); //index will not be incremented
|
||||
|
||||
#if defined(_CALL_ELF) && (_CALL_ELF == 2)
|
||||
#if defined(__VEC__) || defined(__ALTIVEC__)
|
||||
|
||||
BLASLONG n1 = n & -16;
|
||||
if (n1 > 0) {
|
||||
|
||||
@@ -323,6 +326,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
|
||||
i = n1;
|
||||
ix = n1 << 1;
|
||||
}
|
||||
#endif
|
||||
#endif
|
||||
|
||||
while(i < n)
|
||||
@@ -359,5 +363,3 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
|
||||
@@ -46,9 +46,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
#endif
|
||||
|
||||
#if defined(POWER8) || defined(POWER9)
|
||||
#if defined(POWER8) || defined(POWER9) || defined(POWER10)
|
||||
#if defined(__VEC__) || defined(__ALTIVEC__)
|
||||
#include "sasum_microk_power8.c"
|
||||
#endif
|
||||
#endif
|
||||
|
||||
|
||||
#ifndef HAVE_KERNEL_32
|
||||
|
||||
@@ -28,8 +28,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
#include "common.h"
|
||||
|
||||
#define offset_0 0
|
||||
#define offset_1 16
|
||||
#define offset_2 32
|
||||
#define offset_3 48
|
||||
#define offset_4 64
|
||||
#define offset_5 80
|
||||
#define offset_6 96
|
||||
#define offset_7 112
|
||||
#define offset_8 128
|
||||
#define offset_9 144
|
||||
#define offset_10 160
|
||||
#define offset_11 176
|
||||
#define offset_12 192
|
||||
#define offset_13 208
|
||||
#define offset_14 224
|
||||
#define offset_15 240
|
||||
|
||||
|
||||
#if defined(__VEC__) || defined(__ALTIVEC__)
|
||||
|
||||
#ifndef HAVE_KERNEL_8
|
||||
#include <altivec.h>
|
||||
@@ -37,12 +54,85 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
static void saxpy_kernel_64(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT alpha)
|
||||
{
|
||||
BLASLONG i = 0;
|
||||
__vector float v_a = {alpha,alpha,alpha,alpha};
|
||||
__vector float * v_y=(__vector float *)y;
|
||||
__vector float * v_x=(__vector float *)x;
|
||||
__vector float v_a __attribute((aligned(16))) = {alpha,alpha,alpha,alpha};
|
||||
__vector float * vptr_y =(__vector float *)y;
|
||||
__vector float * vptr_x =(__vector float *)x;
|
||||
|
||||
for(; i<n/4; i+=16){
|
||||
|
||||
|
||||
register __vector float vy_0 = vec_vsx_ld( offset_0 ,vptr_y ) ;
|
||||
register __vector float vy_1 = vec_vsx_ld( offset_1 ,vptr_y ) ;
|
||||
register __vector float vy_2 = vec_vsx_ld( offset_2 ,vptr_y ) ;
|
||||
register __vector float vy_3 = vec_vsx_ld( offset_3 ,vptr_y ) ;
|
||||
register __vector float vy_4 = vec_vsx_ld( offset_4 ,vptr_y ) ;
|
||||
register __vector float vy_5 = vec_vsx_ld( offset_5 ,vptr_y ) ;
|
||||
register __vector float vy_6 = vec_vsx_ld( offset_6 ,vptr_y ) ;
|
||||
register __vector float vy_7 = vec_vsx_ld( offset_7 ,vptr_y ) ;
|
||||
register __vector float vy_8 = vec_vsx_ld( offset_8 ,vptr_y ) ;
|
||||
register __vector float vy_9 = vec_vsx_ld( offset_9 ,vptr_y ) ;
|
||||
register __vector float vy_10 = vec_vsx_ld( offset_10 ,vptr_y ) ;
|
||||
register __vector float vy_11 = vec_vsx_ld( offset_11 ,vptr_y ) ;
|
||||
register __vector float vy_12 = vec_vsx_ld( offset_12 ,vptr_y ) ;
|
||||
register __vector float vy_13 = vec_vsx_ld( offset_13 ,vptr_y ) ;
|
||||
register __vector float vy_14 = vec_vsx_ld( offset_14 ,vptr_y ) ;
|
||||
register __vector float vy_15 = vec_vsx_ld( offset_15 ,vptr_y ) ;
|
||||
|
||||
register __vector float vx_0 = vec_vsx_ld( offset_0 ,vptr_x ) ;
|
||||
register __vector float vx_1 = vec_vsx_ld( offset_1 ,vptr_x ) ;
|
||||
register __vector float vx_2 = vec_vsx_ld( offset_2 ,vptr_x ) ;
|
||||
register __vector float vx_3 = vec_vsx_ld( offset_3 ,vptr_x ) ;
|
||||
register __vector float vx_4 = vec_vsx_ld( offset_4 ,vptr_x ) ;
|
||||
register __vector float vx_5 = vec_vsx_ld( offset_5 ,vptr_x ) ;
|
||||
register __vector float vx_6 = vec_vsx_ld( offset_6 ,vptr_x ) ;
|
||||
register __vector float vx_7 = vec_vsx_ld( offset_7 ,vptr_x ) ;
|
||||
register __vector float vx_8 = vec_vsx_ld( offset_8 ,vptr_x ) ;
|
||||
register __vector float vx_9 = vec_vsx_ld( offset_9 ,vptr_x ) ;
|
||||
register __vector float vx_10 = vec_vsx_ld( offset_10 ,vptr_x ) ;
|
||||
register __vector float vx_11 = vec_vsx_ld( offset_11 ,vptr_x ) ;
|
||||
register __vector float vx_12 = vec_vsx_ld( offset_12 ,vptr_x ) ;
|
||||
register __vector float vx_13 = vec_vsx_ld( offset_13 ,vptr_x ) ;
|
||||
register __vector float vx_14 = vec_vsx_ld( offset_14 ,vptr_x ) ;
|
||||
register __vector float vx_15 = vec_vsx_ld( offset_15 ,vptr_x ) ;
|
||||
vy_0 += vx_0*v_a;
|
||||
vy_1 += vx_1*v_a;
|
||||
vy_2 += vx_2*v_a;
|
||||
vy_3 += vx_3*v_a;
|
||||
vy_4 += vx_4*v_a;
|
||||
vy_5 += vx_5*v_a;
|
||||
vy_6 += vx_6*v_a;
|
||||
vy_7 += vx_7*v_a;
|
||||
vy_8 += vx_8*v_a;
|
||||
vy_9 += vx_9*v_a;
|
||||
vy_10 += vx_10*v_a;
|
||||
vy_11 += vx_11*v_a;
|
||||
vy_12 += vx_12*v_a;
|
||||
vy_13 += vx_13*v_a;
|
||||
vy_14 += vx_14*v_a;
|
||||
vy_15 += vx_15*v_a;
|
||||
|
||||
vec_vsx_st( vy_0, offset_0 ,vptr_y ) ;
|
||||
vec_vsx_st( vy_1, offset_1 ,vptr_y ) ;
|
||||
vec_vsx_st( vy_2, offset_2 ,vptr_y ) ;
|
||||
vec_vsx_st( vy_3, offset_3 ,vptr_y ) ;
|
||||
vec_vsx_st( vy_4, offset_4 ,vptr_y ) ;
|
||||
vec_vsx_st( vy_5, offset_5 ,vptr_y ) ;
|
||||
vec_vsx_st( vy_6, offset_6 ,vptr_y ) ;
|
||||
vec_vsx_st( vy_7, offset_7 ,vptr_y ) ;
|
||||
vec_vsx_st( vy_8, offset_8 ,vptr_y ) ;
|
||||
vec_vsx_st( vy_9, offset_9 ,vptr_y ) ;
|
||||
vec_vsx_st( vy_10, offset_10 ,vptr_y ) ;
|
||||
vec_vsx_st( vy_11, offset_11 ,vptr_y ) ;
|
||||
vec_vsx_st( vy_12, offset_12 ,vptr_y ) ;
|
||||
vec_vsx_st( vy_13, offset_13 ,vptr_y ) ;
|
||||
vec_vsx_st( vy_14, offset_14 ,vptr_y ) ;
|
||||
vec_vsx_st( vy_15, offset_15 ,vptr_y ) ;
|
||||
|
||||
vptr_x+=16;
|
||||
vptr_y+=16;
|
||||
|
||||
/*
|
||||
|
||||
v_y[i] += v_a * v_x[i];
|
||||
v_y[i+1] += v_a * v_x[i+1];
|
||||
v_y[i+2] += v_a * v_x[i+2];
|
||||
@@ -59,9 +149,11 @@ static void saxpy_kernel_64(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT alpha)
|
||||
v_y[i+13] += v_a * v_x[i+13];
|
||||
v_y[i+14] += v_a * v_x[i+14];
|
||||
v_y[i+15] += v_a * v_x[i+15];
|
||||
*/
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#endif
|
||||
|
||||
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
|
||||
{
|
||||
@@ -74,11 +166,13 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS
|
||||
{
|
||||
|
||||
BLASLONG n1 = n & -64;
|
||||
#if defined(__VEC__) || defined(__ALTIVEC__)
|
||||
|
||||
if ( n1 )
|
||||
saxpy_kernel_64(n1, x, y, da);
|
||||
|
||||
i = n1;
|
||||
#endif
|
||||
while(i < n)
|
||||
{
|
||||
|
||||
|
||||
959
kernel/power/sbgemm_kernel_power10.c
Normal file
959
kernel/power/sbgemm_kernel_power10.c
Normal file
@@ -0,0 +1,959 @@
|
||||
/*********************************************************************************
|
||||
Copyright (c) 2020, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
**********************************************************************************/
|
||||
#include "common.h"
|
||||
#include <altivec.h>
|
||||
#if defined(BFLOAT16) && defined(BFLOAT16CONVERSION)
|
||||
static float
|
||||
bfloat16tof32 (bfloat16 f16)
|
||||
{
|
||||
float result = 0;
|
||||
unsigned short *q = (unsigned short *) (&result);
|
||||
#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
|
||||
q[0] = f16;
|
||||
#else
|
||||
q[1] = f16;
|
||||
#endif
|
||||
return result;
|
||||
}
|
||||
|
||||
#define BF16TOF32(x) (bfloat16tof32(x))
|
||||
#else
|
||||
#define BF16TOF32(x) x
|
||||
#endif
|
||||
|
||||
typedef __vector unsigned char vec_t;
|
||||
typedef FLOAT v4sf_t __attribute__ ((vector_size (16)));
|
||||
typedef FLOAT v2sf_t __attribute__ ((vector_size (8)));
|
||||
|
||||
vector char mask =
|
||||
{ 0x0, 0x1, 0x8, 0x9, 0x2, 0x3, 0xa, 0xb, 0x4, 0x5, 0xc, 0xd, 0x6, 0x7, 0xe,
|
||||
0xf
|
||||
};
|
||||
|
||||
/*
|
||||
* BFLOAT16 xvbf16ger2pp instruction needs 4×2 matrix of
|
||||
* bfloat16 floating-point values as input. Hence this
|
||||
* merging is needed on A and B matrices.
|
||||
*/
|
||||
#define MERGE_ROW(x) vec_perm(x, x, mask)
|
||||
#define MERGE_HIGH(x, y) (vec_t) vec_mergeh ((vector short)x, (vector short)y)
|
||||
#define MERGE_LOW(x, y) (vec_t) vec_mergel ((vector short)x, (vector short)y)
|
||||
|
||||
#define SAVE_ACC(ACC, J) \
|
||||
__builtin_mma_disassemble_acc ((void *)result, ACC); \
|
||||
rowC = (v4sf_t *) &CO[0* ldc+J]; \
|
||||
rowC[0] += result[0] * alpha; \
|
||||
rowC = (v4sf_t *) &CO[1*ldc+J]; \
|
||||
rowC[0] += result[1] * alpha; \
|
||||
rowC = (v4sf_t *) &CO[2*ldc+J]; \
|
||||
rowC[0] += result[2] * alpha; \
|
||||
rowC = (v4sf_t *) &CO[3*ldc+J]; \
|
||||
rowC[0] += result[3] * alpha;
|
||||
#define SAVE_ACC1(ACC, J) \
|
||||
__builtin_mma_disassemble_acc ((void *)result, ACC); \
|
||||
rowC = (v4sf_t *) &CO[4* ldc+J]; \
|
||||
rowC[0] += result[0] * alpha; \
|
||||
rowC = (v4sf_t *) &CO[5*ldc+J]; \
|
||||
rowC[0] += result[1] * alpha; \
|
||||
rowC = (v4sf_t *) &CO[6*ldc+J]; \
|
||||
rowC[0] += result[2] * alpha; \
|
||||
rowC = (v4sf_t *) &CO[7*ldc+J]; \
|
||||
rowC[0] += result[3] * alpha;
|
||||
#define SAVE4x2_ACC(ACC, J) \
|
||||
__builtin_mma_disassemble_acc ((void *)result, ACC); \
|
||||
rowC = (v2sf_t *) &CO[0* ldc+J]; \
|
||||
rowC[0] += result[0] * alpha; \
|
||||
rowC = (v2sf_t *) &CO[1* ldc+J]; \
|
||||
rowC[0] += result[2] * alpha; \
|
||||
rowC = (v2sf_t *) &CO[2* ldc+J]; \
|
||||
rowC[0] += result[4] * alpha; \
|
||||
rowC = (v2sf_t *) &CO[3* ldc+J]; \
|
||||
rowC[0] += result[6] * alpha;
|
||||
#define SAVE4x2_ACC1(ACC, J) \
|
||||
__builtin_mma_disassemble_acc ((void *)result, ACC); \
|
||||
rowC = (v2sf_t *) &CO[4* ldc+J]; \
|
||||
rowC[0] += result[0] * alpha; \
|
||||
rowC = (v2sf_t *) &CO[5* ldc+J]; \
|
||||
rowC[0] += result[2] * alpha; \
|
||||
rowC = (v2sf_t *) &CO[6* ldc+J]; \
|
||||
rowC[0] += result[4] * alpha; \
|
||||
rowC = (v2sf_t *) &CO[7* ldc+J]; \
|
||||
rowC[0] += result[6] * alpha;
|
||||
|
||||
#define MMA __builtin_mma_xvbf16ger2pp
|
||||
|
||||
#define SAVE2x4_ACC(ACC, J) \
|
||||
__builtin_mma_disassemble_acc ((void *)result, ACC); \
|
||||
rowC = (v4sf_t *) &CO[0* ldc+J]; \
|
||||
rowC[0] += result[0] * alpha; \
|
||||
rowC = (v4sf_t *) &CO[1* ldc+J]; \
|
||||
rowC[0] += result[1] * alpha;
|
||||
|
||||
#define SET_ACC_ZERO4() \
|
||||
__builtin_mma_xxsetaccz (&acc0); \
|
||||
__builtin_mma_xxsetaccz (&acc1); \
|
||||
__builtin_mma_xxsetaccz (&acc2); \
|
||||
__builtin_mma_xxsetaccz (&acc3);
|
||||
|
||||
#define SET_ACC_ZERO8() \
|
||||
__builtin_mma_xxsetaccz (&acc0); \
|
||||
__builtin_mma_xxsetaccz (&acc1); \
|
||||
__builtin_mma_xxsetaccz (&acc2); \
|
||||
__builtin_mma_xxsetaccz (&acc3); \
|
||||
__builtin_mma_xxsetaccz (&acc4); \
|
||||
__builtin_mma_xxsetaccz (&acc5); \
|
||||
__builtin_mma_xxsetaccz (&acc6); \
|
||||
__builtin_mma_xxsetaccz (&acc7);
|
||||
|
||||
#define PREFETCH1(x, y) asm volatile ("dcbt %0, %1" : : "r" (x), "b" (y) : "memory");
|
||||
/*************************************************************************************
|
||||
* SBGEMM Kernel
|
||||
*************************************************************************************/
|
||||
int
|
||||
CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A,
|
||||
IFLOAT * B, FLOAT * C, BLASLONG ldc)
|
||||
{
|
||||
BLASLONG i1;
|
||||
v4sf_t valpha = { alpha, alpha, alpha, alpha };
|
||||
vector short vzero = { 0, 0, 0, 0, 0, 0, 0, 0 };
|
||||
/* Loop for n >= 8. */
|
||||
for (i1 = 0; i1 < (n >> 3); i1++)
|
||||
{
|
||||
BLASLONG j;
|
||||
FLOAT *CO;
|
||||
IFLOAT *AO;
|
||||
CO = C;
|
||||
C += ldc << 3;
|
||||
AO = A;
|
||||
PREFETCH1 (A, 128);
|
||||
PREFETCH1 (A, 256);
|
||||
/* Loop for m >= 16. */
|
||||
for (j = 0; j < (m >> 4); j++)
|
||||
{
|
||||
IFLOAT *BO = B;
|
||||
v4sf_t *rowC;
|
||||
v4sf_t result[4];
|
||||
__vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7;
|
||||
SET_ACC_ZERO8 ();
|
||||
BLASLONG l = 0;
|
||||
for (l = 0; l < k / 2; l++)
|
||||
{
|
||||
vec_t *rowA = (vec_t *) & (AO[l << 5]);
|
||||
vec_t *rowB = (vec_t *) & (BO[l << 4]);
|
||||
MMA (&acc0, rowB[0], rowA[0]);
|
||||
MMA (&acc1, rowB[1], rowA[0]);
|
||||
MMA (&acc2, rowB[0], rowA[1]);
|
||||
MMA (&acc3, rowB[1], rowA[1]);
|
||||
MMA (&acc4, rowB[0], rowA[2]);
|
||||
MMA (&acc5, rowB[1], rowA[2]);
|
||||
MMA (&acc6, rowB[0], rowA[3]);
|
||||
MMA (&acc7, rowB[1], rowA[3]);
|
||||
}
|
||||
if (k % 2 == 1)
|
||||
{
|
||||
if (k > 1)
|
||||
l = (k / 2) << 4;
|
||||
vec_t *rowA = (vec_t *) & (AO[l << 1]);
|
||||
vec_t *rowB = (vec_t *) & (BO[l]);
|
||||
vec_t rowB_h = MERGE_HIGH (rowB[0], rowB[1]);
|
||||
vec_t rowB_l = MERGE_LOW (rowB[0], rowB[1]);
|
||||
vec_t rowA_h = MERGE_HIGH (rowA[0], vzero);
|
||||
vec_t rowA_l = MERGE_LOW (rowA[0], vzero);
|
||||
vec_t rowA2_h = MERGE_HIGH (rowA[1], vzero);
|
||||
vec_t rowA2_l = MERGE_LOW (rowA[1], vzero);
|
||||
MMA (&acc0, rowB_h, rowA_h);
|
||||
MMA (&acc1, rowB_l, rowA_h);
|
||||
MMA (&acc2, rowB_h, rowA_l);
|
||||
MMA (&acc3, rowB_l, rowA_l);
|
||||
MMA (&acc4, rowB_h, rowA2_h);
|
||||
MMA (&acc5, rowB_l, rowA2_h);
|
||||
MMA (&acc6, rowB_h, rowA2_l);
|
||||
MMA (&acc7, rowB_l, rowA2_l);
|
||||
}
|
||||
SAVE_ACC (&acc0, 0);
|
||||
SAVE_ACC (&acc2, 4);
|
||||
SAVE_ACC1 (&acc1, 0);
|
||||
SAVE_ACC1 (&acc3, 4);
|
||||
SAVE_ACC (&acc4, 8);
|
||||
SAVE_ACC (&acc6, 12);
|
||||
SAVE_ACC1 (&acc5, 8);
|
||||
SAVE_ACC1 (&acc7, 12);
|
||||
CO += 16;
|
||||
|
||||
AO += (k << 4);
|
||||
BO += (k << 3);
|
||||
}
|
||||
if (m & 8)
|
||||
{
|
||||
IFLOAT *BO = B;
|
||||
v4sf_t *rowC;
|
||||
v4sf_t result[4];
|
||||
__vector_quad acc0, acc1, acc2, acc3;
|
||||
SET_ACC_ZERO4 ();
|
||||
BLASLONG l = 0;
|
||||
for (l = 0; l < k / 2; l++)
|
||||
{
|
||||
vec_t *rowA = (vec_t *) & (AO[l << 4]);
|
||||
vec_t *rowB = (vec_t *) & (BO[l << 4]);
|
||||
|
||||
MMA (&acc0, rowB[0], rowA[0]);
|
||||
MMA (&acc1, rowB[1], rowA[0]);
|
||||
MMA (&acc2, rowB[0], rowA[1]);
|
||||
MMA (&acc3, rowB[1], rowA[1]);
|
||||
}
|
||||
if (k % 2 == 1)
|
||||
{
|
||||
if (k > 1)
|
||||
l = (k / 2) << 4;
|
||||
vec_t *rowA = (vec_t *) & (AO[l]);
|
||||
vec_t *rowB = (vec_t *) & (BO[l]);
|
||||
vec_t rowB_h = MERGE_HIGH (rowB[0], rowB[1]);
|
||||
vec_t rowB_l = MERGE_LOW (rowB[0], rowB[1]);
|
||||
vec_t rowA_h = MERGE_HIGH (rowA[0], vzero);
|
||||
vec_t rowA_l = MERGE_LOW (rowA[0], vzero);
|
||||
MMA (&acc0, rowB_h, rowA_h);
|
||||
MMA (&acc1, rowB_l, rowA_h);
|
||||
MMA (&acc2, rowB_h, rowA_l);
|
||||
MMA (&acc3, rowB_l, rowA_l);
|
||||
}
|
||||
SAVE_ACC (&acc0, 0);
|
||||
SAVE_ACC (&acc2, 4);
|
||||
SAVE_ACC1 (&acc1, 0);
|
||||
SAVE_ACC1 (&acc3, 4);
|
||||
CO += 8;
|
||||
AO += (k << 3);
|
||||
BO += (k << 3);
|
||||
}
|
||||
if (m & 4)
|
||||
{
|
||||
IFLOAT *BO = B;
|
||||
v4sf_t *rowC;
|
||||
v4sf_t result[4];
|
||||
__vector_quad acc0, acc1;
|
||||
__builtin_mma_xxsetaccz (&acc0);
|
||||
__builtin_mma_xxsetaccz (&acc1);
|
||||
BLASLONG l = 0;
|
||||
for (l = 0; l < k / 2; l++)
|
||||
{
|
||||
vec_t *rowA = (vec_t *) & (AO[l << 3]);
|
||||
vec_t *rowB = (vec_t *) & (BO[l << 4]);
|
||||
MMA (&acc0, rowB[0], rowA[0]);
|
||||
MMA (&acc1, rowB[1], rowA[0]);
|
||||
}
|
||||
if (k % 2 == 1)
|
||||
{
|
||||
if (k > 1)
|
||||
l = (k / 2) << 3;
|
||||
vector short rowA =
|
||||
{ AO[l + 0], 0, AO[l + 1], 0, AO[l + 2], 0, AO[l + 3], 0 };
|
||||
vec_t *rowB = (vec_t *) & (BO[l << 1]);
|
||||
MMA (&acc0, MERGE_HIGH (rowB[0], rowB[1]), (vec_t) rowA);
|
||||
MMA (&acc1, MERGE_LOW (rowB[0], rowB[1]), (vec_t) rowA);
|
||||
}
|
||||
SAVE_ACC (&acc0, 0);
|
||||
SAVE_ACC1 (&acc1, 0);
|
||||
CO += 4;
|
||||
AO += (k << 2);
|
||||
BO += (k << 3);
|
||||
}
|
||||
if (m & 2)
|
||||
{
|
||||
IFLOAT *BO = B;
|
||||
v2sf_t *rowC;
|
||||
v2sf_t result[8];
|
||||
__vector_quad acc0, acc1;
|
||||
__builtin_mma_xxsetaccz (&acc0);
|
||||
__builtin_mma_xxsetaccz (&acc1);
|
||||
BLASLONG l = 0;
|
||||
for (l = 0; l < k / 2; l++)
|
||||
{
|
||||
vector short rowA =
|
||||
{ AO[(l << 2) + 0], AO[(l << 2) + 2], AO[(l << 2) + 1],
|
||||
AO[(l << 2) + 3],
|
||||
0, 0, 0, 0
|
||||
};
|
||||
vec_t *rowB = (vec_t *) & (BO[l << 4]);
|
||||
MMA (&acc0, rowB[0], (vec_t) rowA);
|
||||
MMA (&acc1, rowB[1], (vec_t) rowA);
|
||||
}
|
||||
if (k % 2 == 1)
|
||||
{
|
||||
if (k > 1)
|
||||
l = (k / 2) << 2;
|
||||
vector short rowA = { AO[l + 0], 0, AO[l + 1], 0, 0, 0, 0, 0 };
|
||||
vec_t *rowB = (vec_t *) & (BO[(l << 2)]);
|
||||
MMA (&acc0, MERGE_HIGH (rowB[0], rowB[1]), (vec_t) rowA);
|
||||
MMA (&acc1, MERGE_LOW (rowB[0], rowB[1]), (vec_t) rowA);
|
||||
}
|
||||
SAVE4x2_ACC (&acc0, 0);
|
||||
SAVE4x2_ACC1 (&acc1, 0);
|
||||
CO += 2;
|
||||
AO += (k << 1);
|
||||
BO += (k << 3);
|
||||
}
|
||||
if (m & 1)
|
||||
{
|
||||
IFLOAT *BO = B;
|
||||
v2sf_t *rowC;
|
||||
v2sf_t result[8];
|
||||
__vector_quad acc0, acc1;
|
||||
__builtin_mma_xxsetaccz (&acc0);
|
||||
__builtin_mma_xxsetaccz (&acc1);
|
||||
BLASLONG l = 0;
|
||||
for (l = 0; l < k / 2; l++)
|
||||
{
|
||||
vector short rowA =
|
||||
{ AO[(l << 1) + 0], AO[(l << 1) + 1], 0, 0, 0, 0, 0, 0};
|
||||
vec_t *rowB = (vec_t *) & (BO[l << 4]);
|
||||
MMA (&acc0, rowB[0], (vec_t) rowA);
|
||||
MMA (&acc1, rowB[1], (vec_t) rowA);
|
||||
}
|
||||
if (k % 2 == 1)
|
||||
{
|
||||
if (k > 1)
|
||||
l = (k / 2) << 1;
|
||||
vector short rowA = { AO[l], 0, 0, 0, 0, 0, 0, 0 };
|
||||
vec_t *rowB = (vec_t *) & (BO[(l << 3)]);
|
||||
MMA (&acc0, MERGE_HIGH (rowB[0], rowB[1]), (vec_t) rowA);
|
||||
MMA (&acc1, MERGE_LOW (rowB[0], rowB[1]), (vec_t) rowA);
|
||||
}
|
||||
SAVE4x2_ACC (&acc0, 0);
|
||||
SAVE4x2_ACC1 (&acc1, 0);
|
||||
CO += 1;
|
||||
AO += k;
|
||||
BO += (k << 3);
|
||||
}
|
||||
B += k << 3;
|
||||
}
|
||||
if (n & 4)
|
||||
{
|
||||
BLASLONG j;
|
||||
FLOAT *CO;
|
||||
IFLOAT *AO;
|
||||
CO = C;
|
||||
C += ldc << 2;
|
||||
AO = A;
|
||||
/* Loop for m >= 32. */
|
||||
for (j = 0; j < (m >> 5); j++)
|
||||
{
|
||||
IFLOAT *BO = B;
|
||||
IFLOAT *A1 = AO + (16 * k);
|
||||
v4sf_t *rowC;
|
||||
v4sf_t result[4];
|
||||
__vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7;
|
||||
SET_ACC_ZERO8 ();
|
||||
BLASLONG l = 0;
|
||||
for (l = 0; l < k / 2; l++)
|
||||
{
|
||||
vec_t *rowA = (vec_t *) & (AO[l << 5]);
|
||||
vec_t *rowA1 = (vec_t *) & (A1[l << 5]);
|
||||
vec_t *rowB = (vec_t *) & (BO[l << 3]);
|
||||
MMA (&acc0, rowB[0], rowA[0]);
|
||||
MMA (&acc1, rowB[0], rowA[1]);
|
||||
MMA (&acc2, rowB[0], rowA[2]);
|
||||
MMA (&acc3, rowB[0], rowA[3]);
|
||||
MMA (&acc4, rowB[0], rowA1[0]);
|
||||
MMA (&acc5, rowB[0], rowA1[1]);
|
||||
MMA (&acc6, rowB[0], rowA1[2]);
|
||||
MMA (&acc7, rowB[0], rowA1[3]);
|
||||
}
|
||||
if (k % 2 == 1)
|
||||
{
|
||||
if (k > 1)
|
||||
l = (k / 2) << 3;
|
||||
vec_t *rowA = (vec_t *) & (AO[(l << 2)]);
|
||||
vec_t *rowA1 = (vec_t *) & (A1[(l << 2)]);
|
||||
vec_t *rowB = (vec_t *) & (BO[l]);
|
||||
vec_t rowB_mrg = MERGE_ROW (rowB[0]);
|
||||
MMA (&acc0, rowB_mrg, MERGE_HIGH (rowA[0], vzero));
|
||||
MMA (&acc1, rowB_mrg, MERGE_LOW (rowA[0], vzero));
|
||||
MMA (&acc2, rowB_mrg, MERGE_HIGH (rowA[1], vzero));
|
||||
MMA (&acc3, rowB_mrg, MERGE_LOW (rowA[1], vzero));
|
||||
MMA (&acc4, rowB_mrg, MERGE_HIGH (rowA1[0], vzero));
|
||||
MMA (&acc5, rowB_mrg, MERGE_LOW (rowA1[0], vzero));
|
||||
MMA (&acc6, rowB_mrg, MERGE_HIGH (rowA1[1], vzero));
|
||||
MMA (&acc7, rowB_mrg, MERGE_LOW (rowA1[1], vzero));
|
||||
}
|
||||
|
||||
SAVE_ACC (&acc0, 0);
|
||||
SAVE_ACC (&acc1, 4);
|
||||
CO += 8;
|
||||
SAVE_ACC (&acc2, 0);
|
||||
SAVE_ACC (&acc3, 4);
|
||||
CO += 8;
|
||||
SAVE_ACC (&acc4, 0);
|
||||
SAVE_ACC (&acc5, 4);
|
||||
CO += 8;
|
||||
SAVE_ACC (&acc6, 0);
|
||||
SAVE_ACC (&acc7, 4);
|
||||
CO += 8;
|
||||
AO += k << 5;
|
||||
BO += k << 2;
|
||||
}
|
||||
if (m & 16)
|
||||
{
|
||||
IFLOAT *BO = B;
|
||||
v4sf_t *rowC;
|
||||
v4sf_t result[4];
|
||||
__vector_quad acc0, acc1, acc2, acc3;
|
||||
SET_ACC_ZERO4 ();
|
||||
BLASLONG l = 0;
|
||||
for (l = 0; l < k / 2; l++)
|
||||
{
|
||||
vec_t *rowA = (vec_t *) & (AO[l << 5]);
|
||||
vec_t *rowB = (vec_t *) & (BO[l << 3]);
|
||||
MMA (&acc0, rowB[0], rowA[0]);
|
||||
MMA (&acc1, rowB[0], rowA[1]);
|
||||
MMA (&acc2, rowB[0], rowA[2]);
|
||||
MMA (&acc3, rowB[0], rowA[3]);
|
||||
}
|
||||
if (k % 2 == 1)
|
||||
{
|
||||
if (k > 1)
|
||||
l = (k / 2) << 3;
|
||||
vec_t *rowA = (vec_t *) & (AO[(l << 2)]);
|
||||
vec_t *rowB = (vec_t *) & (BO[l]);
|
||||
vec_t rowB_mrg = MERGE_ROW (rowB[0]);
|
||||
MMA (&acc0, rowB_mrg, MERGE_HIGH (rowA[0], vzero));
|
||||
MMA (&acc1, rowB_mrg, MERGE_LOW (rowA[0], vzero));
|
||||
MMA (&acc2, rowB_mrg, MERGE_HIGH (rowA[1], vzero));
|
||||
MMA (&acc3, rowB_mrg, MERGE_LOW (rowA[1], vzero));
|
||||
}
|
||||
|
||||
SAVE_ACC (&acc0, 0);
|
||||
SAVE_ACC (&acc1, 4);
|
||||
CO += 8;
|
||||
SAVE_ACC (&acc2, 0);
|
||||
SAVE_ACC (&acc3, 4);
|
||||
CO += 8;
|
||||
AO += k << 4;
|
||||
BO += k << 2;
|
||||
}
|
||||
if (m & 8)
|
||||
{
|
||||
IFLOAT *BO = B;
|
||||
v4sf_t *rowC;
|
||||
v4sf_t result[4];
|
||||
__vector_quad acc0, acc1;
|
||||
__builtin_mma_xxsetaccz (&acc0);
|
||||
__builtin_mma_xxsetaccz (&acc1);
|
||||
BLASLONG l = 0;
|
||||
for (l = 0; l < k / 2; l++)
|
||||
{
|
||||
vec_t *rowA = (vec_t *) & (AO[l << 4]);
|
||||
vec_t *rowB = (vec_t *) & (BO[l << 3]);
|
||||
MMA (&acc0, rowB[0], rowA[0]);
|
||||
MMA (&acc1, rowB[0], rowA[1]);
|
||||
}
|
||||
if (k % 2 == 1)
|
||||
{
|
||||
if (k > 1)
|
||||
l = (k / 2) << 3;
|
||||
vec_t *rowA = (vec_t *) & (AO[l << 1]);
|
||||
vec_t *rowB = (vec_t *) & (BO[l]);
|
||||
vec_t rowB_mrg = MERGE_ROW (rowB[0]);
|
||||
MMA (&acc0, rowB_mrg, MERGE_HIGH (rowA[0], vzero));
|
||||
MMA (&acc1, rowB_mrg, MERGE_LOW (rowA[0], vzero));
|
||||
}
|
||||
SAVE_ACC (&acc0, 0);
|
||||
SAVE_ACC (&acc1, 4);
|
||||
CO += 8;
|
||||
AO += k << 3;
|
||||
BO += k << 2;
|
||||
}
|
||||
if (m & 4)
|
||||
{
|
||||
IFLOAT *BO = B;
|
||||
v4sf_t *rowC;
|
||||
__vector_quad acc0;
|
||||
v4sf_t result[4];
|
||||
BLASLONG l = 0;
|
||||
__builtin_mma_xxsetaccz (&acc0);
|
||||
for (l = 0; l < k / 2; l++)
|
||||
{
|
||||
vec_t *rowA = (vec_t *) & (AO[l << 3]);
|
||||
vec_t *rowB = (vec_t *) & (BO[l << 3]);
|
||||
MMA (&acc0, rowB[0], rowA[0]);
|
||||
}
|
||||
if (k % 2 == 1)
|
||||
{
|
||||
if (k > 1)
|
||||
l = (k / 2) << 3;
|
||||
vector short rowA =
|
||||
{ AO[l], 0, AO[l + 1], 0, AO[l + 2], 0, AO[l + 3], 0 };
|
||||
vec_t *rowB = (vec_t *) & (BO[l]);
|
||||
MMA (&acc0, MERGE_ROW (rowB[0]), (vec_t) rowA);
|
||||
}
|
||||
SAVE_ACC (&acc0, 0);
|
||||
CO += 4;
|
||||
AO += k << 2;
|
||||
BO += k << 2;
|
||||
}
|
||||
if (m & 2)
|
||||
{
|
||||
IFLOAT *BO = B;
|
||||
v2sf_t *rowC;
|
||||
v2sf_t result[8];
|
||||
__vector_quad acc0;
|
||||
BLASLONG l = 0;
|
||||
__builtin_mma_xxsetaccz (&acc0);
|
||||
for (l = 0; l < k / 2; l++)
|
||||
{
|
||||
vector short rowA =
|
||||
{ AO[(l << 2) + 0], AO[(l << 2) + 2], AO[(l << 2) + 1],
|
||||
AO[(l << 2) + 3],
|
||||
0, 0, 0, 0
|
||||
};
|
||||
vec_t *rowB = (vec_t *) & (BO[l << 3]);
|
||||
MMA (&acc0, rowB[0], (vec_t) rowA);
|
||||
}
|
||||
if (k % 2 == 1)
|
||||
{
|
||||
if (k > 1)
|
||||
l = (k / 2) << 2;
|
||||
vector short rowA = { AO[l], 0, AO[l + 1], 0, 0, 0, 0, 0 };
|
||||
vec_t *rowB = (vec_t *) & (BO[l << 1]);
|
||||
MMA (&acc0, MERGE_ROW (rowB[0]), (vec_t) rowA);
|
||||
}
|
||||
SAVE4x2_ACC (&acc0, 0);
|
||||
CO += 2;
|
||||
AO += k << 1;
|
||||
BO += k << 2;
|
||||
}
|
||||
if (m & 1)
|
||||
{
|
||||
IFLOAT *BO = B;
|
||||
v2sf_t *rowC;
|
||||
v2sf_t result[8];
|
||||
__vector_quad acc0;
|
||||
BLASLONG l = 0;
|
||||
__builtin_mma_xxsetaccz (&acc0);
|
||||
for (l = 0; l < k / 2; l++)
|
||||
{
|
||||
vector short rowA =
|
||||
{ AO[(l << 1) + 0], AO[(l << 1) + 1], 0,
|
||||
0, 0, 0, 0
|
||||
};
|
||||
vec_t *rowB = (vec_t *) & (BO[l << 3]);
|
||||
MMA (&acc0, rowB[0], (vec_t) rowA);
|
||||
}
|
||||
if (k % 2 == 1)
|
||||
{
|
||||
if (k > 1)
|
||||
l = (k / 2) << 1;
|
||||
vector short rowA = { AO[l], 0, 0, 0, 0, 0, 0, 0 };
|
||||
vec_t *rowB = (vec_t *) & (BO[l << 2]);
|
||||
MMA (&acc0, MERGE_ROW (rowB[0]), (vec_t) rowA);
|
||||
}
|
||||
SAVE4x2_ACC (&acc0, 0);
|
||||
AO += k;
|
||||
BO += (k << 2);
|
||||
CO += 1;
|
||||
}
|
||||
|
||||
B += k << 2;
|
||||
}
|
||||
if (n & 2)
|
||||
{
|
||||
BLASLONG j;
|
||||
FLOAT *CO;
|
||||
IFLOAT *AO;
|
||||
CO = C;
|
||||
C += ldc << 1;
|
||||
AO = A;
|
||||
/* Loop for m >= 32. */
|
||||
for (j = 0; j < (m >> 5); j++)
|
||||
{
|
||||
IFLOAT *BO = B;
|
||||
v4sf_t *rowC;
|
||||
v4sf_t result[4];
|
||||
IFLOAT *A1 = AO + (16 * k);
|
||||
__vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7;
|
||||
SET_ACC_ZERO8 ();
|
||||
BLASLONG l = 0;
|
||||
for (l = 0; l < k / 2; l++)
|
||||
{
|
||||
vector short rowB =
|
||||
{ BO[(l << 2) + 0], BO[(l << 2) + 2], BO[(l << 2) + 1],
|
||||
BO[(l << 2) + 3],
|
||||
0, 0, 0, 0
|
||||
};
|
||||
vec_t *rowA = (vec_t *) & (AO[l << 5]);
|
||||
vec_t *rowA1 = (vec_t *) & (A1[l << 5]);
|
||||
MMA (&acc0, (vec_t) rowB, rowA[0]);
|
||||
MMA (&acc1, (vec_t) rowB, rowA[1]);
|
||||
MMA (&acc2, (vec_t) rowB, rowA[2]);
|
||||
MMA (&acc3, (vec_t) rowB, rowA[3]);
|
||||
MMA (&acc4, (vec_t) rowB, rowA1[0]);
|
||||
MMA (&acc5, (vec_t) rowB, rowA1[1]);
|
||||
MMA (&acc6, (vec_t) rowB, rowA1[2]);
|
||||
MMA (&acc7, (vec_t) rowB, rowA1[3]);
|
||||
}
|
||||
if (k % 2 == 1)
|
||||
{
|
||||
if (k > 1)
|
||||
l = (k / 2) << 2;
|
||||
vector short rowB = { BO[l + 0], 0, BO[l + 1], 0, 0, 0, 0, 0 };
|
||||
vec_t *rowA = (vec_t *) & (AO[l << 3]);
|
||||
vec_t *rowA1 = (vec_t *) & (A1[l << 3]);
|
||||
MMA (&acc0, (vec_t) rowB, MERGE_HIGH (rowA[0], rowA[2]));
|
||||
MMA (&acc1, (vec_t) rowB, MERGE_LOW (rowA[0], rowA[2]));
|
||||
MMA (&acc2, (vec_t) rowB, MERGE_HIGH (rowA[1], rowA[3]));
|
||||
MMA (&acc3, (vec_t) rowB, MERGE_LOW (rowA[1], rowA[3]));
|
||||
MMA (&acc4, (vec_t) rowB, MERGE_HIGH (rowA1[0], rowA1[2]));
|
||||
MMA (&acc5, (vec_t) rowB, MERGE_LOW (rowA1[0], rowA1[2]));
|
||||
MMA (&acc6, (vec_t) rowB, MERGE_HIGH (rowA1[1], rowA1[3]));
|
||||
MMA (&acc7, (vec_t) rowB, MERGE_LOW (rowA1[1], rowA1[3]));
|
||||
}
|
||||
SAVE2x4_ACC (&acc0, 0);
|
||||
SAVE2x4_ACC (&acc1, 4);
|
||||
SAVE2x4_ACC (&acc2, 8);
|
||||
SAVE2x4_ACC (&acc3, 12);
|
||||
CO += 16;
|
||||
SAVE2x4_ACC (&acc4, 0);
|
||||
SAVE2x4_ACC (&acc5, 4);
|
||||
SAVE2x4_ACC (&acc6, 8);
|
||||
SAVE2x4_ACC (&acc7, 12);
|
||||
CO += 16;
|
||||
AO += k << 5;
|
||||
BO += k << 1;
|
||||
}
|
||||
if (m & 16)
|
||||
{
|
||||
IFLOAT *BO = B;
|
||||
v4sf_t *rowC;
|
||||
v4sf_t result[4];
|
||||
__vector_quad acc0, acc1, acc2, acc3;
|
||||
SET_ACC_ZERO4 ();
|
||||
BLASLONG l = 0;
|
||||
for (l = 0; l < k / 2; l++)
|
||||
{
|
||||
vector short rowB =
|
||||
{ BO[(l << 2) + 0], BO[(l << 2) + 2], BO[(l << 2) + 1],
|
||||
BO[(l << 2) + 3],
|
||||
0, 0, 0, 0
|
||||
};
|
||||
vec_t *rowA = (vec_t *) & (AO[l << 5]);
|
||||
MMA (&acc0, (vec_t) rowB, rowA[0]);
|
||||
MMA (&acc1, (vec_t) rowB, rowA[1]);
|
||||
MMA (&acc2, (vec_t) rowB, rowA[2]);
|
||||
MMA (&acc3, (vec_t) rowB, rowA[3]);
|
||||
}
|
||||
if (k % 2 == 1)
|
||||
{
|
||||
if (k > 1)
|
||||
l = (k / 2) << 2;
|
||||
vector short rowB = { BO[l + 0], 0, BO[l + 1], 0, 0, 0, 0, 0 };
|
||||
vec_t *rowA = (vec_t *) & (AO[l << 3]);
|
||||
MMA (&acc0, (vec_t) rowB, MERGE_HIGH (rowA[0], rowA[2]));
|
||||
MMA (&acc1, (vec_t) rowB, MERGE_LOW (rowA[0], rowA[2]));
|
||||
MMA (&acc2, (vec_t) rowB, MERGE_HIGH (rowA[1], rowA[3]));
|
||||
MMA (&acc3, (vec_t) rowB, MERGE_LOW (rowA[1], rowA[3]));
|
||||
}
|
||||
SAVE2x4_ACC (&acc0, 0);
|
||||
SAVE2x4_ACC (&acc1, 4);
|
||||
SAVE2x4_ACC (&acc2, 8);
|
||||
SAVE2x4_ACC (&acc3, 12);
|
||||
CO += 16;
|
||||
AO += k << 4;
|
||||
BO += k << 1;
|
||||
}
|
||||
if (m & 8)
|
||||
{
|
||||
IFLOAT *BO = B;
|
||||
v4sf_t *rowC;
|
||||
v4sf_t result[4];
|
||||
__vector_quad acc0, acc1;
|
||||
__builtin_mma_xxsetaccz (&acc0);
|
||||
__builtin_mma_xxsetaccz (&acc1);
|
||||
BLASLONG l = 0;
|
||||
for (l = 0; l < k / 2; l++)
|
||||
{
|
||||
vector short rowB =
|
||||
{ BO[(l << 2) + 0], BO[(l << 2) + 2], BO[(l << 2) + 1],
|
||||
BO[(l << 2) + 3],
|
||||
0, 0, 0, 0
|
||||
};
|
||||
vec_t *rowA = (vec_t *) & (AO[l << 4]);
|
||||
MMA (&acc0, (vec_t) rowB, rowA[0]);
|
||||
MMA (&acc1, (vec_t) rowB, rowA[1]);
|
||||
}
|
||||
if (k % 2 == 1)
|
||||
{
|
||||
if (k > 1)
|
||||
l = (k / 2) << 2;
|
||||
vector short rowB = { BO[l + 0], 0, BO[l + 1], 0, 0, 0, 0, 0 };
|
||||
vec_t *rowA = (vec_t *) & (AO[(l << 2)]);
|
||||
MMA (&acc0, (vec_t) rowB, MERGE_HIGH (rowA[0], rowA[1]));
|
||||
MMA (&acc1, (vec_t) rowB, MERGE_LOW (rowA[0], rowA[1]));
|
||||
}
|
||||
SAVE2x4_ACC (&acc0, 0);
|
||||
SAVE2x4_ACC (&acc1, 4);
|
||||
CO += 8;
|
||||
AO += k << 3;
|
||||
BO += k << 1;
|
||||
}
|
||||
if (m & 4)
|
||||
{
|
||||
IFLOAT *BO = B;
|
||||
v4sf_t *rowC;
|
||||
v4sf_t result[4];
|
||||
__vector_quad acc0;
|
||||
__builtin_mma_xxsetaccz (&acc0);
|
||||
BLASLONG l = 0;
|
||||
for (l = 0; l < k / 2; l++)
|
||||
{
|
||||
vector short rowB =
|
||||
{ BO[(l << 2) + 0], BO[(l << 2) + 2], BO[(l << 2) + 1],
|
||||
BO[(l << 2) + 3],
|
||||
0, 0, 0, 0
|
||||
};
|
||||
vec_t *rowA = (vec_t *) & (AO[l << 3]);
|
||||
MMA (&acc0, (vec_t) rowB, rowA[0]);
|
||||
}
|
||||
if (k % 2 == 1)
|
||||
{
|
||||
if (k > 1)
|
||||
l = (k / 2) << 2;
|
||||
vector short rowB = { BO[l + 0], 0, BO[l + 1], 0, 0, 0, 0, 0 };
|
||||
vec_t *rowA = (vec_t *) & (AO[l << 1]);
|
||||
MMA (&acc0, (vec_t) rowB, MERGE_ROW (rowA[0]));
|
||||
}
|
||||
SAVE2x4_ACC (&acc0, 0);
|
||||
CO += 4;
|
||||
AO += k << 2;
|
||||
BO += k << 1;
|
||||
}
|
||||
if (m & 2)
|
||||
{
|
||||
IFLOAT *BO = B;
|
||||
BLASLONG l = 0;
|
||||
v4sf_t t = { 0, 0, 0, 0 };
|
||||
for (l = 0; l < (k << 1); l += 2)
|
||||
{
|
||||
v4sf_t rowA =
|
||||
{ BF16TOF32 (AO[l]), BF16TOF32 (AO[l]), BF16TOF32 (AO[l + 1]),
|
||||
BF16TOF32 (AO[l + 1])
|
||||
};
|
||||
v4sf_t rowB =
|
||||
{ BF16TOF32 (BO[l]), BF16TOF32 (BO[l + 1]), BF16TOF32 (BO[l]),
|
||||
BF16TOF32 (BO[l + 1])
|
||||
};
|
||||
t += rowA * rowB;
|
||||
}
|
||||
t = t * valpha;
|
||||
CO[0 * ldc] += t[0];
|
||||
CO[1 * ldc] += t[1];
|
||||
CO[0 * ldc + 1] += t[2];
|
||||
CO[1 * ldc + 1] += t[3];
|
||||
CO += 2;
|
||||
AO += k << 1;
|
||||
BO += k << 1;
|
||||
}
|
||||
if (m & 1)
|
||||
{
|
||||
IFLOAT *BO = B;
|
||||
BLASLONG l = 0;
|
||||
v4sf_t t = { 0, 0, 0, 0 };
|
||||
for (l = 0; l < k; l++)
|
||||
{
|
||||
v4sf_t rowA = { BF16TOF32 (AO[l]), BF16TOF32 (AO[l]), 0, 0 };
|
||||
v4sf_t rowB =
|
||||
{ BF16TOF32 (BO[l << 1]), BF16TOF32 (BO[(l << 1) + 1]), 0,
|
||||
0
|
||||
};
|
||||
t += rowA * rowB;
|
||||
}
|
||||
CO[0 * ldc] += t[0] * alpha;
|
||||
CO[1 * ldc] += t[1] * alpha;
|
||||
CO += 1;
|
||||
AO += k;
|
||||
BO += k << 1;
|
||||
}
|
||||
B += k << 1;
|
||||
}
|
||||
if (n & 1)
|
||||
{
|
||||
BLASLONG j;
|
||||
FLOAT *CO;
|
||||
IFLOAT *AO;
|
||||
CO = C;
|
||||
C += ldc;
|
||||
AO = A;
|
||||
/* Loop for m >= 16. */
|
||||
for (j = 0; j < (m >> 4); j++)
|
||||
{
|
||||
IFLOAT *BO = B;
|
||||
v4sf_t *rowC;
|
||||
v4sf_t result[4];
|
||||
__vector_quad acc0, acc1, acc2, acc3;
|
||||
SET_ACC_ZERO4 ();
|
||||
BLASLONG l = 0;
|
||||
for (l = 0; l < k / 2; l++)
|
||||
{
|
||||
vector short rowB =
|
||||
{ BO[l << 1], BO[(l << 1) + 1], 0, 0, 0, 0, 0, 0};
|
||||
vec_t *rowA = (vec_t *) & (AO[l << 5]);
|
||||
MMA (&acc0, (vec_t) rowB, rowA[0]);
|
||||
MMA (&acc1, (vec_t) rowB, rowA[1]);
|
||||
MMA (&acc2, (vec_t) rowB, rowA[2]);
|
||||
MMA (&acc3, (vec_t) rowB, rowA[3]);
|
||||
}
|
||||
if (k % 2 == 1)
|
||||
{
|
||||
if (k > 1)
|
||||
l = (k / 2) << 1;
|
||||
vector short rowB = { BO[l], 0, 0, 0, 0, 0, 0, 0 };
|
||||
vec_t *rowA = (vec_t *) & (AO[(l << 4)]);
|
||||
MMA (&acc0, (vec_t) rowB, MERGE_HIGH (rowA[0], rowA[2]));
|
||||
MMA (&acc1, (vec_t) rowB, MERGE_LOW (rowA[0], rowA[2]));
|
||||
MMA (&acc2, (vec_t) rowB, MERGE_HIGH (rowA[1], rowA[3]));
|
||||
MMA (&acc3, (vec_t) rowB, MERGE_LOW (rowA[1], rowA[3]));
|
||||
}
|
||||
rowC = (v4sf_t *) &CO[0];
|
||||
__builtin_mma_disassemble_acc ((void *)result, &acc0);
|
||||
rowC[0] += result[0] * alpha;
|
||||
__builtin_mma_disassemble_acc ((void *)result, &acc1);
|
||||
rowC[1] += result[0] * alpha;
|
||||
__builtin_mma_disassemble_acc ((void *)result, &acc2);
|
||||
rowC[2] += result[0] * alpha;
|
||||
__builtin_mma_disassemble_acc ((void *)result, &acc3);
|
||||
rowC[3] += result[0] * alpha;
|
||||
AO += k << 4;
|
||||
BO += k;
|
||||
CO += 16;
|
||||
}
|
||||
/* Loop for m >= 8. */
|
||||
if (m & 8)
|
||||
{
|
||||
IFLOAT *BO = B;
|
||||
v4sf_t *rowC;
|
||||
v4sf_t result[4];
|
||||
__vector_quad acc0, acc1;
|
||||
__builtin_mma_xxsetaccz (&acc0);
|
||||
__builtin_mma_xxsetaccz (&acc1);
|
||||
BLASLONG l = 0;
|
||||
for (l = 0; l < k / 2; l++)
|
||||
{
|
||||
vector short rowB =
|
||||
{ BO[l << 1], BO[(l << 1) + 1], 0, 0, 0, 0, 0, 0};
|
||||
vec_t *rowA = (vec_t *) & (AO[l << 4]);
|
||||
MMA (&acc0, (vec_t) rowB, rowA[0]);
|
||||
MMA (&acc1, (vec_t) rowB, rowA[1]);
|
||||
}
|
||||
if (k % 2 == 1)
|
||||
{
|
||||
if (k > 1)
|
||||
l = (k / 2) << 1;
|
||||
vector short rowB = { BO[l], 0, 0, 0, 0, 0, 0, 0 };
|
||||
vec_t *rowA = (vec_t *) & (AO[(l << 3)]);
|
||||
MMA (&acc0, (vec_t) rowB, MERGE_HIGH (rowA[0], rowA[1]));
|
||||
MMA (&acc1, (vec_t) rowB, MERGE_LOW (rowA[0], rowA[1]));
|
||||
}
|
||||
rowC = (v4sf_t *) &CO[0];
|
||||
__builtin_mma_disassemble_acc ((void *)result, &acc0);
|
||||
rowC[0] += result[0] * alpha;
|
||||
__builtin_mma_disassemble_acc ((void *)result, &acc1);
|
||||
rowC[1] += result[0] * alpha;
|
||||
AO += k << 3;
|
||||
BO += k;
|
||||
CO += 8;
|
||||
}
|
||||
/* Loop for m >= 4. */
|
||||
if (m & 4)
|
||||
{
|
||||
IFLOAT *BO = B;
|
||||
v4sf_t *rowC;
|
||||
v4sf_t result[4];
|
||||
__vector_quad acc0;
|
||||
__builtin_mma_xxsetaccz (&acc0);
|
||||
BLASLONG l = 0;
|
||||
for (l = 0; l < k / 2; l++)
|
||||
{
|
||||
vector short rowB =
|
||||
{ BO[l << 1], BO[(l << 1) + 1], 0, 0, 0, 0, 0, 0};
|
||||
vec_t *rowA = (vec_t *) & (AO[l << 3]);
|
||||
MMA (&acc0, (vec_t) rowB, rowA[0]);
|
||||
}
|
||||
if (k % 2 == 1)
|
||||
{
|
||||
if (k > 1)
|
||||
l = (k / 2) << 1;
|
||||
vector short rowB = { BO[l], 0, 0, 0, 0, 0, 0, 0 };
|
||||
vec_t *rowA = (vec_t *) & (AO[(l << 2)]);
|
||||
MMA (&acc0, (vec_t) rowB, MERGE_ROW (rowA[0]));
|
||||
}
|
||||
rowC = (v4sf_t *) &CO[0];
|
||||
__builtin_mma_disassemble_acc ((void *)result, &acc0);
|
||||
rowC[0] += result[0] * alpha;
|
||||
AO += k << 2;
|
||||
BO += k;
|
||||
CO += 4;
|
||||
}
|
||||
/* Loop for m >= 2. */
|
||||
if (m & 2)
|
||||
{
|
||||
IFLOAT *BO = B;
|
||||
BLASLONG l = 0;
|
||||
v4sf_t t = { 0, 0, 0, 0 };
|
||||
for (l = 0; l < k; l++)
|
||||
{
|
||||
v4sf_t rowB = { BF16TOF32 (BO[l]), BF16TOF32 (BO[l]), 0, 0 };
|
||||
v4sf_t rowA =
|
||||
{ BF16TOF32 (AO[l << 1]), BF16TOF32 (AO[(l << 1) + 1]), 0,
|
||||
0
|
||||
};
|
||||
t += rowA * rowB;
|
||||
}
|
||||
t = t * valpha;
|
||||
CO[0] += t[0];
|
||||
CO[1] += t[1];
|
||||
AO += k << 1;
|
||||
BO += k;
|
||||
CO += 2;
|
||||
}
|
||||
/* Loop for m = 1. */
|
||||
if (m & 1)
|
||||
{
|
||||
IFLOAT *BO = B;
|
||||
BLASLONG l = 0;
|
||||
FLOAT t = 0;
|
||||
for (l = 0; l < k; l++)
|
||||
{
|
||||
t += BF16TOF32 (AO[l]) * BF16TOF32 (BO[l]);
|
||||
}
|
||||
AO += k;
|
||||
BO += k;
|
||||
CO[0] += t * alpha;
|
||||
CO += 1;
|
||||
}
|
||||
|
||||
B += k;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
437
kernel/power/sbgemm_ncopy_16_power10.c
Normal file
437
kernel/power/sbgemm_ncopy_16_power10.c
Normal file
@@ -0,0 +1,437 @@
|
||||
/*********************************************************************/
|
||||
/* Copyright 2009, 2010 The University of Texas at Austin. */
|
||||
/* All rights reserved. */
|
||||
/* */
|
||||
/* Redistribution and use in source and binary forms, with or */
|
||||
/* without modification, are permitted provided that the following */
|
||||
/* conditions are met: */
|
||||
/* */
|
||||
/* 1. Redistributions of source code must retain the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer. */
|
||||
/* */
|
||||
/* 2. Redistributions in binary form must reproduce the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer in the documentation and/or other materials */
|
||||
/* provided with the distribution. */
|
||||
/* */
|
||||
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
|
||||
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
|
||||
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
|
||||
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
|
||||
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
|
||||
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
|
||||
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
|
||||
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
|
||||
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
|
||||
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
|
||||
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
|
||||
/* POSSIBILITY OF SUCH DAMAGE. */
|
||||
/* */
|
||||
/* The views and conclusions contained in the software and */
|
||||
/* documentation are those of the authors and should not be */
|
||||
/* interpreted as representing official policies, either expressed */
|
||||
/* or implied, of The University of Texas at Austin. */
|
||||
/*********************************************************************/
|
||||
|
||||
#include <stdio.h>
|
||||
#include "common.h"
|
||||
|
||||
/* sbgemm_ncopy_16: pack an m x n block of A (bf16, leading dimension lda)
 * into the contiguous buffer b for the POWER10 sbgemm kernels.
 *
 * Layout (inferred from the copy pattern):
 *   - Panels of width 16, then single 8- and 4-wide remainder panels, all
 *     use the same scheme: for each pair of consecutive elements along m,
 *     emit (elem0, elem1) for every one of the w lda-strided vectors
 *     (columns, assuming column-major storage) in turn.
 *   - The 2-wide remainder is column-interleaved: v0[0], v1[0], v0[1], v1[1].
 *   - The final single vector is copied verbatim.
 * An odd trailing element along m contributes one element per vector.
 * Returns 0 always.
 */
int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){
  BLASLONG i, j, r, w;
  IFLOAT *col[16];            /* pointers into the w vectors of the panel */
  IFLOAT *src = a;            /* next unpacked column block of A          */
  IFLOAT *dst = b;            /* write cursor in the packed buffer        */

  /* Widths 16, 8 and 4 share one layout, so handle them with a single
   * parameterized loop nest: all (n >> 4) full 16-wide panels, then at
   * most one 8-wide and one 4-wide remainder panel. */
  for (w = 16; w >= 4; w >>= 1) {
    BLASLONG panels = (w == 16) ? (n >> 4) : ((n & w) != 0);

    for (j = 0; j < panels; j++) {
      for (r = 0; r < w; r++)
        col[r] = src + r * lda;
      src += w * lda;

      for (i = m >> 1; i > 0; i--) {
        for (r = 0; r < w; r++) {
          dst[2 * r]     = col[r][0];
          dst[2 * r + 1] = col[r][1];
          col[r] += 2;
        }
        dst += 2 * w;
      }
      if (m & 1) {            /* odd trailing element: one per vector */
        for (r = 0; r < w; r++)
          dst[r] = col[r][0];
        dst += w;
      }
    }
  }

  if (n & 2) {                /* 2-wide panel: column-interleaved pairs */
    IFLOAT *c0 = src;
    IFLOAT *c1 = src + lda;
    src += 2 * lda;

    for (i = m >> 1; i > 0; i--) {
      dst[0] = c0[0];
      dst[1] = c1[0];
      dst[2] = c0[1];
      dst[3] = c1[1];
      c0 += 2;
      c1 += 2;
      dst += 4;
    }
    if (m & 1) {
      dst[0] = c0[0];
      dst[1] = c1[0];
      dst += 2;
    }
  }

  if (n & 1) {                /* last single vector: straight copy */
    IFLOAT *c0 = src;

    for (i = m >> 1; i > 0; i--) {
      dst[0] = c0[0];
      dst[1] = c0[1];
      c0 += 2;
      dst += 2;
    }
    if (m & 1)
      dst[0] = c0[0];
  }

  return 0;
}
|
||||
383
kernel/power/sbgemm_ncopy_8_power10.c
Normal file
383
kernel/power/sbgemm_ncopy_8_power10.c
Normal file
@@ -0,0 +1,383 @@
|
||||
/*********************************************************************/
|
||||
/* Copyright 2009, 2010 The University of Texas at Austin. */
|
||||
/* All rights reserved. */
|
||||
/* */
|
||||
/* Redistribution and use in source and binary forms, with or */
|
||||
/* without modification, are permitted provided that the following */
|
||||
/* conditions are met: */
|
||||
/* */
|
||||
/* 1. Redistributions of source code must retain the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer. */
|
||||
/* */
|
||||
/* 2. Redistributions in binary form must reproduce the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer in the documentation and/or other materials */
|
||||
/* provided with the distribution. */
|
||||
/* */
|
||||
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
|
||||
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
|
||||
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
|
||||
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
|
||||
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
|
||||
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
|
||||
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
|
||||
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
|
||||
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
|
||||
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
|
||||
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
|
||||
/* POSSIBILITY OF SUCH DAMAGE. */
|
||||
/* */
|
||||
/* The views and conclusions contained in the software and */
|
||||
/* documentation are those of the authors and should not be */
|
||||
/* interpreted as representing official policies, either expressed */
|
||||
/* or implied, of The University of Texas at Austin. */
|
||||
/*********************************************************************/
|
||||
|
||||
#include <stdio.h>
|
||||
#include <altivec.h>
|
||||
#include "common.h"
|
||||
|
||||
typedef IFLOAT vec_bf16 __attribute__ ((vector_size (16)));
|
||||
/* sbgemm_ncopy_8: pack an m x n block of A (bf16, leading dimension lda)
 * into the contiguous buffer b for the POWER10 sbgemm kernels.
 *
 * The m >= 8 fast path uses AltiVec/VSX permutes (vec_perm with the byte
 * masks below, then vec_xxpermdi doubleword selects) to interleave pairs
 * of elements from the eight lda-strided vectors of the panel; the scalar
 * tails below reproduce the same pairs-of-elements layout.  The 2-wide
 * remainder panel is column-interleaved and the final single vector is
 * copied verbatim.  Returns 0 always.
 */
int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){
  BLASLONG i, j;

  IFLOAT *aoffset;
  IFLOAT *aoffset1, *aoffset2, *aoffset3, *aoffset4;
  IFLOAT *aoffset5, *aoffset6, *aoffset7, *aoffset8;

  IFLOAT *boffset;
  vec_bf16 vtemp01, vtemp02, vtemp03, vtemp04;
  vec_bf16 vtemp05, vtemp06, vtemp07, vtemp08;
  vec_bf16 vtemp09, vtemp10, vtemp11, vtemp12;
  /* Byte-permute masks: `mask` interleaves the first two bf16 pairs of two
   * source vectors, `mask1` the last two pairs (byte indices 0-15 select
   * from the first operand, 16-31 from the second). */
  vector char mask =
    { 0, 1, 2, 3, 16, 17, 18, 19, 4, 5, 6, 7, 20, 21, 22, 23 };
  vector char mask1 =
    { 8, 9, 10, 11, 24, 25, 26, 27, 12, 13, 14, 15, 28, 29, 30, 31 };
  IFLOAT ctemp01, ctemp02, ctemp03, ctemp04;
  IFLOAT ctemp05, ctemp06, ctemp07, ctemp08;
  IFLOAT ctemp09, ctemp10, ctemp11, ctemp12;
  IFLOAT ctemp13, ctemp14, ctemp15, ctemp16;
  IFLOAT ctemp17;
  IFLOAT ctemp25;
  IFLOAT ctemp33;
  IFLOAT ctemp41;
  IFLOAT ctemp49;
  IFLOAT ctemp57;


  aoffset = a;
  boffset = b;

  /* Full 8-wide panels. */
  j = (n >> 3);
  if (j > 0){
    do{
      aoffset1 = aoffset;
      aoffset2 = aoffset1 + lda;
      aoffset3 = aoffset2 + lda;
      aoffset4 = aoffset3 + lda;
      aoffset5 = aoffset4 + lda;
      aoffset6 = aoffset5 + lda;
      aoffset7 = aoffset6 + lda;
      aoffset8 = aoffset7 + lda;
      aoffset += 8 * lda;

      /* Vector fast path: 8 elements of each of the 8 vectors per pass. */
      i = (m >> 3);
      if (i > 0){
        do{
          vtemp01 = *(vec_bf16 *)(aoffset1);
          vtemp02 = *(vec_bf16 *)(aoffset2);
          vtemp03 = *(vec_bf16 *)(aoffset3);
          vtemp04 = *(vec_bf16 *)(aoffset4);
          vtemp05 = *(vec_bf16 *)(aoffset5);
          vtemp06 = *(vec_bf16 *)(aoffset6);
          vtemp07 = *(vec_bf16 *)(aoffset7);
          vtemp08 = *(vec_bf16 *)(aoffset8);

          /* Interleave element pairs of adjacent vectors (elements 0-3). */
          vtemp09 = vec_perm(vtemp01, vtemp02, mask);
          vtemp10 = vec_perm(vtemp03, vtemp04, mask);
          vtemp11 = vec_perm(vtemp05, vtemp06, mask);
          vtemp12 = vec_perm(vtemp07, vtemp08, mask);

          /* xxpermdi selector 0 keeps the low doublewords, 3 the high ones. */
          *(vec_bf16 *)(boffset + 0) = vec_xxpermdi(vtemp09, vtemp10, 0);
          *(vec_bf16 *)(boffset + 8) = vec_xxpermdi(vtemp11, vtemp12, 0);
          *(vec_bf16 *)(boffset + 16) = vec_xxpermdi(vtemp09, vtemp10, 3);
          *(vec_bf16 *)(boffset + 24) = vec_xxpermdi(vtemp11, vtemp12, 3);

          /* Same again for element pairs 4-7. */
          vtemp09 = vec_perm(vtemp01, vtemp02, mask1);
          vtemp10 = vec_perm(vtemp03, vtemp04, mask1);
          vtemp11 = vec_perm(vtemp05, vtemp06, mask1);
          vtemp12 = vec_perm(vtemp07, vtemp08, mask1);

          *(vec_bf16 *)(boffset + 32) = vec_xxpermdi(vtemp09, vtemp10, 0);
          *(vec_bf16 *)(boffset + 40) = vec_xxpermdi(vtemp11, vtemp12, 0);
          *(vec_bf16 *)(boffset + 48) = vec_xxpermdi(vtemp09, vtemp10, 3);
          *(vec_bf16 *)(boffset + 56) = vec_xxpermdi(vtemp11, vtemp12, 3);

          aoffset1 += 8;
          aoffset2 += 8;
          aoffset3 += 8;
          aoffset4 += 8;
          aoffset5 += 8;
          aoffset6 += 8;
          aoffset7 += 8;
          aoffset8 += 8;
          boffset += 64;
          i --;
        }while(i > 0);
      }

      /* Scalar tail: two elements of each vector per pass. */
      i = (m & 7);
      if (i >= 2){
        do{
          ctemp01 = *(aoffset1 + 0);
          ctemp09 = *(aoffset1 + 1);
          ctemp17 = *(aoffset2 + 0);
          ctemp25 = *(aoffset2 + 1);
          ctemp33 = *(aoffset3 + 0);
          ctemp41 = *(aoffset3 + 1);
          ctemp49 = *(aoffset4 + 0);
          ctemp57 = *(aoffset4 + 1);

          *(boffset + 0) = ctemp01;
          *(boffset + 1) = ctemp09;
          *(boffset + 2) = ctemp17;
          *(boffset + 3) = ctemp25;
          *(boffset + 4) = ctemp33;
          *(boffset + 5) = ctemp41;
          *(boffset + 6) = ctemp49;
          *(boffset + 7) = ctemp57;
          aoffset1 += 2;
          aoffset2 += 2;
          aoffset3 += 2;
          aoffset4 += 2;

          ctemp01 = *(aoffset5 + 0);
          ctemp09 = *(aoffset5 + 1);
          ctemp17 = *(aoffset6 + 0);
          ctemp25 = *(aoffset6 + 1);
          ctemp33 = *(aoffset7 + 0);
          ctemp41 = *(aoffset7 + 1);
          ctemp49 = *(aoffset8 + 0);
          ctemp57 = *(aoffset8 + 1);
          *(boffset + 8) = ctemp01;
          *(boffset + 9) = ctemp09;
          *(boffset + 10) = ctemp17;
          *(boffset + 11) = ctemp25;
          *(boffset + 12) = ctemp33;
          *(boffset + 13) = ctemp41;
          *(boffset + 14) = ctemp49;
          *(boffset + 15) = ctemp57;

          aoffset5 += 2;
          aoffset6 += 2;
          aoffset7 += 2;
          aoffset8 += 2;

          boffset += 16;
          i -= 2;
        }while(i > 1);
      }
      /* Odd trailing element: one element of each of the 8 vectors. */
      if (m & 1){
        ctemp01 = *(aoffset1 + 0);
        ctemp09 = *(aoffset2 + 0);
        ctemp17 = *(aoffset3 + 0);
        ctemp25 = *(aoffset4 + 0);
        ctemp33 = *(aoffset5 + 0);
        ctemp41 = *(aoffset6 + 0);
        ctemp49 = *(aoffset7 + 0);
        ctemp57 = *(aoffset8 + 0);

        *(boffset + 0) = ctemp01;
        *(boffset + 1) = ctemp09;
        *(boffset + 2) = ctemp17;
        *(boffset + 3) = ctemp25;
        *(boffset + 4) = ctemp33;
        *(boffset + 5) = ctemp41;
        *(boffset + 6) = ctemp49;
        *(boffset + 7) = ctemp57;

        aoffset1 ++;
        aoffset2 ++;
        aoffset3 ++;
        aoffset4 ++;
        aoffset5 ++;
        aoffset6 ++;
        aoffset7 ++;
        aoffset8 ++;

        boffset += 8;
      }

      j--;
    }while(j > 0);
  } /* end of if(j > 0) */

  /* 4-wide remainder panel. */
  if (n & 4){
    aoffset1 = aoffset;
    aoffset2 = aoffset1 + lda;
    aoffset3 = aoffset2 + lda;
    aoffset4 = aoffset3 + lda;
    aoffset += 4 * lda;

    /* Four elements of each vector per pass: pairs (0,1) of all four
     * vectors first, then pairs (2,3). */
    i = (m >> 2);
    if (i > 0){
      do{
        ctemp01 = *(aoffset1 + 0);
        ctemp02 = *(aoffset1 + 1);
        ctemp03 = *(aoffset1 + 2);
        ctemp04 = *(aoffset1 + 3);

        ctemp05 = *(aoffset2 + 0);
        ctemp06 = *(aoffset2 + 1);
        ctemp07 = *(aoffset2 + 2);
        ctemp08 = *(aoffset2 + 3);

        ctemp09 = *(aoffset3 + 0);
        ctemp10 = *(aoffset3 + 1);
        ctemp11 = *(aoffset3 + 2);
        ctemp12 = *(aoffset3 + 3);

        ctemp13 = *(aoffset4 + 0);
        ctemp14 = *(aoffset4 + 1);
        ctemp15 = *(aoffset4 + 2);
        ctemp16 = *(aoffset4 + 3);

        *(boffset + 0) = ctemp01;
        *(boffset + 1) = ctemp02;
        *(boffset + 2) = ctemp05;
        *(boffset + 3) = ctemp06;

        *(boffset + 4) = ctemp09;
        *(boffset + 5) = ctemp10;
        *(boffset + 6) = ctemp13;
        *(boffset + 7) = ctemp14;

        *(boffset + 8) = ctemp03;
        *(boffset + 9) = ctemp04;
        *(boffset + 10) = ctemp07;
        *(boffset + 11) = ctemp08;

        *(boffset + 12) = ctemp11;
        *(boffset + 13) = ctemp12;
        *(boffset + 14) = ctemp15;
        *(boffset + 15) = ctemp16;

        aoffset1 += 4;
        aoffset2 += 4;
        aoffset3 += 4;
        aoffset4 += 4;
        boffset += 16;
        i --;
      }while(i > 0);
    }

    /* Scalar tail: two elements of each vector per pass. */
    i = (m & 3);
    if (i >= 2){
      do{
        ctemp01 = *(aoffset1 + 0);
        ctemp09 = *(aoffset1 + 1);
        ctemp17 = *(aoffset2 + 0);
        ctemp25 = *(aoffset2 + 1);
        ctemp33 = *(aoffset3 + 0);
        ctemp41 = *(aoffset3 + 1);
        ctemp49 = *(aoffset4 + 0);
        ctemp57 = *(aoffset4 + 1);

        *(boffset + 0) = ctemp01;
        *(boffset + 1) = ctemp09;
        *(boffset + 2) = ctemp17;
        *(boffset + 3) = ctemp25;
        *(boffset + 4) = ctemp33;
        *(boffset + 5) = ctemp41;
        *(boffset + 6) = ctemp49;
        *(boffset + 7) = ctemp57;
        aoffset1 += 2;
        aoffset2 += 2;
        aoffset3 += 2;
        aoffset4 += 2;

        boffset += 8;
        i -= 2;
      }while(i > 1);
    }
    if (m & 1){
      ctemp01 = *(aoffset1 + 0);
      ctemp02 = *(aoffset2 + 0);
      ctemp03 = *(aoffset3 + 0);
      ctemp04 = *(aoffset4 + 0);

      *(boffset + 0) = ctemp01;
      *(boffset + 1) = ctemp02;
      *(boffset + 2) = ctemp03;
      *(boffset + 3) = ctemp04;

      aoffset1 ++;
      aoffset2 ++;
      aoffset3 ++;
      aoffset4 ++;

      boffset += 4;
    }
  }

  /* 2-wide remainder panel (column-interleaved: v0[0], v1[0], v0[1], v1[1]). */
  if (n & 2){
    aoffset1 = aoffset;
    aoffset2 = aoffset1 + lda;
    aoffset += 2 * lda;

    i = (m >> 1);
    if (i > 0){
      do{
        ctemp01 = *(aoffset1 + 0);
        ctemp02 = *(aoffset1 + 1);
        ctemp03 = *(aoffset2 + 0);
        ctemp04 = *(aoffset2 + 1);

        *(boffset + 0) = ctemp01;
        *(boffset + 1) = ctemp03;
        *(boffset + 2) = ctemp02;
        *(boffset + 3) = ctemp04;

        aoffset1 += 2;
        aoffset2 += 2;
        boffset += 4;
        i --;
      }while(i > 0);
    }

    if (m & 1){
      ctemp01 = *(aoffset1 + 0);
      ctemp02 = *(aoffset2 + 0);

      *(boffset + 0) = ctemp01;
      *(boffset + 1) = ctemp02;

      aoffset1 ++;
      aoffset2 ++;
      boffset += 2;
    }
  } /* end of if (n & 2) */

  /* Final single vector: straight copy of m elements. */
  if (n & 1){
    aoffset1 = aoffset;

    i = m;
    if (i > 0){
      do{
        ctemp01 = *(aoffset1 + 0);

        *(boffset + 0) = ctemp01;

        aoffset1 ++;
        boffset ++;
        i --;
      }while(i > 0);
    }

  } /* end of if (n & 1) */

  return 0;
}
|
||||
244
kernel/power/sbgemm_tcopy_16_power10.c
Normal file
244
kernel/power/sbgemm_tcopy_16_power10.c
Normal file
@@ -0,0 +1,244 @@
|
||||
/*********************************************************************/
|
||||
/* Copyright 2009, 2010 The University of Texas at Austin. */
|
||||
/* All rights reserved. */
|
||||
/* */
|
||||
/* Redistribution and use in source and binary forms, with or */
|
||||
/* without modification, are permitted provided that the following */
|
||||
/* conditions are met: */
|
||||
/* */
|
||||
/* 1. Redistributions of source code must retain the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer. */
|
||||
/* */
|
||||
/* 2. Redistributions in binary form must reproduce the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer in the documentation and/or other materials */
|
||||
/* provided with the distribution. */
|
||||
/* */
|
||||
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
|
||||
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
|
||||
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
|
||||
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
|
||||
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
|
||||
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
|
||||
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
|
||||
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
|
||||
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
|
||||
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
|
||||
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
|
||||
/* POSSIBILITY OF SUCH DAMAGE. */
|
||||
/* */
|
||||
/* The views and conclusions contained in the software and */
|
||||
/* documentation are those of the authors and should not be */
|
||||
/* interpreted as representing official policies, either expressed */
|
||||
/* or implied, of The University of Texas at Austin. */
|
||||
/*********************************************************************/
|
||||
|
||||
#include <stdio.h>
|
||||
#include <altivec.h>
|
||||
#include "common.h"
|
||||
typedef IFLOAT vec_bf16 __attribute__ ((vector_size (16)));
|
||||
|
||||
/*
 * Panel-copy ("ncopy"-style) kernel for the Power10 SBGEMM path.
 *
 * Copies an m x n block of matrix `a` (leading dimension `lda`, elements of
 * type IFLOAT) into the packed buffer `b`.  A vec_bf16 is a 16-byte vector
 * of IFLOAT; for bf16 (2-byte) elements that is 8 values per vector --
 * NOTE(review): element width of IFLOAT comes from common.h, confirm there.
 *
 * Layout produced (as the code below shows): the outer `j` loop walks the
 * contiguous dimension in 16-element slices; the inner `i` loop takes two
 * lda-strided rows at a time and interleaves them element-wise with
 * vec_mergeh/vec_mergel, so the output alternates row1/row2 elements.
 * Tail branches (n & 8/4/2/1, m & 1) copy the remaining fringes with
 * progressively narrower vector or scalar moves.
 *
 * Returns 0 unconditionally.
 */
int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){

  BLASLONG i, j;

  IFLOAT *aoffset;              /* start of the current column slice        */
  IFLOAT *aoffset1, *aoffset2;  /* the two lda-strided rows being merged    */
  IFLOAT *boffset;              /* write cursor into the packed buffer      */

  vec_bf16 vtemp01, vtemp02, vtemp03, vtemp04;
  IFLOAT ctemp01, ctemp02, ctemp03, ctemp04;
  IFLOAT ctemp05, ctemp06, ctemp07, ctemp08;

  aoffset = a;
  boffset = b;

#if 0
  /* debug trace, compiled out */
  fprintf(stderr, "m = %d n = %d\n", m, n);
#endif

  /* Main case: 16-element-wide slices (two vec_bf16 loads per row). */
  j = (n >> 4);
  if (j > 0){
    do{
      aoffset1 = aoffset;
      aoffset2 = aoffset + lda;
      aoffset += 16;

      /* Process rows in pairs, interleaving them element-wise. */
      i = (m >> 1);
      if (i > 0){
        do{
          vtemp01 = *(vec_bf16 *)(aoffset1);
          vtemp02 = *(vec_bf16 *)(aoffset1+8);
          vtemp03 = *(vec_bf16 *)(aoffset2);
          vtemp04 = *(vec_bf16 *)(aoffset2+8);
          /* mergeh/mergel: out = r1[k], r2[k], r1[k+1], r2[k+1], ... */
          *(vec_bf16 *)(boffset + 0) = vec_mergeh(vtemp01, vtemp03);
          *(vec_bf16 *)(boffset + 8) = vec_mergel(vtemp01, vtemp03);
          *(vec_bf16 *)(boffset + 16) = vec_mergeh(vtemp02, vtemp04);
          *(vec_bf16 *)(boffset + 24) = vec_mergel(vtemp02, vtemp04);
          aoffset1 += 2 * lda;
          aoffset2 += 2 * lda;
          boffset += 32;

          i --;
        }while(i > 0);
      }

      /* Odd final row: copied straight through, no interleave partner. */
      if (m & 1){
        vtemp01 = *(vec_bf16 *)(aoffset1);
        vtemp02 = *(vec_bf16 *)(aoffset1+8);
        *(vec_bf16 *)(boffset + 0) = vtemp01;
        *(vec_bf16 *)(boffset + 8) = vtemp02;
        boffset += 16;
      }

      j--;
    }while(j > 0);
  } /* end of if(j > 0) */

  /* Tail: 8-element slice (one vector load per row). */
  if (n & 8){
    aoffset1 = aoffset;
    aoffset2 = aoffset + lda;
    aoffset += 8;

    i = (m >> 1);
    if (i > 0){
      do{
        vtemp01 = *(vec_bf16 *)(aoffset1);
        vtemp03 = *(vec_bf16 *)(aoffset2);
        *(vec_bf16 *)(boffset + 0) = vec_mergeh(vtemp01, vtemp03);
        *(vec_bf16 *)(boffset + 8) = vec_mergel(vtemp01, vtemp03);

        aoffset1 += 2 * lda;
        aoffset2 += 2 * lda;
        boffset += 16;

        i --;
      }while(i > 0);
    }

    if (m & 1){
      vtemp01 = *(vec_bf16 *)(aoffset1);
      *(vec_bf16 *)(boffset + 0) = vtemp01;
      boffset += 8;
    }
  }

  /* Tail: 4-element slice, scalar loads; stores interleave the two rows
   * (r1[0], r2[0], r1[1], r2[1], ...), matching the vector path above. */
  if (n & 4){
    aoffset1 = aoffset;
    aoffset2 = aoffset + lda;
    aoffset += 4;

    i = (m >> 1);
    if (i > 0){
      do{
        ctemp01 = *(aoffset1 + 0);
        ctemp02 = *(aoffset1 + 1);
        ctemp03 = *(aoffset1 + 2);
        ctemp04 = *(aoffset1 + 3);

        ctemp05 = *(aoffset2 + 0);
        ctemp06 = *(aoffset2 + 1);
        ctemp07 = *(aoffset2 + 2);
        ctemp08 = *(aoffset2 + 3);

        *(boffset + 0) = ctemp01;
        *(boffset + 1) = ctemp05;
        *(boffset + 2) = ctemp02;
        *(boffset + 3) = ctemp06;
        *(boffset + 4) = ctemp03;
        *(boffset + 5) = ctemp07;
        *(boffset + 6) = ctemp04;
        *(boffset + 7) = ctemp08;

        aoffset1 += 2 * lda;
        aoffset2 += 2 * lda;
        boffset += 8;

        i --;
      }while(i > 0);
    }

    if (m & 1){
      ctemp01 = *(aoffset1 + 0);
      ctemp02 = *(aoffset1 + 1);
      ctemp03 = *(aoffset1 + 2);
      ctemp04 = *(aoffset1 + 3);

      *(boffset + 0) = ctemp01;
      *(boffset + 1) = ctemp02;
      *(boffset + 2) = ctemp03;
      *(boffset + 3) = ctemp04;

      boffset += 4;
    }
  }

  /* Tail: 2-element slice.  NOTE(review): unlike the n&4 case, the stores
   * here keep each row's pair together (r1[0], r1[1], r2[0], r2[1]) rather
   * than interleaving element-wise -- presumably the intended packing for
   * 2-wide panels; confirm against the consuming GEMM kernel. */
  if (n & 2){
    aoffset1 = aoffset;
    aoffset2 = aoffset + lda;
    aoffset += 2;

    i = (m >> 1);
    if (i > 0){
      do{
        ctemp01 = *(aoffset1 + 0);
        ctemp02 = *(aoffset1 + 1);
        ctemp03 = *(aoffset2 + 0);
        ctemp04 = *(aoffset2 + 1);

        *(boffset + 0) = ctemp01;
        *(boffset + 1) = ctemp02;
        *(boffset + 2) = ctemp03;
        *(boffset + 3) = ctemp04;

        aoffset1 += 2 * lda;
        aoffset2 += 2 * lda;
        boffset += 4;

        i --;
      }while(i > 0);
    }

    if (m & 1){
      ctemp01 = *(aoffset1 + 0);
      ctemp02 = *(aoffset1 + 1);

      *(boffset + 0) = ctemp01;
      *(boffset + 1) = ctemp02;
      boffset += 2;
    }
  }

  /* Tail: single remaining element per row pair. */
  if (n & 1){
    aoffset1 = aoffset;
    aoffset2 = aoffset + lda;

    i = (m >> 1);
    if (i > 0){
      do{
        ctemp01 = *(aoffset1 + 0);
        ctemp02 = *(aoffset2 + 0);

        *(boffset + 0) = ctemp01;
        *(boffset + 1) = ctemp02;

        aoffset1 += 2 * lda;
        aoffset2 += 2 * lda;
        boffset += 2;

        i --;
      }while(i > 0);
    }

    if (m & 1){
      ctemp01 = *(aoffset1 + 0);
      *(boffset + 0) = ctemp01;
      /* last element of the buffer; cursor advance not needed */
      // boffset += 1;
    }
  }

  return 0;
}
|
||||
659
kernel/power/sbgemm_tcopy_8_power10.c
Normal file
659
kernel/power/sbgemm_tcopy_8_power10.c
Normal file
@@ -0,0 +1,659 @@
|
||||
/*********************************************************************/
|
||||
/* Copyright 2009, 2010 The University of Texas at Austin. */
|
||||
/* All rights reserved. */
|
||||
/* */
|
||||
/* Redistribution and use in source and binary forms, with or */
|
||||
/* without modification, are permitted provided that the following */
|
||||
/* conditions are met: */
|
||||
/* */
|
||||
/* 1. Redistributions of source code must retain the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer. */
|
||||
/* */
|
||||
/* 2. Redistributions in binary form must reproduce the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer in the documentation and/or other materials */
|
||||
/* provided with the distribution. */
|
||||
/* */
|
||||
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
|
||||
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
|
||||
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
|
||||
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
|
||||
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
|
||||
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
|
||||
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
|
||||
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
|
||||
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
|
||||
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
|
||||
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
|
||||
/* POSSIBILITY OF SUCH DAMAGE. */
|
||||
/* */
|
||||
/* The views and conclusions contained in the software and */
|
||||
/* documentation are those of the authors and should not be */
|
||||
/* interpreted as representing official policies, either expressed */
|
||||
/* or implied, of The University of Texas at Austin. */
|
||||
/*********************************************************************/
|
||||
|
||||
#include <stdio.h>
|
||||
#include "common.h"
|
||||
#include <altivec.h>
|
||||
|
||||
typedef IFLOAT vec_bf16 __attribute__ ((vector_size (16)));
|
||||
|
||||
/*
 * Transpose-copy ("tcopy_8") kernel for the Power10 SBGEMM path
 * (surrounding diff metadata names this file sbgemm_tcopy_8_power10.c).
 *
 * Packs an m x n block of `a` (leading dimension `lda`, IFLOAT elements;
 * vec_bf16 is a 16-byte vector of IFLOAT) into `b`, 8 rows at a time.
 *
 * Output regions (established by the boffset2/3/4 initialisation below):
 *   boffset1 -- full 8-column panels, walked with stride 8*m per column
 *               block so panels from all row groups stay column-major;
 *   boffset2 = b + m*(n & ~7) -- the 4-column tail region;
 *   boffset3 = b + m*(n & ~3) -- the 2-column tail region;
 *   boffset4 = b + m*(n & ~1) -- the 1-column tail region.
 * Row groups of 8 are handled first, then m & 4 / 2 / 1 fringes.
 *
 * Returns 0 unconditionally.
 */
int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){

  BLASLONG i, j;

  IFLOAT *aoffset;                 /* start of the current row group       */
  IFLOAT *aoffset1, *aoffset2, *aoffset3, *aoffset4;
  IFLOAT *aoffset5, *aoffset6, *aoffset7, *aoffset8;

  IFLOAT *boffset, *boffset1, *boffset2, *boffset3, *boffset4;
  vec_bf16 vtemp01, vtemp02, vtemp03, vtemp04;
  vec_bf16 vtemp05, vtemp06, vtemp07, vtemp08;
  IFLOAT ctemp01, ctemp02, ctemp03, ctemp04;
  IFLOAT ctemp05, ctemp06, ctemp07, ctemp08;
  IFLOAT ctemp09, ctemp10, ctemp11, ctemp12;
  IFLOAT ctemp13, ctemp14, ctemp15, ctemp16;
  IFLOAT ctemp17, ctemp18, ctemp19, ctemp20;
  IFLOAT ctemp21, ctemp22, ctemp23, ctemp24;
  IFLOAT ctemp25, ctemp26, ctemp27, ctemp28;
  IFLOAT ctemp29, ctemp30, ctemp31, ctemp32;

  aoffset = a;
  boffset = b;

#if 0
  /* debug trace, compiled out */
  fprintf(stderr, "M = %d N = %d\n", m, n);
#endif

  /* Fixed start offsets for the 4-, 2- and 1-column tail regions. */
  boffset2 = b + m * (n & ~7);
  boffset3 = b + m * (n & ~3);
  boffset4 = b + m * (n & ~1);

  /* Main case: groups of 8 rows. */
  j = (m >> 3);
  if (j > 0){
    do{
      aoffset1 = aoffset;
      aoffset2 = aoffset1 + lda;
      aoffset3 = aoffset2 + lda;
      aoffset4 = aoffset3 + lda;
      aoffset5 = aoffset4 + lda;
      aoffset6 = aoffset5 + lda;
      aoffset7 = aoffset6 + lda;
      aoffset8 = aoffset7 + lda;
      aoffset += 8 * lda;

      boffset1 = boffset;
      boffset += 64;                /* next row group's first panel slot */

      /* 8 columns at a time: load one vector per row, then pairwise
       * interleave rows (1,2), (3,4), (5,6), (7,8) with mergeh/mergel. */
      i = (n >> 3);
      if (i > 0){
        do{
          vtemp01 = *(vec_bf16 *)(aoffset1);
          vtemp02 = *(vec_bf16 *)(aoffset2);
          vtemp03 = *(vec_bf16 *)(aoffset3);
          vtemp04 = *(vec_bf16 *)(aoffset4);
          vtemp05 = *(vec_bf16 *)(aoffset5);
          vtemp06 = *(vec_bf16 *)(aoffset6);
          vtemp07 = *(vec_bf16 *)(aoffset7);
          vtemp08 = *(vec_bf16 *)(aoffset8);
          aoffset1 += 8;
          aoffset2 += 8;
          aoffset3 += 8;
          aoffset4 += 8;
          aoffset5 += 8;
          aoffset6 += 8;
          aoffset7 += 8;
          aoffset8 += 8;

          *(vec_bf16 *)(boffset1 + 0) = vec_mergeh(vtemp01, vtemp02);
          *(vec_bf16 *)(boffset1 + 8) = vec_mergel(vtemp01, vtemp02);
          *(vec_bf16 *)(boffset1 + 16) = vec_mergeh(vtemp03, vtemp04);
          *(vec_bf16 *)(boffset1 + 24) = vec_mergel(vtemp03, vtemp04);
          *(vec_bf16 *)(boffset1 + 32) = vec_mergeh(vtemp05, vtemp06);
          *(vec_bf16 *)(boffset1 + 40) = vec_mergel(vtemp05, vtemp06);
          *(vec_bf16 *)(boffset1 + 48) = vec_mergeh(vtemp07, vtemp08);
          *(vec_bf16 *)(boffset1 + 56) = vec_mergel(vtemp07, vtemp08);

          boffset1 += m * 8;        /* step over all row groups' panels */
          i --;
        }while(i > 0);
      }

      /* 4-column tail: scalar loads, rows interleaved pairwise
       * (r1[k], r2[k] alternating), written to the boffset2 region. */
      if (n & 4){
        ctemp01 = *(aoffset1 + 0);
        ctemp02 = *(aoffset1 + 1);
        ctemp03 = *(aoffset1 + 2);
        ctemp04 = *(aoffset1 + 3);
        aoffset1 += 4;

        ctemp05 = *(aoffset2 + 0);
        ctemp06 = *(aoffset2 + 1);
        ctemp07 = *(aoffset2 + 2);
        ctemp08 = *(aoffset2 + 3);
        aoffset2 += 4;

        ctemp09 = *(aoffset3 + 0);
        ctemp10 = *(aoffset3 + 1);
        ctemp11 = *(aoffset3 + 2);
        ctemp12 = *(aoffset3 + 3);
        aoffset3 += 4;

        ctemp13 = *(aoffset4 + 0);
        ctemp14 = *(aoffset4 + 1);
        ctemp15 = *(aoffset4 + 2);
        ctemp16 = *(aoffset4 + 3);
        aoffset4 += 4;

        ctemp17 = *(aoffset5 + 0);
        ctemp18 = *(aoffset5 + 1);
        ctemp19 = *(aoffset5 + 2);
        ctemp20 = *(aoffset5 + 3);
        aoffset5 += 4;

        ctemp21 = *(aoffset6 + 0);
        ctemp22 = *(aoffset6 + 1);
        ctemp23 = *(aoffset6 + 2);
        ctemp24 = *(aoffset6 + 3);
        aoffset6 += 4;

        ctemp25 = *(aoffset7 + 0);
        ctemp26 = *(aoffset7 + 1);
        ctemp27 = *(aoffset7 + 2);
        ctemp28 = *(aoffset7 + 3);
        aoffset7 += 4;

        ctemp29 = *(aoffset8 + 0);
        ctemp30 = *(aoffset8 + 1);
        ctemp31 = *(aoffset8 + 2);
        ctemp32 = *(aoffset8 + 3);
        aoffset8 += 4;

        *(boffset2 + 0) = ctemp01;
        *(boffset2 + 1) = ctemp05;
        *(boffset2 + 2) = ctemp02;
        *(boffset2 + 3) = ctemp06;
        *(boffset2 + 4) = ctemp03;
        *(boffset2 + 5) = ctemp07;
        *(boffset2 + 6) = ctemp04;
        *(boffset2 + 7) = ctemp08;

        *(boffset2 + 8) = ctemp09;
        *(boffset2 + 9) = ctemp13;
        *(boffset2 + 10) = ctemp10;
        *(boffset2 + 11) = ctemp14;
        *(boffset2 + 12) = ctemp11;
        *(boffset2 + 13) = ctemp15;
        *(boffset2 + 14) = ctemp12;
        *(boffset2 + 15) = ctemp16;

        *(boffset2 + 16) = ctemp17;
        *(boffset2 + 17) = ctemp21;
        *(boffset2 + 18) = ctemp18;
        *(boffset2 + 19) = ctemp22;
        *(boffset2 + 20) = ctemp19;
        *(boffset2 + 21) = ctemp23;
        *(boffset2 + 22) = ctemp20;
        *(boffset2 + 23) = ctemp24;

        *(boffset2 + 24) = ctemp25;
        *(boffset2 + 25) = ctemp29;
        *(boffset2 + 26) = ctemp26;
        *(boffset2 + 27) = ctemp30;
        *(boffset2 + 28) = ctemp27;
        *(boffset2 + 29) = ctemp31;
        *(boffset2 + 30) = ctemp28;
        *(boffset2 + 31) = ctemp32;

        boffset2 += 32;
      }

      /* 2-column tail: each row's pair stored contiguously (no pairwise
       * interleave), written to the boffset3 region. */
      if (n & 2){
        ctemp01 = *(aoffset1 + 0);
        ctemp02 = *(aoffset1 + 1);
        aoffset1 += 2;

        ctemp03 = *(aoffset2 + 0);
        ctemp04 = *(aoffset2 + 1);
        aoffset2 += 2;

        ctemp05 = *(aoffset3 + 0);
        ctemp06 = *(aoffset3 + 1);
        aoffset3 += 2;

        ctemp07 = *(aoffset4 + 0);
        ctemp08 = *(aoffset4 + 1);
        aoffset4 += 2;

        ctemp09 = *(aoffset5 + 0);
        ctemp10 = *(aoffset5 + 1);
        aoffset5 += 2;

        ctemp11 = *(aoffset6 + 0);
        ctemp12 = *(aoffset6 + 1);
        aoffset6 += 2;

        ctemp13 = *(aoffset7 + 0);
        ctemp14 = *(aoffset7 + 1);
        aoffset7 += 2;

        ctemp15 = *(aoffset8 + 0);
        ctemp16 = *(aoffset8 + 1);
        aoffset8 += 2;

        *(boffset3 + 0) = ctemp01;
        *(boffset3 + 1) = ctemp02;
        *(boffset3 + 2) = ctemp03;
        *(boffset3 + 3) = ctemp04;
        *(boffset3 + 4) = ctemp05;
        *(boffset3 + 5) = ctemp06;
        *(boffset3 + 6) = ctemp07;
        *(boffset3 + 7) = ctemp08;
        *(boffset3 + 8) = ctemp09;
        *(boffset3 + 9) = ctemp10;
        *(boffset3 + 10) = ctemp11;
        *(boffset3 + 11) = ctemp12;
        *(boffset3 + 12) = ctemp13;
        *(boffset3 + 13) = ctemp14;
        *(boffset3 + 14) = ctemp15;
        *(boffset3 + 15) = ctemp16;
        boffset3 += 16;
      }

      /* 1-column tail: one element per row, written to the boffset4 region. */
      if (n & 1){
        ctemp01 = *(aoffset1 + 0);
        aoffset1 ++;
        ctemp02 = *(aoffset2 + 0);
        aoffset2 ++;
        ctemp03 = *(aoffset3 + 0);
        aoffset3 ++;
        ctemp04 = *(aoffset4 + 0);
        aoffset4 ++;
        ctemp05 = *(aoffset5 + 0);
        aoffset5 ++;
        ctemp06 = *(aoffset6 + 0);
        aoffset6 ++;
        ctemp07 = *(aoffset7 + 0);
        aoffset7 ++;
        ctemp08 = *(aoffset8 + 0);
        aoffset8 ++;

        *(boffset4 + 0) = ctemp01;
        *(boffset4 + 1) = ctemp02;
        *(boffset4 + 2) = ctemp03;
        *(boffset4 + 3) = ctemp04;
        *(boffset4 + 4) = ctemp05;
        *(boffset4 + 5) = ctemp06;
        *(boffset4 + 6) = ctemp07;
        *(boffset4 + 7) = ctemp08;
        boffset4 += 8;
      }

      j--;
    }while(j > 0);
  }

  /* Fringe: 4 remaining rows (scalar version of the 8-row path). */
  if (m & 4){

    aoffset1 = aoffset;
    aoffset2 = aoffset1 + lda;
    aoffset3 = aoffset2 + lda;
    aoffset4 = aoffset3 + lda;
    aoffset += 4 * lda;

    boffset1 = boffset;
    boffset += 32;

    i = (n >> 3);
    if (i > 0){

      do{
        ctemp01 = *(aoffset1 + 0);
        ctemp02 = *(aoffset1 + 1);
        ctemp03 = *(aoffset1 + 2);
        ctemp04 = *(aoffset1 + 3);
        ctemp05 = *(aoffset1 + 4);
        ctemp06 = *(aoffset1 + 5);
        ctemp07 = *(aoffset1 + 6);
        ctemp08 = *(aoffset1 + 7);
        aoffset1 += 8;

        ctemp09 = *(aoffset2 + 0);
        ctemp10 = *(aoffset2 + 1);
        ctemp11 = *(aoffset2 + 2);
        ctemp12 = *(aoffset2 + 3);
        ctemp13 = *(aoffset2 + 4);
        ctemp14 = *(aoffset2 + 5);
        ctemp15 = *(aoffset2 + 6);
        ctemp16 = *(aoffset2 + 7);
        aoffset2 += 8;

        ctemp17 = *(aoffset3 + 0);
        ctemp18 = *(aoffset3 + 1);
        ctemp19 = *(aoffset3 + 2);
        ctemp20 = *(aoffset3 + 3);
        ctemp21 = *(aoffset3 + 4);
        ctemp22 = *(aoffset3 + 5);
        ctemp23 = *(aoffset3 + 6);
        ctemp24 = *(aoffset3 + 7);
        aoffset3 += 8;

        ctemp25 = *(aoffset4 + 0);
        ctemp26 = *(aoffset4 + 1);
        ctemp27 = *(aoffset4 + 2);
        ctemp28 = *(aoffset4 + 3);
        ctemp29 = *(aoffset4 + 4);
        ctemp30 = *(aoffset4 + 5);
        ctemp31 = *(aoffset4 + 6);
        ctemp32 = *(aoffset4 + 7);
        aoffset4 += 8;

        /* rows (1,2) interleaved, then rows (3,4) -- same pattern the
         * vector path produces with mergeh/mergel */
        *(boffset1 + 0) = ctemp01;
        *(boffset1 + 1) = ctemp09;
        *(boffset1 + 2) = ctemp02;
        *(boffset1 + 3) = ctemp10;
        *(boffset1 + 4) = ctemp03;
        *(boffset1 + 5) = ctemp11;
        *(boffset1 + 6) = ctemp04;
        *(boffset1 + 7) = ctemp12;

        *(boffset1 + 8) = ctemp05;
        *(boffset1 + 9) = ctemp13;
        *(boffset1 + 10) = ctemp06;
        *(boffset1 + 11) = ctemp14;
        *(boffset1 + 12) = ctemp07;
        *(boffset1 + 13) = ctemp15;
        *(boffset1 + 14) = ctemp08;
        *(boffset1 + 15) = ctemp16;

        *(boffset1 + 16) = ctemp17;
        *(boffset1 + 17) = ctemp25;
        *(boffset1 + 18) = ctemp18;
        *(boffset1 + 19) = ctemp26;
        *(boffset1 + 20) = ctemp19;
        *(boffset1 + 21) = ctemp27;
        *(boffset1 + 22) = ctemp20;
        *(boffset1 + 23) = ctemp28;

        *(boffset1 + 24) = ctemp21;
        *(boffset1 + 25) = ctemp29;
        *(boffset1 + 26) = ctemp22;
        *(boffset1 + 27) = ctemp30;
        *(boffset1 + 28) = ctemp23;
        *(boffset1 + 29) = ctemp31;
        *(boffset1 + 30) = ctemp24;
        *(boffset1 + 31) = ctemp32;

        boffset1 += 8 * m;
        i --;
      }while(i > 0);
    }

    if (n & 4) {
      ctemp01 = *(aoffset1 + 0);
      ctemp02 = *(aoffset1 + 1);
      ctemp03 = *(aoffset1 + 2);
      ctemp04 = *(aoffset1 + 3);
      aoffset1 += 4;

      ctemp05 = *(aoffset2 + 0);
      ctemp06 = *(aoffset2 + 1);
      ctemp07 = *(aoffset2 + 2);
      ctemp08 = *(aoffset2 + 3);
      aoffset2 += 4;

      ctemp09 = *(aoffset3 + 0);
      ctemp10 = *(aoffset3 + 1);
      ctemp11 = *(aoffset3 + 2);
      ctemp12 = *(aoffset3 + 3);
      aoffset3 += 4;

      ctemp13 = *(aoffset4 + 0);
      ctemp14 = *(aoffset4 + 1);
      ctemp15 = *(aoffset4 + 2);
      ctemp16 = *(aoffset4 + 3);
      aoffset4 += 4;

      *(boffset2 + 0) = ctemp01;
      *(boffset2 + 1) = ctemp05;
      *(boffset2 + 2) = ctemp02;
      *(boffset2 + 3) = ctemp06;
      *(boffset2 + 4) = ctemp03;
      *(boffset2 + 5) = ctemp07;
      *(boffset2 + 6) = ctemp04;
      *(boffset2 + 7) = ctemp08;

      *(boffset2 + 8) = ctemp09;
      *(boffset2 + 9) = ctemp13;
      *(boffset2 + 10) = ctemp10;
      *(boffset2 + 11) = ctemp14;
      *(boffset2 + 12) = ctemp11;
      *(boffset2 + 13) = ctemp15;
      *(boffset2 + 14) = ctemp12;
      *(boffset2 + 15) = ctemp16;
      boffset2 += 16;
    }

    if (n & 2){
      ctemp01 = *(aoffset1 + 0);
      ctemp02 = *(aoffset1 + 1);
      aoffset1 += 2;

      ctemp03 = *(aoffset2 + 0);
      ctemp04 = *(aoffset2 + 1);
      aoffset2 += 2;

      ctemp05 = *(aoffset3 + 0);
      ctemp06 = *(aoffset3 + 1);
      aoffset3 += 2;

      ctemp07 = *(aoffset4 + 0);
      ctemp08 = *(aoffset4 + 1);
      aoffset4 += 2;

      *(boffset3 + 0) = ctemp01;
      *(boffset3 + 1) = ctemp02;
      *(boffset3 + 2) = ctemp03;
      *(boffset3 + 3) = ctemp04;
      *(boffset3 + 4) = ctemp05;
      *(boffset3 + 5) = ctemp06;
      *(boffset3 + 6) = ctemp07;
      *(boffset3 + 7) = ctemp08;
      boffset3 += 8;
    }

    if (n & 1){
      ctemp01 = *(aoffset1 + 0);
      aoffset1 ++;
      ctemp02 = *(aoffset2 + 0);
      aoffset2 ++;
      ctemp03 = *(aoffset3 + 0);
      aoffset3 ++;
      ctemp04 = *(aoffset4 + 0);
      aoffset4 ++;

      *(boffset4 + 0) = ctemp01;
      *(boffset4 + 1) = ctemp02;
      *(boffset4 + 2) = ctemp03;
      *(boffset4 + 3) = ctemp04;
      boffset4 += 4;
    }
  }

  /* Fringe: 2 remaining rows. */
  if (m & 2){
    aoffset1 = aoffset;
    aoffset2 = aoffset1 + lda;
    aoffset += 2 * lda;

    boffset1 = boffset;
    boffset += 16;

    i = (n >> 3);
    if (i > 0){
      do{
        ctemp01 = *(aoffset1 + 0);
        ctemp02 = *(aoffset1 + 1);
        ctemp03 = *(aoffset1 + 2);
        ctemp04 = *(aoffset1 + 3);
        ctemp05 = *(aoffset1 + 4);
        ctemp06 = *(aoffset1 + 5);
        ctemp07 = *(aoffset1 + 6);
        ctemp08 = *(aoffset1 + 7);
        aoffset1 += 8;

        ctemp09 = *(aoffset2 + 0);
        ctemp10 = *(aoffset2 + 1);
        ctemp11 = *(aoffset2 + 2);
        ctemp12 = *(aoffset2 + 3);
        ctemp13 = *(aoffset2 + 4);
        ctemp14 = *(aoffset2 + 5);
        ctemp15 = *(aoffset2 + 6);
        ctemp16 = *(aoffset2 + 7);
        aoffset2 += 8;

        *(boffset1 + 0) = ctemp01;
        *(boffset1 + 1) = ctemp09;
        *(boffset1 + 2) = ctemp02;
        *(boffset1 + 3) = ctemp10;
        *(boffset1 + 4) = ctemp03;
        *(boffset1 + 5) = ctemp11;
        *(boffset1 + 6) = ctemp04;
        *(boffset1 + 7) = ctemp12;

        *(boffset1 + 8) = ctemp05;
        *(boffset1 + 9) = ctemp13;
        *(boffset1 + 10) = ctemp06;
        *(boffset1 + 11) = ctemp14;
        *(boffset1 + 12) = ctemp07;
        *(boffset1 + 13) = ctemp15;
        *(boffset1 + 14) = ctemp08;
        *(boffset1 + 15) = ctemp16;

        boffset1 += 8 * m;
        i --;
      }while(i > 0);
    }

    if (n & 4){
      ctemp01 = *(aoffset1 + 0);
      ctemp02 = *(aoffset1 + 1);
      ctemp03 = *(aoffset1 + 2);
      ctemp04 = *(aoffset1 + 3);
      aoffset1 += 4;

      ctemp05 = *(aoffset2 + 0);
      ctemp06 = *(aoffset2 + 1);
      ctemp07 = *(aoffset2 + 2);
      ctemp08 = *(aoffset2 + 3);
      aoffset2 += 4;

      *(boffset2 + 0) = ctemp01;
      *(boffset2 + 1) = ctemp05;
      *(boffset2 + 2) = ctemp02;
      *(boffset2 + 3) = ctemp06;
      *(boffset2 + 4) = ctemp03;
      *(boffset2 + 5) = ctemp07;
      *(boffset2 + 6) = ctemp04;
      *(boffset2 + 7) = ctemp08;
      boffset2 += 8;
    }

    if (n & 2){
      ctemp01 = *(aoffset1 + 0);
      ctemp02 = *(aoffset1 + 1);
      aoffset1 += 2;

      ctemp03 = *(aoffset2 + 0);
      ctemp04 = *(aoffset2 + 1);
      aoffset2 += 2;

      *(boffset3 + 0) = ctemp01;
      *(boffset3 + 1) = ctemp02;
      *(boffset3 + 2) = ctemp03;
      *(boffset3 + 3) = ctemp04;
      boffset3 += 4;
    }

    if (n & 1){
      ctemp01 = *(aoffset1 + 0);
      aoffset1 ++;
      ctemp02 = *(aoffset2 + 0);
      aoffset2 ++;

      *(boffset4 + 0) = ctemp01;
      *(boffset4 + 1) = ctemp02;
      boffset4 += 2;
    }
  }

  /* Fringe: single remaining row (straight copies, no interleave partner).
   * The commented-out cursor advances are dead stores on the final row. */
  if (m & 1){
    aoffset1 = aoffset;
    // aoffset += lda;

    boffset1 = boffset;
    // boffset += 8;

    i = (n >> 3);
    if (i > 0){
      do{
        ctemp01 = *(aoffset1 + 0);
        ctemp02 = *(aoffset1 + 1);
        ctemp03 = *(aoffset1 + 2);
        ctemp04 = *(aoffset1 + 3);
        ctemp05 = *(aoffset1 + 4);
        ctemp06 = *(aoffset1 + 5);
        ctemp07 = *(aoffset1 + 6);
        ctemp08 = *(aoffset1 + 7);
        aoffset1 += 8;

        *(boffset1 + 0) = ctemp01;
        *(boffset1 + 1) = ctemp02;
        *(boffset1 + 2) = ctemp03;
        *(boffset1 + 3) = ctemp04;
        *(boffset1 + 4) = ctemp05;
        *(boffset1 + 5) = ctemp06;
        *(boffset1 + 6) = ctemp07;
        *(boffset1 + 7) = ctemp08;

        boffset1 += 8 * m;
        i --;
      }while(i > 0);
    }

    if (n & 4){
      ctemp01 = *(aoffset1 + 0);
      ctemp02 = *(aoffset1 + 1);
      ctemp03 = *(aoffset1 + 2);
      ctemp04 = *(aoffset1 + 3);
      aoffset1 += 4;

      *(boffset2 + 0) = ctemp01;
      *(boffset2 + 1) = ctemp02;
      *(boffset2 + 2) = ctemp03;
      *(boffset2 + 3) = ctemp04;
      // boffset2 += 4;
    }

    if (n & 2){
      ctemp01 = *(aoffset1 + 0);
      ctemp02 = *(aoffset1 + 1);
      aoffset1 += 2;

      *(boffset3 + 0) = ctemp01;
      *(boffset3 + 1) = ctemp02;
      // boffset3 += 2;
    }

    if (n & 1){
      ctemp01 = *(aoffset1 + 0);
      aoffset1 ++;
      *(boffset4 + 0) = ctemp01;
      boffset4 ++;
    }
  }

  return 0;
}
|
||||
@@ -35,9 +35,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
#include "common.h"
|
||||
|
||||
#if defined(POWER8) || defined(POWER9)
|
||||
#if defined(POWER8) || defined(POWER9) || defined(POWER10)
|
||||
#if defined(__VEC__) || defined(__ALTIVEC__)
|
||||
#include "scopy_microk_power8.c"
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#ifndef HAVE_KERNEL_32
|
||||
|
||||
|
||||
@@ -35,9 +35,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
#include "common.h"
|
||||
|
||||
#if defined(POWER8) || defined(POWER9)
|
||||
#if defined(POWER8) || defined(POWER9) || defined(POWER10)
|
||||
#if defined(__VEC__) || defined(__ALTIVEC__)
|
||||
|
||||
#include "sdot_microk_power8.c"
|
||||
#endif
|
||||
#endif
|
||||
|
||||
|
||||
#ifndef HAVE_KERNEL_16
|
||||
|
||||
1386
kernel/power/sgemm_kernel_power10.c
Normal file
1386
kernel/power/sgemm_kernel_power10.c
Normal file
File diff suppressed because it is too large
Load Diff
@@ -24,7 +24,10 @@ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
#if !defined(__VEC__) || !defined(__ALTIVEC__)
|
||||
#include "../arm/gemv_n.c"
|
||||
|
||||
#else
|
||||
|
||||
#include "common.h"
|
||||
|
||||
@@ -463,4 +466,5 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO
|
||||
return(0);
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
@@ -24,6 +24,10 @@ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
#if !defined(__VEC__) || !defined(__ALTIVEC__)
|
||||
#include "../arm/gemv_t.c"
|
||||
|
||||
#else
|
||||
|
||||
#include "common.h"
|
||||
|
||||
@@ -477,3 +481,4 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO
|
||||
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
@@ -39,9 +39,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
#pragma GCC optimize "O1"
|
||||
|
||||
#if defined(POWER8) || defined(POWER9)
|
||||
#if defined(POWER8) || defined(POWER9) || defined(POWER10)
|
||||
#if defined(__VEC__) || defined(__ALTIVEC__)
|
||||
#include "srot_microk_power8.c"
|
||||
#endif
|
||||
#endif
|
||||
|
||||
|
||||
#ifndef HAVE_KERNEL_16
|
||||
|
||||
@@ -35,9 +35,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
#include "common.h"
|
||||
|
||||
#if defined(POWER8) || defined(POWER9)
|
||||
#if defined(POWER8) || defined(POWER9) || defined(POWER10)
|
||||
#if defined(__VEC__) || defined(__ALTIVEC__)
|
||||
#include "sscal_microk_power8.c"
|
||||
#endif
|
||||
#endif
|
||||
|
||||
|
||||
#if !defined(HAVE_KERNEL_16)
|
||||
|
||||
@@ -35,9 +35,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
#include "common.h"
|
||||
|
||||
#if defined(POWER8) || defined(POWER9)
|
||||
#if defined(POWER8) || defined(POWER9) || defined(POWER10)
|
||||
#if defined(__VEC__) || defined(__ALTIVEC__)
|
||||
#include "sswap_microk_power8.c"
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#ifndef HAVE_KERNEL_32
|
||||
|
||||
|
||||
@@ -46,9 +46,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
#endif
|
||||
|
||||
#if defined(POWER8) || defined(POWER9)
|
||||
#if defined(POWER8) || defined(POWER9) || defined(POWER10)
|
||||
#if defined(__VEC__) || defined(__ALTIVEC__)
|
||||
#include "zasum_microk_power8.c"
|
||||
#endif
|
||||
#endif
|
||||
|
||||
|
||||
#ifndef HAVE_KERNEL_8
|
||||
|
||||
@@ -36,9 +36,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
#include "common.h"
|
||||
|
||||
|
||||
#if defined(POWER8) || defined(POWER9)
|
||||
#if defined(POWER8) || defined(POWER9) || defined(POWER10)
|
||||
#if defined(__VEC__) || defined(__ALTIVEC__)
|
||||
#include "zaxpy_microk_power8.c"
|
||||
#endif
|
||||
#endif
|
||||
|
||||
|
||||
#ifndef HAVE_KERNEL_4
|
||||
|
||||
200
kernel/power/zaxpy_microk_power10.c
Normal file
200
kernel/power/zaxpy_microk_power10.c
Normal file
@@ -0,0 +1,200 @@
|
||||
/***************************************************************************
|
||||
Copyright (c) 2020, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#define HAVE_KERNEL_4 1
|
||||
static void zaxpy_kernel_4 (long n, double *x, double *y,
|
||||
double alpha_r, double alpha_i)
|
||||
{
|
||||
#if !defined(CONJ)
|
||||
static const double mvec[2] = { 1.0, -1.0 };
|
||||
#else
|
||||
static const double mvec[2] = { -1.0, 1.0 };
|
||||
#endif
|
||||
const double *mvecp = mvec;
|
||||
|
||||
__vector double t0;
|
||||
__vector double t1;
|
||||
__vector double t2;
|
||||
__vector double t3;
|
||||
__vector double t4;
|
||||
__vector double t5;
|
||||
__vector double t6;
|
||||
__vector double t7;
|
||||
long ytmp;
|
||||
|
||||
__asm__
|
||||
(
|
||||
XXSPLTD_S(32,%x15,0) // alpha_r
|
||||
XXSPLTD_S(33,%x16,0) // alpha_i
|
||||
"lxvd2x 36, 0, %17 \n\t" // mvec
|
||||
|
||||
#if !defined(CONJ)
|
||||
"xvmuldp 33, 33, 36 \n\t" // alpha_i * mvec
|
||||
#else
|
||||
"xvmuldp 32, 32, 36 \n\t" // alpha_r * mvec
|
||||
#endif
|
||||
|
||||
"mr %12, %3 \n\t"
|
||||
"dcbt 0, %2 \n\t"
|
||||
"dcbt 0, %3 \n\t"
|
||||
|
||||
|
||||
"lxvp 40, 0(%2) \n\t" // x0
|
||||
"lxvp 42, 32(%2) \n\t" // x2
|
||||
"lxvp 48, 0(%3) \n\t" // y0
|
||||
"lxvp 50, 32(%3) \n\t" // y2
|
||||
|
||||
XXSWAPD_S(%x4,40) // exchange real and imag part
|
||||
XXSWAPD_S(%x5,41) // exchange real and imag part
|
||||
XXSWAPD_S(%x6,42) // exchange real and imag part
|
||||
XXSWAPD_S(%x7,43) // exchange real and imag part
|
||||
|
||||
"lxvp 44, 64(%2) \n\t" // x4
|
||||
"lxvp 46, 96(%2) \n\t" // x6
|
||||
"lxvp 34, 64(%3) \n\t" // y4
|
||||
"lxvp 38, 96(%3) \n\t" // y6
|
||||
|
||||
XXSWAPD_S(%x8,44) // exchange real and imag part
|
||||
XXSWAPD_S(%x9,45) // exchange real and imag part
|
||||
XXSWAPD_S(%x10,46) // exchange real and imag part
|
||||
XXSWAPD_S(%x11,47) // exchange real and imag part
|
||||
|
||||
"addi %2, %2, 128 \n\t"
|
||||
"addi %3, %3, 128 \n\t"
|
||||
|
||||
"addic. %1, %1, -8 \n\t"
|
||||
"ble two%= \n\t"
|
||||
|
||||
".align 5 \n"
|
||||
"one%=: \n\t"
|
||||
|
||||
"xvmaddadp 48, 40, 32 \n\t" // alpha_r * x0_r , alpha_r * x0_i
|
||||
"xvmaddadp 49, 41, 32 \n\t"
|
||||
"lxvp 40, 0(%2) \n\t" // x0
|
||||
"xvmaddadp 50, 42, 32 \n\t"
|
||||
"xvmaddadp 51, 43, 32 \n\t"
|
||||
"lxvp 42, 32(%2) \n\t" // x2
|
||||
|
||||
"xvmaddadp 34, 44, 32 \n\t"
|
||||
"xvmaddadp 35, 45, 32 \n\t"
|
||||
"lxvp 44, 64(%2) \n\t" // x4
|
||||
"xvmaddadp 38, 46, 32 \n\t"
|
||||
"xvmaddadp 39, 47, 32 \n\t"
|
||||
"lxvp 46, 96(%2) \n\t" // x6
|
||||
|
||||
"xvmaddadp 48, %x4, 33 \n\t" // alpha_i * x0_i , alpha_i * x0_r
|
||||
"addi %2, %2, 128 \n\t"
|
||||
"xvmaddadp 49, %x5, 33 \n\t"
|
||||
"xvmaddadp 50, %x6, 33 \n\t"
|
||||
"xvmaddadp 51, %x7, 33 \n\t"
|
||||
|
||||
"xvmaddadp 34, %x8, 33 \n\t"
|
||||
"xvmaddadp 35, %x9, 33 \n\t"
|
||||
"xvmaddadp 38, %x10, 33 \n\t"
|
||||
"xvmaddadp 39, %x11, 33 \n\t"
|
||||
|
||||
"stxvp 48, 0(%12) \n\t"
|
||||
"stxvp 50, 32(%12) \n\t"
|
||||
"stxvp 34, 64(%12) \n\t"
|
||||
"stxvp 38, 96(%12) \n\t"
|
||||
|
||||
"addi %12, %12, 128 \n\t"
|
||||
|
||||
XXSWAPD_S(%x4,40) // exchange real and imag part
|
||||
XXSWAPD_S(%x5,41) // exchange real and imag part
|
||||
"lxvp 48, 0(%3) \n\t" // y0
|
||||
XXSWAPD_S(%x6,42) // exchange real and imag part
|
||||
XXSWAPD_S(%x7,43) // exchange real and imag part
|
||||
"lxvp 50, 32(%3) \n\t" // y2
|
||||
|
||||
XXSWAPD_S(%x8,44) // exchange real and imag part
|
||||
XXSWAPD_S(%x9,45) // exchange real and imag part
|
||||
"lxvp 34, 64(%3) \n\t" // y4
|
||||
XXSWAPD_S(%x10,46) // exchange real and imag part
|
||||
XXSWAPD_S(%x11,47) // exchange real and imag part
|
||||
"lxvp 38, 96(%3) \n\t" // y6
|
||||
|
||||
"addi %3, %3, 128 \n\t"
|
||||
|
||||
"addic. %1, %1, -8 \n\t"
|
||||
"bgt one%= \n"
|
||||
|
||||
"two%=: \n\t"
|
||||
"xvmaddadp 48, 40, 32 \n\t" // alpha_r * x0_r , alpha_r * x0_i
|
||||
"xvmaddadp 49, 41, 32 \n\t"
|
||||
"xvmaddadp 50, 42, 32 \n\t"
|
||||
"xvmaddadp 51, 43, 32 \n\t"
|
||||
|
||||
"xvmaddadp 34, 44, 32 \n\t"
|
||||
"xvmaddadp 35, 45, 32 \n\t"
|
||||
"xvmaddadp 38, 46, 32 \n\t"
|
||||
"xvmaddadp 39, 47, 32 \n\t"
|
||||
|
||||
"xvmaddadp 48, %x4, 33 \n\t" // alpha_i * x0_i , alpha_i * x0_r
|
||||
"xvmaddadp 49, %x5, 33 \n\t"
|
||||
"xvmaddadp 50, %x6, 33 \n\t"
|
||||
"xvmaddadp 51, %x7, 33 \n\t"
|
||||
|
||||
"xvmaddadp 34, %x8, 33 \n\t"
|
||||
"xvmaddadp 35, %x9, 33 \n\t"
|
||||
"xvmaddadp 38, %x10, 33 \n\t"
|
||||
"xvmaddadp 39, %x11, 33 \n\t"
|
||||
|
||||
"stxvp 48, 0(%12) \n\t"
|
||||
"stxvp 50, 32(%12) \n\t"
|
||||
"stxvp 34, 64(%12) \n\t"
|
||||
"stxvp 38, 96(%12) \n\t"
|
||||
|
||||
"#n=%1 x=%13=%2 y=%0=%3 alpha=(%15,%16) mvecp=%14=%17 ytmp=%12\n"
|
||||
"#t0=%x4 t1=%x5 t2=%x6 t3=%x7 t4=%x8 t5=%x9 t6=%x10 t7=%x11"
|
||||
:
|
||||
"+m" (*y),
|
||||
"+r" (n), // 1
|
||||
"+b" (x), // 2
|
||||
"+b" (y), // 3
|
||||
"=wa" (t0), // 4
|
||||
"=wa" (t1), // 5
|
||||
"=wa" (t2), // 6
|
||||
"=wa" (t3), // 7
|
||||
"=wa" (t4), // 8
|
||||
"=wa" (t5), // 9
|
||||
"=wa" (t6), // 10
|
||||
"=wa" (t7), // 11
|
||||
"=b" (ytmp) // 12
|
||||
:
|
||||
"m" (*x),
|
||||
"m" (*mvecp),
|
||||
"d" (alpha_r), // 15
|
||||
"d" (alpha_i), // 16
|
||||
"12" (mvecp) // 17
|
||||
:
|
||||
"cr0",
|
||||
"vs32","vs33","vs34","vs35","vs36","vs37","vs38","vs39",
|
||||
"vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47",
|
||||
"vs48","vs49","vs50","vs51"
|
||||
);
|
||||
}
|
||||
126
kernel/power/zaxpy_power10.c
Normal file
126
kernel/power/zaxpy_power10.c
Normal file
@@ -0,0 +1,126 @@
|
||||
/***************************************************************************
|
||||
Copyright (c) 2020, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
|
||||
|
||||
#if defined(__VEC__) || defined(__ALTIVEC__)
|
||||
#include "zaxpy_microk_power10.c"
|
||||
#endif
|
||||
|
||||
|
||||
#ifndef HAVE_KERNEL_4
|
||||
|
||||
static void zaxpy_kernel_4(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT da_r,FLOAT da_i)
|
||||
{
|
||||
BLASLONG register i = 0;
|
||||
BLASLONG register ix = 0;
|
||||
|
||||
|
||||
|
||||
while(i < n)
|
||||
{
|
||||
#if !defined(CONJ)
|
||||
y[ix] += ( da_r * x[ix] - da_i * x[ix+1] ) ;
|
||||
y[ix+1] += ( da_r * x[ix+1] + da_i * x[ix] ) ;
|
||||
y[ix+2] += ( da_r * x[ix+2] - da_i * x[ix+3] ) ;
|
||||
y[ix+3] += ( da_r * x[ix+3] + da_i * x[ix+2] ) ;
|
||||
#else
|
||||
y[ix] += ( da_r * x[ix] + da_i * x[ix+1] ) ;
|
||||
y[ix+1] -= ( da_r * x[ix+1] - da_i * x[ix] ) ;
|
||||
y[ix+2] += ( da_r * x[ix+2] + da_i * x[ix+3] ) ;
|
||||
y[ix+3] -= ( da_r * x[ix+3] - da_i * x[ix+2] ) ;
|
||||
#endif
|
||||
|
||||
ix+=4 ;
|
||||
i+=2 ;
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
|
||||
{
|
||||
BLASLONG i=0;
|
||||
BLASLONG ix=0,iy=0;
|
||||
|
||||
if ( n <= 0 ) return(0);
|
||||
|
||||
if ( (inc_x == 1) && (inc_y == 1) )
|
||||
{
|
||||
|
||||
BLASLONG n1 = n & -16;
|
||||
|
||||
if ( n1 )
|
||||
{
|
||||
zaxpy_kernel_4 (n1, x, y, da_r, da_i);
|
||||
ix = 2 * n1;
|
||||
}
|
||||
i = n1;
|
||||
while(i < n)
|
||||
{
|
||||
#if !defined(CONJ)
|
||||
y[ix] += ( da_r * x[ix] - da_i * x[ix+1] ) ;
|
||||
y[ix+1] += ( da_r * x[ix+1] + da_i * x[ix] ) ;
|
||||
#else
|
||||
y[ix] += ( da_r * x[ix] + da_i * x[ix+1] ) ;
|
||||
y[ix+1] -= ( da_r * x[ix+1] - da_i * x[ix] ) ;
|
||||
#endif
|
||||
i++ ;
|
||||
ix += 2;
|
||||
|
||||
}
|
||||
return(0);
|
||||
|
||||
|
||||
}
|
||||
|
||||
inc_x *=2;
|
||||
inc_y *=2;
|
||||
|
||||
while(i < n)
|
||||
{
|
||||
|
||||
#if !defined(CONJ)
|
||||
y[iy] += ( da_r * x[ix] - da_i * x[ix+1] ) ;
|
||||
y[iy+1] += ( da_r * x[ix+1] + da_i * x[ix] ) ;
|
||||
#else
|
||||
y[iy] += ( da_r * x[ix] + da_i * x[ix+1] ) ;
|
||||
y[iy+1] -= ( da_r * x[ix+1] - da_i * x[ix] ) ;
|
||||
#endif
|
||||
ix += inc_x ;
|
||||
iy += inc_y ;
|
||||
i++ ;
|
||||
|
||||
}
|
||||
return(0);
|
||||
|
||||
}
|
||||
|
||||
|
||||
@@ -35,9 +35,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
#include "common.h"
|
||||
|
||||
#if defined(POWER8) || defined(POWER9)
|
||||
#if defined(POWER8) || defined(POWER9) || defined(POWER10)
|
||||
#if defined(__VEC__) || defined(__ALTIVEC__)
|
||||
#include "zcopy_microk_power8.c"
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#ifndef HAVE_KERNEL_16
|
||||
|
||||
|
||||
134
kernel/power/zcopy_microk_power10.c
Normal file
134
kernel/power/zcopy_microk_power10.c
Normal file
@@ -0,0 +1,134 @@
|
||||
/***************************************************************************
|
||||
Copyright (c) 2020, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#define HAVE_KERNEL_32 1
|
||||
|
||||
static void zcopy_kernel_32 (long n, double *x, double *y)
|
||||
{
|
||||
__asm__
|
||||
(
|
||||
"lxvp 32, 0(%2) \n\t"
|
||||
"lxvp 34, 32(%2) \n\t"
|
||||
"lxvp 36, 64(%2) \n\t"
|
||||
"lxvp 38, 96(%2) \n\t"
|
||||
"lxvp 40, 128(%2) \n\t"
|
||||
"lxvp 42, 160(%2) \n\t"
|
||||
"lxvp 44, 192(%2) \n\t"
|
||||
"lxvp 46, 224(%2) \n\t"
|
||||
|
||||
"lxvp 48, 256(%2) \n\t"
|
||||
"lxvp 50, 288(%2) \n\t"
|
||||
"lxvp 52, 320(%2) \n\t"
|
||||
"lxvp 54, 352(%2) \n\t"
|
||||
"lxvp 56, 384(%2) \n\t"
|
||||
"lxvp 58, 416(%2) \n\t"
|
||||
"lxvp 60, 448(%2) \n\t"
|
||||
"lxvp 62, 480(%2) \n\t"
|
||||
"addi %2, %2, 512 \n\t"
|
||||
|
||||
"addic. %1, %1, -32 \n\t"
|
||||
"ble two%= \n\t"
|
||||
|
||||
".align 5 \n"
|
||||
"one%=: \n\t"
|
||||
|
||||
"stxvp 32, 0(%3) \n\t"
|
||||
"lxvp 32, 0(%2) \n\t"
|
||||
"stxvp 34, 32(%3) \n\t"
|
||||
"lxvp 34, 32(%2) \n\t"
|
||||
"stxvp 36, 64(%3) \n\t"
|
||||
"lxvp 36, 64(%2) \n\t"
|
||||
"stxvp 38, 96(%3) \n\t"
|
||||
"lxvp 38, 96(%2) \n\t"
|
||||
|
||||
"stxvp 40, 128(%3) \n\t"
|
||||
"lxvp 40, 128(%2) \n\t"
|
||||
"stxvp 42, 160(%3) \n\t"
|
||||
"lxvp 42, 160(%2) \n\t"
|
||||
"stxvp 44, 192(%3) \n\t"
|
||||
"lxvp 44, 192(%2) \n\t"
|
||||
"stxvp 46, 224(%3) \n\t"
|
||||
"lxvp 46, 224(%2) \n\t"
|
||||
|
||||
"stxvp 48, 256(%3) \n\t"
|
||||
"lxvp 48, 256(%2) \n\t"
|
||||
"stxvp 50, 288(%3) \n\t"
|
||||
"lxvp 50, 288(%2) \n\t"
|
||||
"stxvp 52, 320(%3) \n\t"
|
||||
"lxvp 52, 320(%2) \n\t"
|
||||
"stxvp 54, 352(%3) \n\t"
|
||||
"lxvp 54, 352(%2) \n\t"
|
||||
"stxvp 56, 384(%3) \n\t"
|
||||
"lxvp 56, 384(%2) \n\t"
|
||||
"stxvp 58, 416(%3) \n\t"
|
||||
"lxvp 58, 416(%2) \n\t"
|
||||
"stxvp 60, 448(%3) \n\t"
|
||||
"lxvp 60, 448(%2) \n\t"
|
||||
"stxvp 62, 480(%3) \n\t"
|
||||
"lxvp 62, 480(%2) \n\t"
|
||||
|
||||
"addi %3, %3, 512 \n\t"
|
||||
"addi %2, %2, 512 \n\t"
|
||||
|
||||
"addic. %1, %1, -32 \n\t"
|
||||
"bgt one%= \n"
|
||||
|
||||
"two%=: \n\t"
|
||||
|
||||
"stxvp 32, 0(%3) \n\t"
|
||||
"stxvp 34, 32(%3) \n\t"
|
||||
"stxvp 36, 64(%3) \n\t"
|
||||
"stxvp 38, 96(%3) \n\t"
|
||||
"stxvp 40, 128(%3) \n\t"
|
||||
"stxvp 42, 160(%3) \n\t"
|
||||
"stxvp 44, 192(%3) \n\t"
|
||||
"stxvp 46, 224(%3) \n\t"
|
||||
"stxvp 48, 256(%3) \n\t"
|
||||
"stxvp 50, 288(%3) \n\t"
|
||||
"stxvp 52, 320(%3) \n\t"
|
||||
"stxvp 54, 352(%3) \n\t"
|
||||
"stxvp 56, 384(%3) \n\t"
|
||||
"stxvp 58, 416(%3) \n\t"
|
||||
"stxvp 60, 448(%3) \n\t"
|
||||
"stxvp 62, 480(%3) \n\t"
|
||||
|
||||
"#n=%1 x=%4=%2 y=%0=%3"
|
||||
:
|
||||
"=m" (*y),
|
||||
"+r" (n), // 1
|
||||
"+b" (x), // 2
|
||||
"+b" (y) // 3
|
||||
:
|
||||
"m" (*x)
|
||||
:
|
||||
"cr0",
|
||||
"vs32","vs33","vs34","vs35","vs36","vs37","vs38","vs39",
|
||||
"vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47",
|
||||
"vs48","vs49","vs50","vs51","vs52","vs53","vs54","vs55",
|
||||
"vs56","vs57","vs58","vs59","vs60","vs61","vs62","vs63"
|
||||
);
|
||||
}
|
||||
132
kernel/power/zcopy_power10.c
Normal file
132
kernel/power/zcopy_power10.c
Normal file
@@ -0,0 +1,132 @@
|
||||
/***************************************************************************
|
||||
Copyright (c) 2020, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
|
||||
#if defined(__VEC__) || defined(__ALTIVEC__)
|
||||
#include "zcopy_microk_power10.c"
|
||||
#endif
|
||||
|
||||
#ifndef HAVE_KERNEL_32
|
||||
|
||||
static void zcopy_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y)
|
||||
{
|
||||
|
||||
BLASLONG i=0;
|
||||
FLOAT f0, f1, f2, f3, f4, f5, f6, f7;
|
||||
FLOAT *x1=x;
|
||||
FLOAT *y1=y;
|
||||
|
||||
while ( i<n )
|
||||
{
|
||||
|
||||
f0 = x1[0];
|
||||
f1 = x1[1];
|
||||
f2 = x1[2];
|
||||
f3 = x1[3];
|
||||
f4 = x1[4];
|
||||
f5 = x1[5];
|
||||
f6 = x1[6];
|
||||
f7 = x1[7];
|
||||
|
||||
y1[0] = f0;
|
||||
y1[1] = f1;
|
||||
y1[2] = f2;
|
||||
y1[3] = f3;
|
||||
y1[4] = f4;
|
||||
y1[5] = f5;
|
||||
y1[6] = f6;
|
||||
y1[7] = f7;
|
||||
|
||||
x1 += 8;
|
||||
y1 += 8;
|
||||
|
||||
i+=4;
|
||||
}
|
||||
return;
|
||||
|
||||
}
|
||||
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
|
||||
int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
|
||||
{
|
||||
BLASLONG i=0;
|
||||
BLASLONG ix=0,iy=0;
|
||||
|
||||
if ( n <= 0 ) return(0);
|
||||
|
||||
if ( (inc_x == 1) && (inc_y == 1 ))
|
||||
{
|
||||
|
||||
BLASLONG n1 = n & -32;
|
||||
if ( n1 > 0 )
|
||||
{
|
||||
zcopy_kernel_32(n1, x, y);
|
||||
i=n1;
|
||||
ix=n1*2;
|
||||
iy=n1*2;
|
||||
}
|
||||
|
||||
while(i < n)
|
||||
{
|
||||
y[iy] = x[iy] ;
|
||||
y[iy+1] = x[ix+1] ;
|
||||
ix+=2;
|
||||
iy+=2;
|
||||
i++ ;
|
||||
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
else
|
||||
{
|
||||
|
||||
BLASLONG inc_x2 = 2 * inc_x;
|
||||
BLASLONG inc_y2 = 2 * inc_y;
|
||||
|
||||
while(i < n)
|
||||
{
|
||||
y[iy] = x[ix] ;
|
||||
y[iy+1] = x[ix+1] ;
|
||||
ix += inc_x2 ;
|
||||
iy += inc_y2 ;
|
||||
i++ ;
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
return(0);
|
||||
|
||||
|
||||
}
|
||||
|
||||
|
||||
@@ -36,9 +36,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
#include "common.h"
|
||||
|
||||
|
||||
#if defined(POWER8) || defined(POWER9)
|
||||
#if defined(POWER8) || defined(POWER9) || defined(POWER10)
|
||||
#if defined(__VEC__) || defined(__ALTIVEC__)
|
||||
#include "zdot_microk_power8.c"
|
||||
#endif
|
||||
#endif
|
||||
|
||||
|
||||
#ifndef HAVE_KERNEL_8
|
||||
@@ -93,9 +95,11 @@ FLOAT _Complex CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG in
|
||||
FLOAT dot[4] = { 0.0, 0.0, 0.0 , 0.0 } ;
|
||||
|
||||
if ( n <= 0 )
|
||||
{
|
||||
{ /*
|
||||
__real__ result = 0.0 ;
|
||||
__imag__ result = 0.0 ;
|
||||
*/
|
||||
result = OPENBLAS_MAKE_COMPLEX_FLOAT(0.0,0.0);
|
||||
return(result);
|
||||
|
||||
}
|
||||
@@ -149,11 +153,17 @@ FLOAT _Complex CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG in
|
||||
}
|
||||
|
||||
#if !defined(CONJ)
|
||||
/*
|
||||
__real__ result = dot[0] - dot[1];
|
||||
__imag__ result = dot[2] + dot[3];
|
||||
*/
|
||||
result = OPENBLAS_MAKE_COMPLEX_FLOAT(dot[0]-dot[1],dot[2]+dot[3]);
|
||||
#else
|
||||
/*
|
||||
__real__ result = dot[0] + dot[1];
|
||||
__imag__ result = dot[2] - dot[3];
|
||||
*/
|
||||
result = OPENBLAS_MAKE_COMPLEX_FLOAT(dot[0]+dot[1],dot[2]-dot[3]);
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
245
kernel/power/zgemm_kernel_power10.S
Normal file
245
kernel/power/zgemm_kernel_power10.S
Normal file
@@ -0,0 +1,245 @@
|
||||
/***************************************************************************
|
||||
Copyright (c) 2013-2020, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
#define ASSEMBLER
|
||||
#include "common.h"
|
||||
#include "def_vsx.h"
|
||||
|
||||
#define LOAD ld
|
||||
|
||||
#define STACKSIZE 512
|
||||
|
||||
#define FZERO 312+192(SP)
|
||||
|
||||
#define FLINK_SAVE (STACKSIZE+16) /* 16($r12) */
|
||||
|
||||
#define M r3
|
||||
#define N r4
|
||||
#define K r5
|
||||
|
||||
|
||||
#define A r8
|
||||
#define B r9
|
||||
#define C r10
|
||||
#define LDC r6
|
||||
#define OFFSET r7
|
||||
|
||||
|
||||
|
||||
#define o0 0
|
||||
#define alpha_r vs62
|
||||
#define alpha_i vs63
|
||||
|
||||
#define VECSAVE r11
|
||||
|
||||
#define FRAMEPOINTER r12
|
||||
|
||||
#define T10 r14
|
||||
|
||||
#define L r15
|
||||
#define T8 r16
|
||||
#define T5 r17
|
||||
#define T2 r19
|
||||
#define TEMP_REG r20
|
||||
#define T6 r21
|
||||
#define I r22
|
||||
#define J r23
|
||||
#define AO r24
|
||||
#define BO r25
|
||||
#define CO r26
|
||||
#define T7 r27
|
||||
#define T3 r28
|
||||
#define T4 r29
|
||||
|
||||
#define PRE r30
|
||||
#define T1 r31
|
||||
|
||||
#ifndef NEEDPARAM
|
||||
|
||||
PROLOGUE
|
||||
PROFCODE
|
||||
|
||||
mr FRAMEPOINTER, SP
|
||||
addi SP, SP, -STACKSIZE
|
||||
mflr r0
|
||||
stfd f14, 0(SP)
|
||||
stfd f15, 8(SP)
|
||||
stfd f16, 16(SP)
|
||||
stfd f17, 24(SP)
|
||||
|
||||
stfd f18, 32(SP)
|
||||
stfd f19, 40(SP)
|
||||
stfd f20, 48(SP)
|
||||
stfd f21, 56(SP)
|
||||
|
||||
stfd f22, 64(SP)
|
||||
stfd f23, 72(SP)
|
||||
stfd f24, 80(SP)
|
||||
stfd f25, 88(SP)
|
||||
|
||||
stfd f26, 96(SP)
|
||||
stfd f27, 104(SP)
|
||||
stfd f28, 112(SP)
|
||||
stfd f29, 120(SP)
|
||||
|
||||
stfd f30, 128(SP)
|
||||
stfd f31, 136(SP)
|
||||
|
||||
xxspltd alpha_r,vs1,0 /*copy from register f1 */
|
||||
xxspltd alpha_i,vs2,0 /*copy from register f2 */
|
||||
|
||||
std r31, 144(SP)
|
||||
std r30, 152(SP)
|
||||
std r29, 160(SP)
|
||||
std r28, 168(SP)
|
||||
std r27, 176(SP)
|
||||
std r26, 184(SP)
|
||||
std r25, 192(SP)
|
||||
std r24, 200(SP)
|
||||
std r23, 208(SP)
|
||||
std r22, 216(SP)
|
||||
std r21, 224(SP)
|
||||
std r20, 232(SP)
|
||||
std r19, 240(SP)
|
||||
std r18, 248(SP)
|
||||
std r17, 256(SP)
|
||||
std r16, 264(SP)
|
||||
std r15, 272(SP)
|
||||
std r14, 280(SP)
|
||||
|
||||
|
||||
stxv vs20, 288(SP)
|
||||
stxv vs21, 304(SP)
|
||||
stxv vs22, 320(SP)
|
||||
stxv vs23, 336(SP)
|
||||
stxv vs24, 352(SP)
|
||||
stxv vs25, 368(SP)
|
||||
stxv vs26, 384(SP)
|
||||
stxv vs27, 400(SP)
|
||||
stxv vs28, 416(SP)
|
||||
stxv vs29, 432(SP)
|
||||
stxv vs30, 448(SP)
|
||||
stxv vs31, 464(SP)
|
||||
|
||||
std r0, FLINK_SAVE(SP)
|
||||
|
||||
|
||||
#if defined(linux) || defined(__FreeBSD__)
|
||||
ld LDC, FRAMESLOT(0) + 0(FRAMEPOINTER)
|
||||
#endif
|
||||
|
||||
|
||||
#ifdef TRMMKERNEL
|
||||
#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__)
|
||||
ld OFFSET, FRAMESLOT(1) + 0(FRAMEPOINTER)
|
||||
#endif
|
||||
#endif
|
||||
|
||||
|
||||
#include "zgemm_macros_power10.S"
|
||||
|
||||
|
||||
|
||||
slwi LDC, LDC, ZBASE_SHIFT
|
||||
li PRE, 512
|
||||
li r0, 0
|
||||
|
||||
|
||||
#if defined(CC) || defined(CR) || defined(RC) || defined(RR)
|
||||
/*negate for this case as we will use addition -1*(a+b) */
|
||||
xvnegdp alpha_r,alpha_r
|
||||
xvnegdp alpha_i,alpha_i
|
||||
#endif
|
||||
.align 4
|
||||
|
||||
#include "zgemm_logic_power10.S"
|
||||
|
||||
L999:
|
||||
|
||||
lfd f14, 0(SP)
|
||||
lfd f15, 8(SP)
|
||||
lfd f16, 16(SP)
|
||||
lfd f17, 24(SP)
|
||||
|
||||
lfd f18, 32(SP)
|
||||
lfd f19, 40(SP)
|
||||
lfd f20, 48(SP)
|
||||
lfd f21, 56(SP)
|
||||
|
||||
lfd f22, 64(SP)
|
||||
lfd f23, 72(SP)
|
||||
lfd f24, 80(SP)
|
||||
lfd f25, 88(SP)
|
||||
|
||||
lfd f26, 96(SP)
|
||||
lfd f27, 104(SP)
|
||||
lfd f28, 112(SP)
|
||||
lfd f29, 120(SP)
|
||||
|
||||
lfd f30, 128(SP)
|
||||
lfd f31, 136(SP)
|
||||
|
||||
|
||||
ld r31, 144(SP)
|
||||
ld r30, 152(SP)
|
||||
ld r29, 160(SP)
|
||||
ld r28, 168(SP)
|
||||
ld r27, 176(SP)
|
||||
ld r26, 184(SP)
|
||||
ld r25, 192(SP)
|
||||
ld r24, 200(SP)
|
||||
ld r23, 208(SP)
|
||||
ld r22, 216(SP)
|
||||
ld r21, 224(SP)
|
||||
ld r20, 232(SP)
|
||||
ld r19, 240(SP)
|
||||
ld r18, 248(SP)
|
||||
ld r17, 256(SP)
|
||||
ld r16, 264(SP)
|
||||
ld r15, 272(SP)
|
||||
ld r14, 280(SP)
|
||||
|
||||
ld r0, FLINK_SAVE(SP)
|
||||
|
||||
lxv vs20, 288(SP)
|
||||
lxv vs21, 304(SP)
|
||||
lxv vs22, 320(SP)
|
||||
lxv vs23, 336(SP)
|
||||
lxv vs24, 352(SP)
|
||||
lxv vs25, 368(SP)
|
||||
lxv vs26, 384(SP)
|
||||
lxv vs27, 400(SP)
|
||||
mtlr r0
|
||||
lxv vs28, 416(SP)
|
||||
lxv vs29, 432(SP)
|
||||
lxv vs30, 448(SP)
|
||||
lxv vs31, 464(SP)
|
||||
|
||||
addi SP, SP, STACKSIZE
|
||||
blr
|
||||
|
||||
EPILOGUE
|
||||
#endif
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user