Merge branch 'develop' into arm64_cmake_small_matrix_opt

This commit is contained in:
Martin Kroeker 2024-10-03 20:04:52 +02:00 committed by GitHub
commit b4495a8fb8
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
26 changed files with 611 additions and 155 deletions

View File

@ -23,6 +23,15 @@ jobs:
- target: LOONGSON2K1000 - target: LOONGSON2K1000
triple: loongarch64-unknown-linux-gnu triple: loongarch64-unknown-linux-gnu
opts: NO_SHARED=1 DYNAMIC_ARCH=1 TARGET=LOONGSON2K1000 opts: NO_SHARED=1 DYNAMIC_ARCH=1 TARGET=LOONGSON2K1000
- target: LA64_GENERIC
triple: loongarch64-unknown-linux-gnu
opts: NO_SHARED=1 DYNAMIC_ARCH=1 TARGET=LA64_GENERIC
- target: LA464
triple: loongarch64-unknown-linux-gnu
opts: NO_SHARED=1 DYNAMIC_ARCH=1 TARGET=LA464
- target: LA264
triple: loongarch64-unknown-linux-gnu
opts: NO_SHARED=1 DYNAMIC_ARCH=1 TARGET=LA264
- target: DYNAMIC_ARCH - target: DYNAMIC_ARCH
triple: loongarch64-unknown-linux-gnu triple: loongarch64-unknown-linux-gnu
opts: NO_SHARED=1 DYNAMIC_ARCH=1 TARGET=GENERIC opts: NO_SHARED=1 DYNAMIC_ARCH=1 TARGET=GENERIC

View File

@ -20,6 +20,12 @@ jobs:
opts: NO_SHARED=1 DYNAMIC_ARCH=1 TARGET=LOONGSON3R5 opts: NO_SHARED=1 DYNAMIC_ARCH=1 TARGET=LOONGSON3R5
- target: LOONGSON2K1000 - target: LOONGSON2K1000
opts: NO_SHARED=1 DYNAMIC_ARCH=1 TARGET=LOONGSON2K1000 opts: NO_SHARED=1 DYNAMIC_ARCH=1 TARGET=LOONGSON2K1000
- target: LA64_GENERIC
opts: NO_SHARED=1 DYNAMIC_ARCH=1 TARGET=LA64_GENERIC
- target: LA464
opts: NO_SHARED=1 DYNAMIC_ARCH=1 TARGET=LA464
- target: LA264
opts: NO_SHARED=1 DYNAMIC_ARCH=1 TARGET=LA264
- target: DYNAMIC_ARCH - target: DYNAMIC_ARCH
opts: NO_SHARED=1 DYNAMIC_ARCH=1 TARGET=GENERIC opts: NO_SHARED=1 DYNAMIC_ARCH=1 TARGET=GENERIC

View File

@ -14,6 +14,9 @@ endif
ifeq ($(INTERFACE64),1) ifeq ($(INTERFACE64),1)
USE_64BITINT=1 USE_64BITINT=1
endif endif
ifeq ($(USE_OPENMP),1)
FOMP_OPT:= -fopenmp
endif
PREFIX ?= /opt/OpenBLAS PREFIX ?= /opt/OpenBLAS
@ -178,6 +181,7 @@ endif
@echo 'libnamesuffix='$(LIBNAMESUFFIX) >> "$(PKGFILE)" @echo 'libnamesuffix='$(LIBNAMESUFFIX) >> "$(PKGFILE)"
@echo 'libsuffix='$(SYMBOLSUFFIX) >> "$(PKGFILE)" @echo 'libsuffix='$(SYMBOLSUFFIX) >> "$(PKGFILE)"
@echo 'includedir='$(OPENBLAS_INCLUDE_DIR) >> "$(PKGFILE)" @echo 'includedir='$(OPENBLAS_INCLUDE_DIR) >> "$(PKGFILE)"
@echo 'omp_opt='$(FOMP_OPT) >> "$(PKGFILE)"
@echo 'openblas_config= USE_64BITINT='$(INTERFACE64) 'DYNAMIC_ARCH='$(DYNAMIC_ARCH) 'DYNAMIC_OLDER='$(DYNAMIC_OLDER) 'NO_CBLAS='$(NO_CBLAS) 'NO_LAPACK='$(NO_LAPACK) 'NO_LAPACKE='$(NO_LAPACKE) 'NO_AFFINITY='$(NO_AFFINITY) 'USE_OPENMP='$(USE_OPENMP) $(TARGET) 'MAX_THREADS='$(NUM_THREADS)>> "$(PKGFILE)" @echo 'openblas_config= USE_64BITINT='$(INTERFACE64) 'DYNAMIC_ARCH='$(DYNAMIC_ARCH) 'DYNAMIC_OLDER='$(DYNAMIC_OLDER) 'NO_CBLAS='$(NO_CBLAS) 'NO_LAPACK='$(NO_LAPACK) 'NO_LAPACKE='$(NO_LAPACKE) 'NO_AFFINITY='$(NO_AFFINITY) 'USE_OPENMP='$(USE_OPENMP) $(TARGET) 'MAX_THREADS='$(NUM_THREADS)>> "$(PKGFILE)"
@echo 'version='$(VERSION) >> "$(PKGFILE)" @echo 'version='$(VERSION) >> "$(PKGFILE)"
@echo 'extralib='$(PKG_EXTRALIB) >> "$(PKGFILE)" @echo 'extralib='$(PKG_EXTRALIB) >> "$(PKGFILE)"

View File

@ -727,7 +727,7 @@ endif
endif endif
ifeq ($(ARCH), loongarch64) ifeq ($(ARCH), loongarch64)
DYNAMIC_CORE = LOONGSON3R5 LOONGSON2K1000 LOONGSONGENERIC DYNAMIC_CORE = LA64_GENERIC LA264 LA464
endif endif
ifeq ($(ARCH), riscv64) ifeq ($(ARCH), riscv64)
@ -1720,8 +1720,8 @@ LAPACK_FFLAGS := $(filter-out -msse3 -mssse3 -msse4.1 -mavx -mavx2 -mskylake-avx
override FFLAGS := $(filter-out -msse3 -mssse3 -msse4.1 -mavx -mavx2 -mskylake-avx512 ,$(FFLAGS)) override FFLAGS := $(filter-out -msse3 -mssse3 -msse4.1 -mavx -mavx2 -mskylake-avx512 ,$(FFLAGS))
endif endif
ifeq ($(F_COMPILER),FLANGNEW) ifeq ($(F_COMPILER),FLANGNEW)
LAPACK_FFLAGS := $(filter-out -m32 -m64 -msse3 -mssse3 -msse4.1 -mavx -mavx2 -mskylake-avx512 ,$(FFLAGS)) LAPACK_FFLAGS := $(filter-out -m32 -m64 -msse3 -mssse3 -msse4.1 -mavx -mavx2 -mskylake-avx512 -mtune=% -mabi=% ,$(FFLAGS))
override FFLAGS := $(filter-out -m32 -m64 -msse3 -mssse3 -msse4.1 -mavx -mavx2 -mskylake-avx512 ,$(FFLAGS)) override FFLAGS := $(filter-out -m32 -m64 -msse3 -mssse3 -msse4.1 -mavx -mavx2 -mskylake-avx512 -mtune=% -mabi=% ,$(FFLAGS))
endif endif
LAPACK_CFLAGS = $(CFLAGS) LAPACK_CFLAGS = $(CFLAGS)

View File

@ -126,9 +126,17 @@ x280
RISCV64_ZVL256B RISCV64_ZVL256B
11.LOONGARCH64: 11.LOONGARCH64:
// LOONGSONGENERIC/LOONGSON2K1000/LOONGSON3R5 are legacy names;
// the standardized names LA64_GENERIC/LA264/LA464 are recommended instead.
// You can still specify TARGET as LOONGSONGENERIC/LOONGSON2K1000/LOONGSON3R5
// at compile time or at runtime, and they will be mapped internally to
// LA64_GENERIC/LA264/LA464.
LOONGSONGENERIC LOONGSONGENERIC
LOONGSON3R5
LOONGSON2K1000 LOONGSON2K1000
LOONGSON3R5
LA64_GENERIC
LA264
LA464
12. Elbrus E2000: 12. Elbrus E2000:
E2K E2K

View File

@ -407,13 +407,13 @@ void cblas_cimatcopy(OPENBLAS_CONST enum CBLAS_ORDER CORDER, OPENBLAS_CONST enum
void cblas_zimatcopy(OPENBLAS_CONST enum CBLAS_ORDER CORDER, OPENBLAS_CONST enum CBLAS_TRANSPOSE CTRANS, OPENBLAS_CONST blasint crows, OPENBLAS_CONST blasint ccols, OPENBLAS_CONST double* calpha, double* a, void cblas_zimatcopy(OPENBLAS_CONST enum CBLAS_ORDER CORDER, OPENBLAS_CONST enum CBLAS_TRANSPOSE CTRANS, OPENBLAS_CONST blasint crows, OPENBLAS_CONST blasint ccols, OPENBLAS_CONST double* calpha, double* a,
OPENBLAS_CONST blasint clda, OPENBLAS_CONST blasint cldb); OPENBLAS_CONST blasint clda, OPENBLAS_CONST blasint cldb);
void cblas_sgeadd(OPENBLAS_CONST enum CBLAS_ORDER CORDER,OPENBLAS_CONST blasint crows, OPENBLAS_CONST blasint ccols, OPENBLAS_CONST float calpha, float *a, OPENBLAS_CONST blasint clda, OPENBLAS_CONST float cbeta, void cblas_sgeadd(OPENBLAS_CONST enum CBLAS_ORDER CORDER,OPENBLAS_CONST blasint crows, OPENBLAS_CONST blasint ccols, OPENBLAS_CONST float calpha, OPENBLAS_CONST float *a, OPENBLAS_CONST blasint clda, OPENBLAS_CONST float cbeta,
float *c, OPENBLAS_CONST blasint cldc); float *c, OPENBLAS_CONST blasint cldc);
void cblas_dgeadd(OPENBLAS_CONST enum CBLAS_ORDER CORDER,OPENBLAS_CONST blasint crows, OPENBLAS_CONST blasint ccols, OPENBLAS_CONST double calpha, double *a, OPENBLAS_CONST blasint clda, OPENBLAS_CONST double cbeta, void cblas_dgeadd(OPENBLAS_CONST enum CBLAS_ORDER CORDER,OPENBLAS_CONST blasint crows, OPENBLAS_CONST blasint ccols, OPENBLAS_CONST double calpha, OPENBLAS_CONST double *a, OPENBLAS_CONST blasint clda, OPENBLAS_CONST double cbeta,
double *c, OPENBLAS_CONST blasint cldc); double *c, OPENBLAS_CONST blasint cldc);
void cblas_cgeadd(OPENBLAS_CONST enum CBLAS_ORDER CORDER,OPENBLAS_CONST blasint crows, OPENBLAS_CONST blasint ccols, OPENBLAS_CONST float *calpha, float *a, OPENBLAS_CONST blasint clda, OPENBLAS_CONST float *cbeta, void cblas_cgeadd(OPENBLAS_CONST enum CBLAS_ORDER CORDER,OPENBLAS_CONST blasint crows, OPENBLAS_CONST blasint ccols, OPENBLAS_CONST float *calpha, OPENBLAS_CONST float *a, OPENBLAS_CONST blasint clda, OPENBLAS_CONST float *cbeta,
float *c, OPENBLAS_CONST blasint cldc); float *c, OPENBLAS_CONST blasint cldc);
void cblas_zgeadd(OPENBLAS_CONST enum CBLAS_ORDER CORDER,OPENBLAS_CONST blasint crows, OPENBLAS_CONST blasint ccols, OPENBLAS_CONST double *calpha, double *a, OPENBLAS_CONST blasint clda, OPENBLAS_CONST double *cbeta, void cblas_zgeadd(OPENBLAS_CONST enum CBLAS_ORDER CORDER,OPENBLAS_CONST blasint crows, OPENBLAS_CONST blasint ccols, OPENBLAS_CONST double *calpha, OPENBLAS_CONST double *a, OPENBLAS_CONST blasint clda, OPENBLAS_CONST double *cbeta,
double *c, OPENBLAS_CONST blasint cldc); double *c, OPENBLAS_CONST blasint cldc);
void cblas_sgemm_batch(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_TRANSPOSE * TransA_array, OPENBLAS_CONST enum CBLAS_TRANSPOSE * TransB_array, OPENBLAS_CONST blasint * M_array, OPENBLAS_CONST blasint * N_array, OPENBLAS_CONST blasint * K_array, void cblas_sgemm_batch(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_TRANSPOSE * TransA_array, OPENBLAS_CONST enum CBLAS_TRANSPOSE * TransB_array, OPENBLAS_CONST blasint * M_array, OPENBLAS_CONST blasint * N_array, OPENBLAS_CONST blasint * K_array,

View File

@ -94,6 +94,10 @@ if (DYNAMIC_ARCH)
endif () endif ()
endif () endif ()
# For LoongArch64 targets, select the list of cores stored in DYNAMIC_CORE.
# NOTE(review): these are the legacy LOONGSON* core names; confirm they match
# the core names expected by the rest of the DYNAMIC_ARCH build.
if (LOONGARCH64)
set(DYNAMIC_CORE LOONGSONGENERIC LOONGSON2K1000 LOONGSON3R5)
endif ()
if (EXISTS ${PROJECT_SOURCE_DIR}/config_kernel.h) if (EXISTS ${PROJECT_SOURCE_DIR}/config_kernel.h)
message (FATAL_ERROR "Your build directory contains a file config_kernel.h, probably from a previous compilation with make. This will conflict with the cmake compilation and cause strange compiler errors - please remove the file before trying again") message (FATAL_ERROR "Your build directory contains a file config_kernel.h, probably from a previous compilation with make. This will conflict with the cmake compilation and cause strange compiler errors - please remove the file before trying again")
endif () endif ()

View File

@ -61,21 +61,25 @@ if (${F_COMPILER} STREQUAL "GFORTRAN" OR ${F_COMPILER} STREQUAL "F95" OR CMAKE_F
endif () endif ()
if (LOONGARCH64) if (LOONGARCH64)
if (BINARY64) if (BINARY64)
CHECK_C_COMPILER_FLAG("-mabi=lp64d" COMPILER_SUPPORT_LP64D_ABI) if (NOT CMAKE_Fortran_COMPILER_ID MATCHES "LLVMFlang.*")
if(COMPILER_SUPPORT_LP64D_ABI) CHECK_C_COMPILER_FLAG("-mabi=lp64d" COMPILER_SUPPORT_LP64D_ABI)
set(FCOMMON_OPT "${FCOMMON_OPT} -mabi=lp64d") if(COMPILER_SUPPORT_LP64D_ABI)
else() set(FCOMMON_OPT "${FCOMMON_OPT} -mabi=lp64d")
set(FCOMMON_OPT "${FCOMMON_OPT} -mabi=lp64") else()
endif () set(FCOMMON_OPT "${FCOMMON_OPT} -mabi=lp64")
endif ()
endif ()
if (INTERFACE64) if (INTERFACE64)
set(FCOMMON_OPT "${FCOMMON_OPT} -fdefault-integer-8") set(FCOMMON_OPT "${FCOMMON_OPT} -fdefault-integer-8")
endif () endif ()
else () else ()
CHECK_C_COMPILER_FLAG("-mabi=ilp32d" COMPILER_SUPPORT_ILP32D_ABI) if (NOT CMAKE_Fortran_COMPILER_ID MATCHES "LLVMFlang.*")
if(COMPILER_SUPPORT_ILP32D_ABI) CHECK_C_COMPILER_FLAG("-mabi=ilp32d" COMPILER_SUPPORT_ILP32D_ABI)
set(FCOMMON_OPT "${FCOMMON_OPT} -mabi=ilp32d") if(COMPILER_SUPPORT_ILP32D_ABI)
else() set(FCOMMON_OPT "${FCOMMON_OPT} -mabi=ilp32d")
set(FCOMMON_OPT "${FCOMMON_OPT} -mabi=lp32") else()
set(FCOMMON_OPT "${FCOMMON_OPT} -mabi=lp32")
endif ()
endif () endif ()
endif () endif ()
endif () endif ()

View File

@ -9,5 +9,5 @@ Name: OpenBLAS
Description: OpenBLAS is an optimized BLAS library based on GotoBLAS2 1.13 BSD version Description: OpenBLAS is an optimized BLAS library based on GotoBLAS2 1.13 BSD version
Version: @OpenBLAS_VERSION@ Version: @OpenBLAS_VERSION@
URL: https://github.com/OpenMathLib/OpenBLAS URL: https://github.com/OpenMathLib/OpenBLAS
Libs: @OpenMP_C_FLAGS@ -L${libdir} -l${libnameprefix}openblas${libnamesuffix}${libsuffix} Libs: -L${libdir} -l${libnameprefix}openblas${libnamesuffix}${libsuffix}
Cflags: -I${includedir} Cflags: -I${includedir} @OpenMP_C_FLAGS@

View File

@ -1349,6 +1349,54 @@ endif ()
"#define DTB_DEFAULT_ENTRIES 128\n" "#define DTB_DEFAULT_ENTRIES 128\n"
"#define DTB_SIZE 4096\n" "#define DTB_SIZE 4096\n"
"#define L2_ASSOCIATIVE 4\n") "#define L2_ASSOCIATIVE 4\n")
elseif ("${TCORE}" STREQUAL "LOONGSONGENERIC")
file(APPEND ${TARGET_CONF_TEMP}
"#define DTB_DEFAULT_ENTRIES 64\n")
set(SGEMM_UNROLL_M 2)
set(SGEMM_UNROLL_N 8)
set(DGEMM_UNROLL_M 2)
set(DGEMM_UNROLL_N 8)
set(CGEMM_UNROLL_M 1)
set(CGEMM_UNROLL_N 4)
set(ZGEMM_UNROLL_M 1)
set(ZGEMM_UNROLL_N 4)
set(CGEMM3M_UNROLL_M 2)
set(CGEMM3M_UNROLL_N 8)
set(ZGEMM3M_UNROLL_M 2)
set(ZGEMM3M_UNROLL_N 8)
elseif ("${TCORE}" STREQUAL "LOONGSON2K1000")
file(APPEND ${TARGET_CONF_TEMP}
"#define DTB_DEFAULT_ENTRIES 64\n")
set(HAVE_LSX 1)
set(SGEMM_UNROLL_M 2)
set(SGEMM_UNROLL_N 8)
set(DGEMM_UNROLL_M 8)
set(DGEMM_UNROLL_N 4)
set(CGEMM_UNROLL_M 8)
set(CGEMM_UNROLL_N 4)
set(ZGEMM_UNROLL_M 4)
set(ZGEMM_UNROLL_N 4)
set(CGEMM3M_UNROLL_M 2)
set(CGEMM3M_UNROLL_N 8)
set(ZGEMM3M_UNROLL_M 8)
set(ZGEMM3M_UNROLL_N 4)
elseif ("${TCORE}" STREQUAL "LOONGSON3R5")
file(APPEND ${TARGET_CONF_TEMP}
"#define DTB_DEFAULT_ENTRIES 64\n")
set(HAVE_LASX 1)
set(HAVE_LSX 1)
set(SGEMM_UNROLL_M 16)
set(SGEMM_UNROLL_N 8)
set(DGEMM_UNROLL_M 16)
set(DGEMM_UNROLL_N 6)
set(CGEMM_UNROLL_M 16)
set(CGEMM_UNROLL_N 4)
set(ZGEMM_UNROLL_M 8)
set(ZGEMM_UNROLL_N 4)
set(CGEMM3M_UNROLL_M 16)
set(CGEMM3M_UNROLL_N 8)
set(ZGEMM3M_UNROLL_M 16)
set(ZGEMM3M_UNROLL_N 6)
endif() endif()
set(SBGEMM_UNROLL_M 8) set(SBGEMM_UNROLL_M 8)
set(SBGEMM_UNROLL_N 4) set(SBGEMM_UNROLL_N 4)

View File

@ -388,7 +388,7 @@ if (NEED_PIC)
endif() endif()
endif () endif ()
if (X86_64 OR ${CORE} STREQUAL POWER10 OR ARM64) if (X86_64 OR ${CORE} STREQUAL POWER10 OR ARM64 OR LOONGARCH64)
set(SMALL_MATRIX_OPT TRUE) set(SMALL_MATRIX_OPT TRUE)
endif () endif ()
if (ARM64) if (ARM64)
@ -403,7 +403,7 @@ if (SMALL_MATRIX_OPT)
endif () endif ()
if (DYNAMIC_ARCH) if (DYNAMIC_ARCH)
if (X86 OR X86_64 OR ARM64 OR POWER OR RISCV64) if (X86 OR X86_64 OR ARM64 OR POWER OR RISCV64 OR LOONGARCH64)
set(CCOMMON_OPT "${CCOMMON_OPT} -DDYNAMIC_ARCH") set(CCOMMON_OPT "${CCOMMON_OPT} -DDYNAMIC_ARCH")
if (DYNAMIC_OLDER) if (DYNAMIC_OLDER)
set(CCOMMON_OPT "${CCOMMON_OPT} -DDYNAMIC_OLDER") set(CCOMMON_OPT "${CCOMMON_OPT} -DDYNAMIC_OLDER")

View File

@ -104,6 +104,8 @@ elseif(ARM)
set(ARCH "arm") set(ARCH "arm")
elseif(ARM64) elseif(ARM64)
set(ARCH "arm64") set(ARCH "arm64")
elseif(LOONGARCH64)
set(ARCH "loongarch64")
else() else()
set(ARCH ${CMAKE_SYSTEM_PROCESSOR} CACHE STRING "Target Architecture") set(ARCH ${CMAKE_SYSTEM_PROCESSOR} CACHE STRING "Target Architecture")
endif () endif ()

View File

@ -281,9 +281,13 @@ REALNAME: ;\
#define GNUSTACK #define GNUSTACK
#endif /* defined(__linux__) && defined(__ELF__) */ #endif /* defined(__linux__) && defined(__ELF__) */
#ifdef __clang__
#define EPILOGUE .end
#else
#define EPILOGUE \ #define EPILOGUE \
.end REALNAME ;\ .end REALNAME ;\
GNUSTACK GNUSTACK
#endif
#define PROFCODE #define PROFCODE

View File

@ -1,5 +1,5 @@
/***************************************************************************** /*****************************************************************************
Copyright (c) 2011-2020, The OpenBLAS Project Copyright (c) 2011-2024, The OpenBLAS Project
All rights reserved. All rights reserved.
Redistribution and use in source and binary forms, with or without Redistribution and use in source and binary forms, with or without
@ -32,53 +32,299 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
**********************************************************************************/ **********************************************************************************/
#include <stdint.h> #include <stdint.h>
#include <sys/auxv.h>
#include <stdio.h> #include <stdio.h>
#include <math.h>
#include <string.h>
#include <sys/auxv.h>
/* If LASX extension instructions supported, #define CPU_LA64_GENERIC 0
* using core LOONGSON3R5 #define CPU_LA264 1
* If only LSX extension instructions supported, #define CPU_LA364 2
* using core LOONGSON2K1000 #define CPU_LA464 3
* If neither LASX nor LSX extension instructions supported, #define CPU_LA664 4
* using core LOONGSONGENERIC (As far as I know, there is no such
* CPU yet)
*/
#define CPU_GENERIC 0 #define CORE_LA64_GENERIC 0
#define CPU_LOONGSON3R5 1 #define CORE_LA264 1
#define CPU_LOONGSON2K1000 2 #define CORE_LA464 2
#define LA_HWCAP_LSX (1U << 4) #define LA_HWCAP_LSX (1U << 4)
#define LA_HWCAP_LASX (1U << 5) #define LA_HWCAP_LASX (1U << 5)
#define LOONGARCH_CFG0 0x00
#define LOONGARCH_CFG2 0x02
#define LOONGARCH_CFG10 0x10
#define LOONGARCH_CFG11 0x11
#define LOONGARCH_CFG12 0x12
#define LOONGARCH_CFG13 0x13
#define LOONGARCH_CFG14 0x14
/* Single-bit masks for the LASX (bit 7) and LSX (bit 6) flags.
 * Parenthesized so the shift cannot re-associate with operators in the
 * surrounding expression (e.g. `LASX_MASK * 2` or `x & LASX_MASK == 0`). */
#define LASX_MASK (1<<7)
#define LSX_MASK (1<<6)
#define PRID_SERIES_MASK 0xf000
#define PRID_SERIES_LA264 0xa000
#define PRID_SERIES_LA364 0xb000
#define PRID_SERIES_LA464 0xc000
#define PRID_SERIES_LA664 0xd000
#define CACHE_INFO_L1_IU 0
#define CACHE_INFO_L1_D 1
#define CACHE_INFO_L2_IU 2
#define CACHE_INFO_L2_D 3
#define CACHE_INFO_L3_IU 4
#define CACHE_INFO_L3_D 5
#define L1_IU_PRESENT_MASK 0x0001
#define L1_IU_UNITY_MASK 0x0002
#define L1_D_PRESENT_MASK 0x0004
#define L2_IU_PRESENT_MASK 0x0008
#define L2_IU_UNITY_MASK 0x0010
#define L2_D_PRESENT_MASK 0x0080
#define L3_IU_PRESENT_MASK 0x0400
#define L3_IU_UNITY_MASK 0x0800
#define L3_D_PRESENT_MASK 0x4000
#define CACHE_WAY_MINUS_1_MASK 0x0000ffff
#define CACHE_INDEX_LOG2_MASK 0x00ff0000
#define CACHE_LINESIZE_LOG2_MASK 0x7f000000
typedef struct {
int size;
int associative;
int linesize;
int unify;
int present;
} cache_info_t;
/* Using microarchitecture representation */
static char *cpuname[] = { static char *cpuname[] = {
"LOONGSONGENERIC", "LA64_GENERIC",
"LOONGSON3R5", "LA264", /* Loongson 64bit, 2-issue, Like 2K1000LA */
"LOONGSON2K1000" "LA364", /* Loongson 64bit, 3-issue, Like 2K2000 */
"LA464", /* Loongson 64bit, 4-issue, Like 3A5000, 3C5000L, 3C5000 and 3D5000 */
"LA664" /* Loongson 64bit, 6-issue, Like 3A6000, 3C6000 and 3D6000 */
}; };
static char *cpuname_lower[] = { static char *cpuname_lower[] = {
"loongsongeneric", "la64_generic",
"loongson3r5", "la264",
"loongson2k1000" "la364",
"la464",
"la664"
}; };
int detect(void) { static char *corename[] = {
#ifdef __linux "LA64_GENERIC", /* Implies using scalar instructions for optimization */
"LA264", /* Implies using LSX instructions for optimization */
"LA464", /* Implies using LASX instructions for optimization */
};
static char *corename_lower[] = {
"la64_generic",
"la264",
"la464",
};
/*
 * Cache description is obtained with the `cpucfg` instruction:
 * word LOONGARCH_CFG10 carries the present/unified bits for every cache type,
 * and words LOONGARCH_CFG11..LOONGARCH_CFG14 carry the geometry of the
 * L1 I/U, L1 D, L2 I/U and L3 I/U caches respectively.
 */

/* Read one configuration word through `cpucfg`. */
static uint32_t cacheinfo_read_cfg(uint32_t word) {
  uint32_t value = 0;
  __asm__ volatile (
    "cpucfg %0, %1 \n\t"
    : "+&r"(value)
    : "r"(word)
  );
  return value;
}

/* Decode a geometry word into associativity, line size and total size. */
static void cacheinfo_decode(uint32_t geometry, cache_info_t *out) {
  out->associative = (geometry & CACHE_WAY_MINUS_1_MASK) + 1;
  out->linesize = 1 << ((geometry & CACHE_LINESIZE_LOG2_MASK) >> 24);
  out->size = out->associative * out->linesize *
              (1 << ((geometry & CACHE_INDEX_LOG2_MASK) >> 16));
}

/*
 * Fill *cacheinfo for the cache selected by `type` (one of CACHE_INFO_*).
 * Every field is zero when the cache is absent or `type` is not recognised.
 * For the L2 D and L3 D entries only `present` is filled in; no geometry
 * word is read for them.
 */
static void get_cacheinfo(int type, cache_info_t *cacheinfo) {
  /* Indexed by CACHE_INFO_*: bits to test in CFG10 and where the geometry lives. */
  static const struct {
    uint32_t present_mask;
    uint32_t unify_mask;     /* 0: this entry has no "unified" bit */
    uint32_t geometry_word;
    int      has_geometry;   /* 0: no data fetch for this entry */
  } layout[] = {
    [CACHE_INFO_L1_IU] = { L1_IU_PRESENT_MASK, L1_IU_UNITY_MASK, LOONGARCH_CFG11, 1 },
    [CACHE_INFO_L1_D]  = { L1_D_PRESENT_MASK,  0,                LOONGARCH_CFG12, 1 },
    [CACHE_INFO_L2_IU] = { L2_IU_PRESENT_MASK, L2_IU_UNITY_MASK, LOONGARCH_CFG13, 1 },
    [CACHE_INFO_L2_D]  = { L2_D_PRESENT_MASK,  0,                0,               0 },
    [CACHE_INFO_L3_IU] = { L3_IU_PRESENT_MASK, L3_IU_UNITY_MASK, LOONGARCH_CFG14, 1 },
    [CACHE_INFO_L3_D]  = { L3_D_PRESENT_MASK,  0,                0,               0 },
  };
  cache_info_t result;
  uint32_t summary;

  memset(&result, 0, sizeof(result));
  summary = cacheinfo_read_cfg(LOONGARCH_CFG10);

  if (type >= CACHE_INFO_L1_IU && type <= CACHE_INFO_L3_D &&
      (summary & layout[type].present_mask)) {
    /* The raw masked bits are stored, so callers test these for non-zero. */
    result.present = summary & layout[type].present_mask;
    result.unify = summary & layout[type].unify_mask;
    if (layout[type].has_geometry)
      cacheinfo_decode(cacheinfo_read_cfg(layout[type].geometry_word), &result);
  }

  *cacheinfo = result;
}
/* Return configuration word LOONGARCH_CFG0 as read by `cpucfg`;
 * callers mask it with PRID_SERIES_MASK to identify the processor series. */
static uint32_t get_prid() {
  uint32_t prid = 0;

  __asm__ volatile (
    "cpucfg %0, %1 \n\t"
    : "+&r"(prid)
    : "r"(LOONGARCH_CFG0)
  );

  return prid;
}
/*
 * Count the "processor" entries in /proc/cpuinfo and store the total in *count.
 * If the file cannot be opened, *count is left untouched.
 */
static void get_cpucount(uint32_t *count) {
  uint32_t num = 0;
  int at_line_start = 1;
  char buf[200];
  FILE *f = fopen("/proc/cpuinfo", "r");

  if (!f) return;

  while (fgets(buf, sizeof(buf), f)) {
    /* Lines longer than the buffer arrive in several chunks; only a chunk
     * that begins a real line may be matched against the key. */
    if (at_line_start && !strncmp("processor", buf, 9))
      num++;
    at_line_start = (strchr(buf, '\n') != NULL);
  }

  fclose(f);
  *count = num;
}
/* Detect whether the OS supports the LASX instruction set */
static int os_support_lasx() {
int hwcap = (int)getauxval(AT_HWCAP); int hwcap = (int)getauxval(AT_HWCAP);
if (hwcap & LA_HWCAP_LASX) if (hwcap & LA_HWCAP_LASX)
return CPU_LOONGSON3R5; return 1;
else if (hwcap & LA_HWCAP_LSX)
return CPU_LOONGSON2K1000;
else else
return CPU_GENERIC; return 0;
#endif }
return CPU_GENERIC;
/* Report (1/0) whether the LSX bit is set in the AT_HWCAP auxiliary vector entry. */
static int os_support_lsx() {
  const int caps = (int)getauxval(AT_HWCAP);

  return (caps & LA_HWCAP_LSX) ? 1 : 0;
}
/*
 * Choose the kernel core (CORE_*) from the processor series and the SIMD
 * levels the OS reports:
 *   LA464 / LA664 series: LASX -> CORE_LA464, else LSX -> CORE_LA264
 *   LA264 / LA364 series: LSX  -> CORE_LA264
 *   anything else, or no SIMD support: CORE_LA64_GENERIC
 */
int get_coretype(void) {
  const uint32_t series = get_prid() & PRID_SERIES_MASK;

  if (series == PRID_SERIES_LA464 || series == PRID_SERIES_LA664) {
    if (os_support_lasx())
      return CORE_LA464;
    return os_support_lsx() ? CORE_LA264 : CORE_LA64_GENERIC;
  }

  if (series == PRID_SERIES_LA264 || series == PRID_SERIES_LA364)
    return os_support_lsx() ? CORE_LA264 : CORE_LA64_GENERIC;

  return CORE_LA64_GENERIC;
}
/* Map the processor series field of the PRID word to a CPU_* index;
 * unrecognised series fall back to CPU_LA64_GENERIC. */
int get_cputype(void) {
  const uint32_t series = get_prid() & PRID_SERIES_MASK;

  if (series == PRID_SERIES_LA264) return CPU_LA264;
  if (series == PRID_SERIES_LA364) return CPU_LA364;
  if (series == PRID_SERIES_LA464) return CPU_LA464;
  if (series == PRID_SERIES_LA664) return CPU_LA664;

  return CPU_LA64_GENERIC;
}
char *get_corename(void) { char *get_corename(void) {
return cpuname[detect()]; return corename[get_coretype()];
}
void get_libname(void){
printf("%s", corename_lower[get_coretype()]);
} }
void get_architecture(void) { void get_architecture(void) {
@ -86,8 +332,7 @@ void get_architecture(void) {
} }
void get_subarchitecture(void) { void get_subarchitecture(void) {
int d = detect(); printf("%s", cpuname[get_cputype()]);
printf("%s", cpuname[d]);
} }
void get_subdirname(void) { void get_subdirname(void) {
@ -95,50 +340,69 @@ void get_subdirname(void) {
} }
void get_cpuconfig(void) { void get_cpuconfig(void) {
uint32_t hwcaps = 0; cache_info_t info;
int d = detect(); uint32_t num_cores = 0;
switch (d) { printf("#define %s\n", corename[get_coretype()]); // Core name
case CPU_LOONGSON3R5:
printf("#define LOONGSON3R5\n");
printf("#define L1_DATA_SIZE 65536\n");
printf("#define L1_DATA_LINESIZE 64\n");
printf("#define L2_SIZE 1048576\n");
printf("#define L2_LINESIZE 64\n");
printf("#define DTB_DEFAULT_ENTRIES 64\n");
printf("#define DTB_SIZE 4096\n");
printf("#define L2_ASSOCIATIVE 16\n");
break;
case CPU_LOONGSON2K1000: printf("#define CPU_NAME %s\n", cpuname[get_cputype()]); // Cpu microarchitecture name
printf("#define LOONGSON2K1000\n");
printf("#define L1_DATA_SIZE 65536\n");
printf("#define L1_DATA_LINESIZE 64\n");
printf("#define L2_SIZE 262144\n");
printf("#define L2_LINESIZE 64\n");
printf("#define DTB_DEFAULT_ENTRIES 64\n");
printf("#define DTB_SIZE 4096\n");
printf("#define L2_ASSOCIATIVE 16\n");
break;
default: get_cacheinfo(CACHE_INFO_L1_IU, &info);
printf("#define LOONGSONGENERIC\n"); if (info.present) {
printf("#define L1_DATA_SIZE 65536\n"); if (info.unify) { // Unified cache, without distinguishing between instructions and data
printf("#define L1_DATA_LINESIZE 64\n"); printf("#define L1_SIZE %d\n", info.size);
printf("#define L2_SIZE 262144\n"); printf("#define L1_ASSOCIATIVE %d\n", info.associative);
printf("#define L2_LINESIZE 64\n"); printf("#define L1_LINESIZE %d\n", info.linesize);
printf("#define DTB_DEFAULT_ENTRIES 64\n"); } else {
printf("#define DTB_SIZE 4096\n"); printf("#define L1_CODE_SIZE %d\n", info.size);
printf("#define L2_ASSOCIATIVE 16\n"); printf("#define L1_CODE_ASSOCIATIVE %d\n", info.associative);
break; printf("#define L1_CODE_LINESIZE %d\n", info.linesize);
}
} }
hwcaps = (uint32_t)getauxval( AT_HWCAP ); if (!info.unify) {
if (hwcaps & LA_HWCAP_LSX) printf("#define HAVE_LSX\n"); get_cacheinfo(CACHE_INFO_L1_D, &info);
if (hwcaps & LA_HWCAP_LASX) printf("#define HAVE_LASX\n"); if (info.present) {
} printf("#define L1_DATA_SIZE %d\n", info.size);
printf("#define L1_DATA_ASSOCIATIVE %d\n", info.associative);
printf("#define L1_DATA_LINESIZE %d\n", info.linesize);
}
}
void get_libname(void){ get_cacheinfo(CACHE_INFO_L2_IU, &info);
int d = detect(); if (info.present > 0) {
printf("%s", cpuname_lower[d]); if (info.unify) {
printf("#define L2_SIZE %d\n", info.size);
printf("#define L2_ASSOCIATIVE %d\n", info.associative);
printf("#define L2_LINESIZE %d\n", info.linesize);
} else {
printf("#define L2_CODE_SIZE %d\n", info.size);
printf("#define L2_CODE_ASSOCIATIVE %d\n", info.associative);
printf("#define L2_CODE_LINESIZE %d\n", info.linesize);
}
}
get_cacheinfo(CACHE_INFO_L3_IU, &info);
if (info.present > 0) {
if (info.unify) {
printf("#define L3_SIZE %d\n", info.size);
printf("#define L3_ASSOCIATIVE %d\n", info.associative);
printf("#define L3_LINESIZE %d\n", info.linesize);
} else {
printf("#define L3_CODE_SIZE %d\n", info.size);
printf("#define L3_CODE_ASSOCIATIVE %d\n", info.associative);
printf("#define L3_CODE_LINESIZE %d\n", info.linesize);
}
}
/* Call the probes: a bare function name evaluates to its (non-null) address,
 * which would make both defines unconditional. */
if (os_support_lsx()) printf("#define HAVE_LSX\n");
if (os_support_lasx()) printf("#define HAVE_LASX\n");
get_cpucount(&num_cores);
if (num_cores)
printf("#define NUM_CORES %d\n", num_cores);
//TODO: It's unclear what this entry represents, but it is indeed necessary.
//It has been set by reference to other platforms.
printf("#define DTB_DEFAULT_ENTRIES 64\n");
} }

View File

@ -54,6 +54,8 @@ if (DYNAMIC_ARCH)
list(APPEND COMMON_SOURCES dynamic_power.c) list(APPEND COMMON_SOURCES dynamic_power.c)
elseif (RISCV64) elseif (RISCV64)
list(APPEND COMMON_SOURCES dynamic_riscv64.c detect_riscv64.c) list(APPEND COMMON_SOURCES dynamic_riscv64.c detect_riscv64.c)
elseif (LOONGARCH64)
list(APPEND COMMON_SOURCES dynamic_loongarch64.c)
else () else ()
list(APPEND COMMON_SOURCES dynamic.c) list(APPEND COMMON_SOURCES dynamic.c)
endif () endif ()

View File

@ -1082,7 +1082,7 @@ if (buffer == NULL) {
} }
//For target LOONGSON3R5, applying an offset to the buffer is essential //For LOONGARCH64, applying an offset to the buffer is essential
//for minimizing cache conflicts and optimizing performance. //for minimizing cache conflicts and optimizing performance.
#if defined(ARCH_LOONGARCH64) && !defined(NO_AFFINITY) #if defined(ARCH_LOONGARCH64) && !defined(NO_AFFINITY)
if (sa == NULL) sa = (void *)((BLASLONG)buffer + (WhereAmI() & 0xf) * GEMM_OFFSET_A); if (sa == NULL) sa = (void *)((BLASLONG)buffer + (WhereAmI() & 0xf) * GEMM_OFFSET_A);

View File

@ -28,25 +28,36 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include <sys/auxv.h> #include <sys/auxv.h>
#include "common.h" #include "common.h"
extern gotoblas_t gotoblas_LOONGSON3R5; #define NUM_CORETYPES 6
extern gotoblas_t gotoblas_LOONGSON2K1000; #define LOONGARCH_CFG0 0x00
extern gotoblas_t gotoblas_LOONGSONGENERIC; #define LA_HWCAP_LSX (1U << 4)
#define LA_HWCAP_LASX (1U << 5)
#define PRID_SERIES_MASK 0xf000
#define PRID_SERIES_LA264 0xa000
#define PRID_SERIES_LA364 0xb000
#define PRID_SERIES_LA464 0xc000
#define PRID_SERIES_LA664 0xd000
extern gotoblas_t gotoblas_LA64_GENERIC;
extern gotoblas_t gotoblas_LA264;
extern gotoblas_t gotoblas_LA464;
extern void openblas_warning(int verbose, const char * msg); extern void openblas_warning(int verbose, const char * msg);
#define NUM_CORETYPES 3
static char *corename[] = { static char *corename[] = {
"loongson3r5", "la64_generic",
"loongson2k1000", "la264",
"la464",
"loongsongeneric", "loongsongeneric",
"loongson2k1000",
"loongson3r5",
"unknown" "unknown"
}; };
char *gotoblas_corename(void) { char *gotoblas_corename(void) {
if (gotoblas == &gotoblas_LOONGSON3R5) return corename[0]; if (gotoblas == &gotoblas_LA64_GENERIC) return corename[0];
if (gotoblas == &gotoblas_LOONGSON2K1000) return corename[1]; if (gotoblas == &gotoblas_LA264) return corename[1];
if (gotoblas == &gotoblas_LOONGSONGENERIC) return corename[2]; if (gotoblas == &gotoblas_LA464) return corename[2];
return corename[NUM_CORETYPES]; return corename[NUM_CORETYPES];
} }
@ -66,27 +77,78 @@ static gotoblas_t *force_coretype(char *coretype) {
switch (found) switch (found)
{ {
case 0: return (&gotoblas_LOONGSON3R5); case 0: return (&gotoblas_LA64_GENERIC);
case 1: return (&gotoblas_LOONGSON2K1000); case 1: return (&gotoblas_LA264);
case 2: return (&gotoblas_LOONGSONGENERIC); case 2: return (&gotoblas_LA464);
case 3: return (&gotoblas_LA64_GENERIC);
case 4: return (&gotoblas_LA264);
case 5: return (&gotoblas_LA464);
} }
snprintf(message, 128, "Core not found: %s\n", coretype); snprintf(message, 128, "Core not found: %s\n", coretype);
openblas_warning(1, message); openblas_warning(1, message);
return NULL; return NULL;
} }
#define LA_HWCAP_LSX (1U << 4)
#define LA_HWCAP_LASX (1U << 5)
static gotoblas_t *get_coretype(void) { /* Detect whether the OS supports the LASX instruction set */
int hwcap = (int)getauxval(AT_HWCAP); static int os_support_lasx() {
int hwcap = (int)getauxval(AT_HWCAP);
if (hwcap & LA_HWCAP_LASX) if (hwcap & LA_HWCAP_LASX)
return &gotoblas_LOONGSON3R5; return 1;
else if (hwcap & LA_HWCAP_LSX)
return &gotoblas_LOONGSON2K1000;
else else
return &gotoblas_LOONGSONGENERIC; return 0;
}
/* Report (1/0) whether the LSX bit is set in the AT_HWCAP auxiliary vector entry. */
static int os_support_lsx() {
  const int caps = (int)getauxval(AT_HWCAP);

  return (caps & LA_HWCAP_LSX) ? 1 : 0;
}
/* Return configuration word LOONGARCH_CFG0 as read by `cpucfg`;
 * callers mask it with PRID_SERIES_MASK to identify the processor series. */
static uint32_t get_prid() {
  uint32_t prid = 0;

  __asm__ volatile (
    "cpucfg %0, %1 \n\t"
    : "+&r"(prid)
    : "r"(LOONGARCH_CFG0)
  );

  return prid;
}
/*
 * Pick the kernel table at runtime from the processor series and the SIMD
 * levels the OS reports:
 *   LA464 / LA664 series: LASX -> gotoblas_LA464, else LSX -> gotoblas_LA264
 *   LA264 / LA364 series: LSX  -> gotoblas_LA264
 *   anything else, or no SIMD support: gotoblas_LA64_GENERIC
 */
static gotoblas_t *get_coretype(void) {
  const uint32_t series = get_prid() & PRID_SERIES_MASK;

  if (series == PRID_SERIES_LA464 || series == PRID_SERIES_LA664) {
    if (os_support_lasx())
      return &gotoblas_LA464;
    return os_support_lsx() ? &gotoblas_LA264 : &gotoblas_LA64_GENERIC;
  }

  if (series == PRID_SERIES_LA264 || series == PRID_SERIES_LA364)
    return os_support_lsx() ? &gotoblas_LA264 : &gotoblas_LA64_GENERIC;

  return &gotoblas_LA64_GENERIC;
}
void gotoblas_dynamic_init(void) { void gotoblas_dynamic_init(void) {

View File

@ -752,7 +752,7 @@ int get_L3_size() {
} }
void blas_set_parameter(void){ void blas_set_parameter(void){
#if defined(LOONGSON3R5) #if defined(LA464)
int L3_size = get_L3_size(); int L3_size = get_L3_size();
#ifdef SMP #ifdef SMP
if(blas_num_threads == 1){ if(blas_num_threads == 1){

View File

@ -135,11 +135,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
/* #define FORCE_CELL */ /* #define FORCE_CELL */
/* #define FORCE_MIPS64_GENERIC */ /* #define FORCE_MIPS64_GENERIC */
/* #define FORCE_SICORTEX */ /* #define FORCE_SICORTEX */
/* #define FORCE_LOONGSON3R3 */ /* #define FORCE_LOONGSON3R3 */
/* #define FORCE_LOONGSON3R4 */ /* #define FORCE_LOONGSON3R4 */
/* #define FORCE_LOONGSON3R5 */ /* #define FORCE_LOONGSON3R5 */
/* #define FORCE_LOONGSON2K1000 */ /* #define FORCE_LOONGSON2K1000 */
/* #define FORCE_LOONGSONGENERIC */ /* #define FORCE_LOONGSONGENERIC */
/* #define FORCE_LA64_GENERIC */
/* #define FORCE_LA264 */
/* #define FORCE_LA464 */
/* #define FORCE_I6400 */ /* #define FORCE_I6400 */
/* #define FORCE_P6600 */ /* #define FORCE_P6600 */
/* #define FORCE_P5600 */ /* #define FORCE_P5600 */
@ -153,7 +156,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
/* #define FORCE_EV5 */ /* #define FORCE_EV5 */
/* #define FORCE_EV6 */ /* #define FORCE_EV6 */
/* #define FORCE_CSKY */ /* #define FORCE_CSKY */
/* #define FORCE_CK860FV */ /* #define FORCE_CK860FV */
/* #define FORCE_GENERIC */ /* #define FORCE_GENERIC */
#ifdef FORCE_P2 #ifdef FORCE_P2
@ -979,46 +982,76 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#else #else
#endif #endif
#ifdef FORCE_LOONGSON3R5 #if defined(FORCE_LA464) || defined(FORCE_LOONGSON3R5)
#define FORCE #define FORCE
#define ARCHITECTURE "LOONGARCH" #define ARCHITECTURE "LOONGARCH"
#define SUBARCHITECTURE "LOONGSON3R5" #ifdef NO_LASX
#ifdef NO_LSX
#define SUBARCHITECTURE "LA64_GENERIC"
#define SUBDIRNAME "loongarch64" #define SUBDIRNAME "loongarch64"
#define ARCHCONFIG "-DLOONGSON3R5 " \ #define ARCHCONFIG "-DLA64_GENERIC " \
"-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=64 " \
"-DL2_SIZE=1048576 -DL2_LINESIZE=64 " \
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=16 -DHAVE_MSA"
#define LIBNAME "loongson3r5"
#define CORENAME "LOONGSON3R5"
#else
#endif
#ifdef FORCE_LOONGSON2K1000
#define FORCE
#define ARCHITECTURE "LOONGARCH"
#define SUBARCHITECTURE "LOONGSON2K1000"
#define SUBDIRNAME "loongarch64"
#define ARCHCONFIG "-DLOONGSON2K1000 " \
"-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=64 " \ "-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=64 " \
"-DL2_SIZE=262144 -DL2_LINESIZE=64 " \ "-DL2_SIZE=262144 -DL2_LINESIZE=64 " \
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=16 -DHAVE_MSA" "-DDTB_DEFAULT_ENTRIES=64 "
#define LIBNAME "loongson2k1000" #define LIBNAME "la64_generic"
#define CORENAME "LOONGSON2K1000" #define CORENAME "LA64_GENERIC"
#else #else
#endif #define SUBARCHITECTURE "LA264"
#ifdef FORCE_LOONGSONGENERIC
#define FORCE
#define ARCHITECTURE "LOONGARCH"
#define SUBARCHITECTURE "LOONGSONGENERIC"
#define SUBDIRNAME "loongarch64" #define SUBDIRNAME "loongarch64"
#define ARCHCONFIG "-DLOONGSONGENERIC " \ #define ARCHCONFIG "-DLA264 " \
"-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=64 " \ "-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=64 " \
"-DL2_SIZE=262144 -DL2_LINESIZE=64 " \ "-DL2_SIZE=262144 -DL2_LINESIZE=64 " \
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=16 -DHAVE_MSA" "-DDTB_DEFAULT_ENTRIES=64 "
#define LIBNAME "loongsongeneric" #define LIBNAME "la264"
#define CORENAME "LOONGSONGENERIC" #define CORENAME "LA264"
#endif
#else #else
#define SUBARCHITECTURE "LA464"
#define SUBDIRNAME "loongarch64"
#define ARCHCONFIG "-DLA464 " \
"-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=64 " \
"-DL2_SIZE=262144 -DL2_LINESIZE=64 " \
"-DDTB_DEFAULT_ENTRIES=64 "
#define LIBNAME "la464"
#define CORENAME "LA464"
#endif
#endif
#if defined(FORCE_LA264) || defined(FORCE_LOONGSON2K1000)
#define FORCE
#define ARCHITECTURE "LOONGARCH"
#ifdef NO_LSX
#define SUBARCHITECTURE "LA64_GENERIC"
#define SUBDIRNAME "loongarch64"
#define ARCHCONFIG "-DLA64_GENERIC " \
"-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=64 " \
"-DL2_SIZE=262144 -DL2_LINESIZE=64 " \
"-DDTB_DEFAULT_ENTRIES=64 "
#define LIBNAME "la64_generic"
#define CORENAME "LA64_GENERIC"
#else
#define SUBARCHITECTURE "LA264"
#define SUBDIRNAME "loongarch64"
#define ARCHCONFIG "-DLA264 " \
"-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=64 " \
"-DL2_SIZE=262144 -DL2_LINESIZE=64 " \
"-DDTB_DEFAULT_ENTRIES=64 "
#define LIBNAME "la264"
#define CORENAME "LA264"
#endif
#endif
#if defined(FORCE_LA64_GENERIC) || defined(FORCE_LOONGSONGENERIC)
#define FORCE
#define ARCHITECTURE "LOONGARCH"
#define SUBARCHITECTURE "LA64_GENERIC"
#define SUBDIRNAME "loongarch64"
#define ARCHCONFIG "-DLA64_GENERIC " \
"-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=64 " \
"-DL2_SIZE=262144 -DL2_LINESIZE=64 " \
"-DDTB_DEFAULT_ENTRIES=64 "
#define LIBNAME "la64_generic"
#define CORENAME "LA64_GENERIC"
#endif #endif
#ifdef FORCE_I6400 #ifdef FORCE_I6400

View File

@ -572,7 +572,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS
buffer = (XFLOAT *)blas_memory_alloc(0); buffer = (XFLOAT *)blas_memory_alloc(0);
//For target LOONGSON3R5, applying an offset to the buffer is essential //For LOONGARCH64, applying an offset to the buffer is essential
//for minimizing cache conflicts and optimizing performance. //for minimizing cache conflicts and optimizing performance.
#if defined(ARCH_LOONGARCH64) && !defined(NO_AFFINITY) #if defined(ARCH_LOONGARCH64) && !defined(NO_AFFINITY)
sa = (XFLOAT *)((BLASLONG)buffer + (WhereAmI() & 0xf) * GEMM_OFFSET_A); sa = (XFLOAT *)((BLASLONG)buffer + (WhereAmI() & 0xf) * GEMM_OFFSET_A);

View File

@ -0,0 +1,6 @@
include $(KERNELDIR)/KERNEL
STRMMKERNEL = gemm_kernel.S
DTRMMKERNEL = gemm_kernel.S
CTRMMKERNEL = zgemm_kernel.S
ZTRMMKERNEL = zgemm_kernel.S

View File

@ -1086,7 +1086,7 @@ static void init_parameter(void) {
TABLE_NAME.sbgemm_r = SBGEMM_DEFAULT_R; TABLE_NAME.sbgemm_r = SBGEMM_DEFAULT_R;
#endif #endif
#if defined(LOONGSON3R5) #if defined(LA464)
int L3_size = get_L3_size(); int L3_size = get_L3_size();
#ifdef SMP #ifdef SMP
if(blas_num_threads == 1){ if(blas_num_threads == 1){

View File

@ -4,4 +4,4 @@ Version: ${version}
URL: https://github.com/xianyi/OpenBLAS URL: https://github.com/xianyi/OpenBLAS
Libs: -L${libdir} -l${libprefix}openblas${libnamesuffix} Libs: -L${libdir} -l${libprefix}openblas${libnamesuffix}
Libs.private: ${extralib} Libs.private: ${extralib}
Cflags: -I${includedir} Cflags: -I${includedir} ${omp_opt}

View File

@ -2838,7 +2838,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define SYMV_P 16 #define SYMV_P 16
#endif #endif
#if defined (LOONGSON3R5) #if defined (LA464)
#define SNUMOPT 2 #define SNUMOPT 2
#define DNUMOPT 2 #define DNUMOPT 2
@ -2891,7 +2891,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define SYMV_P 16 #define SYMV_P 16
#endif #endif
#ifdef LOONGSON2K1000 #ifdef LA264
#define GEMM_DEFAULT_OFFSET_A 0 #define GEMM_DEFAULT_OFFSET_A 0
#define GEMM_DEFAULT_OFFSET_B 0 #define GEMM_DEFAULT_OFFSET_B 0
#define GEMM_DEFAULT_ALIGN (BLASLONG)0x03fffUL #define GEMM_DEFAULT_ALIGN (BLASLONG)0x03fffUL
@ -2926,7 +2926,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define SYMV_P 16 #define SYMV_P 16
#endif #endif
#ifdef LOONGSONGENERIC #ifdef LA64_GENERIC
#define GEMM_DEFAULT_OFFSET_A 0 #define GEMM_DEFAULT_OFFSET_A 0
#define GEMM_DEFAULT_OFFSET_B 0 #define GEMM_DEFAULT_OFFSET_B 0
#define GEMM_DEFAULT_ALIGN (BLASLONG)0x03fffUL #define GEMM_DEFAULT_ALIGN (BLASLONG)0x03fffUL