loongarch: Optimizing the performance of the GEMM on servers
This commit is contained in:
parent
b1e8ba5017
commit
d8c4ea8793
|
@ -96,6 +96,32 @@ static inline int WhereAmI(void){
|
|||
}
|
||||
#endif
|
||||
|
||||
/*
 * Read the CPU model string from the first usable "model name" line of
 * /proc/cpuinfo.
 *
 * model_name: caller-supplied destination buffer. It must be able to hold
 *             CPU_MODEL_NAME_MAX (128) bytes, which is the buffer size the
 *             callers of this helper declare. On return it always holds a
 *             NUL-terminated string; the string is empty whenever no model
 *             name could be read, so callers may safely run strstr() on it
 *             even if they ignore the return value.
 *
 * Returns 1 when a "model name" line with a ':' separator was found,
 * 0 otherwise (NULL buffer, /proc/cpuinfo unreadable, or no such line).
 */
static inline int get_cpu_model(char *model_name) {
  enum { CPU_MODEL_NAME_MAX = 128 };

  if (model_name == NULL) {
    return 0;
  }
  // Never leave the caller's buffer uninitialised on the failure paths.
  model_name[0] = '\0';

  FILE *cpuinfo_file = fopen("/proc/cpuinfo", "r");
  if (!cpuinfo_file) {
    return 0;
  }

  int found = 0;
  char line[1024];
  while (fgets(line, sizeof(line), cpuinfo_file)) {
    if (!strstr(line, "model name")) {
      continue;
    }

    // Locate the key/value separator without strtok(): strtok() keeps hidden
    // static state and returns NULL when the separator is missing.
    char *value = strchr(line, ':');
    if (!value) {
      continue;
    }
    value++;
    while (*value == ' ' || *value == '\t') {
      value++;
    }

    // The value ends at the line terminator; clamp it to the buffer size the
    // callers provide so a long line cannot overflow model_name.
    size_t length = strcspn(value, "\r\n");
    if (length >= (size_t)CPU_MODEL_NAME_MAX) {
      length = (size_t)CPU_MODEL_NAME_MAX - 1;
    }
    memcpy(model_name, value, length);
    model_name[length] = '\0';

    found = 1;
    break;
  }

  fclose(cpuinfo_file);
  return found;
}
|
||||
|
||||
#ifdef DOUBLE
|
||||
#define GET_IMAGE(res) __asm__ __volatile__("fmov.d %0, $f2" : "=f"(res) : : "memory")
|
||||
#else
|
||||
|
|
|
@ -484,6 +484,14 @@ blas_queue_t *tscq;
|
|||
main_status[cpu] = MAIN_RUNNING1;
|
||||
#endif
|
||||
|
||||
//For Loongson servers, like the 3C5000 (featuring 16 cores), applying an
|
||||
//offset to the buffer is essential for minimizing cache conflicts and optimizing performance.
|
||||
#if defined(LOONGSON3R5) && !defined(NO_AFFINITY)
|
||||
char model_name[128];
|
||||
get_cpu_model(model_name);
|
||||
if ((strstr(model_name, "3C5000") != NULL) || (strstr(model_name, "3D5000") != NULL))
|
||||
if (sa == NULL) sa = (void *)((BLASLONG)buffer + (WhereAmI() & 0xf) * GEMM_OFFSET_A);
|
||||
#endif
|
||||
if (sa == NULL) sa = (void *)((BLASLONG)buffer + GEMM_OFFSET_A);
|
||||
|
||||
if (sb == NULL) {
|
||||
|
@ -1006,7 +1014,7 @@ void goto_set_num_threads(int num_threads) {
|
|||
|
||||
blas_cpu_number = num_threads;
|
||||
|
||||
#if defined(ARCH_MIPS64)
|
||||
#if defined(ARCH_MIPS64) || defined(ARCH_LOONGARCH64)
|
||||
#ifndef DYNAMIC_ARCH
|
||||
//set parameters for different number of threads.
|
||||
blas_set_parameter();
|
||||
|
|
|
@ -113,7 +113,7 @@ void goto_set_num_threads(int num_threads) {
|
|||
blas_cpu_number = num_threads;
|
||||
|
||||
adjust_thread_buffers();
|
||||
#if defined(ARCH_MIPS64)
|
||||
#if defined(ARCH_MIPS64) || defined(ARCH_LOONGARCH64)
|
||||
//set parameters for different number of threads.
|
||||
blas_set_parameter();
|
||||
#endif
|
||||
|
|
|
@ -1219,7 +1219,7 @@ UNLOCK_COMMAND(&alloc_lock);
|
|||
if (!blas_num_threads) blas_cpu_number = blas_get_cpu_number();
|
||||
#endif
|
||||
|
||||
#if defined(ARCH_X86) || defined(ARCH_X86_64) || defined(ARCH_IA64) || defined(ARCH_MIPS64) || defined(ARCH_ARM64)
|
||||
#if defined(ARCH_X86) || defined(ARCH_X86_64) || defined(ARCH_IA64) || defined(ARCH_MIPS64) || defined(ARCH_ARM64) || defined(ARCH_LOONGARCH64)
|
||||
#ifndef DYNAMIC_ARCH
|
||||
blas_set_parameter();
|
||||
#endif
|
||||
|
@ -2814,7 +2814,7 @@ void *blas_memory_alloc(int procpos){
|
|||
if (!blas_num_threads) blas_cpu_number = blas_get_cpu_number();
|
||||
#endif
|
||||
|
||||
#if defined(ARCH_X86) || defined(ARCH_X86_64) || defined(ARCH_IA64) || defined(ARCH_MIPS64) || defined(ARCH_ARM64)
|
||||
#if defined(ARCH_X86) || defined(ARCH_X86_64) || defined(ARCH_IA64) || defined(ARCH_MIPS64) || defined(ARCH_ARM64) || defined(ARCH_LOONGARCH64)
|
||||
#ifndef DYNAMIC_ARCH
|
||||
blas_set_parameter();
|
||||
#endif
|
||||
|
|
|
@ -739,6 +739,100 @@ void blas_set_parameter(void){
|
|||
}
|
||||
#endif
|
||||
|
||||
#if defined(ARCH_LOONGARCH64)
|
||||
/*
 * Return the L3 cache size in MB, decoded from CPUCFG configuration word 0x14.
 *
 * The word is split into three fields: the low 16 bits plus one, and two
 * power-of-two exponents taken from bits 23:16 and bits 30:24. Their product
 * is scaled down by 1024 * 1024 and converted to int on return.
 * NOTE(review): presumably these fields are ways, sets-per-way and line size
 * as described in the LoongArch manual -- confirm against the CPUCFG spec.
 */
int get_L3_size() {
  int cfg_word = 0;
  const int cfg_index = 0x14;

  __asm__ volatile (
    "cpucfg %[word], %[index]"
    : [word]"=r"(cfg_word)
    : [index]"r"(cfg_index)
    : "memory"
  );

  const int count_field = (cfg_word & 0xffff) + 1;
  const int exponent_a  = (cfg_word >> 16) & 0xff;
  const int exponent_b  = (cfg_word >> 24) & 0x7f;

  // Arithmetic is carried out in double (pow) and truncated by the return.
  return count_field * pow(2, exponent_a) * pow(2, exponent_b) / 1024 / 1024; // MB
}
|
||||
|
||||
/*
 * Choose the run-time GEMM blocking parameters (P, Q and R for the s/d/c/z
 * kernels). The function only assigns anything when built for LOONGSON3R5.
 *
 * Two inputs drive the choice:
 *   - whether get_L3_size() reports exactly 32 MB (3C5000 and 3D5000) or
 *     anything else (3A5000 and 3C5000L);
 *   - under SMP, whether blas_num_threads is 1 (single thread) or not
 *     (multi thread). Without SMP the single-thread values are always used.
 */
void blas_set_parameter(void){
#if defined(LOONGSON3R5)
  const int server_l3 = (get_L3_size() == 32);

  int multi_thread = 0;
#ifdef SMP
  multi_thread = (blas_num_threads != 1);
#endif

  // P and Q blocking that is the same in every configuration.
  sgemm_p = 256;
  sgemm_q = 384;

  dgemm_p = 112;

  cgemm_p = 128;
  cgemm_q = 256;

  zgemm_p = 128;
  zgemm_q = 128;

  // dgemm_q depends on the L3 size only, not on the thread count.
  dgemm_q = server_l3 ? 289 : 300;

  // R blocking depends on both the L3 size and the threading mode.
  if (!multi_thread) {
    //single thread
    sgemm_r = server_l3 ? 8192 : 4096;
    dgemm_r = server_l3 ? 4096 : 3024;
    cgemm_r = server_l3 ? 4096 : 2048;
    zgemm_r = server_l3 ? 2048 : 1024;
  } else {
    //multi thread
    sgemm_r = server_l3 ? 1024 : 2048;
    dgemm_r = server_l3 ? 342  : 738;
    cgemm_r = server_l3 ? 512  : 1024;
    zgemm_r = server_l3 ? 512  : 1024;
  }
#endif
}
|
||||
#endif
|
||||
|
||||
#if defined(ARCH_ARM64)
|
||||
|
||||
void blas_set_parameter(void)
|
||||
|
|
|
@ -521,7 +521,18 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS
|
|||
|
||||
buffer = (XFLOAT *)blas_memory_alloc(0);
|
||||
|
||||
//For Loongson servers, like the 3C5000 (featuring 16 cores), applying an
|
||||
//offset to the buffer is essential for minimizing cache conflicts and optimizing performance.
|
||||
#if defined(LOONGSON3R5) && !defined(NO_AFFINITY)
|
||||
char model_name[128];
|
||||
get_cpu_model(model_name);
|
||||
if ((strstr(model_name, "3C5000") != NULL) || (strstr(model_name, "3D5000") != NULL))
|
||||
sa = (XFLOAT *)((BLASLONG)buffer + (WhereAmI() & 0xf) * GEMM_OFFSET_A);
|
||||
else
|
||||
sa = (XFLOAT *)((BLASLONG)buffer + GEMM_OFFSET_A);
|
||||
#else
|
||||
sa = (XFLOAT *)((BLASLONG)buffer +GEMM_OFFSET_A);
|
||||
#endif
|
||||
sb = (XFLOAT *)(((BLASLONG)sa + ((GEMM_P * GEMM_Q * COMPSIZE * SIZE + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B);
|
||||
|
||||
#ifdef SMP
|
||||
|
|
|
@ -1066,31 +1066,123 @@ static void init_parameter(void) {
|
|||
}
|
||||
#else // (ARCH_MIPS64)
|
||||
#if (ARCH_LOONGARCH64)
|
||||
/*
 * Return the L3 cache size in MB, decoded from CPUCFG configuration word 0x14.
 *
 * The word is split into three fields: the low 16 bits plus one, and two
 * power-of-two exponents taken from bits 23:16 and bits 30:24. Their product
 * is scaled down by 1024 * 1024 and converted to int on return.
 * NOTE(review): presumably these fields are ways, sets-per-way and line size
 * as described in the LoongArch manual -- confirm against the CPUCFG spec.
 */
static int get_L3_size() {
  int cfg_word = 0;
  const int cfg_index = 0x14;

  __asm__ volatile (
    "cpucfg %[word], %[index]"
    : [word]"=r"(cfg_word)
    : [index]"r"(cfg_index)
    : "memory"
  );

  const int count_field = (cfg_word & 0xffff) + 1;
  const int exponent_a  = (cfg_word >> 16) & 0xff;
  const int exponent_b  = (cfg_word >> 24) & 0x7f;

  // Arithmetic is carried out in double (pow) and truncated by the return.
  return count_field * pow(2, exponent_a) * pow(2, exponent_b) / 1024 / 1024; // MB
}
|
||||
static void init_parameter(void) {
|
||||
|
||||
#ifdef BUILD_BFLOAT16
|
||||
TABLE_NAME.sbgemm_p = SBGEMM_DEFAULT_P;
|
||||
#endif
|
||||
|
||||
#ifdef BUILD_BFLOAT16
|
||||
TABLE_NAME.sbgemm_r = SBGEMM_DEFAULT_R;
|
||||
#endif
|
||||
|
||||
#if defined(LOONGSON3R5)
|
||||
int L3_size = get_L3_size();
|
||||
#ifdef SMP
|
||||
if(blas_num_threads == 1){
|
||||
#endif
|
||||
//single thread
|
||||
if (L3_size == 32){ // 3C5000 and 3D5000
|
||||
TABLE_NAME.sgemm_p = 256;
|
||||
TABLE_NAME.sgemm_q = 384;
|
||||
TABLE_NAME.sgemm_r = 8192;
|
||||
|
||||
TABLE_NAME.dgemm_p = 112;
|
||||
TABLE_NAME.dgemm_q = 289;
|
||||
TABLE_NAME.dgemm_r = 4096;
|
||||
|
||||
TABLE_NAME.cgemm_p = 128;
|
||||
TABLE_NAME.cgemm_q = 256;
|
||||
TABLE_NAME.cgemm_r = 4096;
|
||||
|
||||
TABLE_NAME.zgemm_p = 128;
|
||||
TABLE_NAME.zgemm_q = 128;
|
||||
TABLE_NAME.zgemm_r = 2048;
|
||||
} else { // 3A5000 and 3C5000L
|
||||
TABLE_NAME.sgemm_p = 256;
|
||||
TABLE_NAME.sgemm_q = 384;
|
||||
TABLE_NAME.sgemm_r = 4096;
|
||||
|
||||
TABLE_NAME.dgemm_p = 112;
|
||||
TABLE_NAME.dgemm_q = 300;
|
||||
TABLE_NAME.dgemm_r = 3024;
|
||||
|
||||
TABLE_NAME.cgemm_p = 128;
|
||||
TABLE_NAME.cgemm_q = 256;
|
||||
TABLE_NAME.cgemm_r = 2048;
|
||||
|
||||
TABLE_NAME.zgemm_p = 128;
|
||||
TABLE_NAME.zgemm_q = 128;
|
||||
TABLE_NAME.zgemm_r = 1024;
|
||||
}
|
||||
#ifdef SMP
|
||||
}else{
|
||||
//multi thread
|
||||
if (L3_size == 32){ // 3C5000 and 3D5000
|
||||
TABLE_NAME.sgemm_p = 256;
|
||||
TABLE_NAME.sgemm_q = 384;
|
||||
TABLE_NAME.sgemm_r = 1024;
|
||||
|
||||
TABLE_NAME.dgemm_p = 112;
|
||||
TABLE_NAME.dgemm_q = 289;
|
||||
TABLE_NAME.dgemm_r = 342;
|
||||
|
||||
TABLE_NAME.cgemm_p = 128;
|
||||
TABLE_NAME.cgemm_q = 256;
|
||||
TABLE_NAME.cgemm_r = 512;
|
||||
|
||||
TABLE_NAME.zgemm_p = 128;
|
||||
TABLE_NAME.zgemm_q = 128;
|
||||
TABLE_NAME.zgemm_r = 512;
|
||||
} else { // 3A5000 and 3C5000L
|
||||
TABLE_NAME.sgemm_p = 256;
|
||||
TABLE_NAME.sgemm_q = 384;
|
||||
TABLE_NAME.sgemm_r = 2048;
|
||||
|
||||
TABLE_NAME.dgemm_p = 112;
|
||||
TABLE_NAME.dgemm_q = 300;
|
||||
TABLE_NAME.dgemm_r = 738;
|
||||
|
||||
TABLE_NAME.cgemm_p = 128;
|
||||
TABLE_NAME.cgemm_q = 256;
|
||||
TABLE_NAME.cgemm_r = 1024;
|
||||
|
||||
TABLE_NAME.zgemm_p = 128;
|
||||
TABLE_NAME.zgemm_q = 128;
|
||||
TABLE_NAME.zgemm_r = 1024;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#else
|
||||
TABLE_NAME.sgemm_p = SGEMM_DEFAULT_P;
|
||||
TABLE_NAME.dgemm_p = DGEMM_DEFAULT_P;
|
||||
TABLE_NAME.cgemm_p = CGEMM_DEFAULT_P;
|
||||
TABLE_NAME.zgemm_p = ZGEMM_DEFAULT_P;
|
||||
|
||||
#ifdef BUILD_BFLOAT16
|
||||
TABLE_NAME.sbgemm_r = SBGEMM_DEFAULT_R;
|
||||
#endif
|
||||
TABLE_NAME.sgemm_r = SGEMM_DEFAULT_R;
|
||||
TABLE_NAME.dgemm_r = DGEMM_DEFAULT_R;
|
||||
TABLE_NAME.cgemm_r = CGEMM_DEFAULT_R;
|
||||
TABLE_NAME.zgemm_r = ZGEMM_DEFAULT_R;
|
||||
|
||||
#ifdef BUILD_BFLOAT16
|
||||
TABLE_NAME.sbgemm_q = SBGEMM_DEFAULT_Q;
|
||||
#endif
|
||||
TABLE_NAME.sgemm_q = SGEMM_DEFAULT_Q;
|
||||
TABLE_NAME.dgemm_q = DGEMM_DEFAULT_Q;
|
||||
TABLE_NAME.cgemm_q = CGEMM_DEFAULT_Q;
|
||||
TABLE_NAME.zgemm_q = ZGEMM_DEFAULT_Q;
|
||||
|
||||
TABLE_NAME.sgemm_r = SGEMM_DEFAULT_R;
|
||||
TABLE_NAME.dgemm_r = DGEMM_DEFAULT_R;
|
||||
TABLE_NAME.cgemm_r = CGEMM_DEFAULT_R;
|
||||
TABLE_NAME.zgemm_r = ZGEMM_DEFAULT_R;
|
||||
#endif
|
||||
|
||||
#ifdef BUILD_BFLOAT16
|
||||
TABLE_NAME.sbgemm_q = SBGEMM_DEFAULT_Q;
|
||||
#endif
|
||||
}
|
||||
#else // (ARCH_LOONGARCH64)
|
||||
#if (ARCH_POWER)
|
||||
|
|
20
param.h
20
param.h
|
@ -2836,7 +2836,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#define SNUMOPT 2
|
||||
#define DNUMOPT 2
|
||||
|
||||
#define GEMM_DEFAULT_OFFSET_A 0
|
||||
#define GEMM_DEFAULT_OFFSET_A 0x20000
|
||||
#define GEMM_DEFAULT_OFFSET_B 0
|
||||
#define GEMM_DEFAULT_ALIGN 0x0ffffUL
|
||||
|
||||
|
@ -2866,20 +2866,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#define QGEMM_DEFAULT_UNROLL_M 2
|
||||
#define XGEMM_DEFAULT_UNROLL_M 1
|
||||
|
||||
#define SGEMM_DEFAULT_P 256
|
||||
#define DGEMM_DEFAULT_P 32
|
||||
#define SGEMM_DEFAULT_P sgemm_p
|
||||
#define DGEMM_DEFAULT_P dgemm_p
|
||||
#define CGEMM_DEFAULT_P 128
|
||||
#define ZGEMM_DEFAULT_P 128
|
||||
#define ZGEMM_DEFAULT_P zgemm_p
|
||||
|
||||
#define SGEMM_DEFAULT_R 1024
|
||||
#define DGEMM_DEFAULT_R 858
|
||||
#define SGEMM_DEFAULT_R sgemm_r
|
||||
#define DGEMM_DEFAULT_R dgemm_r
|
||||
#define CGEMM_DEFAULT_R 4096
|
||||
#define ZGEMM_DEFAULT_R 4096
|
||||
#define ZGEMM_DEFAULT_R zgemm_r
|
||||
|
||||
#define SGEMM_DEFAULT_Q 256
|
||||
#define DGEMM_DEFAULT_Q 152
|
||||
#define SGEMM_DEFAULT_Q sgemm_q
|
||||
#define DGEMM_DEFAULT_Q dgemm_q
|
||||
#define CGEMM_DEFAULT_Q 128
|
||||
#define ZGEMM_DEFAULT_Q 128
|
||||
#define ZGEMM_DEFAULT_Q zgemm_q
|
||||
|
||||
#define SYMV_P 16
|
||||
#endif
|
||||
|
|
Loading…
Reference in New Issue