/*
 * Read the CPU model string from /proc/cpuinfo.
 *
 * The first line that contains "model name" and a ':' separator is used; the
 * text after the ':' is copied into model_name with leading blanks and the
 * trailing line terminator stripped.
 *
 * model_name  Caller-provided buffer of at least 128 bytes. It is always left
 *             NUL-terminated: it holds an empty string when nothing was
 *             found, so it is safe to search even if the return value is
 *             ignored. Longer values are truncated to 127 characters.
 *
 * Returns 1 if a model name was stored, 0 otherwise (NULL buffer, file not
 * readable, or no usable "model name" line).
 *
 * NOTE(review): /proc/cpuinfo is re-read on every call; callers on hot paths
 * may want to cache the result.
 */
static inline int get_cpu_model(char *model_name) {
  enum { MODEL_NAME_CAPACITY = 128 };
  char line[1024];
  FILE *cpuinfo_file;
  int found = 0;

  if (model_name == NULL) {
    return 0;
  }
  /* Guarantee a valid (empty) string on every failure path below. */
  model_name[0] = '\0';

  cpuinfo_file = fopen("/proc/cpuinfo", "r");
  if (!cpuinfo_file) {
    return 0;
  }

  while (!found && fgets(line, sizeof(line), cpuinfo_file)) {
    char *value;
    size_t length;

    if (!strstr(line, "model name")) {
      continue;
    }

    /* strchr rather than strtok: strtok keeps hidden static state (unsafe if
     * several threads call this at once) and returns NULL when the separator
     * is missing. */
    value = strchr(line, ':');
    if (value == NULL) {
      continue;
    }
    value++;
    while (*value == ' ' || *value == '\t') {
      value++;
    }

    /* Trim the line terminator by length so an empty value cannot step
     * before the start of the buffer. */
    length = strlen(value);
    while (length > 0 &&
           (value[length - 1] == '\n' || value[length - 1] == '\r')) {
      length--;
    }

    /* Bounded copy: the source line can be far longer than the destination. */
    if (length > (size_t)(MODEL_NAME_CAPACITY - 1)) {
      length = (size_t)(MODEL_NAME_CAPACITY - 1);
    }
    memcpy(model_name, value, length);
    model_name[length] = '\0';
    found = 1;
  }

  fclose(cpuinfo_file);
  return found;
}
#if defined(ARCH_LOONGARCH64)
/*
 * Return the L3 cache size in MB, derived from CPUCFG configuration word 0x14.
 *
 * The size in bytes is ((bits 15:0) + 1) * 2^(bits 23:16) * 2^(bits 30:24);
 * the result is that value divided by 2^20, truncated toward zero.
 * NOTE(review): the fields are presumably ways-1, log2(sets) and
 * log2(line size) -- confirm against the LoongArch reference manual.
 *
 * Pure integer arithmetic is used (shifts instead of pow()), so no libm call
 * or double-to-int conversion is involved. Nonsensically large encodings
 * saturate at 0x7fffffff instead of overflowing.
 */
int get_L3_size(void) {
  int ret = 0, id = 0x14;
  unsigned int cfg, log2_scale, shift;
  unsigned long long ways, megabytes;

  __asm__ volatile (
    "cpucfg %[ret], %[id]"
    : [ret]"=r"(ret)
    : [id]"r"(id)
    : "memory"
  );

  cfg = (unsigned int)ret;
  ways = (unsigned long long)(cfg & 0xffff) + 1;
  log2_scale = ((cfg >> 16) & 0xff) + ((cfg >> 24) & 0x7f);

  /* bytes = ways << log2_scale; MB = bytes >> 20. */
  if (log2_scale < 20) {
    return (int)(ways >> (20 - log2_scale));
  }

  shift = log2_scale - 20;
  if (shift >= 31) {
    return 0x7fffffff;
  }
  megabytes = ways << shift;
  return megabytes > 0x7fffffffULL ? 0x7fffffff : (int)megabytes;
}

/*
 * Select the GEMM blocking parameters (p, q, r) for LoongArch64.
 *
 * Only LOONGSON3R5 builds are tuned here; for other targets this function
 * leaves the parameters untouched. The choice depends on two things:
 *   - whether the L3 cache is reported as 32 MB (3C5000 and 3D5000) or not
 *     (3A5000 and 3C5000L);
 *   - whether the library runs single-threaded (always the case without SMP,
 *     otherwise when blas_num_threads == 1).
 */
void blas_set_parameter(void){
#if defined(LOONGSON3R5)
  const int big_l3 = (get_L3_size() == 32); /* 3C5000 and 3D5000 */
#ifdef SMP
  const int single_thread = (blas_num_threads == 1);
#else
  const int single_thread = 1;
#endif

  /* Identical in every configuration. */
  sgemm_p = 256;
  sgemm_q = 384;
  dgemm_p = 112;
  cgemm_p = 128;
  cgemm_q = 256;
  zgemm_p = 128;
  zgemm_q = 128;

  /* Depends on the cache size only. */
  dgemm_q = big_l3 ? 289 : 300;

  /* The r values depend on both cache size and threading mode. */
  if (single_thread) {
    sgemm_r = big_l3 ? 8192 : 4096;
    dgemm_r = big_l3 ? 4096 : 3024;
    cgemm_r = big_l3 ? 4096 : 2048;
    zgemm_r = big_l3 ? 2048 : 1024;
  } else {
    sgemm_r = big_l3 ? 1024 : 2048;
    dgemm_r = big_l3 ?  342 :  738;
    cgemm_r = big_l3 ?  512 : 1024;
    zgemm_r = big_l3 ?  512 : 1024;
  }
#endif
}
#endif
+#if defined(LOONGSON3R5) && !defined(NO_AFFINITY) + char model_name[128]; + get_cpu_model(model_name); + if ((strstr(model_name, "3C5000") != NULL) || (strstr(model_name, "3D5000") != NULL)) + sa = (XFLOAT *)((BLASLONG)buffer + (WhereAmI() & 0xf) * GEMM_OFFSET_A); + else + sa = (XFLOAT *)((BLASLONG)buffer + GEMM_OFFSET_A); +#else sa = (XFLOAT *)((BLASLONG)buffer +GEMM_OFFSET_A); +#endif sb = (XFLOAT *)(((BLASLONG)sa + ((GEMM_P * GEMM_Q * COMPSIZE * SIZE + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); #ifdef SMP diff --git a/kernel/setparam-ref.c b/kernel/setparam-ref.c index 4c361f155..b573fc61b 100644 --- a/kernel/setparam-ref.c +++ b/kernel/setparam-ref.c @@ -1066,31 +1066,123 @@ static void init_parameter(void) { } #else // (ARCH_MIPS64) #if (ARCH_LOONGARCH64) +static int get_L3_size() { + int ret = 0, id = 0x14; + __asm__ volatile ( + "cpucfg %[ret], %[id]" + : [ret]"=r"(ret) + : [id]"r"(id) + : "memory" + ); + return ((ret & 0xffff) + 1) * pow(2, ((ret >> 16) & 0xff)) * pow(2, ((ret >> 24) & 0x7f)) / 1024 / 1024; // MB +} static void init_parameter(void) { #ifdef BUILD_BFLOAT16 TABLE_NAME.sbgemm_p = SBGEMM_DEFAULT_P; #endif + +#ifdef BUILD_BFLOAT16 + TABLE_NAME.sbgemm_r = SBGEMM_DEFAULT_R; +#endif + +#if defined(LOONGSON3R5) + int L3_size = get_L3_size(); +#ifdef SMP + if(blas_num_threads == 1){ +#endif + //single thread + if (L3_size == 32){ // 3C5000 and 3D5000 + TABLE_NAME.sgemm_p = 256; + TABLE_NAME.sgemm_q = 384; + TABLE_NAME.sgemm_r = 8192; + + TABLE_NAME.dgemm_p = 112; + TABLE_NAME.dgemm_q = 289; + TABLE_NAME.dgemm_r = 4096; + + TABLE_NAME.cgemm_p = 128; + TABLE_NAME.cgemm_q = 256; + TABLE_NAME.cgemm_r = 4096; + + TABLE_NAME.zgemm_p = 128; + TABLE_NAME.zgemm_q = 128; + TABLE_NAME.zgemm_r = 2048; + } else { // 3A5000 and 3C5000L + TABLE_NAME.sgemm_p = 256; + TABLE_NAME.sgemm_q = 384; + TABLE_NAME.sgemm_r = 4096; + + TABLE_NAME.dgemm_p = 112; + TABLE_NAME.dgemm_q = 300; + TABLE_NAME.dgemm_r = 3024; + + TABLE_NAME.cgemm_p = 128; + TABLE_NAME.cgemm_q = 
256; + TABLE_NAME.cgemm_r = 2048; + + TABLE_NAME.zgemm_p = 128; + TABLE_NAME.zgemm_q = 128; + TABLE_NAME.zgemm_r = 1024; + } +#ifdef SMP + }else{ + //multi thread + if (L3_size == 32){ // 3C5000 and 3D5000 + TABLE_NAME.sgemm_p = 256; + TABLE_NAME.sgemm_q = 384; + TABLE_NAME.sgemm_r = 1024; + + TABLE_NAME.dgemm_p = 112; + TABLE_NAME.dgemm_q = 289; + TABLE_NAME.dgemm_r = 342; + + TABLE_NAME.cgemm_p = 128; + TABLE_NAME.cgemm_q = 256; + TABLE_NAME.cgemm_r = 512; + + TABLE_NAME.zgemm_p = 128; + TABLE_NAME.zgemm_q = 128; + TABLE_NAME.zgemm_r = 512; + } else { // 3A5000 and 3C5000L + TABLE_NAME.sgemm_p = 256; + TABLE_NAME.sgemm_q = 384; + TABLE_NAME.sgemm_r = 2048; + + TABLE_NAME.dgemm_p = 112; + TABLE_NAME.dgemm_q = 300; + TABLE_NAME.dgemm_r = 738; + + TABLE_NAME.cgemm_p = 128; + TABLE_NAME.cgemm_q = 256; + TABLE_NAME.cgemm_r = 1024; + + TABLE_NAME.zgemm_p = 128; + TABLE_NAME.zgemm_q = 128; + TABLE_NAME.zgemm_r = 1024; + } + } +#endif +#else TABLE_NAME.sgemm_p = SGEMM_DEFAULT_P; TABLE_NAME.dgemm_p = DGEMM_DEFAULT_P; TABLE_NAME.cgemm_p = CGEMM_DEFAULT_P; TABLE_NAME.zgemm_p = ZGEMM_DEFAULT_P; -#ifdef BUILD_BFLOAT16 - TABLE_NAME.sbgemm_r = SBGEMM_DEFAULT_R; -#endif - TABLE_NAME.sgemm_r = SGEMM_DEFAULT_R; - TABLE_NAME.dgemm_r = DGEMM_DEFAULT_R; - TABLE_NAME.cgemm_r = CGEMM_DEFAULT_R; - TABLE_NAME.zgemm_r = ZGEMM_DEFAULT_R; - -#ifdef BUILD_BFLOAT16 - TABLE_NAME.sbgemm_q = SBGEMM_DEFAULT_Q; -#endif TABLE_NAME.sgemm_q = SGEMM_DEFAULT_Q; TABLE_NAME.dgemm_q = DGEMM_DEFAULT_Q; TABLE_NAME.cgemm_q = CGEMM_DEFAULT_Q; TABLE_NAME.zgemm_q = ZGEMM_DEFAULT_Q; + + TABLE_NAME.sgemm_r = SGEMM_DEFAULT_R; + TABLE_NAME.dgemm_r = DGEMM_DEFAULT_R; + TABLE_NAME.cgemm_r = CGEMM_DEFAULT_R; + TABLE_NAME.zgemm_r = ZGEMM_DEFAULT_R; +#endif + +#ifdef BUILD_BFLOAT16 + TABLE_NAME.sbgemm_q = SBGEMM_DEFAULT_Q; +#endif } #else // (ARCH_LOONGARCH64) #if (ARCH_POWER) diff --git a/param.h b/param.h index fef3a0991..9c85a3aee 100644 --- a/param.h +++ b/param.h @@ -2836,7 +2836,7 @@ USE OF THIS SOFTWARE, EVEN IF 
ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define SNUMOPT 2 #define DNUMOPT 2 -#define GEMM_DEFAULT_OFFSET_A 0 +#define GEMM_DEFAULT_OFFSET_A 0x20000 #define GEMM_DEFAULT_OFFSET_B 0 #define GEMM_DEFAULT_ALIGN 0x0ffffUL @@ -2866,20 +2866,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define QGEMM_DEFAULT_UNROLL_M 2 #define XGEMM_DEFAULT_UNROLL_M 1 -#define SGEMM_DEFAULT_P 256 -#define DGEMM_DEFAULT_P 32 +#define SGEMM_DEFAULT_P sgemm_p +#define DGEMM_DEFAULT_P dgemm_p #define CGEMM_DEFAULT_P 128 -#define ZGEMM_DEFAULT_P 128 +#define ZGEMM_DEFAULT_P zgemm_p -#define SGEMM_DEFAULT_R 1024 -#define DGEMM_DEFAULT_R 858 +#define SGEMM_DEFAULT_R sgemm_r +#define DGEMM_DEFAULT_R dgemm_r #define CGEMM_DEFAULT_R 4096 -#define ZGEMM_DEFAULT_R 4096 +#define ZGEMM_DEFAULT_R zgemm_r -#define SGEMM_DEFAULT_Q 256 -#define DGEMM_DEFAULT_Q 152 +#define SGEMM_DEFAULT_Q sgemm_q +#define DGEMM_DEFAULT_Q dgemm_q #define CGEMM_DEFAULT_Q 128 -#define ZGEMM_DEFAULT_Q 128 +#define ZGEMM_DEFAULT_Q zgemm_q #define SYMV_P 16 #endif