From 84453b924fe7695029cad974dfe0cf7bf6ffe0f6 Mon Sep 17 00:00:00 2001 From: "Kai T. Ohlhus" Date: Thu, 22 Sep 2022 00:20:40 +0900 Subject: [PATCH 1/2] Support CONSISTENT_FPCSR on AARCH64 --- driver/others/blas_server.c | 8 ++++++++ driver/others/blas_server_omp.c | 8 ++++++++ 2 files changed, 16 insertions(+) diff --git a/driver/others/blas_server.c b/driver/others/blas_server.c index 9cfd825ec..051513f27 100644 --- a/driver/others/blas_server.c +++ b/driver/others/blas_server.c @@ -470,9 +470,13 @@ blas_queue_t *tscq; #endif #ifdef CONSISTENT_FPCSR +#ifdef __aarch64__ + __asm__ __volatile__ ("msr fpcr, %0" : : "r" (queue -> sse_mode)); +#else __asm__ __volatile__ ("ldmxcsr %0" : : "m" (queue -> sse_mode)); __asm__ __volatile__ ("fldcw %0" : : "m" (queue -> x87_mode)); #endif +#endif #ifdef MONITOR main_status[cpu] = MAIN_RUNNING1; @@ -746,9 +750,13 @@ int exec_blas_async(BLASLONG pos, blas_queue_t *queue){ queue -> position = pos; #ifdef CONSISTENT_FPCSR +#ifdef __aarch64__ + __asm__ __volatile__ ("mrs %0, fpcr" : "=r" (queue -> sse_mode)); +#else __asm__ __volatile__ ("fnstcw %0" : "=m" (queue -> x87_mode)); __asm__ __volatile__ ("stmxcsr %0" : "=m" (queue -> sse_mode)); #endif +#endif #if defined(OS_LINUX) && !defined(NO_AFFINITY) && !defined(PARAMTEST) diff --git a/driver/others/blas_server_omp.c b/driver/others/blas_server_omp.c index c158f92ee..e06ab8404 100644 --- a/driver/others/blas_server_omp.c +++ b/driver/others/blas_server_omp.c @@ -284,8 +284,12 @@ static void exec_threads(blas_queue_t *queue, int buf_index){ sb = queue -> sb; #ifdef CONSISTENT_FPCSR +#ifdef __aarch64__ + __asm__ __volatile__ ("msr fpcr, %0" : : "r" (queue -> sse_mode)); +#else __asm__ __volatile__ ("ldmxcsr %0" : : "m" (queue -> sse_mode)); __asm__ __volatile__ ("fldcw %0" : : "m" (queue -> x87_mode)); +#endif #endif if ((sa == NULL) && (sb == NULL) && ((queue -> mode & BLAS_PTHREAD) == 0)) { @@ -383,8 +387,12 @@ int exec_blas(BLASLONG num, blas_queue_t *queue){ #ifdef CONSISTENT_FPCSR for (i = 0; i < num; i ++) { +#ifdef __aarch64__ + __asm__ __volatile__ ("mrs %0, fpcr" : "=r" (queue[i].sse_mode)); +#else __asm__ __volatile__ ("fnstcw %0" : "=m" (queue[i].x87_mode)); __asm__ __volatile__ ("stmxcsr %0" : "=m" (queue[i].sse_mode)); +#endif } #endif From c2892f0e31d41f5e8d6c1324c6592459c19b4c59 Mon Sep 17 00:00:00 2001 From: "Kai T. Ohlhus" Date: Thu, 22 Sep 2022 00:25:13 +0900 Subject: [PATCH 2/2] Makefile.rule: update CONSISTENT_FPCSR documentation --- Makefile.rule | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile.rule b/Makefile.rule index 359672359..a0ad90a68 100644 --- a/Makefile.rule +++ b/Makefile.rule @@ -207,7 +207,7 @@ NO_AFFINITY = 1 # to the user space. If bigphysarea is enabled, it will use it. # DEVICEDRIVER_ALLOCATION = 1 -# If you need to synchronize FP CSR between threads (for x86/x86_64 only). +# If you need to synchronize FP CSR between threads (for x86/x86_64 and aarch64 only). # CONSISTENT_FPCSR = 1 # If any gemm argument m, n or k is less or equal this threshold, gemm will be execute