From c2323dd4d2a65420f77c73f7b55c41ba469a47f8 Mon Sep 17 00:00:00 2001 From: Grazvydas Ignotas Date: Sun, 16 Aug 2015 15:18:42 +0200 Subject: [PATCH 1/7] really fix ARM locking - was writing 0 to lock variable, so was ineffective - only exit loop if both lock was 0 and strex was successful --- common_arm.h | 20 +++++++------------- 1 file changed, 7 insertions(+), 13 deletions(-) diff --git a/common_arm.h b/common_arm.h index 135191057..2dabd4d7f 100644 --- a/common_arm.h +++ b/common_arm.h @@ -59,22 +59,16 @@ static void __inline blas_lock(volatile BLASULONG *address){ while (*address) {YIELDING;}; __asm__ __volatile__( - "1: \n\t" - "ldrex r2, [%1] \n\t" - "mov r2, #0 \n\t" - "strex r3, r2, [%1] \n\t" - "cmp r3, #0 \n\t" - "bne 1b \n\t" - "mov %0 , r3 \n\t" - : "=r"(ret), "=r"(address) - : "1"(address) - : "memory", "r2" , "r3" - - + "ldrex r2, [%1] \n\t" + "strex %0, %2, [%1] \n\t" + "orr %0, r2 \n\t" + : "=&r"(ret) + : "r"(address), "r"(1) + : "memory", "r2" ); } while (ret); - + MB; } From d3e2f0a1af73a6e74258294c911e7f4cb72d8ab5 Mon Sep 17 00:00:00 2001 From: Grazvydas Ignotas Date: Sun, 16 Aug 2015 15:37:02 +0200 Subject: [PATCH 2/7] add missing barriers should fix issue #597 --- driver/others/blas_server.c | 11 ++++++++++- driver/others/memory.c | 3 +++ 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/driver/others/blas_server.c b/driver/others/blas_server.c index b3b1ce7bd..1fd848c6b 100644 --- a/driver/others/blas_server.c +++ b/driver/others/blas_server.c @@ -425,6 +425,10 @@ static int blas_thread_server(void *arg){ main_status[cpu] = MAIN_FINISH; #endif + // arm: make sure all results are written out _before_ + // thread is marked as done and other threads use them + WMB; + thread_status[cpu].queue = (blas_queue_t * volatile) ((long)thread_status[cpu].queue & 0); /* Need a trick */ WMB; @@ -775,7 +779,12 @@ int exec_blas(BLASLONG num, blas_queue_t *queue){ stop = rpcc(); #endif - if ((num > 1) && queue -> next) exec_blas_async_wait(num - 1, queue -> next); + if ((num > 1) && queue -> next) { + exec_blas_async_wait(num - 1, queue -> next); + + // arm: make sure results from other threads are visible + MB; + } #ifdef TIMING_DEBUG fprintf(STDERR, "Thread[0] : %16lu %16lu (%8lu cycles)\n", diff --git a/driver/others/memory.c b/driver/others/memory.c index a562da377..49c57f911 100644 --- a/driver/others/memory.c +++ b/driver/others/memory.c @@ -1153,6 +1153,9 @@ void blas_memory_free(void *free_area){ printf(" Position : %d\n", position); #endif + // arm: ensure all writes are finished before other thread takes this memory + WMB; + memory[position].used = 0; #ifdef DEBUG From e12cf1123e8784ce6fe9d2ac14526331fbe2c555 Mon Sep 17 00:00:00 2001 From: Grazvydas Ignotas Date: Sun, 16 Aug 2015 17:27:25 +0200 Subject: [PATCH 3/7] add fallback rpcc implementation - use on arm, arm64 and any new platform - use faster integer math instead of double - use similar scale as rdtsc so that timeouts work --- common.h | 28 ++++++++++++++++++++++++++++ common_alpha.h | 1 + common_arm.h | 10 ---------- common_arm64.h | 10 ---------- common_ia64.h | 2 ++ common_mips64.h | 1 + common_power.h | 1 + common_sparc.h | 1 + common_x86.h | 1 + common_x86_64.h | 1 + 10 files changed, 36 insertions(+), 20 deletions(-) diff --git a/common.h b/common.h index 320adadcb..5998b5608 100644 --- a/common.h +++ b/common.h @@ -410,7 +410,35 @@ typedef char env_var_t[MAX_PATH]; typedef char* env_var_t; #define readenv(p, n) ((p)=getenv(n)) #endif + +#if !defined(RPCC_DEFINED) && !defined(OS_WINDOWS) +#ifdef _POSIX_MONOTONIC_CLOCK +#if defined(__GNUC_PREREQ) && __GLIBC_PREREQ(2, 17) // don't require -lrt +#define USE_MONOTONIC +#elif defined(OS_ANDROID) +#define USE_MONOTONIC #endif +#endif +/* use similar scale as x86 rdtsc for timeouts to work correctly */ +static inline unsigned long long rpcc(void){ +#ifdef USE_MONOTONIC + struct timespec ts; + clock_gettime(CLOCK_MONOTONIC, &ts); + return (unsigned long long)ts.tv_sec * 1000000000ull + ts.tv_nsec; +#else + struct timeval tv; + gettimeofday(&tv,NULL); + return (unsigned long long)tv.tv_sec * 1000000000ull + tv.tv_usec * 1000; +#endif +} +#define RPCC_DEFINED +#define RPCC64BIT +#endif // !RPCC_DEFINED + +#ifndef RPCC_DEFINED +#error "rpcc() implementation is missing for your platform" +#endif +#endif // !ASSEMBLER #ifdef OS_LINUX #include "common_linux.h" diff --git a/common_alpha.h b/common_alpha.h index 845fb316a..86f58966a 100644 --- a/common_alpha.h +++ b/common_alpha.h @@ -89,6 +89,7 @@ static __inline unsigned int rpcc(void){ return r0; } +#define RPCC_DEFINED #define HALT ldq $0, 0($0) diff --git a/common_arm.h b/common_arm.h index 2dabd4d7f..7e0c02306 100644 --- a/common_arm.h +++ b/common_arm.h @@ -72,16 +72,6 @@ static void __inline blas_lock(volatile BLASULONG *address){ } -static inline unsigned long long rpcc(void){ - unsigned long long ret=0; - double v; - struct timeval tv; - gettimeofday(&tv,NULL); - v=(double) tv.tv_sec + (double) tv.tv_usec * 1e-6; - ret = (unsigned long long) ( v * 1000.0d ); - return ret; -} - static inline int blas_quickdivide(blasint x, blasint y){ return x / y; } diff --git a/common_arm64.h b/common_arm64.h index aa310c5f2..cc08fa75b 100644 --- a/common_arm64.h +++ b/common_arm64.h @@ -71,16 +71,6 @@ static void __inline blas_lock(volatile BLASULONG *address){ } -static inline unsigned long long rpcc(void){ - unsigned long long ret=0; - double v; - struct timeval tv; - gettimeofday(&tv,NULL); - v=(double) tv.tv_sec + (double) tv.tv_usec * 1e-6; - ret = (unsigned long long) ( v * 1000.0d ); - return ret; -} - static inline int blas_quickdivide(blasint x, blasint y){ return x / y; } diff --git a/common_ia64.h b/common_ia64.h index 8e92b5992..d1f210749 100644 --- a/common_ia64.h +++ b/common_ia64.h @@ -75,6 +75,7 @@ static __inline unsigned long rpcc(void) { __asm__ __volatile__ ("mov %0=ar.itc" : "=r"(clocks)); return clocks; } +#define RPCC_DEFINED static __inline unsigned long stmxcsr(void){ @@ -103,6 +104,7 @@ static __inline void blas_lock(volatile unsigned long *address){ static __inline unsigned int rpcc(void) { return __getReg(_IA64_REG_AR_ITC); } +#define RPCC_DEFINED static __inline unsigned int stmxcsr(void) { return __getReg(_IA64_REG_AR_FPSR); diff --git a/common_mips64.h b/common_mips64.h index 7cd86b375..bc1a52fb4 100644 --- a/common_mips64.h +++ b/common_mips64.h @@ -118,6 +118,7 @@ static inline unsigned int rpcc(void){ #endif return ret; } +#define RPCC_DEFINED #if defined(LOONGSON3A) || defined(LOONGSON3B) #ifndef NO_AFFINITY diff --git a/common_power.h b/common_power.h index e9b5cb630..3b9471a17 100644 --- a/common_power.h +++ b/common_power.h @@ -103,6 +103,7 @@ static inline unsigned long rpcc(void){ #endif } +#define RPCC_DEFINED #ifdef __64BIT__ #define RPCC64BIT diff --git a/common_sparc.h b/common_sparc.h index 87ef75276..8a16e3d3a 100644 --- a/common_sparc.h +++ b/common_sparc.h @@ -66,6 +66,7 @@ static __inline unsigned long rpcc(void){ return clocks; }; +#define RPCC_DEFINED #ifdef __64BIT__ #define RPCC64BIT diff --git a/common_x86.h b/common_x86.h index 99a723fd7..9506716ce 100644 --- a/common_x86.h +++ b/common_x86.h @@ -73,6 +73,7 @@ static __inline unsigned long long rpcc(void){ return ((unsigned long long)a + ((unsigned long long)d << 32)); }; +#define RPCC_DEFINED static __inline unsigned long getstackaddr(void){ unsigned long addr; diff --git a/common_x86_64.h b/common_x86_64.h index efb902416..3a02beefb 100644 --- a/common_x86_64.h +++ b/common_x86_64.h @@ -82,6 +82,7 @@ static __inline BLASULONG rpcc(void){ return ((BLASULONG)a + ((BLASULONG)d << 32)); } +#define RPCC_DEFINED #define RPCC64BIT From f2ac1a5cee9eebfaad33194e362fa2c05e2b05d9 Mon Sep 17 00:00:00 2001 From: Grazvydas Ignotas Date: Sun, 16 Aug 2015 18:08:45 +0200 Subject: [PATCH 4/7] set ARMV7 for Cortex-A9 and Cortex-A15 otherwise some macros like YIELDING are not defined correctly --- common_arm.h | 4 ++++ cpuid_arm.c | 2 ++ 2 files changed, 6 insertions(+) diff --git a/common_arm.h b/common_arm.h index 7e0c02306..74b6378dd 100644 --- a/common_arm.h +++ b/common_arm.h @@ -124,4 +124,8 @@ REALNAME: #define MAP_ANONYMOUS MAP_ANON #endif +#if !defined(ARMV5) && !defined(ARMV6) && !defined(ARMV7) && !defined(ARMV8) +#error "you must define ARMV5, ARMV6, ARMV7 or ARMV8" +#endif + #endif diff --git a/cpuid_arm.c b/cpuid_arm.c index 51ba72d70..6485003f3 100644 --- a/cpuid_arm.c +++ b/cpuid_arm.c @@ -192,6 +192,7 @@ void get_cpuconfig(void) { case CPU_CORTEXA9: printf("#define CORTEXA9\n"); + printf("#define ARMV7\n"); printf("#define HAVE_VFP\n"); printf("#define HAVE_VFPV3\n"); if ( get_feature("neon")) printf("#define HAVE_NEON\n"); @@ -207,6 +208,7 @@ void get_cpuconfig(void) case CPU_CORTEXA15: printf("#define CORTEXA15\n"); + printf("#define ARMV7\n"); printf("#define HAVE_VFP\n"); printf("#define HAVE_VFPV3\n"); if ( get_feature("neon")) printf("#define HAVE_NEON\n"); From 6b92204a7ce5faf8dab2301c59aa69a26f6b8a19 Mon Sep 17 00:00:00 2001 From: Grazvydas Ignotas Date: Sun, 16 Aug 2015 18:10:34 +0200 Subject: [PATCH 5/7] add fallback blas_lock implementation to be used on armv5 and new platforms --- common.h | 14 ++++++++++++++ common_alpha.h | 1 + common_arm.h | 4 ++++ common_arm64.h | 1 + common_ia64.h | 2 ++ common_mips64.h | 1 + common_power.h | 1 + common_sparc.h | 1 + common_x86.h | 1 + common_x86_64.h | 1 + 10 files changed, 27 insertions(+) diff --git a/common.h b/common.h index 5998b5608..6073f037f 100644 --- a/common.h +++ b/common.h @@ -435,9 +435,23 @@ static inline unsigned long long rpcc(void){ #define RPCC64BIT #endif // !RPCC_DEFINED +#if !defined(BLAS_LOCK_DEFINED) && defined(__GNUC__) +static void __inline blas_lock(volatile BLASULONG *address){ + + do { + while (*address) {YIELDING;}; + + } while (!__sync_bool_compare_and_swap(address, 0, 1)); +} +#define BLAS_LOCK_DEFINED +#endif + #ifndef RPCC_DEFINED #error "rpcc() implementation is missing for your platform" #endif +#ifndef BLAS_LOCK_DEFINED +#error "blas_lock() implementation is missing for your platform" +#endif #endif // !ASSEMBLER #ifdef OS_LINUX diff --git a/common_alpha.h b/common_alpha.h index 86f58966a..9739c941d 100644 --- a/common_alpha.h +++ b/common_alpha.h @@ -76,6 +76,7 @@ static void __inline blas_lock(unsigned long *address){ "30:", address); #endif } +#define BLAS_LOCK_DEFINED static __inline unsigned int rpcc(void){ diff --git a/common_arm.h b/common_arm.h index 74b6378dd..84691d766 100644 --- a/common_arm.h +++ b/common_arm.h @@ -51,6 +51,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #ifndef ASSEMBLER +#if defined(ARMV6) || defined(ARMV7) || defined(ARMV8) + static void __inline blas_lock(volatile BLASULONG *address){ int register ret; @@ -71,6 +73,8 @@ static void __inline blas_lock(volatile BLASULONG *address){ MB; } +#define BLAS_LOCK_DEFINED +#endif static inline int blas_quickdivide(blasint x, blasint y){ return x / y; diff --git a/common_arm64.h b/common_arm64.h index cc08fa75b..c4e588d1f 100644 --- a/common_arm64.h +++ b/common_arm64.h @@ -69,6 +69,7 @@ static void __inline blas_lock(volatile BLASULONG *address){ } while (ret); } +#define BLAS_LOCK_DEFINED static inline int blas_quickdivide(blasint x, blasint y){ diff --git a/common_ia64.h b/common_ia64.h index d1f210749..72b75fc4e 100644 --- a/common_ia64.h +++ b/common_ia64.h @@ -68,6 +68,7 @@ static __inline void blas_lock(volatile unsigned long *address){ : "ar.ccv", "memory"); } while (ret); } +#define BLAS_LOCK_DEFINED static __inline unsigned long rpcc(void) { unsigned long clocks; @@ -100,6 +101,7 @@ static __inline void blas_lock(volatile unsigned long *address){ while (*address || _InterlockedCompareExchange((volatile int *) address,1,0)) ; } +#define BLAS_LOCK_DEFINED static __inline unsigned int rpcc(void) { return __getReg(_IA64_REG_AR_ITC); diff --git a/common_mips64.h b/common_mips64.h index bc1a52fb4..f5c0ec7cf 100644 --- a/common_mips64.h +++ b/common_mips64.h @@ -98,6 +98,7 @@ static void INLINE blas_lock(volatile unsigned long *address){ } while (ret); } +#define BLAS_LOCK_DEFINED static inline unsigned int rpcc(void){ unsigned long ret; diff --git a/common_power.h b/common_power.h index 3b9471a17..ab331b04a 100644 --- a/common_power.h +++ b/common_power.h @@ -87,6 +87,7 @@ static void INLINE blas_lock(volatile unsigned long *address){ #endif } while (ret); } +#define BLAS_LOCK_DEFINED static inline unsigned long rpcc(void){ unsigned long ret; diff --git a/common_sparc.h b/common_sparc.h index 8a16e3d3a..f99972db9 100644 --- a/common_sparc.h +++ b/common_sparc.h @@ -58,6 +58,7 @@ static void __inline blas_lock(volatile unsigned long *address){ : "memory"); } while (ret); } +#define BLAS_LOCK_DEFINED static __inline unsigned long rpcc(void){ unsigned long clocks; diff --git a/common_x86.h b/common_x86.h index 9506716ce..6c90432a2 100644 --- a/common_x86.h +++ b/common_x86.h @@ -65,6 +65,7 @@ static void __inline blas_lock(volatile BLASULONG *address){ } while (ret); } +#define BLAS_LOCK_DEFINED static __inline unsigned long long rpcc(void){ unsigned int a, d; diff --git a/common_x86_64.h b/common_x86_64.h index 3a02beefb..4c783b315 100644 --- a/common_x86_64.h +++ b/common_x86_64.h @@ -74,6 +74,7 @@ static void __inline blas_lock(volatile BLASULONG *address){ } while (ret); } +#define BLAS_LOCK_DEFINED static __inline BLASULONG rpcc(void){ BLASULONG a, d; From d38a1ddc7a4ef8c10017ae5b81a447e322721b94 Mon Sep 17 00:00:00 2001 From: Grazvydas Ignotas Date: Sun, 16 Aug 2015 18:13:30 +0200 Subject: [PATCH 6/7] use real armv5 support there is no more requirement for ARMv6 instructions, and VFP on ARMv5 is uncommon --- Makefile.arm | 4 ++-- common_arm.h | 5 ++++- getarch.c | 3 +-- 3 files changed, 7 insertions(+), 5 deletions(-) diff --git a/Makefile.arm b/Makefile.arm index 2f7b33730..272220ca9 100644 --- a/Makefile.arm +++ b/Makefile.arm @@ -26,8 +26,8 @@ endif ifeq ($(CORE), ARMV5) -CCOMMON_OPT += -marm -mfpu=vfp -mfloat-abi=hard -march=armv6 -FCOMMON_OPT += -marm -mfpu=vfp -mfloat-abi=hard -march=armv6 +CCOMMON_OPT += -marm -march=armv5 +FCOMMON_OPT += -marm -march=armv5 endif diff --git a/common_arm.h b/common_arm.h index 84691d766..6bf836835 100644 --- a/common_arm.h +++ b/common_arm.h @@ -80,7 +80,10 @@ static inline int blas_quickdivide(blasint x, blasint y){ return x / y; } -#if defined(DOUBLE) +#if !defined(HAVE_VFP) +/* no FPU, soft float */ +#define GET_IMAGE(res) +#elif defined(DOUBLE) #define GET_IMAGE(res) __asm__ __volatile__("vstr.f64 d1, %0" : "=m"(res) : : "memory") #else #define GET_IMAGE(res) __asm__ __volatile__("vstr.f32 s1, %0" : "=m"(res) : : "memory") diff --git a/getarch.c b/getarch.c index d56a37a7a..89e736a31 100644 --- a/getarch.c +++ b/getarch.c @@ -798,8 +798,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define ARCHCONFIG "-DARMV5 " \ "-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=32 " \ "-DL2_SIZE=512488 -DL2_LINESIZE=32 " \ - "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=4 " \ - "-DHAVE_VFP" + "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=4 " #define LIBNAME "armv5" #define CORENAME "ARMV5" #else From 3efeaed0d867c9d54701e9351de44e747cd21578 Mon Sep 17 00:00:00 2001 From: Grazvydas Ignotas Date: Sun, 16 Aug 2015 20:11:13 +0200 Subject: [PATCH 7/7] correct a minor mistake --- common.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/common.h b/common.h index 6073f037f..a607c888b 100644 --- a/common.h +++ b/common.h @@ -413,7 +413,7 @@ typedef char* env_var_t; #if !defined(RPCC_DEFINED) && !defined(OS_WINDOWS) #ifdef _POSIX_MONOTONIC_CLOCK -#if defined(__GNUC_PREREQ) && __GLIBC_PREREQ(2, 17) // don't require -lrt +#if defined(__GLIBC_PREREQ) && __GLIBC_PREREQ(2, 17) // don't require -lrt #define USE_MONOTONIC #elif defined(OS_ANDROID) #define USE_MONOTONIC