Compare commits

...

30 Commits

Author SHA1 Message Date
Martin Kroeker
e8a68ef261 Merge pull request #1702 from xianyi/develop
Merge develop for 0.3.2
2018-07-30 07:25:01 +02:00
Martin Kroeker
64826a0d7d Merge branch 'release-0.3.0' into develop 2018-07-29 22:37:09 +02:00
Martin Kroeker
25f2d25cfe Merge pull request #1697 from martin-frbg/issue1696
Do not treat WIndows UWB builds as cross-compiling
2018-07-25 19:55:29 +02:00
Martin Kroeker
73131fa30a Do not treat WIndows UWB builds as cross-compiling 2018-07-24 17:46:33 +02:00
Martin Kroeker
66fcdd5be8 Merge pull request #1695 from martin-frbg/issue1692
Unset memory table entry, not just the local pointer to it on shutdown
2018-07-22 16:34:09 +02:00
Martin Kroeker
43ac839c16 Unset memory table entry, not just the temporary pointer to it on shutdown
to fix crash with multiple instances of OpenBLAS, #1692
2018-07-22 09:19:19 +02:00
Martin Kroeker
7ba5936ecd Merge pull request #1688 from martin-frbg/issue1673
Temporarily disable special handling of OPENMP thread memory allocation
2018-07-19 19:03:45 +02:00
Martin Kroeker
b14f44d2ad Temporarily disable special handling of OPENMP thread memory allocation
for issue #1673
2018-07-19 08:57:56 +02:00
Martin Kroeker
e71d70ba87 Merge pull request #1681 from martin-frbg/issue1671
Add cpu identification via mfpvr call for the BSDs
2018-07-16 22:47:05 +02:00
Martin Kroeker
d671870f5f Merge pull request #1684 from martin-frbg/issue1672
Work around utest failures in the MIPS64 SICORTEX target
2018-07-16 22:46:49 +02:00
Martin Kroeker
4e103c822c typo fix 2018-07-16 12:56:39 +02:00
Martin Kroeker
d2142760e0 Fix precision problem in DSDOT 2018-07-15 17:11:40 +02:00
Martin Kroeker
2fbfc64da8 Use C kernels for default c/zAXPY, xROT, c/zSWAP 2018-07-15 17:09:55 +02:00
Martin Kroeker
8d5b33b6be Add cpu identification via mfpvr call for the BSDs
fixes #1671
2018-07-12 23:39:00 +02:00
Martin Kroeker
36aea5ce2d Merge pull request #1680 from martin-frbg/snprint
Fix wrong redefinitions of snprintf for older MSVC
2018-07-12 14:05:13 +02:00
Martin Kroeker
1309711e24 Fix declaration of snprintf for older MSVC
_snprintf_s takes an additional (size) argument, so is no direct replacement.
(Note that this code is currently unused - the two instances of snprintf here are within ifdef blocks that are not compiled for MSVC)
2018-07-12 11:47:52 +02:00
Martin Kroeker
571e9de2ac Fix definition of snprintf for MSVC
MS _snprintf_s takes an additional argument for the size of the buffer, so is not a direct replacement (utest/ctest.h from which I copied was wrong)
2018-07-12 11:42:25 +02:00
Martin Kroeker
448ed15115 Merge pull request #1678 from martin-frbg/issue1677
Define snprintf for older versions of MSVC
2018-07-12 09:21:34 +02:00
Martin Kroeker
045fb5ea2c Define snprintf for older versions of MSVC
for #1677
2018-07-12 07:30:58 +02:00
Martin Kroeker
4dd70d98d7 Merge pull request #1667 from xianyi/revert-1642-develop
Revert "Rewrite &= -> = and simplify the initial blocking phase."
2018-07-04 08:27:21 +02:00
Martin Kroeker
504310eeb9 Merge pull request #1665 from martin-frbg/cpuid-ryzen2
Add cpuid for AMD Ryzen 2
2018-07-04 08:19:40 +02:00
Martin Kroeker
ea1f39518f Merge pull request #1663 from martin-frbg/issue1641
Double MAX_ALLOCATING_THREADS to fix segfaults with Go and Octave
2018-07-04 08:19:11 +02:00
Martin Kroeker
d0ec4325cf Add cpuid for AMD Ryzen 2 2018-07-03 21:03:24 +02:00
Martin Kroeker
3f73e8b8cf Add cpuid for AMD Ryzen 2
for #1664
2018-07-03 21:01:35 +02:00
Martin Kroeker
a49203b48c Double MAX_ALLOCATING_THREADS to fix segfaults with Go and Octave
for #1641
2018-07-03 17:35:54 +02:00
Martin Kroeker
c6aec89d10 Merge pull request #1657 from martin-frbg/release-0.3.0
Release 0.3.1
2018-07-01 12:03:07 +02:00
Martin Kroeker
e6d7711199 remove dev suffix from version number 2018-07-01 11:59:47 +02:00
Martin Kroeker
7a914347c5 remove dev suffix from version number 2018-07-01 11:58:57 +02:00
Martin Kroeker
3a8f0a6a1f Merge pull request #1656 from xianyi/develop
Update the 0.3 branch from develop
2018-07-01 11:55:21 +02:00
Martin Kroeker
939452ea9d Merge pull request #1570 from xianyi/develop
Update release-0.3.0 branch to match develop
2018-05-23 15:12:20 +02:00
11 changed files with 234 additions and 22 deletions

View File

@@ -6,7 +6,7 @@ cmake_minimum_required(VERSION 2.8.5)
project(OpenBLAS C ASM)
set(OpenBLAS_MAJOR_VERSION 0)
set(OpenBLAS_MINOR_VERSION 3)
set(OpenBLAS_PATCH_VERSION 2.dev)
set(OpenBLAS_PATCH_VERSION 2)
set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}")
# Adhere to GNU filesystem layout conventions

View File

@@ -3,7 +3,7 @@
#
# This library's version
VERSION = 0.3.2.dev
VERSION = 0.3.2
# If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a
# and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library

View File

@@ -85,7 +85,7 @@ if (NOT NOFORTRAN)
endif ()
# Cannot run getarch on target if we are cross-compiling
if (DEFINED CORE AND CMAKE_CROSSCOMPILING)
if (DEFINED CORE AND CMAKE_CROSSCOMPILING AND NOT (${HOST_OS} STREQUAL "WINDOWSSTORE"))
# Write to config as getarch would
# TODO: Set up defines that getarch sets up based on every other target

View File

@@ -142,6 +142,52 @@ int detect(void){
return CPUTYPE_PPC970;
#endif
#if defined(__FreeBSD__) || defined(__OpenBSD__) || defined(__DragonFly__)
int id;
id = __asm __volatile("mfpvr %0" : "=r"(id));
switch ( id >> 16 ) {
case 0x4e: // POWER9
return return CPUTYPE_POWER8;
break;
case 0x4d:
case 0x4b: // POWER8/8E
return CPUTYPE_POWER8;
break;
case 0x4a:
case 0x3f: // POWER7/7E
return CPUTYPE_POWER6;
break;
case 0x3e:
return CPUTYPE_POWER6;
break;
case 0x3a:
return CPUTYPE_POWER5;
break;
case 0x35:
case 0x38: // POWER4 /4+
return CPUTYPE_POWER4;
break;
case 0x40:
case 0x41: // POWER3 /3+
return CPUTYPE_POWER3;
break;
case 0x39:
case 0x3c:
case 0x44:
case 0x45:
return CPUTYPE_PPC970;
break;
case 0x70:
return CPUTYPE_CELL;
break;
case 0x8003:
return CPUTYPE_PPCG4;
break;
default:
return CPUTYPE_UNKNOWN;
}
#endif
}
void get_architecture(void){

View File

@@ -1452,6 +1452,8 @@ int get_cpuname(void){
switch (model) {
case 1:
// AMD Ryzen
case 8:
// AMD Ryzen2
if(support_avx())
#ifndef NO_AVX2
return CPUTYPE_ZEN;

View File

@@ -607,7 +607,7 @@ static gotoblas_t *get_coretype(void){
}
}
} else if (exfamily == 8) {
if (model == 1) {
if (model == 1 || model == 8) {
if(support_avx())
return &gotoblas_ZEN;
else{

View File

@@ -140,7 +140,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif
#ifndef BUFFERS_PER_THREAD
#ifdef USE_OPENMP
#ifdef USE_OPENMP_UNUSED
#define BUFFERS_PER_THREAD (MAX_CPU_NUMBER * 2 * MAX_PARALLEL_NUMBER)
#else
#define BUFFERS_PER_THREAD NUM_BUFFERS
@@ -363,7 +363,7 @@ int blas_get_cpu_number(void){
#endif
// blas_goto_num = 0;
#ifndef USE_OPENMP
#ifndef USE_OPENMP_UNUSED
blas_goto_num=openblas_num_threads_env();
if (blas_goto_num < 0) blas_goto_num = 0;
@@ -494,10 +494,10 @@ static const int allocation_block_size = BUFFER_SIZE + sizeof(struct alloc_t);
#endif
/* Holds pointers to allocated memory */
#if defined(SMP) && !defined(USE_OPENMP)
#if defined(SMP) && !defined(USE_OPENMP_UNUSED)
/* This is the number of threads than can be spawned by the server, which is the
server plus the number of threads in the thread pool */
# define MAX_ALLOCATING_THREADS MAX_CPU_NUMBER * 2 * MAX_PARALLEL_NUMBER +1
# define MAX_ALLOCATING_THREADS MAX_CPU_NUMBER * 2 * MAX_PARALLEL_NUMBER * 2
static int next_memory_table_pos = 0;
# if defined(HAS_COMPILER_TLS)
/* Use compiler generated thread-local-storage */
@@ -532,7 +532,7 @@ static BLASULONG alloc_lock = 0UL;
/* Returns a pointer to the start of the per-thread memory allocation data */
static __inline struct alloc_t ** get_memory_table() {
#if defined(SMP) && !defined(USE_OPENMP)
#if defined(SMP) && !defined(USE_OPENMP_UNUSED)
# if !defined(HAS_COMPILER_TLS)
# if defined(OS_WINDOWS)
int local_memory_table_pos = (int)::TlsGetValue(local_storage_key);
@@ -1057,7 +1057,7 @@ static volatile int memory_initialized = 0;
/* 2 : Thread */
static void blas_memory_init(){
#if defined(SMP) && !defined(USE_OPENMP)
#if defined(SMP) && !defined(USE_OPENMP_UNUSED)
next_memory_table_pos = 0;
# if !defined(HAS_COMPILER_TLS)
# if defined(OS_WINDOWS)
@@ -1279,7 +1279,7 @@ void blas_shutdown(void){
struct alloc_t *alloc_info = local_memory_table[thread][pos];
if (alloc_info) {
alloc_info->release_func(alloc_info);
alloc_info = (void *)0;
local_memory_table[thread][pos] = (void *)0;
}
}
}

View File

@@ -35,6 +35,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include <string.h>
#if defined(_WIN32) && defined(_MSC_VER)
#if _MSC_VER < 1900
#define snprintf _snprintf
#endif
#endif
static char* openblas_config_str=""
#ifdef USE64BITINT
"USE64BITINT "

View File

@@ -1,3 +1,12 @@
CAXPYKERNEL = ../mips/zaxpy.c
ZAXPYKERNEL = ../mips/zaxpy.c
SROTKERNEL = ../mips/rot.c
DROTKERNEL = ../mips/rot.c
CROTKERNEL = ../mips/zrot.c
ZROTKERNEL = ../mips/zrot.c
CSWAPKERNEL = ../mips/zswap.c
ZSWAPKERNEL = ../mips/zswap.c
ifndef SNRM2KERNEL
SNRM2KERNEL = snrm2.S
endif

View File

@@ -103,35 +103,83 @@
.align 3
.L12:
#ifdef DSDOT
cvt.d.s a1, a1
cvt.d.s b1, b1
madd.d s1, s1, a1, b1
#else
MADD s1, s1, a1, b1
#endif
LD a1, 4 * SIZE(X)
LD b1, 4 * SIZE(Y)
#ifdef DSDOT
cvt.d.s a2, a2
cvt.d.s b2, b2
madd.d s2, s2, a2, b2
#else
MADD s2, s2, a2, b2
#endif
LD a2, 5 * SIZE(X)
LD b2, 5 * SIZE(Y)
#ifdef DSDOT
cvt.d.s a3, a3
cvt.d.s b3, b3
madd.d s1, s1, a3, b3
#else
MADD s1, s1, a3, b3
#endif
LD a3, 6 * SIZE(X)
LD b3, 6 * SIZE(Y)
#ifdef DSDOT
cvt.d.s a4, a4
cvt.d.s b4, b4
madd.d s2, s2, a4, b4
#else
MADD s2, s2, a4, b4
#endif
LD a4, 7 * SIZE(X)
LD b4, 7 * SIZE(Y)
#ifdef DSDOT
cvt.d.s a1, a1
cvt.d.s b1, b1
madd.d s1, s1, a1, b1
#else
MADD s1, s1, a1, b1
#endif
LD a1, 8 * SIZE(X)
LD b1, 8 * SIZE(Y)
#ifdef DSDOT
cvt.d.s a2, a2
cvt.d.s b2, b2
madd.d s2, s2, a2, b2
#else
MADD s2, s2, a2, b2
#endif
LD a2, 9 * SIZE(X)
LD b2, 9 * SIZE(Y)
#ifdef DSDOT
cvt.d.s a3, a3
cvt.d.s b3, b3
madd.d s1, s1, a3, b3
#else
MADD s1, s1, a3, b3
#endif
LD a3, 10 * SIZE(X)
LD b3, 10 * SIZE(Y)
#ifdef DSDOT
cvt.d.s a4, a4
cvt.d.s b4, b4
madd.d s2, s2, a4, b4
#else
MADD s2, s2, a4, b4
#endif
LD a4, 11 * SIZE(X)
LD b4, 11 * SIZE(Y)
@@ -143,29 +191,77 @@
.align 3
.L13:
#ifdef DSDOT
cvt.d.s a1, a1
cvt.d.s b1, b1
madd.d s1, s1, a1, b1
#else
MADD s1, s1, a1, b1
#endif
LD a1, 4 * SIZE(X)
LD b1, 4 * SIZE(Y)
#ifdef DSDOT
cvt.d.s a2, a2
cvt.d.s b2, b2
madd.d s2, s2, a2, b2
#else
MADD s2, s2, a2, b2
#endif
LD a2, 5 * SIZE(X)
LD b2, 5 * SIZE(Y)
#ifdef DSDOT
cvt.d.s a3, a3
cvt.d.s b3, b3
madd.d s1, s1, a3, b3
#else
MADD s1, s1, a3, b3
#endif
LD a3, 6 * SIZE(X)
LD b3, 6 * SIZE(Y)
#ifdef DSDOT
cvt.d.s a4, a4
cvt.d.s b4, b4
madd.d s2, s2, a4, b4
#else
MADD s2, s2, a4, b4
#endif
LD a4, 7 * SIZE(X)
LD b4, 7 * SIZE(Y)
#ifdef DSDOT
cvt.d.s a1, a1
cvt.d.s b1, b1
madd.d s1, s1, a1, b1
#else
MADD s1, s1, a1, b1
#endif
daddiu X, X, 8 * SIZE
#ifdef DSDOT
cvt.d.s a2, a2
cvt.d.s b2, b2
madd.d s2, s2, a2, b2
#else
MADD s2, s2, a2, b2
#endif
daddiu Y, Y, 8 * SIZE
#ifdef DSDOT
cvt.d.s a3, a3
cvt.d.s b3, b3
madd.d s1, s1, a3, b3
#else
MADD s1, s1, a3, b3
#endif
#ifdef DSDOT
cvt.d.s a4, a4
cvt.d.s b4, b4
madd.d s2, s2, a4, b4
#else
MADD s2, s2, a4, b4
#endif
.align 3
.L15:
@@ -179,8 +275,13 @@
LD a1, 0 * SIZE(X)
LD b1, 0 * SIZE(Y)
#ifdef DSDOT
cvt.d.s a1, a1
cvt.d.s b1, b1
madd.d s1, s1, a1, b1
#else
MADD s1, s1, a1, b1
#endif
daddiu I, I, -1
daddiu X, X, SIZE
@@ -225,50 +326,85 @@
LD b1, 0 * SIZE(Y)
dadd Y, Y, INCY
#ifdef DSDOT
cvt.d.s a1, a1
cvt.d.s b1, b1
madd.d s1, s1, a1, b1
#else
MADD s1, s1, a1, b1
#endif
LD a1, 0 * SIZE(X)
dadd X, X, INCX
LD b1, 0 * SIZE(Y)
dadd Y, Y, INCY
#ifdef DSDOT
cvt.d.s a1, a1
cvt.d.s b1, b1
madd.d s2, s2, a1, b1
#else
MADD s2, s2, a1, b1
#endif
LD a1, 0 * SIZE(X)
dadd X, X, INCX
LD b1, 0 * SIZE(Y)
dadd Y, Y, INCY
#ifdef DSDOT
cvt.d.s a1, a1
cvt.d.s b1, b1
madd.d s1, s1, a1, b1
#else
MADD s1, s1, a1, b1
#endif
LD a1, 0 * SIZE(X)
dadd X, X, INCX
LD b1, 0 * SIZE(Y)
dadd Y, Y, INCY
#ifdef DSDOT
cvt.d.s a1, a1
cvt.d.s b1, b1
madd.d s2, s2, a1, b1
#else
MADD s2, s2, a1, b1
#endif
LD a1, 0 * SIZE(X)
dadd X, X, INCX
LD b1, 0 * SIZE(Y)
dadd Y, Y, INCY
#ifdef DSDOT
cvt.d.s a1, a1
cvt.d.s b1, b1
madd.d s1, s1, a1, b1
#else
MADD s1, s1, a1, b1
#endif
LD a1, 0 * SIZE(X)
dadd X, X, INCX
LD b1, 0 * SIZE(Y)
dadd Y, Y, INCY
#ifdef DSDOT
cvt.d.s a1, a1
cvt.d.s b1, b1
madd.d s2, s2, a1, b1
#else
MADD s2, s2, a1, b1
#endif
LD a1, 0 * SIZE(X)
dadd X, X, INCX
LD b1, 0 * SIZE(Y)
dadd Y, Y, INCY
#ifdef DSDOT
cvt.d.s a1, a1
cvt.d.s b1, b1
madd.d s1, s1, a1, b1
#else
MADD s1, s1, a1, b1
#endif
LD a1, 0 * SIZE(X)
dadd X, X, INCX
LD b1, 0 * SIZE(Y)
@@ -277,7 +413,13 @@
daddiu I, I, -1
bgtz I, .L23
#ifdef DSDOT
cvt.d.s a1, a1
cvt.d.s b1, b1
madd.d s2, s2, a1, b1
#else
MADD s2, s2, a1, b1
#endif
.align 3
.L25:
@@ -296,13 +438,20 @@
daddiu I, I, -1
bgtz I, .L26
#ifdef DSDOT
cvt.d.s a1, a1
cvt.d.s b1, b1
madd.d s1, s1, a1, b1
#else
MADD s1, s1, a1, b1
#endif
.align 3
.L999:
ADD s1, s1, s2
#ifdef DSDOT
cvt.d.s s1, s1
add.d s1, s1, s2
#else
ADD s1, s1, s2
#endif
j $31
NOP

View File

@@ -84,7 +84,7 @@ struct ctest {
#endif
#if _MSC_VER < 1900
#define snprintf _snprintf_s
#define snprintf _snprintf
#endif
#ifndef __cplusplus