Merge branch 'loongson3a' of github.com:xianyi/OpenBLAS into loongson3a

This commit is contained in:
traz 2011-06-24 09:28:12 +00:00
commit 68532fa9ec
40 changed files with 609 additions and 68 deletions

5
.gitignore vendored
View File

@ -1,8 +1,13 @@
*.obj
*.lib
*.dll
*.def
*.o
lapack-3.1.1
lapack-3.1.1.tgz
*.so
*.a
.svn
*~
config.h
Makefile.conf

View File

@ -1,13 +1,40 @@
OpenBLAS ChangeLog
====================================================================
Version 0.1 alpha2(in development)
Version 0.1 alpha2
23-Jun-2011
common:
*
* Fixed blasint undefined bug in <cblas.h> file. Other software
could include this header successfully(Refs issue #13 on github)
* Fixed the SEGFAULT bug on 64 cores. On SMP server, the number
of CPUs or cores should be less than or equal to 64.(Refs issue #14
on github)
* Support "void goto_set_num_threads(int num_threads)" and "void
openblas_set_num_threads(int num_threads)" when USE_OPENMP=1
* Added extern "C" to support C++. Thanks to Tasio for the patch (Refs
issue #21 on github)
* Provided an error message when the arch is not supported.(Refs
issue #19 on github)
* Fixed issue #23. Fixed a bug of f_check script about generating link flags.
* Added openblas_set_num_threads for Fortran.
* Fixed #25 a wrong result of rotmg.
* Fixed a bug about detecting underscore prefix in c_check.
* Print the wall time (cycles) when FUNCTION_PROFILE is enabled
* Fixed #35 a build bug with NO_LAPACK=1 & DYNAMIC_ARCH=1
* Added install target. You can use "make install". (Refs #20)
x86/x86_64:
*
* Fixed #28 a wrong result of dsdot on x86_64.
* Fixed #32 a SEGFAULT bug of zdotc with gcc-4.6.
* Fixed #33 ztrmm bug on Nehalem.
* Worked around #27, the low-performance axpy issue with small input sizes & multithreading.
MIPS64:
*
* Fixed #28 a wrong result of dsdot on Loongson3A/MIPS64.
* Optimized single/double precision BLAS Level3 on Loongson3A/MIPS64. (Refs #2)
* Optimized single/double precision axpy function on Loongson3A/MIPS64. (Refs #3)
====================================================================
Version 0.1 alpha1
20-Mar-2011

View File

@ -15,6 +15,10 @@ ifdef SANITY_CHECK
BLASDIRS += reference
endif
ifndef PREFIX
PREFIX = /opt/OpenBLAS
endif
SUBDIRS = $(BLASDIRS)
ifneq ($(NO_LAPACK), 1)
SUBDIRS += lapack
@ -22,8 +26,8 @@ endif
SUBDIRS_ALL = $(SUBDIRS) test ctest utest exports benchmark ../laswp ../bench
.PHONY : all libs netlib test ctest shared
.NOTPARALLEL : all libs prof lapack-test
.PHONY : all libs netlib test ctest shared install
.NOTPARALLEL : all libs prof lapack-test install
all :: libs netlib tests shared
@echo
@ -70,7 +74,7 @@ ifeq ($(OSNAME), Darwin)
endif
ifeq ($(OSNAME), WINNT)
$(MAKE) -C exports dll
# -ln -fs $(LIBDLLNAME) libopenblas.dll
-ln -fs $(LIBDLLNAME) libopenblas.dll
endif
ifeq ($(OSNAME), CYGWIN_NT)
$(MAKE) -C exports dll
@ -96,18 +100,26 @@ endif
endif
libs :
ifeq ($(CORE), UNKOWN)
$(error OpenBLAS: Detecting CPU failed. Please set TARGET explicitly, e.g. make TARGET=your_cpu_target. Please read README for the detail.)
endif
-ln -fs $(LIBNAME) libopenblas.$(LIBSUFFIX)
for d in $(SUBDIRS) ; \
do if test -d $$d; then \
$(MAKE) -C $$d $(@F) || exit 1 ; \
fi; \
done
#Save the config files for installation
cp Makefile.conf Makefile.conf_last
cp config.h config_last.h
ifdef DYNAMIC_ARCH
$(MAKE) -C kernel commonlibs || exit 1
for d in $(DYNAMIC_CORE) ; \
do $(MAKE) GOTOBLAS_MAKEFILE= -C kernel TARGET_CORE=$$d kernel || exit 1 ;\
done
echo DYNAMIC_ARCH=1 >> Makefile.conf_last
endif
touch lib.grd
prof : prof_blas prof_lapack
@ -227,19 +239,23 @@ lapack-test :
dummy :
install :
$(MAKE) -f Makefile.install install
clean ::
@for d in $(SUBDIRS_ALL) ; \
do if test -d $$d; then \
$(MAKE) -C $$d $(@F) || exit 1 ; \
fi; \
done
ifdef DYNAMIC_ARCH
#ifdef DYNAMIC_ARCH
@$(MAKE) -C kernel clean
endif
#endif
@rm -f *.$(LIBSUFFIX) *.so *~ *.exe getarch getarch_2nd *.dll *.lib *.$(SUFFIX) *.dwf libopenblas.$(LIBSUFFIX) libopenblas_p.$(LIBSUFFIX) *.lnk myconfig.h
@rm -f Makefile.conf config.h Makefile_kernel.conf config_kernel.h st* *.dylib
@if test -d lapack-3.1.1; then \
echo deleting lapack-3.1.1; \
rm -rf lapack-3.1.1 ;\
fi
@rm -f *.grd Makefile.conf_last config_last.h
@echo Done.

65
Makefile.install Normal file
View File

@ -0,0 +1,65 @@
# Installation makefile: copies the generated headers and the libraries
# produced by a previous "make" into $(PREFIX).

TOPDIR	= .
# Marks this as a nested invocation.
# NOTE(review): presumably this lets Makefile.system skip its once-only
# detection step (it tests "ifndef GOTOBLAS_MAKEFILE") -- confirm.
export GOTOBLAS_MAKEFILE = 1
# Configuration saved by the last build.  A missing file is not an error
# here; a missing build is reported by the lib.grd rule below.
-include $(TOPDIR)/Makefile.conf_last
include ./Makefile.system

# Fall back to the documented default when no installation directory was
# supplied, so that an empty PREFIX can never send the files to "/".
ifndef PREFIX
PREFIX = /opt/OpenBLAS
endif

.PHONY : install
.NOTPARALLEL : install

# lib.grd is the stamp file touched at the end of a completed build.  When it
# does not exist there is nothing to install, so stop with an explanation.
lib.grd :
	$(error OpenBLAS: Please run "make" firstly)

install : lib.grd
# Stop immediately if the destination directory cannot be created.
	@mkdir -p $(PREFIX)

# openblas_config.h = include guard + saved config.h + version + template.
	@echo Generating openblas_config.h in $(PREFIX)
	@echo \#ifndef OPENBLAS_CONFIG_H > $(PREFIX)/openblas_config.h
	@echo \#define OPENBLAS_CONFIG_H >> $(PREFIX)/openblas_config.h
	@cat config_last.h >> $(PREFIX)/openblas_config.h
	@echo \#define VERSION \" OpenBLAS $(VERSION) \" >> $(PREFIX)/openblas_config.h
	@cat openblas_config_template.h >> $(PREFIX)/openblas_config.h
	@echo \#endif >> $(PREFIX)/openblas_config.h

# f77blas.h = include guard + openblas_config.h + Fortran interface prototypes.
	@echo Generating f77blas.h in $(PREFIX)
	@echo \#ifndef OPENBLAS_F77BLAS_H > $(PREFIX)/f77blas.h
	@echo \#define OPENBLAS_F77BLAS_H >> $(PREFIX)/f77blas.h
	@echo \#include \"openblas_config.h\" >> $(PREFIX)/f77blas.h
	@cat common_interface.h >> $(PREFIX)/f77blas.h
	@echo \#endif >> $(PREFIX)/f77blas.h

# cblas.h: the installed copy includes openblas_config.h instead of common.h.
	@echo Generating cblas.h in $(PREFIX)
	@sed 's/common/openblas_config/g' cblas.h > $(PREFIX)/cblas.h

# Static library.  The link target is a bare file name (resolved relative to
# the directory holding the link), so the link stays valid when PREFIX is a
# relative path or when the installed tree is moved afterwards.
	@echo Copy the static library to $(PREFIX)
	@cp $(LIBNAME) $(PREFIX)
	@-ln -fs $(LIBNAME) $(PREFIX)/libopenblas.$(LIBSUFFIX)

# Shared library.  Errors are ignored on purpose: the shared library may not
# have been built on this platform/configuration.
	@echo Copy the shared library to $(PREFIX)
ifneq ($(filter $(OSNAME), Linux FreeBSD NetBSD),)
	-cp $(LIBSONAME) $(PREFIX)
	-ln -fs $(LIBSONAME) $(PREFIX)/libopenblas.so
endif
ifeq ($(OSNAME), Darwin)
	-cp $(LIBDYNNAME) $(PREFIX)
	-ln -fs $(LIBDYNNAME) $(PREFIX)/libopenblas.dylib
endif
ifneq ($(filter $(OSNAME), WINNT CYGWIN_NT),)
	-cp $(LIBDLLNAME) $(PREFIX)
	-ln -fs $(LIBDLLNAME) $(PREFIX)/libopenblas.dll
endif

	@echo Install OK!

View File

@ -91,6 +91,9 @@ VERSION = 0.1alpha2
# SANITY_CHECK to compare the result with reference BLAS.
# UTEST_CHECK = 1
# The installation directory.
# PREFIX = /opt/OpenBLAS
# Common Optimization Flag; -O2 is enough.
# DEBUG = 1

View File

@ -30,6 +30,10 @@ ifdef TARGET
GETARCH_FLAGS += -DFORCE_$(TARGET)
endif
ifdef INTERFACE64
GETARCH_FLAGS += -DUSE64BITINT
endif
# This operation is expensive, so execution should be once.
ifndef GOTOBLAS_MAKEFILE
export GOTOBLAS_MAKEFILE = 1
@ -185,7 +189,7 @@ ifeq ($(C_COMPILER), INTEL)
CCOMMON_OPT += -wd981
endif
ifdef USE_OPENMP
ifeq ($(USE_OPENMP), 1)
ifeq ($(C_COMPILER), GCC)
CCOMMON_OPT += -fopenmp
endif
@ -489,7 +493,8 @@ endif
ifdef BINARY64
ifdef INTERFACE64
CCOMMON_OPT += -DUSE64BITINT
CCOMMON_OPT +=
#-DUSE64BITINT
endif
endif
@ -510,6 +515,10 @@ ifeq ($(DYNAMIC_ARCH), 1)
CCOMMON_OPT += -DDYNAMIC_ARCH
endif
ifeq ($(NO_LAPACK), 1)
CCOMMON_OPT += -DNO_LAPACK
endif
ifdef SMP
CCOMMON_OPT += -DSMP_SERVER

30
README
View File

@ -8,7 +8,9 @@ Download from project homepage. http://xianyi.github.com/OpenBLAS/
Or,
check out codes from git://github.com/xianyi/OpenBLAS.git
1)Normal compile
Please read GotoBLAS_02QuickInstall.txt or type "make"
(a) type "make" to detect the CPU automatically.
or
(b) type "make TARGET=xxx" to set target CPU, e.g. "make TARGET=NEHALEM". The full target list is in file TargetList.txt.
2)Cross compile
Please set CC and FC to the cross toolchain compilers. Then, set HOSTCC to your host C compiler. Finally, set TARGET explicitly.
@ -20,6 +22,11 @@ make BINARY=64 CC=mips64el-unknown-linux-gnu-gcc FC=mips64el-unknown-linux-gnu-g
3)Debug version
make DEBUG=1
4)Install to the directory (Optional)
e.g.
make install PREFIX=your_installation_directory
The default directory is /opt/OpenBLAS
3.Support CPU & OS
Please read GotoBLAS_01Readme.txt
@ -39,13 +46,17 @@ export GOTO_NUM_THREADS=4
or
export OMP_NUM_THREADS=4
The priorities are OPENBLAS_NUM_THREAD > GOTO_NUM_THREADS > OMP_NUM_THREADS.
The priorities are OPENBLAS_NUM_THREADS > GOTO_NUM_THREADS > OMP_NUM_THREADS.
If you compile this lib with USE_OPENMP=1, you should only set OMP_NUM_THREADS environment variable.
4.2 Set the number of threads with calling functions. for example,
void goto_set_num_threads(int num_threads);
or
void openblas_set_num_threads(int num_threads);
If you compile this lib with USE_OPENMP=1, you should use the above functions, too.
5.Report Bugs
Please open an issue at https://github.com/xianyi/OpenBLAS/issues
@ -56,4 +67,17 @@ Optimization on ICT Loongson 3A CPU
OpenBLAS users mailing list: http://list.rdcps.ac.cn/mailman/listinfo/openblas
8.ChangeLog
Please see Changelog.txt to obtain the differences between GotoBLAS2 1.13 BSD version.
Please see Changelog.txt to obtain the differences between GotoBLAS2 1.13 BSD version.
9.Known Issues
* The number of CPUs/Cores should be less than or equal to 8*sizeof(unsigned long). On 64 bits, the limit
is 64. On 32 bits, it is 32.
* This library is not compatible with EKOPath Compiler Suite 4.0.10 (http://www.pathscale.com/ekopath-compiler-suite). However, Path64 (https://github.com/path64/compiler) could compile the codes successfully.
10. Specification of Git Branches
We used the git branching model in this article (http://nvie.com/posts/a-successful-git-branching-model/).
Now, there are 4 branches in github.com.
* The master branch. This is the main branch, which reflects a production-ready state.
* The develop branch. This is the main branch that reflects the latest delivered development changes for the next release.
* The loongson3a branch. This is a feature branch. We develop Loongson3A codes on this branch. We will merge this feature to develop branch in future.
* The gh-pages branch. This is for web pages

57
TargetList.txt Normal file
View File

@ -0,0 +1,57 @@
Force Target Examples:
make TARGET=NEHALEM
make TARGET=LOONGSON3A BINARY=64
make TARGET=ISTANBUL
Supported List:
1.X86/X86_64
a)Intel CPU:
P2
COPPERMINE
KATMAI
NORTHWOOD
PRESCOTT
BANIAS
YONAH
CORE2
PENRYN
DUNNINGTON
NEHALEM
ATOM
b)AMD CPU:
ATHLON
OPTERON
OPTERON_SSE3
BARCELONA
SHANGHAI
ISTANBUL
c)VIA CPU:
SSE_GENERIC
VIAC3
NANO
2.Power CPU:
POWER4
POWER5
POWER6
PPCG4
PPC970
PPC970MP
PPC440
PPC440FP2
CELL
3.MIPS64 CPU:
SICORTEX
LOONGSON3A
4.IA64 CPU:
ITANIUM2
5.SPARC CPU:
SPARC
SPARCV7

View File

@ -149,7 +149,7 @@ $binformat = bin64 if ($data =~ /BINARY_64/);
$data = `$compiler_name -S ctest1.c && grep globl ctest1.s | head -n 1 && rm -f ctest1.s`;
$data =~ /globl\ ([_\.]*)(.*)/;
$data =~ /globl\s([_\.]*)(.*)/;
$need_fu = $1;

14
cblas.h
View File

@ -1,6 +1,14 @@
#ifndef CBLAS_H
#define CBLAS_H
#ifdef __cplusplus
extern "C" {
/* Assume C declarations for C++ */
#endif /* __cplusplus */
#include <stddef.h>
#include "common.h"
#define CBLAS_INDEX size_t
enum CBLAS_ORDER {CblasRowMajor=101, CblasColMajor=102};
@ -270,4 +278,10 @@ void cblas_zher2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANS
double *alpha, double *A, blasint lda, double *B, blasint ldb, double beta, double *C, blasint ldc);
void cblas_xerbla(blasint p, char *rout, char *form, ...);
#ifdef __cplusplus
}
#endif /* __cplusplus */
#endif

View File

@ -39,6 +39,11 @@
#ifndef COMMON_H
#define COMMON_H
#ifdef __cplusplus
extern "C" {
/* Assume C declarations for C++ */
#endif /* __cplusplus */
#ifndef _GNU_SOURCE
#define _GNU_SOURCE
#endif
@ -607,4 +612,9 @@ extern int gotoblas_profile;
#define PRINT_DEBUG_NAME if (readenv("GOTO_DEBUG")) fprintf(stderr, "GotoBLAS : %s\n", CHAR_NAME)
#endif
#ifdef __cplusplus
}
#endif /* __cplusplus */
#endif

View File

@ -60,4 +60,8 @@ float _Complex BLASFUNC_REF(cdotc) (blasint *, float *, blasint *, float *,
double _Complex BLASFUNC_REF(zdotu) (blasint *, double *, blasint *, double *, blasint *);
double _Complex BLASFUNC_REF(zdotc) (blasint *, double *, blasint *, double *, blasint *);
void BLASFUNC_REF(drotmg)(double *, double *, double *, double *, double *);
double BLASFUNC_REF(dsdot)(blasint *, float *, blasint *, float *, blasint*);
#endif

View File

@ -1302,24 +1302,25 @@ int get_coretype(void){
case 13:
return CORE_DUNNINGTON;
}
break;
case 2:
switch (model) {
case 5:
//Intel Core (Clarkdale) / Core (Arrandale)
// Pentium (Clarkdale) / Pentium Mobile (Arrandale)
// Xeon (Clarkdale), 32nm
return CORE_NEHALEM;
case 12:
//Xeon Processor 5600 (Westmere-EP)
return CORE_NEHALEM;
}
break;
break;
case 2:
switch (model) {
case 5:
//Intel Core (Clarkdale) / Core (Arrandale)
// Pentium (Clarkdale) / Pentium Mobile (Arrandale)
// Xeon (Clarkdale), 32nm
return CORE_NEHALEM;
case 12:
//Xeon Processor 5600 (Westmere-EP)
return CORE_NEHALEM;
}
break;
}
break;
case 15:
if (model <= 0x2) return CORE_NORTHWOOD;
return CORE_PRESCOTT;
if (model <= 0x2) return CORE_NORTHWOOD;
else return CORE_PRESCOTT;
}
}

View File

@ -6,7 +6,7 @@ COMMONOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) c_abs.$(SUFFIX) z_abs.$(SUFFIX)
COMMONOBJS += slamch.$(SUFFIX) slamc3.$(SUFFIX) dlamch.$(SUFFIX) dlamc3.$(SUFFIX)
ifdef SMP
COMMONOBJS += blas_server.$(SUFFIX) divtable.$(SUFFIX) blasL1thread.$(SUFFIX)
COMMONOBJS += blas_server.$(SUFFIX) divtable.$(SUFFIX) blasL1thread.$(SUFFIX) openblas_set_num_threads.$(SUFFIX)
ifndef NO_AFFINITY
COMMONOBJS += init.$(SUFFIX)
endif
@ -100,6 +100,9 @@ memory.$(SUFFIX) : $(MEMORY) ../../common.h ../../param.h
blas_server.$(SUFFIX) : $(BLAS_SERVER) ../../common.h ../../common_thread.h ../../param.h
$(CC) $(CFLAGS) -c $< -o $(@F)
openblas_set_num_threads.$(SUFFIX) : openblas_set_num_threads.c
$(CC) $(CFLAGS) -c $< -o $(@F)
blasL1thread.$(SUFFIX) : blas_l1_thread.c ../../common.h ../../common_thread.h
$(CC) $(CFLAGS) -c $< -o $(@F)

View File

@ -38,7 +38,7 @@
#include <stdio.h>
#include <stdlib.h>
#include <sys/mman.h>
//#include <sys/mman.h>
#include "common.h"
#ifndef USE_OPENMP
@ -49,6 +49,26 @@
int blas_server_avail = 0;
/*
 * Set the number of threads used by the OpenMP build.
 *
 * num_threads  requested thread count.  A value below 1 selects the current
 *              blas_num_threads; a value above MAX_CPU_NUMBER is clamped to
 *              MAX_CPU_NUMBER.
 *
 * Side effects: blas_num_threads is raised (never lowered) to the effective
 * count, blas_cpu_number is set to it, and the same value is handed to
 * omp_set_num_threads().
 */
void goto_set_num_threads(int num_threads) {

  int effective = num_threads;

  /* Non-positive request: keep using the current thread count. */
  if (effective < 1) {
    effective = blas_num_threads;
  }

  /* Never go beyond the compile-time maximum. */
  if (effective > MAX_CPU_NUMBER) {
    effective = MAX_CPU_NUMBER;
  }

  /* blas_num_threads only ever grows here. */
  if (blas_num_threads < effective) blas_num_threads = effective;

  blas_cpu_number = effective;
  omp_set_num_threads(blas_cpu_number);
}
/*
 * OpenBLAS-named entry point for setting the thread count; it simply
 * forwards num_threads to goto_set_num_threads().
 */
void openblas_set_num_threads(int num_threads)
{
  goto_set_num_threads(num_threads);
  return;
}
int blas_thread_init(void){
blas_get_cpu_number();

View File

@ -172,13 +172,20 @@ static inline int rcount(unsigned long number) {
return count;
}
/***
Known issue: The number of CPUs/cores should be less than
or equal to 8*sizeof(unsigned long), i.e. the number of bits
in an unsigned long. On 64 bits, the limit is 64. On 32 bits,
it is 32.
***/
static inline unsigned long get_cpumap(int node) {
int infile;
unsigned long affinity;
char name[160];
char cpumap[160];
char *p, *dummy;
int i=0;
sprintf(name, CPUMAP_NAME, node);
infile = open(name, O_RDONLY);
@ -187,13 +194,19 @@ static inline unsigned long get_cpumap(int node) {
if (infile != -1) {
read(infile, name, sizeof(name));
read(infile, cpumap, sizeof(cpumap));
p = cpumap;
while (*p != '\n' && i<160){
if(*p != ',') {
name[i++]=*p;
}
p++;
}
p = name;
while ((*p == '0') || (*p == ',')) p++;
// while ((*p == '0') || (*p == ',')) p++;
affinity = strtol(p, &dummy, 16);
affinity = strtoul(p, &dummy, 16);
close(infile);
}
@ -347,7 +360,13 @@ static void disable_hyperthread(void) {
unsigned long share;
int cpu;
common -> avail = (1UL << common -> num_procs) - 1;
if(common->num_procs > 64){
fprintf(stderr, "\nOpenBLAS Warining : The number of CPU/Cores(%d) is beyond the limit(64). Terminated.\n", common->num_procs);
exit(1);
}else if(common->num_procs == 64){
common -> avail = 0xFFFFFFFFFFFFFFFFUL;
}else
common -> avail = (1UL << common -> num_procs) - 1;
#ifdef DEBUG
fprintf(stderr, "\nAvail CPUs : %04lx.\n", common -> avail);
@ -376,7 +395,13 @@ static void disable_affinity(void) {
fprintf(stderr, "CPU mask : %04lx.\n\n", *(unsigned long *)&cpu_orig_mask[0]);
#endif
lprocmask = (1UL << common -> final_num_procs) - 1;
if(common->final_num_procs > 64){
fprintf(stderr, "\nOpenBLAS Warining : The number of CPU/Cores(%d) is beyond the limit(64). Terminated.\n", common->final_num_procs);
exit(1);
}else if(common->final_num_procs == 64){
lprocmask = 0xFFFFFFFFFFFFFFFFUL;
}else
lprocmask = (1UL << common -> final_num_procs) - 1;
#ifndef USE_OPENMP
lprocmask &= *(unsigned long *)&cpu_orig_mask[0];

View File

@ -0,0 +1,45 @@
/*****************************************************************************
Copyright (c) 2011, Lab of Parallel Software and Computational Science,ICSAS
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the ISCAS nor the names of its contributors may
be used to endorse or promote products derived from this software
without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
**********************************************************************************/
#include "common.h"
/* Only compiled when both SMP_SERVER and OS_LINUX are defined. */
#if defined(SMP_SERVER) && defined(OS_LINUX)

extern void openblas_set_num_threads(int num_threads);

/*
 * By-reference wrapper around openblas_set_num_threads(): the thread count
 * is received through a pointer and forwarded by value.
 * NOTE(review): NAME is not defined in this file -- presumably it is the
 * Fortran-visible symbol name supplied by the build; confirm.
 */
void NAME(int *num_threads)
{
  const int requested = *num_threads;

  openblas_set_num_threads(requested);
}

#endif

View File

@ -74,20 +74,21 @@ void gotoblas_profile_quit(void) {
if (cycles > 0) {
fprintf(stderr, "\n\t====== BLAS Profiling Result =======\n\n");
fprintf(stderr, " Function No. of Calls Time Consumption Efficiency Bytes/cycle\n");
fprintf(stderr, " Function No. of Calls Time Consumption Efficiency Bytes/cycle Wall Time(Cycles)\n");
for (i = 0; i < MAX_PROF_TABLE; i ++) {
if (function_profile_table[i].calls) {
#ifndef OS_WINDOWS
fprintf(stderr, "%-12s : %10Ld %8.2f%% %10.3f%% %8.2f\n",
fprintf(stderr, "%-12s : %10Ld %8.2f%% %10.3f%% %8.2f %Ld\n",
#else
fprintf(stderr, "%-12s : %10lld %8.2f%% %10.3f%% %8.2f\n",
fprintf(stderr, "%-12s : %10lld %8.2f%% %10.3f%% %8.2f %lld\n",
#endif
func_table[i],
function_profile_table[i].calls,
(double)function_profile_table[i].cycles / (double)cycles * 100.,
(double)function_profile_table[i].fops / (double)function_profile_table[i].tcycles * 100.,
(double)function_profile_table[i].area / (double)function_profile_table[i].cycles
(double)function_profile_table[i].area / (double)function_profile_table[i].cycles,
function_profile_table[i].cycles
);
}
}

View File

@ -53,18 +53,19 @@ dyn : $(LIBDYNNAME)
zip : dll
zip $(LIBZIPNAME) $(LIBDLLNAME) $(LIBNAME)
dll : libgoto2.dll
dll : ../$(LIBDLLNAME)
#libgoto2.dll
dll2 : libgoto2_shared.dll
libgoto2.dll : ../$(LIBNAME) libgoto2.def dllinit.$(SUFFIX)
../$(LIBDLLNAME) : ../$(LIBNAME) libgoto2.def dllinit.$(SUFFIX)
$(RANLIB) ../$(LIBNAME)
ifeq ($(BINARY32), 1)
$(DLLWRAP) -o $(@F) --def libgoto2.def \
$(DLLWRAP) -o ../$(LIBDLLNAME) --def libgoto2.def \
--entry _dllinit@12 -s dllinit.$(SUFFIX) --dllname $(@F) ../$(LIBNAME) $(FEXTRALIB)
-lib /machine:i386 /def:libgoto2.def
else
$(DLLWRAP) -o $(@F) --def libgoto2.def \
$(DLLWRAP) -o ../$(LIBDLLNAME) --def libgoto2.def \
--entry _dllinit -s dllinit.$(SUFFIX) --dllname $(@F) ../$(LIBNAME) $(FEXTRALIB)
-lib /machine:X64 /def:libgoto2.def
endif
@ -84,7 +85,7 @@ libgoto_hpl.def : gensymbol
perl ./gensymbol win2khpl $(ARCH) dummy $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) > $(@F)
$(LIBDYNNAME) : ../$(LIBNAME) osx.def
$(PREFIX)gcc $(CFLAGS) -all_load -dynamiclib -o $(LIBDYNNAME) $< -Wl,-exported_symbols_list,osx.def $(FEXTRALIB)
$(PREFIX)gcc $(CFLAGS) -all_load -dynamiclib -o ../$(LIBDYNNAME) $< -Wl,-exported_symbols_list,osx.def $(FEXTRALIB)
symbol.$(SUFFIX) : symbol.S
$(CC) $(CFLAGS) -c -o $(@F) $^

View File

@ -274,6 +274,7 @@ if ($link ne "") {
&& ($flags !~ /kernel32/)
&& ($flags !~ /advapi32/)
&& ($flags !~ /shell32/)
&& ($flags !~ /^\-l$/)
) {
$linker_l .= $flags . " ";
}

View File

@ -604,30 +604,41 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#ifndef POWER
#define POWER
#endif
#define OPENBLAS_SUPPORTED
#endif
#if defined(__i386__) || (__x86_64__)
#include "cpuid_x86.c"
#define OPENBLAS_SUPPORTED
#endif
#ifdef __ia64__
#include "cpuid_ia64.c"
#define OPENBLAS_SUPPORTED
#endif
#ifdef __alpha
#include "cpuid_alpha.c"
#define OPENBLAS_SUPPORTED
#endif
#ifdef POWER
#include "cpuid_power.c"
#define OPENBLAS_SUPPORTED
#endif
#ifdef sparc
#include "cpuid_sparc.c"
#define OPENBLAS_SUPPORTED
#endif
#ifdef __mips__
#include "cpuid_mips.c"
#define OPENBLAS_SUPPORTED
#endif
#ifndef OPENBLAS_SUPPORTED
#error "This arch/CPU is not supported by OpenBLAS."
#endif
#else

View File

@ -30,6 +30,10 @@ int main(int argc, char **argv) {
printf("#define DLOCAL_BUFFER_SIZE\t%ld\n", (DGEMM_DEFAULT_Q * DGEMM_DEFAULT_UNROLL_N * 2 * 1 * sizeof(double)));
printf("#define CLOCAL_BUFFER_SIZE\t%ld\n", (CGEMM_DEFAULT_Q * CGEMM_DEFAULT_UNROLL_N * 4 * 2 * sizeof(float)));
printf("#define ZLOCAL_BUFFER_SIZE\t%ld\n", (ZGEMM_DEFAULT_Q * ZGEMM_DEFAULT_UNROLL_N * 2 * 2 * sizeof(double)));
#ifdef USE64BITINT
printf("#define USE64BITINT\n");
#endif
}
return 0;

View File

@ -85,7 +85,11 @@ void CNAME(blasint n, FLOAT alpha, FLOAT *x, blasint incx, FLOAT *y, blasint inc
//In that case, the threads would be dependent.
if (incx == 0 || incy == 0)
nthreads = 1;
//Temporarily work around the low performance issue with small input size & multithreads.
if (n <= 10000)
nthreads = 1;
if (nthreads == 1) {
#endif

0
interface/create Normal file → Executable file
View File

View File

@ -49,6 +49,7 @@ double NAME(blasint *N, float *x, blasint *INCX, float *y, blasint *INCY){
BLASLONG n = *N;
BLASLONG incx = *INCX;
BLASLONG incy = *INCY;
double ret = 0.0;
PRINT_DEBUG_NAME;
@ -61,19 +62,21 @@ double NAME(blasint *N, float *x, blasint *INCX, float *y, blasint *INCY){
if (incx < 0) x -= (n - 1) * incx;
if (incy < 0) y -= (n - 1) * incy;
return DSDOT_K(n, x, incx, y, incy);
ret=DSDOT_K(n, x, incx, y, incy);
FUNCTION_PROFILE_END(1, n, n);
IDEBUG_END;
return 0;
return ret;
}
#else
double CNAME(blasint n, float *x, blasint incx, float *y, blasint incy){
double ret = 0.0;
PRINT_DEBUG_CNAME;
@ -86,13 +89,13 @@ double CNAME(blasint n, float *x, blasint incx, float *y, blasint incy){
if (incx < 0) x -= (n - 1) * incx;
if (incy < 0) y -= (n - 1) * incy;
return DSDOT_K(n, x, incx, y, incy);
ret=DSDOT_K(n, x, incx, y, incy);
FUNCTION_PROFILE_END(1, n, n);
IDEBUG_END;
return 0;
return ret;
}

View File

@ -7,6 +7,12 @@
#define GAMSQ 16777216.e0
#define RGAMSQ 5.9604645e-8
#ifdef DOUBLE
#define ABS(x) fabs(x)
#else
#define ABS(x) fabsf(x)
#endif
#ifndef CBLAS
void NAME(FLOAT *dd1, FLOAT *dd2, FLOAT *dx1, FLOAT *DY1, FLOAT *dparam){
@ -47,7 +53,7 @@ void CNAME(FLOAT *dd1, FLOAT *dd2, FLOAT *dx1, FLOAT dy1, FLOAT *dparam){
dq2 = dp2 * dy1;
dq1 = dp1 * *dx1;
if (! (abs(dq1) > abs(dq2))) goto L40;
if (! (ABS(dq1) > ABS(dq2))) goto L40;
dh21 = -(dy1) / *dx1;
dh12 = dp2 / dp1;
@ -140,7 +146,7 @@ L150:
goto L130;
L160:
if (! (abs(*dd2) <= RGAMSQ)) {
if (! (ABS(*dd2) <= RGAMSQ)) {
goto L190;
}
if (*dd2 == ZERO) {
@ -157,7 +163,7 @@ L180:
goto L160;
L190:
if (! (abs(*dd2) >= GAMSQ)) {
if (! (ABS(*dd2) >= GAMSQ)) {
goto L220;
}
igo = 3;

View File

@ -53,6 +53,11 @@ SBLASOBJS += setparam$(TSUFFIX).$(SUFFIX)
CCOMMON_OPT += -DTS=$(TSUFFIX)
endif
KERNEL_INTERFACE = ../common_level1.h ../common_level2.h ../common_level3.h
ifneq ($(NO_LAPACK), 1)
KERNEL_INTERFACE += ../common_lapack.h
endif
ifeq ($(ARCH), x86)
COMMONOBJS += cpuid.$(SUFFIX)
endif
@ -88,9 +93,10 @@ setparam$(TSUFFIX).$(SUFFIX): setparam$(TSUFFIX).c kernel$(TSUFFIX).h
setparam$(TSUFFIX).c : setparam-ref.c
sed 's/TS/$(TSUFFIX)/g' $< > $(@F)
kernel$(TSUFFIX).h : ../common_level1.h ../common_level2.h ../common_level3.h ../common_lapack.h
kernel$(TSUFFIX).h : $(KERNEL_INTERFACE)
sed 's/\ *(/$(TSUFFIX)(/g' $^ > $(@F)
cpuid.$(SUFFIX): $(KERNELDIR)/cpuid.S
$(CC) -c $(CFLAGS) $< -o $(@F)
@ -112,10 +118,10 @@ lsame.$(PSUFFIX): $(KERNELDIR)/$(LSAME_KERNEL)
cpuid.$(PSUFFIX): $(KERNELDIR)/cpuid.S
$(CC) -c $(PFLAGS) $< -o $(@F)
ifdef DYNAMIC_ARCH
#ifdef DYNAMIC_ARCH
clean ::
@rm -f setparam_*.c kernel_*.h setparam.h kernel.h
endif
#endif
include $(TOPDIR)/Makefile.tail

View File

@ -668,7 +668,7 @@ $(KDIR)qdot_k$(TSUFFIX).$(SUFFIX) $(KDIR)qdot_k$(TPSUFFIX).$(PSUFFIX) : $(KERNEL
$(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE $< -o $@
$(KDIR)dsdot_k$(TSUFFIX).$(SUFFIX) $(KDIR)dsdot_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SDOTKERNEL)
$(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE $< -o $@
$(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DDSDOT $< -o $@
$(KDIR)sdot_k$(TSUFFIX).$(SUFFIX) $(KDIR)sdot_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SDOTKERNEL)
$(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE $< -o $@

View File

@ -300,7 +300,11 @@
.align 3
.L999:
j $31
ADD s1, s1, s2
#ifdef DSDOT
cvt.d.s s1, s1
#endif
j $31
NOP
EPILOGUE

View File

@ -101,7 +101,11 @@ gotoblas_t TABLE_NAME = {
#endif
ssymm_outcopyTS, ssymm_oltcopyTS,
#ifndef NO_LAPACK
sneg_tcopyTS, slaswp_ncopyTS,
#else
NULL,NULL,
#endif
0, 0, 0,
DGEMM_DEFAULT_UNROLL_M, DGEMM_DEFAULT_UNROLL_N, MAX(DGEMM_DEFAULT_UNROLL_M, DGEMM_DEFAULT_UNROLL_N),
@ -147,7 +151,11 @@ gotoblas_t TABLE_NAME = {
#endif
dsymm_outcopyTS, dsymm_oltcopyTS,
#ifndef NO_LAPACK
dneg_tcopyTS, dlaswp_ncopyTS,
#else
NULL, NULL,
#endif
#ifdef EXPRECISION
@ -195,7 +203,11 @@ gotoblas_t TABLE_NAME = {
#endif
qsymm_outcopyTS, qsymm_oltcopyTS,
#ifndef NO_LAPACK
qneg_tcopyTS, qlaswp_ncopyTS,
#else
NULL, NULL,
#endif
#endif
@ -286,7 +298,11 @@ gotoblas_t TABLE_NAME = {
chemm3m_oucopyrTS, chemm3m_olcopyrTS,
chemm3m_oucopyiTS, chemm3m_olcopyiTS,
#ifndef NO_LAPACK
cneg_tcopyTS, claswp_ncopyTS,
#else
NULL, NULL,
#endif
0, 0, 0,
ZGEMM_DEFAULT_UNROLL_M, ZGEMM_DEFAULT_UNROLL_N, MAX(ZGEMM_DEFAULT_UNROLL_M, ZGEMM_DEFAULT_UNROLL_N),
@ -375,7 +391,11 @@ gotoblas_t TABLE_NAME = {
zhemm3m_oucopyrTS, zhemm3m_olcopyrTS,
zhemm3m_oucopyiTS, zhemm3m_olcopyiTS,
#ifndef NO_LAPACK
zneg_tcopyTS, zlaswp_ncopyTS,
#else
NULL, NULL,
#endif
#ifdef EXPRECISION
@ -466,7 +486,11 @@ gotoblas_t TABLE_NAME = {
xhemm3m_oucopyrTS, xhemm3m_olcopyrTS,
xhemm3m_oucopyiTS, xhemm3m_olcopyiTS,
#ifndef NO_LAPACK
xneg_tcopyTS, xlaswp_ncopyTS,
#else
NULL, NULL,
#endif
#endif

View File

@ -1541,5 +1541,8 @@
popl %ebx
popl %esi
popl %edi
/*remove the hidden return value address from the stack.*/
popl %ecx
xchgl %ecx, 0(%esp)
ret
EPILOGUE

View File

@ -1286,6 +1286,10 @@
haddps %xmm0, %xmm0
#endif
#ifdef DSDOT
cvtss2sd %xmm0, %xmm0
#endif
RESTOREREGISTERS
ret

View File

@ -544,7 +544,7 @@
jg .L11
#if defined(TRMMKERNEL) && !defined(LEFT)
addq $1, KK
addq $4, KK
#endif
leaq (C, LDC, 4), C
@ -594,7 +594,7 @@
jg .L11
#if defined(TRMMKERNEL) && !defined(LEFT)
addq $1, KK
addq $4, KK
#endif
leaq (C, LDC, 4), C

View File

@ -0,0 +1,21 @@
/*
 * This is only for the "make install" target: the install step appends this
 * template to the generated openblas_config.h.
 */

/* BLASFUNC(FUNC): symbol spelling for FUNC; a trailing underscore is
   appended when NEEDBUNDERSCORE is defined. */
#ifndef NEEDBUNDERSCORE
#define BLASFUNC(FUNC) FUNC
#else
#define BLASFUNC(FUNC) FUNC##_
#endif

/* BLASLONG / BLASULONG: "long long" based only when both OS_WINDOWS and
   __64BIT__ are defined, plain "long" based otherwise. */
#if !defined(OS_WINDOWS) || !defined(__64BIT__)
typedef long BLASLONG;
typedef unsigned long BLASULONG;
#else
typedef long long BLASLONG;
typedef unsigned long long BLASULONG;
#endif

/* blasint: BLASLONG when USE64BITINT is defined, int otherwise. */
#ifndef USE64BITINT
typedef int blasint;
#else
typedef BLASLONG blasint;
#endif

View File

@ -128,6 +128,8 @@ CBLASOBJS = $(CBLAS1OBJS) $(CBLAS2OBJS) $(CBLAS3OBJS)
ZBLASOBJS = $(ZBLAS1OBJS) $(ZBLAS2OBJS) $(ZBLAS3OBJS)
XBLASOBJS = $(XBLAS1OBJS) $(XBLAS2OBJS) $(XBLAS3OBJS)
ifneq ($(NO_LAPACK), 1)
SBLASOBJS += \
sgetf2f.$(SUFFIX) sgetrff.$(SUFFIX) slauu2f.$(SUFFIX) slauumf.$(SUFFIX) \
spotf2f.$(SUFFIX) spotrff.$(SUFFIX) strti2f.$(SUFFIX) strtrif.$(SUFFIX) \
@ -160,6 +162,7 @@ XBLASOBJS +=
xpotf2f.$(SUFFIX) xpotrff.$(SUFFIX) xtrti2f.$(SUFFIX) xtrtrif.$(SUFFIX) \
xlaswpf.$(SUFFIX) xgetrsf.$(SUFFIX) xgesvf.$(SUFFIX) xpotrif.$(SUFFIX) \
endif
include $(TOPDIR)/Makefile.tail

View File

@ -5,12 +5,12 @@ include $(TOPDIR)/Makefile.system
TARGET=openblas_utest
CUNIT_LIB=/usr/local/lib/libcunit.a
OBJS=main.o test_rot.o test_swap.o test_axpy.o test_dotu.o
OBJS=main.o test_rot.o test_swap.o test_axpy.o test_dotu.o test_rotmg.o test_dsdot.o
all : run_test
$(TARGET): $(OBJS)
$(CC) -o $@ $^ ../$(LIBNAME) $(CUNIT_LIB) $(EXTRALIB)
$(FC) -o $@ $^ ../$(LIBNAME) $(CUNIT_LIB) $(EXTRALIB)
run_test: $(TARGET)
./$(TARGET)

View File

@ -57,4 +57,8 @@ void test_caxpy_inc_0(void);
void test_zdotu_n_1(void);
void test_zdotu_offset_1(void);
void test_drotmg(void);
void test_dsdot_n_1(void);
#endif

View File

@ -54,7 +54,10 @@ CU_TestInfo test_level1[]={
{"Testing zdotu with n == 1",test_zdotu_n_1},
{"Testing zdotu with input x & y offset == 1",test_zdotu_offset_1},
{"Testing drotmg",test_drotmg},
{"Testing dsdot with n == 1",test_dsdot_n_1},
CU_TEST_INFO_NULL,
};

50
utest/test_dsdot.c Normal file
View File

@ -0,0 +1,50 @@
/*****************************************************************************
Copyright (c) 2011, Lab of Parallel Software and Computational Science,ICSAS
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the ISCAS nor the names of its contributors may
be used to endorse or promote products derived from this software
without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
**********************************************************************************/
#include "common_utest.h"
/*
 * dsdot on single-element vectors: the library routine and the reference
 * routine are called with the same arguments and their results must agree
 * within CHECK_EPS.
 */
void test_dsdot_n_1()
{
	int len = 1;
	int stride_x = 1, stride_y = 1;
	float x = 0.172555164;
	float y = -0.0138700781;
	double got = 0.0;
	double expected = 0.0;

	/* routine under test */
	got = BLASFUNC(dsdot)(&len, &x, &stride_x, &y, &stride_y);
	/* reference routine */
	expected = BLASFUNC_REF(dsdot)(&len, &x, &stride_x, &y, &stride_y);

	CU_ASSERT_DOUBLE_EQUAL(got, expected, CHECK_EPS);
}

60
utest/test_rotmg.c Normal file
View File

@ -0,0 +1,60 @@
/*****************************************************************************
Copyright (c) 2011, Lab of Parallel Software and Computational Science,ICSAS
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the ISCAS nor the names of its contributors may
be used to endorse or promote products derived from this software
without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
**********************************************************************************/
#include "common_utest.h"
/*
 * drotmg: run the library routine and the reference routine on identical
 * inputs and require every in/out scalar and every param entry to agree
 * within CHECK_EPS.
 */
void test_drotmg()
{
	double te_d1, tr_d1;
	double te_d2, tr_d2;
	double te_x1, tr_x1;
	double te_y1, tr_y1;
	/* Zero-initialised: drotmg only stores the param entries selected by
	   the flag it computes, so without this the comparison loop below
	   would read indeterminate values for the entries left untouched. */
	double te_param[5] = {0.0}, tr_param[5] = {0.0};
	int i=0;

	te_d1= tr_d1=0.21149573940783739;
	te_d2= tr_d2=0.046892057172954082;
	te_x1= tr_x1=-0.42272687517106533;
	te_y1= tr_y1=0.42211309121921659;

	//OpenBLAS
	BLASFUNC(drotmg)(&te_d1, &te_d2, &te_x1, &te_y1, te_param);
	//reference
	BLASFUNC_REF(drotmg)(&tr_d1, &tr_d2, &tr_x1, &tr_y1, tr_param);

	CU_ASSERT_DOUBLE_EQUAL(te_d1, tr_d1, CHECK_EPS);
	CU_ASSERT_DOUBLE_EQUAL(te_d2, tr_d2, CHECK_EPS);
	CU_ASSERT_DOUBLE_EQUAL(te_x1, tr_x1, CHECK_EPS);
	CU_ASSERT_DOUBLE_EQUAL(te_y1, tr_y1, CHECK_EPS);

	for(i=0; i<5; i++){
		CU_ASSERT_DOUBLE_EQUAL(te_param[i], tr_param[i], CHECK_EPS);
	}
}