From 0b4602b753812100abb556f9c84c600231b6a940 Mon Sep 17 00:00:00 2001 From: Tony Kelman Date: Fri, 24 Oct 2014 22:27:00 -0700 Subject: [PATCH 1/8] add SYMBOLPREFIX and SYMBOLSUFFIX makefile options for adding a prefix or suffix to all exported symbol names in the shared library Useful to avoid conflicts with other BLAS libraries, especially when using 64 bit integer interfaces in OpenBLAS Note that since OSX does not have the objcopy utility, setting these options to non-empty values on Mac requires the objconv tool, available (GPL license) from http://www.agner.org/optimize/#objconv --- Makefile.system | 10 +++++ exports/Makefile | 40 ++++++++++++++---- exports/gensymbol | 102 +++++++++++++++++++++++++++++++++------------- 3 files changed, 116 insertions(+), 36 deletions(-) diff --git a/Makefile.system b/Makefile.system index d2ff74146..ec6339d62 100644 --- a/Makefile.system +++ b/Makefile.system @@ -186,6 +186,8 @@ LD = $(CROSS_SUFFIX)ld RANLIB = $(CROSS_SUFFIX)ranlib NM = $(CROSS_SUFFIX)nm DLLWRAP = $(CROSS_SUFFIX)dllwrap +OBJCOPY = $(CROSS_SUFFIX)objcopy +OBJCONV = $(CROSS_SUFFIX)objconv # # OS dependent settings @@ -845,6 +847,14 @@ else LIBPREFIX = libopenblas_$(LIBNAMESUFFIX) endif +ifndef SYMBOLPREFIX +SYMBOLPREFIX = +endif + +ifndef SYMBOLSUFFIX +SYMBOLSUFFIX = +endif + KERNELDIR = $(TOPDIR)/kernel/$(ARCH) include $(TOPDIR)/Makefile.$(ARCH) diff --git a/exports/Makefile b/exports/Makefile index c798bc777..f2f688191 100644 --- a/exports/Makefile +++ b/exports/Makefile @@ -88,12 +88,18 @@ dll : ../$(LIBDLLNAME) -Wl,--whole-archive ../$(LIBNAME) -Wl,--no-whole-archive $(FEXTRALIB) $(EXTRALIB) libopenblas.def : gensymbol - perl ./gensymbol win2k $(ARCH) dummy $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) > $(@F) + perl ./gensymbol win2k $(ARCH) dummy $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" > $(@F) libgoto_hpl.def : gensymbol - perl 
./gensymbol win2khpl $(ARCH) dummy $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) > $(@F) + perl ./gensymbol win2khpl $(ARCH) dummy $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" > $(@F) +ifeq (, $(SYMBOLPREFIX)$(SYMBOLSUFFIX)) $(LIBDYNNAME) : ../$(LIBNAME) osx.def +else +../$(LIBNAME).renamed : ../$(LIBNAME) objconv.def + $(OBJCONV) @objconv.def ../$(LIBNAME) ../$(LIBNAME).renamed +$(LIBDYNNAME) : ../$(LIBNAME).renamed osx.def +endif $(FC) $(FFLAGS) -all_load -headerpad_max_install_names -install_name $(CURDIR)/../$(LIBDYNNAME) -dynamiclib -o ../$(LIBDYNNAME) $< -Wl,-exported_symbols_list,osx.def $(FEXTRALIB) dllinit.$(SUFFIX) : dllinit.c @@ -103,16 +109,22 @@ ifeq ($(OSNAME), Linux) so : ../$(LIBSONAME) +ifeq (, $(SYMBOLPREFIX)$(SYMBOLSUFFIX)) ../$(LIBSONAME) : ../$(LIBNAME) linktest.c +else +../$(LIBNAME).renamed : ../$(LIBNAME) objcopy.def + $(OBJCOPY) --redefine-syms objcopy.def ../$(LIBNAME) ../$(LIBNAME).renamed +../$(LIBSONAME) : ../$(LIBNAME).renamed linktest.c +endif ifneq ($(C_COMPILER), LSB) $(CC) $(CFLAGS) $(LDFLAGS) -shared -o ../$(LIBSONAME) \ - -Wl,--whole-archive ../$(LIBNAME) -Wl,--no-whole-archive \ + -Wl,--whole-archive $< -Wl,--no-whole-archive \ -Wl,-soname,$(LIBPREFIX).so.$(MAJOR_VERSION) $(EXTRALIB) $(CC) $(CFLAGS) $(LDFLAGS) -w -o linktest linktest.c ../$(LIBSONAME) $(FEXTRALIB) && echo OK. else #for LSB env LSBCC_SHAREDLIBS=gfortran $(CC) $(CFLAGS) $(LDFLAGS) -shared -o ../$(LIBSONAME) \ - -Wl,--whole-archive ../$(LIBNAME) -Wl,--no-whole-archive \ + -Wl,--whole-archive $< -Wl,--no-whole-archive \ -Wl,-soname,$(LIBPREFIX).so.$(MAJOR_VERSION) $(EXTRALIB) $(FC) $(CFLAGS) $(LDFLAGS) -w -o linktest linktest.c ../$(LIBSONAME) $(FEXTRALIB) && echo OK. 
endif @@ -125,9 +137,15 @@ ifeq ($(OSNAME), $(filter $(OSNAME),FreeBSD NetBSD)) so : ../$(LIBSONAME) +ifeq (, $(SYMBOLPREFIX)$(SYMBOLSUFFIX)) ../$(LIBSONAME) : ../$(LIBNAME) linktest.c +else +../$(LIBNAME).renamed : ../$(LIBNAME) objcopy.def + $(OBJCOPY) --redefine-syms objcopy.def ../$(LIBNAME) ../$(LIBNAME).renamed +../$(LIBSONAME) : ../$(LIBNAME).renamed linktest.c +endif $(CC) $(CFLAGS) $(LDFLAGS) -shared -o ../$(LIBSONAME) \ - -Wl,--whole-archive ../$(LIBNAME) -Wl,--no-whole-archive \ + -Wl,--whole-archive $< -Wl,--no-whole-archive \ $(FEXTRALIB) $(EXTRALIB) $(CC) $(CFLAGS) $(LDFLAGS) -w -o linktest linktest.c ../$(LIBSONAME) $(FEXTRALIB) && echo OK. rm -f linktest @@ -178,17 +196,23 @@ static : ../$(LIBNAME) rm -f goto.$(SUFFIX) osx.def : gensymbol ../Makefile.system ../getarch.c - perl ./gensymbol osx $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) > $(@F) + perl ./gensymbol osx $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" > $(@F) aix.def : gensymbol ../Makefile.system ../getarch.c - perl ./gensymbol aix $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) > $(@F) + perl ./gensymbol aix $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" > $(@F) + +objcopy.def : gensymbol ../Makefile.system ../getarch.c + perl ./gensymbol objcopy $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" > $(@F) + +objconv.def : gensymbol ../Makefile.system ../getarch.c + perl ./gensymbol objconv $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" > $(@F) test : linktest.c $(CC) $(CFLAGS) $(LDFLAGS) -w -o linktest linktest.c 
../$(LIBSONAME) -lm && echo OK. rm -f linktest linktest.c : gensymbol ../Makefile.system ../getarch.c - perl ./gensymbol linktest $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) > linktest.c + perl ./gensymbol linktest $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" > linktest.c clean :: @rm -f *.def *.dylib __.SYMDEF* diff --git a/exports/gensymbol b/exports/gensymbol index bcea83667..8bd2f17af 100644 --- a/exports/gensymbol +++ b/exports/gensymbol @@ -2784,22 +2784,26 @@ $bu = $ARGV[2]; $bu = "" if (($bu eq "0") || ($bu eq "1")); +$symbolprefix = $ARGV[9]; + +$symbolsuffix = $ARGV[10]; + if ($ARGV[0] eq "osx"){ @underscore_objs = (@underscore_objs, @misc_common_objs); @no_underscore_objs = (@no_underscore_objs, @misc_common_objs); foreach $objs (@underscore_objs) { - print "_", $objs, $bu, "\n"; + print "_", $symbolprefix, $objs, $bu, $symbolsuffix, "\n"; } foreach $objs (@need_2underscore_objs) { - print "_", $objs, $bu, $bu, "\n"; + print "_", $symbolprefix, $objs, $bu, $bu, $symbolsuffix, "\n"; } # if ($ARGV[4] == 0) { foreach $objs (@no_underscore_objs) { - print "_", $objs, "\n"; + print "_", $symbolprefix, $objs, $symbolsuffix, "\n"; } # } exit(0); @@ -2811,16 +2815,58 @@ if ($ARGV[0] eq "aix"){ @no_underscore_objs = (@no_underscore_objs, @misc_common_objs); foreach $objs (@underscore_objs) { - print $objs, $bu, "\n"; + print $symbolprefix, $objs, $bu, $symbolsuffix, "\n"; } foreach $objs (@need_2underscore_objs) { - print $objs, $bu, $bu, "\n"; + print $symbolprefix, $objs, $bu, $bu, $symbolsuffix, "\n"; } # if ($ARGV[4] == 0) { foreach $objs (@no_underscore_objs) { - print $objs, "\n"; + print $symbolprefix, $objs, $symbolsuffix, "\n"; + } +# } + exit(0); +} + +if ($ARGV[0] eq "objcopy"){ + + @underscore_objs = (@underscore_objs, @misc_common_objs); + @no_underscore_objs = (@no_underscore_objs, @misc_common_objs); 
+ + foreach $objs (@underscore_objs) { + print $objs, $bu, " ", $symbolprefix, $objs, $bu, $symbolsuffix, "\n"; + } + + foreach $objs (@need_2underscore_objs) { + print $objs, $bu, $bu, " ", $symbolprefix, $objs, $bu, $bu, $symbolsuffix, "\n"; + } + +# if ($ARGV[4] == 0) { + foreach $objs (@no_underscore_objs) { + print $objs, " ", $symbolprefix, $objs, $symbolsuffix, "\n"; + } +# } + exit(0); +} + +if ($ARGV[0] eq "objconv"){ + + @underscore_objs = (@underscore_objs, @misc_common_objs); + @no_underscore_objs = (@no_underscore_objs, @misc_common_objs); + + foreach $objs (@underscore_objs) { + print "-nr:_", $objs, $bu, ":_", $symbolprefix, $objs, $bu, $symbolsuffix, "\n"; + } + + foreach $objs (@need_2underscore_objs) { + print "-nr:_", $objs, $bu, $bu, ":_", $symbolprefix, $objs, $bu, $bu, $symbolsuffix, "\n"; + } + +# if ($ARGV[4] == 0) { + foreach $objs (@no_underscore_objs) { + print "-nr:_", $objs, ":_", $symbolprefix, $objs, $symbolsuffix, "\n"; } # } exit(0); @@ -2835,22 +2881,22 @@ if ($ARGV[0] eq "win2k"){ foreach $objs (@underscore_objs) { $uppercase = $objs; $uppercase =~ tr/[a-z]/[A-Z]/; - print "\t$objs=$objs","_ \@", $count, "\n"; + print "\t",$symbolprefix, $objs, $symbolsuffix, "=$objs","_ \@", $count, "\n"; $count ++; - print "\t",$objs, "_=$objs","_ \@", $count, "\n"; + print "\t",$symbolprefix, $objs, "_", $symbolsuffix, "=$objs","_ \@", $count, "\n"; $count ++; - print "\t$uppercase=$objs", "_ \@", $count, "\n"; + print "\t",$symbolprefix, $uppercase, $symbolsuffix, "=$objs", "_ \@", $count, "\n"; $count ++; } foreach $objs (@need_2underscore_objs) { $uppercase = $objs; $uppercase =~ tr/[a-z]/[A-Z]/; - print "\t$objs=$objs","__ \@", $count, "\n"; + print "\t",$symbolprefix, $objs, $symbolsuffix, "=$objs","__ \@", $count, "\n"; $count ++; - print "\t",$objs, "__=$objs","__ \@", $count, "\n"; + print "\t",$symbolprefix, $objs, "__", $symbolsuffix, "=$objs","__ \@", $count, "\n"; $count ++; - print "\t$uppercase=$objs", "__ \@", $count, "\n"; + 
print "\t",$symbolprefix, $uppercase, $symbolsuffix, "=$objs", "__ \@", $count, "\n"; $count ++; } @@ -2859,15 +2905,15 @@ if ($ARGV[0] eq "win2k"){ $uppercase = $objs; $uppercase =~ tr/[a-z]/[A-Z]/; - print "\t",$objs, "_=$objs","_ \@", $count, "\n"; + print "\t",$symbolprefix, $objs, "_", $symbolsuffix, "=$objs","_ \@", $count, "\n"; $count ++; - print "\t$uppercase=$objs", "_ \@", $count, "\n"; + print "\t",$symbolprefix, $uppercase, $symbolsuffix, "=$objs", "_ \@", $count, "\n"; $count ++; } foreach $objs (@no_underscore_objs) { - print "\t",$objs,"=$objs"," \@", $count, "\n"; + print "\t",$symbolprefix,$objs,$symbolsuffix,"=$objs"," \@", $count, "\n"; $count ++; } @@ -2880,11 +2926,11 @@ if ($ARGV[0] eq "win2khpl"){ foreach $objs (@hplobjs) { $uppercase = $objs; $uppercase =~ tr/[a-z]/[A-Z]/; - print "\t$objs=$objs","_ \@", $count, "\n"; + print "\t",$symbolprefix, $objs, $symbolsuffix, "=$objs","_ \@", $count, "\n"; $count ++; - print "\t",$objs, "_=$objs","_ \@", $count, "\n"; + print "\t",$symbolprefix, $objs, "_", $symbolsuffix, "=$objs","_ \@", $count, "\n"; $count ++; - print "\t$uppercase=$objs", "_ \@", $count, "\n"; + print "\t",$symbolprefix, $uppercase, $symbolsuffix, "=$objs", "_ \@", $count, "\n"; $count ++; } @@ -2905,24 +2951,24 @@ if ($ARGV[0] eq "microsoft"){ foreach $objs (@underscore_objs) { $uppercase = $objs; $uppercase =~ tr/[a-z]/[A-Z]/; - print "\t$objs = $objs","_\n"; + print "\t",$symbolprefix, $objs, $symbolsuffix, " = $objs","_\n"; $count ++; - print "\t$objs\_ = $objs","_\n"; + print "\t",$symbolprefix, $objs, "\_", $symbolsuffix, " = $objs","_\n"; $count ++; - print "\t$uppercase = $objs","_\n"; + print "\t",$symbolprefix, $uppercase, $symbolsuffix, " = $objs","_\n"; $count ++; - print "\t$uppercase\_ = $objs","_\n"; + print "\t",$symbolprefix, $uppercase, "\_", $symbolsuffix, " = $objs","_\n"; $count ++; } foreach $objs (@need_2underscore_objs) { $uppercase = $objs; $uppercase =~ tr/[a-z]/[A-Z]/; - print "\t$objs=$objs","__ \@", 
$count, "\n"; + print "\t",$symbolprefix, $objs, $symbolsuffix, "=$objs","__ \@", $count, "\n"; $count ++; - print "\t",$objs, "__=$objs","__ \@", $count, "\n"; + print "\t",$symbolprefix, $objs, "__", $symbolsuffix, "=$objs","__ \@", $count, "\n"; $count ++; - print "\t$uppercase=$objs", "__ \@", $count, "\n"; + print "\t",$symbolprefix, $uppercase, $symbolsuffix, "=$objs", "__ \@", $count, "\n"; $count ++; } @@ -2936,16 +2982,16 @@ if ($ARGV[0] eq "linktest"){ print "int main(void){\n"; foreach $objs (@underscore_objs) { - print $objs, $bu, "();\n" if $objs ne "xerbla"; + print $symbolprefix, $objs, $bu, $symbolsuffix, "();\n" if $objs ne "xerbla"; } foreach $objs (@need_2underscore_objs) { - print $objs, $bu, $bu, "();\n"; + print $symbolprefix, $objs, $bu, $bu, $symbolsuffix, "();\n"; } # if ($ARGV[4] == 0) { foreach $objs (@no_underscore_objs) { - print $objs, "();\n"; + print $symbolprefix, $objs, $symbolsuffix, "();\n"; } # } From 695e0fa649cd76b894069990cdbfd5590f16c401 Mon Sep 17 00:00:00 2001 From: Zhang Xianyi Date: Mon, 10 Nov 2014 14:39:56 +0800 Subject: [PATCH 2/8] #463 fixed a compiling bug on AIX. --- driver/others/openblas_get_parallel.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/driver/others/openblas_get_parallel.c b/driver/others/openblas_get_parallel.c index ea2e4d986..76107dabd 100644 --- a/driver/others/openblas_get_parallel.c +++ b/driver/others/openblas_get_parallel.c @@ -40,6 +40,8 @@ static int parallel = 1; static int parallel = 0; #endif + +#ifdef NEEDBUNDERSCORE int CNAME() { return parallel; } @@ -48,5 +50,10 @@ int NAME() { return parallel; } - +#else +//The CNAME and NAME are the same. +int NAME() { + return parallel; +} +#endif From 2987bc7b40e0538f38b130a5035a210fc4a89199 Mon Sep 17 00:00:00 2001 From: Zhang Xianyi Date: Mon, 10 Nov 2014 17:15:34 +0800 Subject: [PATCH 3/8] refs #464. Fixed the bug of detecting L2 associative on x86. 
--- cpuid_x86.c | 41 +++++++++++++++++++++++++++++++++-------- 1 file changed, 33 insertions(+), 8 deletions(-) diff --git a/cpuid_x86.c b/cpuid_x86.c index f9df7221b..44446e582 100644 --- a/cpuid_x86.c +++ b/cpuid_x86.c @@ -853,11 +853,24 @@ int get_cacheinfo(int type, cache_info_t *cacheinfo){ if (get_vendor() == VENDOR_INTEL) { cpuid(0x80000000, &cpuid_level, &ebx, &ecx, &edx); if (cpuid_level >= 0x80000006) { - cpuid(0x80000006, &eax, &ebx, &ecx, &edx); + if(L2.size<=0){ + //If we didn't detect L2 correctly before, + cpuid(0x80000006, &eax, &ebx, &ecx, &edx); - L2.size = BITMASK(ecx, 16, 0xffff); - L2.associative = BITMASK(ecx, 12, 0x0f); - L2.linesize = BITMASK(ecx, 0, 0xff); + L2.size = BITMASK(ecx, 16, 0xffff); + L2.associative = BITMASK(ecx, 12, 0x0f); + + switch (L2.associative){ + case 0x06: + L2.associative = 8; + break; + case 0x08: + L2.associative = 16; + break; + } + + L2.linesize = BITMASK(ecx, 0, 0xff); + } } } @@ -916,10 +929,22 @@ int get_cacheinfo(int type, cache_info_t *cacheinfo){ if (L2ITB.associative == 0xff) L2ITB.associative = 0; L2ITB.linesize = BITMASK(ebx, 0, 0xff); - L2.size = BITMASK(ecx, 16, 0xffff); - L2.associative = BITMASK(ecx, 12, 0xf); - if (L2.associative == 0xff) L2.associative = 0; - L2.linesize = BITMASK(ecx, 0, 0xff); + if(L2.size <= 0){ + //If we didn't detect L2 correctly before, + L2.size = BITMASK(ecx, 16, 0xffff); + L2.associative = BITMASK(ecx, 12, 0xf); + switch (L2.associative){ + case 0x06: + L2.associative = 8; + break; + case 0x08: + L2.associative = 16; + break; + } + + if (L2.associative == 0xff) L2.associative = 0; + L2.linesize = BITMASK(ecx, 0, 0xff); + } L3.size = BITMASK(edx, 18, 0x3fff) * 512; L3.associative = BITMASK(edx, 12, 0xf); From 58c90d5937cc5cc225e96cc60457401c07e07165 Mon Sep 17 00:00:00 2001 From: Benedikt Huber Date: Thu, 9 Oct 2014 06:52:10 -0700 Subject: [PATCH 4/8] # The first commit's message is: Optimizations for APM's xgene-1 (aarch64). 1) general system updates to support armv8 better. 
Make all did not work, one needed to supply TARGET=ARMV8. 2) sgem 4x4 kernel in assembler using SIMD, and configuration changes to use it. 3) strmm 4x4 kernel in C. Since the sgem kernel does 4x4, the trmm kernel must also do 4xN. Added Dave Nuechterlein to the contributors list. --- CONTRIBUTORS.md | 4 + common_arm64.h | 5 +- cpuid_arm64.c | 217 +++++ getarch.c | 18 +- kernel/arm64/KERNEL.ARMV8 | 8 +- kernel/arm64/sgemm_kernel_4x4.S | 1327 +++++++++++++++++++++++++++++++ kernel/generic/trmmkernel_4x4.c | 875 ++++++++++++++++++++ param.h | 4 +- 8 files changed, 2442 insertions(+), 16 deletions(-) create mode 100644 cpuid_arm64.c create mode 100644 kernel/arm64/sgemm_kernel_4x4.S create mode 100644 kernel/generic/trmmkernel_4x4.c diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index 18a218cec..02d15b7f3 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -117,5 +117,9 @@ In chronological order: * Isaac Dunham * [2014-08-03] Fixed link error on Linux/musl +* Dave Nuechterlein + * [2014-10-10] trmm and sgemm kernels (optimized for APM's X-Gene 1). + ARMv8 support. 
+ * [Your name or handle] <[email or website]> * [Date] [Brief summary of your changes] diff --git a/common_arm64.h b/common_arm64.h index 8a66a1702..4855493da 100644 --- a/common_arm64.h +++ b/common_arm64.h @@ -119,9 +119,9 @@ static inline int blas_quickdivide(blasint x, blasint y){ } #if defined(DOUBLE) -#define GET_IMAGE(res) __asm__ __volatile__("vstr.f64 d1, %0" : "=m"(res) : : "memory") +#define GET_IMAGE(res) __asm__ __volatile__("str d1, %0" : "=m"(res) : : "memory") #else -#define GET_IMAGE(res) __asm__ __volatile__("vstr.f32 s1, %0" : "=m"(res) : : "memory") +#define GET_IMAGE(res) __asm__ __volatile__("str s1, %0" : "=m"(res) : : "memory") #endif #define GET_IMAGE_CANCEL @@ -138,7 +138,6 @@ static inline int blas_quickdivide(blasint x, blasint y){ #if defined(ASSEMBLER) && !defined(NEEDPARAM) #define PROLOGUE \ - .arm ;\ .global REALNAME ;\ .func REALNAME ;\ REALNAME: diff --git a/cpuid_arm64.c b/cpuid_arm64.c new file mode 100644 index 000000000..c7a27f891 --- /dev/null +++ b/cpuid_arm64.c @@ -0,0 +1,217 @@ +/************************************************************************** + Copyright (c) 2013, The OpenBLAS Project + All rights reserved. + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. 
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+ LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *****************************************************************************/
+
+#include <string.h>
+
+#define CPU_UNKNOWN 0
+#define CPU_ARMV8 1
+
+static char *cpuname[] = {	/* core names indexed by the CPU_* constants above; returned by get_corename() */
+	"UNKOWN",	/* sic - kept byte-for-byte. NOTE(review): presumably other build files match this spelling; verify before correcting */
+	"ARMV8"
+};
+
+/* Return 1 if the token `search` is listed on the "Features" line of /proc/cpuinfo, else 0 (always 0 unless built with `linux` defined). */
+int get_feature(char *search)
+{
+
+#ifdef linux
+	FILE *infile;
+	char buffer[2048], *p,*t;
+	p = (char *) NULL ;
+
+	infile = fopen("/proc/cpuinfo", "r");
+	if (infile == NULL) return(0);	/* cannot read /proc/cpuinfo: report "absent" rather than hand NULL to fgets/fclose */
+	while (fgets(buffer, sizeof(buffer), infile))
+	{
+
+		if (!strncmp("Features", buffer, 8))
+		{
+			p = strchr(buffer, ':') + 2;	/* step over ": " to the first feature token */
+			break;
+		}
+	}
+
+	fclose(infile);
+
+
+	if( p == NULL ) return(0);	/* no "Features" line; an int function must return a value here */
+
+	t = strtok(p," \n");	/* test the first token too; '\n' is a delimiter so the last token drops the newline fgets kept */
+	while( t != NULL )
+	{
+		if (!strcmp(t, search))   { return(1); }
+		t = strtok(NULL," \n");
+	}
+#endif
+	return(0);
+}
+
+/* Classify the host from /proc/cpuinfo: CPU_ARMV8 when the "model name" or "Processor" line contains "AArch64", otherwise CPU_UNKNOWN. */
+int detect(void)
+{
+
+#ifdef linux
+
+	FILE *infile;
+	char buffer[512], *p;
+	p = (char *) NULL ;
+
+	infile = fopen("/proc/cpuinfo", "r");
+	if (infile == NULL) return CPU_UNKNOWN;	/* cannot read /proc/cpuinfo: fall back to "unknown" instead of crashing in fgets */
+	while (fgets(buffer, sizeof(buffer), infile))
+	{
+
+		if ((!strncmp("model name", buffer, 10)) || (!strncmp("Processor", buffer, 9)))
+		{
+			p = strchr(buffer, ':') + 2;	/* step over ": " to the value text */
+			break;
+		}
+	}
+
+	fclose(infile);
+
+	if(p != NULL)
+	{
+
+		if (strstr(p, "AArch64"))
+		{
+			return CPU_ARMV8;
+
+		}
+
+
+	}
+#endif
+
+	return CPU_UNKNOWN;
+}
+/* Name of the detected core: the cpuname[] entry selected by detect(). */
+char *get_corename(void)
+{
+	return cpuname[detect()];
+}
+/* Print the architecture name. NOTE(review): get_subdirname() below prints "arm64"; confirm "ARM" is the intended value here. */
+void get_architecture(void)
+{
+	printf("ARM");
+}
+/* Print the sub-architecture name for the detected core ("ARMV8" or "UNKNOWN"). */
+void get_subarchitecture(void)
+{
+	int d = detect();
+	switch (d)
+	{
+
+		case CPU_ARMV8:
+			printf("ARMV8");
+			break;
+
+		default:
+			printf("UNKNOWN");
+			break;
+	}
+}
+/* Print the kernel sub-directory name. */
+void get_subdirname(void)
+{
+	printf("arm64");
+}
+/* Print cache/TLB #define lines for the detected core; prints nothing for CPU_UNKNOWN. */
+void get_cpuconfig(void)
+{
+
+	int d = detect();
+	switch (d)
+	{
+
+		case CPU_ARMV8:
+			printf("#define ARMV8\n");
+			printf("#define L1_DATA_SIZE 32768\n");
+			printf("#define L1_DATA_LINESIZE 64\n");
+			printf("#define L2_SIZE 262144\n");
+			printf("#define L2_LINESIZE 64\n");
+			printf("#define DTB_DEFAULT_ENTRIES 64\n");
+			printf("#define DTB_SIZE 4096\n");
+			printf("#define L2_ASSOCIATIVE 4\n");	/* NOTE(review): the forced ARMV8 config in getarch.c (same patch) sets L2_ASSOCIATIVE=32; confirm which is right */
+			break;
+
+
+	}
+}
+
+/* Print the library-name suffix for the detected core; prints nothing for CPU_UNKNOWN. */
+void get_libname(void)
+{
+
+	int d = detect();
+	switch (d)
+	{
+
+		case CPU_ARMV8:
+			printf("armv8\n");
+			break;
+
+	}
+}
+
+/* Locate and tokenize the "Features" line of /proc/cpuinfo; the token loop body is empty, so nothing is printed or returned. */
+void get_features(void)
+{
+
+#ifdef linux
+	FILE *infile;
+	char buffer[2048], *p,*t;
+	p = (char *) NULL ;
+
+	infile = fopen("/proc/cpuinfo", "r");
+	if (infile == NULL) return;	/* cannot read /proc/cpuinfo: nothing to scan, and fgets(NULL) would crash */
+	while (fgets(buffer, sizeof(buffer), infile))
+	{
+
+		if (!strncmp("Features", buffer, 8))
+		{
+			p = strchr(buffer, ':') + 2;
+			break;
+		}
+	}
+
+	fclose(infile);
+
+
+	if( p == NULL ) return;
+
+	t = strtok(p," ");
+	while( t = strtok(NULL," "))
+	{
+	}
+
+#endif
+	return;
+}
+
+
diff --git a/getarch.c b/getarch.c
index 3e9914259..ded347ecc 100644
--- a/getarch.c
+++ b/getarch.c
@@ -746,12 +746,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define SUBARCHITECTURE "ARMV8" #define SUBDIRNAME "arm64" #define ARCHCONFIG "-DARMV8 " \ - "-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=32 " \ - "-DL2_SIZE=512488 -DL2_LINESIZE=32 " \ - "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=4 " \ - "-DHAVE_VFP -DHAVE_VFPV3 -DHAVE_VFPV4" + "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \ + "-DL2_SIZE=262144 -DL2_LINESIZE=64 " \ + "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=32 " #define LIBNAME "armv8" -#define CORENAME "ARMV8" +#define CORENAME "XGENE1" #else #endif @@ -801,6 +800,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define OPENBLAS_SUPPORTED #endif +#ifdef __aarch64__ +#include "cpuid_arm64.c" +#define OPENBLAS_SUPPORTED +#endif + #ifndef OPENBLAS_SUPPORTED #error "This arch/CPU is not supported by OpenBLAS." @@ -856,7 +860,7 @@ int main(int argc, char *argv[]){ #ifdef FORCE printf("CORE=%s\n", CORENAME); #else -#if defined(__i386__) || defined(__x86_64__) || defined(POWER) || defined(__mips__) || defined(__arm__) +#if defined(__i386__) || defined(__x86_64__) || defined(POWER) || defined(__mips__) || defined(__arm__) || defined(__aarch64__) printf("CORE=%s\n", get_corename()); #endif #endif @@ -956,7 +960,7 @@ int main(int argc, char *argv[]){ #ifdef FORCE printf("#define CHAR_CORENAME \"%s\"\n", CORENAME); #else -#if defined(__i386__) || defined(__x86_64__) || defined(POWER) || defined(__mips__) || defined(__arm__) +#if defined(__i386__) || defined(__x86_64__) || defined(POWER) || defined(__mips__) || defined(__arm__) || defined(__aarch64__) printf("#define CHAR_CORENAME \"%s\"\n", get_corename()); #endif #endif diff --git a/kernel/arm64/KERNEL.ARMV8 b/kernel/arm64/KERNEL.ARMV8 index 27157dad1..4fc0968cd 100644 --- a/kernel/arm64/KERNEL.ARMV8 +++ b/kernel/arm64/KERNEL.ARMV8 @@ -80,14 +80,14 @@ DGEMVTKERNEL = ../arm/gemv_t.c CGEMVTKERNEL = ../arm/zgemv_t.c ZGEMVTKERNEL = ../arm/zgemv_t.c -STRMMKERNEL = ../generic/trmmkernel_2x2.c +STRMMKERNEL = 
../generic/trmmkernel_4x4.c DTRMMKERNEL = ../generic/trmmkernel_2x2.c CTRMMKERNEL = ../generic/ztrmmkernel_2x2.c ZTRMMKERNEL = ../generic/ztrmmkernel_2x2.c -SGEMMKERNEL = ../generic/gemmkernel_2x2.c -SGEMMONCOPY = ../generic/gemm_ncopy_2.c -SGEMMOTCOPY = ../generic/gemm_tcopy_2.c +SGEMMKERNEL = sgemm_kernel_4x4.S +SGEMMONCOPY = ../generic/gemm_ncopy_4.c +SGEMMOTCOPY = ../generic/gemm_tcopy_4.c SGEMMONCOPYOBJ = sgemm_oncopy.o SGEMMOTCOPYOBJ = sgemm_otcopy.o diff --git a/kernel/arm64/sgemm_kernel_4x4.S b/kernel/arm64/sgemm_kernel_4x4.S new file mode 100644 index 000000000..78633297f --- /dev/null +++ b/kernel/arm64/sgemm_kernel_4x4.S @@ -0,0 +1,1327 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2013/11/23 Saar +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* +* +* 2013/11/02 Saar +* UNROLL_N 4 +* UNROLL_M 4 +* DGEMM_P 128 +* DGEMM_Q 240 +* DGEMM_R 12288 +* A_PRE 128 +* B_PRE 128 +* C_PRE 32 +* +* Performance on Odroid U2: +* +* 3072x3072 1 Core: 2.62 GFLOPS ATLAS: 2.69 GFLOPS +* 3072x3072 2 Cores: 5.23 GFLOPS ATLAS: 5.27 GFLOPS +* 3072x3072 3 Cores: 7.78 GFLOPS ATLAS: 7.87 GFLOPS +* 3072x3072 4 Cores: 10.10 GFLOPS ATLAS: 9.98 GFLOPS +**************************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +/* X0 X1 X2 s0 X3 x4 x5 x6*/ +/*int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc*/ + + +#define origM x0 +#define origN x1 +#define origK x2 +#define origPA x3 +#define origPB x4 +#define pC x5 +#define LDC x6 +#define offset x7 +#define counterL x8 +#define counterI x9 +#define pB x10 +#define counterJ x11 +#define tempALPHA x12 +#define pCRow0 x13 +#define pCRow1 x14 +#define pCRow2 x15 +#define pA x16 + +// 00 origM +// 01 origN +// 02 origK +// 03 origPA +// 04 origPB +// 05 pC +// 06 origLDC -> LDC +// 07 offset +// 08 counterL +// 09 counterI +// 10 pB +// 11 counterJ +// 12 tempALPHA +// 13 pCRow0 +// 14 pCRow1 +// 15 pCRow2 +// 16 pA +// 17 +// 18 must save 
+// 19 must save
+// 20 must save
+// 21 must save
+// 22 must save
+// 23 must save
+// 24 must save
+// 25 must save
+// 26 must save
+// 27 must save
+// 28 must save
+// 29 frame
+// 30 link
+// 31 sp
+
+//v00 orig ALPHA -> a00
+//v01 a01
+//v02 a02
+//v03 a03
+//v04 a10
+//v05 a11
+//v06 a12
+//v07 a13
+//v08 must save b00
+//v09 must save b01
+//v10 must save b02
+//v11 must save b03
+//v12 must save b10
+//v13 must save b11
+//v14 must save b12
+//v15 must save b13
+//v16 must save C00
+//v17 must save C01
+//v18 C02
+//v19 C03
+//v20 C10
+//v21 C11
+//v22 C12
+//v23 C13
+//v24 C20
+//v25 C21
+//v26 C22
+//v27 C23
+//v28 C30
+//v29 C31
+//v30 C32
+//v31 C33
+
+// add sp,sp,#-(6*16)
+// stp x18,x19,[sp,#(0*16)]
+// stp x20,x21,[sp,#(1*16)]
+
+
+/**************************************************************************************
+* Macro definitions
+**************************************************************************************/
+
+.macro INIT4x4
+	// Zero the four 4-lane accumulators. movi is used because x - x is NaN when x holds NaN or Inf.
+	movi v16.4s, #0
+	movi v20.4s, #0
+	movi v24.4s, #0
+	movi v28.4s, #0
+
+.endm
+
+.macro KERNEL4x4_I
+	// First k step of the pipelined loop: products overwrite v16/v20/v24/v28, then operands for the next step are preloaded.
+	ld1 {v8.2s},[pB],#8
+	ld1 {v10.2s},[pB],#8
+	ld1 {v0.4s},[pA],#16
+
+	fmul v16.4s, v0.4s, v8.s[0] // plain fmul: fmulx would return 2.0 for 0 x Inf where the fmla steps give NaN
+	fmul v20.4s, v0.4s, v8.s[1]
+	fmul v24.4s, v0.4s, v10.s[0]
+	fmul v28.4s, v0.4s, v10.s[1]
+
+	ld1 {v12.2s},[pB],#8 // for next round
+	ld1 {v14.2s},[pB],#8 // for next round
+	ld1 {v4.4s},[pA],#16 // for next round
+
+
+.endm
+
+
+.macro KERNEL4x4_M2
+	// Accumulate the step preloaded in v4/v12/v14, then load v0/v8/v10 for the following KERNEL4x4_M1.
+	fmla v16.4s, v4.4s, v12.s[0]
+	fmla v20.4s, v4.4s, v12.s[1]
+	fmla v24.4s, v4.4s, v14.s[0]
+	fmla v28.4s, v4.4s, v14.s[1]
+
+	ld1 {v8.2s},[pB],#8
+	ld1 {v10.2s},[pB],#8
+	ld1 {v0.4s},[pA],#16
+
+.endm
+
+
+.macro KERNEL4x4_M1
+	// Accumulate the step held in v0/v8/v10, then load v4/v12/v14 for the following KERNEL4x4_M2 or KERNEL4x4_E.
+	fmla v16.4s, v0.4s, v8.s[0]
+	fmla v20.4s, v0.4s, v8.s[1]
+	fmla v24.4s, v0.4s, v10.s[0]
+	fmla v28.4s, v0.4s, v10.s[1]
+
+	ld1 {v12.2s},[pB],#8
+	ld1 {v14.2s},[pB],#8
+	ld1 {v4.4s},[pA],#16
+
+.endm
+
+
+
+.macro KERNEL4x4_E
+	// Pipeline tail: accumulate the step already in v4/v12/v14 without loading anything further.
+	fmla v16.4s, v4.4s, v12.s[0]
+	fmla v20.4s, v4.4s, v12.s[1]
+	fmla v24.4s, v4.4s, v14.s[0]
+	fmla v28.4s, v4.4s, v14.s[1]
+
+.endm
+
+
+
+
+.macro KERNEL4x4_SUB
+	// Self-contained single k step: load 4 floats from pB and 4 from pA, accumulate, no preload.
+	ld1 {v8.2s},[pB],#8
+	ld1 {v10.2s},[pB],#8
+	ld1 {v0.4s} , [pA],#16
+
+	fmla v16.4s, v0.4s, v8.s[0]
+	fmla v20.4s, v0.4s, v8.s[1]
+	fmla v24.4s, v0.4s, v10.s[0]
+	fmla v28.4s, v0.4s, v10.s[1]
+
+.endm
+
+
+
+
+.macro SAVE4x4
+	// C += alpha * accumulator for four lines of C spaced LDC apart; only pCRow0 is advanced (by 16 bytes).
+	add pCRow1, pCRow0, LDC // create a second row pointer from the first row pointer
+	mov v0.d[0], tempALPHA // alpha is then read as lane v0.s[0]
+
+	ld1 {v8.4s},[pCRow0] // load 4 values of C from first row
+	fmla v8.4s ,v16.4s,v0.s[0]
+	st1 {v8.4s},[pCRow0],#16 // store C from first row
+
+	ld1 {v12.4s},[pCRow1] // load 4 values of C from second row
+	fmla v12.4s ,v20.4s,v0.s[0]
+	st1 {v12.4s},[pCRow1] // store C from second row
+
+	add pCRow2, pCRow1, LDC // Row2 points to third row
+
+	ld1 {v8.4s},[pCRow2] // load 4 values of C from third row
+	fmla v8.4s ,v24.4s,v0.s[0]
+	st1 {v8.4s} ,[pCRow2] // store C from third row
+
+	add pCRow1, pCRow2 , LDC // row1 points to fourth row
+
+	ld1 {v12.4s},[pCRow1] // load 4 values of C from fourth row
+	fmla v12.4s ,v28.4s,v0.s[0]
+	st1 {v12.4s},[pCRow1] // store fourth row
+
+.endm
+
+/******************************************************************************/
+
+.macro INIT2x4
+	// Zero the eight scalar accumulators of the 2x4 tile.
+	fmov s16, wzr // was fsub s16,s16,s16, which yields NaN when s16 holds NaN or Inf
+	fmov s17, s16
+	fmov s20, s16
+	fmov s21, s16
+	fmov s24, s16
+	fmov s25, s16
+	fmov s28, s16
+	fmov s29, s16
+
+.endm
+
+
+
+.macro KERNEL2x4_SUB
+	// One k step of the 2x4 tile: 4 floats from pB (s8-s11) times 2 floats from pA (s0, s1).
+	ldr s8 , [ pB ]
+	ldr s9 , [ pB, #4 ]
+	ldr s10, [ pB, #8 ]
+	ldr s11, [ pB, #12 ]
+
+	ldr s0 , [ pA ]
+	ldr s1 , [ pA, #4 ]
+
+	fmadd s16 , s0, s8, s16
+	fmadd s17 , s1, s8, s17
+
+	fmadd s20 , s0, s9, s20
+	fmadd s21 , s1, s9, s21
+
+	fmadd s24 , s0, s10, s24
+	fmadd s25 , s1, s10, s25
+
+	fmadd s28 , s0, s11, s28
+	fmadd s29 , s1, s11, s29
+	add pA , pA, #8
+	add pB , pB, #16
+
+.endm
+
+ #define F1ST( op1, op2, op3) fmadd op1, op2, op3, op1
+ #define L1ST( op1, op2, op3) ldr op1, [op2, op3]
+
+.macro SAVE2x4
+	// C += alpha * accumulator for the 2x4 tile: two floats on each of four lines of C spaced LDC apart.
+	add pCRow1 , pCRow0, LDC
+	add pCRow2 , pCRow1, LDC
+	mov v0.d[0], tempALPHA // alpha is then read as s0
+
+	L1ST ( s8,pCRow0, #0)
+	L1ST ( s9,pCRow0, #4 )
+
+	F1ST ( s8 , s0 , s16)
+	F1ST ( s9 , s0 , s17)
+
+	str s8 , [pCRow0, #0]
+	str s9 , [pCRow0, #4 ]
+
+	ldr s12, [pCRow1, #0]
+	ldr s13, [pCRow1, #4 ]
+
+	F1ST ( s12, s0 , s20)
+	F1ST ( s13, s0 , s21)
+
+	str s12, [pCRow1, #0]
+	str s13, [pCRow1, #4 ]
+
+	L1ST ( s8,pCRow2 , #0)
+	L1ST ( s9,pCRow2 , #4 )
+
+	F1ST ( s8 , s0 , s24)
+	F1ST ( s9 , s0 , s25)
+
+	str s8 , [pCRow2 , #0]
+	str s9 , [pCRow2 , #4 ]
+
+	add pCRow1, pCRow2 , LDC // pCRow1 is reused for the fourth line
+
+	ldr s12, [pCRow1, #0]
+	ldr s13, [pCRow1, #4 ]
+
+	F1ST ( s12, s0 , s28)
+	F1ST ( s13, s0 , s29)
+
+	str s12, [pCRow1, #0]
+	str s13, [pCRow1, #4 ]
+
+	add pCRow0, pCRow0, #8
+
+.endm
+
+
+/******************************************************************************/
+
+.macro INIT1x4
+	// Zero the four scalar accumulators of the 1x4 tile.
+	fmov s16, wzr // was fsub s16,s16,s16, which yields NaN when s16 holds NaN or Inf
+	fmov s20, s16
+	fmov s24, s16
+	fmov s28, s16
+
+.endm
+
+
+
+.macro KERNEL1x4_SUB
+	// One k step of the 1x4 tile: 4 floats from pB (s8-s11) times 1 float from pA (s0).
+	ldr s8 , [ pB ]
+	ldr s9 , [ pB, #4 ]
+	ldr s10, [ pB, #8 ]
+	ldr s11, [ pB, #12 ]
+
+	ldr s0 , [ pA ]
+
+	fmadd s16 , s0, s8, s16
+	fmadd s20 , s0, s9, s20
+	fmadd s24 , s0, s10, s24
+	fmadd s28 , s0, s11, s28
+
+	add pA , pA, #4
+	add pB , pB, #16
+
+.endm
+
+.macro SAVE1x4
+	// C += alpha * accumulator for the 1x4 tile: one float on each of four lines of C spaced LDC apart.
+	add pCRow1 , pCRow0, LDC
+	add pCRow2 , pCRow1, LDC
+
+	mov v0.d[0], tempALPHA // alpha is then read as s0
+
+	L1ST ( s8,pCRow0, #0)
+	F1ST ( s8 , s0 , s16)
+	str s8 , [pCRow0, #0]
+
+	L1ST ( s12,pCRow1, #0)
+	F1ST ( s12, s0 , s20)
+	str s12, [pCRow1, #0]
+
+	L1ST ( s8,pCRow2 , #0)
+	F1ST ( s8 , s0 , s24)
+	str s8 , [pCRow2 , #0]
+
+	add pCRow1, pCRow2 , LDC // pCRow1 is reused for the fourth line
+
+	L1ST ( s12,pCRow1, #0)
+	F1ST ( s12, s0 , s28)
+	str s12, [pCRow1, #0]
+
+	add pCRow0, pCRow0, #4
+
+.endm
+
+/******************************************************************************/
+/******************************************************************************/
+
+.macro INIT4x2
+	// Zero the eight scalar accumulators of the 4x2 tile.
+	fmov s16, wzr // was fsub s16,s16,s16, which yields NaN when s16 holds NaN or Inf
+	fmov s17, s16
+	fmov s18, s16
+	fmov s19, s16
+	fmov s20, s16
+	fmov s21, s16
+	fmov s22, s16
+	fmov s23, s16
+
+.endm
+
+
+ +.macro KERNEL4x2_SUB + + ldr s8 , [ pB ] + ldr s9 , [ pB, #4 ] + + ldr s0 , [ pA ] + ldr s1 , [ pA, #4 ] + ldr s2 , [ pA, #8 ] + ldr s3 , [ pA, #12 ] + + fmadd s16 , s0, s8, s16 + fmadd s17 , s1, s8, s17 + fmadd s18 , s2, s8, s18 + fmadd s19 , s3, s8, s19 + + fmadd s20 , s0, s9, s20 + fmadd s21 , s1, s9, s21 + fmadd s22 , s2, s9, s22 + fmadd s23 , s3, s9, s23 + + add pA , pA, #16 + add pB , pB, #8 + +.endm + +.macro SAVE4x2 + + add pCRow1 , pCRow0, LDC + + mov v0.d[0], tempALPHA + + L1ST ( s8,pCRow0, #0) + L1ST ( s9,pCRow0, #4 ) + L1ST ( s10,pCRow0, #8 ) + L1ST ( s11,pCRow0, #12 ) + + F1ST ( s8 , s0 , s16) + F1ST ( s9 , s0 , s17) + F1ST ( s10, s0 , s18) + F1ST ( s11, s0 , s19) + + str s8 , [pCRow0] + str s9 , [pCRow0, #4 ] + str s10, [pCRow0, #8 ] + str s11, [pCRow0, #12 ] + + L1ST ( s12,pCRow1, #0) + L1ST ( s13,pCRow1, #4 ) + L1ST ( s14,pCRow1, #8 ) + L1ST ( s15,pCRow1, #12 ) + + F1ST ( s12, s0 , s20) + F1ST ( s13, s0 , s21) + F1ST ( s14, s0 , s22) + F1ST ( s15, s0 , s23) + + str s12, [pCRow1] + str s13, [pCRow1, #4 ] + str s14, [pCRow1, #8 ] + str s15, [pCRow1, #12 ] + + add pCRow0, pCRow0, #16 + +.endm + + +/******************************************************************************/ + +.macro INIT2x2 + + fsub s16 , s16 , s16 + fmov s17, s16 + fmov s20, s16 + fmov s21, s16 + +.endm + + + +.macro KERNEL2x2_SUB + + ldr s8 , [ pB ] + ldr s9 , [ pB, #4 ] + + ldr s0 , [ pA ] + ldr s1 , [ pA, #4 ] + + fmadd s16 , s0, s8, s16 + fmadd s17 , s1, s8, s17 + + fmadd s20 , s0, s9, s20 + fmadd s21 , s1, s9, s21 + + add pA , pA, #8 + add pB , pB, #8 + +.endm + +.macro SAVE2x2 + + add pCRow1 , pCRow0, LDC + + mov v0.d[0], tempALPHA + + L1ST ( s8,pCRow0, #0 ) + L1ST ( s9,pCRow0, #4 ) + + F1ST ( s8 , s0 , s16) + F1ST ( s9 , s0 , s17) + + str s8 , [pCRow0] + str s9 , [pCRow0, #4 ] + + L1ST ( s12,pCRow1, #0 ) + L1ST ( s13,pCRow1, #4 ) + + F1ST ( s12, s0 , s20) + F1ST ( s13, s0 , s21) + + str s12, [pCRow1] + str s13, [pCRow1, #4 ] + + add pCRow0, pCRow0, #8 + +.endm + 
+/******************************************************************************/ + +.macro INIT1x2 + + fsub s16 , s16 , s16 + fmov s20, s16 + +.endm + + + +.macro KERNEL1x2_SUB + + ldr s8 , [ pB ] + ldr s9 , [ pB, #4 ] + + ldr s0 , [ pA ] + fmadd s16 , s0, s8, s16 + fmadd s20 , s0, s9, s20 + + add pA , pA, #4 + add pB , pB, #8 + +.endm + +.macro SAVE1x2 + + add pCRow1 , pCRow0, LDC + + mov v0.d[0], tempALPHA + + L1ST ( s8,pCRow0, #0) + F1ST ( s8 , s0 , s16) + str s8 , [pCRow0] + + L1ST ( s12,pCRow1, #0) + F1ST ( s12, s0 , s20) + str s12, [pCRow1] + + add pCRow0, pCRow0, #4 + +.endm + +/******************************************************************************/ +/******************************************************************************/ + +.macro INIT4x1 + + fsub s16 , s16 , s16 + fmov s17, s16 + fmov s18, s16 + fmov s19, s16 + +.endm + + + +.macro KERNEL4x1_SUB + + ldr s8 , [ pB ] + + ldr s0 , [ pA ] + ldr s1 , [ pA, #4 ] + ldr s2 , [ pA, #8 ] + ldr s3 , [ pA, #12 ] + + fmadd s16 , s0, s8, s16 + fmadd s17 , s1, s8, s17 + fmadd s18 , s2, s8, s18 + fmadd s19 , s3, s8, s19 + + add pA , pA, #16 + add pB , pB, #4 + +.endm + +.macro SAVE4x1 + + + mov v0.d[0], tempALPHA + + L1ST ( s8,pCRow0, #0 ) + L1ST ( s9,pCRow0, #4 ) + L1ST ( s10,pCRow0, #8 ) + L1ST ( s11,pCRow0, #12 ) + + F1ST ( s8 , s0 , s16) + F1ST ( s9 , s0 , s17) + F1ST ( s10, s0 , s18) + F1ST ( s11, s0 , s19) + + str s8 , [pCRow0] + str s9 , [pCRow0, #4 ] + str s10, [pCRow0, #8 ] + str s11, [pCRow0, #12 ] + + add pCRow0, pCRow0, #16 + +.endm + + + + +/******************************************************************************/ + +.macro INIT2x1 + + fsub s16 , s16 , s16 + fmov s17, s16 + +.endm + + + +.macro KERNEL2x1_SUB + + ldr s8 , [ pB ] + + ldr s0 , [ pA ] + ldr s1 , [ pA, #4 ] + + fmadd s16 , s0, s8, s16 + fmadd s17 , s1, s8, s17 + + add pA , pA, #8 + add pB , pB, #4 + +.endm + +.macro SAVE2x1 + + + mov v0.d[0], tempALPHA + + L1ST ( s8,pCRow0, #0 ) + L1ST ( s9,pCRow0, #4 ) + + F1ST ( s8 , s0 , 
s16) + F1ST ( s9 , s0 , s17) + + str s8 , [pCRow0] + str s9 , [pCRow0, #4 ] + + add pCRow0, pCRow0, #8 + +.endm + +/******************************************************************************/ + +.macro INIT1x1 + + fsub s16 , s16 , s16 + +.endm + + + +.macro KERNEL1x1_SUB + + ldr s8 , [ pB ] + + ldr s0 , [ pA ] + + fmadd s16 , s0, s8, s16 + + add pA , pA, #4 + add pB , pB, #4 + +.endm + +.macro SAVE1x1 + + + mov v0.d[0], tempALPHA + + L1ST ( s8,pCRow0, #0 ) + F1ST ( s8 , s0 , s16) + str s8 , [pCRow0] + + add pCRow0, pCRow0, #4 + +.endm + + + + + +/************************************************************************************** +* End of macro definitions +**************************************************************************************/ + + PROLOGUE + + .align 5 + add sp,sp,#-(5*16) + stp d8,d9,[sp,#(0*16)] + stp d10,d11,[sp,#(1*16)] + stp d12,d13,[sp,#(2*16)] + stp d14,d15,[sp,#(3*16)] + stp d16,d17,[sp,#(4*16)] + + mov tempALPHA, v0.d[0] + lsl LDC, LDC, #2 // ldc = ldc * 4 + + mov pB, origPB + + mov counterJ, origN + asr counterJ, counterJ, #2 // J = J / 4 + cmp counterJ, #0 + ble sgemm_kernel_L2_BEGIN + +sgemm_kernel_L4_BEGIN: + + mov pCRow0, pC // pCRow0 = C + add pC,pC,LDC, lsl #2 + + mov pA, origPA // pA = start of A array + + + +sgemm_kernel_L4_M4_BEGIN: + + mov counterI, origM + asr counterI, counterI, #2 // counterI = counterI / 4 + cmp counterI, #0 + ble sgemm_kernel_L4_M2_BEGIN + +sgemm_kernel_L4_M4_20: + + mov pB, origPB + asr counterL , origK, #1 // L = K / 2 + cmp counterL , #2 // is there at least 4 to do? 
+ blt sgemm_kernel_L4_M4_32 + + + + KERNEL4x4_I //do one in the K + KERNEL4x4_M2 //do another in the K + + subs counterL, counterL, #2 // subtract 2, since one is always done at the tail + ble sgemm_kernel_L4_M4_22a + .align 5 + +sgemm_kernel_L4_M4_22: + + KERNEL4x4_M1 + KERNEL4x4_M2 + + subs counterL, counterL, #1 + bgt sgemm_kernel_L4_M4_22 + +sgemm_kernel_L4_M4_22a: + + KERNEL4x4_M1 + KERNEL4x4_E + + b sgemm_kernel_L4_M4_44 + +sgemm_kernel_L4_M4_32: // less than 4 to do in the K direction + + tst counterL, #1 + ble sgemm_kernel_L4_M4_40 + + KERNEL4x4_I + + KERNEL4x4_E + + b sgemm_kernel_L4_M4_44 + + +sgemm_kernel_L4_M4_40: + + INIT4x4 + + +sgemm_kernel_L4_M4_44: + + ands counterL , origK, #1 + ble sgemm_kernel_L4_M4_100 + +sgemm_kernel_L4_M4_46: + + KERNEL4x4_SUB + + subs counterL, counterL, #1 + bne sgemm_kernel_L4_M4_46 + +sgemm_kernel_L4_M4_100: + + SAVE4x4 + +sgemm_kernel_L4_M4_END: + + subs counterI, counterI, #1 + bne sgemm_kernel_L4_M4_20 + + +sgemm_kernel_L4_M2_BEGIN: + + mov counterI, origM + tst counterI , #3 + ble sgemm_kernel_L4_END + + tst counterI, #2 // counterI = counterI / 2 + ble sgemm_kernel_L4_M1_BEGIN + +sgemm_kernel_L4_M2_20: + + INIT2x4 + + mov pB, origPB + asr counterL , origK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble sgemm_kernel_L4_M2_40 + +sgemm_kernel_L4_M2_22: + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + subs counterL, counterL, #1 + bgt sgemm_kernel_L4_M2_22 + + +sgemm_kernel_L4_M2_40: + + ands counterL , origK, #7 // counterL = counterL % 8 + ble sgemm_kernel_L4_M2_100 + +sgemm_kernel_L4_M2_42: + + KERNEL2x4_SUB + + subs counterL, counterL, #1 + bgt sgemm_kernel_L4_M2_42 + +sgemm_kernel_L4_M2_100: + + SAVE2x4 + +sgemm_kernel_L4_M2_END: + + +sgemm_kernel_L4_M1_BEGIN: + + tst counterI, #1 // counterI = counterI % 2 + ble sgemm_kernel_L4_END + +sgemm_kernel_L4_M1_20: + + INIT1x4 + + mov pB, origPB + asr counterL , origK, #3 // 
counterL = counterL / 8 + cmp counterL , #0 + ble sgemm_kernel_L4_M1_40 + +sgemm_kernel_L4_M1_22: + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + subs counterL, counterL, #1 + bgt sgemm_kernel_L4_M1_22 + + +sgemm_kernel_L4_M1_40: + + ands counterL , origK, #7 // counterL = counterL % 8 + ble sgemm_kernel_L4_M1_100 + +sgemm_kernel_L4_M1_42: + + KERNEL1x4_SUB + + subs counterL, counterL, #1 + bgt sgemm_kernel_L4_M1_42 + +sgemm_kernel_L4_M1_100: + + SAVE1x4 + + +sgemm_kernel_L4_END: + + add origPB, origPB, origK, lsl #4 // B = B + K * 4 * 4 + + subs counterJ, counterJ , #1 // j-- + bgt sgemm_kernel_L4_BEGIN + + + +/*********************************************************************************************/ + +sgemm_kernel_L2_BEGIN: // less than 2 left in N direction + + mov counterJ , origN + tst counterJ , #3 + ble sgemm_kernel_L999 // error, N was less than 4? + + tst counterJ , #2 + ble sgemm_kernel_L1_BEGIN + + mov pCRow0, pC // pCRow0 = pC + add pC , pC, LDC, lsl #1 + + mov pA, origPA // pA = A + + + +sgemm_kernel_L2_M4_BEGIN: + + mov counterI, origM + asr counterI, counterI, #2 // counterI = counterI / 4 + cmp counterI,#0 + ble sgemm_kernel_L2_M2_BEGIN + +sgemm_kernel_L2_M4_20: + + INIT4x2 + + mov pB, origPB + asr counterL , origK, #3 // counterL = counterL / 8 + cmp counterL,#0 + ble sgemm_kernel_L2_M4_40 + .align 5 + +sgemm_kernel_L2_M4_22: + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + subs counterL, counterL, #1 + bgt sgemm_kernel_L2_M4_22 + + +sgemm_kernel_L2_M4_40: + + ands counterL , origK, #7 // counterL = counterL % 8 + ble sgemm_kernel_L2_M4_100 + +sgemm_kernel_L2_M4_42: + + KERNEL4x2_SUB + + subs counterL, counterL, #1 + bgt sgemm_kernel_L2_M4_42 + +sgemm_kernel_L2_M4_100: + + SAVE4x2 + +sgemm_kernel_L2_M4_END: + + subs counterI, counterI, #1 + bgt sgemm_kernel_L2_M4_20 + + 
+sgemm_kernel_L2_M2_BEGIN: + + mov counterI, origM + tst counterI , #3 + ble sgemm_kernel_L2_END + + tst counterI, #2 // counterI = counterI / 2 + ble sgemm_kernel_L2_M1_BEGIN + +sgemm_kernel_L2_M2_20: + + INIT2x2 + + mov pB, origPB + asr counterL , origK, #3 // counterL = counterL / 8 + cmp counterL,#0 + ble sgemm_kernel_L2_M2_40 + +sgemm_kernel_L2_M2_22: + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + subs counterL, counterL, #1 + bgt sgemm_kernel_L2_M2_22 + + +sgemm_kernel_L2_M2_40: + + ands counterL , origK, #7 // counterL = counterL % 8 + ble sgemm_kernel_L2_M2_100 + +sgemm_kernel_L2_M2_42: + + KERNEL2x2_SUB + + subs counterL, counterL, #1 + bgt sgemm_kernel_L2_M2_42 + +sgemm_kernel_L2_M2_100: + + SAVE2x2 + +sgemm_kernel_L2_M2_END: + + +sgemm_kernel_L2_M1_BEGIN: + + tst counterI, #1 // counterI = counterI % 2 + ble sgemm_kernel_L2_END + +sgemm_kernel_L2_M1_20: + + INIT1x2 + + mov pB, origPB + asr counterL , origK, #3 // counterL = counterL / 8 + cmp counterL, #0 + ble sgemm_kernel_L2_M1_40 + +sgemm_kernel_L2_M1_22: + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + subs counterL, counterL, #1 + bgt sgemm_kernel_L2_M1_22 + + +sgemm_kernel_L2_M1_40: + + ands counterL , origK, #7 // counterL = counterL % 8 + ble sgemm_kernel_L2_M1_100 + +sgemm_kernel_L2_M1_42: + + KERNEL1x2_SUB + + subs counterL, counterL, #1 + bgt sgemm_kernel_L2_M1_42 + +sgemm_kernel_L2_M1_100: + + SAVE1x2 + + +sgemm_kernel_L2_END: + add origPB, origPB, origK, lsl #3 // B = B + K * 2 * 4 + +/*********************************************************************************************/ + +sgemm_kernel_L1_BEGIN: + + mov counterJ , origN + tst counterJ , #1 + ble sgemm_kernel_L999 // done + + + mov pCRow0, pC // pCRow0 = C + add pC , pCRow0 , LDC // C01 is the current line, update pC to point to next + + mov pA, origPA // pA = A 
+ + + +sgemm_kernel_L1_M4_BEGIN: + + mov counterI, origM + asr counterI, counterI, #2 // counterI = counterI / 4 + cmp counterI, #0 + ble sgemm_kernel_L1_M2_BEGIN + +sgemm_kernel_L1_M4_20: + + INIT4x1 + + mov pB, origPB + asr counterL , origK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble sgemm_kernel_L1_M4_40 + .align 5 + +sgemm_kernel_L1_M4_22: + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + subs counterL, counterL, #1 + bgt sgemm_kernel_L1_M4_22 + + +sgemm_kernel_L1_M4_40: + + ands counterL , origK, #7 // counterL = counterL % 8 + ble sgemm_kernel_L1_M4_100 + +sgemm_kernel_L1_M4_42: + + KERNEL4x1_SUB + + subs counterL, counterL, #1 + bgt sgemm_kernel_L1_M4_42 + +sgemm_kernel_L1_M4_100: + + SAVE4x1 + +sgemm_kernel_L1_M4_END: + + subs counterI, counterI, #1 + bgt sgemm_kernel_L1_M4_20 + + +sgemm_kernel_L1_M2_BEGIN: + + mov counterI, origM + tst counterI , #3 + ble sgemm_kernel_L1_END + + tst counterI, #2 // counterI = counterI / 2 + ble sgemm_kernel_L1_M1_BEGIN + +sgemm_kernel_L1_M2_20: + + INIT2x1 + + mov pB, origPB + asr counterL , origK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble sgemm_kernel_L1_M2_40 + +sgemm_kernel_L1_M2_22: + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + subs counterL, counterL, #1 + bgt sgemm_kernel_L1_M2_22 + + +sgemm_kernel_L1_M2_40: + + ands counterL , origK, #7 // counterL = counterL % 8 + ble sgemm_kernel_L1_M2_100 + +sgemm_kernel_L1_M2_42: + + KERNEL2x1_SUB + + subs counterL, counterL, #1 + bgt sgemm_kernel_L1_M2_42 + +sgemm_kernel_L1_M2_100: + + SAVE2x1 + +sgemm_kernel_L1_M2_END: + + +sgemm_kernel_L1_M1_BEGIN: + + tst counterI, #1 // counterI = counterI % 2 + ble sgemm_kernel_L1_END + +sgemm_kernel_L1_M1_20: + + INIT1x1 + + mov pB, origPB + asr counterL , origK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble sgemm_kernel_L1_M1_40 + 
+sgemm_kernel_L1_M1_22: + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + subs counterL, counterL, #1 + bgt sgemm_kernel_L1_M1_22 + + +sgemm_kernel_L1_M1_40: + + ands counterL , origK, #7 // counterL = counterL % 8 + ble sgemm_kernel_L1_M1_100 + +sgemm_kernel_L1_M1_42: + + KERNEL1x1_SUB + + subs counterL, counterL, #1 + bgt sgemm_kernel_L1_M1_42 + +sgemm_kernel_L1_M1_100: + + SAVE1x1 + + +sgemm_kernel_L1_END: + + +sgemm_kernel_L999: + mov x0, #0 // set return value + ldp d8,d9,[sp,#(0*16)] + ldp d10,d11,[sp,#(1*16)] + ldp d12,d13,[sp,#(2*16)] + ldp d14,d15,[sp,#(3*16)] + ldp d16,d17,[sp,#(4*16)] + add sp,sp,#(5*16) + ret + + EPILOGUE + diff --git a/kernel/generic/trmmkernel_4x4.c b/kernel/generic/trmmkernel_4x4.c new file mode 100644 index 000000000..a85828cad --- /dev/null +++ b/kernel/generic/trmmkernel_4x4.c @@ -0,0 +1,875 @@ +#include "common.h" +#include + +int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc ,BLASLONG offset) +{ + + BLASLONG i,j,k; + FLOAT *C0,*C1,*C2,*C3,*ptrba,*ptrbb; + + FLOAT res0_0; + FLOAT res0_1; + FLOAT res0_2; + FLOAT res0_3; + + FLOAT res1_0; + FLOAT res1_1; + FLOAT res1_2; + FLOAT res1_3; + + FLOAT res2_0; + FLOAT res2_1; + FLOAT res2_2; + FLOAT res2_3; + + FLOAT res3_0; + FLOAT res3_1; + FLOAT res3_2; + FLOAT res3_3; + + FLOAT a0; + FLOAT a1; + + FLOAT b0; + FLOAT b1; + FLOAT b2; + FLOAT b3; + + BLASLONG off, temp; + + bool left; + bool transposed; + bool backwards; + +#ifdef LEFT + left = true; +#else + left = false; +#endif + +#ifdef TRANSA + transposed = true; +#else + transposed = false; +#endif + + backwards = left != transposed; + + if (!left) { + off = -offset; + } + + + for (j=0; j Date: Mon, 24 Nov 2014 15:34:48 +0800 Subject: [PATCH 5/8] Refs #467. Added generic kernel file for x86_64. 
--- kernel/Makefile.L3 | 4 +++ kernel/x86_64/KERNEL.generic | 52 ++++++++++++++++++++++++++++++++++++ param.h | 14 +++++----- 3 files changed, 63 insertions(+), 7 deletions(-) create mode 100644 kernel/x86_64/KERNEL.generic diff --git a/kernel/Makefile.L3 b/kernel/Makefile.L3 index 268177c0f..5702b7ac8 100644 --- a/kernel/Makefile.L3 +++ b/kernel/Makefile.L3 @@ -28,6 +28,10 @@ ifeq ($(TARGET), LOONGSON3B) USE_TRMM = 1 endif +ifeq ($(TARGET), GENERIC) +USE_TRMM = 1 +endif + SKERNELOBJS += \ diff --git a/kernel/x86_64/KERNEL.generic b/kernel/x86_64/KERNEL.generic new file mode 100644 index 000000000..2bcd83636 --- /dev/null +++ b/kernel/x86_64/KERNEL.generic @@ -0,0 +1,52 @@ +STRMMKERNEL = ../generic/trmmkernel_2x2.c +DTRMMKERNEL = ../generic/trmmkernel_2x2.c +CTRMMKERNEL = ../generic/ztrmmkernel_2x2.c +ZTRMMKERNEL = ../generic/ztrmmkernel_2x2.c + +SGEMMKERNEL = ../generic/gemmkernel_2x2.c +SGEMMONCOPY = ../generic/gemm_ncopy_2.c +SGEMMOTCOPY = ../generic/gemm_tcopy_2.c +SGEMMONCOPYOBJ = sgemm_oncopy.o +SGEMMOTCOPYOBJ = sgemm_otcopy.o + +DGEMMKERNEL = ../generic/gemmkernel_2x2.c +DGEMMONCOPY = ../generic/gemm_ncopy_2.c +DGEMMOTCOPY = ../generic/gemm_tcopy_2.c +DGEMMONCOPYOBJ = dgemm_oncopy.o +DGEMMOTCOPYOBJ = dgemm_otcopy.o + +CGEMMKERNEL = ../generic/zgemmkernel_2x2.c +CGEMMONCOPY = ../generic/zgemm_ncopy_2.c +CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c +CGEMMONCOPYOBJ = cgemm_oncopy.o +CGEMMOTCOPYOBJ = cgemm_otcopy.o + +ZGEMMKERNEL = ../generic/zgemmkernel_2x2.c +ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c +ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c +ZGEMMONCOPYOBJ = zgemm_oncopy.o +ZGEMMOTCOPYOBJ = zgemm_otcopy.o + +STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +DTRSMKERNEL_RT = 
../generic/trsm_kernel_RT.c + +CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +#Todo: CGEMM3MKERNEL should be 4x4 blocksizes. +CGEMM3MKERNEL = zgemm3m_kernel_8x4_sse3.S +ZGEMM3MKERNEL = zgemm3m_kernel_4x4_sse3.S diff --git a/param.h b/param.h index d7a427b65..e7977f898 100644 --- a/param.h +++ b/param.h @@ -2122,25 +2122,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GEMM_DEFAULT_OFFSET_B 0 #define GEMM_DEFAULT_ALIGN 0x0ffffUL -#define SGEMM_DEFAULT_UNROLL_N 4 -#define DGEMM_DEFAULT_UNROLL_N 4 +#define SGEMM_DEFAULT_UNROLL_N 2 +#define DGEMM_DEFAULT_UNROLL_N 2 #define QGEMM_DEFAULT_UNROLL_N 2 #define CGEMM_DEFAULT_UNROLL_N 2 #define ZGEMM_DEFAULT_UNROLL_N 2 #define XGEMM_DEFAULT_UNROLL_N 1 #ifdef ARCH_X86 -#define SGEMM_DEFAULT_UNROLL_M 4 +#define SGEMM_DEFAULT_UNROLL_M 2 #define DGEMM_DEFAULT_UNROLL_M 2 #define QGEMM_DEFAULT_UNROLL_M 2 #define CGEMM_DEFAULT_UNROLL_M 2 -#define ZGEMM_DEFAULT_UNROLL_M 1 +#define ZGEMM_DEFAULT_UNROLL_M 2 #define XGEMM_DEFAULT_UNROLL_M 1 #else -#define SGEMM_DEFAULT_UNROLL_M 8 -#define DGEMM_DEFAULT_UNROLL_M 4 +#define SGEMM_DEFAULT_UNROLL_M 2 +#define DGEMM_DEFAULT_UNROLL_M 2 #define QGEMM_DEFAULT_UNROLL_M 2 -#define CGEMM_DEFAULT_UNROLL_M 4 +#define CGEMM_DEFAULT_UNROLL_M 2 #define ZGEMM_DEFAULT_UNROLL_M 2 #define XGEMM_DEFAULT_UNROLL_M 1 #endif From 2fb02626dacae6a3d85af15ada74415691f6205b Mon Sep 17 00:00:00 2001 From: Zhang Xianyi Date: Tue, 25 Nov 2014 15:28:58 +0800 Subject: [PATCH 6/8] Update organization info. 
--- LICENSE | 9 ++++---- common_arm.h | 10 ++++----- common_arm64.h | 10 ++++----- common_mips64.h | 10 ++++----- common_reference.h | 9 ++++---- cpuid_mips.c | 9 ++++---- driver/others/blas_server.c | 9 ++++---- driver/others/init.c | 9 ++++---- driver/others/memory.c | 9 ++++---- driver/others/openblas_get_config.c | 9 ++++---- driver/others/openblas_get_parallel.c | 2 +- driver/others/openblas_set_num_threads.c | 9 ++++---- getarch.c | 9 ++++---- kernel/generic/zgemm_ncopy_4_sandy.c | 27 ++++++++++++------------ kernel/generic/zgemm_ncopy_8_sandy.c | 27 ++++++++++++------------ kernel/generic/zgemm_tcopy_4_sandy.c | 27 ++++++++++++------------ kernel/generic/zgemm_tcopy_8_sandy.c | 27 ++++++++++++------------ kernel/mips64/axpy_loongson3a.S | 9 ++++---- kernel/mips64/daxpy_loongson3a_simd.S | 9 ++++---- kernel/x86_64/cgemm_kernel_4x8_sandy.S | 27 ++++++++++++------------ kernel/x86_64/dgemm_kernel_4x8_sandy.S | 27 ++++++++++++------------ kernel/x86_64/sgemm_kernel_8x8_sandy.S | 27 ++++++++++++------------ kernel/x86_64/zgemm_kernel_4x4_sandy.S | 27 ++++++++++++------------ param.h | 9 ++++---- utest/common_utest.h | 9 ++++---- utest/main.c | 9 ++++---- utest/test_amax.c | 9 ++++---- utest/test_axpy.c | 9 ++++---- utest/test_dotu.c | 9 ++++---- utest/test_dsdot.c | 9 ++++---- utest/test_fork.c | 11 +++++----- utest/test_rot.c | 9 ++++---- utest/test_rotmg.c | 9 ++++---- utest/test_swap.c | 9 ++++---- version.h | 9 ++++---- 35 files changed, 244 insertions(+), 213 deletions(-) diff --git a/LICENSE b/LICENSE index d15634e8a..22745000f 100644 --- a/LICENSE +++ b/LICENSE @@ -1,4 +1,4 @@ -Copyright (c) 2011,2012 Lab of Parallel Software and Computational Science,ISCAS +Copyright (c) 2011-2014, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without @@ -12,9 +12,10 @@ met: notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - 3. 
Neither the name of the ISCAS nor the names of its contributors may - be used to endorse or promote products derived from this software - without specific prior written permission. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE diff --git a/common_arm.h b/common_arm.h index 130100035..3cf15848a 100644 --- a/common_arm.h +++ b/common_arm.h @@ -1,5 +1,5 @@ /***************************************************************************** -Copyright (c) 2011, Lab of Parallel Software and Computational Science,ICSAS +Copyright (c) 2011-2014, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without @@ -13,9 +13,10 @@ met: notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - 3. Neither the name of the ISCAS nor the names of its contributors may - be used to endorse or promote products derived from this software - without specific prior written permission. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE @@ -27,7 +28,6 @@ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- **********************************************************************************/ /*********************************************************************/ diff --git a/common_arm64.h b/common_arm64.h index 4855493da..ae79c5309 100644 --- a/common_arm64.h +++ b/common_arm64.h @@ -1,5 +1,5 @@ /***************************************************************************** -Copyright (c) 2011, Lab of Parallel Software and Computational Science,ICSAS +Copyright (c) 2011-2014, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without @@ -13,9 +13,10 @@ met: notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - 3. Neither the name of the ISCAS nor the names of its contributors may - be used to endorse or promote products derived from this software - without specific prior written permission. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE @@ -27,7 +28,6 @@ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- **********************************************************************************/ /*********************************************************************/ diff --git a/common_mips64.h b/common_mips64.h index aa85ff213..7cd86b375 100644 --- a/common_mips64.h +++ b/common_mips64.h @@ -1,5 +1,5 @@ /***************************************************************************** -Copyright (c) 2011, Lab of Parallel Software and Computational Science,ICSAS +Copyright (c) 2011-2014, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without @@ -13,9 +13,10 @@ met: notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - 3. Neither the name of the ISCAS nor the names of its contributors may - be used to endorse or promote products derived from this software - without specific prior written permission. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE @@ -27,7 +28,6 @@ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- **********************************************************************************/ /*********************************************************************/ diff --git a/common_reference.h b/common_reference.h index 75bae1faa..93a511bbf 100644 --- a/common_reference.h +++ b/common_reference.h @@ -1,5 +1,5 @@ /***************************************************************************** -Copyright (c) 2011, Lab of Parallel Software and Computational Science,ICSAS +Copyright (c) 2011-2014, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without @@ -13,9 +13,10 @@ met: notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - 3. Neither the name of the ISCAS nor the names of its contributors may - be used to endorse or promote products derived from this software - without specific prior written permission. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE diff --git a/cpuid_mips.c b/cpuid_mips.c index fad105747..22beff7fc 100644 --- a/cpuid_mips.c +++ b/cpuid_mips.c @@ -1,5 +1,5 @@ /***************************************************************************** -Copyright (c) 2011,2012 Lab of Parallel Software and Computational Science,ISCAS +Copyright (c) 2011-2014, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without @@ -13,9 +13,10 @@ met: notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - 3. 
Neither the name of the ISCAS nor the names of its contributors may - be used to endorse or promote products derived from this software - without specific prior written permission. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE diff --git a/driver/others/blas_server.c b/driver/others/blas_server.c index e2632c223..b3b1ce7bd 100644 --- a/driver/others/blas_server.c +++ b/driver/others/blas_server.c @@ -1,5 +1,5 @@ /***************************************************************************** -Copyright (c) 2011, Lab of Parallel Software and Computational Science,ICSAS +Copyright (c) 2011-2014, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without @@ -13,9 +13,10 @@ met: notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - 3. Neither the name of the ISCAS nor the names of its contributors may - be used to endorse or promote products derived from this software - without specific prior written permission. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE diff --git a/driver/others/init.c b/driver/others/init.c index 50a1a23f7..f134f85f7 100644 --- a/driver/others/init.c +++ b/driver/others/init.c @@ -1,5 +1,5 @@ /***************************************************************************** -Copyright (c) 2011,2012 Lab of Parallel Software and Computational Science,ISCAS +Copyright (c) 2011-2014, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without @@ -13,9 +13,10 @@ met: notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - 3. Neither the name of the ISCAS nor the names of its contributors may - be used to endorse or promote products derived from this software - without specific prior written permission. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE diff --git a/driver/others/memory.c b/driver/others/memory.c index 9fdb18f69..16d68cced 100644 --- a/driver/others/memory.c +++ b/driver/others/memory.c @@ -1,5 +1,5 @@ /***************************************************************************** -Copyright (c) 2011,2012 Lab of Parallel Software and Computational Science,ISCAS +Copyright (c) 2011-2014, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without @@ -13,9 +13,10 @@ met: notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - 3. 
Neither the name of the ISCAS nor the names of its contributors may - be used to endorse or promote products derived from this software - without specific prior written permission. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE diff --git a/driver/others/openblas_get_config.c b/driver/others/openblas_get_config.c index 0fecbf951..7d041b907 100644 --- a/driver/others/openblas_get_config.c +++ b/driver/others/openblas_get_config.c @@ -1,5 +1,5 @@ /***************************************************************************** -Copyright (c) 2011,2012 Lab of Parallel Software and Computational Science,ISCAS +Copyright (c) 2011-2014, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without @@ -13,9 +13,10 @@ met: notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - 3. Neither the name of the ISCAS nor the names of its contributors may - be used to endorse or promote products derived from this software - without specific prior written permission. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE diff --git a/driver/others/openblas_get_parallel.c b/driver/others/openblas_get_parallel.c index 76107dabd..5dfda6e59 100644 --- a/driver/others/openblas_get_parallel.c +++ b/driver/others/openblas_get_parallel.c @@ -13,7 +13,7 @@ met: notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - 3. Neither the name of the ISCAS nor the names of its contributors may + 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. diff --git a/driver/others/openblas_set_num_threads.c b/driver/others/openblas_set_num_threads.c index ea0c70a91..0b57867b0 100644 --- a/driver/others/openblas_set_num_threads.c +++ b/driver/others/openblas_set_num_threads.c @@ -1,5 +1,5 @@ /***************************************************************************** -Copyright (c) 2011,2012 Lab of Parallel Software and Computational Science,ISCAS +Copyright (c) 2011-2014, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without @@ -13,9 +13,10 @@ met: notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - 3. Neither the name of the ISCAS nor the names of its contributors may - be used to endorse or promote products derived from this software - without specific prior written permission. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE diff --git a/getarch.c b/getarch.c index ded347ecc..81ab9e37c 100644 --- a/getarch.c +++ b/getarch.c @@ -1,5 +1,5 @@ /***************************************************************************** -Copyright (c) 2011,2012 Lab of Parallel Software and Computational Science,ISCAS +Copyright (c) 2011-2014, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without @@ -13,9 +13,10 @@ met: notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - 3. Neither the name of the ISCAS nor the names of its contributors may - be used to endorse or promote products derived from this software - without specific prior written permission. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE diff --git a/kernel/generic/zgemm_ncopy_4_sandy.c b/kernel/generic/zgemm_ncopy_4_sandy.c index 404a3cd4a..195848a03 100644 --- a/kernel/generic/zgemm_ncopy_4_sandy.c +++ b/kernel/generic/zgemm_ncopy_4_sandy.c @@ -1,21 +1,22 @@ /***************************************************************************** - Copyright (c) 2011,2012 Lab of Parallel Software and Computational Science,ISCAS - All rights reserved. +Copyright (c) 2011-2014, The OpenBLAS Project +All rights reserved. 
- Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the ISCAS nor the names of its contributors may -be used to endorse or promote products derived from this software -without specific prior written permission. + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE diff --git a/kernel/generic/zgemm_ncopy_8_sandy.c b/kernel/generic/zgemm_ncopy_8_sandy.c index 6e8e894b2..f37c861f8 100644 --- a/kernel/generic/zgemm_ncopy_8_sandy.c +++ b/kernel/generic/zgemm_ncopy_8_sandy.c @@ -1,21 +1,22 @@ /***************************************************************************** - Copyright (c) 2011,2012 Lab of Parallel Software and Computational Science,ISCAS - All rights reserved. +Copyright (c) 2011-2014, The OpenBLAS Project +All rights reserved. 
- Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the ISCAS nor the names of its contributors may -be used to endorse or promote products derived from this software -without specific prior written permission. + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE diff --git a/kernel/generic/zgemm_tcopy_4_sandy.c b/kernel/generic/zgemm_tcopy_4_sandy.c index 7e148659d..71e5517cf 100644 --- a/kernel/generic/zgemm_tcopy_4_sandy.c +++ b/kernel/generic/zgemm_tcopy_4_sandy.c @@ -1,21 +1,22 @@ /***************************************************************************** - Copyright (c) 2011,2012 Lab of Parallel Software and Computational Science,ISCAS - All rights reserved. +Copyright (c) 2011-2014, The OpenBLAS Project +All rights reserved. 
- Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the ISCAS nor the names of its contributors may -be used to endorse or promote products derived from this software -without specific prior written permission. + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE diff --git a/kernel/generic/zgemm_tcopy_8_sandy.c b/kernel/generic/zgemm_tcopy_8_sandy.c index e5197858e..dd6d7999d 100644 --- a/kernel/generic/zgemm_tcopy_8_sandy.c +++ b/kernel/generic/zgemm_tcopy_8_sandy.c @@ -1,21 +1,22 @@ /***************************************************************************** - Copyright (c) 2011,2012 Lab of Parallel Software and Computational Science,ISCAS - All rights reserved. +Copyright (c) 2011-2014, The OpenBLAS Project +All rights reserved. 
- Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the ISCAS nor the names of its contributors may -be used to endorse or promote products derived from this software -without specific prior written permission. + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE diff --git a/kernel/mips64/axpy_loongson3a.S b/kernel/mips64/axpy_loongson3a.S index 801885e7e..5904bc580 100644 --- a/kernel/mips64/axpy_loongson3a.S +++ b/kernel/mips64/axpy_loongson3a.S @@ -1,5 +1,5 @@ /***************************************************************************** -Copyright (c) 2011, Lab of Parallel Software and Computational Science,ICSAS +Copyright (c) 2011-2014, The OpenBLAS Project All rights reserved. 
Redistribution and use in source and binary forms, with or without @@ -13,9 +13,10 @@ met: notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - 3. Neither the name of the ISCAS nor the names of its contributors may - be used to endorse or promote products derived from this software - without specific prior written permission. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE diff --git a/kernel/mips64/daxpy_loongson3a_simd.S b/kernel/mips64/daxpy_loongson3a_simd.S index 880a67f02..f54008bc2 100644 --- a/kernel/mips64/daxpy_loongson3a_simd.S +++ b/kernel/mips64/daxpy_loongson3a_simd.S @@ -1,5 +1,5 @@ /***************************************************************************** -Copyright (c) 2011, Lab of Parallel Software and Computational Science,ICSAS +Copyright (c) 2011-2014, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without @@ -13,9 +13,10 @@ met: notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - 3. Neither the name of the ISCAS nor the names of its contributors may - be used to endorse or promote products derived from this software - without specific prior written permission. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE diff --git a/kernel/x86_64/cgemm_kernel_4x8_sandy.S b/kernel/x86_64/cgemm_kernel_4x8_sandy.S index 487f95936..f64f199b7 100644 --- a/kernel/x86_64/cgemm_kernel_4x8_sandy.S +++ b/kernel/x86_64/cgemm_kernel_4x8_sandy.S @@ -1,21 +1,22 @@ /***************************************************************************** - Copyright (c) 2011,2012 Lab of Parallel Software and Computational Science,ISCAS - All rights reserved. +Copyright (c) 2011-2014, The OpenBLAS Project +All rights reserved. - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the ISCAS nor the names of its contributors may -be used to endorse or promote products derived from this software -without specific prior written permission. + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE diff --git a/kernel/x86_64/dgemm_kernel_4x8_sandy.S b/kernel/x86_64/dgemm_kernel_4x8_sandy.S index e86d30625..a52bb0788 100644 --- a/kernel/x86_64/dgemm_kernel_4x8_sandy.S +++ b/kernel/x86_64/dgemm_kernel_4x8_sandy.S @@ -1,21 +1,22 @@ /***************************************************************************** - Copyright (c) 2011,2012 Lab of Parallel Software and Computational Science,ISCAS - All rights reserved. +Copyright (c) 2011-2014, The OpenBLAS Project +All rights reserved. - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the ISCAS nor the names of its contributors may -be used to endorse or promote products derived from this software -without specific prior written permission. + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE diff --git a/kernel/x86_64/sgemm_kernel_8x8_sandy.S b/kernel/x86_64/sgemm_kernel_8x8_sandy.S index fb67dee9c..2843efbae 100644 --- a/kernel/x86_64/sgemm_kernel_8x8_sandy.S +++ b/kernel/x86_64/sgemm_kernel_8x8_sandy.S @@ -1,21 +1,22 @@ /***************************************************************************** - Copyright (c) 2011,2012 Lab of Parallel Software and Computational Science,ISCAS - All rights reserved. +Copyright (c) 2011-2014, The OpenBLAS Project +All rights reserved. - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the ISCAS nor the names of its contributors may -be used to endorse or promote products derived from this software -without specific prior written permission. + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE diff --git a/kernel/x86_64/zgemm_kernel_4x4_sandy.S b/kernel/x86_64/zgemm_kernel_4x4_sandy.S index dbde1f0b5..e1176d131 100644 --- a/kernel/x86_64/zgemm_kernel_4x4_sandy.S +++ b/kernel/x86_64/zgemm_kernel_4x4_sandy.S @@ -1,21 +1,22 @@ /***************************************************************************** - Copyright (c) 2011,2012 Lab of Parallel Software and Computational Science,ISCAS - All rights reserved. +Copyright (c) 2011-2014, The OpenBLAS Project +All rights reserved. - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the ISCAS nor the names of its contributors may -be used to endorse or promote products derived from this software -without specific prior written permission. + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE diff --git a/param.h b/param.h index e7977f898..28ed91e60 100644 --- a/param.h +++ b/param.h @@ -1,5 +1,5 @@ /***************************************************************************** -Copyright (c) 2011,2012 Lab of Parallel Software and Computational Science,ISCAS +Copyright (c) 2011-2014, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without @@ -13,9 +13,10 @@ met: notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - 3. Neither the name of the ISCAS nor the names of its contributors may - be used to endorse or promote products derived from this software - without specific prior written permission. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE diff --git a/utest/common_utest.h b/utest/common_utest.h index 51f04cac7..e8377e681 100644 --- a/utest/common_utest.h +++ b/utest/common_utest.h @@ -1,5 +1,5 @@ /***************************************************************************** -Copyright (c) 2011, Lab of Parallel Software and Computational Science,ICSAS +Copyright (c) 2011-2014, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without @@ -13,9 +13,10 @@ met: notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - 3. 
Neither the name of the ISCAS nor the names of its contributors may - be used to endorse or promote products derived from this software - without specific prior written permission. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE diff --git a/utest/main.c b/utest/main.c index 7fb5811f8..f44008b79 100644 --- a/utest/main.c +++ b/utest/main.c @@ -1,5 +1,5 @@ /***************************************************************************** -Copyright (c) 2011, Lab of Parallel Software and Computational Science,ICSAS +Copyright (c) 2011-2014, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without @@ -13,9 +13,10 @@ met: notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - 3. Neither the name of the ISCAS nor the names of its contributors may - be used to endorse or promote products derived from this software - without specific prior written permission. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE diff --git a/utest/test_amax.c b/utest/test_amax.c index fcc9343cf..3195a6ee4 100644 --- a/utest/test_amax.c +++ b/utest/test_amax.c @@ -1,5 +1,5 @@ /***************************************************************************** -Copyright (c) 2011-2012, Lab of Parallel Software and Computational Science,ICSAS +Copyright (c) 2011-2014, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without @@ -13,9 +13,10 @@ met: notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - 3. Neither the name of the ISCAS nor the names of its contributors may - be used to endorse or promote products derived from this software - without specific prior written permission. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE diff --git a/utest/test_axpy.c b/utest/test_axpy.c index 0355973f5..696eb7a51 100644 --- a/utest/test_axpy.c +++ b/utest/test_axpy.c @@ -1,5 +1,5 @@ /***************************************************************************** -Copyright (c) 2011, Lab of Parallel Software and Computational Science,ICSAS +Copyright (c) 2011-2014, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without @@ -13,9 +13,10 @@ met: notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - 3. 
Neither the name of the ISCAS nor the names of its contributors may - be used to endorse or promote products derived from this software - without specific prior written permission. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE diff --git a/utest/test_dotu.c b/utest/test_dotu.c index aef1005dc..4ecc95915 100644 --- a/utest/test_dotu.c +++ b/utest/test_dotu.c @@ -1,5 +1,5 @@ /***************************************************************************** -Copyright (c) 2011, Lab of Parallel Software and Computational Science,ICSAS +Copyright (c) 2011-2014, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without @@ -13,9 +13,10 @@ met: notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - 3. Neither the name of the ISCAS nor the names of its contributors may - be used to endorse or promote products derived from this software - without specific prior written permission. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE diff --git a/utest/test_dsdot.c b/utest/test_dsdot.c index 41b62c2ea..536153c7e 100644 --- a/utest/test_dsdot.c +++ b/utest/test_dsdot.c @@ -1,5 +1,5 @@ /***************************************************************************** -Copyright (c) 2011, Lab of Parallel Software and Computational Science,ICSAS +Copyright (c) 2011-2014, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without @@ -13,9 +13,10 @@ met: notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - 3. Neither the name of the ISCAS nor the names of its contributors may - be used to endorse or promote products derived from this software - without specific prior written permission. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE diff --git a/utest/test_fork.c b/utest/test_fork.c index 6e99d1444..e6603902e 100644 --- a/utest/test_fork.c +++ b/utest/test_fork.c @@ -1,5 +1,5 @@ /***************************************************************************** -Copyright (c) 2014, Lab of Parallel Software and Computational Science,ICSAS +Copyright (c) 2011-2014, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without @@ -13,9 +13,10 @@ met: notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - 3. 
Neither the name of the ISCAS nor the names of its contributors may - be used to endorse or promote products derived from this software - without specific prior written permission. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE @@ -120,4 +121,4 @@ void test_fork_safety(void) CU_ASSERT(WEXITSTATUS (child_status) == 0); } } -#endif \ No newline at end of file +#endif diff --git a/utest/test_rot.c b/utest/test_rot.c index 988f54e9c..b8f9f177a 100644 --- a/utest/test_rot.c +++ b/utest/test_rot.c @@ -1,5 +1,5 @@ /***************************************************************************** -Copyright (c) 2011, Lab of Parallel Software and Computational Science,ICSAS +Copyright (c) 2011-2014, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without @@ -13,9 +13,10 @@ met: notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - 3. Neither the name of the ISCAS nor the names of its contributors may - be used to endorse or promote products derived from this software - without specific prior written permission. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE diff --git a/utest/test_rotmg.c b/utest/test_rotmg.c index bb03c278a..b72446c1b 100644 --- a/utest/test_rotmg.c +++ b/utest/test_rotmg.c @@ -1,5 +1,5 @@ /***************************************************************************** -Copyright (c) 2011, Lab of Parallel Software and Computational Science,ICSAS +Copyright (c) 2011-2014, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without @@ -13,9 +13,10 @@ met: notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - 3. Neither the name of the ISCAS nor the names of its contributors may - be used to endorse or promote products derived from this software - without specific prior written permission. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE diff --git a/utest/test_swap.c b/utest/test_swap.c index 30c2e7777..cf70079fb 100644 --- a/utest/test_swap.c +++ b/utest/test_swap.c @@ -1,5 +1,5 @@ /***************************************************************************** -Copyright (c) 2011, Lab of Parallel Software and Computational Science,ICSAS +Copyright (c) 2011-2014, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without @@ -13,9 +13,10 @@ met: notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - 3. 
Neither the name of the ISCAS nor the names of its contributors may - be used to endorse or promote products derived from this software - without specific prior written permission. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE diff --git a/version.h b/version.h index 213faae00..d2b7f6560 100644 --- a/version.h +++ b/version.h @@ -1,5 +1,5 @@ /***************************************************************************** -Copyright (c) 2011, Lab of Parallel Software and Computational Science,ICSAS +Copyright (c) 2011-2014, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without @@ -13,9 +13,10 @@ met: notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - 3. Neither the name of the ISCAS nor the names of its contributors may - be used to endorse or promote products derived from this software - without specific prior written permission. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE From fe7dcf98f3666388b75839c6b3e6656580dcd222 Mon Sep 17 00:00:00 2001 From: Zhang Xianyi Date: Sat, 29 Nov 2014 02:16:40 +0800 Subject: [PATCH 7/8] Refs #461. Provide OpenBLASConfig.cmake to support CMake. 
If you run "make PREFIX=/path/to/OpenBLAS install", the config file will be located in /path/to/OpenBLAS/cmake. Then you can use "find_package(OpenBLAS)" in CMake: cmake -DOpenBLAS_DIR=/path/to/OpenBLAS/cmake .. --- Makefile.install | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/Makefile.install b/Makefile.install index c7d1d0d11..04323eef5 100644 --- a/Makefile.install +++ b/Makefile.install @@ -9,6 +9,8 @@ OPENBLAS_INCLUDE_DIR := $(PREFIX)/include OPENBLAS_LIBRARY_DIR := $(PREFIX)/lib OPENBLAS_BINARY_DIR := $(PREFIX)/bin OPENBLAS_BUILD_DIR := $(CURDIR) +OPENBLAS_CMAKE_DIR := $(PREFIX)/cmake +OPENBLAS_CMAKE_CONFIG := OpenBLASConfig.cmake .PHONY : install .NOTPARALLEL : install @@ -21,6 +23,7 @@ install : lib.grd @-mkdir -p $(DESTDIR)$(OPENBLAS_INCLUDE_DIR) @-mkdir -p $(DESTDIR)$(OPENBLAS_LIBRARY_DIR) @-mkdir -p $(DESTDIR)$(OPENBLAS_BINARY_DIR) + @-mkdir -p $(DESTDIR)$(OPENBLAS_CMAKE_DIR) @echo Generating openblas_config.h in $(DESTDIR)$(OPENBLAS_INCLUDE_DIR) #for inc @echo \#ifndef OPENBLAS_CONFIG_H > $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/openblas_config.h @@ -90,6 +93,23 @@ ifeq ($(OSNAME), CYGWIN_NT) @-cp $(LIBDLLNAME) $(OPENBLAS_BINARY_DIR) endif endif - +#Generating OpenBLASConfig.cmake + @echo Generating $(OPENBLAS_CMAKE_CONFIG) in $(DESTDIR)$(OPENBLAS_CMAKE_DIR) + @echo "SET(OpenBLAS_INCLUDE_DIRS ${OPENBLAS_INCLUDE_DIR})" > $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG) +ifndef NO_SHARED +#ifeq logical or +ifeq ($(OSNAME), $(filter $(OSNAME),Linux FreeBSD NetBSD)) + @echo "SET(OpenBLAS_LIBRARIES ${OPENBLAS_LIBRARY_DIR}/$(LIBPREFIX).so)" >> $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG) +endif +ifeq ($(OSNAME), $(filter $(OSNAME),WINNT CYGWIN_NT)) + @echo "SET(OpenBLAS_LIBRARIES ${OPENBLAS_BINARY_DIR}/$(LIBDLLNAME))" >> $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG) +endif +ifeq ($(OSNAME), Darwin) + @echo "SET(OpenBLAS_LIBRARIES ${OPENBLAS_LIBRARY_DIR}/$(LIBPREFIX).dylib)" >>
$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG) +endif +else +#only static + @echo "SET(OpenBLAS_LIBRARIES ${OPENBLAS_LIBRARY_DIR}/$(LIBPREFIX).$(LIBSUFFIX))" >> $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG) +endif @echo Install OK! From f5424fc9de7238a8d1876a397dda458d864abadb Mon Sep 17 00:00:00 2001 From: Zhang Xianyi Date: Wed, 3 Dec 2014 23:00:29 +0800 Subject: [PATCH 8/8] Update the doc for 0.2.13 version. --- Changelog.txt | 21 +++++++++++++++++++++ Makefile.rule | 2 +- 2 files changed, 22 insertions(+), 1 deletion(-) diff --git a/Changelog.txt b/Changelog.txt index e9fe824ca..b11321f71 100644 --- a/Changelog.txt +++ b/Changelog.txt @@ -1,4 +1,25 @@ OpenBLAS ChangeLog +==================================================================== +Version 0.2.13 +3-Dec-2014 +common: + * Add SYMBOLPREFIX and SYMBOLSUFFIX makefile options + for adding a prefix or suffix to all exported symbol names + in the shared library. (#459, Thanks Tony Kelman) + * Provide OpenBLASConfig.cmake at installation. + * Fix Fortran compiler detection on FreeBSD. + (#470, Thanks Mike Nolta) + + +x86/x86-64: + * Add generic kernel files for x86-64 (make TARGET=GENERIC). + * Fix a bug in the sgemm kernel on Intel Sandy Bridge. + * Fix c_check bug on some amd64 systems. (#471, Thanks Mike Nolta) + +ARM: + * Support APM's X-Gene 1 AArch64 processors. + Optimize trmm and sgemm. (#465, Thanks Dave Nuechterlein) + ==================================================================== Version 0.2.12 13-Oct-2014 diff --git a/Makefile.rule b/Makefile.rule index 7f0356fff..d3a2d1fa3 100644 --- a/Makefile.rule +++ b/Makefile.rule @@ -3,7 +3,7 @@ # # This library's version -VERSION = 0.2.12 +VERSION = 0.2.13 # If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a # and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library