diff --git a/Makefile b/Makefile index 343bd72f4..d68e0882b 100644 --- a/Makefile +++ b/Makefile @@ -262,6 +262,7 @@ endif lapack-test : + (cd $(NETLIB_LAPACK_DIR)/TESTING && rm -f x* *.out) make -j 1 -C $(NETLIB_LAPACK_DIR) tmglib make -j 1 -C $(NETLIB_LAPACK_DIR)/TESTING xeigtstc xeigtstd xeigtsts xeigtstz xlintstc xlintstd xlintstds xlintstrfd xlintstrfz xlintsts xlintstz xlintstzc xlintstrfs xlintstrfc (cd $(NETLIB_LAPACK_DIR); ./lapack_testing.py -r ) @@ -291,4 +292,6 @@ endif @$(MAKE) -C $(NETLIB_LAPACK_DIR) clean @rm -f $(NETLIB_LAPACK_DIR)/make.inc $(NETLIB_LAPACK_DIR)/lapacke/include/lapacke_mangling.h @rm -f *.grd Makefile.conf_last config_last.h + @(cd $(NETLIB_LAPACK_DIR)/TESTING && rm -f x* *.out testing_results.txt) + @rm -f $(NETLIB_LAPACK_DIR)/tmglib.a @echo Done. diff --git a/interface/Makefile b/interface/Makefile index fcff93ec9..172e2e5bf 100644 --- a/interface/Makefile +++ b/interface/Makefile @@ -350,7 +350,7 @@ XBLASOBJS = $(XBLAS1OBJS) $(XBLAS2OBJS) $(XBLAS3OBJS) SLAPACKOBJS = \ sgetrf.$(SUFFIX) sgetrs.$(SUFFIX) spotrf.$(SUFFIX) sgetf2.$(SUFFIX) \ spotf2.$(SUFFIX) slaswp.$(SUFFIX) sgesv.$(SUFFIX) slauu2.$(SUFFIX) \ - slauum.$(SUFFIX) strti2.$(SUFFIX) + slauum.$(SUFFIX) strti2.$(SUFFIX) strtri.$(SUFFIX) #DLAPACKOBJS = \ @@ -361,7 +361,7 @@ SLAPACKOBJS = \ DLAPACKOBJS = \ dgetrf.$(SUFFIX) dgetrs.$(SUFFIX) dpotrf.$(SUFFIX) dgetf2.$(SUFFIX) \ dpotf2.$(SUFFIX) dlaswp.$(SUFFIX) dgesv.$(SUFFIX) dlauu2.$(SUFFIX) \ - dlauum.$(SUFFIX) dtrti2.$(SUFFIX) + dlauum.$(SUFFIX) dtrti2.$(SUFFIX) dtrtri.$(SUFFIX) QLAPACKOBJS = \ @@ -377,7 +377,7 @@ QLAPACKOBJS = \ CLAPACKOBJS = \ cgetrf.$(SUFFIX) cgetrs.$(SUFFIX) cpotrf.$(SUFFIX) cgetf2.$(SUFFIX) \ cpotf2.$(SUFFIX) claswp.$(SUFFIX) cgesv.$(SUFFIX) clauu2.$(SUFFIX) \ - clauum.$(SUFFIX) ctrti2.$(SUFFIX) + clauum.$(SUFFIX) ctrti2.$(SUFFIX) ctrtri.$(SUFFIX) #ZLAPACKOBJS = \ @@ -388,7 +388,7 @@ CLAPACKOBJS = \ ZLAPACKOBJS = \ zgetrf.$(SUFFIX) zgetrs.$(SUFFIX) zpotrf.$(SUFFIX) zgetf2.$(SUFFIX) \ zpotf2.$(SUFFIX) zlaswp.$(SUFFIX) zgesv.$(SUFFIX) zlauu2.$(SUFFIX) \ - zlauum.$(SUFFIX) ztrti2.$(SUFFIX) + zlauum.$(SUFFIX) ztrti2.$(SUFFIX) ztrtri.$(SUFFIX) @@ -1883,19 +1883,19 @@ ztrti2.$(SUFFIX) ztrti2.$(PSUFFIX) : lapack/ztrti2.c xtrti2.$(SUFFIX) xtrti2.$(PSUFFIX) : ztrti2.c $(CC) -c $(CFLAGS) $< -o $(@F) -strtri.$(SUFFIX) strtri.$(PSUFFIX) : trtri.c +strtri.$(SUFFIX) strtri.$(PSUFFIX) : lapack/trtri.c $(CC) -c $(CFLAGS) $< -o $(@F) -dtrtri.$(SUFFIX) dtrtri.$(PSUFFIX) : trtri.c +dtrtri.$(SUFFIX) dtrtri.$(PSUFFIX) : lapack/trtri.c $(CC) -c $(CFLAGS) $< -o $(@F) qtrtri.$(SUFFIX) qtrtri.$(PSUFFIX) : trtri.c $(CC) -c $(CFLAGS) $< -o $(@F) -ctrtri.$(SUFFIX) ctrtri.$(PSUFFIX) : ztrtri.c +ctrtri.$(SUFFIX) ctrtri.$(PSUFFIX) : lapack/ztrtri.c $(CC) -c $(CFLAGS) $< -o $(@F) -ztrtri.$(SUFFIX) ztrtri.$(PSUFFIX) : ztrtri.c +ztrtri.$(SUFFIX) ztrtri.$(PSUFFIX) : lapack/ztrtri.c $(CC) -c $(CFLAGS) $< -o $(@F) xtrtri.$(SUFFIX) xtrtri.$(PSUFFIX) : ztrtri.c diff --git a/interface/lapack/trtri.c.bad b/interface/lapack/trtri.c similarity index 100% rename from interface/lapack/trtri.c.bad rename to interface/lapack/trtri.c diff --git a/interface/lapack/ztrtri.c.bad b/interface/lapack/ztrtri.c similarity index 100% rename from interface/lapack/ztrtri.c.bad rename to interface/lapack/ztrtri.c diff --git a/lapack-netlib/SRC/Makefile b/lapack-netlib/SRC/Makefile index 8a622c837..8b275db3f 100644 --- a/lapack-netlib/SRC/Makefile +++ b/lapack-netlib/SRC/Makefile @@ -147,7 +147,7 @@ SLASRC = \ stgsja.o stgsna.o stgsy2.o stgsyl.o stpcon.o stprfs.o stptri.o \ stptrs.o \ strcon.o strevc.o strexc.o strrfs.o strsen.o strsna.o strsyl.o \ - strtri.o strtrs.o stzrqf.o stzrzf.o sstemr.o \ + strtrs.o stzrqf.o stzrzf.o sstemr.o \ slansf.o spftrf.o spftri.o spftrs.o ssfrk.o stfsm.o stftri.o stfttp.o \ stfttr.o stpttf.o stpttr.o strttf.o strttp.o \ sgejsv.o sgesvj.o sgsvj0.o sgsvj1.o \ @@ -225,7 +225,7 @@ CLASRC = \ ctgexc.o ctgsen.o ctgsja.o ctgsna.o ctgsy2.o ctgsyl.o ctpcon.o \ ctprfs.o ctptri.o \ ctptrs.o ctrcon.o ctrevc.o ctrexc.o ctrrfs.o ctrsen.o ctrsna.o \ - ctrsyl.o ctrtri.o ctrtrs.o ctzrqf.o ctzrzf.o cung2l.o cung2r.o \ + ctrsyl.o ctrtrs.o ctzrqf.o ctzrzf.o cung2l.o cung2r.o \ cungbr.o cunghr.o cungl2.o cunglq.o cungql.o cungqr.o cungr2.o \ cungrq.o cungtr.o cunm2l.o cunm2r.o cunmbr.o cunmhr.o cunml2.o \ cunmlq.o cunmql.o cunmqr.o cunmr2.o cunmr3.o cunmrq.o cunmrz.o \ @@ -307,7 +307,7 @@ DLASRC = \ dtgsja.o dtgsna.o dtgsy2.o dtgsyl.o dtpcon.o dtprfs.o dtptri.o \ dtptrs.o \ dtrcon.o dtrevc.o dtrexc.o dtrrfs.o dtrsen.o dtrsna.o dtrsyl.o \ - dtrtri.o dtrtrs.o dtzrqf.o dtzrzf.o dstemr.o \ + dtrtrs.o dtzrqf.o dtzrzf.o dstemr.o \ dsgesv.o dsposv.o dlag2s.o slag2d.o dlat2s.o \ dlansf.o dpftrf.o dpftri.o dpftrs.o dsfrk.o dtfsm.o dtftri.o dtfttp.o \ dtfttr.o dtpttf.o dtpttr.o dtrttf.o dtrttp.o \ @@ -387,7 +387,7 @@ ZLASRC = \ ztgexc.o ztgsen.o ztgsja.o ztgsna.o ztgsy2.o ztgsyl.o ztpcon.o \ ztprfs.o ztptri.o \ ztptrs.o ztrcon.o ztrevc.o ztrexc.o ztrrfs.o ztrsen.o ztrsna.o \ - ztrsyl.o ztrtri.o ztrtrs.o ztzrqf.o ztzrzf.o zung2l.o \ + ztrsyl.o ztrtrs.o ztzrqf.o ztzrzf.o zung2l.o \ zung2r.o zungbr.o zunghr.o zungl2.o zunglq.o zungql.o zungqr.o zungr2.o \ zungrq.o zungtr.o zunm2l.o zunm2r.o zunmbr.o zunmhr.o zunml2.o \ zunmlq.o zunmql.o zunmqr.o zunmr2.o zunmr3.o zunmrq.o zunmrz.o \ diff --git a/lapack/Makefile b/lapack/Makefile index 870962bc9..aff5209d5 100644 --- a/lapack/Makefile +++ b/lapack/Makefile @@ -2,7 +2,7 @@ TOPDIR = .. include ../Makefile.system #SUBDIRS = laswp getf2 getrf potf2 potrf lauu2 lauum trti2 trtri getrs -SUBDIRS = getrf getf2 laswp getrs potrf potf2 lauu2 lauum trti2 +SUBDIRS = getrf getf2 laswp getrs potrf potf2 lauu2 lauum trti2 trtri FLAMEDIRS = laswp getf2 potf2 lauu2 trti2 diff --git a/lapack/trtri/trtri_L_single.c b/lapack/trtri/trtri_L_single.c index a940ce2f6..3e4343060 100644 --- a/lapack/trtri/trtri_L_single.c +++ b/lapack/trtri/trtri_L_single.c @@ -1,190 +1,113 @@ -/*********************************************************************/ -/* Copyright 2009, 2010 The University of Texas at Austin. */ -/* All rights reserved. */ -/* */ -/* Redistribution and use in source and binary forms, with or */ -/* without modification, are permitted provided that the following */ -/* conditions are met: */ -/* */ -/* 1. Redistributions of source code must retain the above */ -/* copyright notice, this list of conditions and the following */ -/* disclaimer. */ -/* */ -/* 2. Redistributions in binary form must reproduce the above */ -/* copyright notice, this list of conditions and the following */ -/* disclaimer in the documentation and/or other materials */ -/* provided with the distribution. */ -/* */ -/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ -/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ -/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ -/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ -/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ -/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ -/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ -/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ -/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ -/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ -/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ -/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ -/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ -/* POSSIBILITY OF SUCH DAMAGE. */ -/* */ -/* The views and conclusions contained in the software and */ -/* documentation are those of the authors and should not be */ -/* interpreted as representing official policies, either expressed */ -/* or implied, of The University of Texas at Austin. */ -/*********************************************************************/ +/*************************************************************************** + * Copyright (c) 2013, The OpenBLAS Project + * All rights reserved. + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * 3. Neither the name of the OpenBLAS project nor the names of + * its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE + * USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * *****************************************************************************/ + +/************************************************************************************** +* 2014/05/22 Saar +* TEST double precision unblocked : OK +* 2014/05/23 Saar +* TEST double precision blocked: OK +* TEST single precision blocked: OK +**************************************************************************************/ #include #include "common.h" -static FLOAT dp1 = 1.; -static FLOAT dm1 = -1.; +// static FLOAT dp1 = 1.; +// static FLOAT dm1 = -1.; + #ifdef UNIT -#define TRTI2 TRTI2_LU +#define TRTI2 TRTI2_LU +#define TRMM TRMM_LNLU +#define TRSM TRSM_RNLU #else -#define TRTI2 TRTI2_LN +#define TRTI2 TRTI2_LN +#define TRMM TRMM_LNLN +#define TRSM TRSM_RNLN #endif -#if 0 -#undef GEMM_P -#undef GEMM_Q -#undef GEMM_R - -#define GEMM_P 8 -#define GEMM_Q 20 -#define GEMM_R 64 -#endif - -#define GEMM_PQ MAX(GEMM_P, GEMM_Q) -#define REAL_GEMM_R (GEMM_R - 2 * GEMM_PQ) blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG myid) { - BLASLONG n, lda; + BLASLONG j, n, lda; FLOAT *a; - BLASLONG i, is, min_i, start_i; - BLASLONG ls, min_l; - BLASLONG bk; - BLASLONG blocking; - BLASLONG range_N[2]; + // BLASLONG info=0; + BLASLONG jb; + BLASLONG NB; + BLASLONG start_j; - FLOAT *sa_trsm = (FLOAT *)((BLASLONG)sb); - FLOAT *sa_trmm = (FLOAT *)((((BLASLONG)sb - + GEMM_PQ * GEMM_Q * COMPSIZE * SIZE + GEMM_ALIGN) & ~GEMM_ALIGN) - + GEMM_OFFSET_A); - FLOAT *sb_gemm = (FLOAT *)((((BLASLONG)sa_trmm - + GEMM_PQ * GEMM_Q * COMPSIZE * SIZE + GEMM_ALIGN) & ~GEMM_ALIGN) - + GEMM_OFFSET_B); + FLOAT beta_plus[2] = { ONE, ZERO}; + FLOAT beta_minus[2] = {-ONE, ZERO}; n = args -> n; - a = (FLOAT *)args -> a; - lda = args -> lda; - if (range_n) { - n = range_n[1] - range_n[0]; - a += range_n[0] * (lda + 1) * COMPSIZE; - } + NB = GEMM_Q; - if (n <= DTB_ENTRIES) { + if (n < NB) { TRTI2(args, NULL, range_n, sa, sb, 0); return 0; } - blocking = GEMM_Q; - if (n <= 4 * GEMM_Q) blocking = (n + 3) / 4; - start_i = 0; - while (start_i < n) start_i += blocking; - start_i -= blocking; + lda = args -> lda; + a = (FLOAT *) args -> a; + args -> ldb = lda; + args -> ldc = lda; + args -> alpha = NULL; - for (i = start_i; i >= 0; i -= blocking) { - bk = MIN(blocking, n - i); - - if (n - bk - i > 0) TRSM_OLNCOPY(bk, bk, a + (i + i * lda) * COMPSIZE, lda, 0, sa_trsm); - - if (!range_n) { - range_N[0] = i; - range_N[1] = i + bk; - } else { - range_N[0] = range_n[0] + i; - range_N[1] = range_n[0] + i + bk; - } + start_j = 0; + while (start_j < n) start_j += NB; + start_j -= NB; - CNAME(args, NULL, range_N, sa, sa_trmm, 0); - if (i > 0) { - TRMM_ILTCOPY(bk, bk, a + (i + i * lda) * COMPSIZE, lda, 0, 0, sa_trmm); + for (j = start_j ; j >=0 ; j-= NB) + { + jb = n - j; + if ( jb > NB ) jb = NB; - for (ls = 0; ls < i; ls += REAL_GEMM_R) { - min_l = i - ls; - if (min_l > REAL_GEMM_R) min_l = REAL_GEMM_R; - - GEMM_ONCOPY (bk, min_l, a + (i + ls * lda) * COMPSIZE, lda, sb_gemm); - - if (n - bk - i > 0) { - for (is = i + bk; is < n; is += GEMM_P) { - min_i = n - is; - if (min_i > GEMM_P) min_i = GEMM_P; - - if (ls == 0) { - NEG_TCOPY (bk, min_i, a + (is + i * lda) * COMPSIZE, lda, sa); + args -> n = jb; + args -> m = n-j-jb; - TRSM_KERNEL_RT(min_i, bk, bk, dm1, -#ifdef COMPLEX - ZERO, -#endif - sa, sa_trsm, - a + (is + i * lda) * COMPSIZE, lda, 0); - } else { - GEMM_ITCOPY (bk, min_i, a + (is + i * lda) * COMPSIZE, lda, sa); - } + args -> a = &a[(j+jb+(j+jb)*lda) * COMPSIZE]; + args -> b = &a[(j+jb+j*lda) * COMPSIZE]; + args -> beta = beta_plus; - GEMM_KERNEL_N(min_i, min_l, bk, dp1, -#ifdef COMPLEX - ZERO, -#endif - sa, sb_gemm, - a + (is + ls * lda) * COMPSIZE, lda); - } - } - - for (is = 0; is < bk; is += GEMM_P) { - min_i = bk - is; - if (min_i > GEMM_P) min_i = GEMM_P; - - TRMM_KERNEL_LT(min_i, min_l, bk, dp1, -#ifdef COMPLEX - ZERO, -#endif - sa_trmm + is * bk * COMPSIZE, sb_gemm, - a + (i + is + ls * lda) * COMPSIZE, lda, is); - } - } + TRMM(args, NULL, NULL, sa, sb, 0); - } else { + args -> a = &a[(j+j*lda) * COMPSIZE]; + args -> beta = beta_minus; + + TRSM(args, NULL, NULL, sa, sb, 0); + + args -> a = &a[(j+j*lda) * COMPSIZE]; + + TRTI2(args, NULL, range_n, sa, sb, 0); - if (n - bk - i > 0) { - for (is = 0; is < n - bk - i; is += GEMM_P) { - min_i = n - bk - i - is; - if (min_i > GEMM_P) min_i = GEMM_P; - - NEG_TCOPY (bk, min_i, a + (i + bk + is + i * lda) * COMPSIZE, lda, sa); - - TRSM_KERNEL_RT(min_i, bk, bk, dm1, -#ifdef COMPLEX - ZERO, -#endif - sa, sa_trsm, - a + (i + bk + is + i * lda) * COMPSIZE, lda, 0); - } - } - - } } - return 0; } diff --git a/lapack/trtri/trtri_U_single.c b/lapack/trtri/trtri_U_single.c index c79281cfb..e4da5da59 100644 --- a/lapack/trtri/trtri_U_single.c +++ b/lapack/trtri/trtri_U_single.c @@ -1,46 +1,44 @@ -/*********************************************************************/ -/* Copyright 2009, 2010 The University of Texas at Austin. */ -/* All rights reserved. */ -/* */ -/* Redistribution and use in source and binary forms, with or */ -/* without modification, are permitted provided that the following */ -/* conditions are met: */ -/* */ -/* 1. Redistributions of source code must retain the above */ -/* copyright notice, this list of conditions and the following */ -/* disclaimer. */ -/* */ -/* 2. Redistributions in binary form must reproduce the above */ -/* copyright notice, this list of conditions and the following */ -/* disclaimer in the documentation and/or other materials */ -/* provided with the distribution. */ -/* */ -/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ -/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ -/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ -/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ -/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ -/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ -/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ -/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ -/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ -/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ -/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ -/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ -/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ -/* POSSIBILITY OF SUCH DAMAGE. */ -/* */ -/* The views and conclusions contained in the software and */ -/* documentation are those of the authors and should not be */ -/* interpreted as representing official policies, either expressed */ -/* or implied, of The University of Texas at Austin. */ -/*********************************************************************/ +/*************************************************************************** + * Copyright (c) 2013, The OpenBLAS Project + * All rights reserved. + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * 3. Neither the name of the OpenBLAS project nor the names of + * its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE + * USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * *****************************************************************************/ + +/************************************************************************************** +* 2014/05/22 Saar +* TEST double precision unblocked : OK +* TEST double precision blocked : OK +* 2014/05/23 +* TEST single precision blocked : OK +* +**************************************************************************************/ #include #include "common.h" -static FLOAT dp1 = 1.; -static FLOAT dm1 = -1.; +// static FLOAT dp1 = 1.; +// static FLOAT dm1 = -1.; #ifdef UNIT #define TRTI2 TRTI2_UU @@ -48,152 +46,66 @@ static FLOAT dm1 = -1.; #define TRTI2 TRTI2_UN #endif -#if 0 -#undef GEMM_P -#undef GEMM_Q -#undef GEMM_R - -#define GEMM_P 8 -#define GEMM_Q 20 -#define GEMM_R 64 +#ifdef UNIT +#define TRMM TRMM_LNUU +#define TRSM TRSM_RNUU +#else +#define TRMM TRMM_LNUN +#define TRSM TRSM_RNUN #endif -#define GEMM_PQ MAX(GEMM_P, GEMM_Q) -#define REAL_GEMM_R (GEMM_R - 2 * GEMM_PQ) blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG myid) { - BLASLONG n, lda; + BLASLONG j, n, lda; FLOAT *a; - BLASLONG i, is, min_i, start_is; - BLASLONG ls, min_l; - BLASLONG bk; - BLASLONG blocking; - BLASLONG range_N[2]; + // BLASLONG info=0; + BLASLONG jb; + BLASLONG NB; - FLOAT *sa_trsm = (FLOAT *)((BLASLONG)sb); - FLOAT *sa_trmm = (FLOAT *)((((BLASLONG)sb - + GEMM_PQ * GEMM_Q * COMPSIZE * SIZE + GEMM_ALIGN) & ~GEMM_ALIGN) - + GEMM_OFFSET_A); - FLOAT *sb_gemm = (FLOAT *)((((BLASLONG)sa_trmm - + GEMM_PQ * GEMM_Q * COMPSIZE * SIZE + GEMM_ALIGN) & ~GEMM_ALIGN) - + GEMM_OFFSET_B); + FLOAT beta_plus[2] = { ONE, ZERO}; + FLOAT beta_minus[2] = {-ONE, ZERO}; n = args -> n; - a = (FLOAT *)args -> a; - lda = args -> lda; - if (range_n) { - n = range_n[1] - range_n[0]; - a += range_n[0] * (lda + 1) * COMPSIZE; - } + NB = GEMM_Q; - if (n <= DTB_ENTRIES) { + if (n <= NB) { TRTI2(args, NULL, range_n, sa, sb, 0); return 0; } - blocking = GEMM_Q; - if (n <= 4 * GEMM_Q) blocking = (n + 3) / 4; - for (i = 0; i < n; i += blocking) { - bk = MIN(blocking, n - i); - - if (i > 0) TRSM_OUNCOPY(bk, bk, a + (i + i * lda) * COMPSIZE, lda, 0, sa_trsm); + lda = args -> lda; + a = (FLOAT *) args -> a; + args -> ldb = lda; + args -> ldc = lda; + args -> alpha = NULL; - if (!range_n) { - range_N[0] = i; - range_N[1] = i + bk; - } else { - range_N[0] = range_n[0] + i; - range_N[1] = range_n[0] + i + bk; - } + for (j = 0; j < n; j += NB) + { + jb = n - j; + if ( jb > NB ) jb = NB; - CNAME(args, NULL, range_N, sa, sa_trmm, 0); + args -> n = jb; + args -> m = j; - if (n -bk - i > 0) { - TRMM_IUTCOPY(bk, bk, a + (i + i * lda) * COMPSIZE, lda, 0, 0, sa_trmm); + args -> a = &a[0]; + args -> b = &a[(j*lda) * COMPSIZE]; + args -> beta = beta_plus; - for (ls = i + bk; ls < n; ls += REAL_GEMM_R) { - min_l = n - ls; - if (min_l > REAL_GEMM_R) min_l = REAL_GEMM_R; - - GEMM_ONCOPY (bk, min_l, a + (i + ls * lda) * COMPSIZE, lda, sb_gemm); - - if (i > 0) { - for (is = 0; is < i; is += GEMM_P) { - min_i = i - is; - if (min_i > GEMM_P) min_i = GEMM_P; - - if (ls == i + bk) { - //NEG_TCOPY (bk, min_i, a + (is + i * lda) * COMPSIZE, lda, sa); + TRMM(args, NULL, NULL, sa, sb, 0); - GEMM_BETA(min_i, bk, 0, dm1, -#ifdef COMPLEX - ZERO, -#endif - NULL, 0, NULL, 0, a + (is + i * lda) * COMPSIZE, lda); + args -> a = &a[(j+j*lda) * COMPSIZE]; + args -> beta = beta_minus; - TRSM_KERNEL_RN(min_i, bk, bk, dm1, -#ifdef COMPLEX - ZERO, -#endif - sa, sa_trsm, - a + (is + i * lda) * COMPSIZE, lda, 0); - } else { - GEMM_ITCOPY (bk, min_i, a + (is + i * lda) * COMPSIZE, lda, sa); - } - - GEMM_KERNEL_N(min_i, min_l, bk, dp1, -#ifdef COMPLEX - ZERO, -#endif - sa, sb_gemm, - a + (is + ls * lda) * COMPSIZE, lda); - } - } - - start_is = 0; - while (start_is < bk) start_is += GEMM_P; - start_is -= GEMM_P; + TRSM(args, NULL, NULL, sa, sb, 0); - for (is = 0; is < bk; is += GEMM_P) { - min_i = bk - is; - if (min_i > GEMM_P) min_i = GEMM_P; - - TRMM_KERNEL_LN(min_i, min_l, bk, dp1, -#ifdef COMPLEX - ZERO, -#endif - sa_trmm + is * bk * COMPSIZE, sb_gemm, - a + (i + is + ls * lda) * COMPSIZE, lda, is); - } - } + args -> a = &a[(j+j*lda) * COMPSIZE]; - } else { - if (i > 0) { - for (is = 0; is < i; is += GEMM_P) { - min_i = i - is; - if (min_i > GEMM_P) min_i = GEMM_P; - - //NEG_TCOPY (bk, min_i, a + (is + i * lda) * COMPSIZE, lda, sa); - GEMM_BETA(min_i, bk, 0, dm1, -#ifdef COMPLEX - ZERO, -#endif - NULL, 0, NULL, 0, a + (is + i * lda) * COMPSIZE, lda); + TRTI2(args, NULL, range_n, sa, sb, 0); - TRSM_KERNEL_RN(min_i, bk, bk, dm1, -#ifdef COMPLEX - ZERO, -#endif - sa, sa_trsm, - a + (is + i * lda) * COMPSIZE, lda, 0); - } - } - } } - return 0; }