enabled and tested optimized trtri lapack functions

This commit is contained in:
wernsaar 2014-05-23 10:55:39 +02:00
parent c4ccb3fbb2
commit c26bbee489
8 changed files with 165 additions and 327 deletions

View File

@ -262,6 +262,7 @@ endif
lapack-test : lapack-test :
(cd $(NETLIB_LAPACK_DIR)/TESTING && rm -f x* *.out)
make -j 1 -C $(NETLIB_LAPACK_DIR) tmglib make -j 1 -C $(NETLIB_LAPACK_DIR) tmglib
make -j 1 -C $(NETLIB_LAPACK_DIR)/TESTING xeigtstc xeigtstd xeigtsts xeigtstz xlintstc xlintstd xlintstds xlintstrfd xlintstrfz xlintsts xlintstz xlintstzc xlintstrfs xlintstrfc make -j 1 -C $(NETLIB_LAPACK_DIR)/TESTING xeigtstc xeigtstd xeigtsts xeigtstz xlintstc xlintstd xlintstds xlintstrfd xlintstrfz xlintsts xlintstz xlintstzc xlintstrfs xlintstrfc
(cd $(NETLIB_LAPACK_DIR); ./lapack_testing.py -r ) (cd $(NETLIB_LAPACK_DIR); ./lapack_testing.py -r )
@ -291,4 +292,6 @@ endif
@$(MAKE) -C $(NETLIB_LAPACK_DIR) clean @$(MAKE) -C $(NETLIB_LAPACK_DIR) clean
@rm -f $(NETLIB_LAPACK_DIR)/make.inc $(NETLIB_LAPACK_DIR)/lapacke/include/lapacke_mangling.h @rm -f $(NETLIB_LAPACK_DIR)/make.inc $(NETLIB_LAPACK_DIR)/lapacke/include/lapacke_mangling.h
@rm -f *.grd Makefile.conf_last config_last.h @rm -f *.grd Makefile.conf_last config_last.h
@(cd $(NETLIB_LAPACK_DIR)/TESTING && rm -f x* *.out testing_results.txt)
@rm -f $(NETLIB_LAPACK_DIR)/tmglib.a
@echo Done. @echo Done.

View File

@ -350,7 +350,7 @@ XBLASOBJS = $(XBLAS1OBJS) $(XBLAS2OBJS) $(XBLAS3OBJS)
SLAPACKOBJS = \ SLAPACKOBJS = \
sgetrf.$(SUFFIX) sgetrs.$(SUFFIX) spotrf.$(SUFFIX) sgetf2.$(SUFFIX) \ sgetrf.$(SUFFIX) sgetrs.$(SUFFIX) spotrf.$(SUFFIX) sgetf2.$(SUFFIX) \
spotf2.$(SUFFIX) slaswp.$(SUFFIX) sgesv.$(SUFFIX) slauu2.$(SUFFIX) \ spotf2.$(SUFFIX) slaswp.$(SUFFIX) sgesv.$(SUFFIX) slauu2.$(SUFFIX) \
slauum.$(SUFFIX) strti2.$(SUFFIX) slauum.$(SUFFIX) strti2.$(SUFFIX) strtri.$(SUFFIX)
#DLAPACKOBJS = \ #DLAPACKOBJS = \
@ -361,7 +361,7 @@ SLAPACKOBJS = \
DLAPACKOBJS = \ DLAPACKOBJS = \
dgetrf.$(SUFFIX) dgetrs.$(SUFFIX) dpotrf.$(SUFFIX) dgetf2.$(SUFFIX) \ dgetrf.$(SUFFIX) dgetrs.$(SUFFIX) dpotrf.$(SUFFIX) dgetf2.$(SUFFIX) \
dpotf2.$(SUFFIX) dlaswp.$(SUFFIX) dgesv.$(SUFFIX) dlauu2.$(SUFFIX) \ dpotf2.$(SUFFIX) dlaswp.$(SUFFIX) dgesv.$(SUFFIX) dlauu2.$(SUFFIX) \
dlauum.$(SUFFIX) dtrti2.$(SUFFIX) dlauum.$(SUFFIX) dtrti2.$(SUFFIX) dtrtri.$(SUFFIX)
QLAPACKOBJS = \ QLAPACKOBJS = \
@ -377,7 +377,7 @@ QLAPACKOBJS = \
CLAPACKOBJS = \ CLAPACKOBJS = \
cgetrf.$(SUFFIX) cgetrs.$(SUFFIX) cpotrf.$(SUFFIX) cgetf2.$(SUFFIX) \ cgetrf.$(SUFFIX) cgetrs.$(SUFFIX) cpotrf.$(SUFFIX) cgetf2.$(SUFFIX) \
cpotf2.$(SUFFIX) claswp.$(SUFFIX) cgesv.$(SUFFIX) clauu2.$(SUFFIX) \ cpotf2.$(SUFFIX) claswp.$(SUFFIX) cgesv.$(SUFFIX) clauu2.$(SUFFIX) \
clauum.$(SUFFIX) ctrti2.$(SUFFIX) clauum.$(SUFFIX) ctrti2.$(SUFFIX) ctrtri.$(SUFFIX)
#ZLAPACKOBJS = \ #ZLAPACKOBJS = \
@ -388,7 +388,7 @@ CLAPACKOBJS = \
ZLAPACKOBJS = \ ZLAPACKOBJS = \
zgetrf.$(SUFFIX) zgetrs.$(SUFFIX) zpotrf.$(SUFFIX) zgetf2.$(SUFFIX) \ zgetrf.$(SUFFIX) zgetrs.$(SUFFIX) zpotrf.$(SUFFIX) zgetf2.$(SUFFIX) \
zpotf2.$(SUFFIX) zlaswp.$(SUFFIX) zgesv.$(SUFFIX) zlauu2.$(SUFFIX) \ zpotf2.$(SUFFIX) zlaswp.$(SUFFIX) zgesv.$(SUFFIX) zlauu2.$(SUFFIX) \
zlauum.$(SUFFIX) ztrti2.$(SUFFIX) zlauum.$(SUFFIX) ztrti2.$(SUFFIX) ztrtri.$(SUFFIX)
@ -1883,19 +1883,19 @@ ztrti2.$(SUFFIX) ztrti2.$(PSUFFIX) : lapack/ztrti2.c
xtrti2.$(SUFFIX) xtrti2.$(PSUFFIX) : ztrti2.c xtrti2.$(SUFFIX) xtrti2.$(PSUFFIX) : ztrti2.c
$(CC) -c $(CFLAGS) $< -o $(@F) $(CC) -c $(CFLAGS) $< -o $(@F)
strtri.$(SUFFIX) strtri.$(PSUFFIX) : trtri.c strtri.$(SUFFIX) strtri.$(PSUFFIX) : lapack/trtri.c
$(CC) -c $(CFLAGS) $< -o $(@F) $(CC) -c $(CFLAGS) $< -o $(@F)
dtrtri.$(SUFFIX) dtrtri.$(PSUFFIX) : trtri.c dtrtri.$(SUFFIX) dtrtri.$(PSUFFIX) : lapack/trtri.c
$(CC) -c $(CFLAGS) $< -o $(@F) $(CC) -c $(CFLAGS) $< -o $(@F)
qtrtri.$(SUFFIX) qtrtri.$(PSUFFIX) : trtri.c qtrtri.$(SUFFIX) qtrtri.$(PSUFFIX) : trtri.c
$(CC) -c $(CFLAGS) $< -o $(@F) $(CC) -c $(CFLAGS) $< -o $(@F)
ctrtri.$(SUFFIX) ctrtri.$(PSUFFIX) : ztrtri.c ctrtri.$(SUFFIX) ctrtri.$(PSUFFIX) : lapack/ztrtri.c
$(CC) -c $(CFLAGS) $< -o $(@F) $(CC) -c $(CFLAGS) $< -o $(@F)
ztrtri.$(SUFFIX) ztrtri.$(PSUFFIX) : ztrtri.c ztrtri.$(SUFFIX) ztrtri.$(PSUFFIX) : lapack/ztrtri.c
$(CC) -c $(CFLAGS) $< -o $(@F) $(CC) -c $(CFLAGS) $< -o $(@F)
xtrtri.$(SUFFIX) xtrtri.$(PSUFFIX) : ztrtri.c xtrtri.$(SUFFIX) xtrtri.$(PSUFFIX) : ztrtri.c

View File

@ -147,7 +147,7 @@ SLASRC = \
stgsja.o stgsna.o stgsy2.o stgsyl.o stpcon.o stprfs.o stptri.o \ stgsja.o stgsna.o stgsy2.o stgsyl.o stpcon.o stprfs.o stptri.o \
stptrs.o \ stptrs.o \
strcon.o strevc.o strexc.o strrfs.o strsen.o strsna.o strsyl.o \ strcon.o strevc.o strexc.o strrfs.o strsen.o strsna.o strsyl.o \
strtri.o strtrs.o stzrqf.o stzrzf.o sstemr.o \ strtrs.o stzrqf.o stzrzf.o sstemr.o \
slansf.o spftrf.o spftri.o spftrs.o ssfrk.o stfsm.o stftri.o stfttp.o \ slansf.o spftrf.o spftri.o spftrs.o ssfrk.o stfsm.o stftri.o stfttp.o \
stfttr.o stpttf.o stpttr.o strttf.o strttp.o \ stfttr.o stpttf.o stpttr.o strttf.o strttp.o \
sgejsv.o sgesvj.o sgsvj0.o sgsvj1.o \ sgejsv.o sgesvj.o sgsvj0.o sgsvj1.o \
@ -225,7 +225,7 @@ CLASRC = \
ctgexc.o ctgsen.o ctgsja.o ctgsna.o ctgsy2.o ctgsyl.o ctpcon.o \ ctgexc.o ctgsen.o ctgsja.o ctgsna.o ctgsy2.o ctgsyl.o ctpcon.o \
ctprfs.o ctptri.o \ ctprfs.o ctptri.o \
ctptrs.o ctrcon.o ctrevc.o ctrexc.o ctrrfs.o ctrsen.o ctrsna.o \ ctptrs.o ctrcon.o ctrevc.o ctrexc.o ctrrfs.o ctrsen.o ctrsna.o \
ctrsyl.o ctrtri.o ctrtrs.o ctzrqf.o ctzrzf.o cung2l.o cung2r.o \ ctrsyl.o ctrtrs.o ctzrqf.o ctzrzf.o cung2l.o cung2r.o \
cungbr.o cunghr.o cungl2.o cunglq.o cungql.o cungqr.o cungr2.o \ cungbr.o cunghr.o cungl2.o cunglq.o cungql.o cungqr.o cungr2.o \
cungrq.o cungtr.o cunm2l.o cunm2r.o cunmbr.o cunmhr.o cunml2.o \ cungrq.o cungtr.o cunm2l.o cunm2r.o cunmbr.o cunmhr.o cunml2.o \
cunmlq.o cunmql.o cunmqr.o cunmr2.o cunmr3.o cunmrq.o cunmrz.o \ cunmlq.o cunmql.o cunmqr.o cunmr2.o cunmr3.o cunmrq.o cunmrz.o \
@ -307,7 +307,7 @@ DLASRC = \
dtgsja.o dtgsna.o dtgsy2.o dtgsyl.o dtpcon.o dtprfs.o dtptri.o \ dtgsja.o dtgsna.o dtgsy2.o dtgsyl.o dtpcon.o dtprfs.o dtptri.o \
dtptrs.o \ dtptrs.o \
dtrcon.o dtrevc.o dtrexc.o dtrrfs.o dtrsen.o dtrsna.o dtrsyl.o \ dtrcon.o dtrevc.o dtrexc.o dtrrfs.o dtrsen.o dtrsna.o dtrsyl.o \
dtrtri.o dtrtrs.o dtzrqf.o dtzrzf.o dstemr.o \ dtrtrs.o dtzrqf.o dtzrzf.o dstemr.o \
dsgesv.o dsposv.o dlag2s.o slag2d.o dlat2s.o \ dsgesv.o dsposv.o dlag2s.o slag2d.o dlat2s.o \
dlansf.o dpftrf.o dpftri.o dpftrs.o dsfrk.o dtfsm.o dtftri.o dtfttp.o \ dlansf.o dpftrf.o dpftri.o dpftrs.o dsfrk.o dtfsm.o dtftri.o dtfttp.o \
dtfttr.o dtpttf.o dtpttr.o dtrttf.o dtrttp.o \ dtfttr.o dtpttf.o dtpttr.o dtrttf.o dtrttp.o \
@ -387,7 +387,7 @@ ZLASRC = \
ztgexc.o ztgsen.o ztgsja.o ztgsna.o ztgsy2.o ztgsyl.o ztpcon.o \ ztgexc.o ztgsen.o ztgsja.o ztgsna.o ztgsy2.o ztgsyl.o ztpcon.o \
ztprfs.o ztptri.o \ ztprfs.o ztptri.o \
ztptrs.o ztrcon.o ztrevc.o ztrexc.o ztrrfs.o ztrsen.o ztrsna.o \ ztptrs.o ztrcon.o ztrevc.o ztrexc.o ztrrfs.o ztrsen.o ztrsna.o \
ztrsyl.o ztrtri.o ztrtrs.o ztzrqf.o ztzrzf.o zung2l.o \ ztrsyl.o ztrtrs.o ztzrqf.o ztzrzf.o zung2l.o \
zung2r.o zungbr.o zunghr.o zungl2.o zunglq.o zungql.o zungqr.o zungr2.o \ zung2r.o zungbr.o zunghr.o zungl2.o zunglq.o zungql.o zungqr.o zungr2.o \
zungrq.o zungtr.o zunm2l.o zunm2r.o zunmbr.o zunmhr.o zunml2.o \ zungrq.o zungtr.o zunm2l.o zunm2r.o zunmbr.o zunmhr.o zunml2.o \
zunmlq.o zunmql.o zunmqr.o zunmr2.o zunmr3.o zunmrq.o zunmrz.o \ zunmlq.o zunmql.o zunmqr.o zunmr2.o zunmr3.o zunmrq.o zunmrz.o \

View File

@ -2,7 +2,7 @@ TOPDIR = ..
include ../Makefile.system include ../Makefile.system
#SUBDIRS = laswp getf2 getrf potf2 potrf lauu2 lauum trti2 trtri getrs #SUBDIRS = laswp getf2 getrf potf2 potrf lauu2 lauum trti2 trtri getrs
SUBDIRS = getrf getf2 laswp getrs potrf potf2 lauu2 lauum trti2 SUBDIRS = getrf getf2 laswp getrs potrf potf2 lauu2 lauum trti2 trtri
FLAMEDIRS = laswp getf2 potf2 lauu2 trti2 FLAMEDIRS = laswp getf2 potf2 lauu2 trti2

View File

@ -1,190 +1,113 @@
/*********************************************************************/ /***************************************************************************
/* Copyright 2009, 2010 The University of Texas at Austin. */ * Copyright (c) 2013, The OpenBLAS Project
/* All rights reserved. */ * All rights reserved.
/* */ * Redistribution and use in source and binary forms, with or without
/* Redistribution and use in source and binary forms, with or */ * modification, are permitted provided that the following conditions are
/* without modification, are permitted provided that the following */ * met:
/* conditions are met: */ * 1. Redistributions of source code must retain the above copyright
/* */ * notice, this list of conditions and the following disclaimer.
/* 1. Redistributions of source code must retain the above */ * 2. Redistributions in binary form must reproduce the above copyright
/* copyright notice, this list of conditions and the following */ * notice, this list of conditions and the following disclaimer in
/* disclaimer. */ * the documentation and/or other materials provided with the
/* */ * distribution.
/* 2. Redistributions in binary form must reproduce the above */ * 3. Neither the name of the OpenBLAS project nor the names of
/* copyright notice, this list of conditions and the following */ * its contributors may be used to endorse or promote products
/* disclaimer in the documentation and/or other materials */ * derived from this software without specific prior written permission.
/* provided with the distribution. */ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
/* */ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ * ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ * USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ * *****************************************************************************/
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /**************************************************************************************
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ * 2014/05/22 Saar
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ * TEST double precision unblocked : OK
/* POSSIBILITY OF SUCH DAMAGE. */ * 2014/05/23 Saar
/* */ * TEST double precision blocked: OK
/* The views and conclusions contained in the software and */ * TEST single precision blocked: OK
/* documentation are those of the authors and should not be */ **************************************************************************************/
/* interpreted as representing official policies, either expressed */
/* or implied, of The University of Texas at Austin. */
/*********************************************************************/
#include <stdio.h> #include <stdio.h>
#include "common.h" #include "common.h"
static FLOAT dp1 = 1.; // static FLOAT dp1 = 1.;
static FLOAT dm1 = -1.; // static FLOAT dm1 = -1.;
#ifdef UNIT #ifdef UNIT
#define TRTI2 TRTI2_LU #define TRTI2 TRTI2_LU
#define TRMM TRMM_LNLU
#define TRSM TRSM_RNLU
#else #else
#define TRTI2 TRTI2_LN #define TRTI2 TRTI2_LN
#define TRMM TRMM_LNLN
#define TRSM TRSM_RNLN
#endif #endif
#if 0
#undef GEMM_P
#undef GEMM_Q
#undef GEMM_R
#define GEMM_P 8
#define GEMM_Q 20
#define GEMM_R 64
#endif
#define GEMM_PQ MAX(GEMM_P, GEMM_Q)
#define REAL_GEMM_R (GEMM_R - 2 * GEMM_PQ)
blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG myid) { blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG myid) {
BLASLONG n, lda; BLASLONG j, n, lda;
FLOAT *a; FLOAT *a;
BLASLONG i, is, min_i, start_i; // BLASLONG info=0;
BLASLONG ls, min_l; BLASLONG jb;
BLASLONG bk; BLASLONG NB;
BLASLONG blocking; BLASLONG start_j;
BLASLONG range_N[2];
FLOAT *sa_trsm = (FLOAT *)((BLASLONG)sb); FLOAT beta_plus[2] = { ONE, ZERO};
FLOAT *sa_trmm = (FLOAT *)((((BLASLONG)sb FLOAT beta_minus[2] = {-ONE, ZERO};
+ GEMM_PQ * GEMM_Q * COMPSIZE * SIZE + GEMM_ALIGN) & ~GEMM_ALIGN)
+ GEMM_OFFSET_A);
FLOAT *sb_gemm = (FLOAT *)((((BLASLONG)sa_trmm
+ GEMM_PQ * GEMM_Q * COMPSIZE * SIZE + GEMM_ALIGN) & ~GEMM_ALIGN)
+ GEMM_OFFSET_B);
n = args -> n; n = args -> n;
a = (FLOAT *)args -> a;
lda = args -> lda;
if (range_n) { NB = GEMM_Q;
n = range_n[1] - range_n[0];
a += range_n[0] * (lda + 1) * COMPSIZE;
}
if (n <= DTB_ENTRIES) { if (n < NB) {
TRTI2(args, NULL, range_n, sa, sb, 0); TRTI2(args, NULL, range_n, sa, sb, 0);
return 0; return 0;
} }
blocking = GEMM_Q;
if (n <= 4 * GEMM_Q) blocking = (n + 3) / 4;
start_i = 0; lda = args -> lda;
while (start_i < n) start_i += blocking; a = (FLOAT *) args -> a;
start_i -= blocking; args -> ldb = lda;
args -> ldc = lda;
args -> alpha = NULL;
for (i = start_i; i >= 0; i -= blocking) { start_j = 0;
bk = MIN(blocking, n - i); while (start_j < n) start_j += NB;
start_j -= NB;
if (n - bk - i > 0) TRSM_OLNCOPY(bk, bk, a + (i + i * lda) * COMPSIZE, lda, 0, sa_trsm);
if (!range_n) { for (j = start_j ; j >=0 ; j-= NB)
range_N[0] = i; {
range_N[1] = i + bk; jb = n - j;
} else { if ( jb > NB ) jb = NB;
range_N[0] = range_n[0] + i;
range_N[1] = range_n[0] + i + bk;
}
CNAME(args, NULL, range_N, sa, sa_trmm, 0); args -> n = jb;
args -> m = n-j-jb;
if (i > 0) { args -> a = &a[(j+jb+(j+jb)*lda) * COMPSIZE];
TRMM_ILTCOPY(bk, bk, a + (i + i * lda) * COMPSIZE, lda, 0, 0, sa_trmm); args -> b = &a[(j+jb+j*lda) * COMPSIZE];
args -> beta = beta_plus;
for (ls = 0; ls < i; ls += REAL_GEMM_R) { TRMM(args, NULL, NULL, sa, sb, 0);
min_l = i - ls;
if (min_l > REAL_GEMM_R) min_l = REAL_GEMM_R;
GEMM_ONCOPY (bk, min_l, a + (i + ls * lda) * COMPSIZE, lda, sb_gemm); args -> a = &a[(j+j*lda) * COMPSIZE];
args -> beta = beta_minus;
if (n - bk - i > 0) { TRSM(args, NULL, NULL, sa, sb, 0);
for (is = i + bk; is < n; is += GEMM_P) {
min_i = n - is;
if (min_i > GEMM_P) min_i = GEMM_P;
if (ls == 0) { args -> a = &a[(j+j*lda) * COMPSIZE];
NEG_TCOPY (bk, min_i, a + (is + i * lda) * COMPSIZE, lda, sa);
TRSM_KERNEL_RT(min_i, bk, bk, dm1, TRTI2(args, NULL, range_n, sa, sb, 0);
#ifdef COMPLEX
ZERO,
#endif
sa, sa_trsm,
a + (is + i * lda) * COMPSIZE, lda, 0);
} else {
GEMM_ITCOPY (bk, min_i, a + (is + i * lda) * COMPSIZE, lda, sa);
}
GEMM_KERNEL_N(min_i, min_l, bk, dp1,
#ifdef COMPLEX
ZERO,
#endif
sa, sb_gemm,
a + (is + ls * lda) * COMPSIZE, lda);
}
}
for (is = 0; is < bk; is += GEMM_P) {
min_i = bk - is;
if (min_i > GEMM_P) min_i = GEMM_P;
TRMM_KERNEL_LT(min_i, min_l, bk, dp1,
#ifdef COMPLEX
ZERO,
#endif
sa_trmm + is * bk * COMPSIZE, sb_gemm,
a + (i + is + ls * lda) * COMPSIZE, lda, is);
}
}
} else {
if (n - bk - i > 0) {
for (is = 0; is < n - bk - i; is += GEMM_P) {
min_i = n - bk - i - is;
if (min_i > GEMM_P) min_i = GEMM_P;
NEG_TCOPY (bk, min_i, a + (i + bk + is + i * lda) * COMPSIZE, lda, sa);
TRSM_KERNEL_RT(min_i, bk, bk, dm1,
#ifdef COMPLEX
ZERO,
#endif
sa, sa_trsm,
a + (i + bk + is + i * lda) * COMPSIZE, lda, 0);
}
}
}
} }
return 0; return 0;
} }

View File

@ -1,46 +1,44 @@
/*********************************************************************/ /***************************************************************************
/* Copyright 2009, 2010 The University of Texas at Austin. */ * Copyright (c) 2013, The OpenBLAS Project
/* All rights reserved. */ * All rights reserved.
/* */ * Redistribution and use in source and binary forms, with or without
/* Redistribution and use in source and binary forms, with or */ * modification, are permitted provided that the following conditions are
/* without modification, are permitted provided that the following */ * met:
/* conditions are met: */ * 1. Redistributions of source code must retain the above copyright
/* */ * notice, this list of conditions and the following disclaimer.
/* 1. Redistributions of source code must retain the above */ * 2. Redistributions in binary form must reproduce the above copyright
/* copyright notice, this list of conditions and the following */ * notice, this list of conditions and the following disclaimer in
/* disclaimer. */ * the documentation and/or other materials provided with the
/* */ * distribution.
/* 2. Redistributions in binary form must reproduce the above */ * 3. Neither the name of the OpenBLAS project nor the names of
/* copyright notice, this list of conditions and the following */ * its contributors may be used to endorse or promote products
/* disclaimer in the documentation and/or other materials */ * derived from this software without specific prior written permission.
/* provided with the distribution. */ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
/* */ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ * ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ * USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ * *****************************************************************************/
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /**************************************************************************************
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ * 2014/05/22 Saar
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ * TEST double precision unblocked : OK
/* POSSIBILITY OF SUCH DAMAGE. */ * TEST double precision blocked : OK
/* */ * 2014/05/23
/* The views and conclusions contained in the software and */ * TEST single precision blocked : OK
/* documentation are those of the authors and should not be */ *
/* interpreted as representing official policies, either expressed */ **************************************************************************************/
/* or implied, of The University of Texas at Austin. */
/*********************************************************************/
#include <stdio.h> #include <stdio.h>
#include "common.h" #include "common.h"
static FLOAT dp1 = 1.; // static FLOAT dp1 = 1.;
static FLOAT dm1 = -1.; // static FLOAT dm1 = -1.;
#ifdef UNIT #ifdef UNIT
#define TRTI2 TRTI2_UU #define TRTI2 TRTI2_UU
@ -48,152 +46,66 @@ static FLOAT dm1 = -1.;
#define TRTI2 TRTI2_UN #define TRTI2 TRTI2_UN
#endif #endif
#if 0 #ifdef UNIT
#undef GEMM_P #define TRMM TRMM_LNUU
#undef GEMM_Q #define TRSM TRSM_RNUU
#undef GEMM_R #else
#define TRMM TRMM_LNUN
#define GEMM_P 8 #define TRSM TRSM_RNUN
#define GEMM_Q 20
#define GEMM_R 64
#endif #endif
#define GEMM_PQ MAX(GEMM_P, GEMM_Q)
#define REAL_GEMM_R (GEMM_R - 2 * GEMM_PQ)
blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG myid) { blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG myid) {
BLASLONG n, lda; BLASLONG j, n, lda;
FLOAT *a; FLOAT *a;
BLASLONG i, is, min_i, start_is; // BLASLONG info=0;
BLASLONG ls, min_l; BLASLONG jb;
BLASLONG bk; BLASLONG NB;
BLASLONG blocking;
BLASLONG range_N[2];
FLOAT *sa_trsm = (FLOAT *)((BLASLONG)sb); FLOAT beta_plus[2] = { ONE, ZERO};
FLOAT *sa_trmm = (FLOAT *)((((BLASLONG)sb FLOAT beta_minus[2] = {-ONE, ZERO};
+ GEMM_PQ * GEMM_Q * COMPSIZE * SIZE + GEMM_ALIGN) & ~GEMM_ALIGN)
+ GEMM_OFFSET_A);
FLOAT *sb_gemm = (FLOAT *)((((BLASLONG)sa_trmm
+ GEMM_PQ * GEMM_Q * COMPSIZE * SIZE + GEMM_ALIGN) & ~GEMM_ALIGN)
+ GEMM_OFFSET_B);
n = args -> n; n = args -> n;
a = (FLOAT *)args -> a;
lda = args -> lda;
if (range_n) { NB = GEMM_Q;
n = range_n[1] - range_n[0];
a += range_n[0] * (lda + 1) * COMPSIZE;
}
if (n <= DTB_ENTRIES) { if (n <= NB) {
TRTI2(args, NULL, range_n, sa, sb, 0); TRTI2(args, NULL, range_n, sa, sb, 0);
return 0; return 0;
} }
blocking = GEMM_Q;
if (n <= 4 * GEMM_Q) blocking = (n + 3) / 4;
for (i = 0; i < n; i += blocking) { lda = args -> lda;
bk = MIN(blocking, n - i); a = (FLOAT *) args -> a;
args -> ldb = lda;
args -> ldc = lda;
args -> alpha = NULL;
if (i > 0) TRSM_OUNCOPY(bk, bk, a + (i + i * lda) * COMPSIZE, lda, 0, sa_trsm); for (j = 0; j < n; j += NB)
{
jb = n - j;
if ( jb > NB ) jb = NB;
if (!range_n) { args -> n = jb;
range_N[0] = i; args -> m = j;
range_N[1] = i + bk;
} else {
range_N[0] = range_n[0] + i;
range_N[1] = range_n[0] + i + bk;
}
CNAME(args, NULL, range_N, sa, sa_trmm, 0); args -> a = &a[0];
args -> b = &a[(j*lda) * COMPSIZE];
args -> beta = beta_plus;
if (n -bk - i > 0) { TRMM(args, NULL, NULL, sa, sb, 0);
TRMM_IUTCOPY(bk, bk, a + (i + i * lda) * COMPSIZE, lda, 0, 0, sa_trmm);
for (ls = i + bk; ls < n; ls += REAL_GEMM_R) { args -> a = &a[(j+j*lda) * COMPSIZE];
min_l = n - ls; args -> beta = beta_minus;
if (min_l > REAL_GEMM_R) min_l = REAL_GEMM_R;
GEMM_ONCOPY (bk, min_l, a + (i + ls * lda) * COMPSIZE, lda, sb_gemm); TRSM(args, NULL, NULL, sa, sb, 0);
if (i > 0) { args -> a = &a[(j+j*lda) * COMPSIZE];
for (is = 0; is < i; is += GEMM_P) {
min_i = i - is;
if (min_i > GEMM_P) min_i = GEMM_P;
if (ls == i + bk) { TRTI2(args, NULL, range_n, sa, sb, 0);
//NEG_TCOPY (bk, min_i, a + (is + i * lda) * COMPSIZE, lda, sa);
GEMM_BETA(min_i, bk, 0, dm1,
#ifdef COMPLEX
ZERO,
#endif
NULL, 0, NULL, 0, a + (is + i * lda) * COMPSIZE, lda);
TRSM_KERNEL_RN(min_i, bk, bk, dm1,
#ifdef COMPLEX
ZERO,
#endif
sa, sa_trsm,
a + (is + i * lda) * COMPSIZE, lda, 0);
} else {
GEMM_ITCOPY (bk, min_i, a + (is + i * lda) * COMPSIZE, lda, sa);
}
GEMM_KERNEL_N(min_i, min_l, bk, dp1,
#ifdef COMPLEX
ZERO,
#endif
sa, sb_gemm,
a + (is + ls * lda) * COMPSIZE, lda);
}
}
start_is = 0;
while (start_is < bk) start_is += GEMM_P;
start_is -= GEMM_P;
for (is = 0; is < bk; is += GEMM_P) {
min_i = bk - is;
if (min_i > GEMM_P) min_i = GEMM_P;
TRMM_KERNEL_LN(min_i, min_l, bk, dp1,
#ifdef COMPLEX
ZERO,
#endif
sa_trmm + is * bk * COMPSIZE, sb_gemm,
a + (i + is + ls * lda) * COMPSIZE, lda, is);
}
}
} else {
if (i > 0) {
for (is = 0; is < i; is += GEMM_P) {
min_i = i - is;
if (min_i > GEMM_P) min_i = GEMM_P;
//NEG_TCOPY (bk, min_i, a + (is + i * lda) * COMPSIZE, lda, sa);
GEMM_BETA(min_i, bk, 0, dm1,
#ifdef COMPLEX
ZERO,
#endif
NULL, 0, NULL, 0, a + (is + i * lda) * COMPSIZE, lda);
TRSM_KERNEL_RN(min_i, bk, bk, dm1,
#ifdef COMPLEX
ZERO,
#endif
sa, sa_trsm,
a + (is + i * lda) * COMPSIZE, lda, 0);
}
}
}
} }
return 0; return 0;
} }