Complete all the plura single precision functions of level3 on Loongson3a, the performance is 2.3GFlops.

This commit is contained in:
traz 2011-07-18 17:03:38 +00:00
parent 68532fa9ec
commit c8360e3ae5
7 changed files with 1499 additions and 9 deletions

View File

@ -123,10 +123,21 @@ ifndef DTRSMKERNEL_RT
DTRSMKERNEL_RT = trsm_kernel_RT.S
endif
ifndef CTRSMKERNEL_LN
CTRSMKERNEL_LN = ztrsm_kernel_LT.S
endif
ifndef CTRSMKERNEL_LT
CTRSMKERNEL_LT = ztrsm_kernel_LT.S
endif
ifndef CTRSMKERNEL_RN
CTRSMKERNEL_RN = ztrsm_kernel_LT.S
endif
ifndef CTRSMKERNEL_RT
CTRSMKERNEL_RT = ztrsm_kernel_RT.S
endif
ifndef ZTRSMKERNEL_LN
ZTRSMKERNEL_LN = ztrsm_kernel_LT.S

View File

@ -1,19 +1,25 @@
SAXPYKERNEL=axpy_loongson3a.S
DAXPYKERNEL=daxpy_loongson3a_simd.S
SGEMMKERNEL = sgemm_kernel_loongson3a.S
SGEMMKERNEL = sgemm_kernel_loongson3a_4x4.S
SGEMMONCOPY = ../generic/gemm_ncopy_4.c
SGEMMOTCOPY = ../generic/gemm_tcopy_4.c
SGEMMONCOPYOBJ = sgemm_oncopy.o
SGEMMOTCOPYOBJ = sgemm_otcopy.o
DGEMMKERNEL = gemm_kernel_loongson3a.S
DGEMMKERNEL = dgemm_kernel_loongson3a_4x4.S
DGEMMONCOPY = ../generic/gemm_ncopy_4.c
DGEMMOTCOPY = ../generic/gemm_tcopy_4.c
DGEMMONCOPYOBJ = dgemm_oncopy.o
DGEMMOTCOPYOBJ = dgemm_otcopy.o
ZGEMMKERNEL = zgemm_kernel_loongson3a.S
CGEMMKERNEL = cgemm_kernel_loongson3a_2x2.S
CGEMMONCOPY = ../generic/zgemm_ncopy_2.c
CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
CGEMMONCOPYOBJ = cgemm_oncopy.o
CGEMMOTCOPYOBJ = cgemm_otcopy.o
ZGEMMKERNEL = zgemm_kernel_loongson3a_2x2.S
ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c
ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
ZGEMMONCOPYOBJ = zgemm_oncopy.o
@ -29,6 +35,11 @@ DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c

File diff suppressed because it is too large Load Diff

View File

@ -1065,8 +1065,8 @@
daddiu PREA, PREA, 4 * SIZE # 2mr*1kr*cmpx
MADD2 c22, c22, a4, b1
MADD4 c24, c24, a4, b2
gsLQC1(R12, F1, F0, 0) # R:a1 I:a2 Unroll K=4
gsLQC1(R12, F1, F0, 0) # R:a1 I:a2 Unroll K=4
gsLQC1(R12, F3, F2, 1) # R:a3 I:a4
gsLQC1(R13, F5, F4, 0) # R:b1 I:b2

10
param.h
View File

@ -1486,25 +1486,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define DGEMM_DEFAULT_UNROLL_M 4
#define DGEMM_DEFAULT_UNROLL_N 4
#define CGEMM_DEFAULT_UNROLL_M 1
#define CGEMM_DEFAULT_UNROLL_N 4
#define CGEMM_DEFAULT_UNROLL_M 2
#define CGEMM_DEFAULT_UNROLL_N 2
#define ZGEMM_DEFAULT_UNROLL_M 2
#define ZGEMM_DEFAULT_UNROLL_N 2
#define SGEMM_DEFAULT_P 64
#define DGEMM_DEFAULT_P 32
#define CGEMM_DEFAULT_P 108
#define CGEMM_DEFAULT_P 64
#define ZGEMM_DEFAULT_P 32
#define SGEMM_DEFAULT_Q 116
#define DGEMM_DEFAULT_Q 116
#define CGEMM_DEFAULT_Q 144
#define CGEMM_DEFAULT_Q 100
#define ZGEMM_DEFAULT_Q 80
#define SGEMM_DEFAULT_R 1000
#define DGEMM_DEFAULT_R 1000
#define CGEMM_DEFAULT_R 2000
#define CGEMM_DEFAULT_R 1000
#define ZGEMM_DEFAULT_R 1000
#define SYMV_P 16