From f76f9525477785bb452699c07d1985ec14dc2b61 Mon Sep 17 00:00:00 2001 From: wangqian Date: Tue, 19 Jun 2012 16:17:43 +0800 Subject: [PATCH] Refs #83 #53. Adding Intel Sandy Bridge (AVX supported) kernel codes for BLAS level 3 functions. --- kernel/generic/zgemm_ncopy_4_sandy.c | 235 ++ kernel/generic/zgemm_ncopy_8_sandy.c | 401 +++ kernel/generic/zgemm_tcopy_4_sandy.c | 237 ++ kernel/generic/zgemm_tcopy_8_sandy.c | 370 ++ kernel/x86_64/KERNEL.SANDYBRIDGE | 97 +- kernel/x86_64/cgemm_kernel_4x8_sandy.S | 4478 ++++++++++++++++++++++++ kernel/x86_64/dgemm_kernel_4x8_sandy.S | 3186 +++++++++++++++++ kernel/x86_64/sgemm_kernel_8x8_sandy.S | 3736 ++++++++++++++++++++ kernel/x86_64/zgemm_kernel_4x4_sandy.S | 3257 +++++++++++++++++ param.h | 100 +- 10 files changed, 15982 insertions(+), 115 deletions(-) create mode 100644 kernel/generic/zgemm_ncopy_4_sandy.c create mode 100644 kernel/generic/zgemm_ncopy_8_sandy.c create mode 100644 kernel/generic/zgemm_tcopy_4_sandy.c create mode 100644 kernel/generic/zgemm_tcopy_8_sandy.c create mode 100644 kernel/x86_64/cgemm_kernel_4x8_sandy.S create mode 100644 kernel/x86_64/dgemm_kernel_4x8_sandy.S create mode 100644 kernel/x86_64/sgemm_kernel_8x8_sandy.S create mode 100644 kernel/x86_64/zgemm_kernel_4x4_sandy.S diff --git a/kernel/generic/zgemm_ncopy_4_sandy.c b/kernel/generic/zgemm_ncopy_4_sandy.c new file mode 100644 index 000000000..839bd5939 --- /dev/null +++ b/kernel/generic/zgemm_ncopy_4_sandy.c @@ -0,0 +1,235 @@ +/***************************************************************************** + Copyright (c) 2011,2012 Lab of Parallel Software and Computational Science,ISCAS + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are +met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the ISCAS nor the names of its contributors may +be used to endorse or promote products derived from this software +without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + **********************************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG row,BLASLONG col,FLOAT* src,BLASLONG srcdim,FLOAT* dest) +{ + BLASLONG i,j; + BLASLONG idx=0; + BLASLONG ii; + FLOAT *src0,*src1,*src2,*src3,*dest0; + for (j=0; j +#include "common.h" + +int CNAME(BLASLONG row,BLASLONG col,FLOAT* src,BLASLONG srcdim,FLOAT* dest) +{ + BLASLONG i,j; + BLASLONG idx=0; + BLASLONG ii; + FLOAT *src0,*src1,*src2,*src3,*src4,*src5,*src6,*src7,*dest0; + for (j=0; j +#include "common.h" + +int CNAME(BLASLONG row,BLASLONG col,FLOAT* src,BLASLONG srcdim,FLOAT* dest) +{ + BLASLONG i,j; + BLASLONG idx=0; + BLASLONG ii; + FLOAT *src0,*src1,*src2,*src3,*dest0; + FLOAT *dest1,*dest2; + ii = col&-4; + ii = ii*(2*row); + dest2 = dest+ii; + ii = col&-2; + ii = ii*(2*row); + dest1 = dest+ii; + for (j=0; j +#include "common.h" + +int CNAME(BLASLONG row,BLASLONG col,FLOAT* src,BLASLONG srcdim,FLOAT* dest) +{ + BLASLONG i,j; + BLASLONG idx=0; + BLASLONG ii; + FLOAT *src0,*src1,*src2,*src3,*dest0; + FLOAT *dest1,*dest2,*dest4; + ii = col&-8; + ii = ii*(2*row); + dest4 = dest+ii; + ii = col&-4; + ii = ii*(2*row); + dest2 = dest+ii; + ii = col&-2; + ii = ii*(2*row); + dest1 = dest+ii; + for (j=0; j