From 6c4a7d0828c708b32e4f989662fd9ce4cbfd4c31 Mon Sep 17 00:00:00 2001
From: Zhang Xianyi
Date: Sun, 25 Aug 2013 10:16:01 -0300
Subject: [PATCH 01/15] Import AMD Piledriver DGEMM kernel generated by AUGEM.

So far, this kernel doesn't handle edge cases.

AUGEM: Automatically Generate High Performance Dense Linear Algebra
Kernels on x86 CPUs. Qian Wang, Xianyi Zhang, Yunquan Zhang, and Qing Yi.
In the International Conference for High Performance Computing, Networking,
Storage and Analysis (SC'13). Denver, CO. Nov, 2013.
---
 kernel/generic/gemm_ncopy_6.c | 230 +++
 kernel/generic/gemm_tcopy_6.c | 281 +++
 kernel/generic/symm_lcopy_6.c | 138 ++
 kernel/generic/symm_ucopy_6.c | 136 ++
 kernel/generic/trmm_lncopy_6.c | 484 ++++++
 kernel/generic/trmm_ltcopy_6.c | 488 ++++++
 kernel/generic/trmm_uncopy_6.c | 785 +++++++++
 kernel/generic/trmm_utcopy_6.c | 472 +++++
 kernel/generic/trsm_kernel_LN.c | 4 +
 kernel/generic/trsm_kernel_LT.c | 4 +
 kernel/generic/trsm_kernel_RN.c | 4 +
 kernel/generic/trsm_kernel_RT.c | 5 +
 kernel/generic/trsm_lncopy_6.c | 326 ++++
 kernel/generic/trsm_ltcopy_6.c | 346 ++++
 kernel/generic/trsm_uncopy_6.c | 350 ++++
 kernel/generic/trsm_utcopy_6.c | 322 ++++
 kernel/x86_64/KERNEL.PILEDRIVER | 12 +-
 kernel/x86_64/dgemm_kernel_6x4_piledriver.S | 1734 +++++++++++++++++++
 param.h | 10 +-
 19 files changed, 6121 insertions(+), 10 deletions(-)
 create mode 100644 kernel/generic/gemm_ncopy_6.c
 create mode 100644 kernel/generic/gemm_tcopy_6.c
 create mode 100644 kernel/generic/symm_lcopy_6.c
 create mode 100644 kernel/generic/symm_ucopy_6.c
 create mode 100644 kernel/generic/trmm_lncopy_6.c
 create mode 100644 kernel/generic/trmm_ltcopy_6.c
 create mode 100644 kernel/generic/trmm_uncopy_6.c
 create mode 100644 kernel/generic/trmm_utcopy_6.c
 create mode 100644 kernel/generic/trsm_lncopy_6.c
 create mode 100644 kernel/generic/trsm_ltcopy_6.c
 create mode 100644 kernel/generic/trsm_uncopy_6.c
 create mode 100644 kernel/generic/trsm_utcopy_6.c
 create mode 100644 kernel/x86_64/dgemm_kernel_6x4_piledriver.S

diff --git a/kernel/generic/gemm_ncopy_6.c b/kernel/generic/gemm_ncopy_6.c
new file mode 100644
index 000000000..1ecb93c65
--- /dev/null
+++ b/kernel/generic/gemm_ncopy_6.c
@@ -0,0 +1,230 @@
+/*********************************************************************/
+/* Copyright 2009, 2010 The University of Texas at Austin. */
+/* All rights reserved. */
+/* */
+/* Redistribution and use in source and binary forms, with or */
+/* without modification, are permitted provided that the following */
+/* conditions are met: */
+/* */
+/* 1. Redistributions of source code must retain the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer. */
+/* */
+/* 2. Redistributions in binary form must reproduce the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer in the documentation and/or other materials */
+/* provided with the distribution. */
+/* */
+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
+/* DISCLAIMED.
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
+/* POSSIBILITY OF SUCH DAMAGE. */
+/* */
+/* The views and conclusions contained in the software and */
+/* documentation are those of the authors and should not be */
+/* interpreted as representing official policies, either expressed */
+/* or implied, of The University of Texas at Austin. */
+/*********************************************************************/
+
+#include <stdio.h>
+#include "common.h"
+
+int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
+ BLASLONG i, j;
+
+ FLOAT *a_offset, *a_offset1, *a_offset2, *a_offset3, *a_offset4;
+ FLOAT *b_offset;
+ FLOAT ctemp1, ctemp2, ctemp3, ctemp4;
+ FLOAT ctemp5, ctemp6, ctemp7, ctemp8;
+ FLOAT ctemp9, ctemp10, ctemp11, ctemp12;
+ FLOAT ctemp13, ctemp14, ctemp15, ctemp16;
+
+ a_offset = a;
+ b_offset = b;
+
+ j = (n >> 2);
+ if (j > 0){
+ do{
+ a_offset1 = a_offset;
+ a_offset2 = a_offset1 + lda;
+ a_offset3 = a_offset2 + lda;
+ a_offset4 = a_offset3 + lda;
+ a_offset += 4 * lda;
+
+ i = (m >> 2);
+ if (i > 0){
+ do{
+ ctemp1 = *(a_offset1 + 0);
+ ctemp2 = *(a_offset1 + 1);
+ ctemp3 = *(a_offset1 + 2);
+ ctemp4 = *(a_offset1 + 3);
+
+ ctemp5 = *(a_offset2 + 0);
+ ctemp6 = *(a_offset2 + 1);
+ ctemp7 = *(a_offset2 + 2);
+ ctemp8 = *(a_offset2 + 3);
+
+ ctemp9 = *(a_offset3 + 0);
+ ctemp10 = *(a_offset3 + 1);
+ ctemp11 = *(a_offset3 + 2);
+ ctemp12 = *(a_offset3 + 3);
+
+ ctemp13 = *(a_offset4 + 0);
+ ctemp14 = *(a_offset4 + 1);
+ ctemp15 = *(a_offset4 + 2);
+ ctemp16 = *(a_offset4 + 3);
+
+ *(b_offset + 0) = ctemp1;
+ *(b_offset + 1) = ctemp5;
+ *(b_offset + 2) = ctemp9;
+ *(b_offset + 3) = ctemp13;
+
+ *(b_offset + 4) = ctemp2;
+ *(b_offset + 5) = ctemp6;
+ *(b_offset + 6) = ctemp10;
+ *(b_offset + 7) = ctemp14;
+
+ *(b_offset + 8) = ctemp3;
+ *(b_offset + 9) = ctemp7;
+ *(b_offset + 10) = ctemp11;
+ *(b_offset + 11) = ctemp15;
+
+ *(b_offset + 12) = ctemp4;
+ *(b_offset + 13) = ctemp8;
+ *(b_offset + 14) = ctemp12;
+ *(b_offset + 15) = ctemp16;
+
+ a_offset1 += 4;
+ a_offset2 += 4;
+ a_offset3 += 4;
+ a_offset4 += 4;
+
+ b_offset += 16;
+ i --;
+ }while(i > 0);
+ }
+
+ i = (m & 3);
+ if (i > 0){
+ do{
+ ctemp1 = *(a_offset1 + 0);
+ ctemp5 = *(a_offset2 + 0);
+ ctemp9 = *(a_offset3 + 0);
+ ctemp13 = *(a_offset4 + 0);
+
+ *(b_offset + 0) = ctemp1;
+ *(b_offset + 1) = ctemp5;
+ *(b_offset + 2) = ctemp9;
+ *(b_offset + 3) = ctemp13;
+
+ a_offset1 ++;
+ a_offset2 ++;
+ a_offset3 ++;
+ a_offset4 ++;
+
+ b_offset += 4;
+ i --;
+ }while(i > 0);
+ }
+ j--;
+ }while(j > 0);
+ } /* end of if(j > 0) */
+
+ if (n & 2){
+ a_offset1 = a_offset;
+ a_offset2 = a_offset1 + lda;
+ a_offset += 2 * lda;
+
+ i = (m >> 2);
+ if (i > 0){
+ do{
+ ctemp1 = *(a_offset1 + 0);
+ ctemp2 = *(a_offset1 + 1);
+ ctemp3 = *(a_offset1 + 2);
+ ctemp4 = *(a_offset1 + 3);
+
+ ctemp5 = *(a_offset2 + 0);
+ ctemp6 = *(a_offset2 + 1);
+ ctemp7 = *(a_offset2 + 2);
+ ctemp8 = *(a_offset2 + 3);
+
+ *(b_offset + 0) = ctemp1;
+ *(b_offset + 1) = ctemp5;
+ *(b_offset + 2) = ctemp2;
+ *(b_offset + 3) = ctemp6;
+
+ *(b_offset
+ 4) = ctemp3; + *(b_offset + 5) = ctemp7; + *(b_offset + 6) = ctemp4; + *(b_offset + 7) = ctemp8; + + a_offset1 += 4; + a_offset2 += 4; + b_offset += 8; + i --; + }while(i > 0); + } + + i = (m & 3); + if (i > 0){ + do{ + ctemp1 = *(a_offset1 + 0); + ctemp5 = *(a_offset2 + 0); + + *(b_offset + 0) = ctemp1; + *(b_offset + 1) = ctemp5; + + a_offset1 ++; + a_offset2 ++; + b_offset += 2; + i --; + }while(i > 0); + } + } /* end of if(j > 0) */ + + if (n & 1){ + a_offset1 = a_offset; + + i = (m >> 2); + if (i > 0){ + do{ + ctemp1 = *(a_offset1 + 0); + ctemp2 = *(a_offset1 + 1); + ctemp3 = *(a_offset1 + 2); + ctemp4 = *(a_offset1 + 3); + + *(b_offset + 0) = ctemp1; + *(b_offset + 1) = ctemp2; + *(b_offset + 2) = ctemp3; + *(b_offset + 3) = ctemp4; + + a_offset1 += 4; + b_offset += 4; + i --; + }while(i > 0); + } + + i = (m & 3); + if (i > 0){ + do{ + ctemp1 = *(a_offset1 + 0); + *(b_offset + 0) = ctemp1; + a_offset1 ++; + b_offset += 1; + i --; + }while(i > 0); + } + } /* end of if(j > 0) */ + + return 0; +} diff --git a/kernel/generic/gemm_tcopy_6.c b/kernel/generic/gemm_tcopy_6.c new file mode 100644 index 000000000..bd32090e7 --- /dev/null +++ b/kernel/generic/gemm_tcopy_6.c @@ -0,0 +1,281 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ + + BLASLONG i, j; + + FLOAT *a_offset, *a_offset1, *a_offset2, *a_offset3, *a_offset4; + FLOAT *b_offset, *b_offset1, *b_offset2, *b_offset3; + FLOAT ctemp1, ctemp2, ctemp3, ctemp4; + FLOAT ctemp5, ctemp6, ctemp7, ctemp8; + FLOAT ctemp9, ctemp10, ctemp11, ctemp12; + FLOAT ctemp13, ctemp14, ctemp15, ctemp16; + + a_offset = a; + b_offset = b; + + b_offset2 = b + m * (n & ~3); + b_offset3 = b + m * (n & ~1); + + j = (m >> 2); + if (j > 0){ + do{ + a_offset1 = a_offset; + a_offset2 = a_offset1 + lda; + a_offset3 = a_offset2 + lda; + a_offset4 = a_offset3 + lda; + a_offset += 4 * lda; + + b_offset1 = b_offset; + b_offset += 16; + + i = (n >> 2); + if (i > 0){ + do{ + ctemp1 = *(a_offset1 + 0); + ctemp2 = *(a_offset1 + 1); + ctemp3 = *(a_offset1 + 2); + ctemp4 = *(a_offset1 + 3); + + ctemp5 = *(a_offset2 + 0); + ctemp6 = *(a_offset2 + 1); + ctemp7 = *(a_offset2 + 2); + ctemp8 = *(a_offset2 + 3); + + ctemp9 = *(a_offset3 + 0); + ctemp10 = *(a_offset3 + 1); + ctemp11 = *(a_offset3 + 2); + ctemp12 = *(a_offset3 + 3); + + ctemp13 = *(a_offset4 + 0); + ctemp14 = *(a_offset4 + 1); + ctemp15 = *(a_offset4 + 2); + ctemp16 = *(a_offset4 + 3); + + a_offset1 += 4; + a_offset2 += 4; + a_offset3 += 4; + a_offset4 += 4; + + *(b_offset1 + 0) = ctemp1; + *(b_offset1 + 1) = ctemp2; + *(b_offset1 + 2) = ctemp3; + *(b_offset1 + 3) = ctemp4; + + *(b_offset1 + 4) = ctemp5; + *(b_offset1 + 5) = ctemp6; + *(b_offset1 + 6) = ctemp7; + *(b_offset1 + 7) = ctemp8; + + *(b_offset1 + 8) = ctemp9; + *(b_offset1 + 9) = ctemp10; + *(b_offset1 + 10) = ctemp11; + *(b_offset1 + 11) = ctemp12; + + *(b_offset1 + 12) = ctemp13; + *(b_offset1 + 13) = ctemp14; + *(b_offset1 + 14) = ctemp15; + *(b_offset1 + 15) = ctemp16; + + b_offset1 += m * 4; + i --; + }while(i > 0); + } + + if (n & 2) { + ctemp1 = *(a_offset1 + 0); + ctemp2 = *(a_offset1 + 1); + + ctemp3 = *(a_offset2 + 0); + ctemp4 = *(a_offset2 + 1); + + ctemp5 = *(a_offset3 + 0); + ctemp6 = *(a_offset3 + 1); + + ctemp7 = *(a_offset4 + 0); + ctemp8 = *(a_offset4 + 1); + + a_offset1 += 2; + a_offset2 += 2; + a_offset3 += 2; + a_offset4 += 2; + + *(b_offset2 + 0) = ctemp1; + *(b_offset2 + 1) = ctemp2; + *(b_offset2 + 2) = ctemp3; + *(b_offset2 + 3) = ctemp4; + + *(b_offset2 + 4) = ctemp5; + *(b_offset2 + 5) = ctemp6; + *(b_offset2 + 6) = ctemp7; + *(b_offset2 + 7) = ctemp8; + + b_offset2 += 8; + } + + if (n & 1) { + ctemp1 = *(a_offset1 + 0); + ctemp2 = *(a_offset2 + 0); + ctemp3 = *(a_offset3 + 0); + ctemp4 = *(a_offset4 + 0); + + *(b_offset3 + 0) = ctemp1; + *(b_offset3 + 1) = ctemp2; + *(b_offset3 + 2) = ctemp3; + *(b_offset3 + 3) = ctemp4; + + b_offset3 += 4; + } + + j--; + }while(j > 0); + } + + if (m & 2){ + a_offset1 = a_offset; + a_offset2 = a_offset1 + lda; + a_offset += 2 * lda; + + b_offset1 = b_offset; + b_offset += 8; + + i = (n >> 2); + if (i > 0){ + do{ + ctemp1 = *(a_offset1 + 0); + ctemp2 = *(a_offset1 + 1); + ctemp3 = *(a_offset1 + 2); + ctemp4 = *(a_offset1 + 3); + + ctemp5 = *(a_offset2 + 0); + ctemp6 = *(a_offset2 + 1); + ctemp7 = *(a_offset2 + 2); + ctemp8 = *(a_offset2 + 3); + + a_offset1 += 4; + a_offset2 += 4; + + *(b_offset1 + 0) = ctemp1; + *(b_offset1 + 1) = ctemp2; + *(b_offset1 + 2) = ctemp3; + *(b_offset1 + 3) = ctemp4; + + *(b_offset1 + 4) = ctemp5; + *(b_offset1 + 5) = ctemp6; + *(b_offset1 + 6) = ctemp7; + *(b_offset1 + 7) = ctemp8; + + b_offset1 += m * 4; + i --; + 
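+ /* Note: b_offset1 advances by 4 * m because a 4-column chunk
+    spanning all m rows occupies 4 * m packed elements, so the
+    next 4-column chunk of this row strip starts 4 * m further
+    into the packed buffer. */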
}while(i > 0); + } + + if (n & 2) { + ctemp1 = *(a_offset1 + 0); + ctemp2 = *(a_offset1 + 1); + + ctemp3 = *(a_offset2 + 0); + ctemp4 = *(a_offset2 + 1); + + a_offset1 += 2; + a_offset2 += 2; + + *(b_offset2 + 0) = ctemp1; + *(b_offset2 + 1) = ctemp2; + *(b_offset2 + 2) = ctemp3; + *(b_offset2 + 3) = ctemp4; + + b_offset2 += 4; + } + + if (n & 1) { + ctemp1 = *(a_offset1 + 0); + ctemp2 = *(a_offset2 + 0); + + *(b_offset3 + 0) = ctemp1; + *(b_offset3 + 1) = ctemp2; + b_offset3 += 2; + } + } + + if (m & 1){ + a_offset1 = a_offset; + b_offset1 = b_offset; + + i = (n >> 2); + if (i > 0){ + do{ + ctemp1 = *(a_offset1 + 0); + ctemp2 = *(a_offset1 + 1); + ctemp3 = *(a_offset1 + 2); + ctemp4 = *(a_offset1 + 3); + + a_offset1 += 4; + + *(b_offset1 + 0) = ctemp1; + *(b_offset1 + 1) = ctemp2; + *(b_offset1 + 2) = ctemp3; + *(b_offset1 + 3) = ctemp4; + + b_offset1 += 4 * m; + + i --; + }while(i > 0); + } + + if (n & 2) { + ctemp1 = *(a_offset1 + 0); + ctemp2 = *(a_offset1 + 1); + a_offset1 += 2; + + *(b_offset2 + 0) = ctemp1; + *(b_offset2 + 1) = ctemp2; + } + + if (n & 1) { + ctemp1 = *(a_offset1 + 0); + *(b_offset3 + 0) = ctemp1; + } + } + + return 0; +} diff --git a/kernel/generic/symm_lcopy_6.c b/kernel/generic/symm_lcopy_6.c new file mode 100644 index 000000000..ac04943e2 --- /dev/null +++ b/kernel/generic/symm_lcopy_6.c @@ -0,0 +1,138 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, js, offset; + + FLOAT data01, data02, data03, data04; + FLOAT *ao1, *ao2, *ao3, *ao4; + + js = (n >> 2); + while (js > 0){ + + offset = posX - posY; + + if (offset > 0) ao1 = a + posX + 0 + posY * lda; else ao1 = a + posY + (posX + 0) * lda; + if (offset > -1) ao2 = a + posX + 1 + posY * lda; else ao2 = a + posY + (posX + 1) * lda; + if (offset > -2) ao3 = a + posX + 2 + posY * lda; else ao3 = a + posY + (posX + 2) * lda; + if (offset > -3) ao4 = a + posX + 3 + posY * lda; else ao4 = a + posY + (posX + 3) * lda; + + i = m; + + while (i > 0) { + data01 = *(ao1 + 0); + data02 = *(ao2 + 0); + data03 = *(ao3 + 0); + data04 = *(ao4 + 0); + + if (offset > 0) ao1 += lda; else ao1 ++; + if (offset > -1) ao2 += lda; else ao2 ++; + if (offset > -2) ao3 += lda; else ao3 ++; + if (offset > -3) ao4 += lda; else ao4 ++; + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + + b += 4; + + offset --; + i --; + } + + posX += 4; + js --; + } + + if (n & 2) { + + offset = posX - posY; + + if (offset > 0) ao1 = a + posX + 0 + posY * lda; else ao1 = a + posY + (posX + 0) * lda; + if (offset > -1) ao2 = a + posX + 1 + posY * lda; else ao2 = a + posY + (posX + 1) * lda; + + i = m; + + while (i > 0) { + data01 = *(ao1 + 0); + data02 = *(ao2 + 0); + + if (offset > 0) ao1 += lda; else ao1 ++; + if (offset > -1) ao2 += lda; else ao2 ++; + + b[ 0] = data01; + b[ 1] = data02; + + b += 2; + + offset --; + i --; + } + + posX += 2; + } + + if (n & 1) { + + offset = posX - posY; + + if (offset > 0) ao1 = a + posX + 0 + posY * lda; else ao1 = a + posY + (posX + 0) * lda; + + i = m; + + while (i > 0) { + data01 = *(ao1 + 0); + + if (offset > 0) ao1 += lda; else ao1 ++; + + b[ 0] = data01; + + b ++; + + offset --; + i --; + } + } + + return 0; +} diff --git a/kernel/generic/symm_ucopy_6.c b/kernel/generic/symm_ucopy_6.c new file mode 100644 index 000000000..9b9cff820 --- /dev/null +++ b/kernel/generic/symm_ucopy_6.c @@ -0,0 +1,136 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, js, offset; + + FLOAT data01, data02, data03, data04; + FLOAT *ao1, *ao2, *ao3, *ao4; + + js = (n >> 2); + while (js > 0){ + + offset = posX - posY; + + if (offset > 0) ao1 = a + posY + (posX + 0) * lda; else ao1 = a + posX + 0 + posY * lda; + if (offset > -1) ao2 = a + posY + (posX + 1) * lda; else ao2 = a + posX + 1 + posY * lda; + if (offset > -2) ao3 = a + posY + (posX + 2) * lda; else ao3 = a + posX + 2 + posY * lda; + if (offset > -3) ao4 = a + posY + (posX + 3) * lda; else ao4 = a + posX + 3 + posY * lda; + + i = m; + + while (i > 0) { + data01 = *(ao1 + 0); + data02 = *(ao2 + 0); + data03 = *(ao3 + 0); + data04 = *(ao4 + 0); + + if (offset > 0) ao1 ++; else ao1 += lda; + if (offset > -1) ao2 ++; else ao2 += lda; + if (offset > -2) ao3 ++; else ao3 += lda; + if (offset > -3) ao4 ++; else ao4 += lda; + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + + b += 4; + + offset --; + i --; + } + + posX += 4; + js --; + } + + if (n & 2) { + offset = posX - posY; + + if (offset > 0) ao1 = a + posY + (posX + 0) * lda; else ao1 = a + posX + 0 + posY * lda; + if (offset > -1) ao2 = a + posY + (posX + 1) * lda; else ao2 = a + posX + 1 + posY * lda; + + i = m; + + while (i > 0) { + data01 = *(ao1 + 0); + data02 = *(ao2 + 0); + + if (offset > 0) ao1 ++; else ao1 += lda; + if (offset > -1) ao2 ++; else ao2 += lda; + + b[ 0] = data01; + b[ 1] = data02; + + b += 2; + + offset --; + i --; + } + + posX += 2; + } + + if (n & 1) { + offset = posX - posY; + + if (offset > 0) ao1 = a + posY + (posX + 0) * lda; else ao1 = a + posX + 0 + posY * lda; + + i = m; + + while (i > 0) { + data01 = *(ao1 + 0); + + if (offset > 0) ao1 ++; else ao1 += lda; + + b[ 0] = data01; + + b ++; + + offset --; + i --; + } + } + + return 0; +} diff --git a/kernel/generic/trmm_lncopy_6.c b/kernel/generic/trmm_lncopy_6.c new file mode 100644 index 000000000..6cd16673a --- /dev/null +++ b/kernel/generic/trmm_lncopy_6.c @@ -0,0 +1,484 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. 
Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, js; + BLASLONG X; + + FLOAT data01, data02, data03, data04, data05, data06, data07, data08; + FLOAT data09, data10, data11, data12, data13, data14, data15, data16; + FLOAT *ao1, *ao2, *ao3, *ao4; + + js = (n >> 2); + + if (js > 0){ + do { + X = posX; + + if (posX <= posY) { + ao1 = a + posY + (posX + 0) * lda; + ao2 = a + posY + (posX + 1) * lda; + ao3 = a + posY + (posX + 2) * lda; + ao4 = a + posY + (posX + 3) * lda; + } else { + ao1 = a + posX + (posY + 0) * lda; + ao2 = a + posX + (posY + 1) * lda; + ao3 = a + posX + (posY + 2) * lda; + ao4 = a + posX + (posY + 3) * lda; + } + + i = (m >> 2); + if (i > 0) { + do { + if (X > posY) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao1 + 2); + data04 = *(ao1 + 3); + + data05 = *(ao2 + 0); + data06 = *(ao2 + 1); + data07 = *(ao2 + 2); + data08 = *(ao2 + 3); + + data09 = *(ao3 + 0); + data10 = *(ao3 + 1); + data11 = *(ao3 + 2); + data12 = *(ao3 + 3); + + data13 = *(ao4 + 0); + data14 = *(ao4 + 1); + data15 = *(ao4 + 2); + data16 = *(ao4 + 3); + + b[ 0] = data01; + b[ 1] = data05; + b[ 2] = data09; + b[ 3] = data13; + b[ 4] = data02; + b[ 5] = data06; + b[ 6] = data10; + b[ 7] = data14; + + b[ 8] = data03; + b[ 9] = data07; + b[10] = data11; + b[11] = data15; + b[12] = data04; + b[13] = data08; + b[14] = data12; + b[15] = data16; + + ao1 += 4; + ao2 += 4; + ao3 += 4; + ao4 += 4; + b += 16; + + } else + if (X < posY) { + ao1 += 4 * lda; + ao2 += 4 * lda; + ao3 += 4 * lda; + ao4 += 4 * lda; + b += 16; + + } else { +#ifdef UNIT + data02 = *(ao1 + 1); + data03 = *(ao1 + 2); + data04 = *(ao1 + 3); + + data07 = *(ao2 + 2); + data08 = *(ao2 + 3); + + data12 = *(ao3 + 3); + + b[ 0] = ONE; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = data02; + b[ 5] = ONE; + b[ 6] = ZERO; + b[ 7] = ZERO; + + b[ 8] = data03; + b[ 9] = data07; + b[10] = ONE; + b[11] = ZERO; + b[12] = data04; + b[13] = data08; + b[14] = data12; + b[15] = ONE; +#else + data01 = *(ao1 + 0); + data02 = *(ao1 + 
1); + data03 = *(ao1 + 2); + data04 = *(ao1 + 3); + + data06 = *(ao2 + 1); + data07 = *(ao2 + 2); + data08 = *(ao2 + 3); + + data11 = *(ao3 + 2); + data12 = *(ao3 + 3); + + data16 = *(ao4 + 3); + + b[ 0] = data01; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = data02; + b[ 5] = data06; + b[ 6] = ZERO; + b[ 7] = ZERO; + + b[ 8] = data03; + b[ 9] = data07; + b[10] = data11; + b[11] = ZERO; + b[12] = data04; + b[13] = data08; + b[14] = data12; + b[15] = data16; +#endif + ao1 += 4; + ao2 += 4; + ao3 += 4; + ao4 += 4; + b += 16; + } + + X += 4; + i --; + } while (i > 0); + } + + i = (m & 3); + if (i) { + + if (X > posY) { + + if (m & 2) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao2 + 0); + data04 = *(ao2 + 1); + data05 = *(ao3 + 0); + data06 = *(ao3 + 1); + data07 = *(ao4 + 0); + data08 = *(ao4 + 1); + + b[ 0] = data01; + b[ 1] = data03; + b[ 2] = data05; + b[ 3] = data07; + b[ 4] = data02; + b[ 5] = data04; + b[ 6] = data06; + b[ 7] = data08; + + ao1 += 2; + ao2 += 2; + ao3 += 2; + ao4 += 2; + b += 8; + } + + if (m & 1) { + data01 = *(ao1 + 0); + data02 = *(ao2 + 0); + data03 = *(ao3 + 0); + data04 = *(ao4 + 0); + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + + ao1 += 1; + ao2 += 1; + ao3 += 1; + ao4 += 1; + b += 4; + } + + } else + if (X < posY) { + if (m & 2) { + ao1 += 2 * lda; + ao2 += 2 * lda; + + b += 8; + } + + if (m & 1) { + ao1 += lda; + b += 4; + } + + } else { +#ifdef UNIT + data05 = *(ao2 + 0); + data09 = *(ao3 + 0); + data13 = *(ao4 + 0); + + if (i >= 2) { + data10 = *(ao3 + 1); + data14 = *(ao4 + 1); + } + + if (i >= 3) { + data15 = *(ao4 + 2); + } + + b[ 0] = ONE; + b[ 1] = data05; + b[ 2] = data09; + b[ 3] = data13; + b += 4; + + if(i >= 2) { + b[ 0] = ZERO; + b[ 1] = ONE; + b[ 2] = data10; + b[ 3] = data14; + b += 4; + } + + if (i >= 3) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ONE; + b[ 3] = data15; + b += 4; + } +#else + data01 = *(ao1 + 0); + data05 = *(ao2 + 0); + data09 = *(ao3 + 0); + data13 = *(ao4 + 0); + + if (i >= 2) { + data06 = *(ao2 + 1); + data10 = *(ao3 + 1); + data14 = *(ao4 + 1); + } + + if (i >= 3) { + data11 = *(ao3 + 2); + data15 = *(ao4 + 2); + } + + b[ 0] = data01; + b[ 1] = data05; + b[ 2] = data09; + b[ 3] = data13; + b += 4; + + if(i >= 2) { + b[ 0] = ZERO; + b[ 1] = data06; + b[ 2] = data10; + b[ 3] = data14; + b += 4; + } + + if (i >= 3) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = data11; + b[ 3] = data15; + b += 4; + } +#endif + } + } + + posY += 4; + js --; + } while (js > 0); + } /* End of main loop */ + + + if (n & 2){ + X = posX; + + if (posX <= posY) { + ao1 = a + posY + (posX + 0) * lda; + ao2 = a + posY + (posX + 1) * lda; + } else { + ao1 = a + posX + (posY + 0) * lda; + ao2 = a + posX + (posY + 1) * lda; + } + + i = (m >> 1); + if (i > 0) { + do { + if (X > posY) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data05 = *(ao2 + 0); + data06 = *(ao2 + 1); + + b[ 0] = data01; + b[ 1] = data05; + b[ 2] = data02; + b[ 3] = data06; + + ao1 += 2; + ao2 += 2; + b += 4; + + } else + if (X < posY) { + ao1 += 2 * lda; + ao2 += 2 * lda; + b += 4; + } else { +#ifdef UNIT + data02 = *(ao1 + 1); + + b[ 0] = ONE; + b[ 1] = ZERO; + b[ 2] = data02; + b[ 3] = ONE; +#else + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data06 = *(ao2 + 1); + + b[ 0] = data01; + b[ 1] = ZERO; + b[ 2] = data02; + b[ 3] = data06; +#endif + ao1 += 2; + ao2 += 2; + + b += 4; + } + + X += 2; + i --; + } while (i > 0); + } + + i = (m & 1); + if (i) { + + if (X > posY) { + data01 = *(ao1 + 0); + data02 = *(ao2 + 0); + b[ 0] 
= data01; + b[ 1] = data02; + + ao1 += 1; + ao2 += 1; + b += 2; + } else + if (X < posY) { + ao1 += lda; + b += 2; + } else { +#ifdef UNIT + data05 = *(ao2 + 0); + + b[ 0] = ONE; + b[ 1] = data05; +#else + data01 = *(ao1 + 0); + data05 = *(ao2 + 0); + + b[ 0] = data01; + b[ 1] = data05; +#endif + b += 2; + } + } + posY += 2; + } + + if (n & 1){ + X = posX; + + if (posX <= posY) { + ao1 = a + posY + (posX + 0) * lda; + } else { + ao1 = a + posX + (posY + 0) * lda; + } + + i = m; + if (i > 0) { + do { + if (X > posY) { + data01 = *(ao1 + 0); + b[ 0] = data01; + b += 1; + ao1 += 1; + } else + if (X < posY) { + b += 1; + ao1 += lda; + } else { +#ifdef UNIT + b[ 0] = ONE; +#else + data01 = *(ao1 + 0); + b[ 0] = data01; +#endif + b += 1; + ao1 += 1; + } + + X ++; + i --; + } while (i > 0); + } + + posY += 1; + } + + return 0; +} diff --git a/kernel/generic/trmm_ltcopy_6.c b/kernel/generic/trmm_ltcopy_6.c new file mode 100644 index 000000000..69a233be6 --- /dev/null +++ b/kernel/generic/trmm_ltcopy_6.c @@ -0,0 +1,488 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, js; + BLASLONG X; + + FLOAT data01, data02, data03, data04, data05, data06, data07, data08; + FLOAT data09, data10, data11, data12, data13, data14, data15, data16; + FLOAT *ao1, *ao2, *ao3, *ao4; + + js = (n >> 2); + + if (js > 0){ + do { + X = posX; + + if (posX <= posY) { + ao1 = a + posY + (posX + 0) * lda; + ao2 = a + posY + (posX + 1) * lda; + ao3 = a + posY + (posX + 2) * lda; + ao4 = a + posY + (posX + 3) * lda; + } else { + ao1 = a + posX + (posY + 0) * lda; + ao2 = a + posX + (posY + 1) * lda; + ao3 = a + posX + (posY + 2) * lda; + ao4 = a + posX + (posY + 3) * lda; + } + + i = (m >> 2); + if (i > 0) { + do { + if (X > posY) { + ao1 += 4; + ao2 += 4; + ao3 += 4; + ao4 += 4; + b += 16; + + } else + if (X < posY) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao1 + 2); + data04 = *(ao1 + 3); + + data05 = *(ao2 + 0); + data06 = *(ao2 + 1); + data07 = *(ao2 + 2); + data08 = *(ao2 + 3); + + data09 = *(ao3 + 0); + data10 = *(ao3 + 1); + data11 = *(ao3 + 2); + data12 = *(ao3 + 3); + + data13 = *(ao4 + 0); + data14 = *(ao4 + 1); + data15 = *(ao4 + 2); + data16 = *(ao4 + 3); + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = data08; + + b[ 8] = data09; + b[ 9] = data10; + b[10] = data11; + b[11] = data12; + b[12] = data13; + b[13] = data14; + b[14] = data15; + b[15] = data16; + + ao1 += 4 * lda; + ao2 += 4 * lda; + ao3 += 4 * lda; + ao4 += 4 * lda; + b += 16; + + } else { + +#ifdef UNIT + data02 = *(ao1 + 1); + data03 = *(ao1 + 2); + data04 = *(ao1 + 3); + + data07 = *(ao2 + 2); + data08 = *(ao2 + 3); + + data12 = *(ao3 + 3); + + b[ 0] = ONE; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + + b[ 4] = ZERO; + b[ 5] = ONE; + b[ 6] = data07; + b[ 7] = data08; + + b[ 8] = ZERO; + b[ 9] = ZERO; + b[10] = ONE; + b[11] = data12; + + b[12] = ZERO; + b[13] = ZERO; + b[14] = ZERO; + b[15] = ONE; +#else + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao1 + 2); + data04 = *(ao1 + 3); + + data06 = *(ao2 + 1); + data07 = *(ao2 + 2); + data08 = *(ao2 + 3); + + data11 = *(ao3 + 2); + data12 = *(ao3 + 3); + + data16 = *(ao4 + 3); + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = ZERO; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = data08; + + b[ 8] = ZERO; + b[ 9] = ZERO; + b[10] = data11; + b[11] = data12; + b[12] = ZERO; + b[13] = ZERO; + b[14] = ZERO; + b[15] = data16; +#endif + ao1 += 4; + ao2 += 4; + ao3 += 4; + ao4 += 4; + b += 16; + } + + X += 4; + i --; + } while (i > 0); + } + + i = (m & 3); + if (i) { + + if (X > posY) { + + if (m & 2) { + ao1 += 2; + ao2 += 2; + ao3 += 2; + ao4 += 2; + b += 8; + } + + if (m & 1) { + ao1 += 1; + ao2 += 1; + ao3 += 1; + ao4 += 1; + b += 4; + } + + } else + if (X < posY) { + if (m & 2) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao1 + 2); + data04 = *(ao1 + 3); + data05 = *(ao2 + 0); + data06 = *(ao2 + 1); + data07 = *(ao2 + 2); + data08 = *(ao2 + 3); + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = data08; + + ao1 += 2 * lda; + ao2 += 2 * lda; + + b += 8; + } + + if (m & 1) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao1 + 2); + data04 = *(ao1 + 3); + + b[ 0] = data01; + 
b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + + ao1 += lda; + b += 4; + } + + } else { + +#ifdef UNIT + data02 = *(ao1 + 1); + data03 = *(ao1 + 2); + data04 = *(ao1 + 3); + + if (i >= 2) { + data07 = *(ao2 + 2); + data08 = *(ao2 + 3); + } + + if (i >= 3) { + data12 = *(ao3 + 3); + } + + b[ 0] = ONE; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b += 4; + + if(i >= 2) { + b[ 0] = ZERO; + b[ 1] = ONE; + b[ 2] = data07; + b[ 3] = data08; + b += 4; + } + + if (i >= 3) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ONE; + b[ 3] = data12; + b += 4; + } +#else + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao1 + 2); + data04 = *(ao1 + 3); + + if (i >= 2) { + data06 = *(ao2 + 1); + data07 = *(ao2 + 2); + data08 = *(ao2 + 3); + } + + if (i >= 3) { + data11 = *(ao3 + 2); + data12 = *(ao3 + 3); + } + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b += 4; + + if(i >= 2) { + b[ 0] = ZERO; + b[ 1] = data06; + b[ 2] = data07; + b[ 3] = data08; + b += 4; + } + + if (i >= 3) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = data11; + b[ 3] = data12; + b += 4; + } +#endif + } + } + + posY += 4; + js --; + } while (js > 0); + } /* End of main loop */ + + + if (n & 2){ + X = posX; + + if (posX <= posY) { + ao1 = a + posY + (posX + 0) * lda; + ao2 = a + posY + (posX + 1) * lda; + } else { + ao1 = a + posX + (posY + 0) * lda; + ao2 = a + posX + (posY + 1) * lda; + } + + i = (m >> 1); + if (i > 0) { + do { + if (X > posY) { + ao1 += 2; + ao2 += 2; + b += 4; + + } else + if (X < posY) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data05 = *(ao2 + 0); + data06 = *(ao2 + 1); + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data05; + b[ 3] = data06; + + ao1 += 2 * lda; + ao2 += 2 * lda; + b += 4; + } else { +#ifdef UNIT + data02 = *(ao1 + 1); + + b[ 0] = ONE; + b[ 1] = data02; + b[ 2] = ZERO; + b[ 3] = ONE; +#else + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data06 = *(ao2 + 1); + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = ZERO; + b[ 3] = data06; +#endif + ao1 += 2; + ao2 += 2; + b += 4; + } + + X += 2; + i --; + } while (i > 0); + } + + i = (m & 1); + if (i) { + + if (X > posY) { + ao1 += 1; + ao2 += 1; + + b += 2; + } else + if (X < posY) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + + b[ 0] = data01; + b[ 1] = data02; + ao1 += lda; + b += 2; + } else { +#ifdef UNIT + data02 = *(ao1 + 1); + + b[ 0] = ONE; + b[ 1] = data02; +#else + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + + b[ 0] = data01; + b[ 1] = data02; +#endif + b += 2; + } + } + posY += 2; + } + + if (n & 1){ + X = posX; + + if (posX <= posY) { + ao1 = a + posY + (posX + 0) * lda; + } else { + ao1 = a + posX + (posY + 0) * lda; + } + + i = m; + if (i > 0) { + do { + if (X > posY) { + b += 1; + ao1 += 1; + } else + if (X < posY) { + data01 = *(ao1 + 0); + b[ 0] = data01; + ao1 += lda; + b += 1; + } else { +#ifdef UNIT + b[ 0] = ONE; +#else + data01 = *(ao1 + 0); + b[ 0] = data01; +#endif + ao1 += 1; + b += 1; + } + + X ++; + i --; + } while (i > 0); + } + + posY += 1; + } + + return 0; +} diff --git a/kernel/generic/trmm_uncopy_6.c b/kernel/generic/trmm_uncopy_6.c new file mode 100644 index 000000000..70945a246 --- /dev/null +++ b/kernel/generic/trmm_uncopy_6.c @@ -0,0 +1,785 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. 
*/ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, js; + BLASLONG X, mm; + + FLOAT data01, data02, data03, data04, data05, data06; + FLOAT data07, data08, data09, data10, data11, data12; + FLOAT data13, data14, data15, data16, data17, data18; + FLOAT data19, data20, data21, data22, data23, data24; + FLOAT data25, data26, data27, data28, data29, data30; + FLOAT data31, data32, data33, data34, data35, data36; + + FLOAT *ao1, *ao2, *ao3, *ao4, *ao5, *ao6; + + //js = (n >> 2); + js = n/6; + if (js > 0){ + do { + X = posX; + + if (posX <= posY) { + ao1 = a + posX + (posY + 0) * lda; + ao2 = a + posX + (posY + 1) * lda; + ao3 = a + posX + (posY + 2) * lda; + ao4 = a + posX + (posY + 3) * lda; + ao5 = a + posX + (posY + 4) * lda; + ao6 = a + posX + (posY + 5) * lda; + } else { + ao1 = a + posY + (posX + 0) * lda; + ao2 = a + posY + (posX + 1) * lda; + ao3 = a + posY + (posX + 2) * lda; + ao4 = a + posY + (posX + 3) * lda; + ao5 = a + posY + (posX + 4) * lda; + ao6 = a + posY + (posX + 5) * lda; + } + + i = m/6; + if (i > 0) { + do { + if (X < posY) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao1 + 2); + data04 = *(ao1 + 3); + data05 = *(ao1 + 4); + data06 = *(ao1 + 5); + + data07 = *(ao2 + 0); + data08 = *(ao2 + 1); + data09 = *(ao2 + 2); + data10 = *(ao2 + 3); + data11 = *(ao2 + 4); + data12 = *(ao2 + 5); + + data13 = *(ao3 + 0); + data14 = *(ao3 + 1); + data15 = *(ao3 + 2); + data16 = *(ao3 + 3); + data17 = *(ao3 + 4); + data18 = *(ao3 + 5); + + data19 = *(ao4 + 0); + data20 = *(ao4 + 1); + data21 = *(ao4 + 2); + data22 = *(ao4 + 3); + data23 = *(ao4 + 4); + data24 = *(ao4 + 5); + + data25 = *(ao5 + 0); + data26 = *(ao5 + 1); + data27 = 
*(ao5 + 2);
+ data28 = *(ao5 + 3);
+ data29 = *(ao5 + 4);
+ data30 = *(ao5 + 5);
+
+ data31 = *(ao6 + 0);
+ data32 = *(ao6 + 1);
+ data33 = *(ao6 + 2);
+ data34 = *(ao6 + 3);
+ data35 = *(ao6 + 4);
+ data36 = *(ao6 + 5);
+
+ b[ 0] = data01;
+ b[ 1] = data07;
+ b[ 2] = data13;
+ b[ 3] = data19;
+ b[ 4] = data25;
+ b[ 5] = data31;
+
+ b[ 6] = data02;
+ b[ 7] = data08;
+ b[ 8] = data14;
+ b[ 9] = data20;
+ b[10] = data26;
+ b[11] = data32;
+
+ b[12] = data03;
+ b[13] = data09;
+ b[14] = data15;
+ b[15] = data21;
+ b[16] = data27;
+ b[17] = data33;
+
+ b[18] = data04;
+ b[19] = data10;
+ b[20] = data16;
+ b[21] = data22;
+ b[22] = data28;
+ b[23] = data34;
+
+ b[24] = data05;
+ b[25] = data11;
+ b[26] = data17;
+ b[27] = data23;
+ b[28] = data29;
+ b[29] = data35;
+
+ b[30] = data06;
+ b[31] = data12;
+ b[32] = data18;
+ b[33] = data24;
+ b[34] = data30;
+ b[35] = data36;
+
+ ao1 += 6;
+ ao2 += 6;
+ ao3 += 6;
+ ao4 += 6;
+ ao5 += 6;
+ ao6 += 6;
+ b += 36;
+ } else
+ if (X > posY) {
+ b[ 0] = ZERO;
+ b[ 1] = ZERO;
+ b[ 2] = ZERO;
+ b[ 3] = ZERO;
+ b[ 4] = ZERO;
+ b[ 5] = ZERO;
+ b[ 6] = ZERO;
+ b[ 7] = ZERO;
+ b[ 8] = ZERO;
+ b[ 9] = ZERO;
+ b[10] = ZERO;
+ b[11] = ZERO;
+ b[12] = ZERO;
+ b[13] = ZERO;
+ b[14] = ZERO;
+ b[15] = ZERO;
+ b[16] = ZERO;
+ b[17] = ZERO;
+ b[18] = ZERO;
+ b[19] = ZERO;
+ b[20] = ZERO;
+ b[21] = ZERO;
+ b[22] = ZERO;
+ b[23] = ZERO;
+ b[24] = ZERO;
+ b[25] = ZERO;
+ b[26] = ZERO;
+ b[27] = ZERO;
+ b[28] = ZERO;
+ b[29] = ZERO;
+ b[30] = ZERO;
+ b[31] = ZERO;
+ b[32] = ZERO;
+ b[33] = ZERO;
+ b[34] = ZERO;
+ b[35] = ZERO;
+
+ ao1 += 6 * lda;
+ ao2 += 6 * lda;
+ ao3 += 6 * lda;
+ ao4 += 6 * lda;
+ ao5 += 6 * lda;
+ ao6 += 6 * lda;
+
+ b += 36;
+ } else {
+ data01 = *(ao1 + 0);
+ data07 = *(ao2 + 0);
+ data13 = *(ao3 + 0);
+ data19 = *(ao4 + 0);
+ data25 = *(ao5 + 0);
+ data31 = *(ao6 + 0);
+
+ data08 = *(ao2 + 1);
+ data14 = *(ao3 + 1);
+ data20 = *(ao4 + 1);
+ data26 = *(ao5 + 1);
+ data32 = *(ao6 + 1);
+
+ data15 = *(ao3 + 2);
+ data21 = *(ao4 + 2);
+ data27 = *(ao5 + 2);
+ data33 = *(ao6 + 2);
+
+ data22 = *(ao4 + 3);
+ data28 = *(ao5 + 3);
+ data34 = *(ao6 + 3);
+
+ data29 = *(ao5 + 4);
+ data35 = *(ao6 + 4);
+
+ data36 = *(ao6 + 5);
+
+#ifdef UNIT
+ b[ 0] = ONE;
+ b[ 1] = data07;
+ b[ 2] = data13;
+ b[ 3] = data19;
+ b[ 4] = data25;
+ b[ 5] = data31;
+
+ b[ 6] = ZERO;
+ b[ 7] = ONE;
+ b[ 8] = data14;
+ b[ 9] = data20;
+ b[10] = data26;
+ b[11] = data32;
+
+ b[12] = ZERO;
+ b[13] = ZERO;
+ b[14] = ONE;
+ b[15] = data21;
+ b[16] = data27;
+ b[17] = data33;
+
+ b[18] = ZERO;
+ b[19] = ZERO;
+ b[20] = ZERO;
+ b[21] = ONE;
+ b[22] = data28;
+ b[23] = data34;
+
+ b[24] = ZERO;
+ b[25] = ZERO;
+ b[26] = ZERO;
+ b[27] = ZERO;
+ b[28] = ONE;
+ b[29] = data35;
+
+ b[30] = ZERO;
+ b[31] = ZERO;
+ b[32] = ZERO;
+ b[33] = ZERO;
+ b[34] = ZERO;
+ b[35] = ONE;
+#else
+ b[ 0] = data01;
+ b[ 1] = data07;
+ b[ 2] = data13;
+ b[ 3] = data19;
+ b[ 4] = data25;
+ b[ 5] = data31;
+
+ b[ 6] = ZERO;
+ b[ 7] = data08;
+ b[ 8] = data14;
+ b[ 9] = data20;
+ b[10] = data26;
+ b[11] = data32;
+
+ b[12] = ZERO;
+ b[13] = ZERO;
+ b[14] = data15;
+ b[15] = data21;
+ b[16] = data27;
+ b[17] = data33;
+
+ b[18] = ZERO;
+ b[19] = ZERO;
+ b[20] = ZERO;
+ b[21] = data22;
+ b[22] = data28;
+ b[23] = data34;
+
+ b[24] = ZERO;
+ b[25] = ZERO;
+ b[26] = ZERO;
+ b[27] = ZERO;
+ b[28] = data29;
+ b[29] = data35;
+
+ b[30] = ZERO;
+ b[31] = ZERO;
+ b[32] = ZERO;
+ b[33] = ZERO;
+ b[34] = ZERO;
+ b[35] = data36;
+#endif
+
+ ao1 += 6;
+ ao2 += 6;
+ ao3 += 6;
+ ao4 += 6;
+ ao5 += 6;
+ ao6 += 6;
+
+ b += 36;
+ }
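+ /* End of one 6x6 tile of the packed panel: tiles with X < posY
+    are copied (transposed), tiles with X > posY are zero-filled,
+    and the diagonal tile keeps only its upper triangle, e.g. the
+    packed rows a11 a12 .. a16 / 0 a22 .. a26 / ..., with ONE on
+    the diagonal in UNIT builds.  The remainder code below still
+    follows the 4-wide layout of the 4-unroll copies (b += 16,
+    posY += 4); edge cases are not handled yet, as noted in the
+    commit message. */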
+ X += 6; + i --; + } while (i > 0); + } + mm = m - m/6; + if (mm & 4) { + if (X < posY) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao1 + 2); + data04 = *(ao1 + 3); + + data05 = *(ao2 + 0); + data06 = *(ao2 + 1); + data07 = *(ao2 + 2); + data08 = *(ao2 + 3); + + data09 = *(ao3 + 0); + data10 = *(ao3 + 1); + data11 = *(ao3 + 2); + data12 = *(ao3 + 3); + + data13 = *(ao4 + 0); + data14 = *(ao4 + 1); + data15 = *(ao4 + 2); + data16 = *(ao4 + 3); + + b[ 0] = data01; + b[ 1] = data05; + b[ 2] = data09; + b[ 3] = data13; + b[ 4] = data02; + b[ 5] = data06; + b[ 6] = data10; + b[ 7] = data14; + + b[ 8] = data03; + b[ 9] = data07; + b[10] = data11; + b[11] = data15; + b[12] = data04; + b[13] = data08; + b[14] = data12; + b[15] = data16; + + ao1 += 4; + ao2 += 4; + ao3 += 4; + ao4 += 4; + b += 16; + } else + if (X > posY) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b[ 8] = ZERO; + b[ 9] = ZERO; + b[10] = ZERO; + b[11] = ZERO; + b[12] = ZERO; + b[13] = ZERO; + b[14] = ZERO; + b[15] = ZERO; + b[16] = ZERO; + b[17] = ZERO; + b[18] = ZERO; + b[19] = ZERO; + b[20] = ZERO; + b[21] = ZERO; + b[22] = ZERO; + b[23] = ZERO; + + ao1 += 4 * lda; + ao2 += 4 * lda; + ao3 += 4 * lda; + ao4 += 4 * lda; + + b += 16; + } else { +#ifdef UNIT + data05 = *(ao2 + 0); + + data09 = *(ao3 + 0); + data10 = *(ao3 + 1); + + data13 = *(ao4 + 0); + data14 = *(ao4 + 1); + data15 = *(ao4 + 2); + + b[ 0] = ONE; + b[ 1] = data05; + b[ 2] = data09; + b[ 3] = data13; + + b[ 4] = ZERO; + b[ 5] = ONE; + b[ 6] = data10; + b[ 7] = data14; + + b[ 8] = ZERO; + b[ 9] = ZERO; + b[10] = ONE; + b[11] = data15; + + b[12] = ZERO; + b[13] = ZERO; + b[14] = ZERO; + b[15] = ONE; +#else + data01 = *(ao1 + 0); + + data05 = *(ao2 + 0); + data06 = *(ao2 + 1); + + data09 = *(ao3 + 0); + data10 = *(ao3 + 1); + data11 = *(ao3 + 2); + + data13 = *(ao4 + 0); + data14 = *(ao4 + 1); + data15 = *(ao4 + 2); + data16 = *(ao4 + 3); + + b[ 0] = data01; + b[ 1] = data05; + b[ 2] = data09; + b[ 3] = data13; + + b[ 4] = ZERO; + b[ 5] = data06; + b[ 6] = data10; + b[ 7] = data14; + + b[ 8] = ZERO; + b[ 9] = ZERO; + b[10] = data11; + b[11] = data15; + + b[12] = ZERO; + b[13] = ZERO; + b[14] = ZERO; + b[15] = data16; +#endif + ao1 += 4; + ao2 += 4; + ao3 += 4; + ao4 += 4; + + b += 16; + } + X += 4; + } + + if (mm & 3) { + if (X < posY) { + if (mm & 2) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao2 + 0); + data04 = *(ao2 + 1); + data05 = *(ao3 + 0); + data06 = *(ao3 + 1); + data07 = *(ao4 + 0); + data08 = *(ao4 + 1); + + b[ 0] = data01; + b[ 1] = data03; + b[ 2] = data05; + b[ 3] = data07; + b[ 4] = data02; + b[ 5] = data04; + b[ 6] = data06; + b[ 7] = data08; + + ao1 += 2; + ao2 += 2; + ao3 += 2; + ao4 += 2; + b += 8; + } + + if (mm & 1) { + data01 = *(ao1 + 0); + data03 = *(ao2 + 0); + data05 = *(ao3 + 0); + data07 = *(ao4 + 0); + + b[ 0] = data01; + b[ 1] = data03; + b[ 2] = data05; + b[ 3] = data07; + + ao1 += 1; + ao2 += 1; + ao3 += 1; + ao4 += 1; + b += 4; + } + + } else + if (X > posY) { + if (m & 2) { + ao1 += 2 * lda; + ao2 += 2 * lda; + b += 8; + } + + if (m & 1) { + ao1 += lda; + b += 4; + } + + } else { +#ifdef UNIT + data05 = *(ao2 + 0); + data09 = *(ao3 + 0); + data13 = *(ao4 + 0); + + if (i >= 2) { + data10 = *(ao3 + 1); + data14 = *(ao4 + 1); + } + + if (i >= 3) { + data15 = *(ao4 + 2); + } + + b[ 0] = ONE; + b[ 1] = data05; + b[ 2] = data09; + b[ 3] = data13; + b += 4; + + if(i >= 2) { + b[ 0] = ZERO; + b[ 1] = ONE; + b[ 2] = 
data10; + b[ 3] = data14; + b += 4; + } + + if (i >= 3) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ONE; + b[ 3] = data15; + b += 4; + } +#else + data01 = *(ao1 + 0); + data05 = *(ao2 + 0); + data09 = *(ao3 + 0); + data13 = *(ao4 + 0); + + if (i >= 2) { + data06 = *(ao2 + 1); + data10 = *(ao3 + 1); + data14 = *(ao4 + 1); + } + + if (i >= 3) { + data11 = *(ao3 + 2); + data15 = *(ao4 + 2); + } + + b[ 0] = data01; + b[ 1] = data05; + b[ 2] = data09; + b[ 3] = data13; + b += 4; + + if(i >= 2) { + b[ 0] = ZERO; + b[ 1] = data06; + b[ 2] = data10; + b[ 3] = data14; + b += 4; + } + + if (i >= 3) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = data11; + b[ 3] = data15; + b += 4; + } +#endif + } + } + + posY += 4; + js --; + } while (js > 0); + } /* End of main loop */ + + if (n & 2){ + X = posX; + + if (posX <= posY) { + ao1 = a + posX + (posY + 0) * lda; + ao2 = a + posX + (posY + 1) * lda; + } else { + ao1 = a + posY + (posX + 0) * lda; + ao2 = a + posY + (posX + 1) * lda; + } + + i = (m >> 1); + if (i > 0) { + do { + if (X < posY) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data05 = *(ao2 + 0); + data06 = *(ao2 + 1); + + b[ 0] = data01; + b[ 1] = data05; + b[ 2] = data02; + b[ 3] = data06; + + ao1 += 2; + ao2 += 2; + b += 4; + + } else + if (X > posY) { + ao1 += 2 * lda; + ao2 += 2 * lda; + b += 4; + + } else { +#ifdef UNIT + data05 = *(ao2 + 0); + + b[ 0] = ONE; + b[ 1] = data05; + b[ 2] = ZERO; + b[ 3] = ONE; +#else + data01 = *(ao1 + 0); + data05 = *(ao2 + 0); + data06 = *(ao2 + 1); + + b[ 0] = data01; + b[ 1] = data05; + b[ 2] = ZERO; + b[ 3] = data06; +#endif + + ao1 += 2 * lda; + ao2 += 2 * lda; + + b += 4; + } + + X += 2; + i --; + } while (i > 0); + } + + i = (m & 1); + if (i) { + + if (X < posY) { + data01 = *(ao1 + 0); + data05 = *(ao2 + 0); + + b[ 0] = data01; + b[ 1] = data05; + ao1 += 1; + ao2 += 1; + b += 2; + } else + if (X > posY) { + ao1 += lda; + ao2 += lda; + b += 2; + } else { +#ifdef UNIT + data05 = *(ao2 + 0); + b[ 0] = ONE; + b[ 1] = data05; +#else + data01 = *(ao1 + 0); + data05 = *(ao2 + 0); + + b[ 0] = data01; + b[ 1] = data05; +#endif + ao1 += lda; + ao2 += lda; + b += 2; + } + } + + posY += 2; + } + + if (n & 1){ + X = posX; + + if (posX <= posY) { + ao1 = a + posX + (posY + 0) * lda; + } else { + ao1 = a + posY + (posX + 0) * lda; + } + + i = m; + if (m > 0) { + do { + if (X < posY) { + data01 = *(ao1 + 0); + b[ 0] = data01; + ao1 += 1; + b += 1; + } else + if (X > posY) { + ao1 += lda; + b += 1; + } else { +#ifdef UNIT + b[ 0] = ONE; +#else + data01 = *(ao1 + 0); + b[ 0] = data01; +#endif + ao1 += lda; + b += 1; + } + + X += 1; + i --; + } while (i > 0); + } + } + + return 0; +} diff --git a/kernel/generic/trmm_utcopy_6.c b/kernel/generic/trmm_utcopy_6.c new file mode 100644 index 000000000..7d4dba34b --- /dev/null +++ b/kernel/generic/trmm_utcopy_6.c @@ -0,0 +1,472 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. 
Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, js; + BLASLONG X; + + FLOAT data01, data02, data03, data04, data05, data06, data07, data08; + FLOAT data09, data10, data11, data12, data13, data14, data15, data16; + FLOAT *ao1, *ao2, *ao3, *ao4; + + js = (n >> 2); + + if (js > 0){ + do { + X = posX; + + if (posX <= posY) { + ao1 = a + posX + (posY + 0) * lda; + ao2 = a + posX + (posY + 1) * lda; + ao3 = a + posX + (posY + 2) * lda; + ao4 = a + posX + (posY + 3) * lda; + } else { + ao1 = a + posY + (posX + 0) * lda; + ao2 = a + posY + (posX + 1) * lda; + ao3 = a + posY + (posX + 2) * lda; + ao4 = a + posY + (posX + 3) * lda; + } + + i = (m >> 2); + if (i > 0) { + do { + if (X < posY) { + ao1 += 4; + ao2 += 4; + ao3 += 4; + ao4 += 4; + b += 16; + } else + if (X > posY) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao1 + 2); + data04 = *(ao1 + 3); + + data05 = *(ao2 + 0); + data06 = *(ao2 + 1); + data07 = *(ao2 + 2); + data08 = *(ao2 + 3); + + data09 = *(ao3 + 0); + data10 = *(ao3 + 1); + data11 = *(ao3 + 2); + data12 = *(ao3 + 3); + + data13 = *(ao4 + 0); + data14 = *(ao4 + 1); + data15 = *(ao4 + 2); + data16 = *(ao4 + 3); + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = data08; + + b[ 8] = data09; + b[ 9] = data10; + b[10] = data11; + b[11] = data12; + b[12] = data13; + b[13] = data14; + b[14] = data15; + b[15] = data16; + + ao1 += 4 * lda; + ao2 += 4 * lda; + ao3 += 4 * lda; + ao4 += 4 * lda; + b += 16; + + } else { +#ifdef UNIT + data05 = *(ao2 + 0); + data09 = *(ao3 + 0); + data10 = *(ao3 + 1); + data13 = *(ao4 + 0); + data14 = *(ao4 + 1); + data15 = *(ao4 + 2); + + b[ 0] = ONE; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + + b[ 4] = data05; + b[ 5] = ONE; + b[ 6] = ZERO; + b[ 7] = ZERO; + + b[ 8] = data09; + b[ 9] = data10; + b[10] = ONE; + b[11] = ZERO; + + b[12] = data13; + b[13] = data14; + b[14] = data15; + b[15] = ONE; +#else + data01 = *(ao1 + 0); + data05 = *(ao2 + 0); 
+ data06 = *(ao2 + 1); + data09 = *(ao3 + 0); + data10 = *(ao3 + 1); + data11 = *(ao3 + 2); + data13 = *(ao4 + 0); + data14 = *(ao4 + 1); + data15 = *(ao4 + 2); + data16 = *(ao4 + 3); + + b[ 0] = data01; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = ZERO; + b[ 7] = ZERO; + + b[ 8] = data09; + b[ 9] = data10; + b[10] = data11; + b[11] = ZERO; + + b[12] = data13; + b[13] = data14; + b[14] = data15; + b[15] = data16; +#endif + + ao1 += 4 * lda; + ao2 += 4 * lda; + ao3 += 4 * lda; + ao4 += 4 * lda; + + b += 16; + } + + X += 4; + i --; + } while (i > 0); + } + + i = (m & 3); + if (i) { + + if (X < posY) { + + if (m & 2) { + ao1 += 2; + ao2 += 2; + ao3 += 2; + ao4 += 2; + b += 8; + } + + if (m & 1) { + ao1 += 1; + ao2 += 1; + ao3 += 1; + ao4 += 1; + b += 4; + } + + } else + if (X > posY) { + if (m & 2) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao1 + 2); + data04 = *(ao1 + 3); + data05 = *(ao2 + 0); + data06 = *(ao2 + 1); + data07 = *(ao2 + 2); + data08 = *(ao2 + 3); + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = data08; + + ao1 += 2 * lda; + ao2 += 2 * lda; + b += 8; + } + + if (m & 1) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao1 + 2); + data04 = *(ao1 + 3); + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + + ao1 += lda; + b += 4; + } + + } else { + +#ifdef UNIT + if (i >= 2) { + data05 = *(ao2 + 0); + } + + if (i >= 3) { + data09 = *(ao3 + 0); + data10 = *(ao3 + 1); + } + + b[ 0] = ONE; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b += 4; + + if(i >= 2) { + b[ 0] = data05; + b[ 1] = ONE; + b[ 2] = ZERO; + b[ 3] = ZERO; + b += 4; + } + + if (i >= 3) { + b[ 0] = data09; + b[ 1] = data10; + b[ 2] = ONE; + b[ 3] = ZERO; + b += 4; + } +#else + data01 = *(ao1 + 0); + + if (i >= 2) { + data05 = *(ao2 + 0); + data06 = *(ao2 + 1); + } + + if (i >= 3) { + data09 = *(ao3 + 0); + data10 = *(ao3 + 1); + data11 = *(ao3 + 2); + } + + b[ 0] = data01; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b += 4; + + if(i >= 2) { + b[ 0] = data05; + b[ 1] = data06; + b[ 2] = ZERO; + b[ 3] = ZERO; + b += 4; + } + + if (i >= 3) { + b[ 0] = data09; + b[ 1] = data10; + b[ 2] = data11; + b[ 3] = ZERO; + b += 4; + } +#endif + } + } + + posY += 4; + js --; + } while (js > 0); + } /* End of main loop */ + + if (n & 2){ + X = posX; + + if (posX <= posY) { + ao1 = a + posX + (posY + 0) * lda; + ao2 = a + posX + (posY + 1) * lda; + } else { + ao1 = a + posY + (posX + 0) * lda; + ao2 = a + posY + (posX + 1) * lda; + } + + i = (m >> 1); + if (i > 0) { + do { + if (X < posY) { + ao1 += 2; + ao2 += 2; + b += 4; + + } else + if (X > posY) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data05 = *(ao2 + 0); + data06 = *(ao2 + 1); + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data05; + b[ 3] = data06; + + ao1 += 2 * lda; + ao2 += 2 * lda; + b += 4; + } else { +#ifdef UNIT + data05 = *(ao2 + 0); + + b[ 0] = ONE; + b[ 1] = ZERO; + b[ 2] = data05; + b[ 3] = ONE; +#else + data01 = *(ao1 + 0); + data05 = *(ao2 + 0); + data06 = *(ao2 + 1); + + b[ 0] = data01; + b[ 1] = ZERO; + b[ 2] = data05; + b[ 3] = data06; + +#endif + ao1 += 2 * lda; + ao2 += 2 * lda; + b += 4; + } + + X += 2; + i --; + } while (i > 0); + } + + i = (m & 1); + if (i) { + + if (X < posY) { + ao1 += 2; + b += 2; + } else + if (X > posY) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + + b[ 0] = data01; + b[ 1] = data02; + + ao1 += lda; + b += 2; + } else { 
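+	/* X == posY: the single leftover row meets the diagonal of this
+	   2-column panel, so emit the diagonal element (ONE when UNIT)
+	   and a ZERO for the other entry. */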
+#ifdef UNIT + b[ 0] = ONE; + b[ 1] = ZERO; +#else + data01 = *(ao1 + 0); + + b[ 0] = data01; + b[ 1] = ZERO; +#endif + b += 2; + } + } + posY += 2; + } + + if (n & 1){ + X = posX; + + if (posX <= posY) { + ao1 = a + posX + (posY + 0) * lda; + } else { + ao1 = a + posY + (posX + 0) * lda; + } + + i = m; + if (m > 0) { + do { + + if (X < posY) { + b += 1; + ao1 += 1; + } else + if (X > posY) { + data01 = *(ao1 + 0); + b[ 0] = data01; + ao1 += lda; + b += 1; + } else { +#ifdef UNIT + b[ 0] = ONE; +#else + data01 = *(ao1 + 0); + b[ 0] = data01; +#endif + ao1 += lda; + b += 1; + } + + X += 1; + i --; + } while (i > 0); + } + } + + return 0; +} diff --git a/kernel/generic/trsm_kernel_LN.c b/kernel/generic/trsm_kernel_LN.c index 068a202b8..931cba377 100644 --- a/kernel/generic/trsm_kernel_LN.c +++ b/kernel/generic/trsm_kernel_LN.c @@ -58,6 +58,10 @@ static FLOAT dm1 = -1.; #define GEMM_UNROLL_M_SHIFT 2 #endif +#if GEMM_DEFAULT_UNROLL_M == 6 +#define GEMM_UNROLL_M_SHIFT 2 +#endif + #if GEMM_DEFAULT_UNROLL_M == 8 #define GEMM_UNROLL_M_SHIFT 3 #endif diff --git a/kernel/generic/trsm_kernel_LT.c b/kernel/generic/trsm_kernel_LT.c index 300fdd483..099624252 100644 --- a/kernel/generic/trsm_kernel_LT.c +++ b/kernel/generic/trsm_kernel_LT.c @@ -58,6 +58,10 @@ static FLOAT dm1 = -1.; #define GEMM_UNROLL_M_SHIFT 2 #endif +#if GEMM_DEFAULT_UNROLL_M == 6 +#define GEMM_UNROLL_M_SHIFT 2 +#endif + #if GEMM_DEFAULT_UNROLL_M == 8 #define GEMM_UNROLL_M_SHIFT 3 #endif diff --git a/kernel/generic/trsm_kernel_RN.c b/kernel/generic/trsm_kernel_RN.c index b85c3c1e9..d7e650e0c 100644 --- a/kernel/generic/trsm_kernel_RN.c +++ b/kernel/generic/trsm_kernel_RN.c @@ -58,6 +58,10 @@ static FLOAT dm1 = -1.; #define GEMM_UNROLL_M_SHIFT 2 #endif +#if GEMM_DEFAULT_UNROLL_M == 6 +#define GEMM_UNROLL_M_SHIFT 2 +#endif + #if GEMM_DEFAULT_UNROLL_M == 8 #define GEMM_UNROLL_M_SHIFT 3 #endif diff --git a/kernel/generic/trsm_kernel_RT.c b/kernel/generic/trsm_kernel_RT.c index 2adb3a4f7..a46945330 100644 --- a/kernel/generic/trsm_kernel_RT.c +++ b/kernel/generic/trsm_kernel_RT.c @@ -58,6 +58,11 @@ static FLOAT dm1 = -1.; #define GEMM_UNROLL_M_SHIFT 2 #endif +#if GEMM_DEFAULT_UNROLL_M == 6 +#define GEMM_UNROLL_M_SHIFT 2 +#endif + + #if GEMM_DEFAULT_UNROLL_M == 8 #define GEMM_UNROLL_M_SHIFT 3 #endif diff --git a/kernel/generic/trsm_lncopy_6.c b/kernel/generic/trsm_lncopy_6.c new file mode 100644 index 000000000..9f7bcc2dd --- /dev/null +++ b/kernel/generic/trsm_lncopy_6.c @@ -0,0 +1,326 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +#ifndef UNIT +#define INV(a) (ONE / (a)) +#else +#define INV(a) (ONE) +#endif + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ + + BLASLONG i, ii, j, jj; + + FLOAT data01, data02, data03, data04, data05, data06, data07, data08; + FLOAT data09, data10, data11, data12, data13, data14, data15, data16; + FLOAT *a1, *a2, *a3, *a4; + + jj = offset; + + j = (n >> 2); + while (j > 0){ + + a1 = a + 0 * lda; + a2 = a + 1 * lda; + a3 = a + 2 * lda; + a4 = a + 3 * lda; + + i = (m >> 2); + ii = 0; + while (i > 0) { + + if (ii == jj) { +#ifndef UNIT + data01 = *(a1 + 0); +#endif + data02 = *(a1 + 1); + data03 = *(a1 + 2); + data04 = *(a1 + 3); + +#ifndef UNIT + data06 = *(a2 + 1); +#endif + data07 = *(a2 + 2); + data08 = *(a2 + 3); + +#ifndef UNIT + data11 = *(a3 + 2); +#endif + data12 = *(a3 + 3); + +#ifndef UNIT + data16 = *(a4 + 3); +#endif + + *(b + 0) = INV(data01); + + *(b + 4) = data02; + *(b + 5) = INV(data06); + + *(b + 8) = data03; + *(b + 9) = data07; + *(b + 10) = INV(data11); + + *(b + 12) = data04; + *(b + 13) = data08; + *(b + 14) = data12; + *(b + 15) = INV(data16); + } + + if (ii > jj) { + + data01 = *(a1 + 0); + data02 = *(a1 + 1); + data03 = *(a1 + 2); + data04 = *(a1 + 3); + + data05 = *(a2 + 0); + data06 = *(a2 + 1); + data07 = *(a2 + 2); + data08 = *(a2 + 3); + + data09 = *(a3 + 0); + data10 = *(a3 + 1); + data11 = *(a3 + 2); + data12 = *(a3 + 3); + + data13 = *(a4 + 0); + data14 = *(a4 + 1); + data15 = *(a4 + 2); + data16 = *(a4 + 3); + + *(b + 0) = data01; + *(b + 1) = data05; + *(b + 2) = data09; + *(b + 3) = data13; + *(b + 4) = data02; + *(b + 5) = data06; + *(b + 6) = data10; + *(b + 7) = data14; + + *(b + 8) = data03; + *(b + 9) = data07; + *(b + 10) = data11; + *(b + 11) = data15; + *(b + 12) = data04; + *(b + 13) = data08; + *(b + 14) = data12; + *(b + 15) = data16; + } + + a1 += 4; + a2 += 4; + a3 += 4; + a4 += 4; + b += 16; + + i --; + ii += 4; + } + + if ((m & 2) != 0) { + + if (ii== jj) { +#ifndef UNIT + data01 = *(a1 + 0); +#endif + data02 = *(a1 + 1); + +#ifndef UNIT + data06 = *(a2 + 1); +#endif + + *(b + 0) = INV(data01); + + *(b + 4) = data02; + *(b + 5) = INV(data06); + } + + if (ii > jj) { + data01 = *(a1 + 0); + data02 = *(a1 + 1); + data03 = *(a2 + 0); + data04 = *(a2 + 1); + data05 = *(a3 + 0); + data06 = *(a3 + 1); + data07 = *(a4 + 0); + data08 = *(a4 + 1); + + *(b + 0) = data01; + *(b + 1) = data03; + *(b + 2) = data05; + *(b + 3) = data07; + *(b + 4) = data02; + *(b + 5) = data04; + *(b + 6) = data06; + *(b + 7) = data08; + } + + a1 += 2; + a2 += 2; + a3 += 2; + a4 += 
2; + b += 8; + + ii += 2; + } + + if ((m & 1) != 0) { + + if (ii== jj) { +#ifndef UNIT + data01 = *(a1 + 0); +#endif + *(b + 0) = INV(data01); + } + + if (ii > jj) { + data01 = *(a1 + 0); + data02 = *(a2 + 0); + data03 = *(a3 + 0); + data04 = *(a4 + 0); + + *(b + 0) = data01; + *(b + 1) = data02; + *(b + 2) = data03; + *(b + 3) = data04; + } + b += 4; + } + + a += 4 * lda; + jj += 4; + j --; + } + + if (n & 2) { + a1 = a + 0 * lda; + a2 = a + 1 * lda; + + i = (m >> 1); + ii = 0; + while (i > 0) { + + if (ii == jj) { + +#ifndef UNIT + data01 = *(a1 + 0); +#endif + data02 = *(a1 + 1); + +#ifndef UNIT + data04 = *(a2 + 1); +#endif + + *(b + 0) = INV(data01); + *(b + 2) = data02; + *(b + 3) = INV(data04); + } + + if (ii > jj) { + data01 = *(a1 + 0); + data02 = *(a1 + 1); + data03 = *(a2 + 0); + data04 = *(a2 + 1); + + *(b + 0) = data01; + *(b + 1) = data03; + *(b + 2) = data02; + *(b + 3) = data04; + } + + a1 += 2; + a2 += 2; + b += 4; + + i --; + ii += 2; + } + + if ((m & 1) != 0) { + + if (ii== jj) { +#ifndef UNIT + data01 = *(a1 + 0); +#endif + *(b + 0) = INV(data01); + } + + if (ii > jj) { + data01 = *(a1 + 0); + data02 = *(a2 + 0); + *(b + 0) = data01; + *(b + 1) = data02; + } + b += 2; + } + a += 2 * lda; + jj += 2; + } + + if (n & 1) { + a1 = a + 0 * lda; + + i = m; + ii = 0; + while (i > 0) { + + if (ii == jj) { +#ifndef UNIT + data01 = *(a1 + 0); +#endif + *(b + 0) = INV(data01); + } + + if (ii > jj) { + data01 = *(a1 + 0); + *(b + 0) = data01; + } + + a1+= 1; + b += 1; + i --; + ii += 1; + } + } + + return 0; +} diff --git a/kernel/generic/trsm_ltcopy_6.c b/kernel/generic/trsm_ltcopy_6.c new file mode 100644 index 000000000..d891468a4 --- /dev/null +++ b/kernel/generic/trsm_ltcopy_6.c @@ -0,0 +1,346 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. 
*/ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +#ifndef UNIT +#define INV(a) (ONE / (a)) +#else +#define INV(a) (ONE) +#endif + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ + + BLASLONG i, ii, j, jj; + + FLOAT data01, data02, data03, data04, data05, data06, data07, data08; + FLOAT data09, data10, data11, data12, data13, data14, data15, data16; + FLOAT *a1, *a2, *a3, *a4; + + jj = offset; + + j = (n >> 2); + while (j > 0){ + + a1 = a + 0 * lda; + a2 = a + 1 * lda; + a3 = a + 2 * lda; + a4 = a + 3 * lda; + + i = (m >> 2); + ii = 0; + while (i > 0) { + + if (ii == jj) { + +#ifndef UNIT + data01 = *(a1 + 0); +#endif + data02 = *(a1 + 1); + data03 = *(a1 + 2); + data04 = *(a1 + 3); + +#ifndef UNIT + data06 = *(a2 + 1); +#endif + data07 = *(a2 + 2); + data08 = *(a2 + 3); + +#ifndef UNIT + data11 = *(a3 + 2); +#endif + data12 = *(a3 + 3); + +#ifndef UNIT + data16 = *(a4 + 3); +#endif + + *(b + 0) = INV(data01); + *(b + 1) = data02; + *(b + 2) = data03; + *(b + 3) = data04; + + *(b + 5) = INV(data06); + *(b + 6) = data07; + *(b + 7) = data08; + + *(b + 10) = INV(data11); + *(b + 11) = data12; + + *(b + 15) = INV(data16); + } + + if (ii < jj) { + data01 = *(a1 + 0); + data02 = *(a1 + 1); + data03 = *(a1 + 2); + data04 = *(a1 + 3); + + data05 = *(a2 + 0); + data06 = *(a2 + 1); + data07 = *(a2 + 2); + data08 = *(a2 + 3); + + data09 = *(a3 + 0); + data10 = *(a3 + 1); + data11 = *(a3 + 2); + data12 = *(a3 + 3); + + data13 = *(a4 + 0); + data14 = *(a4 + 1); + data15 = *(a4 + 2); + data16 = *(a4 + 3); + + *(b + 0) = data01; + *(b + 1) = data02; + *(b + 2) = data03; + *(b + 3) = data04; + *(b + 4) = data05; + *(b + 5) = data06; + *(b + 6) = data07; + *(b + 7) = data08; + + *(b + 8) = data09; + *(b + 9) = data10; + *(b + 10) = data11; + *(b + 11) = data12; + *(b + 12) = data13; + *(b + 13) = data14; + *(b + 14) = data15; + *(b + 15) = data16; + } + + a1 += 4 * lda; + a2 += 4 * lda; + a3 += 4 * lda; + a4 += 4 * lda; + b += 16; + + i --; + ii += 4; + } + + if ((m & 2) != 0) { + + if (ii== jj) { + +#ifndef UNIT + data01 = *(a1 + 0); +#endif + data02 = *(a1 + 1); + data03 = *(a1 + 2); + data04 = *(a1 + 3); + +#ifndef UNIT + data06 = *(a2 + 1); +#endif + data07 = *(a2 + 2); + data08 = *(a2 + 3); + + *(b + 0) = INV(data01); + *(b + 1) = data02; + *(b + 2) = data03; + *(b + 3) = data04; + + *(b + 5) = INV(data06); + *(b + 6) = data07; + *(b + 7) = data08; + + } + + if (ii < jj) { + data01 = *(a1 + 0); + data02 = *(a1 + 1); + data03 = *(a1 + 2); + data04 = *(a1 + 3); + + data05 = *(a2 + 0); + data06 = *(a2 + 1); + data07 = *(a2 + 2); + data08 = *(a2 + 3); + + *(b + 0) = data01; + *(b + 1) = data02; + *(b + 2) = data03; + *(b + 3) = data04; + *(b + 4) = data05; + *(b + 5) = data06; + *(b + 6) = data07; + *(b + 7) = data08; + } + + a1 += 2 * lda; + a2 += 2 * lda; + b += 8; + + ii += 2; + } + + if ((m & 1) != 0) { + + if (ii== jj) { + +#ifndef UNIT + data01 = *(a1 + 0); +#endif + data02 = *(a1 + 1); + data03 = *(a1 + 2); + data04 = *(a1 + 3); + + *(b + 0) = INV(data01); + *(b + 1) = data02; + *(b + 2) = data03; + *(b + 3) = data04; + } + + if (ii < jj) { + data01 = *(a1 + 0); + data02 = *(a1 + 1); + data03 = *(a1 + 2); + data04 = *(a1 + 3); + + *(b + 0) = 
data01; + *(b + 1) = data02; + *(b + 2) = data03; + *(b + 3) = data04; + } + b += 4; + } + + a += 4; + jj += 4; + j --; + } + + if (n & 2) { + a1 = a + 0 * lda; + a2 = a + 1 * lda; + + i = (m >> 1); + ii = 0; + while (i > 0) { + + if (ii == jj) { + +#ifndef UNIT + data01 = *(a1 + 0); +#endif + data02 = *(a1 + 1); + +#ifndef UNIT + data04 = *(a2 + 1); +#endif + + *(b + 0) = INV(data01); + *(b + 1) = data02; + + *(b + 3) = INV(data04); + } + + if (ii < jj) { + data01 = *(a1 + 0); + data02 = *(a1 + 1); + data03 = *(a2 + 0); + data04 = *(a2 + 1); + + *(b + 0) = data01; + *(b + 1) = data02; + *(b + 2) = data03; + *(b + 3) = data04; + } + + a1 += 2 * lda; + a2 += 2 * lda; + b += 4; + + i --; + ii += 2; + } + + if ((m & 1) != 0) { + + if (ii== jj) { + +#ifndef UNIT + data01 = *(a1 + 0); +#endif + *(b + 0) = INV(data01); + } + + if (ii < jj) { + data01 = *(a1 + 0); + data02 = *(a1 + 1); + *(b + 0) = data01; + *(b + 1) = data02; + } + b += 2; + } + a += 2; + jj += 2; + } + + if (n & 1) { + a1 = a + 0 * lda; + + i = m; + ii = 0; + while (i > 0) { + + if (ii == jj) { +#ifndef UNIT + data01 = *(a1 + 0); +#endif + *(b + 0) = INV(data01); + } + + if (ii < jj) { + data01 = *(a1 + 0); + *(b + 0) = data01; + } + + a1 += 1 * lda; + b += 1; + + i --; + ii += 1; + } + } + + return 0; +} diff --git a/kernel/generic/trsm_uncopy_6.c b/kernel/generic/trsm_uncopy_6.c new file mode 100644 index 000000000..837a25019 --- /dev/null +++ b/kernel/generic/trsm_uncopy_6.c @@ -0,0 +1,350 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#include +#include "common.h" + +#ifndef UNIT +#define INV(a) (ONE / (a)) +#else +#define INV(a) (ONE) +#endif + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ + + BLASLONG i, ii, j, jj; + + FLOAT data01, data02, data03, data04, data05, data06, data07, data08; + FLOAT data09, data10, data11, data12, data13, data14, data15, data16; + FLOAT *a1, *a2, *a3, *a4; + + jj = offset; + + j = (n >> 2); + while (j > 0){ + + a1 = a + 0 * lda; + a2 = a + 1 * lda; + a3 = a + 2 * lda; + a4 = a + 3 * lda; + + i = (m >> 2); + ii = 0; + while (i > 0) { + + if (ii == jj) { + +#ifndef UNIT + data01 = *(a1 + 0); +#endif + + data05 = *(a2 + 0); +#ifndef UNIT + data06 = *(a2 + 1); +#endif + + data09 = *(a3 + 0); + data10 = *(a3 + 1); +#ifndef UNIT + data11 = *(a3 + 2); +#endif + + data13 = *(a4 + 0); + data14 = *(a4 + 1); + data15 = *(a4 + 2); +#ifndef UNIT + data16 = *(a4 + 3); +#endif + + *(b + 0) = INV(data01); + *(b + 1) = data05; + *(b + 2) = data09; + *(b + 3) = data13; + + *(b + 5) = INV(data06); + *(b + 6) = data10; + *(b + 7) = data14; + + *(b + 10) = INV(data11); + *(b + 11) = data15; + + *(b + 15) = INV(data16); + } + + if (ii < jj) { + + data01 = *(a1 + 0); + data02 = *(a1 + 1); + data03 = *(a1 + 2); + data04 = *(a1 + 3); + + data05 = *(a2 + 0); + data06 = *(a2 + 1); + data07 = *(a2 + 2); + data08 = *(a2 + 3); + + data09 = *(a3 + 0); + data10 = *(a3 + 1); + data11 = *(a3 + 2); + data12 = *(a3 + 3); + + data13 = *(a4 + 0); + data14 = *(a4 + 1); + data15 = *(a4 + 2); + data16 = *(a4 + 3); + + *(b + 0) = data01; + *(b + 1) = data05; + *(b + 2) = data09; + *(b + 3) = data13; + *(b + 4) = data02; + *(b + 5) = data06; + *(b + 6) = data10; + *(b + 7) = data14; + + *(b + 8) = data03; + *(b + 9) = data07; + *(b + 10) = data11; + *(b + 11) = data15; + *(b + 12) = data04; + *(b + 13) = data08; + *(b + 14) = data12; + *(b + 15) = data16; + } + + a1 += 4; + a2 += 4; + a3 += 4; + a4 += 4; + b += 16; + + i --; + ii += 4; + } + + if ((m & 2) != 0) { + + if (ii== jj) { + +#ifndef UNIT + data01 = *(a1 + 0); +#endif + + data05 = *(a2 + 0); +#ifndef UNIT + data06 = *(a2 + 1); +#endif + + data09 = *(a3 + 0); + data10 = *(a3 + 1); + + data13 = *(a4 + 0); + data14 = *(a4 + 1); + + *(b + 0) = INV(data01); + *(b + 1) = data05; + *(b + 2) = data09; + *(b + 3) = data13; + + *(b + 5) = INV(data06); + *(b + 6) = data10; + *(b + 7) = data14; + } + + if (ii < jj) { + data01 = *(a1 + 0); + data02 = *(a1 + 1); + data03 = *(a2 + 0); + data04 = *(a2 + 1); + data05 = *(a3 + 0); + data06 = *(a3 + 1); + data07 = *(a4 + 0); + data08 = *(a4 + 1); + + *(b + 0) = data01; + *(b + 1) = data02; + *(b + 2) = data03; + *(b + 3) = data04; + *(b + 4) = data05; + *(b + 5) = data06; + *(b + 6) = data07; + *(b + 7) = data08; + } + + a1 += 2; + a2 += 2; + b += 8; + + ii += 2; + } + + if ((m & 1) != 0) { + + if (ii== jj) { +#ifndef UNIT + data01 = *(a1 + 0); +#endif + + data05 = *(a2 + 0); + data09 = *(a3 + 0); + data13 = *(a4 + 0); + + *(b + 0) = INV(data01); + *(b + 1) = data05; + *(b + 2) = data09; + *(b + 3) = data13; + } + + if (ii < jj) { + data01 = *(a1 + 0); + data02 = *(a2 + 0); + data03 = *(a3 + 0); + data04 = *(a4 + 0); + + *(b + 0) = data01; + *(b + 1) = data02; + *(b + 2) = data03; + *(b + 3) = data04; + } + b += 4; + } + + a += 4 * lda; + jj += 4; + j --; + } + + if (n & 2) { + a1 = a + 0 * lda; + a2 = a + 1 * lda; + + i = (m >> 1); + ii = 0; + while (i > 0) { + + if (ii == jj) { + +#ifndef UNIT + data01 = *(a1 + 0); 
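+	  /* Diagonal entries are packed as reciprocals (INV) so that the
+	     TRSM solve can multiply instead of divide; with UNIT defined,
+	     INV(x) is simply ONE and this load disappears. */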
+#endif + + data03 = *(a2 + 0); +#ifndef UNIT + data04 = *(a2 + 1); +#endif + + *(b + 0) = INV(data01); + *(b + 1) = data03; + *(b + 3) = INV(data04); + } + + if (ii < jj) { + data01 = *(a1 + 0); + data02 = *(a1 + 1); + data03 = *(a2 + 0); + data04 = *(a2 + 1); + + *(b + 0) = data01; + *(b + 1) = data03; + *(b + 2) = data02; + *(b + 3) = data04; + } + + a1 += 2; + a2 += 2; + b += 4; + + i --; + ii += 2; + } + + if ((m & 1) != 0) { + + if (ii== jj) { + + +#ifndef UNIT + data01 = *(a1 + 0); +#endif + + data03 = *(a2 + 0); + + *(b + 0) = INV(data01); + *(b + 1) = data03; + } + + if (ii < jj) { + data01 = *(a1 + 0); + data02 = *(a2 + 0); + *(b + 0) = data01; + *(b + 1) = data02; + } + b += 2; + } + a += 2 * lda; + jj += 2; + } + + if (n & 1) { + a1 = a + 0 * lda; + + i = m; + ii = 0; + while (i > 0) { + + if (ii == jj) { +#ifndef UNIT + data01 = *(a1 + 0); +#endif + *(b + 0) = INV(data01); + } + + if (ii < jj) { + data01 = *(a1 + 0); + *(b + 0) = data01; + } + + a1+= 1; + b += 1; + i --; + ii += 1; + } + } + + return 0; +} diff --git a/kernel/generic/trsm_utcopy_6.c b/kernel/generic/trsm_utcopy_6.c new file mode 100644 index 000000000..bbba78d53 --- /dev/null +++ b/kernel/generic/trsm_utcopy_6.c @@ -0,0 +1,322 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#include +#include "common.h" + +#ifndef UNIT +#define INV(a) (ONE / (a)) +#else +#define INV(a) (ONE) +#endif + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ + + BLASLONG i, ii, j, jj; + + FLOAT data01, data02, data03, data04, data05, data06, data07, data08; + FLOAT data09, data10, data11, data12, data13, data14, data15, data16; + FLOAT *a1, *a2, *a3, *a4; + + jj = offset; + + j = (n >> 2); + while (j > 0){ + + a1 = a + 0 * lda; + a2 = a + 1 * lda; + a3 = a + 2 * lda; + a4 = a + 3 * lda; + + i = (m >> 2); + ii = 0; + while (i > 0) { + + if (ii == jj) { +#ifndef UNIT + data01 = *(a1 + 0); +#endif + + data05 = *(a2 + 0); +#ifndef UNIT + data06 = *(a2 + 1); +#endif + + data09 = *(a3 + 0); + data10 = *(a3 + 1); +#ifndef UNIT + data11 = *(a3 + 2); +#endif + + data13 = *(a4 + 0); + data14 = *(a4 + 1); + data15 = *(a4 + 2); +#ifndef UNIT + data16 = *(a4 + 3); +#endif + + *(b + 0) = INV(data01); + + *(b + 4) = data05; + *(b + 5) = INV(data06); + + *(b + 8) = data09; + *(b + 9) = data10; + *(b + 10) = INV(data11); + + *(b + 12) = data13; + *(b + 13) = data14; + *(b + 14) = data15; + *(b + 15) = INV(data16); + } + + if (ii > jj) { + data01 = *(a1 + 0); + data02 = *(a1 + 1); + data03 = *(a1 + 2); + data04 = *(a1 + 3); + + data05 = *(a2 + 0); + data06 = *(a2 + 1); + data07 = *(a2 + 2); + data08 = *(a2 + 3); + + data09 = *(a3 + 0); + data10 = *(a3 + 1); + data11 = *(a3 + 2); + data12 = *(a3 + 3); + + data13 = *(a4 + 0); + data14 = *(a4 + 1); + data15 = *(a4 + 2); + data16 = *(a4 + 3); + + *(b + 0) = data01; + *(b + 1) = data02; + *(b + 2) = data03; + *(b + 3) = data04; + *(b + 4) = data05; + *(b + 5) = data06; + *(b + 6) = data07; + *(b + 7) = data08; + + *(b + 8) = data09; + *(b + 9) = data10; + *(b + 10) = data11; + *(b + 11) = data12; + *(b + 12) = data13; + *(b + 13) = data14; + *(b + 14) = data15; + *(b + 15) = data16; + } + + a1 += 4 * lda; + a2 += 4 * lda; + a3 += 4 * lda; + a4 += 4 * lda; + b += 16; + + i --; + ii += 4; + } + + if ((m & 2) != 0) { + + if (ii== jj) { +#ifndef UNIT + data01 = *(a1 + 0); +#endif + data05 = *(a2 + 0); +#ifndef UNIT + data06 = *(a2 + 1); +#endif + + *(b + 0) = INV(data01); + + *(b + 4) = data05; + *(b + 5) = INV(data06); + } + + if (ii > jj) { + data01 = *(a1 + 0); + data02 = *(a1 + 1); + data03 = *(a1 + 2); + data04 = *(a1 + 3); + + data05 = *(a2 + 0); + data06 = *(a2 + 1); + data07 = *(a2 + 2); + data08 = *(a2 + 3); + + *(b + 0) = data01; + *(b + 1) = data02; + *(b + 2) = data03; + *(b + 3) = data04; + *(b + 4) = data05; + *(b + 5) = data06; + *(b + 6) = data07; + *(b + 7) = data08; + } + + a1 += 2 * lda; + a2 += 2 * lda; + b += 8; + + ii += 2; + } + + if ((m & 1) != 0) { + + if (ii== jj) { +#ifndef UNIT + data01 = *(a1 + 0); +#endif + *(b + 0) = INV(data01); + } + + if (ii > jj) { + data01 = *(a1 + 0); + data02 = *(a1 + 1); + data03 = *(a1 + 2); + data04 = *(a1 + 3); + + *(b + 0) = data01; + *(b + 1) = data02; + *(b + 2) = data03; + *(b + 3) = data04; + } + b += 4; + } + + a += 4; + jj += 4; + j --; + } + + if (n & 2) { + a1 = a + 0 * lda; + a2 = a + 1 * lda; + + i = (m >> 1); + ii = 0; + while (i > 0) { + + if (ii == jj) { +#ifndef UNIT + data01 = *(a1 + 0); +#endif + data03 = *(a2 + 0); +#ifndef UNIT + data04 = *(a2 + 1); +#endif + + *(b + 0) = INV(data01); + *(b + 2) = data03; + *(b + 3) = INV(data04); + } + + if (ii > jj) { + data01 = *(a1 + 0); + data02 = *(a1 + 1); + data03 = *(a2 + 0); + data04 = *(a2 + 1); + + *(b + 0) = data01; + *(b + 
1) = data02; + *(b + 2) = data03; + *(b + 3) = data04; + } + + a1 += 2 * lda; + a2 += 2 * lda; + b += 4; + + i --; + ii += 2; + } + + if ((m & 1) != 0) { + + if (ii== jj) { +#ifndef UNIT + data01 = *(a1 + 0); +#endif + *(b + 0) = INV(data01); + } + + if (ii > jj) { + data01 = *(a1 + 0); + data02 = *(a1 + 1); + *(b + 0) = data01; + *(b + 1) = data02; + } + b += 2; + } + a += 2; + jj += 2; + } + + if (n & 1) { + a1 = a + 0 * lda; + + i = m; + ii = 0; + while (i > 0) { + + if (ii == jj) { +#ifndef UNIT + data01 = *(a1 + 0); +#endif + *(b + 0) = INV(data01); + } + + if (ii > jj) { + data01 = *(a1 + 0); + *(b + 0) = data01; + } + + a1 += 1 * lda; + b += 1; + + i --; + ii += 1; + } + } + + return 0; +} diff --git a/kernel/x86_64/KERNEL.PILEDRIVER b/kernel/x86_64/KERNEL.PILEDRIVER index 8ebd42244..bbec23ccf 100644 --- a/kernel/x86_64/KERNEL.PILEDRIVER +++ b/kernel/x86_64/KERNEL.PILEDRIVER @@ -16,15 +16,17 @@ SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) -DGEMMKERNEL = dgemm_kernel_8x2_bulldozer.S -DGEMMINCOPY = dgemm_ncopy_8_bulldozer.S -DGEMMITCOPY = dgemm_tcopy_8_bulldozer.S -DGEMMONCOPY = gemm_ncopy_2_bulldozer.S -DGEMMOTCOPY = gemm_tcopy_2_bulldozer.S + +DGEMMKERNEL = dgemm_kernel_6x4_piledriver.S +DGEMMINCOPY = ../generic/gemm_ncopy_6.c +DGEMMITCOPY = ../generic/gemm_tcopy_6.c +DGEMMONCOPY = ../generic/gemm_ncopy_4.c +DGEMMOTCOPY = ../generic/gemm_tcopy_4.c DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) + CGEMMKERNEL = cgemm_kernel_4x2_bulldozer.S CGEMMINCOPY = ../generic/zgemm_ncopy_4.c CGEMMITCOPY = ../generic/zgemm_tcopy_4.c diff --git a/kernel/x86_64/dgemm_kernel_6x4_piledriver.S b/kernel/x86_64/dgemm_kernel_6x4_piledriver.S new file mode 100644 index 000000000..7b5dd1587 --- /dev/null +++ b/kernel/x86_64/dgemm_kernel_6x4_piledriver.S @@ -0,0 +1,1734 @@ +/**************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+
+// Register blocking: 6x4; the k loop is unrolled by 4.
+// Uses FMA3 on Piledriver.
+// TODO: 1) handle the edge cases. 2) Add Windows ABI support.
+
+#define ASSEMBLER
+#include "common.h"
+
+#define STACKSIZE 128
+#define oldbk_i %rdi
+#define oldbk_j %rsi
+#define oldbk_l %rdx
+
+#define _bk_i %r13
+#define _bk_j %r14
+#define _bk_l %r15
+
+#define ALPHA %xmm0
+#define _ptr_A %rcx
+#define _ptr_B %r8
+#define _ptr_C %r9
+#define LDC %r10
+
+#define i %r11
+#define k %rax
+#define _pre_B %r12
+#define _ptr__A_0 %rdi
+#define _ptr__B_0 %rsi
+#define _ptr__C_0 %rbx
+#define _ptr__C_1 %rbp
+
+#define old_ldc 8+STACKSIZE(%rsp)
+#define alpha 48(%rsp)
+#define j 56(%rsp)
+
+#define MOVQ2560(s,d) movq s,d
+#define LEAQ2560(s,d) leaq s,d
+#define SARQ2560(imm,n) sarq imm,n
+#define ADDQ2560(off,addr) addq off,addr
+#define SUBQ2560(off,addr) subq off,addr
+#define DIVQ2560(off,addr) divq off,addr
+#define MULQ2560(s,d) mulq s,d
+#define DECQ2560(addr) decq addr
+#define NEGQ2560(s) negq s
+#define TESTQ2560(n,addr) testq n,addr
+#define SALQ2560(imm,n) salq imm,n
+
+#define MOVQ1280(s,d) movq s,d
+#define LEAQ1280(s,d) leaq s,d
+#define SARQ1280(imm,n) sarq imm,n
+#define ADDQ1280(off,addr) addq off,addr
+#define SUBQ1280(off,addr) subq off,addr
+#define DIVQ1280(off,addr) divq off,addr
+#define CMPQ1280(off,addr) cmpq off,addr
+#define MULQ1280(s,d) mulq s,d
+#define DECQ1280(addr) decq addr
+#define NEGQ1280(s) negq s
+#define TESTQ1280(n,addr) testq n,addr
+#define SALQ1280(imm,n) salq imm,n
+
+#define JG jg
+#define JLE jle
+
+#define VLD2560(addr,reg) vmovapd addr,reg
+#define VST2560(reg,addr) vmovapd reg,addr
+#define VMUL2560(a,b,c) vmulpd a,b,c
+#define MVMUL2560(a,b,c) vmulpd b,a,c
+#define VADD2560(a,b,c) vaddpd a,b,c
+#define MVADD2560(a,b,c) vaddpd b,a,c
+#define VSHUF2560(imm,s,d) vpermilpd imm,s,d
+#define VSHUF2F2560(imm,s1,s2,d) vperm2f128 imm,s1,s2,d
+#define BROAD2560(addr,reg) vbroadcastsd addr,reg
+#define MOVRR2560(a,b) vmovapd a,b
+#define REVS2560(imm,s1,s2,d) vshufpd imm,s1,s2,d
+#define EXTR2561(imm,a,b) vextractf128 imm,a,b
+#define LDL2561(addr,reg) vmovlpd addr,reg,reg
+#define LDH2561(addr,reg) vmovhpd addr,reg,reg
+#define STL2561(reg,addr) vmovlpd reg,addr
+#define STH2561(reg,addr) vmovhpd reg,addr
+#define VADD2561(a,b,c) vaddpd a,b,c
+#define VXOR2560(a,b,c) vxorpd a,b,c
+#define PREFETCH02560(addr,b) prefetcht0 addr
+#define PREFETCH12560(addr,b) prefetcht0 addr
+#define PREFETCH22560(addr,b) prefetcht2 addr
+#define PREFETCHW2560(addr,b) prefetchw addr
+#define PREFETCHN2560(addr,b) prefetchnta addr
+#define VMA2560(a,b,c,d) vfmaddpd d,a,b,c
+#define MVMA2560(a,b,c,d) vfmaddpd d,a,b,c
+
+#define VLD1280(addr,reg) vmovapd addr,reg
+#define VLD1282(addr,reg) vmovapd addr,reg
+#define VLD1281(addr,reg) movsd addr,reg
+#define VST1280(reg,addr) vmovapd reg,addr
+#define VST1282(reg,addr) vmovapd reg,addr
+#define VST1281(reg,addr) movsd reg,addr
+#define VLDU1282(addr,reg) vmovupd addr,reg
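+// Note on the generated macro names (an inference from their expansions, not
+// stated anywhere in this file): the 2560/1280 stem appears to encode the
+// vector width (256-bit ymm vs. 128-bit xmm operations), and a trailing 2 or
+// 1 selects the packed (two doubles) or scalar variant of the same operation.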
+#define VLDU1281(addr,reg) movsd addr,reg +#define VSTU1282(reg,addr) vmovupd reg,addr +#define VSTU1281(reg,addr) movsd reg,addr +#define VMUL1280(a,b,c) vmulpd a,b,c +#define VMUL1282(a,b,c) vmulpd a,b,c +#define VMUL1281(a,b,c) vmulpd a,b,c +#define MVMUL1280(a,b,c) vmulpd b,a,c +#define VADD1280(a,b,c) vaddpd a,b,c +#define MVADD1280(a,b,c) vaddpd b,a,c +#define VSHUF1280(imm,s,d) vpermilpd imm,s,d +#define VSHUF2F1280(imm,s1,s2,d) vperm2f128 imm,s1,s2,d +#define BROAD1280(addr,reg) vmovddup addr,reg +#define BROAD1282(addr,reg) vmovddup addr,reg +#define BROAD1281(addr,reg) movddup addr,reg +#define MOVRR1280(a,b) vmovapd a,b +#define REVS1280(imm,s1,s2,d) vshufpd imm,s1,s2,d +#define EXTR1281(imm,a,b) vextractf128 imm,a,b +#define LDL1281(addr,reg) vmovlpd addr,reg,reg +#define LDH1281(addr,reg) vmovhpd addr,reg,reg +#define STL1281(reg,addr) vmovlpd reg,addr +#define STH1281(reg,addr) vmovhpd reg,addr +#define VADD1281(a,b,c) vaddpd a,b,c +#define VXOR1280(a,b,c) vxorpd a,b,c +#define VXOR1282(a,b,c) vxorpd a,b,c +#define VXOR1281(a,b,c) vxorpd a,b,c +#define PREFETCH01280(addr,b) prefetcht0 addr +#define PREFETCH11280(addr,b) prefetcht0 addr +#define PREFETCH21280(addr,b) prefetcht2 addr +#define PREFETCHW1280(addr,b) prefetchw addr +#define PREFETCHN1280(addr,b) prefetchnta addr +#define VMA1280(a,b,c,d) vfmaddpd d,a,b,c +#define VMA1282(a,b,c,d) vfmadd231pd a,b,c +#define VMA1281(a,b,c,d) vfmadd231pd a,b,c +#define VMA21282(a,b,c,d) vfmadd231pd a,b,c +#define VMA21281(a,b,c,d) vfmadd231pd a,b,c +//#define VMA1282(a,b,c,d) nop +//#define VMA1281(a,b,c,d) nop +//#define VMA21282(a,b,c,d) nop +//#define VMA21281(a,b,c,d) nop +#define MVMA1280(a,b,c,d) vfmaddpd d,a,b,c + +#define imm1 $0x05 +#define imm3 $0x05 +#define imm100 $0x05 +#define imm200 $0x0a + +#define XMM0 %xmm0 +#define XMM1 %xmm1 +#define XMM2 %xmm2 +#define XMM3 %xmm3 +#define XMM4 %xmm4 +#define XMM5 %xmm5 +#define XMM6 %xmm6 +#define XMM7 %xmm7 +#define XMM8 %xmm8 +#define XMM9 %xmm9 +#define XMM10 %xmm10 +#define XMM11 %xmm11 +#define XMM12 %xmm12 +#define XMM13 %xmm13 +#define XMM14 %xmm14 +#define XMM15 %xmm15 + +#define YMM0 %ymm0 +#define YMM1 %ymm1 +#define YMM2 %ymm2 +#define YMM3 %ymm3 +#define YMM4 %ymm4 +#define YMM5 %ymm5 +#define YMM6 %ymm6 +#define YMM7 %ymm7 +#define YMM8 %ymm8 +#define YMM9 %ymm9 +#define YMM10 %ymm10 +#define YMM11 %ymm11 +#define YMM12 %ymm12 +#define YMM13 %ymm13 +#define YMM14 %ymm14 +#define YMM15 %ymm15 +PROLOGUE + +subq $STACKSIZE, %rsp; +movq %rbx, 0(%rsp); +movq %rbp, 8(%rsp); +movq %r12, 16(%rsp); +movq %r13, 24(%rsp); +movq %r14, 32(%rsp); +movq %r15, 40(%rsp); +vzeroupper +movl old_ldc, %eax +movq %rax, LDC +movlps ALPHA, alpha +movq oldbk_i, _bk_i +movq oldbk_j, _bk_j +movq oldbk_l, _bk_l +leaq (, LDC, SIZE), LDC + +MOVQ1280(_bk_j,j); +SARQ1280($2,j); +JLE ._L_0_loopE; +ALIGN_4; +._L_0_bodyB:; +MOVQ1280(_ptr_A,_ptr__A_0); +MOVQ1280(_ptr_C,_ptr__C_0); +LEAQ1280((_ptr_C,LDC,2),_ptr__C_1); +MOVQ1280(_bk_l,%rax); +SALQ1280($5,%rax); +ADDQ1280(%rax,_pre_B); +MOVQ1280(_bk_i,i); +CMPQ1280($6,i); +JL ._L_1_loopE; +._L_1_bodyB:; +MOVQ1280(_ptr_B,_ptr__B_0); +VXOR1282(XMM0,XMM0,XMM0); +VXOR1282(XMM1,XMM1,XMM1); +VXOR1282(XMM2,XMM2,XMM2); +VXOR1282(XMM3,XMM3,XMM3); +VXOR1282(XMM4,XMM4,XMM4); +VXOR1282(XMM5,XMM5,XMM5); +VXOR1282(XMM6,XMM6,XMM6); +VXOR1282(XMM7,XMM7,XMM7); +VXOR1282(XMM8,XMM8,XMM8); +VXOR1282(XMM9,XMM9,XMM9); +VXOR1282(XMM10,XMM10,XMM10); +VXOR1282(XMM11,XMM11,XMM11); +PREFETCHN1280(3*SIZE(_ptr__C_0),N); +PREFETCHN1280(11*SIZE(_ptr__C_0,LDC,1),N); 
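+// What the unrolled FMA block below (._L_2_bodyB) computes, as a minimal C
+// sketch (illustration only, not the shipped code); A is packed 6 doubles per
+// k-step, B is packed 4 doubles per k-step, and the twelve accumulators
+// XMM0-XMM11 hold the 6x4 block of C as pairs of doubles:
+//
+//   for (l = 0; l < K; l++)                   /* SARQ $2: body unrolled 4x */
+//     for (jj = 0; jj < 4; jj++) {            /* BROAD1282: splat B[4*l+jj] */
+//       double bv = B[4*l + jj];
+//       for (ii = 0; ii < 6; ii++)            /* three 2-wide loads of A */
+//         acc[jj][ii] += A[6*l + ii] * bv;    /* one vfmadd231pd lane */
+//     }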
+PREFETCHN1280(3*SIZE(_ptr__C_1),N); +PREFETCHN1280(11*SIZE(_ptr__C_1,LDC,1),N); +MOVQ1280(_bk_l,k); +SARQ1280($2,k); +JLE ._L_2_loopE; +ALIGN_4; +._L_2_bodyB:; +PREFETCH01280(160*SIZE(_ptr__A_0),0); +BROAD1282(0*SIZE(_ptr__B_0),XMM15); +VLD1282(0*SIZE(_ptr__A_0),XMM12); +VMA1282(XMM12,XMM15,XMM0,XMM0); +VLD1282(2*SIZE(_ptr__A_0),XMM13); +VMA1282(XMM13,XMM15,XMM1,XMM1); +VLD1282(4*SIZE(_ptr__A_0),XMM14); +VMA1282(XMM14,XMM15,XMM2,XMM2); +BROAD1282(1*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM12,XMM15,XMM3,XMM3); +VMA1282(XMM13,XMM15,XMM4,XMM4); +VMA1282(XMM14,XMM15,XMM5,XMM5); +BROAD1282(2*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM12,XMM15,XMM6,XMM6); +VMA1282(XMM13,XMM15,XMM7,XMM7); +VMA1282(XMM14,XMM15,XMM8,XMM8); +BROAD1282(3*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM12,XMM15,XMM9,XMM9); +VMA1282(XMM13,XMM15,XMM10,XMM10); +VMA1282(XMM14,XMM15,XMM11,XMM11); +PREFETCH21280(168*SIZE(_ptr__A_0),2); +BROAD1282(4*SIZE(_ptr__B_0),XMM15); +VLD1282(6*SIZE(_ptr__A_0),XMM12); +VMA1282(XMM12,XMM15,XMM0,XMM0); +VLD1282(8*SIZE(_ptr__A_0),XMM13); +VMA1282(XMM13,XMM15,XMM1,XMM1); +VLD1282(10*SIZE(_ptr__A_0),XMM14); +VMA1282(XMM14,XMM15,XMM2,XMM2); +BROAD1282(5*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM12,XMM15,XMM3,XMM3); +VMA1282(XMM13,XMM15,XMM4,XMM4); +VMA1282(XMM14,XMM15,XMM5,XMM5); +BROAD1282(6*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM12,XMM15,XMM6,XMM6); +VMA1282(XMM13,XMM15,XMM7,XMM7); +VMA1282(XMM14,XMM15,XMM8,XMM8); +BROAD1282(7*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM12,XMM15,XMM9,XMM9); +VMA1282(XMM13,XMM15,XMM10,XMM10); +VMA1282(XMM14,XMM15,XMM11,XMM11); +PREFETCH01280(176*SIZE(_ptr__A_0),0); +BROAD1282(8*SIZE(_ptr__B_0),XMM15); +VLD1282(12*SIZE(_ptr__A_0),XMM12); +VMA1282(XMM12,XMM15,XMM0,XMM0); +VLD1282(14*SIZE(_ptr__A_0),XMM13); +VMA1282(XMM13,XMM15,XMM1,XMM1); +VLD1282(16*SIZE(_ptr__A_0),XMM14); +VMA1282(XMM14,XMM15,XMM2,XMM2); +BROAD1282(9*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM12,XMM15,XMM3,XMM3); +VMA1282(XMM13,XMM15,XMM4,XMM4); +VMA1282(XMM14,XMM15,XMM5,XMM5); +BROAD1282(10*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM12,XMM15,XMM6,XMM6); +VMA1282(XMM13,XMM15,XMM7,XMM7); +VMA1282(XMM14,XMM15,XMM8,XMM8); +BROAD1282(11*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM12,XMM15,XMM9,XMM9); +VMA1282(XMM13,XMM15,XMM10,XMM10); +VMA1282(XMM14,XMM15,XMM11,XMM11); +PREFETCH21280(184*SIZE(_ptr__A_0),2); +BROAD1282(12*SIZE(_ptr__B_0),XMM15); +VLD1282(18*SIZE(_ptr__A_0),XMM12); +VMA1282(XMM12,XMM15,XMM0,XMM0); +VLD1282(20*SIZE(_ptr__A_0),XMM13); +VMA1282(XMM13,XMM15,XMM1,XMM1); +VLD1282(22*SIZE(_ptr__A_0),XMM14); +VMA1282(XMM14,XMM15,XMM2,XMM2); +BROAD1282(13*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM12,XMM15,XMM3,XMM3); +VMA1282(XMM13,XMM15,XMM4,XMM4); +VMA1282(XMM14,XMM15,XMM5,XMM5); +BROAD1282(14*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM12,XMM15,XMM6,XMM6); +VMA1282(XMM13,XMM15,XMM7,XMM7); +VMA1282(XMM14,XMM15,XMM8,XMM8); +BROAD1282(15*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM12,XMM15,XMM9,XMM9); +VMA1282(XMM13,XMM15,XMM10,XMM10); +VMA1282(XMM14,XMM15,XMM11,XMM11); +ADDQ1280($24*SIZE,_ptr__A_0); +ADDQ1280($16*SIZE,_ptr__B_0); +._L_2_bodyE:; +DECQ1280(k); +JG ._L_2_bodyB; +ALIGN_4; +._L_2_loopE:; +TESTQ1280($2,_bk_l); +JLE ._L_3_loopE; +ALIGN_4; +._L_3_bodyB:; +PREFETCH01280(160*SIZE(_ptr__A_0),0); +BROAD1282(0*SIZE(_ptr__B_0),XMM15); +VLD1282(0*SIZE(_ptr__A_0),XMM12); +VMA1282(XMM12,XMM15,XMM0,XMM0); +VLD1282(2*SIZE(_ptr__A_0),XMM13); +VMA1282(XMM13,XMM15,XMM1,XMM1); +VLD1282(4*SIZE(_ptr__A_0),XMM14); +VMA1282(XMM14,XMM15,XMM2,XMM2); +BROAD1282(1*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM12,XMM15,XMM3,XMM3); +VMA1282(XMM13,XMM15,XMM4,XMM4); 
+VMA1282(XMM14,XMM15,XMM5,XMM5); +BROAD1282(2*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM12,XMM15,XMM6,XMM6); +VMA1282(XMM13,XMM15,XMM7,XMM7); +VMA1282(XMM14,XMM15,XMM8,XMM8); +BROAD1282(3*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM12,XMM15,XMM9,XMM9); +VMA1282(XMM13,XMM15,XMM10,XMM10); +VMA1282(XMM14,XMM15,XMM11,XMM11); +PREFETCH21280(168*SIZE(_ptr__A_0),2); +BROAD1282(4*SIZE(_ptr__B_0),XMM15); +VLD1282(6*SIZE(_ptr__A_0),XMM12); +VMA1282(XMM12,XMM15,XMM0,XMM0); +VLD1282(8*SIZE(_ptr__A_0),XMM13); +VMA1282(XMM13,XMM15,XMM1,XMM1); +VLD1282(10*SIZE(_ptr__A_0),XMM14); +VMA1282(XMM14,XMM15,XMM2,XMM2); +BROAD1282(5*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM12,XMM15,XMM3,XMM3); +VMA1282(XMM13,XMM15,XMM4,XMM4); +VMA1282(XMM14,XMM15,XMM5,XMM5); +BROAD1282(6*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM12,XMM15,XMM6,XMM6); +VMA1282(XMM13,XMM15,XMM7,XMM7); +VMA1282(XMM14,XMM15,XMM8,XMM8); +BROAD1282(7*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM12,XMM15,XMM9,XMM9); +VMA1282(XMM13,XMM15,XMM10,XMM10); +VMA1282(XMM14,XMM15,XMM11,XMM11); +ADDQ1280($12*SIZE,_ptr__A_0); +ADDQ1280($8*SIZE,_ptr__B_0); +._L_3_loopE:; +TESTQ1280($1,_bk_l); +JLE ._L_4_loopE; +ALIGN_4; +._L_4_bodyB:; +PREFETCH01280(160*SIZE(_ptr__A_0),0); +BROAD1282(0*SIZE(_ptr__B_0),XMM15); +VLD1282(0*SIZE(_ptr__A_0),XMM12); +VMA1282(XMM12,XMM15,XMM0,XMM0); +VLD1282(2*SIZE(_ptr__A_0),XMM13); +VMA1282(XMM13,XMM15,XMM1,XMM1); +VLD1282(4*SIZE(_ptr__A_0),XMM14); +VMA1282(XMM14,XMM15,XMM2,XMM2); +BROAD1282(1*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM12,XMM15,XMM3,XMM3); +VMA1282(XMM13,XMM15,XMM4,XMM4); +VMA1282(XMM14,XMM15,XMM5,XMM5); +BROAD1282(2*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM12,XMM15,XMM6,XMM6); +VMA1282(XMM13,XMM15,XMM7,XMM7); +VMA1282(XMM14,XMM15,XMM8,XMM8); +BROAD1282(3*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM12,XMM15,XMM9,XMM9); +VMA1282(XMM13,XMM15,XMM10,XMM10); +VMA1282(XMM14,XMM15,XMM11,XMM11); +ADDQ1280($6*SIZE,_ptr__A_0); +ADDQ1280($4*SIZE,_ptr__B_0); +._L_4_loopE:; +BROAD1282(alpha,XMM12); +VLDU1282(0*SIZE(_ptr__C_0),XMM13); +VMA21282(XMM12,XMM0,XMM13,XMM0); +VSTU1282(XMM13,0*SIZE(_ptr__C_0)); +VLDU1282(2*SIZE(_ptr__C_0),XMM14); +VMA21282(XMM12,XMM1,XMM14,XMM1); +VSTU1282(XMM14,2*SIZE(_ptr__C_0)); +VLDU1282(4*SIZE(_ptr__C_0),XMM15); +VMA21282(XMM12,XMM2,XMM15,XMM2); +VSTU1282(XMM15,4*SIZE(_ptr__C_0)); +VLDU1282(0*SIZE(_ptr__C_0,LDC,1),XMM13); +VMA21282(XMM12,XMM3,XMM13,XMM3); +VSTU1282(XMM13,0*SIZE(_ptr__C_0,LDC,1)); +VLDU1282(2*SIZE(_ptr__C_0,LDC,1),XMM14); +VMA21282(XMM12,XMM4,XMM14,XMM4); +VSTU1282(XMM14,2*SIZE(_ptr__C_0,LDC,1)); +VLDU1282(4*SIZE(_ptr__C_0,LDC,1),XMM15); +VMA21282(XMM12,XMM5,XMM15,XMM5); +VSTU1282(XMM15,4*SIZE(_ptr__C_0,LDC,1)); +VLDU1282(0*SIZE(_ptr__C_1),XMM13); +VMA21282(XMM12,XMM6,XMM13,XMM6); +VSTU1282(XMM13,0*SIZE(_ptr__C_1)); +VLDU1282(2*SIZE(_ptr__C_1),XMM14); +VMA21282(XMM12,XMM7,XMM14,XMM7); +VSTU1282(XMM14,2*SIZE(_ptr__C_1)); +VLDU1282(4*SIZE(_ptr__C_1),XMM15); +VMA21282(XMM12,XMM8,XMM15,XMM8); +VSTU1282(XMM15,4*SIZE(_ptr__C_1)); +VLDU1282(0*SIZE(_ptr__C_1,LDC,1),XMM13); +VMA21282(XMM12,XMM9,XMM13,XMM9); +VSTU1282(XMM13,0*SIZE(_ptr__C_1,LDC,1)); +VLDU1282(2*SIZE(_ptr__C_1,LDC,1),XMM14); +VMA21282(XMM12,XMM10,XMM14,XMM10); +VSTU1282(XMM14,2*SIZE(_ptr__C_1,LDC,1)); +VLDU1282(4*SIZE(_ptr__C_1,LDC,1),XMM15); +VMA21282(XMM12,XMM11,XMM15,XMM11); +VSTU1282(XMM15,4*SIZE(_ptr__C_1,LDC,1)); +ADDQ1280($6*SIZE,_ptr__C_0); +ADDQ1280($6*SIZE,_ptr__C_1); +._L_1_bodyE:; +SUBQ1280($6,i); +JG ._L_1_bodyB; +ALIGN_4; +._L_1_loopE:; +TESTQ1280($4,i); +JLE ._L_5_loopE; +ALIGN_4; +._L_5_bodyB:; +MOVQ1280(_ptr_B,_ptr__B_0); +VXOR1282(XMM0,XMM0,XMM0); 
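+// ._L_5 handles an M-remainder of 4: the same broadcast-FMA scheme on a 4x4
+// block of C, so only the eight accumulators XMM0-XMM7 are cleared and used.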
+VXOR1282(XMM1,XMM1,XMM1); +VXOR1282(XMM2,XMM2,XMM2); +VXOR1282(XMM3,XMM3,XMM3); +VXOR1282(XMM4,XMM4,XMM4); +VXOR1282(XMM5,XMM5,XMM5); +VXOR1282(XMM6,XMM6,XMM6); +VXOR1282(XMM7,XMM7,XMM7); +PREFETCHN1280(3*SIZE(_ptr__C_0),N); +PREFETCHN1280(11*SIZE(_ptr__C_0,LDC,1),N); +PREFETCHN1280(3*SIZE(_ptr__C_1),N); +PREFETCHN1280(11*SIZE(_ptr__C_1,LDC,1),N); +MOVQ1280(_bk_l,k); +SARQ1280($2,k); +JLE ._L_6_loopE; +ALIGN_4; +._L_6_bodyB:; +PREFETCH01280(160*SIZE(_ptr__A_0),0); +BROAD1282(0*SIZE(_ptr__B_0),XMM15); +VLD1282(0*SIZE(_ptr__A_0),XMM13); +VMA1282(XMM13,XMM15,XMM0,XMM0); +VLD1282(2*SIZE(_ptr__A_0),XMM14); +VMA1282(XMM14,XMM15,XMM1,XMM1); +BROAD1282(1*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM13,XMM15,XMM2,XMM2); +VMA1282(XMM14,XMM15,XMM3,XMM3); +BROAD1282(2*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM13,XMM15,XMM4,XMM4); +VMA1282(XMM14,XMM15,XMM5,XMM5); +BROAD1282(3*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM13,XMM15,XMM6,XMM6); +VMA1282(XMM14,XMM15,XMM7,XMM7); +PREFETCH21280(168*SIZE(_ptr__A_0),2); +BROAD1282(4*SIZE(_ptr__B_0),XMM15); +VLD1282(4*SIZE(_ptr__A_0),XMM13); +VMA1282(XMM13,XMM15,XMM0,XMM0); +VLD1282(6*SIZE(_ptr__A_0),XMM14); +VMA1282(XMM14,XMM15,XMM1,XMM1); +BROAD1282(5*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM13,XMM15,XMM2,XMM2); +VMA1282(XMM14,XMM15,XMM3,XMM3); +BROAD1282(6*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM13,XMM15,XMM4,XMM4); +VMA1282(XMM14,XMM15,XMM5,XMM5); +BROAD1282(7*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM13,XMM15,XMM6,XMM6); +VMA1282(XMM14,XMM15,XMM7,XMM7); +PREFETCH01280(176*SIZE(_ptr__A_0),0); +BROAD1282(8*SIZE(_ptr__B_0),XMM15); +VLD1282(8*SIZE(_ptr__A_0),XMM13); +VMA1282(XMM13,XMM15,XMM0,XMM0); +VLD1282(10*SIZE(_ptr__A_0),XMM14); +VMA1282(XMM14,XMM15,XMM1,XMM1); +BROAD1282(9*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM13,XMM15,XMM2,XMM2); +VMA1282(XMM14,XMM15,XMM3,XMM3); +BROAD1282(10*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM13,XMM15,XMM4,XMM4); +VMA1282(XMM14,XMM15,XMM5,XMM5); +BROAD1282(11*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM13,XMM15,XMM6,XMM6); +VMA1282(XMM14,XMM15,XMM7,XMM7); +PREFETCH21280(184*SIZE(_ptr__A_0),2); +BROAD1282(12*SIZE(_ptr__B_0),XMM15); +VLD1282(12*SIZE(_ptr__A_0),XMM13); +VMA1282(XMM13,XMM15,XMM0,XMM0); +VLD1282(14*SIZE(_ptr__A_0),XMM14); +VMA1282(XMM14,XMM15,XMM1,XMM1); +BROAD1282(13*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM13,XMM15,XMM2,XMM2); +VMA1282(XMM14,XMM15,XMM3,XMM3); +BROAD1282(14*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM13,XMM15,XMM4,XMM4); +VMA1282(XMM14,XMM15,XMM5,XMM5); +BROAD1282(15*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM13,XMM15,XMM6,XMM6); +VMA1282(XMM14,XMM15,XMM7,XMM7); +ADDQ1280($16*SIZE,_ptr__A_0); +ADDQ1280($16*SIZE,_ptr__B_0); +._L_6_bodyE:; +DECQ1280(k); +JG ._L_6_bodyB; +ALIGN_4; +._L_6_loopE:; +TESTQ1280($2,_bk_l); +JLE ._L_7_loopE; +ALIGN_4; +._L_7_bodyB:; +PREFETCH01280(160*SIZE(_ptr__A_0),0); +BROAD1282(0*SIZE(_ptr__B_0),XMM15); +VLD1282(0*SIZE(_ptr__A_0),XMM13); +VMA1282(XMM13,XMM15,XMM0,XMM0); +VLD1282(2*SIZE(_ptr__A_0),XMM14); +VMA1282(XMM14,XMM15,XMM1,XMM1); +BROAD1282(1*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM13,XMM15,XMM2,XMM2); +VMA1282(XMM14,XMM15,XMM3,XMM3); +BROAD1282(2*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM13,XMM15,XMM4,XMM4); +VMA1282(XMM14,XMM15,XMM5,XMM5); +BROAD1282(3*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM13,XMM15,XMM6,XMM6); +VMA1282(XMM14,XMM15,XMM7,XMM7); +PREFETCH21280(168*SIZE(_ptr__A_0),2); +BROAD1282(4*SIZE(_ptr__B_0),XMM15); +VLD1282(4*SIZE(_ptr__A_0),XMM13); +VMA1282(XMM13,XMM15,XMM0,XMM0); +VLD1282(6*SIZE(_ptr__A_0),XMM14); +VMA1282(XMM14,XMM15,XMM1,XMM1); +BROAD1282(5*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM13,XMM15,XMM2,XMM2); 
+VMA1282(XMM14,XMM15,XMM3,XMM3); +BROAD1282(6*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM13,XMM15,XMM4,XMM4); +VMA1282(XMM14,XMM15,XMM5,XMM5); +BROAD1282(7*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM13,XMM15,XMM6,XMM6); +VMA1282(XMM14,XMM15,XMM7,XMM7); +ADDQ1280($8*SIZE,_ptr__A_0); +ADDQ1280($8*SIZE,_ptr__B_0); +._L_7_loopE:; +TESTQ1280($1,_bk_l); +JLE ._L_8_loopE; +ALIGN_4; +._L_8_bodyB:; +PREFETCH01280(160*SIZE(_ptr__A_0),0); +BROAD1282(0*SIZE(_ptr__B_0),XMM15); +VLD1282(0*SIZE(_ptr__A_0),XMM13); +VMA1282(XMM13,XMM15,XMM0,XMM0); +VLD1282(2*SIZE(_ptr__A_0),XMM14); +VMA1282(XMM14,XMM15,XMM1,XMM1); +BROAD1282(1*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM13,XMM15,XMM2,XMM2); +VMA1282(XMM14,XMM15,XMM3,XMM3); +BROAD1282(2*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM13,XMM15,XMM4,XMM4); +VMA1282(XMM14,XMM15,XMM5,XMM5); +BROAD1282(3*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM13,XMM15,XMM6,XMM6); +VMA1282(XMM14,XMM15,XMM7,XMM7); +ADDQ1280($4*SIZE,_ptr__A_0); +ADDQ1280($4*SIZE,_ptr__B_0); +._L_8_loopE:; +BROAD1282(alpha,XMM8); +VLDU1282(0*SIZE(_ptr__C_0),XMM9); +VMA21282(XMM8,XMM0,XMM9,XMM0); +VSTU1282(XMM9,0*SIZE(_ptr__C_0)); +VLDU1282(2*SIZE(_ptr__C_0),XMM10); +VMA21282(XMM8,XMM1,XMM10,XMM1); +VSTU1282(XMM10,2*SIZE(_ptr__C_0)); +VLDU1282(0*SIZE(_ptr__C_0,LDC,1),XMM11); +VMA21282(XMM8,XMM2,XMM11,XMM2); +VSTU1282(XMM11,0*SIZE(_ptr__C_0,LDC,1)); +VLDU1282(2*SIZE(_ptr__C_0,LDC,1),XMM12); +VMA21282(XMM8,XMM3,XMM12,XMM3); +VSTU1282(XMM12,2*SIZE(_ptr__C_0,LDC,1)); +VLDU1282(0*SIZE(_ptr__C_1),XMM13); +VMA21282(XMM8,XMM4,XMM13,XMM4); +VSTU1282(XMM13,0*SIZE(_ptr__C_1)); +VLDU1282(2*SIZE(_ptr__C_1),XMM14); +VMA21282(XMM8,XMM5,XMM14,XMM5); +VSTU1282(XMM14,2*SIZE(_ptr__C_1)); +VLDU1282(0*SIZE(_ptr__C_1,LDC,1),XMM15); +VMA21282(XMM8,XMM6,XMM15,XMM6); +VSTU1282(XMM15,0*SIZE(_ptr__C_1,LDC,1)); +VLDU1282(2*SIZE(_ptr__C_1,LDC,1),XMM9); +VMA21282(XMM8,XMM7,XMM9,XMM7); +VSTU1282(XMM9,2*SIZE(_ptr__C_1,LDC,1)); +ADDQ1280($4*SIZE,_ptr__C_0); +ADDQ1280($4*SIZE,_ptr__C_1); +._L_5_loopE:; +TESTQ1280($2,i); +JLE ._L_9_loopE; +ALIGN_4; +._L_9_bodyB:; +MOVQ1280(_ptr_B,_ptr__B_0); +VXOR1282(XMM0,XMM0,XMM0); +VXOR1282(XMM1,XMM1,XMM1); +VXOR1282(XMM2,XMM2,XMM2); +VXOR1282(XMM3,XMM3,XMM3); +PREFETCHN1280(3*SIZE(_ptr__C_0),N); +PREFETCHN1280(11*SIZE(_ptr__C_0,LDC,1),N); +PREFETCHN1280(3*SIZE(_ptr__C_1),N); +PREFETCHN1280(11*SIZE(_ptr__C_1,LDC,1),N); +MOVQ1280(_bk_l,k); +SARQ1280($2,k); +JLE ._L_10_loopE; +ALIGN_4; +._L_10_bodyB:; +PREFETCH01280(160*SIZE(_ptr__A_0),0); +BROAD1282(0*SIZE(_ptr__B_0),XMM15); +VLD1282(0*SIZE(_ptr__A_0),XMM14); +VMA1282(XMM14,XMM15,XMM0,XMM0); +BROAD1282(1*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM14,XMM15,XMM1,XMM1); +BROAD1282(2*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM14,XMM15,XMM2,XMM2); +BROAD1282(3*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM14,XMM15,XMM3,XMM3); +PREFETCH21280(168*SIZE(_ptr__A_0),2); +BROAD1282(4*SIZE(_ptr__B_0),XMM15); +VLD1282(2*SIZE(_ptr__A_0),XMM14); +VMA1282(XMM14,XMM15,XMM0,XMM0); +BROAD1282(5*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM14,XMM15,XMM1,XMM1); +BROAD1282(6*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM14,XMM15,XMM2,XMM2); +BROAD1282(7*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM14,XMM15,XMM3,XMM3); +PREFETCH01280(176*SIZE(_ptr__A_0),0); +BROAD1282(8*SIZE(_ptr__B_0),XMM15); +VLD1282(4*SIZE(_ptr__A_0),XMM14); +VMA1282(XMM14,XMM15,XMM0,XMM0); +BROAD1282(9*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM14,XMM15,XMM1,XMM1); +BROAD1282(10*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM14,XMM15,XMM2,XMM2); +BROAD1282(11*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM14,XMM15,XMM3,XMM3); +PREFETCH21280(184*SIZE(_ptr__A_0),2); +BROAD1282(12*SIZE(_ptr__B_0),XMM15); 
+VLD1282(6*SIZE(_ptr__A_0),XMM14); +VMA1282(XMM14,XMM15,XMM0,XMM0); +BROAD1282(13*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM14,XMM15,XMM1,XMM1); +BROAD1282(14*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM14,XMM15,XMM2,XMM2); +BROAD1282(15*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM14,XMM15,XMM3,XMM3); +ADDQ1280($8*SIZE,_ptr__A_0); +ADDQ1280($16*SIZE,_ptr__B_0); +._L_10_bodyE:; +DECQ1280(k); +JG ._L_10_bodyB; +ALIGN_4; +._L_10_loopE:; +TESTQ1280($2,_bk_l); +JLE ._L_11_loopE; +ALIGN_4; +._L_11_bodyB:; +PREFETCH01280(160*SIZE(_ptr__A_0),0); +BROAD1282(0*SIZE(_ptr__B_0),XMM15); +VLD1282(0*SIZE(_ptr__A_0),XMM14); +VMA1282(XMM14,XMM15,XMM0,XMM0); +BROAD1282(1*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM14,XMM15,XMM1,XMM1); +BROAD1282(2*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM14,XMM15,XMM2,XMM2); +BROAD1282(3*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM14,XMM15,XMM3,XMM3); +PREFETCH21280(168*SIZE(_ptr__A_0),2); +BROAD1282(4*SIZE(_ptr__B_0),XMM15); +VLD1282(2*SIZE(_ptr__A_0),XMM14); +VMA1282(XMM14,XMM15,XMM0,XMM0); +BROAD1282(5*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM14,XMM15,XMM1,XMM1); +BROAD1282(6*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM14,XMM15,XMM2,XMM2); +BROAD1282(7*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM14,XMM15,XMM3,XMM3); +ADDQ1280($4*SIZE,_ptr__A_0); +ADDQ1280($8*SIZE,_ptr__B_0); +._L_11_loopE:; +TESTQ1280($1,_bk_l); +JLE ._L_12_loopE; +ALIGN_4; +._L_12_bodyB:; +PREFETCH01280(160*SIZE(_ptr__A_0),0); +BROAD1282(0*SIZE(_ptr__B_0),XMM15); +VLD1282(0*SIZE(_ptr__A_0),XMM14); +VMA1282(XMM14,XMM15,XMM0,XMM0); +BROAD1282(1*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM14,XMM15,XMM1,XMM1); +BROAD1282(2*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM14,XMM15,XMM2,XMM2); +BROAD1282(3*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM14,XMM15,XMM3,XMM3); +ADDQ1280($2*SIZE,_ptr__A_0); +ADDQ1280($4*SIZE,_ptr__B_0); +._L_12_loopE:; +BROAD1282(alpha,XMM4); +VLDU1282(0*SIZE(_ptr__C_0),XMM5); +VMA21282(XMM4,XMM0,XMM5,XMM0); +VSTU1282(XMM5,0*SIZE(_ptr__C_0)); +VLDU1282(0*SIZE(_ptr__C_0,LDC,1),XMM6); +VMA21282(XMM4,XMM1,XMM6,XMM1); +VSTU1282(XMM6,0*SIZE(_ptr__C_0,LDC,1)); +VLDU1282(0*SIZE(_ptr__C_1),XMM7); +VMA21282(XMM4,XMM2,XMM7,XMM2); +VSTU1282(XMM7,0*SIZE(_ptr__C_1)); +VLDU1282(0*SIZE(_ptr__C_1,LDC,1),XMM8); +VMA21282(XMM4,XMM3,XMM8,XMM3); +VSTU1282(XMM8,0*SIZE(_ptr__C_1,LDC,1)); +ADDQ1280($2*SIZE,_ptr__C_0); +ADDQ1280($2*SIZE,_ptr__C_1); +._L_9_loopE:; +TESTQ1280($1,i); +JLE ._L_13_loopE; +ALIGN_4; +._L_13_bodyB:; +MOVQ1280(_ptr_B,_ptr__B_0); +VXOR1281(XMM0,XMM0,XMM0); +VXOR1281(XMM1,XMM1,XMM1); +VXOR1281(XMM2,XMM2,XMM2); +VXOR1281(XMM3,XMM3,XMM3); +PREFETCHN1280(3*SIZE(_ptr__C_0),N); +PREFETCHN1280(11*SIZE(_ptr__C_0,LDC,1),N); +PREFETCHN1280(3*SIZE(_ptr__C_1),N); +PREFETCHN1280(11*SIZE(_ptr__C_1,LDC,1),N); +MOVQ1280(_bk_l,k); +SARQ1280($2,k); +JLE ._L_14_loopE; +ALIGN_4; +._L_14_bodyB:; +PREFETCH01280(160*SIZE(_ptr__A_0),0); +BROAD1281(0*SIZE(_ptr__B_0),XMM15); +VLD1281(0*SIZE(_ptr__A_0),XMM14); +VMA1281(XMM14,XMM15,XMM0,XMM0); +BROAD1281(1*SIZE(_ptr__B_0),XMM15); +VMA1281(XMM14,XMM15,XMM1,XMM1); +BROAD1281(2*SIZE(_ptr__B_0),XMM15); +VMA1281(XMM14,XMM15,XMM2,XMM2); +BROAD1281(3*SIZE(_ptr__B_0),XMM15); +VMA1281(XMM14,XMM15,XMM3,XMM3); +PREFETCH21280(168*SIZE(_ptr__A_0),2); +BROAD1281(4*SIZE(_ptr__B_0),XMM15); +VLD1281(1*SIZE(_ptr__A_0),XMM14); +VMA1281(XMM14,XMM15,XMM0,XMM0); +BROAD1281(5*SIZE(_ptr__B_0),XMM15); +VMA1281(XMM14,XMM15,XMM1,XMM1); +BROAD1281(6*SIZE(_ptr__B_0),XMM15); +VMA1281(XMM14,XMM15,XMM2,XMM2); +BROAD1281(7*SIZE(_ptr__B_0),XMM15); +VMA1281(XMM14,XMM15,XMM3,XMM3); +PREFETCH01280(176*SIZE(_ptr__A_0),0); +BROAD1281(8*SIZE(_ptr__B_0),XMM15); 
+VLD1281(2*SIZE(_ptr__A_0),XMM14); +VMA1281(XMM14,XMM15,XMM0,XMM0); +BROAD1281(9*SIZE(_ptr__B_0),XMM15); +VMA1281(XMM14,XMM15,XMM1,XMM1); +BROAD1281(10*SIZE(_ptr__B_0),XMM15); +VMA1281(XMM14,XMM15,XMM2,XMM2); +BROAD1281(11*SIZE(_ptr__B_0),XMM15); +VMA1281(XMM14,XMM15,XMM3,XMM3); +PREFETCH21280(184*SIZE(_ptr__A_0),2); +BROAD1281(12*SIZE(_ptr__B_0),XMM15); +VLD1281(3*SIZE(_ptr__A_0),XMM14); +VMA1281(XMM14,XMM15,XMM0,XMM0); +BROAD1281(13*SIZE(_ptr__B_0),XMM15); +VMA1281(XMM14,XMM15,XMM1,XMM1); +BROAD1281(14*SIZE(_ptr__B_0),XMM15); +VMA1281(XMM14,XMM15,XMM2,XMM2); +BROAD1281(15*SIZE(_ptr__B_0),XMM15); +VMA1281(XMM14,XMM15,XMM3,XMM3); +ADDQ1280($4*SIZE,_ptr__A_0); +ADDQ1280($16*SIZE,_ptr__B_0); +._L_14_bodyE:; +DECQ1280(k); +JG ._L_14_bodyB; +ALIGN_4; +._L_14_loopE:; +TESTQ1280($2,_bk_l); +JLE ._L_15_loopE; +ALIGN_4; +._L_15_bodyB:; +PREFETCH01280(160*SIZE(_ptr__A_0),0); +BROAD1281(0*SIZE(_ptr__B_0),XMM15); +VLD1281(0*SIZE(_ptr__A_0),XMM14); +VMA1281(XMM14,XMM15,XMM0,XMM0); +BROAD1281(1*SIZE(_ptr__B_0),XMM15); +VMA1281(XMM14,XMM15,XMM1,XMM1); +BROAD1281(2*SIZE(_ptr__B_0),XMM15); +VMA1281(XMM14,XMM15,XMM2,XMM2); +BROAD1281(3*SIZE(_ptr__B_0),XMM15); +VMA1281(XMM14,XMM15,XMM3,XMM3); +PREFETCH21280(168*SIZE(_ptr__A_0),2); +BROAD1281(4*SIZE(_ptr__B_0),XMM15); +VLD1281(1*SIZE(_ptr__A_0),XMM14); +VMA1281(XMM14,XMM15,XMM0,XMM0); +BROAD1281(5*SIZE(_ptr__B_0),XMM15); +VMA1281(XMM14,XMM15,XMM1,XMM1); +BROAD1281(6*SIZE(_ptr__B_0),XMM15); +VMA1281(XMM14,XMM15,XMM2,XMM2); +BROAD1281(7*SIZE(_ptr__B_0),XMM15); +VMA1281(XMM14,XMM15,XMM3,XMM3); +ADDQ1280($2*SIZE,_ptr__A_0); +ADDQ1280($8*SIZE,_ptr__B_0); +._L_15_loopE:; +TESTQ1280($1,_bk_l); +JLE ._L_16_loopE; +ALIGN_4; +._L_16_bodyB:; +PREFETCH01280(160*SIZE(_ptr__A_0),0); +BROAD1281(0*SIZE(_ptr__B_0),XMM15); +VLD1281(0*SIZE(_ptr__A_0),XMM14); +VMA1281(XMM14,XMM15,XMM0,XMM0); +BROAD1281(1*SIZE(_ptr__B_0),XMM15); +VMA1281(XMM14,XMM15,XMM1,XMM1); +BROAD1281(2*SIZE(_ptr__B_0),XMM15); +VMA1281(XMM14,XMM15,XMM2,XMM2); +BROAD1281(3*SIZE(_ptr__B_0),XMM15); +VMA1281(XMM14,XMM15,XMM3,XMM3); +ADDQ1280($1*SIZE,_ptr__A_0); +ADDQ1280($4*SIZE,_ptr__B_0); +._L_16_loopE:; +BROAD1281(alpha,XMM4); +VLDU1281(0*SIZE(_ptr__C_0),XMM5); +VMA21281(XMM4,XMM0,XMM5,XMM0); +VSTU1281(XMM5,0*SIZE(_ptr__C_0)); +VLDU1281(0*SIZE(_ptr__C_0,LDC,1),XMM6); +VMA21281(XMM4,XMM1,XMM6,XMM1); +VSTU1281(XMM6,0*SIZE(_ptr__C_0,LDC,1)); +VLDU1281(0*SIZE(_ptr__C_1),XMM7); +VMA21281(XMM4,XMM2,XMM7,XMM2); +VSTU1281(XMM7,0*SIZE(_ptr__C_1)); +VLDU1281(0*SIZE(_ptr__C_1,LDC,1),XMM8); +VMA21281(XMM4,XMM3,XMM8,XMM3); +VSTU1281(XMM8,0*SIZE(_ptr__C_1,LDC,1)); +ADDQ1280($1*SIZE,_ptr__C_0); +ADDQ1280($1*SIZE,_ptr__C_1); +._L_13_loopE:; +MOVQ1280(LDC,%rax); +SALQ1280($2,%rax); +ADDQ1280(%rax,_ptr_C); +MOVQ1280(_bk_l,%rax); +SALQ1280($5,%rax); +ADDQ1280(%rax,_ptr_B); +._L_0_bodyE:; +DECQ1280(j); +JG ._L_0_bodyB; +ALIGN_4; +._L_0_loopE:; +TESTQ1280($2,_bk_j); +JLE ._L_17_loopE; +ALIGN_4; +._L_17_bodyB:; +MOVQ1280(_ptr_A,_ptr__A_0); +MOVQ1280(_ptr_C,_ptr__C_0); +LEAQ1280((_ptr_C,LDC,1),_ptr__C_1); +MOVQ1280(_bk_l,%rax); +SALQ1280($4,%rax); +ADDQ1280(%rax,_pre_B); +MOVQ1280(_bk_i,i); +CMPQ1280($6,i); +JL ._L_18_loopE; +._L_18_bodyB:; +MOVQ1280(_ptr_B,_ptr__B_0); +VXOR1282(XMM0,XMM0,XMM0); +VXOR1282(XMM1,XMM1,XMM1); +VXOR1282(XMM2,XMM2,XMM2); +VXOR1282(XMM3,XMM3,XMM3); +VXOR1282(XMM4,XMM4,XMM4); +VXOR1282(XMM5,XMM5,XMM5); +PREFETCHN1280(3*SIZE(_ptr__C_0),N); +PREFETCHN1280(11*SIZE(_ptr__C_0,LDC,1),N); +PREFETCHN1280(3*SIZE(_ptr__C_1),N); +PREFETCHN1280(11*SIZE(_ptr__C_1,LDC,1),N); +MOVQ1280(_bk_l,k); +SARQ1280($2,k); +JLE 
._L_19_loopE; +ALIGN_4; +._L_19_bodyB:; +PREFETCH01280(160*SIZE(_ptr__A_0),0); +BROAD1282(0*SIZE(_ptr__B_0),XMM15); +VLD1282(0*SIZE(_ptr__A_0),XMM12); +VMA1282(XMM12,XMM15,XMM0,XMM0); +VLD1282(2*SIZE(_ptr__A_0),XMM13); +VMA1282(XMM13,XMM15,XMM1,XMM1); +VLD1282(4*SIZE(_ptr__A_0),XMM14); +VMA1282(XMM14,XMM15,XMM2,XMM2); +BROAD1282(1*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM12,XMM15,XMM3,XMM3); +VMA1282(XMM13,XMM15,XMM4,XMM4); +VMA1282(XMM14,XMM15,XMM5,XMM5); +PREFETCH21280(168*SIZE(_ptr__A_0),2); +BROAD1282(2*SIZE(_ptr__B_0),XMM15); +VLD1282(6*SIZE(_ptr__A_0),XMM12); +VMA1282(XMM12,XMM15,XMM0,XMM0); +VLD1282(8*SIZE(_ptr__A_0),XMM13); +VMA1282(XMM13,XMM15,XMM1,XMM1); +VLD1282(10*SIZE(_ptr__A_0),XMM14); +VMA1282(XMM14,XMM15,XMM2,XMM2); +BROAD1282(3*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM12,XMM15,XMM3,XMM3); +VMA1282(XMM13,XMM15,XMM4,XMM4); +VMA1282(XMM14,XMM15,XMM5,XMM5); +PREFETCH01280(176*SIZE(_ptr__A_0),0); +BROAD1282(4*SIZE(_ptr__B_0),XMM15); +VLD1282(12*SIZE(_ptr__A_0),XMM12); +VMA1282(XMM12,XMM15,XMM0,XMM0); +VLD1282(14*SIZE(_ptr__A_0),XMM13); +VMA1282(XMM13,XMM15,XMM1,XMM1); +VLD1282(16*SIZE(_ptr__A_0),XMM14); +VMA1282(XMM14,XMM15,XMM2,XMM2); +BROAD1282(5*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM12,XMM15,XMM3,XMM3); +VMA1282(XMM13,XMM15,XMM4,XMM4); +VMA1282(XMM14,XMM15,XMM5,XMM5); +PREFETCH21280(184*SIZE(_ptr__A_0),2); +BROAD1282(6*SIZE(_ptr__B_0),XMM15); +VLD1282(18*SIZE(_ptr__A_0),XMM12); +VMA1282(XMM12,XMM15,XMM0,XMM0); +VLD1282(20*SIZE(_ptr__A_0),XMM13); +VMA1282(XMM13,XMM15,XMM1,XMM1); +VLD1282(22*SIZE(_ptr__A_0),XMM14); +VMA1282(XMM14,XMM15,XMM2,XMM2); +BROAD1282(7*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM12,XMM15,XMM3,XMM3); +VMA1282(XMM13,XMM15,XMM4,XMM4); +VMA1282(XMM14,XMM15,XMM5,XMM5); +ADDQ1280($24*SIZE,_ptr__A_0); +ADDQ1280($8*SIZE,_ptr__B_0); +._L_19_bodyE:; +DECQ1280(k); +JG ._L_19_bodyB; +ALIGN_4; +._L_19_loopE:; +TESTQ1280($2,_bk_l); +JLE ._L_20_loopE; +ALIGN_4; +._L_20_bodyB:; +PREFETCH01280(160*SIZE(_ptr__A_0),0); +BROAD1282(0*SIZE(_ptr__B_0),XMM15); +VLD1282(0*SIZE(_ptr__A_0),XMM12); +VMA1282(XMM12,XMM15,XMM0,XMM0); +VLD1282(2*SIZE(_ptr__A_0),XMM13); +VMA1282(XMM13,XMM15,XMM1,XMM1); +VLD1282(4*SIZE(_ptr__A_0),XMM14); +VMA1282(XMM14,XMM15,XMM2,XMM2); +BROAD1282(1*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM12,XMM15,XMM3,XMM3); +VMA1282(XMM13,XMM15,XMM4,XMM4); +VMA1282(XMM14,XMM15,XMM5,XMM5); +PREFETCH21280(168*SIZE(_ptr__A_0),2); +BROAD1282(2*SIZE(_ptr__B_0),XMM15); +VLD1282(6*SIZE(_ptr__A_0),XMM12); +VMA1282(XMM12,XMM15,XMM0,XMM0); +VLD1282(8*SIZE(_ptr__A_0),XMM13); +VMA1282(XMM13,XMM15,XMM1,XMM1); +VLD1282(10*SIZE(_ptr__A_0),XMM14); +VMA1282(XMM14,XMM15,XMM2,XMM2); +BROAD1282(3*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM12,XMM15,XMM3,XMM3); +VMA1282(XMM13,XMM15,XMM4,XMM4); +VMA1282(XMM14,XMM15,XMM5,XMM5); +ADDQ1280($12*SIZE,_ptr__A_0); +ADDQ1280($4*SIZE,_ptr__B_0); +._L_20_loopE:; +TESTQ1280($1,_bk_l); +JLE ._L_21_loopE; +ALIGN_4; +._L_21_bodyB:; +PREFETCH01280(160*SIZE(_ptr__A_0),0); +BROAD1282(0*SIZE(_ptr__B_0),XMM15); +VLD1282(0*SIZE(_ptr__A_0),XMM12); +VMA1282(XMM12,XMM15,XMM0,XMM0); +VLD1282(2*SIZE(_ptr__A_0),XMM13); +VMA1282(XMM13,XMM15,XMM1,XMM1); +VLD1282(4*SIZE(_ptr__A_0),XMM14); +VMA1282(XMM14,XMM15,XMM2,XMM2); +BROAD1282(1*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM12,XMM15,XMM3,XMM3); +VMA1282(XMM13,XMM15,XMM4,XMM4); +VMA1282(XMM14,XMM15,XMM5,XMM5); +ADDQ1280($6*SIZE,_ptr__A_0); +ADDQ1280($2*SIZE,_ptr__B_0); +._L_21_loopE:; +BROAD1282(alpha,XMM6); +VLDU1282(0*SIZE(_ptr__C_0),XMM7); +VMA21282(XMM6,XMM0,XMM7,XMM0); +VSTU1282(XMM7,0*SIZE(_ptr__C_0)); +VLDU1282(2*SIZE(_ptr__C_0),XMM8); 
+VMA21282(XMM6,XMM1,XMM8,XMM1); +VSTU1282(XMM8,2*SIZE(_ptr__C_0)); +VLDU1282(4*SIZE(_ptr__C_0),XMM9); +VMA21282(XMM6,XMM2,XMM9,XMM2); +VSTU1282(XMM9,4*SIZE(_ptr__C_0)); +VLDU1282(0*SIZE(_ptr__C_0,LDC,1),XMM10); +VMA21282(XMM6,XMM3,XMM10,XMM3); +VSTU1282(XMM10,0*SIZE(_ptr__C_0,LDC,1)); +VLDU1282(2*SIZE(_ptr__C_0,LDC,1),XMM11); +VMA21282(XMM6,XMM4,XMM11,XMM4); +VSTU1282(XMM11,2*SIZE(_ptr__C_0,LDC,1)); +VLDU1282(4*SIZE(_ptr__C_0,LDC,1),XMM12); +VMA21282(XMM6,XMM5,XMM12,XMM5); +VSTU1282(XMM12,4*SIZE(_ptr__C_0,LDC,1)); +ADDQ1280($6*SIZE,_ptr__C_0); +ADDQ1280($6*SIZE,_ptr__C_1); +._L_18_bodyE:; +SUBQ1280($6,i); +JG ._L_18_bodyB; +ALIGN_4; +._L_18_loopE:; +TESTQ1280($4,i); +JLE ._L_22_loopE; +ALIGN_4; +._L_22_bodyB:; +MOVQ1280(_ptr_B,_ptr__B_0); +VXOR1282(XMM0,XMM0,XMM0); +VXOR1282(XMM1,XMM1,XMM1); +VXOR1282(XMM2,XMM2,XMM2); +VXOR1282(XMM3,XMM3,XMM3); +PREFETCHN1280(3*SIZE(_ptr__C_0),N); +PREFETCHN1280(11*SIZE(_ptr__C_0,LDC,1),N); +PREFETCHN1280(3*SIZE(_ptr__C_1),N); +PREFETCHN1280(11*SIZE(_ptr__C_1,LDC,1),N); +MOVQ1280(_bk_l,k); +SARQ1280($2,k); +JLE ._L_23_loopE; +ALIGN_4; +._L_23_bodyB:; +PREFETCH01280(160*SIZE(_ptr__A_0),0); +BROAD1282(0*SIZE(_ptr__B_0),XMM15); +VLD1282(0*SIZE(_ptr__A_0),XMM13); +VMA1282(XMM13,XMM15,XMM0,XMM0); +VLD1282(2*SIZE(_ptr__A_0),XMM14); +VMA1282(XMM14,XMM15,XMM1,XMM1); +BROAD1282(1*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM13,XMM15,XMM2,XMM2); +VMA1282(XMM14,XMM15,XMM3,XMM3); +PREFETCH21280(168*SIZE(_ptr__A_0),2); +BROAD1282(2*SIZE(_ptr__B_0),XMM15); +VLD1282(4*SIZE(_ptr__A_0),XMM13); +VMA1282(XMM13,XMM15,XMM0,XMM0); +VLD1282(6*SIZE(_ptr__A_0),XMM14); +VMA1282(XMM14,XMM15,XMM1,XMM1); +BROAD1282(3*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM13,XMM15,XMM2,XMM2); +VMA1282(XMM14,XMM15,XMM3,XMM3); +PREFETCH01280(176*SIZE(_ptr__A_0),0); +BROAD1282(4*SIZE(_ptr__B_0),XMM15); +VLD1282(8*SIZE(_ptr__A_0),XMM13); +VMA1282(XMM13,XMM15,XMM0,XMM0); +VLD1282(10*SIZE(_ptr__A_0),XMM14); +VMA1282(XMM14,XMM15,XMM1,XMM1); +BROAD1282(5*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM13,XMM15,XMM2,XMM2); +VMA1282(XMM14,XMM15,XMM3,XMM3); +PREFETCH21280(184*SIZE(_ptr__A_0),2); +BROAD1282(6*SIZE(_ptr__B_0),XMM15); +VLD1282(12*SIZE(_ptr__A_0),XMM13); +VMA1282(XMM13,XMM15,XMM0,XMM0); +VLD1282(14*SIZE(_ptr__A_0),XMM14); +VMA1282(XMM14,XMM15,XMM1,XMM1); +BROAD1282(7*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM13,XMM15,XMM2,XMM2); +VMA1282(XMM14,XMM15,XMM3,XMM3); +ADDQ1280($16*SIZE,_ptr__A_0); +ADDQ1280($8*SIZE,_ptr__B_0); +._L_23_bodyE:; +DECQ1280(k); +JG ._L_23_bodyB; +ALIGN_4; +._L_23_loopE:; +TESTQ1280($2,_bk_l); +JLE ._L_24_loopE; +ALIGN_4; +._L_24_bodyB:; +PREFETCH01280(160*SIZE(_ptr__A_0),0); +BROAD1282(0*SIZE(_ptr__B_0),XMM15); +VLD1282(0*SIZE(_ptr__A_0),XMM13); +VMA1282(XMM13,XMM15,XMM0,XMM0); +VLD1282(2*SIZE(_ptr__A_0),XMM14); +VMA1282(XMM14,XMM15,XMM1,XMM1); +BROAD1282(1*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM13,XMM15,XMM2,XMM2); +VMA1282(XMM14,XMM15,XMM3,XMM3); +PREFETCH21280(168*SIZE(_ptr__A_0),2); +BROAD1282(2*SIZE(_ptr__B_0),XMM15); +VLD1282(4*SIZE(_ptr__A_0),XMM13); +VMA1282(XMM13,XMM15,XMM0,XMM0); +VLD1282(6*SIZE(_ptr__A_0),XMM14); +VMA1282(XMM14,XMM15,XMM1,XMM1); +BROAD1282(3*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM13,XMM15,XMM2,XMM2); +VMA1282(XMM14,XMM15,XMM3,XMM3); +ADDQ1280($8*SIZE,_ptr__A_0); +ADDQ1280($4*SIZE,_ptr__B_0); +._L_24_loopE:; +TESTQ1280($1,_bk_l); +JLE ._L_25_loopE; +ALIGN_4; +._L_25_bodyB:; +PREFETCH01280(160*SIZE(_ptr__A_0),0); +BROAD1282(0*SIZE(_ptr__B_0),XMM15); +VLD1282(0*SIZE(_ptr__A_0),XMM13); +VMA1282(XMM13,XMM15,XMM0,XMM0); +VLD1282(2*SIZE(_ptr__A_0),XMM14); +VMA1282(XMM14,XMM15,XMM1,XMM1); 
+BROAD1282(1*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM13,XMM15,XMM2,XMM2); +VMA1282(XMM14,XMM15,XMM3,XMM3); +ADDQ1280($4*SIZE,_ptr__A_0); +ADDQ1280($2*SIZE,_ptr__B_0); +._L_25_loopE:; +BROAD1282(alpha,XMM4); +VLDU1282(0*SIZE(_ptr__C_0),XMM5); +VMA21282(XMM4,XMM0,XMM5,XMM0); +VSTU1282(XMM5,0*SIZE(_ptr__C_0)); +VLDU1282(2*SIZE(_ptr__C_0),XMM6); +VMA21282(XMM4,XMM1,XMM6,XMM1); +VSTU1282(XMM6,2*SIZE(_ptr__C_0)); +VLDU1282(0*SIZE(_ptr__C_0,LDC,1),XMM7); +VMA21282(XMM4,XMM2,XMM7,XMM2); +VSTU1282(XMM7,0*SIZE(_ptr__C_0,LDC,1)); +VLDU1282(2*SIZE(_ptr__C_0,LDC,1),XMM8); +VMA21282(XMM4,XMM3,XMM8,XMM3); +VSTU1282(XMM8,2*SIZE(_ptr__C_0,LDC,1)); +ADDQ1280($4*SIZE,_ptr__C_0); +ADDQ1280($4*SIZE,_ptr__C_1); +._L_22_loopE:; +TESTQ1280($2,i); +JLE ._L_26_loopE; +ALIGN_4; +._L_26_bodyB:; +MOVQ1280(_ptr_B,_ptr__B_0); +VXOR1282(XMM0,XMM0,XMM0); +VXOR1282(XMM1,XMM1,XMM1); +PREFETCHN1280(3*SIZE(_ptr__C_0),N); +PREFETCHN1280(11*SIZE(_ptr__C_0,LDC,1),N); +PREFETCHN1280(3*SIZE(_ptr__C_1),N); +PREFETCHN1280(11*SIZE(_ptr__C_1,LDC,1),N); +MOVQ1280(_bk_l,k); +SARQ1280($2,k); +JLE ._L_27_loopE; +ALIGN_4; +._L_27_bodyB:; +PREFETCH01280(160*SIZE(_ptr__A_0),0); +BROAD1282(0*SIZE(_ptr__B_0),XMM15); +VLD1282(0*SIZE(_ptr__A_0),XMM14); +VMA1282(XMM14,XMM15,XMM0,XMM0); +BROAD1282(1*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM14,XMM15,XMM1,XMM1); +PREFETCH21280(168*SIZE(_ptr__A_0),2); +BROAD1282(2*SIZE(_ptr__B_0),XMM15); +VLD1282(2*SIZE(_ptr__A_0),XMM14); +VMA1282(XMM14,XMM15,XMM0,XMM0); +BROAD1282(3*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM14,XMM15,XMM1,XMM1); +PREFETCH01280(176*SIZE(_ptr__A_0),0); +BROAD1282(4*SIZE(_ptr__B_0),XMM15); +VLD1282(4*SIZE(_ptr__A_0),XMM14); +VMA1282(XMM14,XMM15,XMM0,XMM0); +BROAD1282(5*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM14,XMM15,XMM1,XMM1); +PREFETCH21280(184*SIZE(_ptr__A_0),2); +BROAD1282(6*SIZE(_ptr__B_0),XMM15); +VLD1282(6*SIZE(_ptr__A_0),XMM14); +VMA1282(XMM14,XMM15,XMM0,XMM0); +BROAD1282(7*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM14,XMM15,XMM1,XMM1); +ADDQ1280($8*SIZE,_ptr__A_0); +ADDQ1280($8*SIZE,_ptr__B_0); +._L_27_bodyE:; +DECQ1280(k); +JG ._L_27_bodyB; +ALIGN_4; +._L_27_loopE:; +TESTQ1280($2,_bk_l); +JLE ._L_28_loopE; +ALIGN_4; +._L_28_bodyB:; +PREFETCH01280(160*SIZE(_ptr__A_0),0); +BROAD1282(0*SIZE(_ptr__B_0),XMM15); +VLD1282(0*SIZE(_ptr__A_0),XMM14); +VMA1282(XMM14,XMM15,XMM0,XMM0); +BROAD1282(1*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM14,XMM15,XMM1,XMM1); +PREFETCH21280(168*SIZE(_ptr__A_0),2); +BROAD1282(2*SIZE(_ptr__B_0),XMM15); +VLD1282(2*SIZE(_ptr__A_0),XMM14); +VMA1282(XMM14,XMM15,XMM0,XMM0); +BROAD1282(3*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM14,XMM15,XMM1,XMM1); +ADDQ1280($4*SIZE,_ptr__A_0); +ADDQ1280($4*SIZE,_ptr__B_0); +._L_28_loopE:; +TESTQ1280($1,_bk_l); +JLE ._L_29_loopE; +ALIGN_4; +._L_29_bodyB:; +PREFETCH01280(160*SIZE(_ptr__A_0),0); +BROAD1282(0*SIZE(_ptr__B_0),XMM15); +VLD1282(0*SIZE(_ptr__A_0),XMM14); +VMA1282(XMM14,XMM15,XMM0,XMM0); +BROAD1282(1*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM14,XMM15,XMM1,XMM1); +ADDQ1280($2*SIZE,_ptr__A_0); +ADDQ1280($2*SIZE,_ptr__B_0); +._L_29_loopE:; +BROAD1282(alpha,XMM2); +VLDU1282(0*SIZE(_ptr__C_0),XMM3); +VMA21282(XMM2,XMM0,XMM3,XMM0); +VSTU1282(XMM3,0*SIZE(_ptr__C_0)); +VLDU1282(0*SIZE(_ptr__C_0,LDC,1),XMM4); +VMA21282(XMM2,XMM1,XMM4,XMM1); +VSTU1282(XMM4,0*SIZE(_ptr__C_0,LDC,1)); +ADDQ1280($2*SIZE,_ptr__C_0); +ADDQ1280($2*SIZE,_ptr__C_1); +._L_26_loopE:; +TESTQ1280($1,i); +JLE ._L_30_loopE; +ALIGN_4; +._L_30_bodyB:; +MOVQ1280(_ptr_B,_ptr__B_0); +VXOR1281(XMM0,XMM0,XMM0); +VXOR1281(XMM1,XMM1,XMM1); +PREFETCHN1280(3*SIZE(_ptr__C_0),N); 
+PREFETCHN1280(11*SIZE(_ptr__C_0,LDC,1),N); +PREFETCHN1280(3*SIZE(_ptr__C_1),N); +PREFETCHN1280(11*SIZE(_ptr__C_1,LDC,1),N); +MOVQ1280(_bk_l,k); +SARQ1280($2,k); +JLE ._L_31_loopE; +ALIGN_4; +._L_31_bodyB:; +PREFETCH01280(160*SIZE(_ptr__A_0),0); +BROAD1281(0*SIZE(_ptr__B_0),XMM15); +VLD1281(0*SIZE(_ptr__A_0),XMM14); +VMA1281(XMM14,XMM15,XMM0,XMM0); +BROAD1281(1*SIZE(_ptr__B_0),XMM15); +VMA1281(XMM14,XMM15,XMM1,XMM1); +PREFETCH21280(168*SIZE(_ptr__A_0),2); +BROAD1281(2*SIZE(_ptr__B_0),XMM15); +VLD1281(1*SIZE(_ptr__A_0),XMM14); +VMA1281(XMM14,XMM15,XMM0,XMM0); +BROAD1281(3*SIZE(_ptr__B_0),XMM15); +VMA1281(XMM14,XMM15,XMM1,XMM1); +PREFETCH01280(176*SIZE(_ptr__A_0),0); +BROAD1281(4*SIZE(_ptr__B_0),XMM15); +VLD1281(2*SIZE(_ptr__A_0),XMM14); +VMA1281(XMM14,XMM15,XMM0,XMM0); +BROAD1281(5*SIZE(_ptr__B_0),XMM15); +VMA1281(XMM14,XMM15,XMM1,XMM1); +PREFETCH21280(184*SIZE(_ptr__A_0),2); +BROAD1281(6*SIZE(_ptr__B_0),XMM15); +VLD1281(3*SIZE(_ptr__A_0),XMM14); +VMA1281(XMM14,XMM15,XMM0,XMM0); +BROAD1281(7*SIZE(_ptr__B_0),XMM15); +VMA1281(XMM14,XMM15,XMM1,XMM1); +ADDQ1280($4*SIZE,_ptr__A_0); +ADDQ1280($8*SIZE,_ptr__B_0); +._L_31_bodyE:; +DECQ1280(k); +JG ._L_31_bodyB; +ALIGN_4; +._L_31_loopE:; +TESTQ1280($2,_bk_l); +JLE ._L_32_loopE; +ALIGN_4; +._L_32_bodyB:; +PREFETCH01280(160*SIZE(_ptr__A_0),0); +BROAD1281(0*SIZE(_ptr__B_0),XMM15); +VLD1281(0*SIZE(_ptr__A_0),XMM14); +VMA1281(XMM14,XMM15,XMM0,XMM0); +BROAD1281(1*SIZE(_ptr__B_0),XMM15); +VMA1281(XMM14,XMM15,XMM1,XMM1); +PREFETCH21280(168*SIZE(_ptr__A_0),2); +BROAD1281(2*SIZE(_ptr__B_0),XMM15); +VLD1281(1*SIZE(_ptr__A_0),XMM14); +VMA1281(XMM14,XMM15,XMM0,XMM0); +BROAD1281(3*SIZE(_ptr__B_0),XMM15); +VMA1281(XMM14,XMM15,XMM1,XMM1); +ADDQ1280($2*SIZE,_ptr__A_0); +ADDQ1280($4*SIZE,_ptr__B_0); +._L_32_loopE:; +TESTQ1280($1,_bk_l); +JLE ._L_33_loopE; +ALIGN_4; +._L_33_bodyB:; +PREFETCH01280(160*SIZE(_ptr__A_0),0); +BROAD1281(0*SIZE(_ptr__B_0),XMM15); +VLD1281(0*SIZE(_ptr__A_0),XMM14); +VMA1281(XMM14,XMM15,XMM0,XMM0); +BROAD1281(1*SIZE(_ptr__B_0),XMM15); +VMA1281(XMM14,XMM15,XMM1,XMM1); +ADDQ1280($1*SIZE,_ptr__A_0); +ADDQ1280($2*SIZE,_ptr__B_0); +._L_33_loopE:; +BROAD1281(alpha,XMM2); +VLDU1281(0*SIZE(_ptr__C_0),XMM3); +VMA21281(XMM2,XMM0,XMM3,XMM0); +VSTU1281(XMM3,0*SIZE(_ptr__C_0)); +VLDU1281(0*SIZE(_ptr__C_0,LDC,1),XMM4); +VMA21281(XMM2,XMM1,XMM4,XMM1); +VSTU1281(XMM4,0*SIZE(_ptr__C_0,LDC,1)); +ADDQ1280($1*SIZE,_ptr__C_0); +ADDQ1280($1*SIZE,_ptr__C_1); +._L_30_loopE:; +MOVQ1280(LDC,%rax); +SALQ1280($1,%rax); +ADDQ1280(%rax,_ptr_C); +MOVQ1280(_bk_l,%rax); +SALQ1280($4,%rax); +ADDQ1280(%rax,_ptr_B); +._L_17_loopE:; +TESTQ1280($1,_bk_j); +JLE ._L_34_loopE; +ALIGN_4; +._L_34_bodyB:; +MOVQ1280(_ptr_A,_ptr__A_0); +MOVQ1280(_ptr_C,_ptr__C_0); +MOVQ1280(_bk_l,%rax); +SALQ1280($3,%rax); +ADDQ1280(%rax,_pre_B); +MOVQ1280(_bk_i,i); +CMPQ1280($6,i); +JL ._L_35_loopE; +._L_35_bodyB:; +MOVQ1280(_ptr_B,_ptr__B_0); +VXOR1282(XMM0,XMM0,XMM0); +VXOR1282(XMM1,XMM1,XMM1); +VXOR1282(XMM2,XMM2,XMM2); +PREFETCHN1280(3*SIZE(_ptr__C_0),N); +PREFETCHN1280(11*SIZE(_ptr__C_0,LDC,1),N); +PREFETCHN1280(3*SIZE(_ptr__C_1),N); +PREFETCHN1280(11*SIZE(_ptr__C_1,LDC,1),N); +MOVQ1280(_bk_l,k); +SARQ1280($2,k); +JLE ._L_36_loopE; +ALIGN_4; +._L_36_bodyB:; +PREFETCH01280(160*SIZE(_ptr__A_0),0); +BROAD1282(0*SIZE(_ptr__B_0),XMM15); +VLD1282(0*SIZE(_ptr__A_0),XMM12); +VMA1282(XMM12,XMM15,XMM0,XMM0); +VLD1282(2*SIZE(_ptr__A_0),XMM13); +VMA1282(XMM13,XMM15,XMM1,XMM1); +VLD1282(4*SIZE(_ptr__A_0),XMM14); +VMA1282(XMM14,XMM15,XMM2,XMM2); +PREFETCH21280(168*SIZE(_ptr__A_0),2); 
+BROAD1282(1*SIZE(_ptr__B_0),XMM15); +VLD1282(6*SIZE(_ptr__A_0),XMM12); +VMA1282(XMM12,XMM15,XMM0,XMM0); +VLD1282(8*SIZE(_ptr__A_0),XMM13); +VMA1282(XMM13,XMM15,XMM1,XMM1); +VLD1282(10*SIZE(_ptr__A_0),XMM14); +VMA1282(XMM14,XMM15,XMM2,XMM2); +PREFETCH01280(176*SIZE(_ptr__A_0),0); +BROAD1282(2*SIZE(_ptr__B_0),XMM15); +VLD1282(12*SIZE(_ptr__A_0),XMM12); +VMA1282(XMM12,XMM15,XMM0,XMM0); +VLD1282(14*SIZE(_ptr__A_0),XMM13); +VMA1282(XMM13,XMM15,XMM1,XMM1); +VLD1282(16*SIZE(_ptr__A_0),XMM14); +VMA1282(XMM14,XMM15,XMM2,XMM2); +PREFETCH21280(184*SIZE(_ptr__A_0),2); +BROAD1282(3*SIZE(_ptr__B_0),XMM15); +VLD1282(18*SIZE(_ptr__A_0),XMM12); +VMA1282(XMM12,XMM15,XMM0,XMM0); +VLD1282(20*SIZE(_ptr__A_0),XMM13); +VMA1282(XMM13,XMM15,XMM1,XMM1); +VLD1282(22*SIZE(_ptr__A_0),XMM14); +VMA1282(XMM14,XMM15,XMM2,XMM2); +ADDQ1280($24*SIZE,_ptr__A_0); +ADDQ1280($4*SIZE,_ptr__B_0); +._L_36_bodyE:; +DECQ1280(k); +JG ._L_36_bodyB; +ALIGN_4; +._L_36_loopE:; +TESTQ1280($2,_bk_l); +JLE ._L_37_loopE; +ALIGN_4; +._L_37_bodyB:; +PREFETCH01280(160*SIZE(_ptr__A_0),0); +BROAD1282(0*SIZE(_ptr__B_0),XMM15); +VLD1282(0*SIZE(_ptr__A_0),XMM12); +VMA1282(XMM12,XMM15,XMM0,XMM0); +VLD1282(2*SIZE(_ptr__A_0),XMM13); +VMA1282(XMM13,XMM15,XMM1,XMM1); +VLD1282(4*SIZE(_ptr__A_0),XMM14); +VMA1282(XMM14,XMM15,XMM2,XMM2); +PREFETCH21280(168*SIZE(_ptr__A_0),2); +BROAD1282(1*SIZE(_ptr__B_0),XMM15); +VLD1282(6*SIZE(_ptr__A_0),XMM12); +VMA1282(XMM12,XMM15,XMM0,XMM0); +VLD1282(8*SIZE(_ptr__A_0),XMM13); +VMA1282(XMM13,XMM15,XMM1,XMM1); +VLD1282(10*SIZE(_ptr__A_0),XMM14); +VMA1282(XMM14,XMM15,XMM2,XMM2); +ADDQ1280($12*SIZE,_ptr__A_0); +ADDQ1280($2*SIZE,_ptr__B_0); +._L_37_loopE:; +TESTQ1280($1,_bk_l); +JLE ._L_38_loopE; +ALIGN_4; +._L_38_bodyB:; +PREFETCH01280(160*SIZE(_ptr__A_0),0); +BROAD1282(0*SIZE(_ptr__B_0),XMM15); +VLD1282(0*SIZE(_ptr__A_0),XMM12); +VMA1282(XMM12,XMM15,XMM0,XMM0); +VLD1282(2*SIZE(_ptr__A_0),XMM13); +VMA1282(XMM13,XMM15,XMM1,XMM1); +VLD1282(4*SIZE(_ptr__A_0),XMM14); +VMA1282(XMM14,XMM15,XMM2,XMM2); +ADDQ1280($6*SIZE,_ptr__A_0); +ADDQ1280($1*SIZE,_ptr__B_0); +._L_38_loopE:; +BROAD1282(alpha,XMM3); +VLDU1282(0*SIZE(_ptr__C_0),XMM4); +VMA21282(XMM3,XMM0,XMM4,XMM0); +VSTU1282(XMM4,0*SIZE(_ptr__C_0)); +VLDU1282(2*SIZE(_ptr__C_0),XMM5); +VMA21282(XMM3,XMM1,XMM5,XMM1); +VSTU1282(XMM5,2*SIZE(_ptr__C_0)); +VLDU1282(4*SIZE(_ptr__C_0),XMM6); +VMA21282(XMM3,XMM2,XMM6,XMM2); +VSTU1282(XMM6,4*SIZE(_ptr__C_0)); +ADDQ1280($6*SIZE,_ptr__C_0); +ADDQ1280($6*SIZE,_ptr__C_1); +._L_35_bodyE:; +SUBQ1280($6,i); +JG ._L_35_bodyB; +ALIGN_4; +._L_35_loopE:; +TESTQ1280($4,i); +JLE ._L_39_loopE; +ALIGN_4; +._L_39_bodyB:; +MOVQ1280(_ptr_B,_ptr__B_0); +VXOR1282(XMM0,XMM0,XMM0); +VXOR1282(XMM1,XMM1,XMM1); +PREFETCHN1280(3*SIZE(_ptr__C_0),N); +PREFETCHN1280(11*SIZE(_ptr__C_0,LDC,1),N); +PREFETCHN1280(3*SIZE(_ptr__C_1),N); +PREFETCHN1280(11*SIZE(_ptr__C_1,LDC,1),N); +MOVQ1280(_bk_l,k); +SARQ1280($2,k); +JLE ._L_40_loopE; +ALIGN_4; +._L_40_bodyB:; +PREFETCH01280(160*SIZE(_ptr__A_0),0); +BROAD1282(0*SIZE(_ptr__B_0),XMM15); +VLD1282(0*SIZE(_ptr__A_0),XMM13); +VMA1282(XMM13,XMM15,XMM0,XMM0); +VLD1282(2*SIZE(_ptr__A_0),XMM14); +VMA1282(XMM14,XMM15,XMM1,XMM1); +PREFETCH21280(168*SIZE(_ptr__A_0),2); +BROAD1282(1*SIZE(_ptr__B_0),XMM15); +VLD1282(4*SIZE(_ptr__A_0),XMM13); +VMA1282(XMM13,XMM15,XMM0,XMM0); +VLD1282(6*SIZE(_ptr__A_0),XMM14); +VMA1282(XMM14,XMM15,XMM1,XMM1); +PREFETCH01280(176*SIZE(_ptr__A_0),0); +BROAD1282(2*SIZE(_ptr__B_0),XMM15); +VLD1282(8*SIZE(_ptr__A_0),XMM13); +VMA1282(XMM13,XMM15,XMM0,XMM0); +VLD1282(10*SIZE(_ptr__A_0),XMM14); 
+VMA1282(XMM14,XMM15,XMM1,XMM1); +PREFETCH21280(184*SIZE(_ptr__A_0),2); +BROAD1282(3*SIZE(_ptr__B_0),XMM15); +VLD1282(12*SIZE(_ptr__A_0),XMM13); +VMA1282(XMM13,XMM15,XMM0,XMM0); +VLD1282(14*SIZE(_ptr__A_0),XMM14); +VMA1282(XMM14,XMM15,XMM1,XMM1); +ADDQ1280($16*SIZE,_ptr__A_0); +ADDQ1280($4*SIZE,_ptr__B_0); +._L_40_bodyE:; +DECQ1280(k); +JG ._L_40_bodyB; +ALIGN_4; +._L_40_loopE:; +TESTQ1280($2,_bk_l); +JLE ._L_41_loopE; +ALIGN_4; +._L_41_bodyB:; +PREFETCH01280(160*SIZE(_ptr__A_0),0); +BROAD1282(0*SIZE(_ptr__B_0),XMM15); +VLD1282(0*SIZE(_ptr__A_0),XMM13); +VMA1282(XMM13,XMM15,XMM0,XMM0); +VLD1282(2*SIZE(_ptr__A_0),XMM14); +VMA1282(XMM14,XMM15,XMM1,XMM1); +PREFETCH21280(168*SIZE(_ptr__A_0),2); +BROAD1282(1*SIZE(_ptr__B_0),XMM15); +VLD1282(4*SIZE(_ptr__A_0),XMM13); +VMA1282(XMM13,XMM15,XMM0,XMM0); +VLD1282(6*SIZE(_ptr__A_0),XMM14); +VMA1282(XMM14,XMM15,XMM1,XMM1); +ADDQ1280($8*SIZE,_ptr__A_0); +ADDQ1280($2*SIZE,_ptr__B_0); +._L_41_loopE:; +TESTQ1280($1,_bk_l); +JLE ._L_42_loopE; +ALIGN_4; +._L_42_bodyB:; +PREFETCH01280(160*SIZE(_ptr__A_0),0); +BROAD1282(0*SIZE(_ptr__B_0),XMM15); +VLD1282(0*SIZE(_ptr__A_0),XMM13); +VMA1282(XMM13,XMM15,XMM0,XMM0); +VLD1282(2*SIZE(_ptr__A_0),XMM14); +VMA1282(XMM14,XMM15,XMM1,XMM1); +ADDQ1280($4*SIZE,_ptr__A_0); +ADDQ1280($1*SIZE,_ptr__B_0); +._L_42_loopE:; +BROAD1282(alpha,XMM2); +VLDU1282(0*SIZE(_ptr__C_0),XMM3); +VMA21282(XMM2,XMM0,XMM3,XMM0); +VSTU1282(XMM3,0*SIZE(_ptr__C_0)); +VLDU1282(2*SIZE(_ptr__C_0),XMM4); +VMA21282(XMM2,XMM1,XMM4,XMM1); +VSTU1282(XMM4,2*SIZE(_ptr__C_0)); +ADDQ1280($4*SIZE,_ptr__C_0); +ADDQ1280($4*SIZE,_ptr__C_1); +._L_39_loopE:; +TESTQ1280($2,i); +JLE ._L_43_loopE; +ALIGN_4; +._L_43_bodyB:; +MOVQ1280(_ptr_B,_ptr__B_0); +VXOR1282(XMM0,XMM0,XMM0); +PREFETCHN1280(3*SIZE(_ptr__C_0),N); +PREFETCHN1280(11*SIZE(_ptr__C_0,LDC,1),N); +PREFETCHN1280(3*SIZE(_ptr__C_1),N); +PREFETCHN1280(11*SIZE(_ptr__C_1,LDC,1),N); +MOVQ1280(_bk_l,k); +SARQ1280($2,k); +JLE ._L_44_loopE; +ALIGN_4; +._L_44_bodyB:; +PREFETCH01280(160*SIZE(_ptr__A_0),0); +BROAD1282(0*SIZE(_ptr__B_0),XMM15); +VLD1282(0*SIZE(_ptr__A_0),XMM14); +VMA1282(XMM14,XMM15,XMM0,XMM0); +PREFETCH21280(168*SIZE(_ptr__A_0),2); +BROAD1282(1*SIZE(_ptr__B_0),XMM15); +VLD1282(2*SIZE(_ptr__A_0),XMM14); +VMA1282(XMM14,XMM15,XMM0,XMM0); +PREFETCH01280(176*SIZE(_ptr__A_0),0); +BROAD1282(2*SIZE(_ptr__B_0),XMM15); +VLD1282(4*SIZE(_ptr__A_0),XMM14); +VMA1282(XMM14,XMM15,XMM0,XMM0); +PREFETCH21280(184*SIZE(_ptr__A_0),2); +BROAD1282(3*SIZE(_ptr__B_0),XMM15); +VLD1282(6*SIZE(_ptr__A_0),XMM14); +VMA1282(XMM14,XMM15,XMM0,XMM0); +ADDQ1280($8*SIZE,_ptr__A_0); +ADDQ1280($4*SIZE,_ptr__B_0); +._L_44_bodyE:; +DECQ1280(k); +JG ._L_44_bodyB; +ALIGN_4; +._L_44_loopE:; +TESTQ1280($2,_bk_l); +JLE ._L_45_loopE; +ALIGN_4; +._L_45_bodyB:; +PREFETCH01280(160*SIZE(_ptr__A_0),0); +BROAD1282(0*SIZE(_ptr__B_0),XMM15); +VLD1282(0*SIZE(_ptr__A_0),XMM14); +VMA1282(XMM14,XMM15,XMM0,XMM0); +PREFETCH21280(168*SIZE(_ptr__A_0),2); +BROAD1282(1*SIZE(_ptr__B_0),XMM15); +VLD1282(2*SIZE(_ptr__A_0),XMM14); +VMA1282(XMM14,XMM15,XMM0,XMM0); +ADDQ1280($4*SIZE,_ptr__A_0); +ADDQ1280($2*SIZE,_ptr__B_0); +._L_45_loopE:; +TESTQ1280($1,_bk_l); +JLE ._L_46_loopE; +ALIGN_4; +._L_46_bodyB:; +PREFETCH01280(160*SIZE(_ptr__A_0),0); +BROAD1282(0*SIZE(_ptr__B_0),XMM15); +VLD1282(0*SIZE(_ptr__A_0),XMM14); +VMA1282(XMM14,XMM15,XMM0,XMM0); +ADDQ1280($2*SIZE,_ptr__A_0); +ADDQ1280($1*SIZE,_ptr__B_0); +._L_46_loopE:; +BROAD1282(alpha,XMM1); +VLDU1282(0*SIZE(_ptr__C_0),XMM2); +VMA21282(XMM1,XMM0,XMM2,XMM0); +VSTU1282(XMM2,0*SIZE(_ptr__C_0)); +ADDQ1280($2*SIZE,_ptr__C_0); 
+ADDQ1280($2*SIZE,_ptr__C_1); +._L_43_loopE:; +TESTQ1280($1,i); +JLE ._L_47_loopE; +ALIGN_4; +._L_47_bodyB:; +MOVQ1280(_ptr_B,_ptr__B_0); +VXOR1281(XMM0,XMM0,XMM0); +PREFETCHN1280(3*SIZE(_ptr__C_0),N); +PREFETCHN1280(11*SIZE(_ptr__C_0,LDC,1),N); +PREFETCHN1280(3*SIZE(_ptr__C_1),N); +PREFETCHN1280(11*SIZE(_ptr__C_1,LDC,1),N); +MOVQ1280(_bk_l,k); +SARQ1280($2,k); +JLE ._L_48_loopE; +ALIGN_4; +._L_48_bodyB:; +PREFETCH01280(160*SIZE(_ptr__A_0),0); +BROAD1281(0*SIZE(_ptr__B_0),XMM15); +VLD1281(0*SIZE(_ptr__A_0),XMM14); +VMA1281(XMM14,XMM15,XMM0,XMM0); +PREFETCH21280(168*SIZE(_ptr__A_0),2); +BROAD1281(1*SIZE(_ptr__B_0),XMM15); +VLD1281(1*SIZE(_ptr__A_0),XMM14); +VMA1281(XMM14,XMM15,XMM0,XMM0); +PREFETCH01280(176*SIZE(_ptr__A_0),0); +BROAD1281(2*SIZE(_ptr__B_0),XMM15); +VLD1281(2*SIZE(_ptr__A_0),XMM14); +VMA1281(XMM14,XMM15,XMM0,XMM0); +PREFETCH21280(184*SIZE(_ptr__A_0),2); +BROAD1281(3*SIZE(_ptr__B_0),XMM15); +VLD1281(3*SIZE(_ptr__A_0),XMM14); +VMA1281(XMM14,XMM15,XMM0,XMM0); +ADDQ1280($4*SIZE,_ptr__A_0); +ADDQ1280($4*SIZE,_ptr__B_0); +._L_48_bodyE:; +DECQ1280(k); +JG ._L_48_bodyB; +ALIGN_4; +._L_48_loopE:; +TESTQ1280($2,_bk_l); +JLE ._L_49_loopE; +ALIGN_4; +._L_49_bodyB:; +PREFETCH01280(160*SIZE(_ptr__A_0),0); +BROAD1281(0*SIZE(_ptr__B_0),XMM15); +VLD1281(0*SIZE(_ptr__A_0),XMM14); +VMA1281(XMM14,XMM15,XMM0,XMM0); +PREFETCH21280(168*SIZE(_ptr__A_0),2); +BROAD1281(1*SIZE(_ptr__B_0),XMM15); +VLD1281(1*SIZE(_ptr__A_0),XMM14); +VMA1281(XMM14,XMM15,XMM0,XMM0); +ADDQ1280($2*SIZE,_ptr__A_0); +ADDQ1280($2*SIZE,_ptr__B_0); +._L_49_loopE:; +TESTQ1280($1,_bk_l); +JLE ._L_50_loopE; +ALIGN_4; +._L_50_bodyB:; +PREFETCH01280(160*SIZE(_ptr__A_0),0); +BROAD1281(0*SIZE(_ptr__B_0),XMM15); +VLD1281(0*SIZE(_ptr__A_0),XMM14); +VMA1281(XMM14,XMM15,XMM0,XMM0); +ADDQ1280($1*SIZE,_ptr__A_0); +ADDQ1280($1*SIZE,_ptr__B_0); +._L_50_loopE:; +BROAD1281(alpha,XMM1); +VLDU1281(0*SIZE(_ptr__C_0),XMM2); +VMA21281(XMM1,XMM0,XMM2,XMM0); +VSTU1281(XMM2,0*SIZE(_ptr__C_0)); +ADDQ1280($1*SIZE,_ptr__C_0); +ADDQ1280($1*SIZE,_ptr__C_1); +._L_47_loopE:; +MOVQ1280(LDC,%rax); +ADDQ1280(%rax,_ptr_C); +MOVQ1280(_bk_l,%rax); +SALQ1280($3,%rax); +ADDQ1280(%rax,_ptr_B); +._L_34_loopE:; +vzeroupper +movq 0(%rsp), %rbx; +movq 8(%rsp), %rbp; +movq 16(%rsp), %r12; +movq 24(%rsp), %r13; +movq 32(%rsp), %r14; +movq 40(%rsp), %r15; +addq $STACKSIZE, %rsp; +ret + +EPILOGUE diff --git a/param.h b/param.h index 0c3df6951..c7a9635a6 100644 --- a/param.h +++ b/param.h @@ -330,9 +330,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define XGEMM_DEFAULT_UNROLL_M 1 #else #define SGEMM_DEFAULT_UNROLL_N 2 -#define DGEMM_DEFAULT_UNROLL_N 2 +#define DGEMM_DEFAULT_UNROLL_N 4 #define SGEMM_DEFAULT_UNROLL_M 16 -#define DGEMM_DEFAULT_UNROLL_M 8 +#define DGEMM_DEFAULT_UNROLL_M 6 #define QGEMM_DEFAULT_UNROLL_M 2 #define CGEMM_DEFAULT_UNROLL_M 4 #define ZGEMM_DEFAULT_UNROLL_M 2 @@ -347,10 +347,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if defined(ARCH_X86_64) #define SGEMM_DEFAULT_P 768 -#define DGEMM_DEFAULT_P 384 +#define DGEMM_DEFAULT_P 480 #else #define SGEMM_DEFAULT_P 448 -#define DGEMM_DEFAULT_P 224 +#define DGEMM_DEFAULT_P 480 #endif #define QGEMM_DEFAULT_P 112 #define CGEMM_DEFAULT_P 224 @@ -359,7 +359,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#if defined(ARCH_X86_64) #define SGEMM_DEFAULT_Q 168 -#define DGEMM_DEFAULT_Q 168 +#define DGEMM_DEFAULT_Q 128 #else #define SGEMM_DEFAULT_Q 224 #define DGEMM_DEFAULT_Q 224 From 2840d56aebd7ba028839cdfd8bbc2405181987de Mon Sep 17 00:00:00 2001 From: wernsaar Date: Sat, 19 Oct 2013 09:47:15 +0200 Subject: [PATCH 02/15] added dgemm_kernel for Piledriver --- common.h | 7 + driver/level3/level3.c | 2 +- driver/level3/level3_thread.c | 2 +- kernel/x86_64/KERNEL.PILEDRIVER | 10 +- kernel/x86_64/dgemm_kernel_8x2_piledriver.S | 4599 +++++++++++++++++++ param.h | 10 +- 6 files changed, 4618 insertions(+), 12 deletions(-) create mode 100644 kernel/x86_64/dgemm_kernel_8x2_piledriver.S diff --git a/common.h b/common.h index fa4c1d745..5c97fbec1 100644 --- a/common.h +++ b/common.h @@ -310,6 +310,13 @@ typedef int blasint; #define YIELDING SwitchToThread() #endif +/*************************************************** +Some no-operations are enough +***************************************************/ +#ifdef PILEDRIVER +#define YIELDING __asm__ __volatile__ ("nop;nop;nop;nop;nop;nop;nop;nop;\n"); +#endif + +#ifndef YIELDING #define YIELDING sched_yield() #endif diff --git a/driver/level3/level3.c b/driver/level3/level3.c index 2fe889527..edba5359e 100644 --- a/driver/level3/level3.c +++ b/driver/level3/level3.c @@ -333,7 +333,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, for(jjs = js; jjs < js + min_j; jjs += min_jj){ min_jj = min_j + js - jjs; -#if defined(BULLDOZER) && defined(ARCH_X86_64) && !defined(XDOUBLE) && !defined(COMPLEX) +#if ( defined(BULLDOZER) || defined(PILEDRIVER) ) && defined(ARCH_X86_64) && !defined(XDOUBLE) && !defined(COMPLEX) if (min_jj >= 12*GEMM_UNROLL_N) min_jj = 12*GEMM_UNROLL_N; else if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N; diff --git a/driver/level3/level3_thread.c b/driver/level3/level3_thread.c index 3242790fa..850580504 100644 --- a/driver/level3/level3_thread.c +++ b/driver/level3/level3_thread.c @@ -367,7 +367,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, for(jjs = xxx; jjs < MIN(n_to, xxx + div_n); jjs += min_jj){ min_jj = MIN(n_to, xxx + div_n) - jjs; -#if defined(BULLDOZER) && defined(ARCH_X86_64) && !defined(XDOUBLE) && !defined(COMPLEX) +#if ( defined(BULLDOZER) || defined(PILEDRIVER) ) && defined(ARCH_X86_64) && !defined(XDOUBLE) && !defined(COMPLEX) if (min_jj >= 12*GEMM_UNROLL_N) min_jj = 12*GEMM_UNROLL_N; else if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N; diff --git a/kernel/x86_64/KERNEL.PILEDRIVER b/kernel/x86_64/KERNEL.PILEDRIVER index bbec23ccf..c2ed50ba1 100644 --- a/kernel/x86_64/KERNEL.PILEDRIVER +++ b/kernel/x86_64/KERNEL.PILEDRIVER @@ -17,11 +17,11 @@ SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) -DGEMMKERNEL = dgemm_kernel_6x4_piledriver.S -DGEMMINCOPY = ../generic/gemm_ncopy_6.c -DGEMMITCOPY = ../generic/gemm_tcopy_6.c -DGEMMONCOPY = ../generic/gemm_ncopy_4.c -DGEMMOTCOPY = ../generic/gemm_tcopy_4.c +DGEMMKERNEL = dgemm_kernel_8x2_piledriver.S +DGEMMINCOPY = dgemm_ncopy_8_bulldozer.S +DGEMMITCOPY = dgemm_tcopy_8_bulldozer.S +DGEMMONCOPY = gemm_ncopy_2_bulldozer.S +DGEMMOTCOPY = gemm_tcopy_2_bulldozer.S DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) diff --git a/kernel/x86_64/dgemm_kernel_8x2_piledriver.S b/kernel/x86_64/dgemm_kernel_8x2_piledriver.S new file
mode 100644 index 000000000..c719e96fc --- /dev/null +++ b/kernel/x86_64/dgemm_kernel_8x2_piledriver.S @@ -0,0 +1,4599 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + + +/********************************************************************* +* +* 2013/10/18 Saar +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* +* +* 2013/10/18 Saar +* +* Parameter: +* UNROLL_M 8 +* UNROLL_N 2 +* DGEMM_P 384 +* DGEMM_Q 168 +* DGEMM_R 12288 +* A_PR1 256 +* +* Performance at m x n on AMD 8320 (ACML-Version: 5.3.1): +* +* 6144x6912 84.1 GFLOPS with 8 threads on 4 modules (ACML: 81.4 GFLOPS) +* 6144x6912 81.2 GFLOPS with 4 threads on 4 modules (ACML: 81.3 GFLOPS) +* 6144x6912 40.9 GFLOPS with 2 threads on 2 modules (ACML: 41.8 GFLOPS) +* 6144x6912 20.5 GFLOPS with 1 threads on 1 modules (ACML: 21.0 GFLOPS) +* +* Performance at m x n on AMD 6380 (ACML-Version: 5.3.1): +* +* 12288x13824 244.5 GFLOPS with 32 threads on 16 modules (ACML: 120.3 GFLOPS) !strange thermal behavior +* 12288x13824 233.9 GFLOPS with 16 threads on 16 modules (ACML: 129.5 GFLOPS) !strange thermal behavior +* 12288x13824 138.1 GFLOPS with 8 threads on 8 modules (ACML: 106.5 GFLOPS) +* 6144x6912 73.6 GFLOPS with 4 threads on 4 modules (ACML: 59.4 GFLOPS) +* 6144x6912 36.8 GFLOPS with 2 threads on 2 modules (ACML: 34.9 GFLOPS) +* 6144x6912 18.7 GFLOPS with 1 threads on 1 modules (ACML: 18.7 GFLOPS) +* +*********************************************************************/ + + +#define ASSEMBLER +#include "common.h" + +#define OLD_M %rdi +#define OLD_N %rsi +#define M %r13 +#define J %r14 +#define OLD_K %rdx + +#define A %rcx +#define B %r8 +#define C %r9 +#define LDC %r10 + +#define I %r11 +#define AO %rdi +#define BO %rsi +#define CO1 %r15 +#define K %r12 +#define BI %rbp +#define SP %rbx + +#define BO1 %rdi +#define BO2 %r15 + +#ifndef WINDOWS_ABI + +#define STACKSIZE 96 + +#else + +#define STACKSIZE 256 + +#define OLD_A 40 + STACKSIZE(%rsp) +#define 
OLD_B 48 + STACKSIZE(%rsp) +#define OLD_C 56 + STACKSIZE(%rsp) +#define OLD_LDC 64 + STACKSIZE(%rsp) +#define OLD_OFFSET 72 + STACKSIZE(%rsp) + +#endif + +#define L_BUFFER_SIZE 512*8*4 +#define LB2_OFFSET 512*8*2 + +#define Ndiv6 24(%rsp) +#define Nmod6 32(%rsp) +#define N 40(%rsp) +#define ALPHA 48(%rsp) +#define OFFSET 56(%rsp) +#define KK 64(%rsp) +#define KKK 72(%rsp) +#define BUFFER1 128(%rsp) +#define BUFFER2 LB2_OFFSET+128(%rsp) + +#if defined(OS_WINDOWS) +#if L_BUFFER_SIZE > 16384 +#define STACK_TOUCH \ + movl $0, 4096 * 4(%rsp);\ + movl $0, 4096 * 3(%rsp);\ + movl $0, 4096 * 2(%rsp);\ + movl $0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 12288 +#define STACK_TOUCH \ + movl $0, 4096 * 3(%rsp);\ + movl $0, 4096 * 2(%rsp);\ + movl $0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 8192 +#define STACK_TOUCH \ + movl $0, 4096 * 2(%rsp);\ + movl $0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 4096 +#define STACK_TOUCH \ + movl $0, 4096 * 1(%rsp); +#else +#define STACK_TOUCH +#endif +#else +#define STACK_TOUCH +#endif + +#if defined(BULLDOZER1) + +.macro VFMADD231PD_ y1,y2,y0 + vfmaddpd \y0,\y1,\y2,\y0 +.endm + +.macro VFMADD231SD_ x1,x2,x0 + vfmaddsd \x0,\x1,\x2,\x0 +.endm + +#else + +.macro VFMADD231PD_ y1,y2,y0 + vfmadd231pd \y2,\y1,\y0 +.endm + +.macro VFMADD231SD_ x1,x2,x0 + vfmadd231sd \x2,\x1,\x0 +.endm + +#endif + + + + +#define A_PR1 256 +#define B_PR1 256 +#define C_PR1 256 + +.macro INIT8x3 + vxorpd %xmm4 , %xmm4 , %xmm4 + vxorpd %xmm5 , %xmm5 , %xmm5 + vxorpd %xmm6 , %xmm6 , %xmm6 + vxorpd %xmm7 , %xmm7 , %xmm7 + vxorpd %xmm8 , %xmm8 , %xmm8 + vxorpd %xmm9 , %xmm9 , %xmm9 + vxorpd %xmm10, %xmm10, %xmm10 + vxorpd %xmm11, %xmm11, %xmm11 + vxorpd %xmm12, %xmm12, %xmm12 + vxorpd %xmm13, %xmm13, %xmm13 + vxorpd %xmm14, %xmm14, %xmm14 + vxorpd %xmm15, %xmm15, %xmm15 +.endm + +.macro KERNEL8x3_INIT + vmovddup -12 * SIZE(BO), %xmm1 + vmovups -16 * SIZE(AO), %xmm0 + prefetcht0 A_PR1(AO) + vmulpd %xmm1,%xmm0,%xmm4 + vmovddup -11 * SIZE(BO), %xmm2 + vmulpd %xmm2,%xmm0,%xmm5 + vmovddup -10 * SIZE(BO), %xmm3 + vmulpd %xmm3,%xmm0,%xmm6 + vmovups -14 * SIZE(AO), %xmm0 + vmulpd %xmm1,%xmm0,%xmm7 + vmulpd %xmm2,%xmm0,%xmm8 + vmulpd %xmm3,%xmm0,%xmm9 + vmovups -12 * SIZE(AO), %xmm0 + vmulpd %xmm1,%xmm0,%xmm10 + vmulpd %xmm2,%xmm0,%xmm11 + addq $3*SIZE, BO + vmulpd %xmm3,%xmm0,%xmm12 + vmovups -10 * SIZE(AO), %xmm0 + vmulpd %xmm1,%xmm0,%xmm13 + vmovddup -12 * SIZE(BO), %xmm1 + vmulpd %xmm2,%xmm0,%xmm14 + vmovddup -11 * SIZE(BO), %xmm2 + vmulpd %xmm3,%xmm0,%xmm15 +.endm + + +.macro KERNEL8x3_M1 + vmovups -16 * SIZE(AO), %xmm0 + prefetcht0 A_PR1(AO) + VFMADD231PD_ %xmm1,%xmm0,%xmm4 + VFMADD231PD_ %xmm2,%xmm0,%xmm5 + VFMADD231PD_ %xmm3,%xmm0,%xmm6 + vmovups -14 * SIZE(AO), %xmm0 + VFMADD231PD_ %xmm1,%xmm0,%xmm7 + VFMADD231PD_ %xmm2,%xmm0,%xmm8 + VFMADD231PD_ %xmm3,%xmm0,%xmm9 + vmovups -12 * SIZE(AO), %xmm0 + VFMADD231PD_ %xmm1,%xmm0,%xmm10 + VFMADD231PD_ %xmm2,%xmm0,%xmm11 + VFMADD231PD_ %xmm3,%xmm0,%xmm12 + vmovups -10 * SIZE(AO), %xmm0 + VFMADD231PD_ %xmm1,%xmm0,%xmm13 + vmovddup -12 * SIZE(BO), %xmm1 + VFMADD231PD_ %xmm2,%xmm0,%xmm14 + vmovddup -11 * SIZE(BO), %xmm2 + VFMADD231PD_ %xmm3,%xmm0,%xmm15 +.endm + +.macro KERNEL8x3_M2 + vmovups -8 * SIZE(AO), %xmm0 + prefetcht0 A_PR1+64(AO) + vmovddup -10 * SIZE(BO), %xmm3 + VFMADD231PD_ %xmm1,%xmm0,%xmm4 + VFMADD231PD_ %xmm2,%xmm0,%xmm5 + VFMADD231PD_ %xmm3,%xmm0,%xmm6 + vmovups -6 * SIZE(AO), %xmm0 + VFMADD231PD_ %xmm1,%xmm0,%xmm7 + VFMADD231PD_ %xmm2,%xmm0,%xmm8 + VFMADD231PD_ %xmm3,%xmm0,%xmm9 + vmovups -4 * SIZE(AO), %xmm0 + VFMADD231PD_ %xmm1,%xmm0,%xmm10 + 
VFMADD231PD_ %xmm2,%xmm0,%xmm11 + VFMADD231PD_ %xmm3,%xmm0,%xmm12 + vmovups -2 * SIZE(AO), %xmm0 + VFMADD231PD_ %xmm1,%xmm0,%xmm13 + vmovddup -9 * SIZE(BO), %xmm1 + VFMADD231PD_ %xmm2,%xmm0,%xmm14 + vmovddup -8 * SIZE(BO), %xmm2 + VFMADD231PD_ %xmm3,%xmm0,%xmm15 +.endm + + +.macro KERNEL8x3_M3 + vmovups 0 * SIZE(AO), %xmm0 + prefetcht0 A_PR1+128(AO) + vmovddup -7 * SIZE(BO), %xmm3 + VFMADD231PD_ %xmm1,%xmm0,%xmm4 + VFMADD231PD_ %xmm2,%xmm0,%xmm5 + VFMADD231PD_ %xmm3,%xmm0,%xmm6 + vmovups 2 * SIZE(AO), %xmm0 + VFMADD231PD_ %xmm1,%xmm0,%xmm7 + VFMADD231PD_ %xmm2,%xmm0,%xmm8 + VFMADD231PD_ %xmm3,%xmm0,%xmm9 + vmovups 4 * SIZE(AO), %xmm0 + VFMADD231PD_ %xmm1,%xmm0,%xmm10 + VFMADD231PD_ %xmm2,%xmm0,%xmm11 + VFMADD231PD_ %xmm3,%xmm0,%xmm12 + vmovups 6 * SIZE(AO), %xmm0 + VFMADD231PD_ %xmm1,%xmm0,%xmm13 + vmovddup -6 * SIZE(BO), %xmm1 + VFMADD231PD_ %xmm2,%xmm0,%xmm14 + vmovddup -5 * SIZE(BO), %xmm2 + VFMADD231PD_ %xmm3,%xmm0,%xmm15 +.endm + +.macro KERNEL8x3_M4 + vmovups 8 * SIZE(AO), %xmm0 + prefetcht0 A_PR1+192(AO) + vmovddup -4 * SIZE(BO), %xmm3 + VFMADD231PD_ %xmm1,%xmm0,%xmm4 + VFMADD231PD_ %xmm2,%xmm0,%xmm5 + VFMADD231PD_ %xmm3,%xmm0,%xmm6 + vmovups 10 * SIZE(AO), %xmm0 + VFMADD231PD_ %xmm1,%xmm0,%xmm7 + VFMADD231PD_ %xmm2,%xmm0,%xmm8 + VFMADD231PD_ %xmm3,%xmm0,%xmm9 + vmovups 12 * SIZE(AO), %xmm0 + VFMADD231PD_ %xmm1,%xmm0,%xmm10 + VFMADD231PD_ %xmm2,%xmm0,%xmm11 + VFMADD231PD_ %xmm3,%xmm0,%xmm12 + vmovups 14 * SIZE(AO), %xmm0 + VFMADD231PD_ %xmm1,%xmm0,%xmm13 + vmovddup -3 * SIZE(BO), %xmm1 + addq $32 * SIZE, AO + VFMADD231PD_ %xmm2,%xmm0,%xmm14 + vmovddup -2 * SIZE(BO), %xmm2 + VFMADD231PD_ %xmm3,%xmm0,%xmm15 +.endm + +.macro KERNEL8x3_M5 + vmovups -16 * SIZE(AO), %xmm0 + prefetcht0 A_PR1(AO) + vmovddup -1 * SIZE(BO), %xmm3 + VFMADD231PD_ %xmm1,%xmm0,%xmm4 + VFMADD231PD_ %xmm2,%xmm0,%xmm5 + VFMADD231PD_ %xmm3,%xmm0,%xmm6 + vmovups -14 * SIZE(AO), %xmm0 + VFMADD231PD_ %xmm1,%xmm0,%xmm7 + VFMADD231PD_ %xmm2,%xmm0,%xmm8 + VFMADD231PD_ %xmm3,%xmm0,%xmm9 + vmovups -12 * SIZE(AO), %xmm0 + VFMADD231PD_ %xmm1,%xmm0,%xmm10 + VFMADD231PD_ %xmm2,%xmm0,%xmm11 + VFMADD231PD_ %xmm3,%xmm0,%xmm12 + vmovups -10 * SIZE(AO), %xmm0 + VFMADD231PD_ %xmm1,%xmm0,%xmm13 + vmovddup 0 * SIZE(BO), %xmm1 + VFMADD231PD_ %xmm2,%xmm0,%xmm14 + vmovddup 1 * SIZE(BO), %xmm2 + VFMADD231PD_ %xmm3,%xmm0,%xmm15 +.endm + +.macro KERNEL8x3_M6 + vmovups -8 * SIZE(AO), %xmm0 + prefetcht0 A_PR1+64(AO) + vmovddup 2 * SIZE(BO), %xmm3 + VFMADD231PD_ %xmm1,%xmm0,%xmm4 + VFMADD231PD_ %xmm2,%xmm0,%xmm5 + VFMADD231PD_ %xmm3,%xmm0,%xmm6 + vmovups -6 * SIZE(AO), %xmm0 + VFMADD231PD_ %xmm1,%xmm0,%xmm7 + VFMADD231PD_ %xmm2,%xmm0,%xmm8 + VFMADD231PD_ %xmm3,%xmm0,%xmm9 + vmovups -4 * SIZE(AO), %xmm0 + VFMADD231PD_ %xmm1,%xmm0,%xmm10 + VFMADD231PD_ %xmm2,%xmm0,%xmm11 + VFMADD231PD_ %xmm3,%xmm0,%xmm12 + vmovups -2 * SIZE(AO), %xmm0 + VFMADD231PD_ %xmm1,%xmm0,%xmm13 + vmovddup 3 * SIZE(BO), %xmm1 + VFMADD231PD_ %xmm2,%xmm0,%xmm14 + vmovddup 4 * SIZE(BO), %xmm2 + VFMADD231PD_ %xmm3,%xmm0,%xmm15 +.endm + + +.macro KERNEL8x3_M7 + vmovups 0 * SIZE(AO), %xmm0 + prefetcht0 A_PR1+128(AO) + vmovddup 5 * SIZE(BO), %xmm3 + VFMADD231PD_ %xmm1,%xmm0,%xmm4 + VFMADD231PD_ %xmm2,%xmm0,%xmm5 + VFMADD231PD_ %xmm3,%xmm0,%xmm6 + vmovups 2 * SIZE(AO), %xmm0 + VFMADD231PD_ %xmm1,%xmm0,%xmm7 + VFMADD231PD_ %xmm2,%xmm0,%xmm8 + VFMADD231PD_ %xmm3,%xmm0,%xmm9 + vmovups 4 * SIZE(AO), %xmm0 + VFMADD231PD_ %xmm1,%xmm0,%xmm10 + VFMADD231PD_ %xmm2,%xmm0,%xmm11 + VFMADD231PD_ %xmm3,%xmm0,%xmm12 + vmovups 6 * SIZE(AO), %xmm0 + VFMADD231PD_ %xmm1,%xmm0,%xmm13 + vmovddup 6 * 
SIZE(BO), %xmm1 + VFMADD231PD_ %xmm2,%xmm0,%xmm14 + vmovddup 7 * SIZE(BO), %xmm2 + VFMADD231PD_ %xmm3,%xmm0,%xmm15 +.endm + +.macro KERNEL8x3_M8 + vmovups 8 * SIZE(AO), %xmm0 + prefetcht0 A_PR1+192(AO) + vmovddup 8 * SIZE(BO), %xmm3 + VFMADD231PD_ %xmm1,%xmm0,%xmm4 + VFMADD231PD_ %xmm2,%xmm0,%xmm5 + VFMADD231PD_ %xmm3,%xmm0,%xmm6 + vmovups 10 * SIZE(AO), %xmm0 + VFMADD231PD_ %xmm1,%xmm0,%xmm7 + VFMADD231PD_ %xmm2,%xmm0,%xmm8 + VFMADD231PD_ %xmm3,%xmm0,%xmm9 + vmovups 12 * SIZE(AO), %xmm0 + VFMADD231PD_ %xmm1,%xmm0,%xmm10 + VFMADD231PD_ %xmm2,%xmm0,%xmm11 + VFMADD231PD_ %xmm3,%xmm0,%xmm12 + vmovups 14 * SIZE(AO), %xmm0 + VFMADD231PD_ %xmm1,%xmm0,%xmm13 + vmovddup 9 * SIZE(BO), %xmm1 + VFMADD231PD_ %xmm2,%xmm0,%xmm14 + vmovddup 10 * SIZE(BO), %xmm2 + VFMADD231PD_ %xmm3,%xmm0,%xmm15 + vmovddup 11 * SIZE(BO), %xmm3 + addq $32 * SIZE, AO + addq $24 * SIZE, BO +.endm + + +.macro KERNEL8x3_E + vmovups 8 * SIZE(AO), %xmm0 + prefetcht0 A_PR1+192(AO) + vmovddup 8 * SIZE(BO), %xmm3 + VFMADD231PD_ %xmm1,%xmm0,%xmm4 + VFMADD231PD_ %xmm2,%xmm0,%xmm5 + VFMADD231PD_ %xmm3,%xmm0,%xmm6 + vmovups 10 * SIZE(AO), %xmm0 + VFMADD231PD_ %xmm1,%xmm0,%xmm7 + VFMADD231PD_ %xmm2,%xmm0,%xmm8 + VFMADD231PD_ %xmm3,%xmm0,%xmm9 + vmovups 12 * SIZE(AO), %xmm0 + VFMADD231PD_ %xmm1,%xmm0,%xmm10 + VFMADD231PD_ %xmm2,%xmm0,%xmm11 + VFMADD231PD_ %xmm3,%xmm0,%xmm12 + vmovups 14 * SIZE(AO), %xmm0 + VFMADD231PD_ %xmm1,%xmm0,%xmm13 + addq $32*SIZE, AO + VFMADD231PD_ %xmm2,%xmm0,%xmm14 + addq $21*SIZE, BO + VFMADD231PD_ %xmm3,%xmm0,%xmm15 +.endm + +.macro KERNEL8x3_SUBN + vmovddup -12 * SIZE(BO), %xmm1 + vmovups -16 * SIZE(AO), %xmm0 + VFMADD231PD_ %xmm1,%xmm0,%xmm4 + vmovddup -11 * SIZE(BO), %xmm2 + VFMADD231PD_ %xmm2,%xmm0,%xmm5 + vmovddup -10 * SIZE(BO), %xmm3 + VFMADD231PD_ %xmm3,%xmm0,%xmm6 + vmovups -14 * SIZE(AO), %xmm0 + VFMADD231PD_ %xmm1,%xmm0,%xmm7 + VFMADD231PD_ %xmm2,%xmm0,%xmm8 + VFMADD231PD_ %xmm3,%xmm0,%xmm9 + vmovups -12 * SIZE(AO), %xmm0 + VFMADD231PD_ %xmm1,%xmm0,%xmm10 + VFMADD231PD_ %xmm2,%xmm0,%xmm11 + VFMADD231PD_ %xmm3,%xmm0,%xmm12 + vmovups -10 * SIZE(AO), %xmm0 + VFMADD231PD_ %xmm1,%xmm0,%xmm13 + addq $3*SIZE, BO + VFMADD231PD_ %xmm2,%xmm0,%xmm14 + addq $8*SIZE, AO + VFMADD231PD_ %xmm3,%xmm0,%xmm15 +.endm + +.macro SAVE8x3 + vmovddup ALPHA, %xmm0 + + prefetcht0 C_PR1(CO1) + prefetcht0 C_PR1(CO1,LDC) + prefetcht0 C_PR1(CO1,LDC,2) + + vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 + vfmaddpd (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddpd (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 + + vfmaddpd 2 * SIZE(CO1),%xmm0, %xmm7,%xmm7 + vfmaddpd 4 * SIZE(CO1),%xmm0, %xmm10,%xmm10 + vfmaddpd 6 * SIZE(CO1),%xmm0, %xmm13,%xmm13 + + vfmaddpd 2 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 + vfmaddpd 4 * SIZE(CO1, LDC),%xmm0, %xmm11,%xmm11 + vfmaddpd 6 * SIZE(CO1, LDC),%xmm0, %xmm14,%xmm14 + + vfmaddpd 2 * SIZE(CO1, LDC, 2),%xmm0, %xmm9,%xmm9 + vfmaddpd 4 * SIZE(CO1, LDC, 2),%xmm0, %xmm12,%xmm12 + vfmaddpd 6 * SIZE(CO1, LDC, 2),%xmm0, %xmm15,%xmm15 + + vmovups %xmm4 , (CO1) + vmovups %xmm7 , 2 * SIZE(CO1) + vmovups %xmm10, 4 * SIZE(CO1) + vmovups %xmm13, 6 * SIZE(CO1) + + vmovups %xmm5 , (CO1, LDC) + vmovups %xmm8 , 2 * SIZE(CO1, LDC) + vmovups %xmm11, 4 * SIZE(CO1, LDC) + vmovups %xmm14, 6 * SIZE(CO1, LDC) + + vmovups %xmm6 , (CO1, LDC, 2) + vmovups %xmm9 , 2 * SIZE(CO1, LDC, 2) + vmovups %xmm12, 4 * SIZE(CO1, LDC, 2) + vmovups %xmm15, 6 * SIZE(CO1, LDC, 2) + + addq $8 * SIZE, CO1 # coffset += 8 +.endm + + +/*******************************************************************************************/ + +#define KERNEL4x3_1(xx) \ + vmovddup -6 * SIZE(BO, BI, 8), 
%xmm1 ;\ + vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovddup -5 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovddup -4 * SIZE(BO, BI, 8), %xmm3 ;\ + vfmaddpd %xmm6,%xmm3,%xmm0,%xmm6 ;\ + vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ + vfmaddpd %xmm9,%xmm3,%xmm0,%xmm9 ;\ + +#define KERNEL4x3_2(xx) \ + vmovddup -3 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -12 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovddup -2 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovddup -1 * SIZE(BO, BI, 8), %xmm3 ;\ + vfmaddpd %xmm6,%xmm3,%xmm0,%xmm6 ;\ + vmovups -10 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ + vfmaddpd %xmm9,%xmm3,%xmm0,%xmm9 ;\ + +#define KERNEL4x3_3(xx) \ + vmovddup 0 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -8 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovddup 1 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovddup 2 * SIZE(BO, BI, 8), %xmm3 ;\ + vfmaddpd %xmm6,%xmm3,%xmm0,%xmm6 ;\ + vmovups -6 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ + vfmaddpd %xmm9,%xmm3,%xmm0,%xmm9 ;\ + +#define KERNEL4x3_4(xx) \ + vmovddup 3 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -4 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovddup 4 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovddup 5 * SIZE(BO, BI, 8), %xmm3 ;\ + vfmaddpd %xmm6,%xmm3,%xmm0,%xmm6 ;\ + vmovups -2 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ + vfmaddpd %xmm9,%xmm3,%xmm0,%xmm9 ;\ + addq $12, BI ;\ + addq $16, %rax ;\ + +#define KERNEL4x3_SUB(xx) \ + vmovddup -6 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovddup -5 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovddup -4 * SIZE(BO, BI, 8), %xmm3 ;\ + vfmaddpd %xmm6,%xmm3,%xmm0,%xmm6 ;\ + vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ + vfmaddpd %xmm9,%xmm3,%xmm0,%xmm9 ;\ + + + + + +/*******************************************************************************************/ + +#define KERNEL2x3_1(xx) \ + vmovddup -6 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovddup -5 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovddup -4 * SIZE(BO, BI, 8), %xmm3 ;\ + vfmaddpd %xmm6,%xmm3,%xmm0,%xmm6 ;\ + +#define KERNEL2x3_2(xx) \ + vmovddup -3 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovddup -2 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovddup -1 * SIZE(BO, BI, 8), %xmm3 ;\ + vfmaddpd %xmm6,%xmm3,%xmm0,%xmm6 ;\ + +#define KERNEL2x3_3(xx) \ + vmovddup 0 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -12 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovddup 1 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovddup 2 * SIZE(BO, BI, 8), %xmm3 ;\ + vfmaddpd %xmm6,%xmm3,%xmm0,%xmm6 ;\ + +#define KERNEL2x3_4(xx) \ + vmovddup 3 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -10 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovddup 4 * SIZE(BO, BI, 8), %xmm2 
;\ + vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovddup 5 * SIZE(BO, BI, 8), %xmm3 ;\ + vfmaddpd %xmm6,%xmm3,%xmm0,%xmm6 ;\ + addq $12, BI ;\ + addq $8, %rax ;\ + +#define KERNEL2x3_SUB(xx) \ + vmovddup -6 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovddup -5 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovddup -4 * SIZE(BO, BI, 8), %xmm3 ;\ + vfmaddpd %xmm6,%xmm3,%xmm0,%xmm6 ;\ + +/*******************************************************************************************/ + +#define KERNEL1x3_1(xx) \ + vmovsd -6 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovsd -16 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovsd -5 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddsd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovsd -4 * SIZE(BO, BI, 8), %xmm3 ;\ + vfmaddsd %xmm6,%xmm3,%xmm0,%xmm6 ;\ + +#define KERNEL1x3_2(xx) \ + vmovsd -3 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovsd -15 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovsd -2 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddsd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovsd -1 * SIZE(BO, BI, 8), %xmm3 ;\ + vfmaddsd %xmm6,%xmm3,%xmm0,%xmm6 ;\ + +#define KERNEL1x3_3(xx) \ + vmovsd 0 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovsd -14 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovsd 1 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddsd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovsd 2 * SIZE(BO, BI, 8), %xmm3 ;\ + vfmaddsd %xmm6,%xmm3,%xmm0,%xmm6 ;\ + +#define KERNEL1x3_4(xx) \ + vmovsd 3 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovsd -13 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovsd 4 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddsd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovsd 5 * SIZE(BO, BI, 8), %xmm3 ;\ + vfmaddsd %xmm6,%xmm3,%xmm0,%xmm6 ;\ + addq $12, BI ;\ + addq $4, %rax ;\ + +#define KERNEL1x3_SUB(xx) \ + vmovsd -6 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovsd -16 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovsd -5 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddsd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovsd -4 * SIZE(BO, BI, 8), %xmm3 ;\ + vfmaddsd %xmm6,%xmm3,%xmm0,%xmm6 ;\ + + + +/******************************************************************************************* +* 2 lines of N +*******************************************************************************************/ + +#define KERNEL8x2_1(xx) \ + prefetcht0 A_PR1(AO,%rax,8) ;\ + vmovddup -4 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovddup -3 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ + vmovups -12 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vfmaddpd %xmm11,%xmm2,%xmm0,%xmm11 ;\ + vmovups -10 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm13,%xmm1,%xmm0,%xmm13 ;\ + vfmaddpd %xmm14,%xmm2,%xmm0,%xmm14 ;\ + +#define KERNEL8x2_2(xx) \ + prefetcht0 A_PR1+64(AO,%rax,8) ;\ + vmovddup -2 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -8 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovddup -1 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovups -6 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ + vmovups -4 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vfmaddpd %xmm11,%xmm2,%xmm0,%xmm11 ;\ + vmovups -2 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd 
%xmm13,%xmm1,%xmm0,%xmm13 ;\ + vfmaddpd %xmm14,%xmm2,%xmm0,%xmm14 ;\ + +#define KERNEL8x2_3(xx) \ + prefetcht0 A_PR1+128(AO,%rax,8) ;\ + vmovddup 0 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups 0 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovddup 1 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovups 2 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ + vmovups 4 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vfmaddpd %xmm11,%xmm2,%xmm0,%xmm11 ;\ + vmovups 6 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm13,%xmm1,%xmm0,%xmm13 ;\ + vfmaddpd %xmm14,%xmm2,%xmm0,%xmm14 ;\ + +#define KERNEL8x2_4(xx) \ + prefetcht0 A_PR1+192(AO,%rax,8) ;\ + vmovddup 2 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups 8 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovddup 3 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovups 10 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ + vmovups 12 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vfmaddpd %xmm11,%xmm2,%xmm0,%xmm11 ;\ + vmovups 14 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm13,%xmm1,%xmm0,%xmm13 ;\ + vfmaddpd %xmm14,%xmm2,%xmm0,%xmm14 ;\ + addq $8, BI ;\ + addq $32, %rax ;\ + +#define KERNEL8x2_SUB(xx) \ + vmovddup -4 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovddup -3 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ + vmovups -12 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vfmaddpd %xmm11,%xmm2,%xmm0,%xmm11 ;\ + vmovups -10 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm13,%xmm1,%xmm0,%xmm13 ;\ + vfmaddpd %xmm14,%xmm2,%xmm0,%xmm14 ;\ + + +/*******************************************************************************************/ + +#define KERNEL4x2_1(xx) \ + vmovddup -4 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovddup -3 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ + +#define KERNEL4x2_2(xx) \ + vmovddup -2 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -12 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovddup -1 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovups -10 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ + +#define KERNEL4x2_3(xx) \ + vmovddup 0 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -8 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovddup 1 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovups -6 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ + +#define KERNEL4x2_4(xx) \ + vmovddup 2 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -4 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovddup 3 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovups -2 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ + addq $8, BI ;\ + addq $16, %rax ;\ + +#define KERNEL4x2_SUB(xx) \ + vmovddup -4 * 
SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovddup -3 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ + + +/*******************************************************************************************/ + +#define KERNEL2x2_1(xx) \ + vmovddup -4 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovddup -3 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + +#define KERNEL2x2_2(xx) \ + vmovddup -2 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovddup -1 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + +#define KERNEL2x2_3(xx) \ + vmovddup 0 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -12 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovddup 1 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + +#define KERNEL2x2_4(xx) \ + vmovddup 2 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -10 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovddup 3 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + addq $8, BI ;\ + addq $8, %rax ;\ + +#define KERNEL2x2_SUB(xx) \ + vmovddup -4 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovddup -3 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + +/*******************************************************************************************/ + +#define KERNEL1x2_1(xx) \ + vmovsd -4 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovsd -16 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovsd -3 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddsd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + +#define KERNEL1x2_2(xx) \ + vmovsd -2 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovsd -15 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovsd -1 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddsd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + +#define KERNEL1x2_3(xx) \ + vmovsd 0 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovsd -14 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovsd 1 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddsd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + +#define KERNEL1x2_4(xx) \ + vmovsd 2 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovsd -13 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovsd 3 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddsd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + addq $8, BI ;\ + addq $4, %rax ;\ + +#define KERNEL1x2_SUB(xx) \ + vmovsd -4 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovsd -16 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovsd -3 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddsd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + + + +/******************************************************************************************* +* 1 line of N +*******************************************************************************************/ + +#define KERNEL8x1_1(xx) \ + prefetcht0 A_PR1(AO,%rax,8) ;\ + vmovddup -2 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vmovups -12 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vmovups -10 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm13,%xmm1,%xmm0,%xmm13 ;\ + +#define KERNEL8x1_2(xx) \ + 
prefetcht0 A_PR1+64(AO,%rax,8) ;\ + vmovddup -1 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -8 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovups -6 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vmovups -4 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vmovups -2 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm13,%xmm1,%xmm0,%xmm13 ;\ + +#define KERNEL8x1_3(xx) \ + prefetcht0 A_PR1+128(AO,%rax,8) ;\ + vmovddup 0 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups 0 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovups 2 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vmovups 4 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vmovups 6 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm13,%xmm1,%xmm0,%xmm13 ;\ + +#define KERNEL8x1_4(xx) \ + prefetcht0 A_PR1+192(AO,%rax,8) ;\ + vmovddup 1 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups 8 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovups 10 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vmovups 12 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vmovups 14 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm13,%xmm1,%xmm0,%xmm13 ;\ + addq $4, BI ;\ + addq $32, %rax ;\ + +#define KERNEL8x1_SUB(xx) \ + vmovddup -2 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vmovups -12 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vmovups -10 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm13,%xmm1,%xmm0,%xmm13 ;\ + + +/*******************************************************************************************/ + +#define KERNEL4x1_1(xx) \ + vmovddup -2 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ + +#define KERNEL4x1_2(xx) \ + vmovddup -1 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -12 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovups -10 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ + +#define KERNEL4x1_3(xx) \ + vmovddup 0 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -8 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovups -6 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ + +#define KERNEL4x1_4(xx) \ + vmovddup 1 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -4 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovups -2 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ + addq $4, BI ;\ + addq $16, %rax ;\ + +#define KERNEL4x1_SUB(xx) \ + vmovddup -2 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ + + +/*******************************************************************************************/ + +#define KERNEL2x1_1(xx) \ + vmovddup -2 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + +#define KERNEL2x1_2(xx) \ + vmovddup -1 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + +#define KERNEL2x1_3(xx) \ + vmovddup 0 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -12 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd 
%xmm4,%xmm1,%xmm0,%xmm4 ;\ + +#define KERNEL2x1_4(xx) \ + vmovddup 1 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -10 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + addq $4, BI ;\ + addq $8, %rax ;\ + +#define KERNEL2x1_SUB(xx) \ + vmovddup -2 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + +/*******************************************************************************************/ + +#define KERNEL1x1_1(xx) \ + vmovsd -2 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovsd -16 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + +#define KERNEL1x1_2(xx) \ + vmovsd -1 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovsd -15 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + +#define KERNEL1x1_3(xx) \ + vmovsd 0 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovsd -14 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + +#define KERNEL1x1_4(xx) \ + vmovsd 1 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovsd -13 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + addq $4, BI ;\ + addq $4, %rax ;\ + +#define KERNEL1x1_SUB(xx) \ + vmovsd -2 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovsd -16 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + + +/*******************************************************************************************/ + +#if !defined(TRMMKERNEL) + + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + movq %rbx, (%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + + vzeroupper + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, OLD_K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC + + vmovaps %xmm3, %xmm0 + +#else + movq STACKSIZE + 8(%rsp), LDC + +#endif + + movq %rsp, SP # save old stack + subq $128 + L_BUFFER_SIZE, %rsp + andq $-4096, %rsp # align stack + + STACK_TOUCH + + cmpq $0, OLD_M + je .L999 + + cmpq $0, OLD_N + je .L999 + + cmpq $0, OLD_K + je .L999 + + movq OLD_M, M + movq OLD_N, N + movq OLD_K, K + + vmovsd %xmm0, ALPHA + + salq $BASE_SHIFT, LDC + + movq N, %rax + xorq %rdx, %rdx + movq $6, %rdi + divq %rdi // N / 6 + movq %rax, Ndiv6 // N / 6 + movq %rdx, Nmod6 // N % 6 + + movq Ndiv6, J + cmpq $0, J + je .L2_0 + ALIGN_4 + +.L6_01: + // copy to sub buffer + movq K, %rax + salq $1,%rax // K * 2 + movq B, BO1 + leaq (B,%rax,8), BO2 // next offset to BO2 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + sarq $2, %rax // K / 4 + jz .L6_02a + ALIGN_4 + +.L6_02: + prefetcht0 B_PR1(BO1) + prefetcht0 B_PR1(BO2) + prefetchw B_PR1(BO) + vmovups (BO1), %xmm0 + vmovups 2*SIZE(BO1), %xmm2 + vmovups 4*SIZE(BO1), %xmm4 + vmovups 6*SIZE(BO1), %xmm6 + vmovsd (BO2), %xmm1 + vmovsd 2*SIZE(BO2), %xmm3 + vmovsd 4*SIZE(BO2), %xmm5 + vmovsd 6*SIZE(BO2), %xmm7 + vmovups %xmm0, (BO) + vmovsd %xmm1, 2*SIZE(BO) + vmovups %xmm2, 3*SIZE(BO) + vmovsd %xmm3, 5*SIZE(BO) + vmovups %xmm4, 6*SIZE(BO) + vmovsd %xmm5, 8*SIZE(BO) + vmovups %xmm6, 9*SIZE(BO) + vmovsd %xmm7,11*SIZE(BO) + addq $8*SIZE,BO1 + addq $8*SIZE,BO2 + addq $12*SIZE,BO + decq %rax + jnz .L6_02 + +.L6_02a: + + movq K, %rax + andq $3, %rax // K % 4 + jz .L6_02c + ALIGN_4 + +.L6_02b: + + vmovups (BO1), %xmm0 + vmovsd 
(BO2), %xmm1
+	vmovups %xmm0, (BO)
+	vmovsd %xmm1, 2*SIZE(BO)
+	addq $2*SIZE,BO1
+	addq $2*SIZE,BO2
+	addq $3*SIZE,BO
+	decq %rax
+	jnz .L6_02b
+
+.L6_02c:
+
+	movq K, %rax
+	salq $1,%rax // K * 2
+	leaq (B,%rax,8), BO1 // next offset to BO1
+	leaq (BO1,%rax,8), BO2 // next offset to BO2
+	leaq BUFFER2, BO // second buffer to BO
+	movq K, %rax
+	sarq $2, %rax // K / 4
+	jz .L6_03a
+	ALIGN_4
+
+
+.L6_03:
+
+	prefetcht0 B_PR1(BO2)
+	prefetchw B_PR1(BO)
+	vmovups (BO2), %xmm0
+	vmovups 2*SIZE(BO2), %xmm2
+	vmovups 4*SIZE(BO2), %xmm4
+	vmovups 6*SIZE(BO2), %xmm6
+	vmovsd 1*SIZE(BO1), %xmm1
+	vmovsd 3*SIZE(BO1), %xmm3
+	vmovsd 5*SIZE(BO1), %xmm5
+	vmovsd 7*SIZE(BO1), %xmm7
+	vmovsd %xmm1, 0*SIZE(BO)
+	vmovups %xmm0, 1*SIZE(BO)
+	vmovsd %xmm3, 3*SIZE(BO)
+	vmovups %xmm2, 4*SIZE(BO)
+	vmovsd %xmm5, 6*SIZE(BO)
+	vmovups %xmm4, 7*SIZE(BO)
+	vmovsd %xmm7, 9*SIZE(BO)
+	vmovups %xmm6,10*SIZE(BO)
+	addq $8*SIZE,BO1
+	addq $8*SIZE,BO2
+	addq $12*SIZE,BO
+	decq %rax
+	jnz .L6_03
+
+.L6_03a:
+
+	movq K, %rax
+	andq $3, %rax // K % 4
+	jz .L6_03c
+	ALIGN_4
+
+
+.L6_03b:
+
+	vmovsd 1*SIZE(BO1), %xmm0
+	vmovups (BO2), %xmm1
+	vmovsd %xmm0, (BO)
+	vmovups %xmm1, 1*SIZE(BO)
+	addq $2*SIZE,BO1
+	addq $2*SIZE,BO2
+	addq $3*SIZE,BO
+	decq %rax
+	jnz .L6_03b
+
+
+.L6_03c:
+
+	movq BO2, B // next offset of B
+
+.L6_10:
+	movq C, CO1
+	leaq (C, LDC, 2), C
+	leaq (C, LDC, 1), C // c += 3 * ldc
+
+
+	movq A, AO // aoffset = a
+	addq $16 * SIZE, AO
+
+	movq M, I
+	sarq $3, I // i = (m >> 3)
+	je .L6_20
+
+	ALIGN_4
+
+.L6_11:
+
+	leaq BUFFER1, BO // first buffer to BO
+	addq $12 * SIZE, BO
+	movq K, %rax
+	sarq $3, %rax // K / 8
+	cmpq $3, %rax
+	jl .L6_13
+
+	KERNEL8x3_INIT
+	KERNEL8x3_M2
+	KERNEL8x3_M3
+	KERNEL8x3_M4
+	KERNEL8x3_M5
+	KERNEL8x3_M6
+	KERNEL8x3_M7
+	KERNEL8x3_M8
+
+	subq $2, %rax
+
+	ALIGN_5
+
+.L6_12:
+
+	KERNEL8x3_M1
+	KERNEL8x3_M2
+	KERNEL8x3_M3
+	KERNEL8x3_M4
+	KERNEL8x3_M5
+	KERNEL8x3_M6
+	KERNEL8x3_M7
+	KERNEL8x3_M8
+
+	dec %rax
+	//je .L6_12_E
+
+	jne .L6_12
+
+.L6_12_E:
+
+	KERNEL8x3_M1
+	KERNEL8x3_M2
+	KERNEL8x3_M3
+	KERNEL8x3_M4
+	KERNEL8x3_M5
+	KERNEL8x3_M6
+	KERNEL8x3_M7
+	KERNEL8x3_E
+
+	jmp .L6_16
+
+.L6_13:
+
+	test $2, %rax
+	jz .L6_14
+
+	KERNEL8x3_INIT
+	KERNEL8x3_M2
+	KERNEL8x3_M3
+	KERNEL8x3_M4
+	KERNEL8x3_M5
+	KERNEL8x3_M6
+	KERNEL8x3_M7
+	KERNEL8x3_M8
+
+	KERNEL8x3_M1
+	KERNEL8x3_M2
+	KERNEL8x3_M3
+	KERNEL8x3_M4
+	KERNEL8x3_M5
+	KERNEL8x3_M6
+	KERNEL8x3_M7
+	KERNEL8x3_E
+
+	jmp .L6_16
+
+
+.L6_14:
+
+	test $1, %rax
+	jz .L6_15
+
+	KERNEL8x3_INIT
+	KERNEL8x3_M2
+	KERNEL8x3_M3
+	KERNEL8x3_M4
+	KERNEL8x3_M5
+	KERNEL8x3_M6
+	KERNEL8x3_M7
+	KERNEL8x3_E
+
+
+	jmp .L6_16
+
+.L6_15:
+
+	INIT8x3
+
+.L6_16:
+	movq K, %rax
+
+	andq $7, %rax # if (k & 1)
+	je .L6_19
+
+	ALIGN_4
+
+.L6_17:
+
+	KERNEL8x3_SUBN
+	dec %rax
+	jne .L6_17
+	ALIGN_4
+
+
+.L6_19:
+
+	SAVE8x3
+
+	decq I # i --
+	jg .L6_11
+
+/**************************************************************************
+* Rest of M
+***************************************************************************/
+.L6_20:
+	// Test rest of M
+
+	testq $7, M
+	jz .L7_10 // to next 3 lines of N
+
+	testq $4, M
+	jz .L6_30
+
+	ALIGN_4
+
+.L6_21:
+	leaq BUFFER1, BO // first buffer to BO
+	addq $6 * SIZE, BO
+
+	vzeroall
+
+	movq K, %rax
+
+	andq $-8, %rax
+	je .L6_26
+	movq %rax, BI // Index for BO
+	leaq (BI,BI,2), BI // BI = BI * 3 ; number of values
+
+	salq $2, %rax // rax = rax * 4 ; number of values
+	leaq (AO, %rax, 8), AO
+	leaq (BO, BI, 8), BO
+	negq BI
+	negq %rax
+	ALIGN_4
+
+.L6_22:
+
+	//prefetcht0 B_PR1(BO,BI,8)
+	KERNEL4x3_1(xxx)
KERNEL4x3_2(xxx) + KERNEL4x3_3(xxx) + //prefetcht0 B_PR1+64(BO,BI,8) + KERNEL4x3_4(xxx) + + KERNEL4x3_1(xxx) + KERNEL4x3_2(xxx) + //prefetcht0 B_PR1+32(BO,BI,8) + KERNEL4x3_3(xxx) + KERNEL4x3_4(xxx) + + je .L6_26 + + //prefetcht0 B_PR1(BO,BI,8) + KERNEL4x3_1(xxx) + KERNEL4x3_2(xxx) + KERNEL4x3_3(xxx) + //prefetcht0 B_PR1+64(BO,BI,8) + KERNEL4x3_4(xxx) + + KERNEL4x3_1(xxx) + KERNEL4x3_2(xxx) + //prefetcht0 B_PR1+32(BO,BI,8) + KERNEL4x3_3(xxx) + KERNEL4x3_4(xxx) + + je .L6_26 + + jmp .L6_22 + ALIGN_4 + +.L6_26: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L6_29 + + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L6_27: + + KERNEL4x3_SUB(xxx) + addq $3, BI + addq $4, %rax + jl .L6_27 + ALIGN_4 + + +.L6_29: + + vmovddup ALPHA, %xmm0 + + + vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 + vfmaddpd 2 * SIZE(CO1),%xmm0, %xmm7,%xmm7 + + vfmaddpd (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddpd 2 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 + + vfmaddpd (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 + vfmaddpd 2 * SIZE(CO1, LDC, 2),%xmm0, %xmm9,%xmm9 + + + vmovups %xmm4 , (CO1) + vmovups %xmm7 , 2 * SIZE(CO1) + + vmovups %xmm5 , (CO1, LDC) + vmovups %xmm8 , 2 * SIZE(CO1, LDC) + + vmovups %xmm6 , (CO1, LDC, 2) + vmovups %xmm9 , 2 * SIZE(CO1, LDC, 2) + + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + +.L6_30: + testq $2, M + jz .L6_40 + + ALIGN_4 + +.L6_31: + leaq BUFFER1, BO // first buffer to BO + addq $6 * SIZE, BO + + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L6_36 + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L6_32: + + //prefetcht0 B_PR1(BO,BI,8) + KERNEL2x3_1(xxx) + KERNEL2x3_2(xxx) + KERNEL2x3_3(xxx) + //prefetcht0 B_PR1+64(BO,BI,8) + KERNEL2x3_4(xxx) + + KERNEL2x3_1(xxx) + KERNEL2x3_2(xxx) + //prefetcht0 B_PR1+32(BO,BI,8) + KERNEL2x3_3(xxx) + KERNEL2x3_4(xxx) + + je .L6_36 + + //prefetcht0 B_PR1(BO,BI,8) + KERNEL2x3_1(xxx) + KERNEL2x3_2(xxx) + KERNEL2x3_3(xxx) + //prefetcht0 B_PR1+64(BO,BI,8) + KERNEL2x3_4(xxx) + + KERNEL2x3_1(xxx) + KERNEL2x3_2(xxx) + //prefetcht0 B_PR1+32(BO,BI,8) + KERNEL2x3_3(xxx) + KERNEL2x3_4(xxx) + + je .L6_36 + + jmp .L6_32 + ALIGN_4 + +.L6_36: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L6_39 + + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L6_37: + + KERNEL2x3_SUB(xxx) + addq $3, BI + addq $2, %rax + jl .L6_37 + ALIGN_4 + + +.L6_39: + + vmovddup ALPHA, %xmm0 + + + vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 + vfmaddpd (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddpd (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 + + + vmovups %xmm4 , (CO1) + vmovups %xmm5 , (CO1, LDC) + vmovups %xmm6 , (CO1, LDC, 2) + + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + +.L6_40: + testq $1, M + jz .L7_10 // to next 3 lines of N + + ALIGN_4 + +.L6_41: + leaq BUFFER1, BO // first buffer to BO + addq $6 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L6_46 + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L6_42: + + //prefetcht0 B_PR1(BO,BI,8) + KERNEL1x3_1(xxx) + 
KERNEL1x3_2(xxx)
+	KERNEL1x3_3(xxx)
+	//prefetcht0 B_PR1+64(BO,BI,8)
+	KERNEL1x3_4(xxx)
+
+	KERNEL1x3_1(xxx)
+	KERNEL1x3_2(xxx)
+	//prefetcht0 B_PR1+32(BO,BI,8)
+	KERNEL1x3_3(xxx)
+	KERNEL1x3_4(xxx)
+
+	je .L6_46
+
+	//prefetcht0 B_PR1(BO,BI,8)
+	KERNEL1x3_1(xxx)
+	KERNEL1x3_2(xxx)
+	KERNEL1x3_3(xxx)
+	//prefetcht0 B_PR1+64(BO,BI,8)
+	KERNEL1x3_4(xxx)
+
+	KERNEL1x3_1(xxx)
+	KERNEL1x3_2(xxx)
+	//prefetcht0 B_PR1+32(BO,BI,8)
+	KERNEL1x3_3(xxx)
+	KERNEL1x3_4(xxx)
+
+	je .L6_46
+
+	jmp .L6_42
+	ALIGN_4
+
+.L6_46:
+	movq K, %rax
+
+	andq $7, %rax # if (k & 1)
+	je .L6_49
+
+	movq %rax, BI // Index for BO
+	leaq (BI,BI,2), BI // BI = BI * 3 ; number of values
+
+	leaq (AO, %rax, 8), AO
+	leaq (BO, BI, 8), BO
+	negq BI
+	negq %rax
+	ALIGN_4
+
+.L6_47:
+
+	KERNEL1x3_SUB(xxx)
+	addq $3, BI
+	addq $1, %rax
+	jl .L6_47
+	ALIGN_4
+
+
+.L6_49:
+
+	vmovddup ALPHA, %xmm0
+
+
+	vfmaddsd (CO1),%xmm0, %xmm4,%xmm4
+	vfmaddsd (CO1, LDC),%xmm0, %xmm5,%xmm5
+	vfmaddsd (CO1, LDC, 2),%xmm0, %xmm6,%xmm6
+
+	vmovsd %xmm4 , (CO1)
+	vmovsd %xmm5 , (CO1, LDC)
+	vmovsd %xmm6 , (CO1, LDC, 2)
+
+
+	addq $1 * SIZE, CO1 # coffset += 1
+	ALIGN_4
+
+
+
+
+/***************************************************************************************************************/
+
+.L7_10:
+	movq C, CO1
+	leaq (C, LDC, 2), C
+	leaq (C, LDC, 1), C // c += 3 * ldc
+
+
+	movq A, AO // aoffset = a
+	addq $16 * SIZE, AO
+
+	movq M, I
+	sarq $3, I // i = (m >> 3)
+	je .L7_20
+	ALIGN_4
+
+.L7_11:
+
+	leaq BUFFER2, BO // second buffer to BO
+	addq $12 * SIZE, BO
+	movq K, %rax
+	sarq $3, %rax // K / 8
+	cmpq $3, %rax
+	jl .L7_13
+
+	KERNEL8x3_INIT
+	KERNEL8x3_M2
+	KERNEL8x3_M3
+	KERNEL8x3_M4
+	KERNEL8x3_M5
+	KERNEL8x3_M6
+	KERNEL8x3_M7
+	KERNEL8x3_M8
+
+	subq $2, %rax
+
+	ALIGN_5
+
+.L7_12:
+
+	KERNEL8x3_M1
+	KERNEL8x3_M2
+	KERNEL8x3_M3
+	KERNEL8x3_M4
+	KERNEL8x3_M5
+	KERNEL8x3_M6
+	KERNEL8x3_M7
+	KERNEL8x3_M8
+
+	dec %rax
+	//je .L7_12_E
+
+	jne .L7_12
+
+.L7_12_E:
+
+	KERNEL8x3_M1
+	KERNEL8x3_M2
+	KERNEL8x3_M3
+	KERNEL8x3_M4
+	KERNEL8x3_M5
+	KERNEL8x3_M6
+	KERNEL8x3_M7
+	KERNEL8x3_E
+
+	jmp .L7_16
+
+
+
+.L7_13:
+
+	test $2, %rax
+	jz .L7_14
+
+	KERNEL8x3_INIT
+	KERNEL8x3_M2
+	KERNEL8x3_M3
+	KERNEL8x3_M4
+	KERNEL8x3_M5
+	KERNEL8x3_M6
+	KERNEL8x3_M7
+	KERNEL8x3_M8
+
+	KERNEL8x3_M1
+	KERNEL8x3_M2
+	KERNEL8x3_M3
+	KERNEL8x3_M4
+	KERNEL8x3_M5
+	KERNEL8x3_M6
+	KERNEL8x3_M7
+	KERNEL8x3_E
+
+	jmp .L7_16
+
+
+.L7_14:
+
+	test $1, %rax
+	jz .L7_15
+
+	KERNEL8x3_INIT
+	KERNEL8x3_M2
+	KERNEL8x3_M3
+	KERNEL8x3_M4
+	KERNEL8x3_M5
+	KERNEL8x3_M6
+	KERNEL8x3_M7
+	KERNEL8x3_E
+
+	jmp .L7_16
+
+
+
+.L7_15:
+
+	INIT8x3
+
+.L7_16:
+	movq K, %rax
+
+	andq $7, %rax # if (k & 1)
+	je .L7_19
+
+
+	ALIGN_4
+
+.L7_17:
+
+	KERNEL8x3_SUBN
+	dec %rax
+	jne .L7_17
+	ALIGN_4
+
+
+.L7_19:
+
+	SAVE8x3
+
+	decq I # i --
+	jg .L7_11
+	ALIGN_4
+
+.L7_20:
+	// Test rest of M
+
+	testq $7, M
+	jz .L7_60 // to next 6 lines of N
+
+	testq $4, M
+	jz .L7_30
+
+	ALIGN_4
+
+.L7_21:
+	leaq BUFFER2, BO // second buffer to BO
+	addq $6 * SIZE, BO
+
+
+	vzeroall
+
+	movq K, %rax
+
+	andq $-8, %rax
+	je .L7_26
+	movq %rax, BI // Index for BO
+	leaq (BI,BI,2), BI // BI = BI * 3 ; number of values
+
+	salq $2, %rax // rax = rax * 4 ; number of values
+	leaq (AO, %rax, 8), AO
+	leaq (BO, BI, 8), BO
+	negq BI
+	negq %rax
+	ALIGN_4
+
+.L7_22:
+
+	//prefetcht0 B_PR1(BO,BI,8)
+	KERNEL4x3_1(xxx)
+	KERNEL4x3_2(xxx)
+	KERNEL4x3_3(xxx)
+	//prefetcht0 B_PR1+64(BO,BI,8)
+	KERNEL4x3_4(xxx)
+
+	KERNEL4x3_1(xxx)
+	KERNEL4x3_2(xxx)
+	//prefetcht0 B_PR1+32(BO,BI,8)
+	KERNEL4x3_3(xxx)
+	KERNEL4x3_4(xxx)
+
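+	// %rax and BI were negated above and count up toward zero; the
+	// trailing addq pair at the end of each KERNEL4x3_4 advances them
+	// and sets the flags, so the je below exits the unrolled loop
+	// exactly when the blocked (K & -8) iterations are consumed.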
je .L7_26 + + //prefetcht0 B_PR1(BO,BI,8) + KERNEL4x3_1(xxx) + KERNEL4x3_2(xxx) + KERNEL4x3_3(xxx) + //prefetcht0 B_PR1+64(BO,BI,8) + KERNEL4x3_4(xxx) + + KERNEL4x3_1(xxx) + KERNEL4x3_2(xxx) + //prefetcht0 B_PR1+32(BO,BI,8) + KERNEL4x3_3(xxx) + KERNEL4x3_4(xxx) + + je .L7_26 + + jmp .L7_22 + ALIGN_4 + +.L7_26: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L7_29 + + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L7_27: + + KERNEL4x3_SUB(xxx) + addq $3, BI + addq $4, %rax + jl .L7_27 + ALIGN_4 + + +.L7_29: + + vmovddup ALPHA, %xmm0 + + + vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 + vfmaddpd 2 * SIZE(CO1),%xmm0, %xmm7,%xmm7 + + vfmaddpd (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddpd 2 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 + + vfmaddpd (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 + vfmaddpd 2 * SIZE(CO1, LDC, 2),%xmm0, %xmm9,%xmm9 + + + vmovups %xmm4 , (CO1) + vmovups %xmm7 , 2 * SIZE(CO1) + + vmovups %xmm5 , (CO1, LDC) + vmovups %xmm8 , 2 * SIZE(CO1, LDC) + + vmovups %xmm6 , (CO1, LDC, 2) + vmovups %xmm9 , 2 * SIZE(CO1, LDC, 2) + + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + +.L7_30: + testq $2, M + jz .L7_40 + + ALIGN_4 + +.L7_31: + leaq BUFFER2, BO // second buffer to BO + addq $6 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L7_36 + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L7_32: + + //prefetcht0 B_PR1(BO,BI,8) + KERNEL2x3_1(xxx) + KERNEL2x3_2(xxx) + KERNEL2x3_3(xxx) + //prefetcht0 B_PR1+64(BO,BI,8) + KERNEL2x3_4(xxx) + + KERNEL2x3_1(xxx) + KERNEL2x3_2(xxx) + //prefetcht0 B_PR1+32(BO,BI,8) + KERNEL2x3_3(xxx) + KERNEL2x3_4(xxx) + + je .L7_36 + + //prefetcht0 B_PR1(BO,BI,8) + KERNEL2x3_1(xxx) + KERNEL2x3_2(xxx) + KERNEL2x3_3(xxx) + //prefetcht0 B_PR1+64(BO,BI,8) + KERNEL2x3_4(xxx) + + KERNEL2x3_1(xxx) + KERNEL2x3_2(xxx) + //prefetcht0 B_PR1+32(BO,BI,8) + KERNEL2x3_3(xxx) + KERNEL2x3_4(xxx) + + je .L7_36 + + jmp .L7_32 + ALIGN_4 + +.L7_36: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L7_39 + + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L7_37: + + KERNEL2x3_SUB(xxx) + addq $3, BI + addq $2, %rax + jl .L7_37 + ALIGN_4 + + +.L7_39: + + vmovddup ALPHA, %xmm0 + + + vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 + vfmaddpd (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddpd (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 + + + vmovups %xmm4 , (CO1) + vmovups %xmm5 , (CO1, LDC) + vmovups %xmm6 , (CO1, LDC, 2) + + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + + + + + +.L7_40: + testq $1, M + jz .L7_60 // to next 6 lines of N + + ALIGN_4 + +.L7_41: + leaq BUFFER2, BO // second buffer to BO + addq $6 * SIZE, BO + + vzeroall + + movq K, %rax + + + andq $-8, %rax + je .L7_46 + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L7_42: + + //prefetcht0 B_PR1(BO,BI,8) + KERNEL1x3_1(xxx) + KERNEL1x3_2(xxx) + KERNEL1x3_3(xxx) + //prefetcht0 B_PR1+64(BO,BI,8) + KERNEL1x3_4(xxx) + + KERNEL1x3_1(xxx) + KERNEL1x3_2(xxx) + //prefetcht0 B_PR1+32(BO,BI,8) + KERNEL1x3_3(xxx) + KERNEL1x3_4(xxx) + + 
je .L7_46 + + //prefetcht0 B_PR1(BO,BI,8) + KERNEL1x3_1(xxx) + KERNEL1x3_2(xxx) + KERNEL1x3_3(xxx) + //prefetcht0 B_PR1+64(BO,BI,8) + KERNEL1x3_4(xxx) + + KERNEL1x3_1(xxx) + KERNEL1x3_2(xxx) + //prefetcht0 B_PR1+32(BO,BI,8) + KERNEL1x3_3(xxx) + KERNEL1x3_4(xxx) + + je .L7_46 + + jmp .L7_42 + ALIGN_4 + +.L7_46: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L7_49 + + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L7_47: + + KERNEL1x3_SUB(xxx) + addq $3, BI + addq $1, %rax + jl .L7_47 + ALIGN_4 + + +.L7_49: + + vmovddup ALPHA, %xmm0 + + + vfmaddsd (CO1),%xmm0, %xmm4,%xmm4 + vfmaddsd (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddsd (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 + + + vmovsd %xmm4 , (CO1) + vmovsd %xmm5 , (CO1, LDC) + vmovsd %xmm6 , (CO1, LDC, 2) + + + addq $1 * SIZE, CO1 # coffset += 1 + +.L7_60: + + decq J // j -- + jg .L6_01 + + +.L2_0: + cmpq $0, Nmod6 // N % 6 == 0 + je .L999 + +/************************************************************************************************ +* Loop for Nmod6 / 2 > 0 +*************************************************************************************************/ + + movq Nmod6, J + sarq $1, J // j = j / 2 + je .L1_0 + ALIGN_4 + +.L2_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + ALIGN_4 + +.L2_02b: + + vmovups (BO1), %xmm0 + vmovups %xmm0, (BO) + addq $2*SIZE,BO1 + addq $2*SIZE,BO + decq %rax + jnz .L2_02b + +.L2_02c: + + movq BO1, B // next offset of B + +.L2_10: + movq C, CO1 + leaq (C, LDC, 2), C // c += 2 * ldc + + + movq A, AO // aoffset = a + addq $16 * SIZE, AO + + movq M, I + sarq $3, I // i = (m >> 3) + je .L2_20 + + ALIGN_4 + +.L2_11: + + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax // K = K - ( K % 8 ) + je .L2_16 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L2_12: + + //prefetcht0 B_PR1(BO,BI,8) + KERNEL8x2_1(xxx) + KERNEL8x2_2(xxx) + KERNEL8x2_3(xxx) + KERNEL8x2_4(xxx) + + //prefetcht0 B_PR1(BO,BI,8) + KERNEL8x2_1(xxx) + KERNEL8x2_2(xxx) + KERNEL8x2_3(xxx) + KERNEL8x2_4(xxx) + + je .L2_16 + + //prefetcht0 B_PR1(BO,BI,8) + KERNEL8x2_1(xxx) + KERNEL8x2_2(xxx) + KERNEL8x2_3(xxx) + KERNEL8x2_4(xxx) + + //prefetcht0 B_PR1(BO,BI,8) + KERNEL8x2_1(xxx) + KERNEL8x2_2(xxx) + KERNEL8x2_3(xxx) + KERNEL8x2_4(xxx) + + je .L2_16 + + jmp .L2_12 + ALIGN_4 + +.L2_16: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L2_19 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L2_17: + + KERNEL8x2_SUB(xxx) + addq $2, BI + addq $8, %rax + jl .L2_17 + ALIGN_4 + + +.L2_19: + + vmovddup ALPHA, %xmm0 + + vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 + vfmaddpd 2 * SIZE(CO1),%xmm0, %xmm7,%xmm7 + vfmaddpd 4 * SIZE(CO1),%xmm0, %xmm10,%xmm10 + vfmaddpd 6 * SIZE(CO1),%xmm0, %xmm13,%xmm13 + + vfmaddpd (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddpd 2 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 + vfmaddpd 4 * SIZE(CO1, LDC),%xmm0, %xmm11,%xmm11 + vfmaddpd 6 * SIZE(CO1, LDC),%xmm0, %xmm14,%xmm14 + + vmovups %xmm4 , (CO1) + vmovups %xmm7 , 2 * SIZE(CO1) + vmovups %xmm10, 4 * SIZE(CO1) + vmovups %xmm13, 6 * 
SIZE(CO1) + + vmovups %xmm5 , (CO1, LDC) + vmovups %xmm8 , 2 * SIZE(CO1, LDC) + vmovups %xmm11, 4 * SIZE(CO1, LDC) + vmovups %xmm14, 6 * SIZE(CO1, LDC) + + + addq $8 * SIZE, CO1 # coffset += 8 + decq I # i -- + jg .L2_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L2_20: + // Test rest of M + + testq $7, M + jz .L2_60 // to next 2 lines of N + + testq $4, M + jz .L2_30 + + ALIGN_4 + +.L2_21: + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L2_26 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L2_22: + + //prefetcht0 B_PR1(BO,BI,8) + KERNEL4x2_1(xxx) + KERNEL4x2_2(xxx) + KERNEL4x2_3(xxx) + KERNEL4x2_4(xxx) + + //prefetcht0 B_PR1(BO,BI,8) + KERNEL4x2_1(xxx) + KERNEL4x2_2(xxx) + KERNEL4x2_3(xxx) + KERNEL4x2_4(xxx) + + je .L2_26 + + //prefetcht0 B_PR1(BO,BI,8) + KERNEL4x2_1(xxx) + KERNEL4x2_2(xxx) + KERNEL4x2_3(xxx) + KERNEL4x2_4(xxx) + + //prefetcht0 B_PR1(BO,BI,8) + KERNEL4x2_1(xxx) + KERNEL4x2_2(xxx) + KERNEL4x2_3(xxx) + KERNEL4x2_4(xxx) + + je .L2_26 + + jmp .L2_22 + ALIGN_4 + +.L2_26: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L2_29 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L2_27: + + KERNEL4x2_SUB(xxx) + addq $2, BI + addq $4, %rax + jl .L2_27 + ALIGN_4 + + +.L2_29: + + vmovddup ALPHA, %xmm0 + + vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 + vfmaddpd 2 * SIZE(CO1),%xmm0, %xmm7,%xmm7 + + vfmaddpd (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddpd 2 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 + + vmovups %xmm4 , (CO1) + vmovups %xmm7 , 2 * SIZE(CO1) + + vmovups %xmm5 , (CO1, LDC) + vmovups %xmm8 , 2 * SIZE(CO1, LDC) + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + +.L2_30: + testq $2, M + jz .L2_40 + + ALIGN_4 + +.L2_31: + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L2_36 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L2_32: + + //prefetcht0 B_PR1(BO,BI,8) + KERNEL2x2_1(xxx) + KERNEL2x2_2(xxx) + KERNEL2x2_3(xxx) + KERNEL2x2_4(xxx) + + //prefetcht0 B_PR1(BO,BI,8) + KERNEL2x2_1(xxx) + KERNEL2x2_2(xxx) + KERNEL2x2_3(xxx) + KERNEL2x2_4(xxx) + + je .L2_36 + + //prefetcht0 B_PR1(BO,BI,8) + KERNEL2x2_1(xxx) + KERNEL2x2_2(xxx) + KERNEL2x2_3(xxx) + KERNEL2x2_4(xxx) + + //prefetcht0 B_PR1(BO,BI,8) + KERNEL2x2_1(xxx) + KERNEL2x2_2(xxx) + KERNEL2x2_3(xxx) + KERNEL2x2_4(xxx) + + je .L2_36 + + jmp .L2_32 + ALIGN_4 + +.L2_36: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L2_39 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L2_37: + + KERNEL2x2_SUB(xxx) + addq $2, BI + addq $2, %rax + jl .L2_37 + ALIGN_4 + + +.L2_39: + + vmovddup ALPHA, %xmm0 + + vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 + vfmaddpd (CO1, LDC),%xmm0, %xmm5,%xmm5 + + vmovups %xmm4 , 
(CO1) + vmovups %xmm5 , (CO1, LDC) + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + + +.L2_40: + testq $1, M + jz .L2_60 // to next 2 lines of N + + ALIGN_4 + +.L2_41: + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L2_46 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L2_42: + + //prefetcht0 B_PR1(BO,BI,8) + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + //prefetcht0 B_PR1(BO,BI,8) + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + je .L2_46 + + //prefetcht0 B_PR1(BO,BI,8) + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + //prefetcht0 B_PR1(BO,BI,8) + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + je .L2_46 + + jmp .L2_42 + ALIGN_4 + +.L2_46: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L2_49 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L2_47: + + KERNEL1x2_SUB(xxx) + addq $2, BI + addq $1, %rax + jl .L2_47 + ALIGN_4 + + +.L2_49: + + vmovddup ALPHA, %xmm0 + + vfmaddsd (CO1),%xmm0, %xmm4,%xmm4 + vfmaddsd (CO1, LDC),%xmm0, %xmm5,%xmm5 + + vmovsd %xmm4 , (CO1) + vmovsd %xmm5 , (CO1, LDC) + + + addq $1 * SIZE, CO1 # coffset += 1 + ALIGN_4 + + + +.L2_60: + + decq J // j -- + jg .L2_01 // next 2 lines of N + + + +.L1_0: + +/************************************************************************************************ +* Loop for Nmod6 % 2 > 0 +*************************************************************************************************/ + + movq Nmod6, J + andq $1, J // j % 2 + je .L999 + ALIGN_4 + +.L1_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + ALIGN_4 + +.L1_02b: + + vmovsd (BO1), %xmm0 + vmovsd %xmm0, (BO) + addq $1*SIZE,BO1 + addq $1*SIZE,BO + decq %rax + jnz .L1_02b + +.L1_02c: + + movq BO1, B // next offset of B + +.L1_10: + movq C, CO1 + leaq (C, LDC, 1), C // c += 1 * ldc + + movq A, AO // aoffset = a + addq $16 * SIZE, AO + + movq M, I + sarq $3, I // i = (m >> 3) + je .L1_20 + + ALIGN_4 + +.L1_11: + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax // K = K - ( K % 8 ) + je .L1_16 + movq %rax, BI // Index for BO + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L1_12: + + //prefetcht0 B_PR1(BO,BI,8) + KERNEL8x1_1(xxx) + KERNEL8x1_2(xxx) + KERNEL8x1_3(xxx) + KERNEL8x1_4(xxx) + + KERNEL8x1_1(xxx) + KERNEL8x1_2(xxx) + KERNEL8x1_3(xxx) + KERNEL8x1_4(xxx) + + je .L1_16 + + //prefetcht0 B_PR1(BO,BI,8) + KERNEL8x1_1(xxx) + KERNEL8x1_2(xxx) + KERNEL8x1_3(xxx) + KERNEL8x1_4(xxx) + + KERNEL8x1_1(xxx) + KERNEL8x1_2(xxx) + KERNEL8x1_3(xxx) + KERNEL8x1_4(xxx) + + je .L1_16 + + jmp .L1_12 + ALIGN_4 + +.L1_16: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L1_19 + + movq %rax, BI // Index for BO + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L1_17: + + KERNEL8x1_SUB(xxx) + addq $1, BI + addq $8, %rax + jl .L1_17 + ALIGN_4 + + +.L1_19: + + vmovddup ALPHA, %xmm0 + + vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 + vfmaddpd 2 * SIZE(CO1),%xmm0, %xmm7,%xmm7 + vfmaddpd 4 
* SIZE(CO1),%xmm0, %xmm10,%xmm10 + vfmaddpd 6 * SIZE(CO1),%xmm0, %xmm13,%xmm13 + + vmovups %xmm4 , (CO1) + vmovups %xmm7 , 2 * SIZE(CO1) + vmovups %xmm10, 4 * SIZE(CO1) + vmovups %xmm13, 6 * SIZE(CO1) + + addq $8 * SIZE, CO1 # coffset += 8 + decq I # i -- + jg .L1_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L1_20: + // Test rest of M + + testq $7, M + jz .L999 + + testq $4, M + jz .L1_30 + + ALIGN_4 + +.L1_21: + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L1_26 + movq %rax, BI // Index for BO + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L1_22: + + //prefetcht0 B_PR1(BO,BI,8) + KERNEL4x1_1(xxx) + KERNEL4x1_2(xxx) + KERNEL4x1_3(xxx) + KERNEL4x1_4(xxx) + + KERNEL4x1_1(xxx) + KERNEL4x1_2(xxx) + KERNEL4x1_3(xxx) + KERNEL4x1_4(xxx) + + je .L1_26 + + //prefetcht0 B_PR1(BO,BI,8) + KERNEL4x1_1(xxx) + KERNEL4x1_2(xxx) + KERNEL4x1_3(xxx) + KERNEL4x1_4(xxx) + + KERNEL4x1_1(xxx) + KERNEL4x1_2(xxx) + KERNEL4x1_3(xxx) + KERNEL4x1_4(xxx) + + je .L1_26 + + jmp .L1_22 + ALIGN_4 + +.L1_26: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L1_29 + + movq %rax, BI // Index for BO + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L1_27: + + KERNEL4x1_SUB(xxx) + addq $1, BI + addq $4, %rax + jl .L1_27 + ALIGN_4 + + +.L1_29: + + vmovddup ALPHA, %xmm0 + + vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 + vfmaddpd 2 * SIZE(CO1),%xmm0, %xmm7,%xmm7 + + vmovups %xmm4 , (CO1) + vmovups %xmm7 , 2 * SIZE(CO1) + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + +.L1_30: + testq $2, M + jz .L1_40 + + ALIGN_4 + +.L1_31: + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L1_36 + movq %rax, BI // Index for BO + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L1_32: + + //prefetcht0 B_PR1(BO,BI,8) + KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + je .L1_36 + + KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + je .L1_36 + + jmp .L1_32 + ALIGN_4 + +.L1_36: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L1_39 + + movq %rax, BI // Index for BO + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L1_37: + + KERNEL2x1_SUB(xxx) + addq $1, BI + addq $2, %rax + jl .L1_37 + ALIGN_4 + + +.L1_39: + + vmovddup ALPHA, %xmm0 + + vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 + + vmovups %xmm4 , (CO1) + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + + +.L1_40: + testq $1, M + jz .L999 + + ALIGN_4 + +.L1_41: + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L1_46 + movq %rax, BI // Index for BO + + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L1_42: + + //prefetcht0 B_PR1(BO,BI,8) + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + 
KERNEL1x1_4(xxx) + + je .L1_46 + + //prefetcht0 B_PR1(BO,BI,8) + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + je .L1_46 + + jmp .L1_42 + ALIGN_4 + +.L1_46: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L1_49 + + movq %rax, BI // Index for BO + + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L1_47: + + KERNEL1x1_SUB(xxx) + addq $1, BI + addq $1, %rax + jl .L1_47 + ALIGN_4 + + +.L1_49: + + vmovddup ALPHA, %xmm0 + + vfmaddsd (CO1),%xmm0, %xmm4,%xmm4 + + vmovsd %xmm4 , (CO1) + + addq $1 * SIZE, CO1 # coffset += 1 + ALIGN_4 + + +.L999: + movq SP, %rsp + movq (%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE + + +#else +/************************************************************************************* +* TRMM Kernel +*************************************************************************************/ + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + movq %rbx, (%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + + vzeroupper + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, OLD_K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC +#ifdef TRMMKERNEL + movsd OLD_OFFSET, %xmm12 +#endif + vmovaps %xmm3, %xmm0 + +#else + movq STACKSIZE + 8(%rsp), LDC +#ifdef TRMMKERNEL + movsd STACKSIZE + 16(%rsp), %xmm12 +#endif + +#endif + + movq %rsp, SP # save old stack + subq $128 + L_BUFFER_SIZE, %rsp + andq $-4096, %rsp # align stack + + STACK_TOUCH + + cmpq $0, OLD_M + je .L999 + + cmpq $0, OLD_N + je .L999 + + cmpq $0, OLD_K + je .L999 + + movq OLD_M, M + movq OLD_N, N + movq OLD_K, K + + vmovsd %xmm0, ALPHA + + salq $BASE_SHIFT, LDC + + movq N, %rax + xorq %rdx, %rdx + movq $2, %rdi + divq %rdi // N / 2 + movq %rax, Ndiv6 // N / 2 + movq %rdx, Nmod6 // N % 2 + + + +#ifdef TRMMKERNEL + vmovsd %xmm12, OFFSET + vmovsd %xmm12, KK +#ifndef LEFT + negq KK +#endif +#endif + + movq Ndiv6, J + cmpq $0, J + je .L1_0 + ALIGN_4 + +.L2_0: + +.L2_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + ALIGN_4 + +.L2_02b: + + vmovups (BO1), %xmm0 + vmovups %xmm0, (BO) + addq $2*SIZE,BO1 + addq $2*SIZE,BO + decq %rax + jnz .L2_02b + +.L2_02c: + + movq BO1, B // next offset of B + +.L2_10: + movq C, CO1 + leaq (C, LDC, 2), C // c += 2 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq A, AO // aoffset = a + addq $16 * SIZE, AO + + movq M, I + sarq $3, I // i = (m >> 3) + je .L2_20 + + ALIGN_4 + +.L2_11: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) 
&& !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, 8), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, 8), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $8, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax // K = K - ( K % 8 ) + je .L2_16 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L2_12: + + //prefetcht0 B_PR1(BO,BI,8) + KERNEL8x2_1(xxx) + KERNEL8x2_2(xxx) + KERNEL8x2_3(xxx) + KERNEL8x2_4(xxx) + + //prefetcht0 B_PR1(BO,BI,8) + KERNEL8x2_1(xxx) + KERNEL8x2_2(xxx) + KERNEL8x2_3(xxx) + KERNEL8x2_4(xxx) + + je .L2_16 + + //prefetcht0 B_PR1(BO,BI,8) + KERNEL8x2_1(xxx) + KERNEL8x2_2(xxx) + KERNEL8x2_3(xxx) + KERNEL8x2_4(xxx) + + //prefetcht0 B_PR1(BO,BI,8) + KERNEL8x2_1(xxx) + KERNEL8x2_2(xxx) + KERNEL8x2_3(xxx) + KERNEL8x2_4(xxx) + + je .L2_16 + + jmp .L2_12 + ALIGN_4 + +.L2_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_19 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L2_17: + + KERNEL8x2_SUB(xxx) + addq $2, BI + addq $8, %rax + jl .L2_17 + ALIGN_4 + + +.L2_19: + + vmovddup ALPHA, %xmm0 + +#ifndef TRMMKERNEL + + vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 + vfmaddpd 2 * SIZE(CO1),%xmm0, %xmm7,%xmm7 + vfmaddpd 4 * SIZE(CO1),%xmm0, %xmm10,%xmm10 + vfmaddpd 6 * SIZE(CO1),%xmm0, %xmm13,%xmm13 + + vfmaddpd (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddpd 2 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 + vfmaddpd 4 * SIZE(CO1, LDC),%xmm0, %xmm11,%xmm11 + vfmaddpd 6 * SIZE(CO1, LDC),%xmm0, %xmm14,%xmm14 + + +#else + vmulpd %xmm0, %xmm4,%xmm4 + vmulpd %xmm0, %xmm7,%xmm7 + vmulpd %xmm0, %xmm10,%xmm10 + vmulpd %xmm0, %xmm13,%xmm13 + + vmulpd %xmm0, %xmm5,%xmm5 + vmulpd %xmm0, %xmm8,%xmm8 + vmulpd %xmm0, %xmm11,%xmm11 + vmulpd %xmm0, %xmm14,%xmm14 + +#endif + + vmovups %xmm4 , (CO1) + vmovups %xmm7 , 2 * SIZE(CO1) + vmovups %xmm10, 4 * SIZE(CO1) + vmovups %xmm13, 6 * SIZE(CO1) + + vmovups %xmm5 , (CO1, LDC) + vmovups %xmm8 , 2 * SIZE(CO1, LDC) + vmovups %xmm11, 4 * SIZE(CO1, LDC) + vmovups %xmm14, 6 * SIZE(CO1, LDC) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, 8), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, 8), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $8, KK +#endif + + addq $8 * SIZE, CO1 # coffset += 8 + decq I # i -- + jg .L2_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L2_20: + 
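+	// M % 8 leftover rows for the 2-column TRMM path: fall through the
+	// 4x2, 2x2 and 1x2 tail kernels below. Each tail re-derives its
+	// AO/BO start from KK where required and, when LEFT is defined,
+	// advances KK by its own tile height (4, 2, then 1).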
// Test rest of M + + testq $7, M + jz .L2_60 // to next 2 lines of N + + testq $4, M + jz .L2_30 + + ALIGN_4 + +.L2_21: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, 8), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, 8), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L2_26 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L2_22: + + //prefetcht0 B_PR1(BO,BI,8) + KERNEL4x2_1(xxx) + KERNEL4x2_2(xxx) + KERNEL4x2_3(xxx) + KERNEL4x2_4(xxx) + + //prefetcht0 B_PR1(BO,BI,8) + KERNEL4x2_1(xxx) + KERNEL4x2_2(xxx) + KERNEL4x2_3(xxx) + KERNEL4x2_4(xxx) + + je .L2_26 + + //prefetcht0 B_PR1(BO,BI,8) + KERNEL4x2_1(xxx) + KERNEL4x2_2(xxx) + KERNEL4x2_3(xxx) + KERNEL4x2_4(xxx) + + //prefetcht0 B_PR1(BO,BI,8) + KERNEL4x2_1(xxx) + KERNEL4x2_2(xxx) + KERNEL4x2_3(xxx) + KERNEL4x2_4(xxx) + + je .L2_26 + + jmp .L2_22 + ALIGN_4 + +.L2_26: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_29 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L2_27: + + KERNEL4x2_SUB(xxx) + addq $2, BI + addq $4, %rax + jl .L2_27 + ALIGN_4 + + +.L2_29: + + vmovddup ALPHA, %xmm0 + +#ifndef TRMMKERNEL + + vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 + vfmaddpd 2 * SIZE(CO1),%xmm0, %xmm7,%xmm7 + + vfmaddpd (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddpd 2 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 + + +#else + vmulpd %xmm0, %xmm4,%xmm4 + vmulpd %xmm0, %xmm7,%xmm7 + + vmulpd %xmm0, %xmm5,%xmm5 + vmulpd %xmm0, %xmm8,%xmm8 + +#endif + + vmovups %xmm4 , (CO1) + vmovups %xmm7 , 2 * SIZE(CO1) + + vmovups %xmm5 , (CO1, LDC) + vmovups %xmm8 , 2 * SIZE(CO1, LDC) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, 8), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, 8), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + +.L2_30: + testq $2, M + jz .L2_40 + + ALIGN_4 + +.L2_31: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; 
number of values + leaq (BO, BI, 8), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, 8), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L2_36 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L2_32: + + //prefetcht0 B_PR1(BO,BI,8) + KERNEL2x2_1(xxx) + KERNEL2x2_2(xxx) + KERNEL2x2_3(xxx) + KERNEL2x2_4(xxx) + + //prefetcht0 B_PR1(BO,BI,8) + KERNEL2x2_1(xxx) + KERNEL2x2_2(xxx) + KERNEL2x2_3(xxx) + KERNEL2x2_4(xxx) + + je .L2_36 + + //prefetcht0 B_PR1(BO,BI,8) + KERNEL2x2_1(xxx) + KERNEL2x2_2(xxx) + KERNEL2x2_3(xxx) + KERNEL2x2_4(xxx) + + //prefetcht0 B_PR1(BO,BI,8) + KERNEL2x2_1(xxx) + KERNEL2x2_2(xxx) + KERNEL2x2_3(xxx) + KERNEL2x2_4(xxx) + + je .L2_36 + + jmp .L2_32 + ALIGN_4 + +.L2_36: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_39 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L2_37: + + KERNEL2x2_SUB(xxx) + addq $2, BI + addq $2, %rax + jl .L2_37 + ALIGN_4 + + +.L2_39: + + vmovddup ALPHA, %xmm0 + +#ifndef TRMMKERNEL + + vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 + vfmaddpd (CO1, LDC),%xmm0, %xmm5,%xmm5 + +#else + vmulpd %xmm0, %xmm4,%xmm4 + vmulpd %xmm0, %xmm5,%xmm5 + +#endif + + vmovups %xmm4 , (CO1) + vmovups %xmm5 , (CO1, LDC) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, 8), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, 8), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + + +.L2_40: + testq $1, M + jz .L2_60 // to next 2 lines of N + + ALIGN_4 + +.L2_41: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, 8), BO + leaq (AO, %rax, 8), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax + je .L2_46 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L2_42: + + //prefetcht0 B_PR1(BO,BI,8) + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + 
KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + //prefetcht0 B_PR1(BO,BI,8) + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + je .L2_46 + + //prefetcht0 B_PR1(BO,BI,8) + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + //prefetcht0 B_PR1(BO,BI,8) + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + je .L2_46 + + jmp .L2_42 + ALIGN_4 + +.L2_46: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_49 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L2_47: + + KERNEL1x2_SUB(xxx) + addq $2, BI + addq $1, %rax + jl .L2_47 + ALIGN_4 + + +.L2_49: + + vmovddup ALPHA, %xmm0 + +#ifndef TRMMKERNEL + + vfmaddsd (CO1),%xmm0, %xmm4,%xmm4 + vfmaddsd (CO1, LDC),%xmm0, %xmm5,%xmm5 + +#else + vmulsd %xmm0, %xmm4,%xmm4 + vmulsd %xmm0, %xmm5,%xmm5 + +#endif + + vmovsd %xmm4 , (CO1) + vmovsd %xmm5 , (CO1, LDC) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, 8), BO + leaq (AO, %rax, 8), AO +#endif + + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + + addq $1 * SIZE, CO1 # coffset += 1 + ALIGN_4 + + + +.L2_60: +#if defined(TRMMKERNEL) && !defined(LEFT) + addq $2, KK +#endif + + decq J // j -- + jg .L2_01 // next 2 lines of N + + + +.L1_0: + +/************************************************************************************************ +* Loop for Nmod6 % 2 > 0 +*************************************************************************************************/ + + movq Nmod6, J + andq $1, J // j % 2 + je .L999 + ALIGN_4 + +.L1_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + ALIGN_4 + +.L1_02b: + + vmovsd (BO1), %xmm0 + vmovsd %xmm0, (BO) + addq $1*SIZE,BO1 + addq $1*SIZE,BO + decq %rax + jnz .L1_02b + +.L1_02c: + + movq BO1, B // next offset of B + +.L1_10: + movq C, CO1 + leaq (C, LDC, 1), C // c += 1 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq A, AO // aoffset = a + addq $16 * SIZE, AO + + movq M, I + sarq $3, I // i = (m >> 3) + je .L1_20 + + ALIGN_4 + +.L1_11: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, 8), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, 8), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $8, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax // K = K - ( K % 8 ) + je .L1_16 + movq %rax, BI // Index for BO + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L1_12: + + //prefetcht0 B_PR1(BO,BI,8) + KERNEL8x1_1(xxx) + 
KERNEL8x1_2(xxx) + KERNEL8x1_3(xxx) + KERNEL8x1_4(xxx) + + KERNEL8x1_1(xxx) + KERNEL8x1_2(xxx) + KERNEL8x1_3(xxx) + KERNEL8x1_4(xxx) + + je .L1_16 + + //prefetcht0 B_PR1(BO,BI,8) + KERNEL8x1_1(xxx) + KERNEL8x1_2(xxx) + KERNEL8x1_3(xxx) + KERNEL8x1_4(xxx) + + KERNEL8x1_1(xxx) + KERNEL8x1_2(xxx) + KERNEL8x1_3(xxx) + KERNEL8x1_4(xxx) + + je .L1_16 + + jmp .L1_12 + ALIGN_4 + +.L1_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_19 + + movq %rax, BI // Index for BO + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L1_17: + + KERNEL8x1_SUB(xxx) + addq $1, BI + addq $8, %rax + jl .L1_17 + ALIGN_4 + + +.L1_19: + + vmovddup ALPHA, %xmm0 + +#ifndef TRMMKERNEL + + vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 + vfmaddpd 2 * SIZE(CO1),%xmm0, %xmm7,%xmm7 + vfmaddpd 4 * SIZE(CO1),%xmm0, %xmm10,%xmm10 + vfmaddpd 6 * SIZE(CO1),%xmm0, %xmm13,%xmm13 + +#else + vmulpd %xmm0, %xmm4,%xmm4 + vmulpd %xmm0, %xmm7,%xmm7 + vmulpd %xmm0, %xmm10,%xmm10 + vmulpd %xmm0, %xmm13,%xmm13 + +#endif + + vmovups %xmm4 , (CO1) + vmovups %xmm7 , 2 * SIZE(CO1) + vmovups %xmm10, 4 * SIZE(CO1) + vmovups %xmm13, 6 * SIZE(CO1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BO, BI, 8), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, 8), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $8, KK +#endif + addq $8 * SIZE, CO1 # coffset += 8 + decq I # i -- + jg .L1_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L1_20: + // Test rest of M + + testq $7, M + jz .L999 + + testq $4, M + jz .L1_30 + + ALIGN_4 + +.L1_21: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, 8), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, 8), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L1_26 + movq %rax, BI // Index for BO + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L1_22: + + //prefetcht0 B_PR1(BO,BI,8) + KERNEL4x1_1(xxx) + KERNEL4x1_2(xxx) + KERNEL4x1_3(xxx) + KERNEL4x1_4(xxx) + + KERNEL4x1_1(xxx) + KERNEL4x1_2(xxx) + KERNEL4x1_3(xxx) + KERNEL4x1_4(xxx) + + je .L1_26 + + //prefetcht0 B_PR1(BO,BI,8) + KERNEL4x1_1(xxx) + KERNEL4x1_2(xxx) + KERNEL4x1_3(xxx) + KERNEL4x1_4(xxx) + + KERNEL4x1_1(xxx) + KERNEL4x1_2(xxx) + KERNEL4x1_3(xxx) + KERNEL4x1_4(xxx) + + je .L1_26 + + jmp .L1_22 + ALIGN_4 + +.L1_26: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_29 + + movq %rax, BI // Index for BO + + salq $2, 
%rax // rax = rax * 4 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L1_27: + + KERNEL4x1_SUB(xxx) + addq $1, BI + addq $4, %rax + jl .L1_27 + ALIGN_4 + + +.L1_29: + + vmovddup ALPHA, %xmm0 + +#ifndef TRMMKERNEL + + vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 + vfmaddpd 2 * SIZE(CO1),%xmm0, %xmm7,%xmm7 + +#else + vmulpd %xmm0, %xmm4,%xmm4 + vmulpd %xmm0, %xmm7,%xmm7 + +#endif + + vmovups %xmm4 , (CO1) + vmovups %xmm7 , 2 * SIZE(CO1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BO, BI, 8), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, 8), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + +.L1_30: + testq $2, M + jz .L1_40 + + ALIGN_4 + +.L1_31: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, 8), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, 8), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L1_36 + movq %rax, BI // Index for BO + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L1_32: + + //prefetcht0 B_PR1(BO,BI,8) + KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + je .L1_36 + + KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + je .L1_36 + + jmp .L1_32 + ALIGN_4 + +.L1_36: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_39 + + movq %rax, BI // Index for BO + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L1_37: + + KERNEL2x1_SUB(xxx) + addq $1, BI + addq $2, %rax + jl .L1_37 + ALIGN_4 + + +.L1_39: + + vmovddup ALPHA, %xmm0 + +#ifndef TRMMKERNEL + + vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 + +#else + vmulpd %xmm0, %xmm4,%xmm4 + +#endif + + vmovups %xmm4 , (CO1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BO, BI, 8), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, 8), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + + +.L1_40: + testq $1, M + jz .L999 + + ALIGN_4 + +.L1_41: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq 
BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, 8), BO + leaq (AO, %rax, 8), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax + je .L1_46 + movq %rax, BI // Index for BO + + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L1_42: + + //prefetcht0 B_PR1(BO,BI,8) + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + je .L1_46 + + //prefetcht0 B_PR1(BO,BI,8) + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + je .L1_46 + + jmp .L1_42 + ALIGN_4 + +.L1_46: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_49 + + movq %rax, BI // Index for BO + + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L1_47: + + KERNEL1x1_SUB(xxx) + addq $1, BI + addq $1, %rax + jl .L1_47 + ALIGN_4 + + +.L1_49: + + vmovddup ALPHA, %xmm0 + +#ifndef TRMMKERNEL + + vfmaddsd (CO1),%xmm0, %xmm4,%xmm4 + +#else + vmulsd %xmm0, %xmm4,%xmm4 + +#endif + + vmovsd %xmm4 , (CO1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BO, BI, 8), BO + leaq (AO, %rax, 8), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + + addq $1 * SIZE, CO1 # coffset += 1 + ALIGN_4 + + +.L999: + movq SP, %rsp + movq (%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE + + + +#endif diff --git a/param.h b/param.h index c7a9635a6..adfe867f8 100644 --- a/param.h +++ b/param.h @@ -330,9 +330,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define XGEMM_DEFAULT_UNROLL_M 1 #else #define SGEMM_DEFAULT_UNROLL_N 2 -#define DGEMM_DEFAULT_UNROLL_N 4 +#define DGEMM_DEFAULT_UNROLL_N 2 #define SGEMM_DEFAULT_UNROLL_M 16 -#define DGEMM_DEFAULT_UNROLL_M 6 +#define DGEMM_DEFAULT_UNROLL_M 8 #define QGEMM_DEFAULT_UNROLL_M 2 #define CGEMM_DEFAULT_UNROLL_M 4 #define ZGEMM_DEFAULT_UNROLL_M 2 @@ -347,7 +347,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if defined(ARCH_X86_64) #define SGEMM_DEFAULT_P 768 -#define DGEMM_DEFAULT_P 480 +#define DGEMM_DEFAULT_P 384 #else #define SGEMM_DEFAULT_P 448 #define DGEMM_DEFAULT_P 480 @@ -359,7 +359,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#if defined(ARCH_X86_64) #define SGEMM_DEFAULT_Q 168 -#define DGEMM_DEFAULT_Q 128 +#define DGEMM_DEFAULT_Q 168 #else #define SGEMM_DEFAULT_Q 224 #define DGEMM_DEFAULT_Q 224 @@ -371,7 +371,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define SGEMM_DEFAULT_R sgemm_r #define QGEMM_DEFAULT_R qgemm_r -#define DGEMM_DEFAULT_R dgemm_r +#define DGEMM_DEFAULT_R 12288 #define CGEMM_DEFAULT_R cgemm_r #define ZGEMM_DEFAULT_R zgemm_r #define XGEMM_DEFAULT_R xgemm_r From 7bccff1512f1e03c8c9e7598ca1879f72443b48f Mon Sep 17 00:00:00 2001 From: wernsaar Date: Tue, 29 Oct 2013 22:53:04 +0100 Subject: [PATCH 03/15] added sgemm_kernel for PILEDRIVER --- kernel/x86_64/KERNEL.PILEDRIVER | 2 +- kernel/x86_64/sgemm_kernel_16x2_piledriver.S | 5258 ++++++++++++++++++ param.h | 4 +- 3 files changed, 5261 insertions(+), 3 deletions(-) create mode 100644 kernel/x86_64/sgemm_kernel_16x2_piledriver.S diff --git a/kernel/x86_64/KERNEL.PILEDRIVER b/kernel/x86_64/KERNEL.PILEDRIVER index c2ed50ba1..2bc0ae709 100644 --- a/kernel/x86_64/KERNEL.PILEDRIVER +++ b/kernel/x86_64/KERNEL.PILEDRIVER @@ -7,7 +7,7 @@ DAXPYKERNEL = daxpy_bulldozer.S DDOTKERNEL = ddot_bulldozer.S DCOPYKERNEL = dcopy_bulldozer.S -SGEMMKERNEL = sgemm_kernel_16x2_bulldozer.S +SGEMMKERNEL = sgemm_kernel_16x2_piledriver.S SGEMMINCOPY = ../generic/gemm_ncopy_16.c SGEMMITCOPY = ../generic/gemm_tcopy_16.c SGEMMONCOPY = gemm_ncopy_2_bulldozer.S diff --git a/kernel/x86_64/sgemm_kernel_16x2_piledriver.S b/kernel/x86_64/sgemm_kernel_16x2_piledriver.S new file mode 100644 index 000000000..dcfed6bc5 --- /dev/null +++ b/kernel/x86_64/sgemm_kernel_16x2_piledriver.S @@ -0,0 +1,5258 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +/********************************************************************* +* +* 2013/10/18 Saar +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* +* +* 2013/10/29 Saar +* +* Parameter: +* UNROLL_M 16 +* UNROLL_N 2 +* SGEMM_P 768 +* SGEMM_Q 192 +* SGEMM_R 12288 +* A_PR1 384 +* B_PR1 192 +* +* Performance at m x n on AMD 8320 (ACML-Version: 5.3.1): +* +* 6144x6144 168.2 GFLOPS with 8 threads on 4 modules (ACML: 158.0 ) (BULLDOZER: 167.4 ) +* 6144x6144 162.7 GFLOPS with 4 threads on 4 modules (ACML: 157.6 ) (BULLDOZER: 159.0 ) +* 6144x6144 82.0 GFLOPS with 2 threads on 2 modules (ACML: 81.4 ) (BULLDOZER: 80.3 ) +* 6144x6144 41.3 GFLOPS with 1 threads on 1 modules (ACML: 41.1 ) (BULLDOZER: 40.4 ) +* +* Performance at m x n on AMD 6380 (ACML-Version: 5.3.1): +* +* 12288x12288 469.5 GFLOPS with 32 threads on 16 modules (ACML: 375.3 ) (BULLDOZER: 445.5 ) +* 12288x12288 442.9 GFLOPS with 16 threads on 16 modules (ACML: 378.5 ) (BULLDOZER: 416.3 ) +* 12288x12288 265.1 GFLOPS with 8 threads on 8 modules (ACML: 218.5 ) (BULLDOZER: 261.5 ) +* 6144x6144 139.7 GFLOPS with 4 threads on 4 modules (ACML: 116.0 ) (BULLDOZER: 137.7 ) +* 6144x6144 70.9 GFLOPS with 2 threads on 2 modules (ACML: 67.4 ) (BULLDOZER: 69.5 ) +* 6144x6144 35.6 GFLOPS with 1 threads on 1 modules (ACML: 36.1 ) (BULLDOZER: 35.1 ) +* +*********************************************************************/ + + + +#define ASSEMBLER +#include "common.h" + +#define OLD_M %rdi +#define OLD_N %rsi +#define M %r13 +#define J %r14 +#define OLD_K %rdx + +#define A %rcx +#define B %r8 +#define C %r9 +#define LDC %r10 + +#define I %r11 +#define AO %rdi +#define BO %rsi +#define CO1 %r15 +#define K %r12 +#define BI %rbp +#define SP %rbx + +#define BO1 %rdi +#define BO2 %r15 + +#ifndef WINDOWS_ABI + +#define STACKSIZE 96 + +#else + +#define STACKSIZE 256 + +#define OLD_A 40 + STACKSIZE(%rsp) +#define OLD_B 48 + STACKSIZE(%rsp) +#define OLD_C 56 + STACKSIZE(%rsp) +#define OLD_LDC 64 + STACKSIZE(%rsp) +#define OLD_OFFSET 72 + STACKSIZE(%rsp) + +#endif + +#define L_BUFFER_SIZE 512*8*4 +#define LB2_OFFSET 512*8*2 + +#define Ndiv6 24(%rsp) +#define Nmod6 32(%rsp) +#define N 40(%rsp) +#define ALPHA 48(%rsp) +#define OFFSET 56(%rsp) +#define KK 64(%rsp) +#define KKK 72(%rsp) +#define BUFFER1 128(%rsp) +#define BUFFER2 LB2_OFFSET+128(%rsp) + +#if defined(OS_WINDOWS) +#if L_BUFFER_SIZE > 16384 +#define STACK_TOUCH \ + movl $0, 4096 * 4(%rsp);\ + movl $0, 4096 * 3(%rsp);\ + movl $0, 4096 * 2(%rsp);\ + movl $0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 12288 +#define STACK_TOUCH \ + movl $0, 4096 * 3(%rsp);\ + movl $0, 4096 * 2(%rsp);\ + movl $0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 8192 +#define STACK_TOUCH \ + movl $0, 4096 * 2(%rsp);\ + movl $0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 4096 +#define STACK_TOUCH \ + movl $0, 4096 * 1(%rsp); +#else +#define STACK_TOUCH +#endif +#else +#define STACK_TOUCH +#endif + + + +#define A_PR1 384 +#define B_PR1 192 + +/******************************************************************************************* +* 3 lines of N +*******************************************************************************************/ + +#define KERNEL16x3_1(xx) \ + vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm3 ;\ + nop ;\ + vfmaddps 
%xmm6,%xmm3,%xmm0,%xmm6 ;\ + vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ + prefetcht0 A_PR1(AO,%rax,SIZE) ;\ + vfmaddps %xmm9,%xmm3,%xmm0,%xmm9 ;\ + vmovups -24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vfmaddps %xmm11,%xmm2,%xmm0,%xmm11 ;\ + vfmaddps %xmm12,%xmm3,%xmm0,%xmm12 ;\ + vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ + vfmaddps %xmm14,%xmm2,%xmm0,%xmm14 ;\ + vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm15,%xmm3,%xmm0,%xmm15 ;\ + +#define KERNEL16x3_2(xx) \ + vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm3 ;\ + nop ;\ + vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ + vmovups -12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ + prefetcht0 A_PR1+64(AO,%rax,SIZE) ;\ + vfmaddps %xmm9,%xmm3,%xmm0,%xmm9 ;\ + vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vfmaddps %xmm11,%xmm2,%xmm0,%xmm11 ;\ + vfmaddps %xmm12,%xmm3,%xmm0,%xmm12 ;\ + vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ + vfmaddps %xmm14,%xmm2,%xmm0,%xmm14 ;\ + vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm15,%xmm3,%xmm0,%xmm15 ;\ + +#define KERNEL16x3_3(xx) \ + vmovups 0 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm3 ;\ + nop ;\ + vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ + vmovups 4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ + prefetcht0 A_PR1+128(AO,%rax,SIZE) ;\ + vfmaddps %xmm9,%xmm3,%xmm0,%xmm9 ;\ + vmovups 8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vfmaddps %xmm11,%xmm2,%xmm0,%xmm11 ;\ + vfmaddps %xmm12,%xmm3,%xmm0,%xmm12 ;\ + vmovups 12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ + vfmaddps %xmm14,%xmm2,%xmm0,%xmm14 ;\ + vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vbroadcastss 4 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm15,%xmm3,%xmm0,%xmm15 ;\ + +#define KERNEL16x3_4(xx) \ + vmovups 16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vbroadcastss 5 * SIZE(BO, BI, SIZE), %xmm3 ;\ + nop ;\ + vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ + vmovups 20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ + prefetcht0 A_PR1+192(AO,%rax,SIZE) ;\ + vfmaddps %xmm9,%xmm3,%xmm0,%xmm9 ;\ + vmovups 24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vfmaddps %xmm11,%xmm2,%xmm0,%xmm11 ;\ + addq $12, BI ;\ + vfmaddps %xmm12,%xmm3,%xmm0,%xmm12 ;\ + vmovups 28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ + vfmaddps %xmm14,%xmm2,%xmm0,%xmm14 ;\ + addq $64, %rax ;\ + vfmaddps %xmm15,%xmm3,%xmm0,%xmm15 ;\ + +#define KERNEL16x3_SUB(xx) \ + vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm3 ;\ + nop ;\ + vfmaddps 
%xmm6,%xmm3,%xmm0,%xmm6 ;\ + vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ + vfmaddps %xmm9,%xmm3,%xmm0,%xmm9 ;\ + vmovups -24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vfmaddps %xmm11,%xmm2,%xmm0,%xmm11 ;\ + vfmaddps %xmm12,%xmm3,%xmm0,%xmm12 ;\ + vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ + vfmaddps %xmm14,%xmm2,%xmm0,%xmm14 ;\ + vfmaddps %xmm15,%xmm3,%xmm0,%xmm15 ;\ + + +/*******************************************************************************************/ + +#define KERNEL8x3_1(xx) \ + prefetcht0 A_PR1(AO,%rax,SIZE) ;\ + vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm3 ;\ + nop ;\ + vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ + vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ + vfmaddps %xmm9,%xmm3,%xmm0,%xmm9 ;\ + +#define KERNEL8x3_2(xx) \ + vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm3 ;\ + nop ;\ + vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ + vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ + vfmaddps %xmm9,%xmm3,%xmm0,%xmm9 ;\ + +#define KERNEL8x3_3(xx) \ + prefetcht0 A_PR1+64(AO,%rax,SIZE) ;\ + vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm3 ;\ + nop ;\ + vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ + vmovups -12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ + vfmaddps %xmm9,%xmm3,%xmm0,%xmm9 ;\ + +#define KERNEL8x3_4(xx) \ + vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss 4 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vbroadcastss 5 * SIZE(BO, BI, SIZE), %xmm3 ;\ + nop ;\ + vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ + vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ + vfmaddps %xmm9,%xmm3,%xmm0,%xmm9 ;\ + addq $12, BI ;\ + addq $32, %rax ;\ + +#define KERNEL8x3_SUB(xx) \ + vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm3 ;\ + nop ;\ + vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ + vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ + vfmaddps %xmm9,%xmm3,%xmm0,%xmm9 ;\ + + +/*******************************************************************************************/ + +#define KERNEL4x3_1(xx) \ + prefetcht0 A_PR1(AO,%rax,SIZE) ;\ + vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps 
%xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm3 ;\ + vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ + +#define KERNEL4x3_2(xx) \ + vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm3 ;\ + vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ + +#define KERNEL4x3_3(xx) \ + vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm3 ;\ + vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ + +#define KERNEL4x3_4(xx) \ + vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss 4 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vbroadcastss 5 * SIZE(BO, BI, SIZE), %xmm3 ;\ + vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ + addq $12, BI ;\ + addq $16, %rax ;\ + +#define KERNEL4x3_SUB(xx) \ + vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm3 ;\ + vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ + +/*******************************************************************************************/ + +#define KERNEL2x3_1(xx) \ + vmovss -6 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovss -5 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovss -4 * SIZE(BO, BI, SIZE), %xmm3 ;\ + vfmaddss %xmm6,%xmm3,%xmm0,%xmm6 ;\ + vmovss -31 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ + vfmaddss %xmm10,%xmm2,%xmm0,%xmm10 ;\ + vfmaddss %xmm12,%xmm3,%xmm0,%xmm12 ;\ + +#define KERNEL2x3_2(xx) \ + vmovss -3 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -30 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovss -2 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovss -1 * SIZE(BO, BI, SIZE), %xmm3 ;\ + vfmaddss %xmm6,%xmm3,%xmm0,%xmm6 ;\ + vmovss -29 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ + vfmaddss %xmm10,%xmm2,%xmm0,%xmm10 ;\ + vfmaddss %xmm12,%xmm3,%xmm0,%xmm12 ;\ + +#define KERNEL2x3_3(xx) \ + vmovss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovss 1 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovss 2 * SIZE(BO, BI, SIZE), %xmm3 ;\ + vfmaddss %xmm6,%xmm3,%xmm0,%xmm6 ;\ + vmovss -27 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ + vfmaddss %xmm10,%xmm2,%xmm0,%xmm10 ;\ + vfmaddss %xmm12,%xmm3,%xmm0,%xmm12 ;\ + +#define KERNEL2x3_4(xx) \ + vmovss 3 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -26 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovss 4 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovss 5 * SIZE(BO, BI, SIZE), %xmm3 ;\ + vfmaddss %xmm6,%xmm3,%xmm0,%xmm6 ;\ + vmovss -25 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ + vfmaddss 
%xmm10,%xmm2,%xmm0,%xmm10 ;\ + vfmaddss %xmm12,%xmm3,%xmm0,%xmm12 ;\ + addq $12, BI ;\ + addq $8, %rax ;\ + +#define KERNEL2x3_SUB(xx) \ + vmovss -6 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovss -5 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovss -4 * SIZE(BO, BI, SIZE), %xmm3 ;\ + vfmaddss %xmm6,%xmm3,%xmm0,%xmm6 ;\ + vmovss -31 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ + vfmaddss %xmm10,%xmm2,%xmm0,%xmm10 ;\ + vfmaddss %xmm12,%xmm3,%xmm0,%xmm12 ;\ + +/*******************************************************************************************/ + +#define KERNEL1x3_1(xx) \ + vmovss -6 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovss -5 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovss -4 * SIZE(BO, BI, SIZE), %xmm3 ;\ + vfmaddss %xmm6,%xmm3,%xmm0,%xmm6 ;\ + +#define KERNEL1x3_2(xx) \ + vmovss -3 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -31 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovss -2 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovss -1 * SIZE(BO, BI, SIZE), %xmm3 ;\ + vfmaddss %xmm6,%xmm3,%xmm0,%xmm6 ;\ + +#define KERNEL1x3_3(xx) \ + vmovss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -30 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovss 1 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovss 2 * SIZE(BO, BI, SIZE), %xmm3 ;\ + vfmaddss %xmm6,%xmm3,%xmm0,%xmm6 ;\ + +#define KERNEL1x3_4(xx) \ + vmovss 3 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -29 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovss 4 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovss 5 * SIZE(BO, BI, SIZE), %xmm3 ;\ + vfmaddss %xmm6,%xmm3,%xmm0,%xmm6 ;\ + addq $12, BI ;\ + addq $4, %rax ;\ + +#define KERNEL1x3_SUB(xx) \ + vmovss -6 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovss -5 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovss -4 * SIZE(BO, BI, SIZE), %xmm3 ;\ + vfmaddss %xmm6,%xmm3,%xmm0,%xmm6 ;\ + +/*******************************************************************************************/ + +/******************************************************************************************* +* 2 lines of N +*******************************************************************************************/ + +#define KERNEL16x2_1(xx) \ + prefetcht0 A_PR1(AO,%rax,SIZE) ;\ + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ + vmovups -24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vfmaddps %xmm11,%xmm2,%xmm0,%xmm11 ;\ + vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ + vfmaddps %xmm14,%xmm2,%xmm0,%xmm14 ;\ + +#define KERNEL16x2_2(xx) \ + prefetcht0 A_PR1+64(AO,%rax,SIZE) ;\ + vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps 
%xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovups -12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ + vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vfmaddps %xmm11,%xmm2,%xmm0,%xmm11 ;\ + vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ + vfmaddps %xmm14,%xmm2,%xmm0,%xmm14 ;\ + +#define KERNEL16x2_3(xx) \ + prefetcht0 A_PR1+128(AO,%rax,SIZE) ;\ + vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups 0 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovups 4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ + vmovups 8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vfmaddps %xmm11,%xmm2,%xmm0,%xmm11 ;\ + vmovups 12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ + vfmaddps %xmm14,%xmm2,%xmm0,%xmm14 ;\ + +#define KERNEL16x2_4(xx) \ + prefetcht0 A_PR1+192(AO,%rax,SIZE) ;\ + vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups 16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovups 20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ + vmovups 24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vfmaddps %xmm11,%xmm2,%xmm0,%xmm11 ;\ + vmovups 28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ + vfmaddps %xmm14,%xmm2,%xmm0,%xmm14 ;\ + addq $8, BI ;\ + addq $64, %rax ;\ + +#define KERNEL16x2_SUB(xx) \ + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ + vmovups -24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vfmaddps %xmm11,%xmm2,%xmm0,%xmm11 ;\ + vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ + vfmaddps %xmm14,%xmm2,%xmm0,%xmm14 ;\ + + +/*******************************************************************************************/ + +#define KERNEL8x2_1(xx) \ + prefetcht0 A_PR1(AO,%rax,SIZE) ;\ + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ + +#define KERNEL8x2_2(xx) \ + vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ + +#define KERNEL8x2_3(xx) \ + prefetcht0 A_PR1+64(AO,%rax,SIZE) ;\ + vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 
;\ + vmovups -12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ + +#define KERNEL8x2_4(xx) \ + vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ + addq $8, BI ;\ + addq $32, %rax ;\ + +#define KERNEL8x2_SUB(xx) \ + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ + + +/*******************************************************************************************/ + +#define KERNEL4x2_1(xx) \ + prefetcht0 A_PR1(AO,%rax,SIZE) ;\ + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + +#define KERNEL4x2_2(xx) \ + vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + +#define KERNEL4x2_3(xx) \ + vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + +#define KERNEL4x2_4(xx) \ + vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + addq $8, BI ;\ + addq $16, %rax ;\ + +#define KERNEL4x2_SUB(xx) \ + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + +/*******************************************************************************************/ + +#define KERNEL2x2_1(xx) \ + vmovss -4 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovss -3 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovss -31 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ + vfmaddss %xmm10,%xmm2,%xmm0,%xmm10 ;\ + +#define KERNEL2x2_2(xx) \ + vmovss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -30 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovss -1 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovss -29 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ + vfmaddss %xmm10,%xmm2,%xmm0,%xmm10 ;\ + +#define KERNEL2x2_3(xx) \ + vmovss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovss 1 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovss -27 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ + vfmaddss %xmm10,%xmm2,%xmm0,%xmm10 ;\ + +#define KERNEL2x2_4(xx) \ + vmovss 2 * 
SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -26 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovss 3 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovss -25 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ + vfmaddss %xmm10,%xmm2,%xmm0,%xmm10 ;\ + addq $8, BI ;\ + addq $8, %rax ;\ + +#define KERNEL2x2_SUB(xx) \ + vmovss -4 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovss -3 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovss -31 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ + vfmaddss %xmm10,%xmm2,%xmm0,%xmm10 ;\ + +/*******************************************************************************************/ + +#define KERNEL1x2_1(xx) \ + vmovss -4 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovss -3 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ + +#define KERNEL1x2_2(xx) \ + vmovss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -31 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovss -1 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ + +#define KERNEL1x2_3(xx) \ + vmovss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -30 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovss 1 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ + +#define KERNEL1x2_4(xx) \ + vmovss 2 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -29 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovss 3 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ + addq $8, BI ;\ + addq $4, %rax ;\ + +#define KERNEL1x2_SUB(xx) \ + vmovss -4 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovss -3 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ + +/*******************************************************************************************/ + +/******************************************************************************************* +* 1 line of N +*******************************************************************************************/ + +#define KERNEL16x1_1(xx) \ + prefetcht0 A_PR1(AO,%rax,SIZE) ;\ + vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vmovups -24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ + +#define KERNEL16x1_2(xx) \ + prefetcht0 A_PR1+64(AO,%rax,SIZE) ;\ + vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovups -12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ + +#define KERNEL16x1_3(xx) \ + prefetcht0 A_PR1+128(AO,%rax,SIZE) ;\ + vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups 0 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovups 4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vmovups 8 * 
SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vmovups 12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ + +#define KERNEL16x1_4(xx) \ + prefetcht0 A_PR1+192(AO,%rax,SIZE) ;\ + vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups 16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovups 20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vmovups 24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vmovups 28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ + addq $4, BI ;\ + addq $64, %rax ;\ + +#define KERNEL16x1_SUB(xx) \ + vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vmovups -24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ + + +/*******************************************************************************************/ + +#define KERNEL8x1_1(xx) \ + prefetcht0 A_PR1(AO,%rax,SIZE) ;\ + vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + +#define KERNEL8x1_2(xx) \ + vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + +#define KERNEL8x1_3(xx) \ + prefetcht0 A_PR1+64(AO,%rax,SIZE) ;\ + vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovups -12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + +#define KERNEL8x1_4(xx) \ + vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + addq $4, BI ;\ + addq $32, %rax ;\ + +#define KERNEL8x1_SUB(xx) \ + vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + + +/*******************************************************************************************/ + +#define KERNEL4x1_1(xx) \ + prefetcht0 A_PR1(AO,%rax,SIZE) ;\ + vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + +#define KERNEL4x1_2(xx) \ + vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + +#define KERNEL4x1_3(xx) \ + vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + +#define KERNEL4x1_4(xx) \ + vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + addq $4, BI ;\ + addq $16, %rax ;\ + +#define KERNEL4x1_SUB(xx) \ + vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + 
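+/*******************************************************************************************
+* The KERNELmxn_* macros above all follow one pattern: broadcast n values from the
+* packed B panel, load an m-wide strip of the packed A panel, and accumulate the
+* corresponding C registers with FMA4. A minimal C model of one KERNEL4x1_SUB step
+* (hypothetical names; the real macros index the offset-biased AO/BO buffers with
+* negative displacements):
+*
+*   static inline void kernel4x1_sub_model(const float *a, const float *b,
+*                                          float c4[4])
+*   {
+*       float b0 = b[0];               // vbroadcastss -> %xmm1
+*       for (int i = 0; i < 4; i++)    // vmovups a -> %xmm0, then vfmaddps
+*           c4[i] += a[i] * b0;        // %xmm4 = %xmm0 * %xmm1 + %xmm4
+*   }
+*
+* The _1.._4 variants unroll four consecutive K steps; only _4 advances BI and
+* %rax, and its addq on %rax supplies the flags for the je exits in the compute
+* loops, since the vector FMAs leave RFLAGS untouched.
+*******************************************************************************************/
+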
+/*******************************************************************************************/ + +#define KERNEL2x1_1(xx) \ + vmovss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovss -31 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ + +#define KERNEL2x1_2(xx) \ + vmovss -1 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -30 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovss -29 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ + +#define KERNEL2x1_3(xx) \ + vmovss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovss -27 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ + +#define KERNEL2x1_4(xx) \ + vmovss 1 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -26 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovss -25 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ + addq $4, BI ;\ + addq $8, %rax ;\ + +#define KERNEL2x1_SUB(xx) \ + vmovss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovss -31 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ + +/*******************************************************************************************/ + +#define KERNEL1x1_1(xx) \ + vmovss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + +#define KERNEL1x1_2(xx) \ + vmovss -1 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -31 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + +#define KERNEL1x1_3(xx) \ + vmovss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -30 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + +#define KERNEL1x1_4(xx) \ + vmovss 1 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -29 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + addq $4, BI ;\ + addq $4, %rax ;\ + +#define KERNEL1x1_SUB(xx) \ + vmovss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + +/*******************************************************************************************/ + +#if !defined(TRMMKERNEL) + + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + movq %rbx, (%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + + vzeroupper + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, OLD_K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC + + vmovaps %xmm3, %xmm0 + +#else + movq STACKSIZE + 8(%rsp), LDC + +#endif + + movq %rsp, SP # save old stack + subq $128 + L_BUFFER_SIZE, %rsp + andq $-4096, %rsp # align stack + + STACK_TOUCH + + cmpq $0, OLD_M + je .L999 + + cmpq $0, OLD_N + je .L999 + + cmpq $0, OLD_K + je .L999 + + movq OLD_M, M + movq OLD_N, N + movq OLD_K, K + + vmovsd %xmm0, ALPHA + + salq $BASE_SHIFT, LDC + + movq N, %rax + xorq %rdx, %rdx + movq $6, %rdi + divq %rdi // N / 6 + movq %rax, Ndiv6 // N / 6 + movq %rdx, Nmod6 // N % 6 + + + movq Ndiv6, J 
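+	// Layout of the N loop: N is consumed in slices of 6 columns
+	// (J = Ndiv6 full slices, Nmod6 left over). Each slice is repacked
+	// into two 3-column panels -- BUFFER1 (columns 0-2) and BUFFER2
+	// (columns 3-5) -- and the .L6_xx and .L7_xx code below sweeps M
+	// against each panel in turn; the remaining Nmod6 columns fall
+	// through to the narrower-N paths.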
+ cmpq $0, J + je .L2_0 + ALIGN_4 + +.L6_01: + // copy to sub buffer + movq K, %rax + salq $1,%rax // K * 2 ; read 2 values + movq B, BO1 + leaq (B,%rax, SIZE), BO2 // next offset to BO2 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + sarq $3 , %rax // K / 8 + jz .L6_01a_2 + ALIGN_4 + +.L6_01a_1: + + prefetcht0 512(BO1) + prefetcht0 512(BO2) + prefetchw 512(BO) + + vmovsd 0 * SIZE(BO1), %xmm0 + vmovsd 2 * SIZE(BO1), %xmm2 + vmovsd 4 * SIZE(BO1), %xmm4 + vmovsd 6 * SIZE(BO1), %xmm6 + vmovss 0 * SIZE(BO2), %xmm1 + vmovss 2 * SIZE(BO2), %xmm3 + vmovss 4 * SIZE(BO2), %xmm5 + vmovss 6 * SIZE(BO2), %xmm7 + vmovsd %xmm0, 0*SIZE(BO) + vmovss %xmm1, 2*SIZE(BO) + vmovsd %xmm2, 3*SIZE(BO) + vmovss %xmm3, 5*SIZE(BO) + vmovsd %xmm4, 6*SIZE(BO) + vmovss %xmm5, 8*SIZE(BO) + vmovsd %xmm6, 9*SIZE(BO) + vmovss %xmm7,11*SIZE(BO) + addq $8*SIZE,BO1 + addq $8*SIZE,BO2 + addq $12*SIZE,BO + + vmovsd 0 * SIZE(BO1), %xmm0 + vmovsd 2 * SIZE(BO1), %xmm2 + vmovsd 4 * SIZE(BO1), %xmm4 + vmovsd 6 * SIZE(BO1), %xmm6 + vmovss 0 * SIZE(BO2), %xmm1 + vmovss 2 * SIZE(BO2), %xmm3 + vmovss 4 * SIZE(BO2), %xmm5 + vmovss 6 * SIZE(BO2), %xmm7 + vmovsd %xmm0, 0*SIZE(BO) + vmovss %xmm1, 2*SIZE(BO) + vmovsd %xmm2, 3*SIZE(BO) + vmovss %xmm3, 5*SIZE(BO) + vmovsd %xmm4, 6*SIZE(BO) + vmovss %xmm5, 8*SIZE(BO) + vmovsd %xmm6, 9*SIZE(BO) + vmovss %xmm7,11*SIZE(BO) + addq $8*SIZE,BO1 + addq $8*SIZE,BO2 + addq $12*SIZE,BO + + decq %rax + jnz .L6_01a_1 + + + +.L6_01a_2: + + movq K, %rax + andq $7, %rax // K % 8 + jz .L6_02c + ALIGN_4 + + +.L6_02b: + + vmovsd 0 * SIZE(BO1), %xmm0 + vmovss 0 * SIZE(BO2), %xmm2 + vmovsd %xmm0, 0*SIZE(BO) + vmovss %xmm2, 2*SIZE(BO) + addq $2*SIZE,BO1 + addq $2*SIZE,BO2 + addq $3*SIZE,BO + decq %rax + jnz .L6_02b + +.L6_02c: + + movq K, %rax + salq $1,%rax // K * 2 + leaq (B,%rax, SIZE), BO1 // next offset to BO1 + leaq (BO1,%rax, SIZE), BO2 // next offset to BO2 + leaq BUFFER2, BO // second buffer to BO + movq K, %rax + sarq $3 , %rax // K / 8 + jz .L6_02c_2 + ALIGN_4 + +.L6_02c_1: + + prefetcht0 512(BO2) + prefetchw 512(BO) + + vmovsd 0 * SIZE(BO2), %xmm0 + vmovsd 2 * SIZE(BO2), %xmm2 + vmovsd 4 * SIZE(BO2), %xmm4 + vmovsd 6 * SIZE(BO2), %xmm6 + vmovss 1 * SIZE(BO1), %xmm1 + vmovss 3 * SIZE(BO1), %xmm3 + vmovss 5 * SIZE(BO1), %xmm5 + vmovss 7 * SIZE(BO1), %xmm7 + vmovss %xmm1, 0*SIZE(BO) + vmovsd %xmm0, 1*SIZE(BO) + vmovss %xmm3, 3*SIZE(BO) + vmovsd %xmm2, 4*SIZE(BO) + vmovss %xmm5, 6*SIZE(BO) + vmovsd %xmm4, 7*SIZE(BO) + vmovss %xmm7, 9*SIZE(BO) + vmovsd %xmm6,10*SIZE(BO) + addq $8*SIZE,BO1 + addq $8*SIZE,BO2 + addq $12*SIZE,BO + + + vmovsd 0 * SIZE(BO2), %xmm0 + vmovsd 2 * SIZE(BO2), %xmm2 + vmovsd 4 * SIZE(BO2), %xmm4 + vmovsd 6 * SIZE(BO2), %xmm6 + vmovss 1 * SIZE(BO1), %xmm1 + vmovss 3 * SIZE(BO1), %xmm3 + vmovss 5 * SIZE(BO1), %xmm5 + vmovss 7 * SIZE(BO1), %xmm7 + vmovss %xmm1, 0*SIZE(BO) + vmovsd %xmm0, 1*SIZE(BO) + vmovss %xmm3, 3*SIZE(BO) + vmovsd %xmm2, 4*SIZE(BO) + vmovss %xmm5, 6*SIZE(BO) + vmovsd %xmm4, 7*SIZE(BO) + vmovss %xmm7, 9*SIZE(BO) + vmovsd %xmm6,10*SIZE(BO) + addq $8*SIZE,BO1 + addq $8*SIZE,BO2 + addq $12*SIZE,BO + + decq %rax + jnz .L6_02c_1 + + +.L6_02c_2: + + movq K, %rax + andq $7, %rax // K % 8 + jz .L6_03c + ALIGN_4 + +.L6_03b: + + vmovss 1*SIZE(BO1), %xmm0 + vmovsd 0*SIZE(BO2), %xmm1 + vmovss %xmm0, 0*SIZE(BO) + vmovsd %xmm1, 1*SIZE(BO) + addq $2*SIZE,BO1 + addq $2*SIZE,BO2 + addq $3*SIZE,BO + decq %rax + jnz .L6_03b + + +.L6_03c: + + movq BO2, B // next offset of B + +.L6_10: + movq C, CO1 + leaq (C, LDC, 2), C + leaq (C, LDC, 1), C // c += 3 * ldc + + + movq A, AO // 
aoffset = a + addq $32 * SIZE, AO + + movq M, I + sarq $4, I // i = (m >> 4) + je .L6_20 + + ALIGN_4 + +.L6_11: + leaq BUFFER1, BO // first buffer to BO + addq $6 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax // K = K - ( K % 8 ) + je .L6_16 + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L6_12: + + prefetcht0 B_PR1(BO,BI, SIZE) + KERNEL16x3_1(xxx) + KERNEL16x3_2(xxx) + KERNEL16x3_3(xxx) + KERNEL16x3_4(xxx) + + KERNEL16x3_1(xxx) + prefetcht0 B_PR1+16(BO,BI, SIZE) + KERNEL16x3_2(xxx) + KERNEL16x3_3(xxx) + KERNEL16x3_4(xxx) + + je .L6_16 + + KERNEL16x3_1(xxx) + KERNEL16x3_2(xxx) + prefetcht0 B_PR1+32(BO,BI, SIZE) + KERNEL16x3_3(xxx) + KERNEL16x3_4(xxx) + + KERNEL16x3_1(xxx) + KERNEL16x3_2(xxx) + KERNEL16x3_3(xxx) + KERNEL16x3_4(xxx) + + je .L6_16 + + jmp .L6_12 + ALIGN_4 + +.L6_16: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L6_19 + + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L6_17: + + KERNEL16x3_SUB(xxx) + addq $3, BI + addq $16, %rax + jl .L6_17 + ALIGN_4 + + +.L6_19: + + vbroadcastss ALPHA, %xmm0 + + vfmaddps (CO1),%xmm0, %xmm4,%xmm4 + vfmaddps 4 * SIZE(CO1),%xmm0, %xmm7,%xmm7 + vfmaddps 8 * SIZE(CO1),%xmm0, %xmm10,%xmm10 + vfmaddps 12 * SIZE(CO1),%xmm0, %xmm13,%xmm13 + + vfmaddps (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddps 4 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 + vfmaddps 8 * SIZE(CO1, LDC),%xmm0, %xmm11,%xmm11 + vfmaddps 12 * SIZE(CO1, LDC),%xmm0, %xmm14,%xmm14 + + vfmaddps (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 + vfmaddps 4 * SIZE(CO1, LDC, 2),%xmm0, %xmm9,%xmm9 + vfmaddps 8 * SIZE(CO1, LDC, 2),%xmm0, %xmm12,%xmm12 + vfmaddps 12 * SIZE(CO1, LDC, 2),%xmm0, %xmm15,%xmm15 + + + vmovups %xmm4 , (CO1) + vmovups %xmm7 , 4 * SIZE(CO1) + vmovups %xmm10, 8 * SIZE(CO1) + vmovups %xmm13,12 * SIZE(CO1) + + vmovups %xmm5 , (CO1, LDC) + vmovups %xmm8 , 4 * SIZE(CO1, LDC) + vmovups %xmm11, 8 * SIZE(CO1, LDC) + vmovups %xmm14,12 * SIZE(CO1, LDC) + + vmovups %xmm6 , (CO1, LDC, 2) + vmovups %xmm9 , 4 * SIZE(CO1, LDC, 2) + vmovups %xmm12, 8 * SIZE(CO1, LDC, 2) + vmovups %xmm15,12 * SIZE(CO1, LDC, 2) + + + addq $16 * SIZE, CO1 # coffset += 16 + decq I # i -- + jg .L6_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L6_20: + // Test rest of M + + testq $15, M + jz .L7_10 // to next 3 lines of N + + testq $8, M + jz .L6_21pre + ALIGN_4 + +/**************************************************************************/ + +.L6_20_1: + leaq BUFFER1, BO // first buffer to BO + addq $6 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L6_20_6 + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L6_20_2: + + prefetcht0 B_PR1(BO,BI, SIZE) + KERNEL8x3_1(xxx) + KERNEL8x3_2(xxx) + KERNEL8x3_3(xxx) + KERNEL8x3_4(xxx) + + KERNEL8x3_1(xxx) + prefetcht0 B_PR1+16(BO,BI, SIZE) + KERNEL8x3_2(xxx) + KERNEL8x3_3(xxx) + KERNEL8x3_4(xxx) + + je .L6_20_6 + + KERNEL8x3_1(xxx) + KERNEL8x3_2(xxx) + prefetcht0 B_PR1+32(BO,BI, SIZE) + 
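+	// staggered prefetches (B_PR1, +16, +32) keep the packed B panel
+	// streaming in ahead of the FMA groups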
KERNEL8x3_3(xxx) + KERNEL8x3_4(xxx) + + KERNEL8x3_1(xxx) + KERNEL8x3_2(xxx) + KERNEL8x3_3(xxx) + KERNEL8x3_4(xxx) + + je .L6_20_6 + + jmp .L6_20_2 + ALIGN_4 + +.L6_20_6: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L6_20_9 + + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L6_20_7: + + KERNEL8x3_SUB(xxx) + addq $3, BI + addq $8, %rax + jl .L6_20_7 + ALIGN_4 + + +.L6_20_9: + + vbroadcastss ALPHA, %xmm0 + + vfmaddps (CO1),%xmm0, %xmm4,%xmm4 + vfmaddps 4 * SIZE(CO1),%xmm0, %xmm7,%xmm7 + + vfmaddps (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddps 4 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 + + vfmaddps (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 + vfmaddps 4 * SIZE(CO1, LDC, 2),%xmm0, %xmm9,%xmm9 + + + vmovups %xmm4 , (CO1) + vmovups %xmm7 , 4 * SIZE(CO1) + + vmovups %xmm5 , (CO1, LDC) + vmovups %xmm8 , 4 * SIZE(CO1, LDC) + + vmovups %xmm6 , (CO1, LDC, 2) + vmovups %xmm9 , 4 * SIZE(CO1, LDC, 2) + + + addq $8 * SIZE, CO1 # coffset += 8 + ALIGN_4 + + + +/**************************************************************************/ + +.L6_21pre: + + testq $4, M + jz .L6_30 + ALIGN_4 + +.L6_21: + leaq BUFFER1, BO // first buffer to BO + addq $6 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L6_26 + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L6_22: + + prefetcht0 B_PR1(BO,BI, SIZE) + KERNEL4x3_1(xxx) + KERNEL4x3_2(xxx) + KERNEL4x3_3(xxx) + KERNEL4x3_4(xxx) + + KERNEL4x3_1(xxx) + prefetcht0 B_PR1+16(BO,BI, SIZE) + KERNEL4x3_2(xxx) + KERNEL4x3_3(xxx) + KERNEL4x3_4(xxx) + + je .L6_26 + + KERNEL4x3_1(xxx) + KERNEL4x3_2(xxx) + prefetcht0 B_PR1+32(BO,BI, SIZE) + KERNEL4x3_3(xxx) + KERNEL4x3_4(xxx) + + KERNEL4x3_1(xxx) + KERNEL4x3_2(xxx) + KERNEL4x3_3(xxx) + KERNEL4x3_4(xxx) + + je .L6_26 + + jmp .L6_22 + ALIGN_4 + +.L6_26: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L6_29 + + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L6_27: + + KERNEL4x3_SUB(xxx) + addq $3, BI + addq $4, %rax + jl .L6_27 + ALIGN_4 + + +.L6_29: + + vbroadcastss ALPHA, %xmm0 + + vfmaddps (CO1),%xmm0, %xmm4,%xmm4 + vfmaddps (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddps (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 + + + vmovups %xmm4 , (CO1) + vmovups %xmm5 , (CO1, LDC) + vmovups %xmm6 , (CO1, LDC, 2) + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + +.L6_30: + testq $2, M + jz .L6_40 + + ALIGN_4 + +.L6_31: + leaq BUFFER1, BO // first buffer to BO + addq $6 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L6_36 + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L6_32: + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x3_1(xxx) + KERNEL2x3_2(xxx) + KERNEL2x3_3(xxx) + KERNEL2x3_4(xxx) + + KERNEL2x3_1(xxx) + prefetcht0 B_PR1+16(BO,BI,SIZE) + KERNEL2x3_2(xxx) + KERNEL2x3_3(xxx) + KERNEL2x3_4(xxx) + + je .L6_36 + + KERNEL2x3_1(xxx) + KERNEL2x3_2(xxx) + prefetcht0 B_PR1+32(BO,BI,SIZE) + KERNEL2x3_3(xxx) + KERNEL2x3_4(xxx) + + 
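+	// last group of this pass: KERNEL2x3_4 advances the negated %rax
+	// counter, and the je below leaves the unrolled loop once it
+	// reaches zero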
KERNEL2x3_1(xxx) + KERNEL2x3_2(xxx) + KERNEL2x3_3(xxx) + KERNEL2x3_4(xxx) + + je .L6_36 + + jmp .L6_32 + ALIGN_4 + +.L6_36: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L6_39 + + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L6_37: + + KERNEL2x3_SUB(xxx) + addq $3, BI + addq $2, %rax + jl .L6_37 + ALIGN_4 + + +.L6_39: + + vmovss ALPHA, %xmm0 + + vfmaddss (CO1),%xmm0, %xmm4,%xmm4 + vfmaddss 1 * SIZE(CO1),%xmm0, %xmm8,%xmm8 + vfmaddss (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddss 1 * SIZE(CO1, LDC),%xmm0, %xmm10,%xmm10 + vfmaddss (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 + vfmaddss 1 * SIZE(CO1, LDC, 2),%xmm0, %xmm12,%xmm12 + + + vmovss %xmm4 , (CO1) + vmovss %xmm8 , 1 * SIZE(CO1) + vmovss %xmm5 , (CO1, LDC) + vmovss %xmm10, 1 * SIZE(CO1, LDC) + vmovss %xmm6 , (CO1, LDC, 2) + vmovss %xmm12, 1 * SIZE(CO1, LDC, 2) + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + +.L6_40: + testq $1, M + jz .L7_10 // to next 3 lines of N + + ALIGN_4 + +.L6_41: + leaq BUFFER1, BO // first buffer to BO + addq $6 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L6_46 + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L6_42: + + KERNEL1x3_1(xxx) + KERNEL1x3_2(xxx) + KERNEL1x3_3(xxx) + KERNEL1x3_4(xxx) + + KERNEL1x3_1(xxx) + KERNEL1x3_2(xxx) + KERNEL1x3_3(xxx) + KERNEL1x3_4(xxx) + + je .L6_46 + + KERNEL1x3_1(xxx) + KERNEL1x3_2(xxx) + KERNEL1x3_3(xxx) + KERNEL1x3_4(xxx) + + KERNEL1x3_1(xxx) + KERNEL1x3_2(xxx) + KERNEL1x3_3(xxx) + KERNEL1x3_4(xxx) + + je .L6_46 + + jmp .L6_42 + ALIGN_4 + +.L6_46: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L6_49 + + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L6_47: + + KERNEL1x3_SUB(xxx) + addq $3, BI + addq $1, %rax + jl .L6_47 + ALIGN_4 + + +.L6_49: + + vmovss ALPHA, %xmm0 + + vfmaddss (CO1),%xmm0, %xmm4,%xmm4 + vfmaddss (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddss (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 + + + vmovss %xmm4 , (CO1) + vmovss %xmm5 , (CO1, LDC) + vmovss %xmm6 , (CO1, LDC, 2) + + addq $1 * SIZE, CO1 # coffset += 1 + ALIGN_4 + + + + +/***************************************************************************************************************/ + +.L7_10: + movq C, CO1 + leaq (C, LDC, 2), C + leaq (C, LDC, 1), C // c += 3 * ldc + + + movq A, AO // aoffset = a + addq $32 * SIZE, AO + + movq M, I + sarq $4, I // i = (m >> 4) + je .L7_20 + + ALIGN_4 + +.L7_11: + leaq BUFFER2, BO // second buffer to BO + addq $6 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax // K = K - ( K % 8 ) + je .L7_16 + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L7_12: + + prefetcht0 B_PR1(BO,BI, SIZE) + KERNEL16x3_1(xxx) + KERNEL16x3_2(xxx) + KERNEL16x3_3(xxx) + KERNEL16x3_4(xxx) + + KERNEL16x3_1(xxx) + prefetcht0 B_PR1+16(BO,BI, SIZE) + KERNEL16x3_2(xxx) + KERNEL16x3_3(xxx) + KERNEL16x3_4(xxx) + + je .L7_16 + + KERNEL16x3_1(xxx) + KERNEL16x3_2(xxx) + prefetcht0 B_PR1+32(BO,BI, SIZE) + KERNEL16x3_3(xxx) + KERNEL16x3_4(xxx) + + KERNEL16x3_1(xxx) + 
KERNEL16x3_2(xxx) + KERNEL16x3_3(xxx) + KERNEL16x3_4(xxx) + + je .L7_16 + + jmp .L7_12 + ALIGN_4 + +.L7_16: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L7_19 + + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L7_17: + + KERNEL16x3_SUB(xxx) + addq $3, BI + addq $16, %rax + jl .L7_17 + ALIGN_4 + + +.L7_19: + + vbroadcastss ALPHA, %xmm0 + + vfmaddps (CO1),%xmm0, %xmm4,%xmm4 + vfmaddps 4 * SIZE(CO1),%xmm0, %xmm7,%xmm7 + vfmaddps 8 * SIZE(CO1),%xmm0, %xmm10,%xmm10 + vfmaddps 12 * SIZE(CO1),%xmm0, %xmm13,%xmm13 + + vfmaddps (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddps 4 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 + vfmaddps 8 * SIZE(CO1, LDC),%xmm0, %xmm11,%xmm11 + vfmaddps 12 * SIZE(CO1, LDC),%xmm0, %xmm14,%xmm14 + + vfmaddps (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 + vfmaddps 4 * SIZE(CO1, LDC, 2),%xmm0, %xmm9,%xmm9 + vfmaddps 8 * SIZE(CO1, LDC, 2),%xmm0, %xmm12,%xmm12 + vfmaddps 12 * SIZE(CO1, LDC, 2),%xmm0, %xmm15,%xmm15 + + + vmovups %xmm4 , (CO1) + vmovups %xmm7 , 4 * SIZE(CO1) + vmovups %xmm10, 8 * SIZE(CO1) + vmovups %xmm13,12 * SIZE(CO1) + + vmovups %xmm5 , (CO1, LDC) + vmovups %xmm8 , 4 * SIZE(CO1, LDC) + vmovups %xmm11, 8 * SIZE(CO1, LDC) + vmovups %xmm14,12 * SIZE(CO1, LDC) + + vmovups %xmm6 , (CO1, LDC, 2) + vmovups %xmm9 , 4 * SIZE(CO1, LDC, 2) + vmovups %xmm12, 8 * SIZE(CO1, LDC, 2) + vmovups %xmm15,12 * SIZE(CO1, LDC, 2) + + + addq $16 * SIZE, CO1 # coffset += 16 + decq I # i -- + jg .L7_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L7_20: + // Test rest of M + + testq $15, M + jz .L7_60 // to next 3 lines of N + + testq $8, M + jz .L7_21pre + ALIGN_4 + +/**************************************************************************/ + +.L7_20_1: + leaq BUFFER2, BO // first buffer to BO + addq $6 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L7_20_6 + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L7_20_2: + + prefetcht0 B_PR1(BO,BI, SIZE) + KERNEL8x3_1(xxx) + KERNEL8x3_2(xxx) + KERNEL8x3_3(xxx) + KERNEL8x3_4(xxx) + + KERNEL8x3_1(xxx) + prefetcht0 B_PR1+16(BO,BI, SIZE) + KERNEL8x3_2(xxx) + KERNEL8x3_3(xxx) + KERNEL8x3_4(xxx) + + je .L7_20_6 + + KERNEL8x3_1(xxx) + KERNEL8x3_2(xxx) + prefetcht0 B_PR1+32(BO,BI, SIZE) + KERNEL8x3_3(xxx) + KERNEL8x3_4(xxx) + + KERNEL8x3_1(xxx) + KERNEL8x3_2(xxx) + KERNEL8x3_3(xxx) + KERNEL8x3_4(xxx) + + je .L7_20_6 + + jmp .L7_20_2 + ALIGN_4 + +.L7_20_6: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L7_20_9 + + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L7_20_7: + + KERNEL8x3_SUB(xxx) + addq $3, BI + addq $8, %rax + jl .L7_20_7 + ALIGN_4 + +.L7_20_9: + + + vbroadcastss ALPHA, %xmm0 + + vfmaddps (CO1),%xmm0, %xmm4,%xmm4 + vfmaddps 4 * SIZE(CO1),%xmm0, %xmm7,%xmm7 + + vfmaddps (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddps 4 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 + + vfmaddps (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 + vfmaddps 4 * SIZE(CO1, LDC, 2),%xmm0, %xmm9,%xmm9 + + vmovups 
%xmm4 , (CO1) + vmovups %xmm7 , 4 * SIZE(CO1) + + vmovups %xmm5 , (CO1, LDC) + vmovups %xmm8 , 4 * SIZE(CO1, LDC) + + vmovups %xmm6 , (CO1, LDC, 2) + vmovups %xmm9 , 4 * SIZE(CO1, LDC, 2) + + + addq $8 * SIZE, CO1 # coffset += 8 + ALIGN_4 + + + +/**************************************************************************/ + +.L7_21pre: + + testq $4, M + jz .L7_30 + ALIGN_4 + +.L7_21: + leaq BUFFER2, BO // second buffer to BO + addq $6 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L7_26 + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L7_22: + + prefetcht0 B_PR1(BO,BI, SIZE) + KERNEL4x3_1(xxx) + KERNEL4x3_2(xxx) + KERNEL4x3_3(xxx) + KERNEL4x3_4(xxx) + + KERNEL4x3_1(xxx) + prefetcht0 B_PR1+16(BO,BI, SIZE) + KERNEL4x3_2(xxx) + KERNEL4x3_3(xxx) + KERNEL4x3_4(xxx) + + je .L7_26 + + KERNEL4x3_1(xxx) + KERNEL4x3_2(xxx) + prefetcht0 B_PR1+32(BO,BI, SIZE) + KERNEL4x3_3(xxx) + KERNEL4x3_4(xxx) + + KERNEL4x3_1(xxx) + KERNEL4x3_2(xxx) + KERNEL4x3_3(xxx) + KERNEL4x3_4(xxx) + + je .L7_26 + + jmp .L7_22 + ALIGN_4 + +.L7_26: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L7_29 + + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L7_27: + + KERNEL4x3_SUB(xxx) + addq $3, BI + addq $4, %rax + jl .L7_27 + ALIGN_4 + + +.L7_29: + + vbroadcastss ALPHA, %xmm0 + + vfmaddps (CO1),%xmm0, %xmm4,%xmm4 + vfmaddps (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddps (CO1, LDC, 2),%xmm0, %xmm6 ,%xmm6 + + vmovups %xmm4 , (CO1) + vmovups %xmm5 , (CO1, LDC) + vmovups %xmm6 , (CO1, LDC, 2) + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + +.L7_30: + testq $2, M + jz .L7_40 + + ALIGN_4 + +.L7_31: + leaq BUFFER2, BO // second buffer to BO + addq $6 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L7_36 + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L7_32: + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x3_1(xxx) + KERNEL2x3_2(xxx) + KERNEL2x3_3(xxx) + KERNEL2x3_4(xxx) + + KERNEL2x3_1(xxx) + prefetcht0 B_PR1+16(BO,BI,SIZE) + KERNEL2x3_2(xxx) + KERNEL2x3_3(xxx) + KERNEL2x3_4(xxx) + + je .L7_36 + + KERNEL2x3_1(xxx) + KERNEL2x3_2(xxx) + prefetcht0 B_PR1+32(BO,BI,SIZE) + KERNEL2x3_3(xxx) + KERNEL2x3_4(xxx) + + KERNEL2x3_1(xxx) + KERNEL2x3_2(xxx) + KERNEL2x3_3(xxx) + KERNEL2x3_4(xxx) + + je .L7_36 + + jmp .L7_32 + ALIGN_4 + +.L7_36: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L7_39 + + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L7_37: + + KERNEL2x3_SUB(xxx) + addq $3, BI + addq $2, %rax + jl .L7_37 + ALIGN_4 + + +.L7_39: + + vmovss ALPHA, %xmm0 + + vfmaddss (CO1),%xmm0, %xmm4,%xmm4 + vfmaddss 1 * SIZE(CO1),%xmm0, %xmm8,%xmm8 + vfmaddss (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddss 1 * SIZE(CO1, LDC),%xmm0, %xmm10,%xmm10 + vfmaddss (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 + vfmaddss 1 * SIZE(CO1, LDC, 2),%xmm0, %xmm12,%xmm12 + + vmovss %xmm4 , (CO1) + vmovss %xmm8 , 1 * SIZE(CO1) + vmovss %xmm5 , 
(CO1, LDC) + vmovss %xmm10, 1 * SIZE(CO1, LDC) + vmovss %xmm6 , (CO1, LDC, 2) + vmovss %xmm12, 1 * SIZE(CO1, LDC, 2) + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + +.L7_40: + testq $1, M + jz .L7_60 // to next 3 lines of N + + ALIGN_4 + +.L7_41: + leaq BUFFER2, BO // second buffer to BO + addq $6 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L7_46 + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L7_42: + + KERNEL1x3_1(xxx) + KERNEL1x3_2(xxx) + KERNEL1x3_3(xxx) + KERNEL1x3_4(xxx) + + KERNEL1x3_1(xxx) + KERNEL1x3_2(xxx) + KERNEL1x3_3(xxx) + KERNEL1x3_4(xxx) + + je .L7_46 + + KERNEL1x3_1(xxx) + KERNEL1x3_2(xxx) + KERNEL1x3_3(xxx) + KERNEL1x3_4(xxx) + + KERNEL1x3_1(xxx) + KERNEL1x3_2(xxx) + KERNEL1x3_3(xxx) + KERNEL1x3_4(xxx) + + je .L7_46 + + jmp .L7_42 + ALIGN_4 + +.L7_46: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L7_49 + + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L7_47: + + KERNEL1x3_SUB(xxx) + addq $3, BI + addq $1, %rax + jl .L7_47 + ALIGN_4 + + +.L7_49: + + vmovss ALPHA, %xmm0 + + vfmaddss (CO1),%xmm0, %xmm4,%xmm4 + vfmaddss (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddss (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 + + vmovss %xmm4 , (CO1) + vmovss %xmm5 , (CO1, LDC) + vmovss %xmm6 , (CO1, LDC, 2) + + addq $1 * SIZE, CO1 # coffset += 1 + ALIGN_4 + + + +.L7_60: + + decq J // j -- + jg .L6_01 + + +.L2_0: + cmpq $0, Nmod6 // N % 6 == 0 + je .L999 + +/************************************************************************************************ +* Loop for Nmod6 / 2 > 0 +*************************************************************************************************/ + + movq Nmod6, J + sarq $1, J // j = j / 2 + je .L1_0 + ALIGN_4 + +.L2_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + ALIGN_4 + +.L2_02b: + + vmovsd (BO1), %xmm0 + vmovsd %xmm0, (BO) + addq $2*SIZE,BO1 + addq $2*SIZE,BO + decq %rax + jnz .L2_02b + +.L2_02c: + + movq BO1, B // next offset of B + +.L2_10: + movq C, CO1 + leaq (C, LDC, 2), C // c += 2 * ldc + + + movq A, AO // aoffset = a + addq $32 * SIZE, AO + + movq M, I + sarq $4, I // i = (m >> 4) + je .L2_20 + + ALIGN_4 + +.L2_11: + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax // K = K - ( K % 8 ) + je .L2_16 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_12: + + prefetcht0 B_PR1(BO,BI, SIZE) + KERNEL16x2_1(xxx) + KERNEL16x2_2(xxx) + KERNEL16x2_3(xxx) + KERNEL16x2_4(xxx) + + KERNEL16x2_1(xxx) + KERNEL16x2_2(xxx) + KERNEL16x2_3(xxx) + KERNEL16x2_4(xxx) + + je .L2_16 + + prefetcht0 B_PR1(BO,BI, SIZE) + KERNEL16x2_1(xxx) + KERNEL16x2_2(xxx) + KERNEL16x2_3(xxx) + KERNEL16x2_4(xxx) + + KERNEL16x2_1(xxx) + KERNEL16x2_2(xxx) + KERNEL16x2_3(xxx) + KERNEL16x2_4(xxx) + + je .L2_16 + + jmp .L2_12 + ALIGN_4 + +.L2_16: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L2_19 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + 
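
[Annotation, not part of the patch.] The sub-loop entered at .L2_17 below is the K-remainder path shared by every block size in this file: the main loop covers K & ~7 steps unrolled eight-fold, and the leftover K & 7 steps are replayed one KERNEL*_SUB at a time. In both loops AO and BO are first advanced past the whole block and the counters negated, so they climb toward zero and a single addq/jl pair both advances and terminates. A rough C model of this structure; tile_kloop, m_unroll and n_unroll are illustrative names, not part of the patch:

/* Illustrative model of the K-loop structure shared by these kernels. */
static void tile_kloop(long K, long m_unroll, long n_unroll)
{
    long kmain = K & ~7L;             /* andq $-8, %rax                  */
    long ktail = K &  7L;             /* andq $7,  %rax                  */

    /* AO/BO were advanced past the block, so the negated indices
     * climb toward zero: negq %rax ; negq BI                            */
    long rax = -(kmain * m_unroll);
    long bi  = -(kmain * n_unroll);

    while (rax < 0) {                 /* eight KERNELmxn_1..4 steps      */
        /* fused multiply-adds read AO[rax...] and BO[bi...]             */
        rax += 8 * m_unroll;          /* advanced inside the macros      */
        bi  += 8 * n_unroll;
    }

    for (rax = -(ktail * m_unroll), bi = -(ktail * n_unroll);
         rax < 0;
         rax += m_unroll, bi += n_unroll) {
        /* one KERNELmxn_SUB per leftover k:
         * addq $n_unroll, BI ; addq $m_unroll, %rax ; jl back to label  */
    }
}

The negative-index idiom saves an instruction per iteration: the addq already sets the sign flag, so jl can branch without a separate cmpq.
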
+.L2_17: + + KERNEL16x2_SUB(xxx) + addq $2, BI + addq $16, %rax + jl .L2_17 + ALIGN_4 + + +.L2_19: + + vbroadcastss ALPHA, %xmm0 + + vfmaddps (CO1),%xmm0, %xmm4,%xmm4 + vfmaddps 4 * SIZE(CO1),%xmm0, %xmm7,%xmm7 + vfmaddps 8 * SIZE(CO1),%xmm0, %xmm10,%xmm10 + vfmaddps 12 * SIZE(CO1),%xmm0, %xmm13,%xmm13 + + vfmaddps (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddps 4 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 + vfmaddps 8 * SIZE(CO1, LDC),%xmm0, %xmm11,%xmm11 + vfmaddps 12 * SIZE(CO1, LDC),%xmm0, %xmm14,%xmm14 + + vmovups %xmm4 , (CO1) + vmovups %xmm7 , 4 * SIZE(CO1) + vmovups %xmm10, 8 * SIZE(CO1) + vmovups %xmm13,12 * SIZE(CO1) + + vmovups %xmm5 , (CO1, LDC) + vmovups %xmm8 , 4 * SIZE(CO1, LDC) + vmovups %xmm11, 8 * SIZE(CO1, LDC) + vmovups %xmm14,12 * SIZE(CO1, LDC) + + addq $16 * SIZE, CO1 # coffset += 16 + decq I # i -- + jg .L2_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L2_20: + // Test rest of M + + testq $15, M + jz .L2_60 // to next 3 lines of N + + testq $8, M + jz .L2_21pre + ALIGN_4 + +/**************************************************************************/ + +.L2_20_1: + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L2_20_6 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_20_2: + + prefetcht0 B_PR1(BO,BI, SIZE) + KERNEL8x2_1(xxx) + KERNEL8x2_2(xxx) + KERNEL8x2_3(xxx) + KERNEL8x2_4(xxx) + + KERNEL8x2_1(xxx) + KERNEL8x2_2(xxx) + KERNEL8x2_3(xxx) + KERNEL8x2_4(xxx) + + je .L2_20_6 + + prefetcht0 B_PR1(BO,BI, SIZE) + KERNEL8x2_1(xxx) + KERNEL8x2_2(xxx) + KERNEL8x2_3(xxx) + KERNEL8x2_4(xxx) + + KERNEL8x2_1(xxx) + KERNEL8x2_2(xxx) + KERNEL8x2_3(xxx) + KERNEL8x2_4(xxx) + + je .L2_20_6 + + jmp .L2_20_2 + ALIGN_4 + +.L2_20_6: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L2_20_9 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_20_7: + + KERNEL8x2_SUB(xxx) + addq $2, BI + addq $8, %rax + jl .L2_20_7 + ALIGN_4 + + +.L2_20_9: + + vbroadcastss ALPHA, %xmm0 + + vfmaddps (CO1),%xmm0, %xmm4,%xmm4 + vfmaddps 4 * SIZE(CO1),%xmm0, %xmm7,%xmm7 + + vfmaddps (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddps 4 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 + + vmovups %xmm4 , (CO1) + vmovups %xmm7 , 4 * SIZE(CO1) + + vmovups %xmm5 , (CO1, LDC) + vmovups %xmm8 , 4 * SIZE(CO1, LDC) + + + addq $8 * SIZE, CO1 # coffset += 8 + ALIGN_4 + + + +/**************************************************************************/ + +.L2_21pre: + + testq $4, M + jz .L2_30 + ALIGN_4 + +.L2_21: + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L2_26 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 1 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_22: + + prefetcht0 B_PR1(BO,BI, SIZE) + KERNEL4x2_1(xxx) + KERNEL4x2_2(xxx) + KERNEL4x2_3(xxx) + KERNEL4x2_4(xxx) + + KERNEL4x2_1(xxx) + KERNEL4x2_2(xxx) + KERNEL4x2_3(xxx) + KERNEL4x2_4(xxx) + + je .L2_26 + + prefetcht0 B_PR1(BO,BI, 
SIZE) + KERNEL4x2_1(xxx) + KERNEL4x2_2(xxx) + KERNEL4x2_3(xxx) + KERNEL4x2_4(xxx) + + KERNEL4x2_1(xxx) + KERNEL4x2_2(xxx) + KERNEL4x2_3(xxx) + KERNEL4x2_4(xxx) + + je .L2_26 + + jmp .L2_22 + ALIGN_4 + +.L2_26: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L2_29 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_27: + + KERNEL4x2_SUB(xxx) + addq $2, BI + addq $4, %rax + jl .L2_27 + ALIGN_4 + + +.L2_29: + + vbroadcastss ALPHA, %xmm0 + + vfmaddps (CO1),%xmm0, %xmm4,%xmm4 + vfmaddps (CO1, LDC),%xmm0, %xmm5,%xmm5 + + vmovups %xmm4 , (CO1) + vmovups %xmm5 , (CO1, LDC) + + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + +.L2_30: + testq $2, M + jz .L2_40 + + ALIGN_4 + +.L2_31: + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L2_36 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_32: + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_1(xxx) + KERNEL2x2_2(xxx) + KERNEL2x2_3(xxx) + KERNEL2x2_4(xxx) + + KERNEL2x2_1(xxx) + KERNEL2x2_2(xxx) + KERNEL2x2_3(xxx) + KERNEL2x2_4(xxx) + + je .L2_36 + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_1(xxx) + KERNEL2x2_2(xxx) + KERNEL2x2_3(xxx) + KERNEL2x2_4(xxx) + + KERNEL2x2_1(xxx) + KERNEL2x2_2(xxx) + KERNEL2x2_3(xxx) + KERNEL2x2_4(xxx) + + je .L2_36 + + jmp .L2_32 + ALIGN_4 + +.L2_36: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L2_39 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_37: + + KERNEL2x2_SUB(xxx) + addq $2, BI + addq $2, %rax + jl .L2_37 + ALIGN_4 + + +.L2_39: + + vmovss ALPHA, %xmm0 + + vfmaddss (CO1),%xmm0, %xmm4,%xmm4 + vfmaddss 1 * SIZE(CO1),%xmm0, %xmm8,%xmm8 + vfmaddss (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddss 1 * SIZE(CO1, LDC),%xmm0, %xmm10,%xmm10 + + vmovss %xmm4 , (CO1) + vmovss %xmm8 , 1 * SIZE(CO1) + vmovss %xmm5 , (CO1, LDC) + vmovss %xmm10, 1 * SIZE(CO1, LDC) + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + +.L2_40: + testq $1, M + jz .L2_60 // to next 2 lines of N + + ALIGN_4 + +.L2_41: + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L2_46 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_42: + + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + je .L2_46 + + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + je .L2_46 + + jmp .L2_42 + ALIGN_4 + +.L2_46: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L2_49 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_47: + + KERNEL1x2_SUB(xxx) + addq $2, BI + addq $1, %rax + jl .L2_47 + ALIGN_4 + + +.L2_49: + + vmovss ALPHA, %xmm0 + + 
vfmaddss (CO1),%xmm0, %xmm4,%xmm4 + vfmaddss (CO1, LDC),%xmm0, %xmm5,%xmm5 + + vmovss %xmm4 , (CO1) + vmovss %xmm5 , (CO1, LDC) + + addq $1 * SIZE, CO1 # coffset += 1 + ALIGN_4 + +.L2_60: + + decq J // j -- + jg .L2_01 // next 2 lines of N + + + +.L1_0: + +/************************************************************************************************ +* Loop for Nmod6 % 2 > 0 +*************************************************************************************************/ + + movq Nmod6, J + andq $1, J // j % 2 + je .L999 + ALIGN_4 + +.L1_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + ALIGN_4 + +.L1_02b: + + vmovss (BO1), %xmm0 + vmovss %xmm0, (BO) + addq $1*SIZE,BO1 + addq $1*SIZE,BO + decq %rax + jnz .L1_02b + +.L1_02c: + + movq BO1, B // next offset of B + +.L1_10: + movq C, CO1 + leaq (C, LDC, 1), C // c += 1 * ldc + + + movq A, AO // aoffset = a + addq $32 * SIZE, AO + + movq M, I + sarq $4, I // i = (m >> 4) + je .L1_20 + + ALIGN_4 + +.L1_11: + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax // K = K - ( K % 8 ) + je .L1_16 + movq %rax, BI // Index for BO + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_12: + + prefetcht0 B_PR1(BO,BI, SIZE) + KERNEL16x1_1(xxx) + KERNEL16x1_2(xxx) + KERNEL16x1_3(xxx) + KERNEL16x1_4(xxx) + + KERNEL16x1_1(xxx) + KERNEL16x1_2(xxx) + KERNEL16x1_3(xxx) + KERNEL16x1_4(xxx) + + je .L1_16 + + KERNEL16x1_1(xxx) + KERNEL16x1_2(xxx) + KERNEL16x1_3(xxx) + KERNEL16x1_4(xxx) + + KERNEL16x1_1(xxx) + KERNEL16x1_2(xxx) + KERNEL16x1_3(xxx) + KERNEL16x1_4(xxx) + + je .L1_16 + + jmp .L1_12 + ALIGN_4 + +.L1_16: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L1_19 + + movq %rax, BI // Index for BO + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_17: + + KERNEL16x1_SUB(xxx) + addq $1, BI + addq $16, %rax + jl .L1_17 + ALIGN_4 + + +.L1_19: + + vbroadcastss ALPHA, %xmm0 + + vfmaddps (CO1),%xmm0, %xmm4,%xmm4 + vfmaddps 4 * SIZE(CO1),%xmm0, %xmm7,%xmm7 + vfmaddps 8 * SIZE(CO1),%xmm0, %xmm10,%xmm10 + vfmaddps 12 * SIZE(CO1),%xmm0, %xmm13,%xmm13 + + vmovups %xmm4 , (CO1) + vmovups %xmm7 , 4 * SIZE(CO1) + vmovups %xmm10, 8 * SIZE(CO1) + vmovups %xmm13,12 * SIZE(CO1) + + addq $16 * SIZE, CO1 # coffset += 16 + decq I # i -- + jg .L1_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L1_20: + // Test rest of M + + testq $15, M + jz .L999 + + testq $8, M + jz .L1_21pre + ALIGN_4 + +/**************************************************************************/ + +.L1_20_1: + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L1_20_6 + movq %rax, BI // Index for BO + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_20_2: + + prefetcht0 B_PR1(BO,BI, SIZE) + KERNEL8x1_1(xxx) + KERNEL8x1_2(xxx) + KERNEL8x1_3(xxx) + KERNEL8x1_4(xxx) + + KERNEL8x1_1(xxx) + KERNEL8x1_2(xxx) + KERNEL8x1_3(xxx) + KERNEL8x1_4(xxx) + + je .L1_20_6 + + KERNEL8x1_1(xxx) + KERNEL8x1_2(xxx) + KERNEL8x1_3(xxx) + KERNEL8x1_4(xxx) + + KERNEL8x1_1(xxx) + KERNEL8x1_2(xxx) + KERNEL8x1_3(xxx) + KERNEL8x1_4(xxx) + + je 
.L1_20_6 + + jmp .L1_20_2 + ALIGN_4 + +.L1_20_6: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L1_20_9 + + movq %rax, BI // Index for BO + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_20_7: + + KERNEL8x1_SUB(xxx) + addq $1, BI + addq $8, %rax + jl .L1_20_7 + ALIGN_4 + + +.L1_20_9: + + vbroadcastss ALPHA, %xmm0 + + vfmaddps (CO1),%xmm0, %xmm4,%xmm4 + vfmaddps 4 * SIZE(CO1),%xmm0, %xmm7,%xmm7 + + vmovups %xmm4 , (CO1) + vmovups %xmm7 , 4 * SIZE(CO1) + + addq $8 * SIZE, CO1 # coffset += 8 + ALIGN_4 + + + +/**************************************************************************/ + +.L1_21pre: + + testq $4, M + jz .L1_30 + ALIGN_4 + +.L1_21: + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L1_26 + movq %rax, BI // Index for BO + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_22: + + prefetcht0 B_PR1(BO,BI, SIZE) + KERNEL4x1_1(xxx) + KERNEL4x1_2(xxx) + KERNEL4x1_3(xxx) + KERNEL4x1_4(xxx) + + KERNEL4x1_1(xxx) + KERNEL4x1_2(xxx) + KERNEL4x1_3(xxx) + KERNEL4x1_4(xxx) + + je .L1_26 + + KERNEL4x1_1(xxx) + KERNEL4x1_2(xxx) + KERNEL4x1_3(xxx) + KERNEL4x1_4(xxx) + + KERNEL4x1_1(xxx) + KERNEL4x1_2(xxx) + KERNEL4x1_3(xxx) + KERNEL4x1_4(xxx) + + je .L1_26 + + jmp .L1_22 + ALIGN_4 + +.L1_26: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L1_29 + + movq %rax, BI // Index for BO + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_27: + + KERNEL4x1_SUB(xxx) + addq $1, BI + addq $4, %rax + jl .L1_27 + ALIGN_4 + + +.L1_29: + + vbroadcastss ALPHA, %xmm0 + + vfmaddps (CO1),%xmm0, %xmm4,%xmm4 + + vmovups %xmm4 , (CO1) + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + +.L1_30: + testq $2, M + jz .L1_40 + + ALIGN_4 + +.L1_31: + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L1_36 + movq %rax, BI // Index for BO + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_32: + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + je .L1_36 + + KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + je .L1_36 + + jmp .L1_32 + ALIGN_4 + +.L1_36: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L1_39 + + movq %rax, BI // Index for BO + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_37: + + KERNEL2x1_SUB(xxx) + addq $1, BI + addq $2, %rax + jl .L1_37 + ALIGN_4 + + +.L1_39: + + vmovss ALPHA, %xmm0 + + vfmaddss (CO1),%xmm0, %xmm4,%xmm4 + vfmaddss 1 * SIZE(CO1),%xmm0, %xmm8,%xmm8 + + vmovss %xmm4 , (CO1) + vmovss %xmm8 , 1 * SIZE(CO1) + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + +.L1_40: + testq $1, M + jz .L999 + + ALIGN_4 + +.L1_41: + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L1_46 + movq %rax, BI // Index for BO + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + 
ALIGN_4 + +.L1_42: + + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + je .L1_46 + + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + je .L1_46 + + jmp .L1_42 + ALIGN_4 + +.L1_46: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L1_49 + + movq %rax, BI // Index for BO + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_47: + + KERNEL1x1_SUB(xxx) + addq $1, BI + addq $1, %rax + jl .L1_47 + ALIGN_4 + + +.L1_49: + + vmovss ALPHA, %xmm0 + + vfmaddss (CO1),%xmm0, %xmm4,%xmm4 + + vmovss %xmm4 , (CO1) + + addq $1 * SIZE, CO1 # coffset += 1 + ALIGN_4 + + +.L999: + movq SP, %rsp + movq (%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE + + +#else +/************************************************************************************* +* TRMM Kernel +*************************************************************************************/ + + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + movq %rbx, (%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + + vzeroupper + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, OLD_K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC +#ifdef TRMMKERNEL + movsd OLD_OFFSET, %xmm12 +#endif + vmovaps %xmm3, %xmm0 + +#else + movq STACKSIZE + 8(%rsp), LDC +#ifdef TRMMKERNEL + movsd STACKSIZE + 16(%rsp), %xmm12 +#endif + +#endif + + movq %rsp, SP # save old stack + subq $128 + L_BUFFER_SIZE, %rsp + andq $-4096, %rsp # align stack + + STACK_TOUCH + + cmpq $0, OLD_M + je .L999 + + cmpq $0, OLD_N + je .L999 + + cmpq $0, OLD_K + je .L999 + + movq OLD_M, M + movq OLD_N, N + movq OLD_K, K + + vmovsd %xmm0, ALPHA + + salq $BASE_SHIFT, LDC + + movq N, %rax + xorq %rdx, %rdx + movq $2, %rdi + divq %rdi // N / 6 + movq %rax, Ndiv6 // N / 6 + movq %rdx, Nmod6 // N % 6 + + + +#ifdef TRMMKERNEL + vmovsd %xmm12, OFFSET + vmovsd %xmm12, KK +#ifndef LEFT + negq KK +#endif +#endif + + movq Ndiv6, J + cmpq $0, J + je .L1_0 + ALIGN_4 + +.L2_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + ALIGN_4 + +.L2_02b: + + vmovsd (BO1), %xmm0 + vmovsd %xmm0, (BO) + addq $2*SIZE,BO1 + addq $2*SIZE,BO + decq %rax + jnz .L2_02b + +.L2_02c: + + movq BO1, B // next offset of B + +.L2_10: + movq C, CO1 + leaq (C, LDC, 2), C // c += 2 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq A, AO // aoffset = a + addq $32 * SIZE, AO + + movq M, I + sarq $4, I // i = (m >> 4) + je .L2_20 + + ALIGN_4 + 
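
[Annotation, not part of the patch.] In this TRMM prologue the Ndiv6/Nmod6 names are inherited from the six-column GEMM path earlier in the file, but the divisor loaded into %rdi is 2, so Ndiv6 actually holds N / 2 and Nmod6 holds N % 2 (the "// N / 6" comments are stale). In outline, as illustrative C rather than anything in the patch:

/* Column blocking performed from .L2_01 onward (sketch only). */
long ndiv = N / 2;        /* stored as Ndiv6; name kept from the
                             six-column variant, divisor here is 2      */
long nmod = N % 2;        /* stored as Nmod6                            */

for (long j = ndiv; j > 0; j--) {
    /* .L2_02b: pack two columns of B into BUFFER1, then run the
     * 16x2 / 8x2 / 4x2 / 2x2 / 1x2 kernels across M                    */
}
if (nmod) {
    /* .L1_02b: pack one column of B, run the 16x1 ... 1x1 kernels      */
}
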
+.L2_11: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $16, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax // K = K - ( K % 8 ) + je .L2_16 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_12: + + prefetcht0 B_PR1(BO,BI, SIZE) + KERNEL16x2_1(xxx) + KERNEL16x2_2(xxx) + KERNEL16x2_3(xxx) + KERNEL16x2_4(xxx) + + KERNEL16x2_1(xxx) + KERNEL16x2_2(xxx) + KERNEL16x2_3(xxx) + KERNEL16x2_4(xxx) + + je .L2_16 + + prefetcht0 B_PR1(BO,BI, SIZE) + KERNEL16x2_1(xxx) + KERNEL16x2_2(xxx) + KERNEL16x2_3(xxx) + KERNEL16x2_4(xxx) + + KERNEL16x2_1(xxx) + KERNEL16x2_2(xxx) + KERNEL16x2_3(xxx) + KERNEL16x2_4(xxx) + + je .L2_16 + + jmp .L2_12 + ALIGN_4 + +.L2_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_19 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_17: + + KERNEL16x2_SUB(xxx) + addq $2, BI + addq $16, %rax + jl .L2_17 + ALIGN_4 + + +.L2_19: + + vbroadcastss ALPHA, %xmm0 + +#ifndef TRMMKERNEL + + vfmaddps (CO1),%xmm0, %xmm4,%xmm4 + vfmaddps 4 * SIZE(CO1),%xmm0, %xmm7,%xmm7 + vfmaddps 8 * SIZE(CO1),%xmm0, %xmm10,%xmm10 + vfmaddps 12 * SIZE(CO1),%xmm0, %xmm13,%xmm13 + + vfmaddps (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddps 4 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 + vfmaddps 8 * SIZE(CO1, LDC),%xmm0, %xmm11,%xmm11 + vfmaddps 12 * SIZE(CO1, LDC),%xmm0, %xmm14,%xmm14 + +#else + vmulps %xmm0, %xmm4,%xmm4 + vmulps %xmm0, %xmm7,%xmm7 + vmulps %xmm0, %xmm10,%xmm10 + vmulps %xmm0, %xmm13,%xmm13 + + vmulps %xmm0, %xmm5,%xmm5 + vmulps %xmm0, %xmm8,%xmm8 + vmulps %xmm0, %xmm11,%xmm11 + vmulps %xmm0, %xmm14,%xmm14 + +#endif + + vmovups %xmm4 , (CO1) + vmovups %xmm7 , 4 * SIZE(CO1) + vmovups %xmm10, 8 * SIZE(CO1) + vmovups %xmm13,12 * SIZE(CO1) + + vmovups %xmm5 , (CO1, LDC) + vmovups %xmm8 , 4 * SIZE(CO1, LDC) + vmovups %xmm11, 8 * SIZE(CO1, LDC) + vmovups %xmm14,12 * SIZE(CO1, LDC) + + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $16, KK +#endif + + addq $16 * SIZE, CO1 # coffset += 16 + decq I # i -- + jg .L2_11 + ALIGN_4 + 
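
[Annotation, not part of the patch.] The KK/KKK bookkeeping in the block above decides how much of the K range a triangular tile actually touches: for (LEFT && TRANSA) or (!LEFT && !TRANSA) the tile starts KK values in and runs to K, otherwise it runs for KK values plus one unroll width. Reduced to illustrative C (effective_k and its arguments are not part of the patch; left/transa mirror the LEFT and TRANSA build flags, m_unroll/n_unroll the $16/$2 increments of this 16x2 block):

/* Sketch of the effective inner length selected for a TRMM tile.
 * For non-TRMM builds the answer is simply K.                          */
static long effective_k(long K, long KK, long m_unroll, long n_unroll,
                        int left, int transa)
{
    if (left != transa)            /* (LEFT && !TRANSA)||(!LEFT && TRANSA) */
        return K - KK;             /* subq KK, %rax ; movq %rax, KKK      */
    return KK + (left ? m_unroll   /* addq $16, %rax: values in AO        */
                      : n_unroll); /* addq $2,  %rax: values in BO        */
}

The matching #if blocks after each store then re-advance AO and BO by K - KKK values, and LEFT builds bump KK by the M-unroll so the next row panel starts deeper into the triangle.
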
+/************************************************************************** +* Rest of M +***************************************************************************/ +.L2_20: + // Test rest of M + + testq $15, M + jz .L2_60 // to next 3 lines of N + + testq $8, M + jz .L2_21pre + ALIGN_4 + +/**************************************************************************/ + +.L2_20_1: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $8, %rax // number of values in A +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L2_20_6 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_20_2: + + prefetcht0 B_PR1(BO,BI, SIZE) + KERNEL8x2_1(xxx) + KERNEL8x2_2(xxx) + KERNEL8x2_3(xxx) + KERNEL8x2_4(xxx) + + KERNEL8x2_1(xxx) + KERNEL8x2_2(xxx) + KERNEL8x2_3(xxx) + KERNEL8x2_4(xxx) + + je .L2_20_6 + + prefetcht0 B_PR1(BO,BI, SIZE) + KERNEL8x2_1(xxx) + KERNEL8x2_2(xxx) + KERNEL8x2_3(xxx) + KERNEL8x2_4(xxx) + + KERNEL8x2_1(xxx) + KERNEL8x2_2(xxx) + KERNEL8x2_3(xxx) + KERNEL8x2_4(xxx) + + je .L2_20_6 + + jmp .L2_20_2 + ALIGN_4 + +.L2_20_6: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_20_9 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_20_7: + + KERNEL8x2_SUB(xxx) + addq $2, BI + addq $8, %rax + jl .L2_20_7 + ALIGN_4 + + +.L2_20_9: + + vbroadcastss ALPHA, %xmm0 + +#ifndef TRMMKERNEL + + vfmaddps (CO1),%xmm0, %xmm4,%xmm4 + vfmaddps 4 * SIZE(CO1),%xmm0, %xmm7,%xmm7 + + vfmaddps (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddps 4 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 + +#else + vmulps %xmm0, %xmm4,%xmm4 + vmulps %xmm0, %xmm7,%xmm7 + + vmulps %xmm0, %xmm5,%xmm5 + vmulps %xmm0, %xmm8,%xmm8 + +#endif + + vmovups %xmm4 , (CO1) + vmovups %xmm7 , 4 * SIZE(CO1) + + vmovups %xmm5 , (CO1, LDC) + vmovups %xmm8 , 4 * SIZE(CO1, LDC) + + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $8, KK +#endif + + addq $8 * SIZE, CO1 # coffset += 8 + ALIGN_4 + + + +/**************************************************************************/ + +.L2_21pre: + + testq $4, M + jz .L2_30 + ALIGN_4 + +.L2_21: +#if !defined(TRMMKERNEL) || \ + 
(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax // number of values in A +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L2_26 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 1 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_22: + + prefetcht0 B_PR1(BO,BI, SIZE) + KERNEL4x2_1(xxx) + KERNEL4x2_2(xxx) + KERNEL4x2_3(xxx) + KERNEL4x2_4(xxx) + + KERNEL4x2_1(xxx) + KERNEL4x2_2(xxx) + KERNEL4x2_3(xxx) + KERNEL4x2_4(xxx) + + je .L2_26 + + prefetcht0 B_PR1(BO,BI, SIZE) + KERNEL4x2_1(xxx) + KERNEL4x2_2(xxx) + KERNEL4x2_3(xxx) + KERNEL4x2_4(xxx) + + KERNEL4x2_1(xxx) + KERNEL4x2_2(xxx) + KERNEL4x2_3(xxx) + KERNEL4x2_4(xxx) + + je .L2_26 + + jmp .L2_22 + ALIGN_4 + +.L2_26: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_29 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_27: + + KERNEL4x2_SUB(xxx) + addq $2, BI + addq $4, %rax + jl .L2_27 + ALIGN_4 + + +.L2_29: + + vbroadcastss ALPHA, %xmm0 + +#ifndef TRMMKERNEL + + vfmaddps (CO1),%xmm0, %xmm4,%xmm4 + vfmaddps (CO1, LDC),%xmm0, %xmm5,%xmm5 + +#else + vmulps %xmm0, %xmm4,%xmm4 + vmulps %xmm0, %xmm5,%xmm5 + +#endif + + vmovups %xmm4 , (CO1) + vmovups %xmm5 , (CO1, LDC) + + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + +.L2_30: + testq $2, M + jz .L2_40 + + ALIGN_4 + +.L2_31: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax // number of values in AO +#else + addq $2, 
%rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L2_36 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_32: + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_1(xxx) + KERNEL2x2_2(xxx) + KERNEL2x2_3(xxx) + KERNEL2x2_4(xxx) + + KERNEL2x2_1(xxx) + KERNEL2x2_2(xxx) + KERNEL2x2_3(xxx) + KERNEL2x2_4(xxx) + + je .L2_36 + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_1(xxx) + KERNEL2x2_2(xxx) + KERNEL2x2_3(xxx) + KERNEL2x2_4(xxx) + + KERNEL2x2_1(xxx) + KERNEL2x2_2(xxx) + KERNEL2x2_3(xxx) + KERNEL2x2_4(xxx) + + je .L2_36 + + jmp .L2_32 + ALIGN_4 + +.L2_36: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_39 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_37: + + KERNEL2x2_SUB(xxx) + addq $2, BI + addq $2, %rax + jl .L2_37 + ALIGN_4 + + +.L2_39: + + vmovss ALPHA, %xmm0 + +#ifndef TRMMKERNEL + + vfmaddss (CO1),%xmm0, %xmm4,%xmm4 + vfmaddss 1 * SIZE(CO1),%xmm0, %xmm8,%xmm8 + vfmaddss (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddss 1 * SIZE(CO1, LDC),%xmm0, %xmm10,%xmm10 + +#else + vmulss %xmm0, %xmm4,%xmm4 + vmulss %xmm0, %xmm8,%xmm8 + vmulss %xmm0, %xmm5,%xmm5 + vmulss %xmm0, %xmm10,%xmm10 + +#endif + + vmovss %xmm4 , (CO1) + vmovss %xmm8 , 1 * SIZE(CO1) + vmovss %xmm5 , (CO1, LDC) + vmovss %xmm10, 1 * SIZE(CO1, LDC) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + +.L2_40: + testq $1, M + jz .L2_60 // to next 2 lines of N + + ALIGN_4 + +.L2_41: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax + je .L2_46 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_42: + + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + je .L2_46 + + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + 
KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + je .L2_46 + + jmp .L2_42 + ALIGN_4 + +.L2_46: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_49 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_47: + + KERNEL1x2_SUB(xxx) + addq $2, BI + addq $1, %rax + jl .L2_47 + ALIGN_4 + + +.L2_49: + + vmovss ALPHA, %xmm0 + +#ifndef TRMMKERNEL + + vfmaddss (CO1),%xmm0, %xmm4,%xmm4 + vfmaddss (CO1, LDC),%xmm0, %xmm5,%xmm5 + +#else + vmulss %xmm0, %xmm4,%xmm4 + vmulss %xmm0, %xmm5,%xmm5 + +#endif + + vmovss %xmm4 , (CO1) + vmovss %xmm5 , (CO1, LDC) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + + addq $1 * SIZE, CO1 # coffset += 1 + ALIGN_4 + + + + + +.L2_60: +#if defined(TRMMKERNEL) && !defined(LEFT) + addq $2, KK +#endif + + decq J // j -- + jg .L2_01 // next 2 lines of N + + + +.L1_0: + +/************************************************************************************************ +* Loop for Nmod6 % 2 > 0 +*************************************************************************************************/ + + movq Nmod6, J + andq $1, J // j % 2 + je .L999 + ALIGN_4 + +.L1_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + ALIGN_4 + +.L1_02b: + + vmovss (BO1), %xmm0 + vmovss %xmm0, (BO) + addq $1*SIZE,BO1 + addq $1*SIZE,BO + decq %rax + jnz .L1_02b + +.L1_02c: + + movq BO1, B // next offset of B + +.L1_10: + movq C, CO1 + leaq (C, LDC, 1), C // c += 1 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq A, AO // aoffset = a + addq $32 * SIZE, AO + + movq M, I + sarq $4, I // i = (m >> 4) + je .L1_20 + + ALIGN_4 + +.L1_11: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $16, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax // K = K - ( K % 8 ) + je .L1_16 + movq %rax, BI // Index for BO + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_12: + + prefetcht0 B_PR1(BO,BI, SIZE) + KERNEL16x1_1(xxx) + KERNEL16x1_2(xxx) + KERNEL16x1_3(xxx) + KERNEL16x1_4(xxx) + + KERNEL16x1_1(xxx) + KERNEL16x1_2(xxx) + KERNEL16x1_3(xxx) + KERNEL16x1_4(xxx) + + je .L1_16 + + KERNEL16x1_1(xxx) + KERNEL16x1_2(xxx) + KERNEL16x1_3(xxx) + KERNEL16x1_4(xxx) + + KERNEL16x1_1(xxx) + KERNEL16x1_2(xxx) + KERNEL16x1_3(xxx) + 
KERNEL16x1_4(xxx) + + je .L1_16 + + jmp .L1_12 + ALIGN_4 + +.L1_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_19 + + movq %rax, BI // Index for BO + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_17: + + KERNEL16x1_SUB(xxx) + addq $1, BI + addq $16, %rax + jl .L1_17 + ALIGN_4 + + +.L1_19: + + vbroadcastss ALPHA, %xmm0 + +#ifndef TRMMKERNEL + + vfmaddps (CO1),%xmm0, %xmm4,%xmm4 + vfmaddps 4 * SIZE(CO1),%xmm0, %xmm7,%xmm7 + vfmaddps 8 * SIZE(CO1),%xmm0, %xmm10,%xmm10 + vfmaddps 12 * SIZE(CO1),%xmm0, %xmm13,%xmm13 + +#else + vmulps %xmm0, %xmm4,%xmm4 + vmulps %xmm0, %xmm7,%xmm7 + vmulps %xmm0, %xmm10,%xmm10 + vmulps %xmm0, %xmm13,%xmm13 + +#endif + + vmovups %xmm4 , (CO1) + vmovups %xmm7 , 4 * SIZE(CO1) + vmovups %xmm10, 8 * SIZE(CO1) + vmovups %xmm13,12 * SIZE(CO1) + + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $16, KK +#endif + + addq $16 * SIZE, CO1 # coffset += 16 + decq I # i -- + jg .L1_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L1_20: + // Test rest of M + + testq $15, M + jz .L999 + + testq $8, M + jz .L1_21pre + ALIGN_4 + +/**************************************************************************/ + +.L1_20_1: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $8, %rax // number of values in A +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L1_20_6 + movq %rax, BI // Index for BO + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_20_2: + + prefetcht0 B_PR1(BO,BI, SIZE) + KERNEL8x1_1(xxx) + KERNEL8x1_2(xxx) + KERNEL8x1_3(xxx) + KERNEL8x1_4(xxx) + + KERNEL8x1_1(xxx) + KERNEL8x1_2(xxx) + KERNEL8x1_3(xxx) + KERNEL8x1_4(xxx) + + je .L1_20_6 + + KERNEL8x1_1(xxx) + KERNEL8x1_2(xxx) + KERNEL8x1_3(xxx) + KERNEL8x1_4(xxx) + + KERNEL8x1_1(xxx) + KERNEL8x1_2(xxx) + KERNEL8x1_3(xxx) + KERNEL8x1_4(xxx) + + je .L1_20_6 + + jmp .L1_20_2 + ALIGN_4 + +.L1_20_6: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_20_9 + + movq %rax, BI // Index for BO + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_20_7: + + KERNEL8x1_SUB(xxx) + addq $1, BI + addq $8, %rax + jl .L1_20_7 
+ ALIGN_4 + + +.L1_20_9: + + vbroadcastss ALPHA, %xmm0 + +#ifndef TRMMKERNEL + + vfmaddps (CO1),%xmm0, %xmm4,%xmm4 + vfmaddps 4 * SIZE(CO1),%xmm0, %xmm7,%xmm7 + +#else + vmulps %xmm0, %xmm4,%xmm4 + vmulps %xmm0, %xmm7,%xmm7 + +#endif + + vmovups %xmm4 , (CO1) + vmovups %xmm7 , 4 * SIZE(CO1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $8, KK +#endif + + addq $8 * SIZE, CO1 # coffset += 8 + ALIGN_4 + + + +/**************************************************************************/ + +.L1_21pre: + + testq $4, M + jz .L1_30 + ALIGN_4 + +.L1_21: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax // number of values in A +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L1_26 + movq %rax, BI // Index for BO + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_22: + + prefetcht0 B_PR1(BO,BI, SIZE) + KERNEL4x1_1(xxx) + KERNEL4x1_2(xxx) + KERNEL4x1_3(xxx) + KERNEL4x1_4(xxx) + + KERNEL4x1_1(xxx) + KERNEL4x1_2(xxx) + KERNEL4x1_3(xxx) + KERNEL4x1_4(xxx) + + je .L1_26 + + KERNEL4x1_1(xxx) + KERNEL4x1_2(xxx) + KERNEL4x1_3(xxx) + KERNEL4x1_4(xxx) + + KERNEL4x1_1(xxx) + KERNEL4x1_2(xxx) + KERNEL4x1_3(xxx) + KERNEL4x1_4(xxx) + + je .L1_26 + + jmp .L1_22 + ALIGN_4 + +.L1_26: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_29 + + movq %rax, BI // Index for BO + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_27: + + KERNEL4x1_SUB(xxx) + addq $1, BI + addq $4, %rax + jl .L1_27 + ALIGN_4 + + +.L1_29: + + vbroadcastss ALPHA, %xmm0 + +#ifndef TRMMKERNEL + + vfmaddps (CO1),%xmm0, %xmm4,%xmm4 + +#else + vmulps %xmm0, %xmm4,%xmm4 + +#endif + + vmovups %xmm4 , (CO1) + + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + +.L1_30: + testq $2, M + jz .L1_40 + + ALIGN_4 + +.L1_31: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO +#else + movq 
KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L1_36 + movq %rax, BI // Index for BO + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_32: + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + je .L1_36 + + KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + je .L1_36 + + jmp .L1_32 + ALIGN_4 + +.L1_36: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_39 + + movq %rax, BI // Index for BO + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_37: + + KERNEL2x1_SUB(xxx) + addq $1, BI + addq $2, %rax + jl .L1_37 + ALIGN_4 + + +.L1_39: + + vmovss ALPHA, %xmm0 + +#ifndef TRMMKERNEL + + vfmaddss (CO1),%xmm0, %xmm4,%xmm4 + vfmaddss 1 * SIZE(CO1),%xmm0, %xmm8,%xmm8 + +#else + vmulss %xmm0, %xmm4,%xmm4 + vmulss %xmm0, %xmm8,%xmm8 + +#endif + + vmovss %xmm4 , (CO1) + vmovss %xmm8 , 1 * SIZE(CO1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + +.L1_40: + testq $1, M + jz .L999 + + ALIGN_4 + +.L1_41: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax + je .L1_46 + movq %rax, BI // Index for BO + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_42: + + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + je .L1_46 + + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + je .L1_46 + + 
jmp .L1_42 + ALIGN_4 + +.L1_46: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_49 + + movq %rax, BI // Index for BO + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_47: + + KERNEL1x1_SUB(xxx) + addq $1, BI + addq $1, %rax + jl .L1_47 + ALIGN_4 + + +.L1_49: + + vmovss ALPHA, %xmm0 + +#ifndef TRMMKERNEL + + vfmaddss (CO1),%xmm0, %xmm4,%xmm4 + +#else + vmulss %xmm0, %xmm4,%xmm4 + +#endif + + vmovss %xmm4 , (CO1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + + addq $1 * SIZE, CO1 # coffset += 1 + ALIGN_4 + + +.L999: + movq SP, %rsp + movq (%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE + + + + + +#endif diff --git a/param.h b/param.h index adfe867f8..a833085d3 100644 --- a/param.h +++ b/param.h @@ -358,7 +358,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define XGEMM_DEFAULT_P 56 #if defined(ARCH_X86_64) -#define SGEMM_DEFAULT_Q 168 +#define SGEMM_DEFAULT_Q 192 #define DGEMM_DEFAULT_Q 168 #else #define SGEMM_DEFAULT_Q 224 @@ -369,7 +369,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define ZGEMM_DEFAULT_Q 224 #define XGEMM_DEFAULT_Q 224 -#define SGEMM_DEFAULT_R sgemm_r +#define SGEMM_DEFAULT_R 12288 #define QGEMM_DEFAULT_R qgemm_r #define DGEMM_DEFAULT_R 12288 #define CGEMM_DEFAULT_R cgemm_r From 1cf4b974b2dd271f4014550190a9e05e5bca229c Mon Sep 17 00:00:00 2001 From: wernsaar Date: Wed, 30 Oct 2013 09:12:17 +0100 Subject: [PATCH 04/15] added zgemm_kernel for Piledriver --- kernel/x86_64/KERNEL.PILEDRIVER | 2 +- kernel/x86_64/zgemm_kernel_2x2_piledriver.S | 1428 +++++++++++++++++++ 2 files changed, 1429 insertions(+), 1 deletion(-) create mode 100644 kernel/x86_64/zgemm_kernel_2x2_piledriver.S diff --git a/kernel/x86_64/KERNEL.PILEDRIVER b/kernel/x86_64/KERNEL.PILEDRIVER index 2bc0ae709..5a5b42b64 100644 --- a/kernel/x86_64/KERNEL.PILEDRIVER +++ b/kernel/x86_64/KERNEL.PILEDRIVER @@ -36,7 +36,7 @@ CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) -ZGEMMKERNEL = zgemm_kernel_2x2_bulldozer.S +ZGEMMKERNEL = zgemm_kernel_2x2_piledriver.S ZGEMMINCOPY = ZGEMMITCOPY = ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c diff --git a/kernel/x86_64/zgemm_kernel_2x2_piledriver.S b/kernel/x86_64/zgemm_kernel_2x2_piledriver.S new file mode 100644 index 000000000..9f1392d78 --- /dev/null +++ b/kernel/x86_64/zgemm_kernel_2x2_piledriver.S @@ -0,0 +1,1428 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. 
+Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/********************************************************************* +* +* 2013/10/30 Saar +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* +* +* 2013/10/30 Saar +* +* Parameter: +* UNROLL_M 2 +* UNROLL_N 2 +* ZGEMM_P 384 +* ZGEMM_Q 168 +* A_PR1 512 +* B_PR1 256 +* +* Performance at m x n on AMD 8320 (ACML-Version: 5.3.1): +* +* 3456x3456 82.4 GFLOPS with 8 threads on 4 modules (ACML: 76.3 ) (BULLDOZER: 81.0 ) +* 3456x3456 79.9 GFLOPS with 4 threads on 4 modules (ACML: 69.9 ) (BULLDOZER: 74.6 ) +* 3456x3456 40.4 GFLOPS with 2 threads on 2 modules (ACML: 35.8 ) (BULLDOZER: 37.9 ) +* 3456x3456 20.3 GFLOPS with 1 threads on 1 modules (ACML: 18.1 ) (BULLDOZER: 19.2 ) +* +* Performance at m x n on AMD 6380 (ACML-Version: 5.3.1): +* +* 6912x6912 227.5 GFLOPS with 32 threads on 16 modules (ACML: 166.3 ) (BULLDOZER: 228.5 ) +* 6912x6912 211.6 GFLOPS with 16 threads on 16 modules (ACML: 169.5 ) (BULLDOZER: 204.3 ) +* 6912x6912 123.5 GFLOPS with 8 threads on 8 modules (ACML: 92.7 ) (BULLDOZER: 117.0 ) +* 3456x3456 64.1 GFLOPS with 4 threads on 4 modules (ACML: 49.1 ) (BULLDOZER: 61.7 ) +* 3456x3456 33.4 GFLOPS with 2 threads on 2 modules (ACML: 28.1 ) (BULLDOZER: 30.9 ) +* 3456x3456 17.0 GFLOPS with 1 threads on 1 modules (ACML: 15.2 ) (BULLDOZER: 15.7 ) +* +*********************************************************************/ + + +#define ASSEMBLER +#include "common.h" + +#define OLD_M %rdi +#define OLD_N %rsi +#define M %r13 +#define J %r14 +#define OLD_K %rdx + +#define A %rcx +#define B %r8 +#define C %r9 +#define LDC %r10 + +#define I %r11 +#define AO %rdi +#define BO %rsi +#define CO1 %r15 +#define K %r12 +#define BI %rbp +#define SP %rbx + +#define BO1 %rdi +#define BO2 %r15 + +#ifndef WINDOWS_ABI + +#define STACKSIZE 96 + +#else + +#define STACKSIZE 320 + +#define OLD_ALPHA_I 40 + STACKSIZE(%rsp) +#define OLD_A 48 + STACKSIZE(%rsp) +#define OLD_B 56 + STACKSIZE(%rsp) +#define OLD_C 64 + STACKSIZE(%rsp) +#define OLD_LDC 72 + STACKSIZE(%rsp) +#define OLD_OFFSET 80 + STACKSIZE(%rsp) + +#endif + 
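
[Annotation, not part of the patch.] The macro definitions that follow include the VFMADD_R/VFMADD_I pair, which resolves to vfmaddpd or vfnmaddpd depending on the conjugation case encoded in the NN/NT/.../RR build suffixes. Each k-step loads a = [ar, ai] with vmovups, broadcasts br and bi with vmovddup, and accumulates the two partial products into separate registers; only the sign of each accumulation varies by case. A scalar C model of one such update (sketch only; acc_update and the sign arguments are illustrative, with sign_r/sign_i standing for the vfmaddpd/vfnmaddpd choice):

/* Reference model of what one VFMADD_R / VFMADD_I pair accumulates. */
static void acc_update(double acc_r[2], double acc_i[2],
                       const double a[2],     /* [ar, ai] from vmovups  */
                       double br, double bi,  /* vmovddup broadcasts    */
                       double sign_r, double sign_i)
{
    for (int t = 0; t < 2; t++) {
        acc_r[t] += sign_r * br * a[t];  /* VFMADD_R: vfmaddpd/vfnmaddpd */
        acc_i[t] += sign_i * bi * a[t];  /* VFMADD_I: vfmaddpd/vfnmaddpd */
    }
}

Keeping the real-broadcast and imaginary-broadcast products in separate accumulators defers the shuffle and add/subtract that form the final complex result until the store phase, which keeps the inner loop free of cross-lane operations.
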
+#define L_BUFFER_SIZE 512*8*4 +#define LB2_OFFSET 512*8*2 + +#define Ndiv6 24(%rsp) +#define Nmod6 32(%rsp) +#define N 40(%rsp) +#define ALPHA_R 48(%rsp) +#define ALPHA_I 56(%rsp) +#define OFFSET 64(%rsp) +#define KK 72(%rsp) +#define KKK 80(%rsp) +#define BUFFER1 128(%rsp) +#define BUFFER2 LB2_OFFSET+128(%rsp) + +#if defined(OS_WINDOWS) +#if L_BUFFER_SIZE > 16384 +#define STACK_TOUCH \ + movl $0, 4096 * 4(%rsp);\ + movl $0, 4096 * 3(%rsp);\ + movl $0, 4096 * 2(%rsp);\ + movl $0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 12288 +#define STACK_TOUCH \ + movl $0, 4096 * 3(%rsp);\ + movl $0, 4096 * 2(%rsp);\ + movl $0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 8192 +#define STACK_TOUCH \ + movl $0, 4096 * 2(%rsp);\ + movl $0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 4096 +#define STACK_TOUCH \ + movl $0, 4096 * 1(%rsp); +#else +#define STACK_TOUCH +#endif +#else +#define STACK_TOUCH +#endif + + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) +#define VFMADD_R vfmaddpd +#define VFMADD_I vfmaddpd +#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) +#define VFMADD_R vfnmaddpd +#define VFMADD_I vfmaddpd +#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) +#define VFMADD_R vfmaddpd +#define VFMADD_I vfnmaddpd +#else +#define VFMADD_R vfnmaddpd +#define VFMADD_I vfnmaddpd +#endif + + +#define A_PR1 512 +#define B_PR1 256 + +#define KERNEL2x2_1(xx) \ + prefetcht0 A_PR1(AO,%rax,SIZE) ;\ + vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup -8 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovups -6 * SIZE(AO, %rax, SIZE), %xmm1 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vmovddup -7 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ + vmovddup -6 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\ + vmovddup -5 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\ + +#define KERNEL2x2_2(xx) \ + vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovups -2 * SIZE(AO, %rax, SIZE), %xmm1 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vmovddup -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ + vmovddup -2 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\ + vmovddup -1 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\ + +#define KERNEL2x2_3(xx) \ + prefetcht0 A_PR1+64(AO,%rax,SIZE) ;\ + vmovups 0 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup 0 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovups 2 * SIZE(AO, %rax, SIZE), %xmm1 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vmovddup 1 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ + vmovddup 2 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\ + vmovddup 3 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\ + +#define KERNEL2x2_4(xx) \ + vmovups 4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup 4 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovups 6 * SIZE(AO, %rax, SIZE), %xmm1 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vmovddup 5 * SIZE(BO, BI, 
SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ + vmovddup 6 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\ + vmovddup 7 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\ + addq $16, BI ;\ + addq $16, %rax ;\ + + +#define KERNEL2x2_SUB(xx) \ + vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup -8 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovups -6 * SIZE(AO, %rax, SIZE), %xmm1 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vmovddup -7 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ + vmovddup -6 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\ + vmovddup -5 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\ + addq $4, BI ;\ + addq $4, %rax ;\ + +/************************************************************************************************/ + +#define KERNEL1x2_1(xx) \ + prefetcht0 A_PR1(AO,%rax,SIZE) ;\ + vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup -8 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovddup -7 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + vmovddup -6 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + vmovddup -5 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + +#define KERNEL1x2_2(xx) \ + vmovups -6 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vmovddup -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + vmovddup -2 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + vmovddup -1 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + +#define KERNEL1x2_3(xx) \ + vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup 0 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovddup 1 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + vmovddup 2 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + vmovddup 3 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + +#define KERNEL1x2_4(xx) \ + vmovups -2 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup 4 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovddup 5 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + vmovddup 6 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + vmovddup 7 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + addq $16, BI ;\ + addq $8 , %rax ;\ + + +#define KERNEL1x2_SUB(xx) \ + vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup -8 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovddup -7 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + vmovddup -6 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + vmovddup -5 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + addq $4, BI ;\ + addq $2, %rax ;\ + +/************************************************************************************************/ + +#define KERNEL2x1_1(xx) \ + prefetcht0 A_PR1(AO,%rax,SIZE) ;\ + vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + 
vmovddup -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovups -6 * SIZE(AO, %rax, SIZE), %xmm1 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vmovddup -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ + +#define KERNEL2x1_2(xx) \ + vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup -2 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovups -2 * SIZE(AO, %rax, SIZE), %xmm1 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vmovddup -1 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ + +#define KERNEL2x1_3(xx) \ + prefetcht0 A_PR1+64(AO,%rax,SIZE) ;\ + vmovups 0 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup 0 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovups 2 * SIZE(AO, %rax, SIZE), %xmm1 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vmovddup 1 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ + +#define KERNEL2x1_4(xx) \ + vmovups 4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup 2 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovups 6 * SIZE(AO, %rax, SIZE), %xmm1 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vmovddup 3 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ + addq $8, BI ;\ + addq $16, %rax ;\ + + +#define KERNEL2x1_SUB(xx) \ + vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovups -6 * SIZE(AO, %rax, SIZE), %xmm1 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vmovddup -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ + addq $2, BI ;\ + addq $4, %rax ;\ + + +/************************************************************************************************/ + +#define KERNEL1x1_1(xx) \ + prefetcht0 A_PR1(AO,%rax,SIZE) ;\ + vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovddup -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + +#define KERNEL1x1_2(xx) \ + vmovups -6 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup -2 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovddup -1 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + +#define KERNEL1x1_3(xx) \ + vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup 0 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovddup 1 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + +#define KERNEL1x1_4(xx) \ + vmovups -2 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup 2 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovddup 3 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + addq $8, BI ;\ + addq $8, %rax ;\ + + +#define KERNEL1x1_SUB(xx) \ + vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovddup -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + addq $2, BI ;\ + addq $2, %rax ;\ + + +/************************************************************************************************/ + + + + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + movq %rbx, (%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + 
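Taken together, the KERNEL*_SUB macros above implement one k-step of a complex rank-1 update in split form: for each accumulator pair (e.g. xmm8/xmm9) the even register collects products against re(b) and the odd register collects products against im(b), and the two halves are only combined once, in the .L2_19/.L1_19 epilogues. A scalar C model of one 2x2 step for the plain NN case, assuming this straightforward reading of KERNEL2x2_SUB (names are illustrative):

/* Scalar model of one k-step of the 2x2 double-complex micro-kernel
 * (NN case). Each (acc_r, acc_i) pair mirrors one xmm register pair,
 * e.g. xmm8/xmm9; the real combine happens later via shuffle/addsub. */
typedef struct { double re, im; } zdouble;

static void kernel2x2_step(const zdouble a[2], const zdouble b[2],
                           zdouble acc_r[2][2], zdouble acc_i[2][2])
{
    for (int j = 0; j < 2; j++) {       /* two columns of B */
        for (int i = 0; i < 2; i++) {   /* two rows of A    */
            /* vmovddup duplicates b[j].re; VFMADD_R is vfmaddpd here */
            acc_r[i][j].re += a[i].re * b[j].re;
            acc_r[i][j].im += a[i].im * b[j].re;
            /* vmovddup duplicates b[j].im; VFMADD_I */
            acc_i[i][j].re += a[i].re * b[j].im;
            acc_i[i][j].im += a[i].im * b[j].im;
        }
    }
}

The conjugation variants need no separate kernels: the #if chain near the top of the file flips VFMADD_R/VFMADD_I between vfmaddpd and vfnmaddpd, negating exactly the products whose sign changes under conjugation of A or B.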
movq %r15, 40(%rsp) + + vzeroupper + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, OLD_K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC +#ifdef TRMMKERNEL + movsd OLD_OFFSET, %xmm12 +#endif + vmovaps %xmm3, %xmm0 + +#else + movq STACKSIZE + 8(%rsp), LDC +#ifdef TRMMKERNEL + movsd STACKSIZE + 16(%rsp), %xmm12 +#endif + +#endif + + movq %rsp, SP # save old stack + subq $128 + L_BUFFER_SIZE, %rsp + andq $-4096, %rsp # align stack + + STACK_TOUCH + + cmpq $0, OLD_M + je .L999 + + cmpq $0, OLD_N + je .L999 + + cmpq $0, OLD_K + je .L999 + + movq OLD_M, M + movq OLD_N, N + movq OLD_K, K + + vmovsd %xmm0, ALPHA_R + vmovsd %xmm1, ALPHA_I + + salq $ZBASE_SHIFT, LDC + + movq N, %rax + xorq %rdx, %rdx + movq $2, %rdi + divq %rdi // N / 2 + movq %rax, Ndiv6 // N / 2 + movq %rdx, Nmod6 // N % 2 + + + +#ifdef TRMMKERNEL + vmovsd %xmm12, OFFSET + vmovsd %xmm12, KK +#ifndef LEFT + negq KK +#endif +#endif + +.L2_0: + + movq Ndiv6, J + cmpq $0, J + je .L1_0 + ALIGN_4 + + + +.L2_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + ALIGN_4 + +.L2_02b: + + vmovups (BO1), %xmm0 + vmovups 2 * SIZE(BO1), %xmm1 + vmovups %xmm0, (BO) + vmovups %xmm1, 2 * SIZE(BO) + addq $4*SIZE,BO1 + addq $4*SIZE,BO + decq %rax + jnz .L2_02b + +.L2_02c: + + movq BO1, B // next offset of B + +.L2_10: + movq C, CO1 + leaq (C, LDC, 2), C // c += 2 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq A, AO // aoffset = a + addq $8 * SIZE, AO + + movq M, I + sarq $1, I // i = (m >> 1) + je .L2_40 + + ALIGN_4 + +.L2_11: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $8 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $8 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax // K = K - ( K % 8 ) + je .L2_16 + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_12: + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_1(xxx) + KERNEL2x2_2(xxx) + prefetcht0 B_PR1+64(BO,BI,SIZE) + KERNEL2x2_3(xxx) + KERNEL2x2_4(xxx) + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_1(xxx) + KERNEL2x2_2(xxx) + prefetcht0 B_PR1+64(BO,BI,SIZE) + KERNEL2x2_3(xxx) + KERNEL2x2_4(xxx) + + je .L2_16 + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_1(xxx) + KERNEL2x2_2(xxx) + prefetcht0 B_PR1+64(BO,BI,SIZE) + KERNEL2x2_3(xxx) + KERNEL2x2_4(xxx) + + prefetcht0 
B_PR1(BO,BI,SIZE) + KERNEL2x2_1(xxx) + KERNEL2x2_2(xxx) + prefetcht0 B_PR1+64(BO,BI,SIZE) + KERNEL2x2_3(xxx) + KERNEL2x2_4(xxx) + + je .L2_16 + + jmp .L2_12 + ALIGN_4 + +.L2_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_19 + + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_17: + + KERNEL2x2_SUB(xxx) + jl .L2_17 + ALIGN_4 + + +.L2_19: + + vmovddup ALPHA_R, %xmm0 + vmovddup ALPHA_I, %xmm1 + + // swap high and low 64 bytes + vshufpd $0x01, %xmm9 , %xmm9, %xmm9 + vshufpd $0x01, %xmm11, %xmm11, %xmm11 + vshufpd $0x01, %xmm13, %xmm13, %xmm13 + vshufpd $0x01, %xmm15, %xmm15, %xmm15 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vaddsubpd %xmm9, %xmm8 , %xmm8 + vaddsubpd %xmm11,%xmm10, %xmm10 + vaddsubpd %xmm13,%xmm12, %xmm12 + vaddsubpd %xmm15,%xmm14, %xmm14 + + vshufpd $0x01, %xmm8 , %xmm8, %xmm9 + vshufpd $0x01, %xmm10, %xmm10, %xmm11 + vshufpd $0x01, %xmm12, %xmm12, %xmm13 + vshufpd $0x01, %xmm14, %xmm14, %xmm15 + +#else + vaddsubpd %xmm8, %xmm9 ,%xmm9 + vaddsubpd %xmm10, %xmm11,%xmm11 + vaddsubpd %xmm12, %xmm13,%xmm13 + vaddsubpd %xmm14, %xmm15,%xmm15 + + vmovapd %xmm9, %xmm8 + vmovapd %xmm11, %xmm10 + vmovapd %xmm13, %xmm12 + vmovapd %xmm15, %xmm14 + + // swap high and low 64 bytes + vshufpd $0x01, %xmm9 , %xmm9, %xmm9 + vshufpd $0x01, %xmm11, %xmm11, %xmm11 + vshufpd $0x01, %xmm13, %xmm13, %xmm13 + vshufpd $0x01, %xmm15, %xmm15, %xmm15 + +#endif + + // multiply with ALPHA_R + vmulpd %xmm8 , %xmm0, %xmm8 + vmulpd %xmm10, %xmm0, %xmm10 + vmulpd %xmm12, %xmm0, %xmm12 + vmulpd %xmm14, %xmm0, %xmm14 + + // multiply with ALPHA_I + vmulpd %xmm9 , %xmm1, %xmm9 + vmulpd %xmm11, %xmm1, %xmm11 + vmulpd %xmm13, %xmm1, %xmm13 + vmulpd %xmm15, %xmm1, %xmm15 + + vaddsubpd %xmm9, %xmm8 , %xmm8 + vaddsubpd %xmm11,%xmm10, %xmm10 + vaddsubpd %xmm13,%xmm12, %xmm12 + vaddsubpd %xmm15,%xmm14, %xmm14 + + + +#ifndef TRMMKERNEL + + vaddpd (CO1), %xmm8 , %xmm8 + vaddpd 2 * SIZE(CO1), %xmm12, %xmm12 + + vaddpd (CO1, LDC), %xmm10, %xmm10 + vaddpd 2 * SIZE(CO1, LDC), %xmm14, %xmm14 + +#endif + + vmovups %xmm8 , (CO1) + vmovups %xmm12 , 2 * SIZE(CO1) + + vmovups %xmm10 , (CO1, LDC) + vmovups %xmm14 , 2 * SIZE(CO1, LDC) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + decq I # i -- + jg .L2_11 + ALIGN_4 + + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L2_40: + testq $1, M + jz .L2_60 // to next 2 lines of N + + ALIGN_4 + +.L2_41: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $8 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $8 * SIZE, BO + movq %rax, BI // Index 
for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax // K = K - ( K % 8 ) + je .L2_46 + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_42: + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + prefetcht0 B_PR1+64(BO,BI,SIZE) + KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + prefetcht0 B_PR1+64(BO,BI,SIZE) + KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + je .L2_46 + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + prefetcht0 B_PR1+64(BO,BI,SIZE) + KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + prefetcht0 B_PR1+64(BO,BI,SIZE) + KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + je .L2_46 + + jmp .L2_42 + ALIGN_4 + +.L2_46: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_49 + + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_47: + + KERNEL1x2_SUB(xxx) + jl .L2_47 + ALIGN_4 + + +.L2_49: + + vmovddup ALPHA_R, %xmm0 + vmovddup ALPHA_I, %xmm1 + + // swap high and low 64 bytes + vshufpd $0x01, %xmm9 , %xmm9, %xmm9 + vshufpd $0x01, %xmm11, %xmm11, %xmm11 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vaddsubpd %xmm9, %xmm8 , %xmm8 + vaddsubpd %xmm11,%xmm10, %xmm10 + + vshufpd $0x01, %xmm8 , %xmm8, %xmm9 + vshufpd $0x01, %xmm10, %xmm10, %xmm11 + +#else + vaddsubpd %xmm8, %xmm9, %xmm9 + vaddsubpd %xmm10,%xmm11, %xmm11 + + vmovapd %xmm9, %xmm8 + vmovapd %xmm11, %xmm10 + + // swap high and low 64 bytes + vshufpd $0x01, %xmm9 , %xmm9, %xmm9 + vshufpd $0x01, %xmm11, %xmm11, %xmm11 + +#endif + + // multiply with ALPHA_R + vmulpd %xmm8 , %xmm0, %xmm8 + vmulpd %xmm10, %xmm0, %xmm10 + + // multiply with ALPHA_I + vmulpd %xmm9 , %xmm1, %xmm9 + vmulpd %xmm11, %xmm1, %xmm11 + + vaddsubpd %xmm9, %xmm8 , %xmm8 + vaddsubpd %xmm11,%xmm10, %xmm10 + + + +#ifndef TRMMKERNEL + + vaddpd (CO1), %xmm8 , %xmm8 + vaddpd (CO1, LDC), %xmm10, %xmm10 + +#endif + + vmovups %xmm8 , (CO1) + vmovups %xmm10 , (CO1, LDC) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + + + + +.L2_60: +#if defined(TRMMKERNEL) && !defined(LEFT) + addq $2, KK +#endif + + decq J // j -- + jg .L2_01 // next 2 
lines of N + + + +.L1_0: + +/************************************************************************************************ +* Loop for Nmod6 % 2 > 0 +*************************************************************************************************/ + + movq Nmod6, J + andq $1, J // j % 2 + je .L999 + ALIGN_4 + +.L1_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + ALIGN_4 + +.L1_02b: + + vmovups (BO1), %xmm0 + vmovups %xmm0, (BO) + addq $2*SIZE,BO1 + addq $2*SIZE,BO + decq %rax + jnz .L1_02b + +.L1_02c: + + movq BO1, B // next offset of B + +.L1_10: + movq C, CO1 + leaq (C, LDC, 1), C // c += 1 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq A, AO // aoffset = a + addq $8 * SIZE, AO + + movq M, I + sarq $1, I // i = (m >> 1) + je .L1_40 + + ALIGN_4 + +.L1_11: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax // K = K - ( K % 8 ) + je .L1_16 + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_12: + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + je .L1_16 + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + je .L1_16 + + jmp .L1_12 + ALIGN_4 + +.L1_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_19 + + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_17: + + KERNEL2x1_SUB(xxx) + jl .L1_17 + ALIGN_4 + + +.L1_19: + + vmovddup ALPHA_R, %xmm0 + vmovddup ALPHA_I, %xmm1 + + // swap high and low 64 bytes + vshufpd $0x01, %xmm9 , %xmm9, %xmm9 + vshufpd $0x01, %xmm13, %xmm13, %xmm13 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vaddsubpd %xmm9, %xmm8 , %xmm8 + vaddsubpd %xmm13,%xmm12 , %xmm12 + + vshufpd $0x01, %xmm8 , %xmm8, %xmm9 + vshufpd $0x01, %xmm12, %xmm12, %xmm13 + +#else + vaddsubpd %xmm8, %xmm9 , %xmm9 + vaddsubpd %xmm12,%xmm13, %xmm13 + + vmovapd %xmm9, %xmm8 + vmovapd %xmm13, %xmm12 + + // swap high and low 64 bytes + vshufpd $0x01, %xmm9 , 
%xmm9, %xmm9 + vshufpd $0x01, %xmm13, %xmm13, %xmm13 + +#endif + + // multiply with ALPHA_R + vmulpd %xmm8 , %xmm0, %xmm8 + vmulpd %xmm12, %xmm0, %xmm12 + + // multiply with ALPHA_I + vmulpd %xmm9 , %xmm1, %xmm9 + vmulpd %xmm13, %xmm1, %xmm13 + + vaddsubpd %xmm9, %xmm8 , %xmm8 + vaddsubpd %xmm13, %xmm12, %xmm12 + + + +#ifndef TRMMKERNEL + + vaddpd (CO1), %xmm8 , %xmm8 + vaddpd 2 * SIZE(CO1), %xmm12, %xmm12 + +#endif + + vmovups %xmm8 , (CO1) + vmovups %xmm12 , 2 * SIZE(CO1) + + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + decq I # i -- + jg .L1_11 + ALIGN_4 + + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L1_40: + testq $1, M + jz .L999 + + ALIGN_4 + +.L1_41: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax // K = K - ( K % 8 ) + je .L1_46 + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_42: + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + je .L1_46 + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + je .L1_46 + + jmp .L1_42 + ALIGN_4 + +.L1_46: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_49 + + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_47: + + KERNEL1x1_SUB(xxx) + jl .L1_47 + ALIGN_4 + + +.L1_49: + + vmovddup ALPHA_R, %xmm0 + vmovddup ALPHA_I, %xmm1 + + // swap high and low 64 bytes + vshufpd $0x01, %xmm9 , %xmm9, %xmm9 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vaddsubpd %xmm9, %xmm8, %xmm8 + + 
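The shuffle/addsub pair seen here (and in .L2_19/.L2_49 above) is the entire complex-arithmetic combine: vshufpd $0x01 swaps the two 64-bit halves of an xmm register, i.e. the real and imaginary parts (64 bits, not the "64 bytes" the comments say), and vaddsubpd subtracts in the low lane while adding in the high lane. The ALPHA application that follows is the same idiom applied once more; a scalar C model of it, assuming one accumulator holding (re, im):

/* Scalar model of the ALPHA application: the swap + vaddsubpd
 * sequence is a two-instruction complex multiply by alpha. */
typedef struct { double re, im; } zdouble;

static zdouble scale_by_alpha(zdouble acc, double alpha_r, double alpha_i)
{
    zdouble swapped = { acc.im, acc.re };   /* vshufpd $0x01            */
    zdouble r;
    /* vmulpd with vmovddup'ed ALPHA_R / ALPHA_I */
    acc.re *= alpha_r;      acc.im *= alpha_r;
    swapped.re *= alpha_i;  swapped.im *= alpha_i;
    /* vaddsubpd: subtract in the low lane, add in the high lane */
    r.re = acc.re - swapped.re;             /* re*alpha_r - im*alpha_i  */
    r.im = acc.im + swapped.im;             /* im*alpha_r + re*alpha_i  */
    return r;
}

The result is alpha*acc computed with no horizontal operations, which is why the epilogue needs only shuffles, multiplies, and addsubs.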
vshufpd $0x01, %xmm8 , %xmm8, %xmm9 + +#else + vaddsubpd %xmm8, %xmm9, %xmm9 + + vmovapd %xmm9, %xmm8 + + // swap high and low 64 bytes + vshufpd $0x01, %xmm9 , %xmm9, %xmm9 + +#endif + + // multiply with ALPHA_R + vmulpd %xmm8 , %xmm0, %xmm8 + + // multiply with ALPHA_I + vmulpd %xmm9 , %xmm1, %xmm9 + + vaddsubpd %xmm9 ,%xmm8, %xmm8 + + + +#ifndef TRMMKERNEL + + vaddpd (CO1), %xmm8 , %xmm8 + +#endif + + vmovups %xmm8 , (CO1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + + + +.L999: + movq SP, %rsp + movq (%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE From e172b70ea29054f4353dfcd0e6f0120e50b5a100 Mon Sep 17 00:00:00 2001 From: wernsaar Date: Thu, 31 Oct 2013 08:38:17 +0100 Subject: [PATCH 05/15] added cgemm_kernel for Piledriver --- kernel/x86_64/KERNEL.PILEDRIVER | 2 +- kernel/x86_64/cgemm_kernel_4x2_piledriver.S | 1920 +++++++++++++++++++ param.h | 12 +- 3 files changed, 1929 insertions(+), 5 deletions(-) create mode 100644 kernel/x86_64/cgemm_kernel_4x2_piledriver.S diff --git a/kernel/x86_64/KERNEL.PILEDRIVER b/kernel/x86_64/KERNEL.PILEDRIVER index 5a5b42b64..6c262c774 100644 --- a/kernel/x86_64/KERNEL.PILEDRIVER +++ b/kernel/x86_64/KERNEL.PILEDRIVER @@ -27,7 +27,7 @@ DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) -CGEMMKERNEL = cgemm_kernel_4x2_bulldozer.S +CGEMMKERNEL = cgemm_kernel_4x2_piledriver.S CGEMMINCOPY = ../generic/zgemm_ncopy_4.c CGEMMITCOPY = ../generic/zgemm_tcopy_4.c CGEMMONCOPY = ../generic/zgemm_ncopy_2.c diff --git a/kernel/x86_64/cgemm_kernel_4x2_piledriver.S b/kernel/x86_64/cgemm_kernel_4x2_piledriver.S new file mode 100644 index 000000000..931316285 --- /dev/null +++ b/kernel/x86_64/cgemm_kernel_4x2_piledriver.S @@ -0,0 +1,1920 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ +/********************************************************************* +* +* 2013/10/31 Saar +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* +* +* 2013/10/31 Saar +* +* Parameter: +* UNROLL_M 4 +* UNROLL_N 2 +* CGEMM_P 768 +* CGEMM_Q 168 +* A_PR1 512 +* B_PR1 256 +* +* Performance at m x n on AMD 8320 (ACML-Version: 5.3.1): +* +* 4608x4608 154.0 GFLOPS with 8 threads on 4 modules (ACML: 111.7 ) (BULLDOZER: 153.9 ) +* 4608x4608 148.3 GFLOPS with 4 threads on 4 modules (ACML: 96.0 ) (BULLDOZER: 143.2 ) +* 3456x3456 74.3 GFLOPS with 2 threads on 2 modules (ACML: 47.3 ) (BULLDOZER: 72.3 ) +* 3456x3456 37.3 GFLOPS with 1 threads on 1 modules (ACML: 24.2 ) (BULLDOZER: 36.5 ) +* +* Performance at m x n on AMD 6380 (ACML-Version: 5.3.1): +* +* 6912x6912 421.5 GFLOPS with 32 threads on 16 modules (ACML: 266.6 ) (BULLDOZER: 422.5 ) +* 6912x6912 407.0 GFLOPS with 16 threads on 16 modules (ACML: 271.5 ) (BULLDOZER: 404.7 ) +* 6912x6912 234.2 GFLOPS with 8 threads on 8 modules (ACML: 164.0 ) (BULLDOZER: 230.5 ) +* 4608x4608 123.1 GFLOPS with 4 threads on 4 modules (ACML: 87.9 ) (BULLDOZER: 120.9 ) +* 3456x3456 62.6 GFLOPS with 2 threads on 2 modules (ACML: 44.5 ) (BULLDOZER: 62.1 ) +* 3456x3456 31.8 GFLOPS with 1 threads on 1 modules (ACML: 22.6 ) (BULLDOZER: 31.4 ) +* +*********************************************************************/ + + + +#define ASSEMBLER +#include "common.h" + +#define OLD_M %rdi +#define OLD_N %rsi +#define M %r13 +#define J %r14 +#define OLD_K %rdx + +#define A %rcx +#define B %r8 +#define C %r9 +#define LDC %r10 + +#define I %r11 +#define AO %rdi +#define BO %rsi +#define CO1 %r15 +#define K %r12 +#define BI %rbp +#define SP %rbx + +#define BO1 %rdi +#define BO2 %r15 + +#ifndef WINDOWS_ABI + +#define STACKSIZE 96 + +#else + +#define STACKSIZE 320 + +#define OLD_ALPHA_I 40 + STACKSIZE(%rsp) +#define OLD_A 48 + STACKSIZE(%rsp) +#define OLD_B 56 + STACKSIZE(%rsp) +#define OLD_C 64 + STACKSIZE(%rsp) +#define OLD_LDC 72 + STACKSIZE(%rsp) +#define OLD_OFFSET 80 + STACKSIZE(%rsp) + +#endif + +#define L_BUFFER_SIZE 512*8*4 +#define LB2_OFFSET 512*8*2 + +#define Ndiv6 24(%rsp) +#define Nmod6 32(%rsp) +#define N 40(%rsp) +#define ALPHA_R 48(%rsp) +#define ALPHA_I 56(%rsp) +#define OFFSET 64(%rsp) +#define KK 72(%rsp) +#define KKK 80(%rsp) +#define BUFFER1 128(%rsp) +#define BUFFER2 LB2_OFFSET+128(%rsp) + +#if defined(OS_WINDOWS) +#if L_BUFFER_SIZE > 16384 +#define STACK_TOUCH \ + movl $0, 4096 * 4(%rsp);\ + movl $0, 4096 * 3(%rsp);\ + movl $0, 4096 * 2(%rsp);\ + movl $0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 12288 +#define STACK_TOUCH \ + movl $0, 4096 * 3(%rsp);\ + movl $0, 4096 * 2(%rsp);\ + movl $0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 8192 
+#define STACK_TOUCH \ + movl $0, 4096 * 2(%rsp);\ + movl $0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 4096 +#define STACK_TOUCH \ + movl $0, 4096 * 1(%rsp); +#else +#define STACK_TOUCH +#endif +#else +#define STACK_TOUCH +#endif + + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) +#define VFMADD_R vfmaddps +#define VFMADD_I vfmaddps +#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) +#define VFMADD_R vfnmaddps +#define VFMADD_I vfmaddps +#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) +#define VFMADD_R vfmaddps +#define VFMADD_I vfnmaddps +#else +#define VFMADD_R vfnmaddps +#define VFMADD_I vfnmaddps +#endif + + + +#define A_PR1 512 +#define B_PR1 256 + +#define KERNEL4x2_1(xx) \ + prefetcht0 A_PR1(AO,%rax,SIZE) ;\ + vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss -8 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovups -12 * SIZE(AO, %rax, SIZE), %xmm1 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vbroadcastss -7 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ + vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\ + vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\ + +#define KERNEL4x2_2(xx) \ + vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovups -4 * SIZE(AO, %rax, SIZE), %xmm1 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ + vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\ + vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\ + +#define KERNEL4x2_3(xx) \ + prefetcht0 A_PR1+64(AO,%rax,SIZE) ;\ + vmovups 0 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovups 4 * SIZE(AO, %rax, SIZE), %xmm1 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ + vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\ + vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\ + +#define KERNEL4x2_4(xx) \ + vmovups 8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss 4 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovups 12 * SIZE(AO, %rax, SIZE), %xmm1 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vbroadcastss 5 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ + vbroadcastss 6 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\ + vbroadcastss 7 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\ + addq $16, BI ;\ + addq $32, %rax ;\ + + +#define KERNEL4x2_SUB(xx) \ + vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss -8 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovups -12 * SIZE(AO, %rax, SIZE), %xmm1 ;\ + 
VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vbroadcastss -7 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ + vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\ + vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\ + addq $4, BI ;\ + addq $8, %rax ;\ + +/************************************************************************************************/ + +#define KERNEL2x2_1(xx) \ + prefetcht0 A_PR1(AO,%rax,SIZE) ;\ + vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss -8 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vbroadcastss -7 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + +#define KERNEL2x2_2(xx) \ + vmovups -12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + +#define KERNEL2x2_3(xx) \ + vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + +#define KERNEL2x2_4(xx) \ + vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss 4 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vbroadcastss 5 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + vbroadcastss 6 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + vbroadcastss 7 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + addq $16, BI ;\ + addq $16, %rax ;\ + + +#define KERNEL2x2_SUB(xx) \ + vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss -8 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vbroadcastss -7 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + addq $4, BI ;\ + addq $4, %rax ;\ + +/************************************************************************************************/ + +#define KERNEL1x2_1(xx) \ + vmovsd -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss -8 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vbroadcastss -7 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + +#define KERNEL1x2_2(xx) \ + vmovsd -14 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + VFMADD_R 
%xmm12,%xmm4,%xmm1,%xmm12 ;\ + vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + +#define KERNEL1x2_3(xx) \ + vmovsd -12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + +#define KERNEL1x2_4(xx) \ + vmovsd -10 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss 4 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vbroadcastss 5 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + vbroadcastss 6 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + vbroadcastss 7 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + addq $16, BI ;\ + addq $8, %rax ;\ + + +#define KERNEL1x2_SUB(xx) \ + vmovsd -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss -8 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vbroadcastss -7 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + addq $4, BI ;\ + addq $2, %rax ;\ + + + +/************************************************************************************************/ + +#define KERNEL4x1_1(xx) \ + prefetcht0 A_PR1(AO,%rax,SIZE) ;\ + vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovups -12 * SIZE(AO, %rax, SIZE), %xmm1 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ + +#define KERNEL4x1_2(xx) \ + vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovups -4 * SIZE(AO, %rax, SIZE), %xmm1 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ + +#define KERNEL4x1_3(xx) \ + prefetcht0 A_PR1+64(AO,%rax,SIZE) ;\ + vmovups 0 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovups 4 * SIZE(AO, %rax, SIZE), %xmm1 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ + +#define KERNEL4x1_4(xx) \ + vmovups 8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovups 12 * SIZE(AO, %rax, SIZE), %xmm1 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ + addq $8, BI ;\ + addq $32, %rax ;\ + + +#define KERNEL4x1_SUB(xx) \ + vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovups -12 * SIZE(AO, %rax, 
SIZE), %xmm1 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ + addq $2, BI ;\ + addq $8, %rax ;\ + + +/************************************************************************************************/ + +#define KERNEL2x1_1(xx) \ + prefetcht0 A_PR1(AO,%rax,SIZE) ;\ + vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + +#define KERNEL2x1_2(xx) \ + vmovups -12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + +#define KERNEL2x1_3(xx) \ + vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + +#define KERNEL2x1_4(xx) \ + vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + addq $8, BI ;\ + addq $16, %rax ;\ + + +#define KERNEL2x1_SUB(xx) \ + vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + addq $2, BI ;\ + addq $4, %rax ;\ + + +/************************************************************************************************/ + +#define KERNEL1x1_1(xx) \ + vmovsd -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + +#define KERNEL1x1_2(xx) \ + vmovsd -14 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + +#define KERNEL1x1_3(xx) \ + vmovsd -12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + +#define KERNEL1x1_4(xx) \ + vmovsd -10 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + addq $8, BI ;\ + addq $8, %rax ;\ + + +#define KERNEL1x1_SUB(xx) \ + vmovsd -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + addq $2, BI ;\ + addq $2, %rax ;\ + + +/************************************************************************************************/ + + + + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + movq %rbx, (%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + + vzeroupper + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups 
%xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, OLD_K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC +#ifdef TRMMKERNEL + movsd OLD_OFFSET, %xmm12 +#endif + vmovaps %xmm3, %xmm0 + +#else + movq STACKSIZE + 8(%rsp), LDC +#ifdef TRMMKERNEL + movsd STACKSIZE + 16(%rsp), %xmm12 +#endif + +#endif + + movq %rsp, SP # save old stack + subq $128 + L_BUFFER_SIZE, %rsp + andq $-4096, %rsp # align stack + + STACK_TOUCH + + cmpq $0, OLD_M + je .L999 + + cmpq $0, OLD_N + je .L999 + + cmpq $0, OLD_K + je .L999 + + movq OLD_M, M + movq OLD_N, N + movq OLD_K, K + + vmovss %xmm0, ALPHA_R + vmovss %xmm1, ALPHA_I + + salq $ZBASE_SHIFT, LDC + + movq N, %rax + xorq %rdx, %rdx + movq $2, %rdi + divq %rdi // N / 2 + movq %rax, Ndiv6 // N / 2 + movq %rdx, Nmod6 // N % 2 + + + +#ifdef TRMMKERNEL + vmovsd %xmm12, OFFSET + vmovsd %xmm12, KK +#ifndef LEFT + negq KK +#endif +#endif + +.L2_0: + + movq Ndiv6, J + cmpq $0, J + je .L1_0 + ALIGN_4 + + + +.L2_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + ALIGN_4 + +.L2_02b: + + vmovups (BO1), %xmm0 + vmovups %xmm0, (BO) + addq $4*SIZE,BO1 + addq $4*SIZE,BO + decq %rax + jnz .L2_02b + +.L2_02c: + + movq BO1, B // next offset of B + +.L2_10: + movq C, CO1 + leaq (C, LDC, 2), C // c += 2 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq A, AO // aoffset = a + addq $16 * SIZE, AO + + movq M, I + sarq $2, I // i = (m >> 2) + je .L2_20 + + ALIGN_4 + +.L2_11: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $8 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $8 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax // K = K - ( K % 8 ) + je .L2_16 + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_12: + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL4x2_1(xxx) + KERNEL4x2_2(xxx) + KERNEL4x2_3(xxx) + KERNEL4x2_4(xxx) + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL4x2_1(xxx) + KERNEL4x2_2(xxx) + KERNEL4x2_3(xxx) + KERNEL4x2_4(xxx) + + je .L2_16 + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL4x2_1(xxx) + KERNEL4x2_2(xxx) + KERNEL4x2_3(xxx) + KERNEL4x2_4(xxx) + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL4x2_1(xxx) + KERNEL4x2_2(xxx) + KERNEL4x2_3(xxx) + KERNEL4x2_4(xxx) + + je .L2_16 + + jmp .L2_12 + ALIGN_4 + +.L2_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_19 + + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $3, 
%rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_17: + + KERNEL4x2_SUB(xxx) + jl .L2_17 + ALIGN_4 + + +.L2_19: + + vbroadcastss ALPHA_R, %xmm0 + vbroadcastss ALPHA_I, %xmm1 + + // swap high and low 64 bytes + vshufps $0xb1, %xmm9 , %xmm9, %xmm9 + vshufps $0xb1, %xmm11, %xmm11, %xmm11 + vshufps $0xb1, %xmm13, %xmm13, %xmm13 + vshufps $0xb1, %xmm15, %xmm15, %xmm15 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vaddsubps %xmm9, %xmm8 , %xmm8 + vaddsubps %xmm11,%xmm10, %xmm10 + vaddsubps %xmm13,%xmm12, %xmm12 + vaddsubps %xmm15,%xmm14, %xmm14 + + vshufps $0xb1, %xmm8 , %xmm8, %xmm9 + vshufps $0xb1, %xmm10, %xmm10, %xmm11 + vshufps $0xb1, %xmm12, %xmm12, %xmm13 + vshufps $0xb1, %xmm14, %xmm14, %xmm15 + +#else + vaddsubps %xmm8, %xmm9 ,%xmm9 + vaddsubps %xmm10, %xmm11,%xmm11 + vaddsubps %xmm12, %xmm13,%xmm13 + vaddsubps %xmm14, %xmm15,%xmm15 + + vmovaps %xmm9, %xmm8 + vmovaps %xmm11, %xmm10 + vmovaps %xmm13, %xmm12 + vmovaps %xmm15, %xmm14 + + // swap high and low 64 bytes + vshufps $0xb1, %xmm9 , %xmm9, %xmm9 + vshufps $0xb1, %xmm11, %xmm11, %xmm11 + vshufps $0xb1, %xmm13, %xmm13, %xmm13 + vshufps $0xb1, %xmm15, %xmm15, %xmm15 + +#endif + + // multiply with ALPHA_R + vmulps %xmm8 , %xmm0, %xmm8 + vmulps %xmm10, %xmm0, %xmm10 + vmulps %xmm12, %xmm0, %xmm12 + vmulps %xmm14, %xmm0, %xmm14 + + // multiply with ALPHA_I + vmulps %xmm9 , %xmm1, %xmm9 + vmulps %xmm11, %xmm1, %xmm11 + vmulps %xmm13, %xmm1, %xmm13 + vmulps %xmm15, %xmm1, %xmm15 + + vaddsubps %xmm9, %xmm8 , %xmm8 + vaddsubps %xmm11,%xmm10, %xmm10 + vaddsubps %xmm13,%xmm12, %xmm12 + vaddsubps %xmm15,%xmm14, %xmm14 + + + +#ifndef TRMMKERNEL + + vaddps (CO1), %xmm8 , %xmm8 + vaddps 4 * SIZE(CO1), %xmm12, %xmm12 + + vaddps (CO1, LDC), %xmm10, %xmm10 + vaddps 4 * SIZE(CO1, LDC), %xmm14, %xmm14 + +#endif + + vmovups %xmm8 , (CO1) + vmovups %xmm12 , 4 * SIZE(CO1) + + vmovups %xmm10 , (CO1, LDC) + vmovups %xmm14 , 4 * SIZE(CO1, LDC) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $8 * SIZE, CO1 # coffset += 8 + decq I # i -- + jg .L2_11 + ALIGN_4 + + + +/************************************************************************** +* Rest of M +***************************************************************************/ + +.L2_20: + testq $3, M + jz .L2_60 // to next 2 lines of N + + testq $2, M + jz .L2_40 + ALIGN_4 + +.L2_21: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $8 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $8 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, 
KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax // K = K - ( K % 8 ) + je .L2_26 + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_22: + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_1(xxx) + KERNEL2x2_2(xxx) + KERNEL2x2_3(xxx) + KERNEL2x2_4(xxx) + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_1(xxx) + KERNEL2x2_2(xxx) + KERNEL2x2_3(xxx) + KERNEL2x2_4(xxx) + + je .L2_26 + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_1(xxx) + KERNEL2x2_2(xxx) + KERNEL2x2_3(xxx) + KERNEL2x2_4(xxx) + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_1(xxx) + KERNEL2x2_2(xxx) + KERNEL2x2_3(xxx) + KERNEL2x2_4(xxx) + + je .L2_26 + + jmp .L2_22 + ALIGN_4 + +.L2_26: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_29 + + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_27: + + KERNEL2x2_SUB(xxx) + jl .L2_27 + ALIGN_4 + + +.L2_29: + + vbroadcastss ALPHA_R, %xmm0 + vbroadcastss ALPHA_I, %xmm1 + + // swap high and low 64 bytes + vshufps $0xb1, %xmm9 , %xmm9, %xmm9 + vshufps $0xb1, %xmm11, %xmm11, %xmm11 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vaddsubps %xmm9, %xmm8 , %xmm8 + vaddsubps %xmm11,%xmm10, %xmm10 + + vshufps $0xb1, %xmm8 , %xmm8, %xmm9 + vshufps $0xb1, %xmm10, %xmm10, %xmm11 + +#else + vaddsubps %xmm8, %xmm9 ,%xmm9 + vaddsubps %xmm10, %xmm11,%xmm11 + + vmovaps %xmm9, %xmm8 + vmovaps %xmm11, %xmm10 + + // swap high and low 64 bytes + vshufps $0xb1, %xmm9 , %xmm9, %xmm9 + vshufps $0xb1, %xmm11, %xmm11, %xmm11 + +#endif + + // multiply with ALPHA_R + vmulps %xmm8 , %xmm0, %xmm8 + vmulps %xmm10, %xmm0, %xmm10 + + // multiply with ALPHA_I + vmulps %xmm9 , %xmm1, %xmm9 + vmulps %xmm11, %xmm1, %xmm11 + + vaddsubps %xmm9, %xmm8 , %xmm8 + vaddsubps %xmm11,%xmm10, %xmm10 + + + +#ifndef TRMMKERNEL + + vaddps (CO1), %xmm8 , %xmm8 + + vaddps (CO1, LDC), %xmm10, %xmm10 + +#endif + + vmovups %xmm8 , (CO1) + + vmovups %xmm10 , (CO1, LDC) + + + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + + +/**************************************************************************/ +.L2_40: + testq $1, M + jz .L2_60 // to next 2 lines of N + + ALIGN_4 + +.L2_41: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $8 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $8 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $1, 
%rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax // K = K - ( K % 8 ) + je .L2_46 + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_42: + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + je .L2_46 + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + je .L2_46 + + jmp .L2_42 + ALIGN_4 + +.L2_46: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_49 + + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_47: + + KERNEL1x2_SUB(xxx) + jl .L2_47 + ALIGN_4 + + +.L2_49: + + vbroadcastss ALPHA_R, %xmm0 + vbroadcastss ALPHA_I, %xmm1 + + // swap high and low 64 bytes + vshufps $0xb1, %xmm9 , %xmm9, %xmm9 + vshufps $0xb1, %xmm11, %xmm11, %xmm11 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vaddsubps %xmm9, %xmm8 , %xmm8 + vaddsubps %xmm11,%xmm10, %xmm10 + + vshufps $0xb1, %xmm8 , %xmm8, %xmm9 + vshufps $0xb1, %xmm10, %xmm10, %xmm11 + +#else + vaddsubps %xmm8, %xmm9 ,%xmm9 + vaddsubps %xmm10, %xmm11,%xmm11 + + vmovaps %xmm9, %xmm8 + vmovaps %xmm11, %xmm10 + + // swap high and low 64 bytes + vshufps $0xb1, %xmm9 , %xmm9, %xmm9 + vshufps $0xb1, %xmm11, %xmm11, %xmm11 + +#endif + + // multiply with ALPHA_R + vmulps %xmm8 , %xmm0, %xmm8 + vmulps %xmm10, %xmm0, %xmm10 + + // multiply with ALPHA_I + vmulps %xmm9 , %xmm1, %xmm9 + vmulps %xmm11, %xmm1, %xmm11 + + vaddsubps %xmm9, %xmm8 , %xmm8 + vaddsubps %xmm11,%xmm10, %xmm10 + + + +#ifndef TRMMKERNEL + + vmovsd (CO1), %xmm14 + vaddps %xmm14, %xmm8 , %xmm8 + + vmovsd (CO1, LDC), %xmm15 + vaddps %xmm15, %xmm10, %xmm10 + +#endif + + vmovsd %xmm8 , (CO1) + + vmovsd %xmm10 , (CO1, LDC) + + + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + + + + +.L2_60: +#if defined(TRMMKERNEL) && !defined(LEFT) + addq $2, KK +#endif + + decq J // j -- + jg .L2_01 // next 2 lines of N + + + +.L1_0: + +/************************************************************************************************ +* Loop for Nmod6 % 2 > 0 
+*************************************************************************************************/ + + movq Nmod6, J + andq $1, J // j % 2 + je .L999 + ALIGN_4 + +.L1_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + ALIGN_4 + +.L1_02b: + + vmovsd (BO1), %xmm0 + vmovsd %xmm0, (BO) + addq $2*SIZE,BO1 + addq $2*SIZE,BO + decq %rax + jnz .L1_02b + +.L1_02c: + + movq BO1, B // next offset of B + +.L1_10: + movq C, CO1 + leaq (C, LDC, 1), C // c += 1 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq A, AO // aoffset = a + addq $16 * SIZE, AO + + movq M, I + sarq $2, I // i = (m >> 2) + je .L1_20 + + ALIGN_4 + +.L1_11: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax // K = K - ( K % 8 ) + je .L1_16 + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_12: + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL4x1_1(xxx) + KERNEL4x1_2(xxx) + KERNEL4x1_3(xxx) + KERNEL4x1_4(xxx) + + KERNEL4x1_1(xxx) + KERNEL4x1_2(xxx) + KERNEL4x1_3(xxx) + KERNEL4x1_4(xxx) + + je .L1_16 + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL4x1_1(xxx) + KERNEL4x1_2(xxx) + KERNEL4x1_3(xxx) + KERNEL4x1_4(xxx) + + KERNEL4x1_1(xxx) + KERNEL4x1_2(xxx) + KERNEL4x1_3(xxx) + KERNEL4x1_4(xxx) + + je .L1_16 + + jmp .L1_12 + ALIGN_4 + +.L1_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_19 + + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 4 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_17: + + KERNEL4x1_SUB(xxx) + jl .L1_17 + ALIGN_4 + + +.L1_19: + + vbroadcastss ALPHA_R, %xmm0 + vbroadcastss ALPHA_I, %xmm1 + + // swap high and low 64 bytes + vshufps $0xb1, %xmm9 , %xmm9, %xmm9 + vshufps $0xb1, %xmm13, %xmm13, %xmm13 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vaddsubps %xmm9, %xmm8 , %xmm8 + vaddsubps %xmm13,%xmm12, %xmm12 + + vshufps $0xb1, %xmm8 , %xmm8, %xmm9 + vshufps $0xb1, %xmm12, %xmm12, %xmm13 + +#else + vaddsubps %xmm8, %xmm9 ,%xmm9 + vaddsubps %xmm12, %xmm13,%xmm13 + + vmovaps %xmm9, %xmm8 + vmovaps %xmm13, %xmm12 + + // swap high and low 64 bytes + vshufps $0xb1, %xmm9 , %xmm9, %xmm9 + vshufps $0xb1, %xmm13, %xmm13, %xmm13 + +#endif + + // multiply with ALPHA_R + vmulps %xmm8 , %xmm0, %xmm8 + vmulps %xmm12, %xmm0, %xmm12 + + // multiply with ALPHA_I + vmulps %xmm9 , %xmm1, %xmm9 
+ vmulps %xmm13, %xmm1, %xmm13 + + vaddsubps %xmm9, %xmm8 , %xmm8 + vaddsubps %xmm13,%xmm12, %xmm12 + + + +#ifndef TRMMKERNEL + + vaddps (CO1), %xmm8 , %xmm8 + vaddps 4 * SIZE(CO1), %xmm12, %xmm12 + +#endif + + vmovups %xmm8 , (CO1) + vmovups %xmm12 , 4 * SIZE(CO1) + + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $8 * SIZE, CO1 # coffset += 8 + decq I # i -- + jg .L1_11 + ALIGN_4 + + + +/************************************************************************** +* Rest of M +***************************************************************************/ + +.L1_20: + testq $3, M + jz .L999 + + testq $2, M + jz .L1_40 + ALIGN_4 + +.L1_21: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax // K = K - ( K % 8 ) + je .L1_26 + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_22: + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + je .L1_26 + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + je .L1_26 + + jmp .L1_22 + ALIGN_4 + +.L1_26: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_29 + + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_27: + + KERNEL2x1_SUB(xxx) + jl .L1_27 + ALIGN_4 + + +.L1_29: + + vbroadcastss ALPHA_R, %xmm0 + vbroadcastss ALPHA_I, %xmm1 + + // swap high and low 64 bytes + vshufps $0xb1, %xmm9 , %xmm9, %xmm9 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vaddsubps %xmm9, %xmm8 , %xmm8 + + vshufps $0xb1, %xmm8 , %xmm8, %xmm9 + +#else + vaddsubps %xmm8, %xmm9 ,%xmm9 + + vmovaps %xmm9, %xmm8 + + // swap high and low 64 bytes + vshufps $0xb1, %xmm9 , %xmm9, %xmm9 + +#endif + + // multiply with ALPHA_R + vmulps %xmm8 , %xmm0, 
%xmm8 + + // multiply with ALPHA_I + vmulps %xmm9 , %xmm1, %xmm9 + + vaddsubps %xmm9, %xmm8 , %xmm8 + + + +#ifndef TRMMKERNEL + + vaddps (CO1), %xmm8 , %xmm8 + +#endif + + vmovups %xmm8 , (CO1) + + + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + + +/**************************************************************************/ +.L1_40: + testq $1, M + jz .L999 // to next 2 lines of N + + ALIGN_4 + +.L1_41: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax // K = K - ( K % 8 ) + je .L1_46 + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_42: + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + je .L1_46 + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + je .L1_46 + + jmp .L1_42 + ALIGN_4 + +.L1_46: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_49 + + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_47: + + KERNEL1x1_SUB(xxx) + jl .L1_47 + ALIGN_4 + + +.L1_49: + + vbroadcastss ALPHA_R, %xmm0 + vbroadcastss ALPHA_I, %xmm1 + + // swap high and low 64 bytes + vshufps $0xb1, %xmm9 , %xmm9, %xmm9 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vaddsubps %xmm9, %xmm8 , %xmm8 + + vshufps $0xb1, %xmm8 , %xmm8, %xmm9 + +#else + vaddsubps %xmm8, %xmm9 ,%xmm9 + + vmovaps %xmm9, %xmm8 + + // swap high and low 64 bytes + vshufps $0xb1, %xmm9 , %xmm9, %xmm9 + +#endif + + // multiply with ALPHA_R + vmulps %xmm8 , %xmm0, %xmm8 + + // multiply with ALPHA_I + vmulps %xmm9 , %xmm1, %xmm9 + + vaddsubps %xmm9, %xmm8 , %xmm8 + + + +#ifndef TRMMKERNEL + + vmovsd (CO1), %xmm14 + vaddps %xmm14, %xmm8 , %xmm8 + 
+#endif + + vmovsd %xmm8 , (CO1) + + + + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + + + + + + + +.L999: + movq SP, %rsp + movq (%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE diff --git a/param.h b/param.h index a833085d3..a6990f7c3 100644 --- a/param.h +++ b/param.h @@ -348,25 +348,29 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if defined(ARCH_X86_64) #define SGEMM_DEFAULT_P 768 #define DGEMM_DEFAULT_P 384 +#define ZGEMM_DEFAULT_P 384 +#define CGEMM_DEFAULT_P 768 #else #define SGEMM_DEFAULT_P 448 #define DGEMM_DEFAULT_P 480 +#define ZGEMM_DEFAULT_P 112 +#define CGEMM_DEFAULT_P 224 #endif #define QGEMM_DEFAULT_P 112 -#define CGEMM_DEFAULT_P 224 -#define ZGEMM_DEFAULT_P 112 #define XGEMM_DEFAULT_P 56 #if defined(ARCH_X86_64) #define SGEMM_DEFAULT_Q 192 #define DGEMM_DEFAULT_Q 168 +#define ZGEMM_DEFAULT_Q 168 +#define CGEMM_DEFAULT_Q 168 #else #define SGEMM_DEFAULT_Q 224 #define DGEMM_DEFAULT_Q 224 +#define ZGEMM_DEFAULT_Q 224 +#define CGEMM_DEFAULT_Q 224 #endif #define QGEMM_DEFAULT_Q 224 -#define CGEMM_DEFAULT_Q 224 -#define ZGEMM_DEFAULT_Q 224 #define XGEMM_DEFAULT_Q 224 #define SGEMM_DEFAULT_R 12288 From 5118a7f4d1f701a23f76c7c4cbcdead7d5566b86 Mon Sep 17 00:00:00 2001 From: wernsaar Date: Thu, 31 Oct 2013 11:53:26 +0100 Subject: [PATCH 06/15] small optimizations on dgemm_kernel for Piledriver --- kernel/x86_64/KERNEL.PILEDRIVER | 5 +- kernel/x86_64/dgemm_kernel_8x2_piledriver.S | 142 +++++--------------- param.h | 2 +- 3 files changed, 41 insertions(+), 108 deletions(-) diff --git a/kernel/x86_64/KERNEL.PILEDRIVER b/kernel/x86_64/KERNEL.PILEDRIVER index 6c262c774..abed953c3 100644 --- a/kernel/x86_64/KERNEL.PILEDRIVER +++ b/kernel/x86_64/KERNEL.PILEDRIVER @@ -54,9 +54,10 @@ STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c -DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +DTRSMKERNEL_LT = dtrsm_kernel_LT_8x2_bulldozer.S +DTRSMKERNEL_RN = dtrsm_kernel_RN_8x2_bulldozer.S DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c diff --git a/kernel/x86_64/dgemm_kernel_8x2_piledriver.S b/kernel/x86_64/dgemm_kernel_8x2_piledriver.S index c719e96fc..cc0ebef8a 100644 --- a/kernel/x86_64/dgemm_kernel_8x2_piledriver.S +++ b/kernel/x86_64/dgemm_kernel_8x2_piledriver.S @@ -28,37 +28,38 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
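The vshufps $0xb1 / vaddsubps pairs that close every tile above are the SSE3 idiom for complex arithmetic on interleaved (real, imaginary) single-precision pairs: the shuffle swaps the two 32-bit halves of each 64-bit pair, and vaddsubps subtracts in the even lanes while adding in the odd ones. A minimal C sketch of the same multiply-by-alpha step, assuming SSE3 intrinsics (cmul2_by_alpha is an invented helper name, not part of the kernel):

    #include <pmmintrin.h>  /* SSE3: _mm_addsub_ps, _mm_moveldup_ps, _mm_movehdup_ps */

    /* Multiply two interleaved complex floats (r0,i0,r1,i1) by alpha,
       passed pre-duplicated as (ar,ai,ar,ai). */
    static inline __m128 cmul2_by_alpha(__m128 x, __m128 alpha)
    {
        __m128 ar = _mm_moveldup_ps(alpha);        /* (ar,ar,ar,ar)             */
        __m128 ai = _mm_movehdup_ps(alpha);        /* (ai,ai,ai,ai)             */
        __m128 xs = _mm_shuffle_ps(x, x, 0xb1);    /* swap re/im: (i0,r0,i1,r1) */
        /* even lanes: r*ar - i*ai,  odd lanes: i*ar + r*ai */
        return _mm_addsub_ps(_mm_mul_ps(x, ar), _mm_mul_ps(xs, ai));
    }

The kernel reaches the same result with two vbroadcastss loads of ALPHA_R and ALPHA_I; the NN/NT/TN/... conditional blocks only change which accumulator gets swapped before the addsub, which is how the conjugation variants fall out of one code path.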
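Likewise worth decoding is the K/KK/KKK arithmetic bracketing every tile: in the TRMM case a tile multiplies against a triangular panel, so only part of the inner dimension contributes, and the effective depth depends on the tile's position. A scalar sketch of the rule the #if ladder encodes (function and parameter names invented for illustration; unroll stands for the tile width that the addq $4/$2/$1 instructions account for):

    /* Effective inner-loop depth for one tile. */
    static long effective_depth(long K, long KK, int trmm,
                                int left, int transa, long unroll)
    {
        if (!trmm)
            return K;          /* plain GEMM: always the full depth        */
        if (left != transa)
            return K - KK;     /* tile sees the trailing part of the panel */
        return KK + unroll;    /* tile sees the leading part plus its own
                                  width; KK itself advances tile by tile   */
    }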
/********************************************************************* * -* 2013/10/18 Saar +* 2013/10/31 Saar * BLASTEST : OK * CTEST : OK * TEST : OK * * -* 2013/10/18 Saar +* 2013/10/31 Saar * * Parameter: * UNROLL_M 8 * UNROLL_N 2 -* DGEMM_P 384 +* DGEMM_P 768 * DGEMM_Q 168 * DGEMM_R 12288 -* A_PR1 256 +* A_PR1 512 +* B_PR1 256 * * Performance at m x n on AMD 8320 (ACML-Version: 5.3.1): * -* 6144x6912 84.1 GFLOPS with 8 threads on 4 modules (ACML: 81.4 GFLOPS) -* 6144x6912 81.2 GFLOPS with 4 threads on 4 modules (ACML: 81.3 GFLOPS) -* 6144x6912 40.9 GFLOPS with 2 threads on 2 modules (ACML: 41.8 GFLOPS) -* 6144x6912 20.5 GFLOPS with 1 threads on 1 modules (ACML: 21.0 GFLOPS) +* 4608x4608 83.9 GFLOPS with 8 threads on 4 modules (ACML: 78.4 GFLOPS) +* 4608x4608 80.9 GFLOPS with 4 threads on 4 modules (ACML: 78.4 GFLOPS) +* 4608x4608 41.3 GFLOPS with 2 threads on 2 modules (ACML: 40.9 GFLOPS) +* 4608x4608 20.7 GFLOPS with 1 threads on 1 modules (ACML: 20.8 GFLOPS) * * Performance at m x n on AMD 6380 (ACML-Version: 5.3.1): * -* 12288x13824 244.5 GFLOPS with 32 threads on 16 modules (ACML: 120.3 GFLOPS) !strange thermal behavior -* 12288x13824 233.9 GFLOPS with 16 threads on 16 modules (ACML: 129.5 GFLOPS) !strange thermal behavior -* 12288x13824 138.1 GFLOPS with 8 threads on 8 modules (ACML: 106.5 GFLOPS) -* 6144x6912 73.6 GFLOPS with 4 threads on 4 modules (ACML: 59.4 GFLOPS) -* 6144x6912 36.8 GFLOPS with 2 threads on 2 modules (ACML: 34.9 GFLOPS) -* 6144x6912 18.7 GFLOPS with 1 threads on 1 modules (ACML: 18.7 GFLOPS) +* 13824x13824 234.5 GFLOPS with 32 threads on 16 modules (ACML: 88.5 GFLOPS) !strange thermal behavior +* 13824x13824 241.9 GFLOPS with 16 threads on 16 modules (ACML: 191.5 GFLOPS) !strange thermal behavior +* 9216x9216 137.6 GFLOPS with 8 threads on 8 modules (ACML: 106.5 GFLOPS) +* 4608x4608 75.7 GFLOPS with 4 threads on 4 modules (ACML: 56.3 GFLOPS) +* 4608x4608 38.6 GFLOPS with 2 threads on 2 modules (ACML: 34.1 GFLOPS) +* 4608x4608 19.6 GFLOPS with 1 threads on 1 modules (ACML: 18.3 GFLOPS) * *********************************************************************/ @@ -168,9 +169,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -#define A_PR1 256 +#define A_PR1 512 #define B_PR1 256 -#define C_PR1 256 +#define C_PR1 64 .macro INIT8x3 vxorpd %xmm4 , %xmm4 , %xmm4 @@ -454,9 +455,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro SAVE8x3 vmovddup ALPHA, %xmm0 - prefetcht0 C_PR1(CO1) - prefetcht0 C_PR1(CO1,LDC) - prefetcht0 C_PR1(CO1,LDC,2) vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 vfmaddpd (CO1, LDC),%xmm0, %xmm5,%xmm5 @@ -489,6 +487,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmovups %xmm12, 4 * SIZE(CO1, LDC, 2) vmovups %xmm15, 6 * SIZE(CO1, LDC, 2) + prefetcht0 C_PR1(CO1) + prefetcht0 C_PR1(CO1,LDC) + prefetcht0 C_PR1(CO1,LDC,2) + addq $8 * SIZE, CO1 # coffset += 8 .endm @@ -1284,6 +1286,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. cmpq $3, %rax jl .L6_13 + prefetcht0 B_PR1(BO) + prefetcht0 B_PR1+64(BO) + prefetcht0 B_PR1+128(BO) KERNEL8x3_INIT KERNEL8x3_M2 KERNEL8x3_M3 @@ -1299,22 +1304,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
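The DGEMM_P/DGEMM_Q/DGEMM_R values quoted in the header above are not kernel constants but cache-blocking sizes consumed by the level-3 driver. A compressed sketch of the loop nest they control, assuming simplified names (the real driver also packs each A and B panel into contiguous buffers before calling the 8x2 micro-kernel):

    /* C += A*B, column-major, with three-level blocking: P rows of A,
       Q of depth, R columns of B per pass (768/168/12288 above), each
       chosen so the active panels stay cache-resident. */
    static void dgemm_blocked(long m, long n, long k,
                              const double *A, long lda,
                              const double *B, long ldb,
                              double *C, long ldc,
                              long P, long Q, long R)
    {
        for (long js = 0; js < n; js += R)
            for (long ks = 0; ks < k; ks += Q)
                for (long is = 0; is < m; is += P)
                    /* micro-kernel stand-in for one is/ks/js block: */
                    for (long j = js; j < js + R && j < n; j++)
                        for (long i = is; i < is + P && i < m; i++) {
                            double s = C[i + j*ldc];
                            for (long p = ks; p < ks + Q && p < k; p++)
                                s += A[i + p*lda] * B[p + j*ldb];
                            C[i + j*ldc] = s;
                        }
    }

Raising DGEMM_P from 384 to 768, as this patch's param.h hunk further below does, doubles the A panel per pass; whether that pays off is exactly what the before/after GFLOPS tables above measure.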
.L6_12: + prefetcht0 B_PR1-24(BO) + prefetcht0 B_PR1+40(BO) KERNEL8x3_M1 KERNEL8x3_M2 KERNEL8x3_M3 KERNEL8x3_M4 KERNEL8x3_M5 + prefetcht0 B_PR1+104(BO) KERNEL8x3_M6 KERNEL8x3_M7 KERNEL8x3_M8 dec %rax - //je .L6_12_E - jne .L6_12 .L6_12_E: + prefetcht0 B_PR1(BO) + prefetcht0 B_PR1+64(BO) KERNEL8x3_M1 KERNEL8x3_M2 KERNEL8x3_M3 @@ -1432,31 +1440,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .L6_22: - //prefetcht0 B_PR1(BO,BI,8) KERNEL4x3_1(xxx) KERNEL4x3_2(xxx) KERNEL4x3_3(xxx) - //prefetcht0 B_PR1+64(BO,BI,8) KERNEL4x3_4(xxx) KERNEL4x3_1(xxx) KERNEL4x3_2(xxx) - //prefetcht0 B_PR1+32(BO,BI,8) KERNEL4x3_3(xxx) KERNEL4x3_4(xxx) je .L6_26 - //prefetcht0 B_PR1(BO,BI,8) KERNEL4x3_1(xxx) KERNEL4x3_2(xxx) KERNEL4x3_3(xxx) - //prefetcht0 B_PR1+64(BO,BI,8) KERNEL4x3_4(xxx) KERNEL4x3_1(xxx) KERNEL4x3_2(xxx) - //prefetcht0 B_PR1+32(BO,BI,8) KERNEL4x3_3(xxx) KERNEL4x3_4(xxx) @@ -1548,31 +1550,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .L6_32: - //prefetcht0 B_PR1(BO,BI,8) KERNEL2x3_1(xxx) KERNEL2x3_2(xxx) KERNEL2x3_3(xxx) - //prefetcht0 B_PR1+64(BO,BI,8) KERNEL2x3_4(xxx) KERNEL2x3_1(xxx) KERNEL2x3_2(xxx) - //prefetcht0 B_PR1+32(BO,BI,8) KERNEL2x3_3(xxx) KERNEL2x3_4(xxx) je .L6_36 - //prefetcht0 B_PR1(BO,BI,8) KERNEL2x3_1(xxx) KERNEL2x3_2(xxx) KERNEL2x3_3(xxx) - //prefetcht0 B_PR1+64(BO,BI,8) KERNEL2x3_4(xxx) KERNEL2x3_1(xxx) KERNEL2x3_2(xxx) - //prefetcht0 B_PR1+32(BO,BI,8) KERNEL2x3_3(xxx) KERNEL2x3_4(xxx) @@ -1651,31 +1647,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .L6_42: - //prefetcht0 B_PR1(BO,BI,8) KERNEL1x3_1(xxx) KERNEL1x3_2(xxx) KERNEL1x3_3(xxx) - //prefetcht0 B_PR1+64(BO,BI,8) KERNEL1x3_4(xxx) KERNEL1x3_1(xxx) KERNEL1x3_2(xxx) - //prefetcht0 B_PR1+32(BO,BI,8) KERNEL1x3_3(xxx) KERNEL1x3_4(xxx) je .L6_46 - //prefetcht0 B_PR1(BO,BI,8) KERNEL1x3_1(xxx) KERNEL1x3_2(xxx) KERNEL1x3_3(xxx) - //prefetcht0 B_PR1+64(BO,BI,8) KERNEL1x3_4(xxx) KERNEL1x3_1(xxx) KERNEL1x3_2(xxx) - //prefetcht0 B_PR1+32(BO,BI,8) KERNEL1x3_3(xxx) KERNEL1x3_4(xxx) @@ -1753,6 +1743,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. cmpq $3, %rax jl .L7_13 + prefetcht0 B_PR1(BO) + prefetcht0 B_PR1+64(BO) + prefetcht0 B_PR1+128(BO) KERNEL8x3_INIT KERNEL8x3_M2 KERNEL8x3_M3 @@ -1768,22 +1761,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .L7_12: + prefetcht0 B_PR1-24(BO) + prefetcht0 B_PR1+40(BO) KERNEL8x3_M1 KERNEL8x3_M2 KERNEL8x3_M3 KERNEL8x3_M4 + prefetcht0 B_PR1+104(BO) KERNEL8x3_M5 KERNEL8x3_M6 KERNEL8x3_M7 KERNEL8x3_M8 dec %rax - //je .L7_12_E - jne .L7_12 .L7_12_E: + prefetcht0 B_PR1(BO) + prefetcht0 B_PR1+64(BO) KERNEL8x3_M1 KERNEL8x3_M2 KERNEL8x3_M3 @@ -1904,31 +1900,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .L7_22: - //prefetcht0 B_PR1(BO,BI,8) KERNEL4x3_1(xxx) KERNEL4x3_2(xxx) KERNEL4x3_3(xxx) - //prefetcht0 B_PR1+64(BO,BI,8) KERNEL4x3_4(xxx) KERNEL4x3_1(xxx) KERNEL4x3_2(xxx) - //prefetcht0 B_PR1+32(BO,BI,8) KERNEL4x3_3(xxx) KERNEL4x3_4(xxx) je .L7_26 - //prefetcht0 B_PR1(BO,BI,8) KERNEL4x3_1(xxx) KERNEL4x3_2(xxx) KERNEL4x3_3(xxx) - //prefetcht0 B_PR1+64(BO,BI,8) KERNEL4x3_4(xxx) KERNEL4x3_1(xxx) KERNEL4x3_2(xxx) - //prefetcht0 B_PR1+32(BO,BI,8) KERNEL4x3_3(xxx) KERNEL4x3_4(xxx) @@ -2019,31 +2009,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
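The prefetcht0 lines threaded into .L6_12/.L7_12 above, and the A_PR1 bump from 256 to 512 bytes, tune one thing: how far ahead of the current position the kernel requests cache lines, so upcoming loads hit cache instead of memory. The same knob in portable C (DIST plays the role of A_PR1/B_PR1; values like these are found empirically, which is what the performance tables in the header track):

    /* Dot-product skeleton with an explicit prefetch distance. */
    static double dot_prefetch(const double *a, const double *b, long n)
    {
        const long DIST = 512;  /* bytes ahead, the analogue of A_PR1 */
        double s = 0.0;
        for (long i = 0; i < n; i++) {
            __builtin_prefetch((const char *)(a + i) + DIST, 0, 3);
            __builtin_prefetch((const char *)(b + i) + DIST, 0, 3);
            s += a[i] * b[i];
        }
        return s;
    }

A real kernel issues roughly one prefetch per 64-byte cache line rather than one per element, which is why the unrolled loops above carry only a couple of prefetcht0 instructions per eight KERNEL macro expansions.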
.L7_32: - //prefetcht0 B_PR1(BO,BI,8) KERNEL2x3_1(xxx) KERNEL2x3_2(xxx) KERNEL2x3_3(xxx) - //prefetcht0 B_PR1+64(BO,BI,8) KERNEL2x3_4(xxx) KERNEL2x3_1(xxx) KERNEL2x3_2(xxx) - //prefetcht0 B_PR1+32(BO,BI,8) KERNEL2x3_3(xxx) KERNEL2x3_4(xxx) je .L7_36 - //prefetcht0 B_PR1(BO,BI,8) KERNEL2x3_1(xxx) KERNEL2x3_2(xxx) KERNEL2x3_3(xxx) - //prefetcht0 B_PR1+64(BO,BI,8) KERNEL2x3_4(xxx) KERNEL2x3_1(xxx) KERNEL2x3_2(xxx) - //prefetcht0 B_PR1+32(BO,BI,8) KERNEL2x3_3(xxx) KERNEL2x3_4(xxx) @@ -2127,31 +2111,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .L7_42: - //prefetcht0 B_PR1(BO,BI,8) KERNEL1x3_1(xxx) KERNEL1x3_2(xxx) KERNEL1x3_3(xxx) - //prefetcht0 B_PR1+64(BO,BI,8) KERNEL1x3_4(xxx) KERNEL1x3_1(xxx) KERNEL1x3_2(xxx) - //prefetcht0 B_PR1+32(BO,BI,8) KERNEL1x3_3(xxx) KERNEL1x3_4(xxx) je .L7_46 - //prefetcht0 B_PR1(BO,BI,8) KERNEL1x3_1(xxx) KERNEL1x3_2(xxx) KERNEL1x3_3(xxx) - //prefetcht0 B_PR1+64(BO,BI,8) KERNEL1x3_4(xxx) KERNEL1x3_1(xxx) KERNEL1x3_2(xxx) - //prefetcht0 B_PR1+32(BO,BI,8) KERNEL1x3_3(xxx) KERNEL1x3_4(xxx) @@ -2277,13 +2255,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .L2_12: - //prefetcht0 B_PR1(BO,BI,8) KERNEL8x2_1(xxx) KERNEL8x2_2(xxx) KERNEL8x2_3(xxx) KERNEL8x2_4(xxx) - //prefetcht0 B_PR1(BO,BI,8) KERNEL8x2_1(xxx) KERNEL8x2_2(xxx) KERNEL8x2_3(xxx) @@ -2291,13 +2267,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. je .L2_16 - //prefetcht0 B_PR1(BO,BI,8) KERNEL8x2_1(xxx) KERNEL8x2_2(xxx) KERNEL8x2_3(xxx) KERNEL8x2_4(xxx) - //prefetcht0 B_PR1(BO,BI,8) KERNEL8x2_1(xxx) KERNEL8x2_2(xxx) KERNEL8x2_3(xxx) @@ -2399,13 +2373,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .L2_22: - //prefetcht0 B_PR1(BO,BI,8) KERNEL4x2_1(xxx) KERNEL4x2_2(xxx) KERNEL4x2_3(xxx) KERNEL4x2_4(xxx) - //prefetcht0 B_PR1(BO,BI,8) KERNEL4x2_1(xxx) KERNEL4x2_2(xxx) KERNEL4x2_3(xxx) @@ -2413,13 +2385,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. je .L2_26 - //prefetcht0 B_PR1(BO,BI,8) KERNEL4x2_1(xxx) KERNEL4x2_2(xxx) KERNEL4x2_3(xxx) KERNEL4x2_4(xxx) - //prefetcht0 B_PR1(BO,BI,8) KERNEL4x2_1(xxx) KERNEL4x2_2(xxx) KERNEL4x2_3(xxx) @@ -2503,13 +2473,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .L2_32: - //prefetcht0 B_PR1(BO,BI,8) KERNEL2x2_1(xxx) KERNEL2x2_2(xxx) KERNEL2x2_3(xxx) KERNEL2x2_4(xxx) - //prefetcht0 B_PR1(BO,BI,8) KERNEL2x2_1(xxx) KERNEL2x2_2(xxx) KERNEL2x2_3(xxx) @@ -2517,13 +2485,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. je .L2_36 - //prefetcht0 B_PR1(BO,BI,8) KERNEL2x2_1(xxx) KERNEL2x2_2(xxx) KERNEL2x2_3(xxx) KERNEL2x2_4(xxx) - //prefetcht0 B_PR1(BO,BI,8) KERNEL2x2_1(xxx) KERNEL2x2_2(xxx) KERNEL2x2_3(xxx) @@ -2600,13 +2566,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .L2_42: - //prefetcht0 B_PR1(BO,BI,8) KERNEL1x2_1(xxx) KERNEL1x2_2(xxx) KERNEL1x2_3(xxx) KERNEL1x2_4(xxx) - //prefetcht0 B_PR1(BO,BI,8) KERNEL1x2_1(xxx) KERNEL1x2_2(xxx) KERNEL1x2_3(xxx) @@ -2614,13 +2578,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. je .L2_46 - //prefetcht0 B_PR1(BO,BI,8) KERNEL1x2_1(xxx) KERNEL1x2_2(xxx) KERNEL1x2_3(xxx) KERNEL1x2_4(xxx) - //prefetcht0 B_PR1(BO,BI,8) KERNEL1x2_1(xxx) KERNEL1x2_2(xxx) KERNEL1x2_3(xxx) @@ -2743,7 +2705,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
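Every K loop in these kernels shares one skeleton: andq $-8 peels off the largest multiple of eight, the unrolled body runs eight KERNELnxm_* expansions per loop-closing branch (in the .L7_22/.L7_32/.L7_42 loops above, sixteen per backward jmp with a je escape halfway), and andq $7 mops up the remainder through the _SUB variant. The same shape in C, as a sketch:

    static double dot_unrolled(const double *a, const double *b, long k)
    {
        double s = 0.0;
        long k8 = k & ~7L;            /* andq $-8, %rax */
        long i = 0;
        for (; i < k8; i += 8) {      /* eight updates per branch test */
            s += a[i+0]*b[i+0]; s += a[i+1]*b[i+1];
            s += a[i+2]*b[i+2]; s += a[i+3]*b[i+3];
            s += a[i+4]*b[i+4]; s += a[i+5]*b[i+5];
            s += a[i+6]*b[i+6]; s += a[i+7]*b[i+7];
        }
        for (; i < k; i++)            /* andq $7 remainder (_SUB path) */
            s += a[i]*b[i];
        return s;
    }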
.L1_12: - //prefetcht0 B_PR1(BO,BI,8) KERNEL8x1_1(xxx) KERNEL8x1_2(xxx) KERNEL8x1_3(xxx) @@ -2756,7 +2717,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. je .L1_16 - //prefetcht0 B_PR1(BO,BI,8) KERNEL8x1_1(xxx) KERNEL8x1_2(xxx) KERNEL8x1_3(xxx) @@ -2851,7 +2811,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .L1_22: - //prefetcht0 B_PR1(BO,BI,8) KERNEL4x1_1(xxx) KERNEL4x1_2(xxx) KERNEL4x1_3(xxx) @@ -2864,7 +2823,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. je .L1_26 - //prefetcht0 B_PR1(BO,BI,8) KERNEL4x1_1(xxx) KERNEL4x1_2(xxx) KERNEL4x1_3(xxx) @@ -2946,7 +2904,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .L1_32: - //prefetcht0 B_PR1(BO,BI,8) KERNEL2x1_1(xxx) KERNEL2x1_2(xxx) KERNEL2x1_3(xxx) @@ -3036,7 +2993,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .L1_42: - //prefetcht0 B_PR1(BO,BI,8) KERNEL1x1_1(xxx) KERNEL1x1_2(xxx) KERNEL1x1_3(xxx) @@ -3049,7 +3005,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. je .L1_46 - //prefetcht0 B_PR1(BO,BI,8) KERNEL1x1_1(xxx) KERNEL1x1_2(xxx) KERNEL1x1_3(xxx) @@ -3317,13 +3272,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .L2_12: - //prefetcht0 B_PR1(BO,BI,8) KERNEL8x2_1(xxx) KERNEL8x2_2(xxx) KERNEL8x2_3(xxx) KERNEL8x2_4(xxx) - //prefetcht0 B_PR1(BO,BI,8) KERNEL8x2_1(xxx) KERNEL8x2_2(xxx) KERNEL8x2_3(xxx) @@ -3331,13 +3284,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. je .L2_16 - //prefetcht0 B_PR1(BO,BI,8) KERNEL8x2_1(xxx) KERNEL8x2_2(xxx) KERNEL8x2_3(xxx) KERNEL8x2_4(xxx) - //prefetcht0 B_PR1(BO,BI,8) KERNEL8x2_1(xxx) KERNEL8x2_2(xxx) KERNEL8x2_3(xxx) @@ -3502,13 +3453,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .L2_22: - //prefetcht0 B_PR1(BO,BI,8) KERNEL4x2_1(xxx) KERNEL4x2_2(xxx) KERNEL4x2_3(xxx) KERNEL4x2_4(xxx) - //prefetcht0 B_PR1(BO,BI,8) KERNEL4x2_1(xxx) KERNEL4x2_2(xxx) KERNEL4x2_3(xxx) @@ -3516,13 +3465,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. je .L2_26 - //prefetcht0 B_PR1(BO,BI,8) KERNEL4x2_1(xxx) KERNEL4x2_2(xxx) KERNEL4x2_3(xxx) KERNEL4x2_4(xxx) - //prefetcht0 B_PR1(BO,BI,8) KERNEL4x2_1(xxx) KERNEL4x2_2(xxx) KERNEL4x2_3(xxx) @@ -3667,13 +3614,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .L2_32: - //prefetcht0 B_PR1(BO,BI,8) KERNEL2x2_1(xxx) KERNEL2x2_2(xxx) KERNEL2x2_3(xxx) KERNEL2x2_4(xxx) - //prefetcht0 B_PR1(BO,BI,8) KERNEL2x2_1(xxx) KERNEL2x2_2(xxx) KERNEL2x2_3(xxx) @@ -3681,13 +3626,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. je .L2_36 - //prefetcht0 B_PR1(BO,BI,8) KERNEL2x2_1(xxx) KERNEL2x2_2(xxx) KERNEL2x2_3(xxx) KERNEL2x2_4(xxx) - //prefetcht0 B_PR1(BO,BI,8) KERNEL2x2_1(xxx) KERNEL2x2_2(xxx) KERNEL2x2_3(xxx) @@ -3818,13 +3761,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .L2_42: - //prefetcht0 B_PR1(BO,BI,8) KERNEL1x2_1(xxx) KERNEL1x2_2(xxx) KERNEL1x2_3(xxx) KERNEL1x2_4(xxx) - //prefetcht0 B_PR1(BO,BI,8) KERNEL1x2_1(xxx) KERNEL1x2_2(xxx) KERNEL1x2_3(xxx) @@ -3832,13 +3773,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
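One more idiom running through the cgemm and dgemm loops in this series (it predates this patch): after sizing the panels, the setup code executes leaq (AO,%rax,SIZE),AO / negq %rax — it points the base registers one panel past the data and runs a negative index up toward zero, so a single increment drives both streams and the jl that closes the loop reads the sign flag with no separate compare. In C:

    static double sum_products(const double *a, const double *b, long n)
    {
        if (n <= 0) return 0.0;    /* the asm guards this with a je */
        const double *ae = a + n;  /* leaq (AO, %rax, SIZE), AO     */
        const double *be = b + n;
        long i = -n;               /* negq %rax                     */
        double s = 0.0;
        do {
            s += ae[i] * be[i];    /* negative offsets walk forward */
        } while (++i < 0);         /* jl: sign flag is the loop test */
        return s;
    }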
je .L2_46 - //prefetcht0 B_PR1(BO,BI,8) KERNEL1x2_1(xxx) KERNEL1x2_2(xxx) KERNEL1x2_3(xxx) KERNEL1x2_4(xxx) - //prefetcht0 B_PR1(BO,BI,8) KERNEL1x2_1(xxx) KERNEL1x2_2(xxx) KERNEL1x2_3(xxx) @@ -4023,7 +3962,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .L1_12: - //prefetcht0 B_PR1(BO,BI,8) KERNEL8x1_1(xxx) KERNEL8x1_2(xxx) KERNEL8x1_3(xxx) @@ -4036,7 +3974,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. je .L1_16 - //prefetcht0 B_PR1(BO,BI,8) KERNEL8x1_1(xxx) KERNEL8x1_2(xxx) KERNEL8x1_3(xxx) @@ -4186,7 +4123,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .L1_22: - //prefetcht0 B_PR1(BO,BI,8) KERNEL4x1_1(xxx) KERNEL4x1_2(xxx) KERNEL4x1_3(xxx) @@ -4199,7 +4135,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. je .L1_26 - //prefetcht0 B_PR1(BO,BI,8) KERNEL4x1_1(xxx) KERNEL4x1_2(xxx) KERNEL4x1_3(xxx) @@ -4335,7 +4270,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .L1_32: - //prefetcht0 B_PR1(BO,BI,8) KERNEL2x1_1(xxx) KERNEL2x1_2(xxx) KERNEL2x1_3(xxx) @@ -4476,7 +4410,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .L1_42: - //prefetcht0 B_PR1(BO,BI,8) KERNEL1x1_1(xxx) KERNEL1x1_2(xxx) KERNEL1x1_3(xxx) @@ -4489,7 +4422,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. je .L1_46 - //prefetcht0 B_PR1(BO,BI,8) KERNEL1x1_1(xxx) KERNEL1x1_2(xxx) KERNEL1x1_3(xxx) diff --git a/param.h b/param.h index a6990f7c3..c4d15323a 100644 --- a/param.h +++ b/param.h @@ -347,7 +347,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if defined(ARCH_X86_64) #define SGEMM_DEFAULT_P 768 -#define DGEMM_DEFAULT_P 384 +#define DGEMM_DEFAULT_P 768 #define ZGEMM_DEFAULT_P 384 #define CGEMM_DEFAULT_P 768 #else From 6da558d2abe339328718ccce7ca7b1b16a8fcae7 Mon Sep 17 00:00:00 2001 From: wernsaar Date: Wed, 13 Nov 2013 17:39:13 +0100 Subject: [PATCH 07/15] changes for compatibility with Pathscale compiler --- common_x86.h | 15 +- common_x86_64.h | 10 + kernel/x86_64/dgemm_kernel_8x2_piledriver.S | 260 ++++++++++---------- 3 files changed, 150 insertions(+), 135 deletions(-) diff --git a/common_x86.h b/common_x86.h index 48517d900..8245f7078 100644 --- a/common_x86.h +++ b/common_x86.h @@ -301,12 +301,25 @@ REALNAME: #define PROFCODE #endif + +#if defined(C_PATHSCALE) || defined(OS_DARWIN) + #define EPILOGUE \ - .size REALNAME, .-REALNAME; \ + .size REALNAME, .-REALNAME; \ + .section .note.GNU-stack,"",@progbits + +#else + +#define EPILOGUE \ + .size REALNAME, .-REALNAME; \ .section .note.GNU-stack,"",%progbits #endif + + +#endif + #ifdef XDOUBLE #define FLD fldt #define FST fstpt diff --git a/common_x86_64.h b/common_x86_64.h index 188903848..4fe23448f 100644 --- a/common_x86_64.h +++ b/common_x86_64.h @@ -372,10 +372,20 @@ REALNAME: #define PROFCODE #endif +#if defined(C_PATHSCALE) || defined(OS_DARWIN) + +#define EPILOGUE \ + .size REALNAME, .-REALNAME; \ + .section .note.GNU-stack,"",@progbits + +#else + #define EPILOGUE \ .size REALNAME, .-REALNAME; \ .section .note.GNU-stack,"",%progbits +#endif + #endif diff --git a/kernel/x86_64/dgemm_kernel_8x2_piledriver.S b/kernel/x86_64/dgemm_kernel_8x2_piledriver.S index cc0ebef8a..8585d45de 100644 --- a/kernel/x86_64/dgemm_kernel_8x2_piledriver.S +++ b/kernel/x86_64/dgemm_kernel_8x2_piledriver.S @@ -28,7 +28,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
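Patch 07's common_x86.h/common_x86_64.h hunks above look cosmetic but are load-bearing: .note.GNU-stack is the empty marker section that tells the GNU linker the object file needs no executable stack, and the section-type sigil (%progbits versus @progbits) is parsed differently by different assembler front ends — that is all the new C_PATHSCALE/OS_DARWIN branch selects. Reduced to a stand-alone preprocessor sketch (the two spellings are equivalent wherever both are accepted):

    /* EPILOGUE is pasted at the end of every assembly routine. */
    #if defined(C_PATHSCALE) || defined(OS_DARWIN)
    #define EPILOGUE \
            .size REALNAME, .-REALNAME; \
            .section .note.GNU-stack,"",@progbits
    #else
    #define EPILOGUE \
            .size REALNAME, .-REALNAME; \
            .section .note.GNU-stack,"",%progbits
    #endif

Patches 09 and 10 later in this series collapse the two branches again once @progbits proves acceptable everywhere, which is why the same lines churn several times.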
/********************************************************************* * -* 2013/10/31 Saar +* 2013/11/13 Saar * BLASTEST : OK * CTEST : OK * TEST : OK @@ -144,25 +144,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define STACK_TOUCH #endif -#if defined(BULLDOZER1) +#if defined(BULLDOZER) -.macro VFMADD231PD_ y1,y2,y0 - vfmaddpd \y0,\y1,\y2,\y0 -.endm +#define VFMADD231PD_( y1,y2,y0 ) vfmaddpd y0,y1,y2,y0 -.macro VFMADD231SD_ x1,x2,x0 - vfmaddsd \x0,\x1,\x2,\x0 -.endm +#define VFMADD231SD_( x1,x2,x0 ) vfmaddsd x0,x1,x2,x0 #else -.macro VFMADD231PD_ y1,y2,y0 - vfmadd231pd \y2,\y1,\y0 -.endm +#define VFMADD231PD_( y1,y2,y0 ) vfmadd231pd y2,y1,y0 -.macro VFMADD231SD_ x1,x2,x0 - vfmadd231sd \x2,\x1,\x0 -.endm +#define VFMADD231SD_( x1,x2,x0 ) vfmadd231sd x2,x1,x0 #endif @@ -218,46 +210,46 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL8x3_M1 vmovups -16 * SIZE(AO), %xmm0 prefetcht0 A_PR1(AO) - VFMADD231PD_ %xmm1,%xmm0,%xmm4 - VFMADD231PD_ %xmm2,%xmm0,%xmm5 - VFMADD231PD_ %xmm3,%xmm0,%xmm6 + VFMADD231PD_( %xmm1,%xmm0,%xmm4 ) + VFMADD231PD_( %xmm2,%xmm0,%xmm5 ) + VFMADD231PD_( %xmm3,%xmm0,%xmm6 ) vmovups -14 * SIZE(AO), %xmm0 - VFMADD231PD_ %xmm1,%xmm0,%xmm7 - VFMADD231PD_ %xmm2,%xmm0,%xmm8 - VFMADD231PD_ %xmm3,%xmm0,%xmm9 + VFMADD231PD_( %xmm1,%xmm0,%xmm7 ) + VFMADD231PD_( %xmm2,%xmm0,%xmm8 ) + VFMADD231PD_( %xmm3,%xmm0,%xmm9 ) vmovups -12 * SIZE(AO), %xmm0 - VFMADD231PD_ %xmm1,%xmm0,%xmm10 - VFMADD231PD_ %xmm2,%xmm0,%xmm11 - VFMADD231PD_ %xmm3,%xmm0,%xmm12 + VFMADD231PD_( %xmm1,%xmm0,%xmm10 ) + VFMADD231PD_( %xmm2,%xmm0,%xmm11 ) + VFMADD231PD_( %xmm3,%xmm0,%xmm12 ) vmovups -10 * SIZE(AO), %xmm0 - VFMADD231PD_ %xmm1,%xmm0,%xmm13 + VFMADD231PD_( %xmm1,%xmm0,%xmm13 ) vmovddup -12 * SIZE(BO), %xmm1 - VFMADD231PD_ %xmm2,%xmm0,%xmm14 + VFMADD231PD_( %xmm2,%xmm0,%xmm14 ) vmovddup -11 * SIZE(BO), %xmm2 - VFMADD231PD_ %xmm3,%xmm0,%xmm15 + VFMADD231PD_( %xmm3,%xmm0,%xmm15 ) .endm .macro KERNEL8x3_M2 vmovups -8 * SIZE(AO), %xmm0 prefetcht0 A_PR1+64(AO) vmovddup -10 * SIZE(BO), %xmm3 - VFMADD231PD_ %xmm1,%xmm0,%xmm4 - VFMADD231PD_ %xmm2,%xmm0,%xmm5 - VFMADD231PD_ %xmm3,%xmm0,%xmm6 + VFMADD231PD_( %xmm1,%xmm0,%xmm4 ) + VFMADD231PD_( %xmm2,%xmm0,%xmm5 ) + VFMADD231PD_( %xmm3,%xmm0,%xmm6 ) vmovups -6 * SIZE(AO), %xmm0 - VFMADD231PD_ %xmm1,%xmm0,%xmm7 - VFMADD231PD_ %xmm2,%xmm0,%xmm8 - VFMADD231PD_ %xmm3,%xmm0,%xmm9 + VFMADD231PD_( %xmm1,%xmm0,%xmm7 ) + VFMADD231PD_( %xmm2,%xmm0,%xmm8 ) + VFMADD231PD_( %xmm3,%xmm0,%xmm9 ) vmovups -4 * SIZE(AO), %xmm0 - VFMADD231PD_ %xmm1,%xmm0,%xmm10 - VFMADD231PD_ %xmm2,%xmm0,%xmm11 - VFMADD231PD_ %xmm3,%xmm0,%xmm12 + VFMADD231PD_( %xmm1,%xmm0,%xmm10 ) + VFMADD231PD_( %xmm2,%xmm0,%xmm11 ) + VFMADD231PD_( %xmm3,%xmm0,%xmm12 ) vmovups -2 * SIZE(AO), %xmm0 - VFMADD231PD_ %xmm1,%xmm0,%xmm13 + VFMADD231PD_( %xmm1,%xmm0,%xmm13 ) vmovddup -9 * SIZE(BO), %xmm1 - VFMADD231PD_ %xmm2,%xmm0,%xmm14 + VFMADD231PD_( %xmm2,%xmm0,%xmm14 ) vmovddup -8 * SIZE(BO), %xmm2 - VFMADD231PD_ %xmm3,%xmm0,%xmm15 + VFMADD231PD_( %xmm3,%xmm0,%xmm15 ) .endm @@ -265,93 +257,93 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
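The mechanical rewrite above — gas .macro definitions becoming C-preprocessor #defines with parenthesized arguments — sidesteps whatever the Pathscale toolchain disliked about assembler macro arguments: these .S files already pass through the C preprocessor, so a #define behaves identically under any front end. The same two-flavour FMA dispatch expressed with compiler intrinsics instead, as a sketch (fmadd231 is an invented helper name; _mm_macc_pd is the FMA4 intrinsic, _mm_fmadd_pd the FMA3 one):

    #include <immintrin.h>

    #if defined(__FMA4__)      /* Bulldozer: 4-operand, non-destructive    */
    #include <x86intrin.h>
    static inline __m128d fmadd231(__m128d acc, __m128d a, __m128d b)
    {
        return _mm_macc_pd(a, b, acc);     /* vfmaddpd: acc = a*b + acc    */
    }
    #elif defined(__FMA__)     /* Piledriver and later also speak FMA3     */
    static inline __m128d fmadd231(__m128d acc, __m128d a, __m128d b)
    {
        return _mm_fmadd_pd(a, b, acc);    /* vfmadd231pd: acc = a*b + acc */
    }
    #endif

Note the old BULLDOZER1 spelling in the #if also becomes BULLDOZER here, so the FMA4 encodings are again reachable on that target.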
vmovups 0 * SIZE(AO), %xmm0 prefetcht0 A_PR1+128(AO) vmovddup -7 * SIZE(BO), %xmm3 - VFMADD231PD_ %xmm1,%xmm0,%xmm4 - VFMADD231PD_ %xmm2,%xmm0,%xmm5 - VFMADD231PD_ %xmm3,%xmm0,%xmm6 + VFMADD231PD_( %xmm1,%xmm0,%xmm4 ) + VFMADD231PD_( %xmm2,%xmm0,%xmm5 ) + VFMADD231PD_( %xmm3,%xmm0,%xmm6 ) vmovups 2 * SIZE(AO), %xmm0 - VFMADD231PD_ %xmm1,%xmm0,%xmm7 - VFMADD231PD_ %xmm2,%xmm0,%xmm8 - VFMADD231PD_ %xmm3,%xmm0,%xmm9 + VFMADD231PD_( %xmm1,%xmm0,%xmm7 ) + VFMADD231PD_( %xmm2,%xmm0,%xmm8 ) + VFMADD231PD_( %xmm3,%xmm0,%xmm9 ) vmovups 4 * SIZE(AO), %xmm0 - VFMADD231PD_ %xmm1,%xmm0,%xmm10 - VFMADD231PD_ %xmm2,%xmm0,%xmm11 - VFMADD231PD_ %xmm3,%xmm0,%xmm12 + VFMADD231PD_( %xmm1,%xmm0,%xmm10 ) + VFMADD231PD_( %xmm2,%xmm0,%xmm11 ) + VFMADD231PD_( %xmm3,%xmm0,%xmm12 ) vmovups 6 * SIZE(AO), %xmm0 - VFMADD231PD_ %xmm1,%xmm0,%xmm13 + VFMADD231PD_( %xmm1,%xmm0,%xmm13 ) vmovddup -6 * SIZE(BO), %xmm1 - VFMADD231PD_ %xmm2,%xmm0,%xmm14 + VFMADD231PD_( %xmm2,%xmm0,%xmm14 ) vmovddup -5 * SIZE(BO), %xmm2 - VFMADD231PD_ %xmm3,%xmm0,%xmm15 + VFMADD231PD_( %xmm3,%xmm0,%xmm15 ) .endm .macro KERNEL8x3_M4 vmovups 8 * SIZE(AO), %xmm0 prefetcht0 A_PR1+192(AO) vmovddup -4 * SIZE(BO), %xmm3 - VFMADD231PD_ %xmm1,%xmm0,%xmm4 - VFMADD231PD_ %xmm2,%xmm0,%xmm5 - VFMADD231PD_ %xmm3,%xmm0,%xmm6 + VFMADD231PD_( %xmm1,%xmm0,%xmm4 ) + VFMADD231PD_( %xmm2,%xmm0,%xmm5 ) + VFMADD231PD_( %xmm3,%xmm0,%xmm6 ) vmovups 10 * SIZE(AO), %xmm0 - VFMADD231PD_ %xmm1,%xmm0,%xmm7 - VFMADD231PD_ %xmm2,%xmm0,%xmm8 - VFMADD231PD_ %xmm3,%xmm0,%xmm9 + VFMADD231PD_( %xmm1,%xmm0,%xmm7 ) + VFMADD231PD_( %xmm2,%xmm0,%xmm8 ) + VFMADD231PD_( %xmm3,%xmm0,%xmm9 ) vmovups 12 * SIZE(AO), %xmm0 - VFMADD231PD_ %xmm1,%xmm0,%xmm10 - VFMADD231PD_ %xmm2,%xmm0,%xmm11 - VFMADD231PD_ %xmm3,%xmm0,%xmm12 + VFMADD231PD_( %xmm1,%xmm0,%xmm10 ) + VFMADD231PD_( %xmm2,%xmm0,%xmm11 ) + VFMADD231PD_( %xmm3,%xmm0,%xmm12 ) vmovups 14 * SIZE(AO), %xmm0 - VFMADD231PD_ %xmm1,%xmm0,%xmm13 + VFMADD231PD_( %xmm1,%xmm0,%xmm13 ) vmovddup -3 * SIZE(BO), %xmm1 addq $32 * SIZE, AO - VFMADD231PD_ %xmm2,%xmm0,%xmm14 + VFMADD231PD_( %xmm2,%xmm0,%xmm14 ) vmovddup -2 * SIZE(BO), %xmm2 - VFMADD231PD_ %xmm3,%xmm0,%xmm15 + VFMADD231PD_( %xmm3,%xmm0,%xmm15 ) .endm .macro KERNEL8x3_M5 vmovups -16 * SIZE(AO), %xmm0 prefetcht0 A_PR1(AO) vmovddup -1 * SIZE(BO), %xmm3 - VFMADD231PD_ %xmm1,%xmm0,%xmm4 - VFMADD231PD_ %xmm2,%xmm0,%xmm5 - VFMADD231PD_ %xmm3,%xmm0,%xmm6 + VFMADD231PD_( %xmm1,%xmm0,%xmm4 ) + VFMADD231PD_( %xmm2,%xmm0,%xmm5 ) + VFMADD231PD_( %xmm3,%xmm0,%xmm6 ) vmovups -14 * SIZE(AO), %xmm0 - VFMADD231PD_ %xmm1,%xmm0,%xmm7 - VFMADD231PD_ %xmm2,%xmm0,%xmm8 - VFMADD231PD_ %xmm3,%xmm0,%xmm9 + VFMADD231PD_( %xmm1,%xmm0,%xmm7 ) + VFMADD231PD_( %xmm2,%xmm0,%xmm8 ) + VFMADD231PD_( %xmm3,%xmm0,%xmm9 ) vmovups -12 * SIZE(AO), %xmm0 - VFMADD231PD_ %xmm1,%xmm0,%xmm10 - VFMADD231PD_ %xmm2,%xmm0,%xmm11 - VFMADD231PD_ %xmm3,%xmm0,%xmm12 + VFMADD231PD_( %xmm1,%xmm0,%xmm10 ) + VFMADD231PD_( %xmm2,%xmm0,%xmm11 ) + VFMADD231PD_( %xmm3,%xmm0,%xmm12 ) vmovups -10 * SIZE(AO), %xmm0 - VFMADD231PD_ %xmm1,%xmm0,%xmm13 + VFMADD231PD_( %xmm1,%xmm0,%xmm13 ) vmovddup 0 * SIZE(BO), %xmm1 - VFMADD231PD_ %xmm2,%xmm0,%xmm14 + VFMADD231PD_( %xmm2,%xmm0,%xmm14 ) vmovddup 1 * SIZE(BO), %xmm2 - VFMADD231PD_ %xmm3,%xmm0,%xmm15 + VFMADD231PD_( %xmm3,%xmm0,%xmm15 ) .endm .macro KERNEL8x3_M6 vmovups -8 * SIZE(AO), %xmm0 prefetcht0 A_PR1+64(AO) vmovddup 2 * SIZE(BO), %xmm3 - VFMADD231PD_ %xmm1,%xmm0,%xmm4 - VFMADD231PD_ %xmm2,%xmm0,%xmm5 - VFMADD231PD_ %xmm3,%xmm0,%xmm6 + VFMADD231PD_( %xmm1,%xmm0,%xmm4 ) + VFMADD231PD_( %xmm2,%xmm0,%xmm5 
) + VFMADD231PD_( %xmm3,%xmm0,%xmm6 ) vmovups -6 * SIZE(AO), %xmm0 - VFMADD231PD_ %xmm1,%xmm0,%xmm7 - VFMADD231PD_ %xmm2,%xmm0,%xmm8 - VFMADD231PD_ %xmm3,%xmm0,%xmm9 + VFMADD231PD_( %xmm1,%xmm0,%xmm7 ) + VFMADD231PD_( %xmm2,%xmm0,%xmm8 ) + VFMADD231PD_( %xmm3,%xmm0,%xmm9 ) vmovups -4 * SIZE(AO), %xmm0 - VFMADD231PD_ %xmm1,%xmm0,%xmm10 - VFMADD231PD_ %xmm2,%xmm0,%xmm11 - VFMADD231PD_ %xmm3,%xmm0,%xmm12 + VFMADD231PD_( %xmm1,%xmm0,%xmm10 ) + VFMADD231PD_( %xmm2,%xmm0,%xmm11 ) + VFMADD231PD_( %xmm3,%xmm0,%xmm12 ) vmovups -2 * SIZE(AO), %xmm0 - VFMADD231PD_ %xmm1,%xmm0,%xmm13 + VFMADD231PD_( %xmm1,%xmm0,%xmm13 ) vmovddup 3 * SIZE(BO), %xmm1 - VFMADD231PD_ %xmm2,%xmm0,%xmm14 + VFMADD231PD_( %xmm2,%xmm0,%xmm14 ) vmovddup 4 * SIZE(BO), %xmm2 - VFMADD231PD_ %xmm3,%xmm0,%xmm15 + VFMADD231PD_( %xmm3,%xmm0,%xmm15 ) .endm @@ -359,46 +351,46 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmovups 0 * SIZE(AO), %xmm0 prefetcht0 A_PR1+128(AO) vmovddup 5 * SIZE(BO), %xmm3 - VFMADD231PD_ %xmm1,%xmm0,%xmm4 - VFMADD231PD_ %xmm2,%xmm0,%xmm5 - VFMADD231PD_ %xmm3,%xmm0,%xmm6 + VFMADD231PD_( %xmm1,%xmm0,%xmm4 ) + VFMADD231PD_( %xmm2,%xmm0,%xmm5 ) + VFMADD231PD_( %xmm3,%xmm0,%xmm6 ) vmovups 2 * SIZE(AO), %xmm0 - VFMADD231PD_ %xmm1,%xmm0,%xmm7 - VFMADD231PD_ %xmm2,%xmm0,%xmm8 - VFMADD231PD_ %xmm3,%xmm0,%xmm9 + VFMADD231PD_( %xmm1,%xmm0,%xmm7 ) + VFMADD231PD_( %xmm2,%xmm0,%xmm8 ) + VFMADD231PD_( %xmm3,%xmm0,%xmm9 ) vmovups 4 * SIZE(AO), %xmm0 - VFMADD231PD_ %xmm1,%xmm0,%xmm10 - VFMADD231PD_ %xmm2,%xmm0,%xmm11 - VFMADD231PD_ %xmm3,%xmm0,%xmm12 + VFMADD231PD_( %xmm1,%xmm0,%xmm10 ) + VFMADD231PD_( %xmm2,%xmm0,%xmm11 ) + VFMADD231PD_( %xmm3,%xmm0,%xmm12 ) vmovups 6 * SIZE(AO), %xmm0 - VFMADD231PD_ %xmm1,%xmm0,%xmm13 + VFMADD231PD_( %xmm1,%xmm0,%xmm13 ) vmovddup 6 * SIZE(BO), %xmm1 - VFMADD231PD_ %xmm2,%xmm0,%xmm14 + VFMADD231PD_( %xmm2,%xmm0,%xmm14 ) vmovddup 7 * SIZE(BO), %xmm2 - VFMADD231PD_ %xmm3,%xmm0,%xmm15 + VFMADD231PD_( %xmm3,%xmm0,%xmm15 ) .endm .macro KERNEL8x3_M8 vmovups 8 * SIZE(AO), %xmm0 prefetcht0 A_PR1+192(AO) vmovddup 8 * SIZE(BO), %xmm3 - VFMADD231PD_ %xmm1,%xmm0,%xmm4 - VFMADD231PD_ %xmm2,%xmm0,%xmm5 - VFMADD231PD_ %xmm3,%xmm0,%xmm6 + VFMADD231PD_( %xmm1,%xmm0,%xmm4 ) + VFMADD231PD_( %xmm2,%xmm0,%xmm5 ) + VFMADD231PD_( %xmm3,%xmm0,%xmm6 ) vmovups 10 * SIZE(AO), %xmm0 - VFMADD231PD_ %xmm1,%xmm0,%xmm7 - VFMADD231PD_ %xmm2,%xmm0,%xmm8 - VFMADD231PD_ %xmm3,%xmm0,%xmm9 + VFMADD231PD_( %xmm1,%xmm0,%xmm7 ) + VFMADD231PD_( %xmm2,%xmm0,%xmm8 ) + VFMADD231PD_( %xmm3,%xmm0,%xmm9 ) vmovups 12 * SIZE(AO), %xmm0 - VFMADD231PD_ %xmm1,%xmm0,%xmm10 - VFMADD231PD_ %xmm2,%xmm0,%xmm11 - VFMADD231PD_ %xmm3,%xmm0,%xmm12 + VFMADD231PD_( %xmm1,%xmm0,%xmm10 ) + VFMADD231PD_( %xmm2,%xmm0,%xmm11 ) + VFMADD231PD_( %xmm3,%xmm0,%xmm12 ) vmovups 14 * SIZE(AO), %xmm0 - VFMADD231PD_ %xmm1,%xmm0,%xmm13 + VFMADD231PD_( %xmm1,%xmm0,%xmm13 ) vmovddup 9 * SIZE(BO), %xmm1 - VFMADD231PD_ %xmm2,%xmm0,%xmm14 + VFMADD231PD_( %xmm2,%xmm0,%xmm14 ) vmovddup 10 * SIZE(BO), %xmm2 - VFMADD231PD_ %xmm3,%xmm0,%xmm15 + VFMADD231PD_( %xmm3,%xmm0,%xmm15 ) vmovddup 11 * SIZE(BO), %xmm3 addq $32 * SIZE, AO addq $24 * SIZE, BO @@ -409,47 +401,47 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
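Unwinding one of these macros: each KERNEL8x3_M* step loads eight doubles of the packed A panel (four vmovups), keeps three B values broadcast in xmm1-xmm3 (vmovddup), and issues twelve packed fused multiply-adds into xmm4-xmm15 — twelve accumulators of two doubles each, i.e. the whole 8x3 tile of C stays in registers across the K loop. A scalar reference for what one tile computes (an invented checking helper, not meant to be fast):

    /* c: 8x3 tile, column-major in a flat array; a is packed 8-per-step,
       b packed 3-per-step, as the copy routines lay them out. */
    static void kernel8x3_ref(long k, const double *a, const double *b,
                              double c[8*3])
    {
        for (long p = 0; p < k; p++)
            for (int j = 0; j < 3; j++)         /* one vmovddup per column */
                for (int i = 0; i < 8; i++)     /* two rows per register   */
                    c[i + 8*j] += a[8*p + i] * b[3*p + j];
    }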
vmovups 8 * SIZE(AO), %xmm0 prefetcht0 A_PR1+192(AO) vmovddup 8 * SIZE(BO), %xmm3 - VFMADD231PD_ %xmm1,%xmm0,%xmm4 - VFMADD231PD_ %xmm2,%xmm0,%xmm5 - VFMADD231PD_ %xmm3,%xmm0,%xmm6 + VFMADD231PD_( %xmm1,%xmm0,%xmm4 ) + VFMADD231PD_( %xmm2,%xmm0,%xmm5 ) + VFMADD231PD_( %xmm3,%xmm0,%xmm6 ) vmovups 10 * SIZE(AO), %xmm0 - VFMADD231PD_ %xmm1,%xmm0,%xmm7 - VFMADD231PD_ %xmm2,%xmm0,%xmm8 - VFMADD231PD_ %xmm3,%xmm0,%xmm9 + VFMADD231PD_( %xmm1,%xmm0,%xmm7 ) + VFMADD231PD_( %xmm2,%xmm0,%xmm8 ) + VFMADD231PD_( %xmm3,%xmm0,%xmm9 ) vmovups 12 * SIZE(AO), %xmm0 - VFMADD231PD_ %xmm1,%xmm0,%xmm10 - VFMADD231PD_ %xmm2,%xmm0,%xmm11 - VFMADD231PD_ %xmm3,%xmm0,%xmm12 + VFMADD231PD_( %xmm1,%xmm0,%xmm10 ) + VFMADD231PD_( %xmm2,%xmm0,%xmm11 ) + VFMADD231PD_( %xmm3,%xmm0,%xmm12 ) vmovups 14 * SIZE(AO), %xmm0 - VFMADD231PD_ %xmm1,%xmm0,%xmm13 + VFMADD231PD_( %xmm1,%xmm0,%xmm13 ) addq $32*SIZE, AO - VFMADD231PD_ %xmm2,%xmm0,%xmm14 + VFMADD231PD_( %xmm2,%xmm0,%xmm14 ) addq $21*SIZE, BO - VFMADD231PD_ %xmm3,%xmm0,%xmm15 + VFMADD231PD_( %xmm3,%xmm0,%xmm15 ) .endm .macro KERNEL8x3_SUBN vmovddup -12 * SIZE(BO), %xmm1 vmovups -16 * SIZE(AO), %xmm0 - VFMADD231PD_ %xmm1,%xmm0,%xmm4 + VFMADD231PD_( %xmm1,%xmm0,%xmm4 ) vmovddup -11 * SIZE(BO), %xmm2 - VFMADD231PD_ %xmm2,%xmm0,%xmm5 + VFMADD231PD_( %xmm2,%xmm0,%xmm5 ) vmovddup -10 * SIZE(BO), %xmm3 - VFMADD231PD_ %xmm3,%xmm0,%xmm6 + VFMADD231PD_( %xmm3,%xmm0,%xmm6 ) vmovups -14 * SIZE(AO), %xmm0 - VFMADD231PD_ %xmm1,%xmm0,%xmm7 - VFMADD231PD_ %xmm2,%xmm0,%xmm8 - VFMADD231PD_ %xmm3,%xmm0,%xmm9 + VFMADD231PD_( %xmm1,%xmm0,%xmm7 ) + VFMADD231PD_( %xmm2,%xmm0,%xmm8 ) + VFMADD231PD_( %xmm3,%xmm0,%xmm9 ) vmovups -12 * SIZE(AO), %xmm0 - VFMADD231PD_ %xmm1,%xmm0,%xmm10 - VFMADD231PD_ %xmm2,%xmm0,%xmm11 - VFMADD231PD_ %xmm3,%xmm0,%xmm12 + VFMADD231PD_( %xmm1,%xmm0,%xmm10 ) + VFMADD231PD_( %xmm2,%xmm0,%xmm11 ) + VFMADD231PD_( %xmm3,%xmm0,%xmm12 ) vmovups -10 * SIZE(AO), %xmm0 - VFMADD231PD_ %xmm1,%xmm0,%xmm13 + VFMADD231PD_( %xmm1,%xmm0,%xmm13 ) addq $3*SIZE, BO - VFMADD231PD_ %xmm2,%xmm0,%xmm14 + VFMADD231PD_( %xmm2,%xmm0,%xmm14 ) addq $8*SIZE, AO - VFMADD231PD_ %xmm3,%xmm0,%xmm15 + VFMADD231PD_( %xmm3,%xmm0,%xmm15 ) .endm .macro SAVE8x3 From 6e679266f8c0ec59cae54ffc86520d4e0c9e316f Mon Sep 17 00:00:00 2001 From: wernsaar Date: Wed, 13 Nov 2013 18:32:42 +0100 Subject: [PATCH 08/15] changes for compatibility with Pathscale compiler --- common_x86.h | 15 ++++++++++++++- common_x86_64.h | 10 ++++++++++ 2 files changed, 24 insertions(+), 1 deletion(-) diff --git a/common_x86.h b/common_x86.h index 48517d900..8245f7078 100644 --- a/common_x86.h +++ b/common_x86.h @@ -301,12 +301,25 @@ REALNAME: #define PROFCODE #endif + +#if defined(C_PATHSCALE) || defined(OS_DARWIN) + #define EPILOGUE \ - .size REALNAME, .-REALNAME; \ + .size REALNAME, .-REALNAME; \ + .section .note.GNU-stack,"",@progbits + +#else + +#define EPILOGUE \ + .size REALNAME, .-REALNAME; \ .section .note.GNU-stack,"",%progbits #endif + + +#endif + #ifdef XDOUBLE #define FLD fldt #define FST fstpt diff --git a/common_x86_64.h b/common_x86_64.h index 188903848..4fe23448f 100644 --- a/common_x86_64.h +++ b/common_x86_64.h @@ -372,10 +372,20 @@ REALNAME: #define PROFCODE #endif +#if defined(C_PATHSCALE) || defined(OS_DARWIN) + +#define EPILOGUE \ + .size REALNAME, .-REALNAME; \ + .section .note.GNU-stack,"",@progbits + +#else + #define EPILOGUE \ .size REALNAME, .-REALNAME; \ .section .note.GNU-stack,"",%progbits +#endif + #endif From 271ceeba1569e7545f03c9500fa443c302c8b17a Mon Sep 17 00:00:00 2001 From: wernsaar Date: Sun, 1 Dec 
2013 10:58:22 +0100 Subject: [PATCH 09/15] merged from develop --- common_x86_64.h | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/common_x86_64.h b/common_x86_64.h index 2da627848..39e5a5eb1 100644 --- a/common_x86_64.h +++ b/common_x86_64.h @@ -372,20 +372,10 @@ REALNAME: #define PROFCODE #endif -#if defined(C_PATHSCALE) || defined(OS_DARWIN) - #define EPILOGUE \ .size REALNAME, .-REALNAME; \ .section .note.GNU-stack,"",@progbits -#else - -#define EPILOGUE \ - .size REALNAME, .-REALNAME; \ - .section .note.GNU-stack,"",@progbits - -#endif - #endif From ac7735e01fe93039285c6f26916f1382209a65e5 Mon Sep 17 00:00:00 2001 From: wernsaar Date: Sun, 1 Dec 2013 11:05:10 +0100 Subject: [PATCH 10/15] merge from develop --- common_x86.h | 15 +-------------- common_x86_64.h | 10 ---------- 2 files changed, 1 insertion(+), 24 deletions(-) diff --git a/common_x86.h b/common_x86.h index 8245f7078..5f42843be 100644 --- a/common_x86.h +++ b/common_x86.h @@ -301,23 +301,10 @@ REALNAME: #define PROFCODE #endif - -#if defined(C_PATHSCALE) || defined(OS_DARWIN) - #define EPILOGUE \ - .size REALNAME, .-REALNAME; \ + .size REALNAME, .-REALNAME; \ .section .note.GNU-stack,"",@progbits -#else - -#define EPILOGUE \ - .size REALNAME, .-REALNAME; \ - .section .note.GNU-stack,"",%progbits - -#endif - - - #endif #ifdef XDOUBLE diff --git a/common_x86_64.h b/common_x86_64.h index 4fe23448f..39e5a5eb1 100644 --- a/common_x86_64.h +++ b/common_x86_64.h @@ -372,20 +372,10 @@ REALNAME: #define PROFCODE #endif -#if defined(C_PATHSCALE) || defined(OS_DARWIN) - #define EPILOGUE \ .size REALNAME, .-REALNAME; \ .section .note.GNU-stack,"",@progbits -#else - -#define EPILOGUE \ - .size REALNAME, .-REALNAME; \ - .section .note.GNU-stack,"",%progbits - -#endif - #endif From c947ab85dc5867fdaa2a2c390d4278e59dff909e Mon Sep 17 00:00:00 2001 From: wernsaar Date: Sun, 1 Dec 2013 13:46:30 +0100 Subject: [PATCH 11/15] changed level3.c --- driver/level3/level3.c | 4 +--- driver/level3/level3_thread.c | 4 +--- 2 files changed, 2 insertions(+), 6 deletions(-) diff --git a/driver/level3/level3.c b/driver/level3/level3.c index edba5359e..5f746642c 100644 --- a/driver/level3/level3.c +++ b/driver/level3/level3.c @@ -333,9 +333,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, for(jjs = js; jjs < js + min_j; jjs += min_jj){ min_jj = min_j + js - jjs; -#if ( defined(BULLDOZER) || defined(PILEDRIVER) ) && defined(ARCH_X86_64) && !defined(XDOUBLE) && !defined(COMPLEX) - if (min_jj >= 12*GEMM_UNROLL_N) min_jj = 12*GEMM_UNROLL_N; - else +#if ( defined(BULLDOZER) || defined(PILEDRIVER) || defined(HASWELL) ) && defined(ARCH_X86_64) && !defined(XDOUBLE) && !defined(COMPLEX) if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N; else if (min_jj >= 3*GEMM_UNROLL_N) min_jj = 3*GEMM_UNROLL_N; diff --git a/driver/level3/level3_thread.c b/driver/level3/level3_thread.c index 850580504..ee1a8db7c 100644 --- a/driver/level3/level3_thread.c +++ b/driver/level3/level3_thread.c @@ -367,9 +367,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, for(jjs = xxx; jjs < MIN(n_to, xxx + div_n); jjs += min_jj){ min_jj = MIN(n_to, xxx + div_n) - jjs; -#if ( defined(BULLDOZER) || defined(PILEDRIVER) ) && defined(ARCH_X86_64) && !defined(XDOUBLE) && !defined(COMPLEX) - if (min_jj >= 12*GEMM_UNROLL_N) min_jj = 12*GEMM_UNROLL_N; - else +#if ( defined(BULLDOZER) || defined(PILEDRIVER) || defined(HASWELL) ) && defined(ARCH_X86_64) && !defined(XDOUBLE) && !defined(COMPLEX) if (min_jj >= 6*GEMM_UNROLL_N) min_jj 
= 6*GEMM_UNROLL_N; else if (min_jj >= 3*GEMM_UNROLL_N) min_jj = 3*GEMM_UNROLL_N; From 39dc69db4a83cb92b5610dc0cc61979f76b4b957 Mon Sep 17 00:00:00 2001 From: wernsaar Date: Sun, 1 Dec 2013 13:51:04 +0100 Subject: [PATCH 12/15] changed level3.c --- driver/level3/level3.c | 4 +--- driver/level3/level3_thread.c | 4 +--- 2 files changed, 2 insertions(+), 6 deletions(-) diff --git a/driver/level3/level3.c b/driver/level3/level3.c index 2fe889527..5f746642c 100644 --- a/driver/level3/level3.c +++ b/driver/level3/level3.c @@ -333,9 +333,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, for(jjs = js; jjs < js + min_j; jjs += min_jj){ min_jj = min_j + js - jjs; -#if defined(BULLDOZER) && defined(ARCH_X86_64) && !defined(XDOUBLE) && !defined(COMPLEX) - if (min_jj >= 12*GEMM_UNROLL_N) min_jj = 12*GEMM_UNROLL_N; - else +#if ( defined(BULLDOZER) || defined(PILEDRIVER) || defined(HASWELL) ) && defined(ARCH_X86_64) && !defined(XDOUBLE) && !defined(COMPLEX) if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N; else if (min_jj >= 3*GEMM_UNROLL_N) min_jj = 3*GEMM_UNROLL_N; diff --git a/driver/level3/level3_thread.c b/driver/level3/level3_thread.c index 3242790fa..ee1a8db7c 100644 --- a/driver/level3/level3_thread.c +++ b/driver/level3/level3_thread.c @@ -367,9 +367,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, for(jjs = xxx; jjs < MIN(n_to, xxx + div_n); jjs += min_jj){ min_jj = MIN(n_to, xxx + div_n) - jjs; -#if defined(BULLDOZER) && defined(ARCH_X86_64) && !defined(XDOUBLE) && !defined(COMPLEX) - if (min_jj >= 12*GEMM_UNROLL_N) min_jj = 12*GEMM_UNROLL_N; - else +#if ( defined(BULLDOZER) || defined(PILEDRIVER) || defined(HASWELL) ) && defined(ARCH_X86_64) && !defined(XDOUBLE) && !defined(COMPLEX) if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N; else if (min_jj >= 3*GEMM_UNROLL_N) min_jj = 3*GEMM_UNROLL_N; From 3169f524e417348faa11998cdeec18708e17ccb9 Mon Sep 17 00:00:00 2001 From: wernsaar Date: Sun, 1 Dec 2013 16:22:49 +0100 Subject: [PATCH 13/15] modified common.h --- common.h | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/common.h b/common.h index 309f246e2..4e3230d7e 100644 --- a/common.h +++ b/common.h @@ -310,6 +310,15 @@ typedef int blasint; #define YIELDING SwitchToThread() #endif +#if defined(ARMV7) || defined(ARMV6) || defined(ARMV8) +#define YIELDING asm volatile ("nop;nop;nop;nop;nop;nop;nop;nop; \n"); +#endif + +#ifdef PILEDRIVER +#define YIELDING __asm__ __volatile__ ("nop;nop;nop;nop;nop;nop;nop;nop;\n"); +#endif + + #ifndef YIELDING #define YIELDING sched_yield() #endif @@ -363,6 +372,15 @@ please https://github.com/xianyi/OpenBLAS/issues/246 #include "common_mips64.h" #endif +#ifdef ARCH_ARM +#include "common_arm.h" +#endif + +#ifdef ARCH_ARM64 +#include "common_arm64.h" +#endif + + #ifdef OS_LINUX #include "common_linux.h" #endif From ec2dadde9b22a1289a5891d530e41167274410ff Mon Sep 17 00:00:00 2001 From: wernsaar Date: Sun, 1 Dec 2013 18:02:11 +0100 Subject: [PATCH 14/15] modified param.h --- param.h | 254 ++++++++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 240 insertions(+), 14 deletions(-) diff --git a/param.h b/param.h index 0c3df6951..b865287be 100644 --- a/param.h +++ b/param.h @@ -304,9 +304,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
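The level3.c and level3_thread.c hunks above swap the Bulldozer-only 12x clamp for a rule shared by Bulldozer, Piledriver and Haswell: each packing step is snapped down to a multiple of GEMM_UNROLL_N so the copy routines always see full micro-panel widths. The ladder in isolation, as a sketch (the final fallback branch is assumed from context; the patch itself only touches the first branch):

    static long clamp_min_jj(long min_jj, long unroll_n)
    {
        if (min_jj >= 6*unroll_n) return 6*unroll_n;  /* 12x before the patch */
        if (min_jj >= 3*unroll_n) return 3*unroll_n;
        if (min_jj >  unroll_n)   return unroll_n;    /* assumed tail */
        return min_jj;
    }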
#endif #ifdef PILEDRIVER - -#define SNUMOPT 8 -#define DNUMOPT 4 +#define SNUMOPT 8 +#define DNUMOPT 4 #define GEMM_DEFAULT_OFFSET_A 64 #define GEMM_DEFAULT_OFFSET_B 832 @@ -344,39 +343,42 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GEMV_UNROLL 8 #endif - #if defined(ARCH_X86_64) #define SGEMM_DEFAULT_P 768 -#define DGEMM_DEFAULT_P 384 +#define DGEMM_DEFAULT_P 768 +#define ZGEMM_DEFAULT_P 384 +#define CGEMM_DEFAULT_P 768 #else #define SGEMM_DEFAULT_P 448 -#define DGEMM_DEFAULT_P 224 +#define DGEMM_DEFAULT_P 480 +#define ZGEMM_DEFAULT_P 112 +#define CGEMM_DEFAULT_P 224 #endif #define QGEMM_DEFAULT_P 112 -#define CGEMM_DEFAULT_P 224 -#define ZGEMM_DEFAULT_P 112 #define XGEMM_DEFAULT_P 56 #if defined(ARCH_X86_64) -#define SGEMM_DEFAULT_Q 168 +#define SGEMM_DEFAULT_Q 192 #define DGEMM_DEFAULT_Q 168 +#define ZGEMM_DEFAULT_Q 168 +#define CGEMM_DEFAULT_Q 168 #else #define SGEMM_DEFAULT_Q 224 #define DGEMM_DEFAULT_Q 224 +#define ZGEMM_DEFAULT_Q 224 +#define CGEMM_DEFAULT_Q 224 #endif #define QGEMM_DEFAULT_Q 224 -#define CGEMM_DEFAULT_Q 224 -#define ZGEMM_DEFAULT_Q 224 #define XGEMM_DEFAULT_Q 224 -#define SGEMM_DEFAULT_R sgemm_r +#define SGEMM_DEFAULT_R 12288 #define QGEMM_DEFAULT_R qgemm_r -#define DGEMM_DEFAULT_R dgemm_r +#define DGEMM_DEFAULT_R 12288 #define CGEMM_DEFAULT_R cgemm_r #define ZGEMM_DEFAULT_R zgemm_r #define XGEMM_DEFAULT_R xgemm_r -#define SYMV_P 16 +#define SYMV_P 16 #define HAVE_EXCLUSIVE_CACHE #define GEMM_THREAD gemm_thread_mn @@ -1150,6 +1152,107 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif +#ifdef HASWELL + +#define SNUMOPT 8 +#define DNUMOPT 4 + +#define GEMM_DEFAULT_OFFSET_A 0 +#define GEMM_DEFAULT_OFFSET_B 0 +#define GEMM_DEFAULT_ALIGN 0x03fffUL + +#define SYMV_P 8 + +#define SWITCH_RATIO 4 + +#ifdef ARCH_X86 + +#define SGEMM_DEFAULT_UNROLL_M 4 +#define DGEMM_DEFAULT_UNROLL_M 2 +#define QGEMM_DEFAULT_UNROLL_M 2 +#define CGEMM_DEFAULT_UNROLL_M 2 +#define ZGEMM_DEFAULT_UNROLL_M 1 +#define XGEMM_DEFAULT_UNROLL_M 1 + +#define SGEMM_DEFAULT_UNROLL_N 4 +#define DGEMM_DEFAULT_UNROLL_N 4 +#define QGEMM_DEFAULT_UNROLL_N 2 +#define CGEMM_DEFAULT_UNROLL_N 2 +#define ZGEMM_DEFAULT_UNROLL_N 2 +#define XGEMM_DEFAULT_UNROLL_N 1 + +#else + +#define SGEMM_DEFAULT_UNROLL_M 16 +#define DGEMM_DEFAULT_UNROLL_M 4 +#define QGEMM_DEFAULT_UNROLL_M 2 +#define CGEMM_DEFAULT_UNROLL_M 8 +#define ZGEMM_DEFAULT_UNROLL_M 4 +#define XGEMM_DEFAULT_UNROLL_M 1 + +#define SGEMM_DEFAULT_UNROLL_N 4 +#define DGEMM_DEFAULT_UNROLL_N 4 +#define QGEMM_DEFAULT_UNROLL_N 2 +#define CGEMM_DEFAULT_UNROLL_N 2 +#define ZGEMM_DEFAULT_UNROLL_N 2 +#define XGEMM_DEFAULT_UNROLL_N 1 + +#endif + +#ifdef ARCH_X86 + +#define SGEMM_DEFAULT_P 512 +#define SGEMM_DEFAULT_R sgemm_r +#define DGEMM_DEFAULT_P 512 +#define DGEMM_DEFAULT_R dgemm_r +#define QGEMM_DEFAULT_P 504 +#define QGEMM_DEFAULT_R qgemm_r +#define CGEMM_DEFAULT_P 128 +#define CGEMM_DEFAULT_R 1024 +#define ZGEMM_DEFAULT_P 512 +#define ZGEMM_DEFAULT_R zgemm_r +#define XGEMM_DEFAULT_P 252 +#define XGEMM_DEFAULT_R xgemm_r +#define SGEMM_DEFAULT_Q 256 +#define DGEMM_DEFAULT_Q 256 +#define QGEMM_DEFAULT_Q 128 +#define CGEMM_DEFAULT_Q 256 +#define ZGEMM_DEFAULT_Q 192 +#define XGEMM_DEFAULT_Q 128 + +#else + +#define SGEMM_DEFAULT_P 768 +#define DGEMM_DEFAULT_P 512 +#define CGEMM_DEFAULT_P 384 +#define ZGEMM_DEFAULT_P 256 + +#define SGEMM_DEFAULT_Q 384 +#define DGEMM_DEFAULT_Q 256 +#define CGEMM_DEFAULT_Q 192 +#define ZGEMM_DEFAULT_Q 128 + +#define SGEMM_DEFAULT_R sgemm_r +#define DGEMM_DEFAULT_R 13824 
From ec2dadde9b22a1289a5891d530e41167274410ff Mon Sep 17 00:00:00 2001
From: wernsaar
Date: Sun, 1 Dec 2013 18:02:11 +0100
Subject: [PATCH 14/15] modified param.h

---
 param.h | 254 ++++++++++++++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 240 insertions(+), 14 deletions(-)

diff --git a/param.h b/param.h
index 0c3df6951..b865287be 100644
--- a/param.h
+++ b/param.h
@@ -304,9 +304,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #endif

 #ifdef PILEDRIVER
-
-#define SNUMOPT   8
-#define DNUMOPT   4
+#define SNUMOPT  8
+#define DNUMOPT  4

 #define GEMM_DEFAULT_OFFSET_A 64
 #define GEMM_DEFAULT_OFFSET_B 832
@@ -344,39 +343,42 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define GEMV_UNROLL 8
 #endif

-
 #if defined(ARCH_X86_64)
 #define SGEMM_DEFAULT_P 768
-#define DGEMM_DEFAULT_P 384
+#define DGEMM_DEFAULT_P 768
+#define ZGEMM_DEFAULT_P 384
+#define CGEMM_DEFAULT_P 768
 #else
 #define SGEMM_DEFAULT_P 448
-#define DGEMM_DEFAULT_P 224
+#define DGEMM_DEFAULT_P 480
+#define ZGEMM_DEFAULT_P 112
+#define CGEMM_DEFAULT_P 224
 #endif

 #define QGEMM_DEFAULT_P 112
-#define CGEMM_DEFAULT_P 224
-#define ZGEMM_DEFAULT_P 112
 #define XGEMM_DEFAULT_P 56

 #if defined(ARCH_X86_64)
-#define SGEMM_DEFAULT_Q 168
+#define SGEMM_DEFAULT_Q 192
 #define DGEMM_DEFAULT_Q 168
+#define ZGEMM_DEFAULT_Q 168
+#define CGEMM_DEFAULT_Q 168
 #else
 #define SGEMM_DEFAULT_Q 224
 #define DGEMM_DEFAULT_Q 224
+#define ZGEMM_DEFAULT_Q 224
+#define CGEMM_DEFAULT_Q 224
 #endif

 #define QGEMM_DEFAULT_Q 224
-#define CGEMM_DEFAULT_Q 224
-#define ZGEMM_DEFAULT_Q 224
 #define XGEMM_DEFAULT_Q 224

-#define SGEMM_DEFAULT_R sgemm_r
+#define SGEMM_DEFAULT_R 12288
 #define QGEMM_DEFAULT_R qgemm_r
-#define DGEMM_DEFAULT_R dgemm_r
+#define DGEMM_DEFAULT_R 12288
 #define CGEMM_DEFAULT_R cgemm_r
 #define ZGEMM_DEFAULT_R zgemm_r
 #define XGEMM_DEFAULT_R xgemm_r

-#define SYMV_P  16
+#define SYMV_P 16

 #define HAVE_EXCLUSIVE_CACHE
 #define GEMM_THREAD gemm_thread_mn
@@ -1150,6 +1152,107 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

 #endif

+#ifdef HASWELL
+
+#define SNUMOPT 8
+#define DNUMOPT 4
+
+#define GEMM_DEFAULT_OFFSET_A 0
+#define GEMM_DEFAULT_OFFSET_B 0
+#define GEMM_DEFAULT_ALIGN 0x03fffUL
+
+#define SYMV_P 8
+
+#define SWITCH_RATIO 4
+
+#ifdef ARCH_X86
+
+#define SGEMM_DEFAULT_UNROLL_M 4
+#define DGEMM_DEFAULT_UNROLL_M 2
+#define QGEMM_DEFAULT_UNROLL_M 2
+#define CGEMM_DEFAULT_UNROLL_M 2
+#define ZGEMM_DEFAULT_UNROLL_M 1
+#define XGEMM_DEFAULT_UNROLL_M 1
+
+#define SGEMM_DEFAULT_UNROLL_N 4
+#define DGEMM_DEFAULT_UNROLL_N 4
+#define QGEMM_DEFAULT_UNROLL_N 2
+#define CGEMM_DEFAULT_UNROLL_N 2
+#define ZGEMM_DEFAULT_UNROLL_N 2
+#define XGEMM_DEFAULT_UNROLL_N 1
+
+#else
+
+#define SGEMM_DEFAULT_UNROLL_M 16
+#define DGEMM_DEFAULT_UNROLL_M 4
+#define QGEMM_DEFAULT_UNROLL_M 2
+#define CGEMM_DEFAULT_UNROLL_M 8
+#define ZGEMM_DEFAULT_UNROLL_M 4
+#define XGEMM_DEFAULT_UNROLL_M 1
+
+#define SGEMM_DEFAULT_UNROLL_N 4
+#define DGEMM_DEFAULT_UNROLL_N 4
+#define QGEMM_DEFAULT_UNROLL_N 2
+#define CGEMM_DEFAULT_UNROLL_N 2
+#define ZGEMM_DEFAULT_UNROLL_N 2
+#define XGEMM_DEFAULT_UNROLL_N 1
+
+#endif
+
+#ifdef ARCH_X86
+
+#define SGEMM_DEFAULT_P 512
+#define SGEMM_DEFAULT_R sgemm_r
+#define DGEMM_DEFAULT_P 512
+#define DGEMM_DEFAULT_R dgemm_r
+#define QGEMM_DEFAULT_P 504
+#define QGEMM_DEFAULT_R qgemm_r
+#define CGEMM_DEFAULT_P 128
+#define CGEMM_DEFAULT_R 1024
+#define ZGEMM_DEFAULT_P 512
+#define ZGEMM_DEFAULT_R zgemm_r
+#define XGEMM_DEFAULT_P 252
+#define XGEMM_DEFAULT_R xgemm_r
+#define SGEMM_DEFAULT_Q 256
+#define DGEMM_DEFAULT_Q 256
+#define QGEMM_DEFAULT_Q 128
+#define CGEMM_DEFAULT_Q 256
+#define ZGEMM_DEFAULT_Q 192
+#define XGEMM_DEFAULT_Q 128
+
+#else
+
+#define SGEMM_DEFAULT_P 768
+#define DGEMM_DEFAULT_P 512
+#define CGEMM_DEFAULT_P 384
+#define ZGEMM_DEFAULT_P 256
+
+#define SGEMM_DEFAULT_Q 384
+#define DGEMM_DEFAULT_Q 256
+#define CGEMM_DEFAULT_Q 192
+#define ZGEMM_DEFAULT_Q 128
+
+#define SGEMM_DEFAULT_R sgemm_r
+#define DGEMM_DEFAULT_R 13824
+#define CGEMM_DEFAULT_R cgemm_r
+#define ZGEMM_DEFAULT_R zgemm_r
+
+#define QGEMM_DEFAULT_Q 128
+#define QGEMM_DEFAULT_P 504
+#define QGEMM_DEFAULT_R qgemm_r
+#define XGEMM_DEFAULT_P 252
+#define XGEMM_DEFAULT_R xgemm_r
+#define XGEMM_DEFAULT_Q 128
+
+#define CGEMM3M_DEFAULT_UNROLL_N 4
+#define CGEMM3M_DEFAULT_UNROLL_M 8
+#define ZGEMM3M_DEFAULT_UNROLL_N 2
+#define ZGEMM3M_DEFAULT_UNROLL_M 8
+#endif
+
+
+#endif
+
 #ifdef ATOM
@@ -1793,6 +1896,129 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define SYMV_P 16
 #endif

+
+#ifdef ARMV7
+#define SNUMOPT 2
+#define DNUMOPT 2
+
+#define GEMM_DEFAULT_OFFSET_A 0
+#define GEMM_DEFAULT_OFFSET_B 0
+#define GEMM_DEFAULT_ALIGN 0x03fffUL
+
+#define SGEMM_DEFAULT_UNROLL_M 4
+#define SGEMM_DEFAULT_UNROLL_N 4
+
+#define DGEMM_DEFAULT_UNROLL_M 4
+#define DGEMM_DEFAULT_UNROLL_N 4
+
+#define CGEMM_DEFAULT_UNROLL_M 2
+#define CGEMM_DEFAULT_UNROLL_N 2
+
+#define ZGEMM_DEFAULT_UNROLL_M 2
+#define ZGEMM_DEFAULT_UNROLL_N 2
+
+#define SGEMM_DEFAULT_P 128
+#define DGEMM_DEFAULT_P 128
+#define CGEMM_DEFAULT_P 96
+#define ZGEMM_DEFAULT_P 64
+
+#define SGEMM_DEFAULT_Q 240
+#define DGEMM_DEFAULT_Q 120
+#define CGEMM_DEFAULT_Q 120
+#define ZGEMM_DEFAULT_Q 120
+
+#define SGEMM_DEFAULT_R 12288
+#define DGEMM_DEFAULT_R 8192
+#define CGEMM_DEFAULT_R 4096
+#define ZGEMM_DEFAULT_R 4096
+
+
+
+#define SYMV_P 16
+#endif
+
+
+#if defined(ARMV6)
+#define SNUMOPT 2
+#define DNUMOPT 2
+
+#define GEMM_DEFAULT_OFFSET_A 0
+#define GEMM_DEFAULT_OFFSET_B 0
+#define GEMM_DEFAULT_ALIGN 0x03fffUL
+
+#define SGEMM_DEFAULT_UNROLL_M 4
+#define SGEMM_DEFAULT_UNROLL_N 2
+
+#define DGEMM_DEFAULT_UNROLL_M 4
+#define DGEMM_DEFAULT_UNROLL_N 2
+
+#define CGEMM_DEFAULT_UNROLL_M 2
+#define CGEMM_DEFAULT_UNROLL_N 2
+
+#define ZGEMM_DEFAULT_UNROLL_M 2
+#define ZGEMM_DEFAULT_UNROLL_N 2
+
+#define SGEMM_DEFAULT_P 128
+#define DGEMM_DEFAULT_P 128
+#define CGEMM_DEFAULT_P 96
+#define ZGEMM_DEFAULT_P 64
+
+#define SGEMM_DEFAULT_Q 240
+#define DGEMM_DEFAULT_Q 120
+#define CGEMM_DEFAULT_Q 120
+#define ZGEMM_DEFAULT_Q 120
+
+#define SGEMM_DEFAULT_R 12288
+#define DGEMM_DEFAULT_R 8192
+#define CGEMM_DEFAULT_R 4096
+#define ZGEMM_DEFAULT_R 4096
+
+
+#define SYMV_P 16
+#endif
+
+#if defined(ARMV8)
+#define SNUMOPT 2
+#define DNUMOPT 2
+
+#define GEMM_DEFAULT_OFFSET_A 0
+#define GEMM_DEFAULT_OFFSET_B 0
+#define GEMM_DEFAULT_ALIGN 0x03fffUL
+
+#define SGEMM_DEFAULT_UNROLL_M 2
+#define SGEMM_DEFAULT_UNROLL_N 2
+
+#define DGEMM_DEFAULT_UNROLL_M 2
+#define DGEMM_DEFAULT_UNROLL_N 2
+
+#define CGEMM_DEFAULT_UNROLL_M 2
+#define CGEMM_DEFAULT_UNROLL_N 2
+
+#define ZGEMM_DEFAULT_UNROLL_M 2
+#define ZGEMM_DEFAULT_UNROLL_N 2
+
+#define SGEMM_DEFAULT_P 128
+#define DGEMM_DEFAULT_P 128
+#define CGEMM_DEFAULT_P 96
+#define ZGEMM_DEFAULT_P 64
+
+#define SGEMM_DEFAULT_Q 240
+#define DGEMM_DEFAULT_Q 120
+#define CGEMM_DEFAULT_Q 120
+#define ZGEMM_DEFAULT_Q 120
+
+#define SGEMM_DEFAULT_R 12288
+#define DGEMM_DEFAULT_R 8192
+#define CGEMM_DEFAULT_R 4096
+#define ZGEMM_DEFAULT_R 4096
+
+
+#define SYMV_P 16
+#endif
+
+
+
+
 #ifdef GENERIC

 #define SNUMOPT 2
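The P and Q values set above are cache-blocking sizes: the driver packs a P-by-Q block of A (and a Q-deep panel of B) so the inner kernel streams from cache-resident copies, and R bounds the packed B panel. A schematic C sketch of that blocking follows, assuming row-major storage, alpha = 1, and no packing or edge handling — all simplifications; the real logic, including the R tiling, lives in driver/level3/level3.c.

#include <stdio.h>
#include <stddef.h>

/* Values from the PILEDRIVER x86_64 section above. */
#define DGEMM_P 768   /* rows of A packed per outer block */
#define DGEMM_Q 168   /* depth (k) packed per block       */

static size_t min_sz(size_t a, size_t b) { return a < b ? a : b; }

void dgemm_blocked(size_t m, size_t n, size_t k,
                   const double *A, const double *B, double *C)
{
    for (size_t ks = 0; ks < k; ks += DGEMM_Q) {        /* depth blocks of Q */
        size_t kb = min_sz(DGEMM_Q, k - ks);
        for (size_t is = 0; is < m; is += DGEMM_P) {    /* row blocks of P   */
            size_t ib = min_sz(DGEMM_P, m - is);
            /* here the driver would pack A[is..is+ib)[ks..ks+kb) and the
               matching rows of B, then call the unrolled microkernel */
            for (size_t i = is; i < is + ib; i++)
                for (size_t j = 0; j < n; j++)
                    for (size_t kk = ks; kk < ks + kb; kk++)
                        C[i*n + j] += A[i*k + kk] * B[kk*n + j];
        }
    }
}

int main(void)
{
    double A[2*3] = {1, 2, 3, 4, 5, 6};
    double B[3*2] = {7, 8, 9, 10, 11, 12};
    double C[2*2] = {0};
    dgemm_blocked(2, 2, 3, A, B, C);
    printf("%g %g / %g %g\n", C[0], C[1], C[2], C[3]);  /* 58 64 / 139 154 */
    return 0;
}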
From d2385f0d524e6431374062cdcddeda8b83008e34 Mon Sep 17 00:00:00 2001
From: wernsaar
Date: Sun, 1 Dec 2013 18:02:54 +0100
Subject: [PATCH 15/15] modified param.h

---
 param.h | 232 +++++++++++++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 227 insertions(+), 5 deletions(-)

diff --git a/param.h b/param.h
index c4d15323a..b865287be 100644
--- a/param.h
+++ b/param.h
@@ -304,9 +304,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #endif

 #ifdef PILEDRIVER
-
-#define SNUMOPT   8
-#define DNUMOPT   4
+#define SNUMOPT  8
+#define DNUMOPT  4

 #define GEMM_DEFAULT_OFFSET_A 64
 #define GEMM_DEFAULT_OFFSET_B 832
@@ -344,7 +343,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define GEMV_UNROLL 8
 #endif

-
 #if defined(ARCH_X86_64)
 #define SGEMM_DEFAULT_P 768
 #define DGEMM_DEFAULT_P 768
@@ -380,7 +378,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define ZGEMM_DEFAULT_R zgemm_r
 #define XGEMM_DEFAULT_R xgemm_r

-#define SYMV_P  16
+#define SYMV_P 16

 #define HAVE_EXCLUSIVE_CACHE
 #define GEMM_THREAD gemm_thread_mn
@@ -1154,6 +1152,107 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

 #endif

+#ifdef HASWELL
+
+#define SNUMOPT 8
+#define DNUMOPT 4
+
+#define GEMM_DEFAULT_OFFSET_A 0
+#define GEMM_DEFAULT_OFFSET_B 0
+#define GEMM_DEFAULT_ALIGN 0x03fffUL
+
+#define SYMV_P 8
+
+#define SWITCH_RATIO 4
+
+#ifdef ARCH_X86
+
+#define SGEMM_DEFAULT_UNROLL_M 4
+#define DGEMM_DEFAULT_UNROLL_M 2
+#define QGEMM_DEFAULT_UNROLL_M 2
+#define CGEMM_DEFAULT_UNROLL_M 2
+#define ZGEMM_DEFAULT_UNROLL_M 1
+#define XGEMM_DEFAULT_UNROLL_M 1
+
+#define SGEMM_DEFAULT_UNROLL_N 4
+#define DGEMM_DEFAULT_UNROLL_N 4
+#define QGEMM_DEFAULT_UNROLL_N 2
+#define CGEMM_DEFAULT_UNROLL_N 2
+#define ZGEMM_DEFAULT_UNROLL_N 2
+#define XGEMM_DEFAULT_UNROLL_N 1
+
+#else
+
+#define SGEMM_DEFAULT_UNROLL_M 16
+#define DGEMM_DEFAULT_UNROLL_M 4
+#define QGEMM_DEFAULT_UNROLL_M 2
+#define CGEMM_DEFAULT_UNROLL_M 8
+#define ZGEMM_DEFAULT_UNROLL_M 4
+#define XGEMM_DEFAULT_UNROLL_M 1
+
+#define SGEMM_DEFAULT_UNROLL_N 4
+#define DGEMM_DEFAULT_UNROLL_N 4
+#define QGEMM_DEFAULT_UNROLL_N 2
+#define CGEMM_DEFAULT_UNROLL_N 2
+#define ZGEMM_DEFAULT_UNROLL_N 2
+#define XGEMM_DEFAULT_UNROLL_N 1
+
+#endif
+
+#ifdef ARCH_X86
+
+#define SGEMM_DEFAULT_P 512
+#define SGEMM_DEFAULT_R sgemm_r
+#define DGEMM_DEFAULT_P 512
+#define DGEMM_DEFAULT_R dgemm_r
+#define QGEMM_DEFAULT_P 504
+#define QGEMM_DEFAULT_R qgemm_r
+#define CGEMM_DEFAULT_P 128
+#define CGEMM_DEFAULT_R 1024
+#define ZGEMM_DEFAULT_P 512
+#define ZGEMM_DEFAULT_R zgemm_r
+#define XGEMM_DEFAULT_P 252
+#define XGEMM_DEFAULT_R xgemm_r
+#define SGEMM_DEFAULT_Q 256
+#define DGEMM_DEFAULT_Q 256
+#define QGEMM_DEFAULT_Q 128
+#define CGEMM_DEFAULT_Q 256
+#define ZGEMM_DEFAULT_Q 192
+#define XGEMM_DEFAULT_Q 128
+
+#else
+
+#define SGEMM_DEFAULT_P 768
+#define DGEMM_DEFAULT_P 512
+#define CGEMM_DEFAULT_P 384
+#define ZGEMM_DEFAULT_P 256
+
+#define SGEMM_DEFAULT_Q 384
+#define DGEMM_DEFAULT_Q 256
+#define CGEMM_DEFAULT_Q 192
+#define ZGEMM_DEFAULT_Q 128
+
+#define SGEMM_DEFAULT_R sgemm_r
+#define DGEMM_DEFAULT_R 13824
+#define CGEMM_DEFAULT_R cgemm_r
+#define ZGEMM_DEFAULT_R zgemm_r
+
+#define QGEMM_DEFAULT_Q 128
+#define QGEMM_DEFAULT_P 504
+#define QGEMM_DEFAULT_R qgemm_r
+#define XGEMM_DEFAULT_P 252
+#define XGEMM_DEFAULT_R xgemm_r
+#define XGEMM_DEFAULT_Q 128
+
+#define CGEMM3M_DEFAULT_UNROLL_N 4
+#define CGEMM3M_DEFAULT_UNROLL_M 8
+#define ZGEMM3M_DEFAULT_UNROLL_N 2
+#define ZGEMM3M_DEFAULT_UNROLL_M 8
+#endif
+
+
+#endif
+
 #ifdef ATOM
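The UNROLL_M/UNROLL_N values in the HASWELL section above describe the register tile the microkernel keeps live while streaming the packed operands. A plain-C model of the 4x4 double-precision tile (DGEMM_DEFAULT_UNROLL_M x DGEMM_DEFAULT_UNROLL_N on x86_64) follows; the packed layouts match what the gemm copy routines produce, but the shipped kernels implement this in AVX2/FMA assembly, so this is a data-flow sketch, not the actual kernel.

#include <stdio.h>

#define UNROLL_M 4   /* DGEMM_DEFAULT_UNROLL_M above */
#define UNROLL_N 4   /* DGEMM_DEFAULT_UNROLL_N above */

/* One UNROLL_M x UNROLL_N tile of C accumulated in "registers" while
   walking k: A is packed k-major in UNROLL_M-wide slices, B in
   UNROLL_N-wide slices, and C is column-major as in BLAS. */
void dgemm_microkernel_4x4(long k, const double *packedA,
                           const double *packedB, double *C, long ldc)
{
    double c[UNROLL_M][UNROLL_N] = {{0.0}};

    for (long kk = 0; kk < k; kk++)
        for (int i = 0; i < UNROLL_M; i++)
            for (int j = 0; j < UNROLL_N; j++)
                c[i][j] += packedA[kk*UNROLL_M + i] * packedB[kk*UNROLL_N + j];

    for (int i = 0; i < UNROLL_M; i++)
        for (int j = 0; j < UNROLL_N; j++)
            C[i + j*ldc] += c[i][j];
}

int main(void)
{
    double a[4] = {1, 2, 3, 4};   /* one k-slice of packed A */
    double b[4] = {5, 6, 7, 8};   /* one k-slice of packed B */
    double C[16] = {0};
    dgemm_microkernel_4x4(1, a, b, C, 4);
    printf("C[0][0]=%g C[3][3]=%g\n", C[0], C[3 + 3*4]);  /* 5 and 32 */
    return 0;
}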
@@ -1797,6 +1896,129 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define SYMV_P 16
 #endif

+
+#ifdef ARMV7
+#define SNUMOPT 2
+#define DNUMOPT 2
+
+#define GEMM_DEFAULT_OFFSET_A 0
+#define GEMM_DEFAULT_OFFSET_B 0
+#define GEMM_DEFAULT_ALIGN 0x03fffUL
+
+#define SGEMM_DEFAULT_UNROLL_M 4
+#define SGEMM_DEFAULT_UNROLL_N 4
+
+#define DGEMM_DEFAULT_UNROLL_M 4
+#define DGEMM_DEFAULT_UNROLL_N 4
+
+#define CGEMM_DEFAULT_UNROLL_M 2
+#define CGEMM_DEFAULT_UNROLL_N 2
+
+#define ZGEMM_DEFAULT_UNROLL_M 2
+#define ZGEMM_DEFAULT_UNROLL_N 2
+
+#define SGEMM_DEFAULT_P 128
+#define DGEMM_DEFAULT_P 128
+#define CGEMM_DEFAULT_P 96
+#define ZGEMM_DEFAULT_P 64
+
+#define SGEMM_DEFAULT_Q 240
+#define DGEMM_DEFAULT_Q 120
+#define CGEMM_DEFAULT_Q 120
+#define ZGEMM_DEFAULT_Q 120
+
+#define SGEMM_DEFAULT_R 12288
+#define DGEMM_DEFAULT_R 8192
+#define CGEMM_DEFAULT_R 4096
+#define ZGEMM_DEFAULT_R 4096
+
+
+
+#define SYMV_P 16
+#endif
+
+
+#if defined(ARMV6)
+#define SNUMOPT 2
+#define DNUMOPT 2
+
+#define GEMM_DEFAULT_OFFSET_A 0
+#define GEMM_DEFAULT_OFFSET_B 0
+#define GEMM_DEFAULT_ALIGN 0x03fffUL
+
+#define SGEMM_DEFAULT_UNROLL_M 4
+#define SGEMM_DEFAULT_UNROLL_N 2
+
+#define DGEMM_DEFAULT_UNROLL_M 4
+#define DGEMM_DEFAULT_UNROLL_N 2
+
+#define CGEMM_DEFAULT_UNROLL_M 2
+#define CGEMM_DEFAULT_UNROLL_N 2
+
+#define ZGEMM_DEFAULT_UNROLL_M 2
+#define ZGEMM_DEFAULT_UNROLL_N 2
+
+#define SGEMM_DEFAULT_P 128
+#define DGEMM_DEFAULT_P 128
+#define CGEMM_DEFAULT_P 96
+#define ZGEMM_DEFAULT_P 64
+
+#define SGEMM_DEFAULT_Q 240
+#define DGEMM_DEFAULT_Q 120
+#define CGEMM_DEFAULT_Q 120
+#define ZGEMM_DEFAULT_Q 120
+
+#define SGEMM_DEFAULT_R 12288
+#define DGEMM_DEFAULT_R 8192
+#define CGEMM_DEFAULT_R 4096
+#define ZGEMM_DEFAULT_R 4096
+
+
+#define SYMV_P 16
+#endif
+
+#if defined(ARMV8)
+#define SNUMOPT 2
+#define DNUMOPT 2
+
+#define GEMM_DEFAULT_OFFSET_A 0
+#define GEMM_DEFAULT_OFFSET_B 0
+#define GEMM_DEFAULT_ALIGN 0x03fffUL
+
+#define SGEMM_DEFAULT_UNROLL_M 2
+#define SGEMM_DEFAULT_UNROLL_N 2
+
+#define DGEMM_DEFAULT_UNROLL_M 2
+#define DGEMM_DEFAULT_UNROLL_N 2
+
+#define CGEMM_DEFAULT_UNROLL_M 2
+#define CGEMM_DEFAULT_UNROLL_N 2
+
+#define ZGEMM_DEFAULT_UNROLL_M 2
+#define ZGEMM_DEFAULT_UNROLL_N 2
+
+#define SGEMM_DEFAULT_P 128
+#define DGEMM_DEFAULT_P 128
+#define CGEMM_DEFAULT_P 96
+#define ZGEMM_DEFAULT_P 64
+
+#define SGEMM_DEFAULT_Q 240
+#define DGEMM_DEFAULT_Q 120
+#define CGEMM_DEFAULT_Q 120
+#define ZGEMM_DEFAULT_Q 120
+
+#define SGEMM_DEFAULT_R 12288
+#define DGEMM_DEFAULT_R 8192
+#define CGEMM_DEFAULT_R 4096
+#define ZGEMM_DEFAULT_R 4096
+
+
+#define SYMV_P 16
+#endif
+
+
+
+
 #ifdef GENERIC

 #define SNUMOPT 2
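One detail shared by the ARM sections above and the x86 ones earlier: GEMM_DEFAULT_ALIGN 0x03fffUL is a mask, so rounding a scratch pointer up with it places the packing buffers on 16 KB boundaries. A small sketch, assuming the usual round-up-and-mask idiom; align_up and the main() probe are illustrative helpers, not OpenBLAS code.

#include <stdio.h>
#include <stdint.h>

#define GEMM_DEFAULT_ALIGN 0x03fffUL   /* 16 KB - 1, as in param.h above */

/* Round p up to the next multiple of (GEMM_DEFAULT_ALIGN + 1). */
static void *align_up(void *p)
{
    return (void *)(((uintptr_t)p + GEMM_DEFAULT_ALIGN)
                    & ~(uintptr_t)GEMM_DEFAULT_ALIGN);
}

int main(void)
{
    char scratch[0x8000];
    void *aligned = align_up(scratch);
    printf("%p -> %p (offset mod 16K = %lu)\n", (void *)scratch, aligned,
           (unsigned long)((uintptr_t)aligned & GEMM_DEFAULT_ALIGN));
    return 0;
}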