From 493d4fe7e5bfdc5a237df2c6c1fe489904d5bf7d Mon Sep 17 00:00:00 2001 From: wernsaar Date: Sat, 16 Aug 2014 11:36:48 +0200 Subject: [PATCH 001/119] added reference in C for symv_L --- kernel/arm/symv_L.c | 70 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 70 insertions(+) create mode 100644 kernel/arm/symv_L.c diff --git a/kernel/arm/symv_L.c b/kernel/arm/symv_L.c new file mode 100644 index 000000000..8f48d03f5 --- /dev/null +++ b/kernel/arm/symv_L.c @@ -0,0 +1,70 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + + +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) +{ + BLASLONG i; + BLASLONG ix,iy; + BLASLONG jx,jy; + BLASLONG j; + FLOAT temp1; + FLOAT temp2; + +#if 0 + if ( m != offset ) + printf("Symv_L: m=%d offset=%d\n",m,offset); +#endif + + jx = 0; + jy = 0; + + for (j=0; j Date: Sat, 16 Aug 2014 13:52:50 +0200 Subject: [PATCH 002/119] add reference in C for symv_U --- kernel/arm/symv_U.c | 71 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 71 insertions(+) create mode 100644 kernel/arm/symv_U.c diff --git a/kernel/arm/symv_U.c b/kernel/arm/symv_U.c new file mode 100644 index 000000000..b5a0c96e9 --- /dev/null +++ b/kernel/arm/symv_U.c @@ -0,0 +1,71 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + + +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) +{ + BLASLONG i; + BLASLONG ix,iy; + BLASLONG jx,jy; + BLASLONG j; + FLOAT temp1; + FLOAT temp2; + +#if 0 + if( m != offset ) + printf("Symv_U: m=%d offset=%d\n",m,offset); +#endif + + BLASLONG m1 = m - offset; + + jx = m1 * inc_x; + jy = m1 * inc_y; + + for (j=m1; j Date: Mon, 18 Aug 2014 12:18:10 +0200 Subject: [PATCH 003/119] added optimized dsymv_U kernel for bulldozer --- kernel/x86_64/KERNEL.BULLDOZER | 2 + kernel/x86_64/dsymv_U.c | 208 +++++++++++++++++++++ kernel/x86_64/dsymv_U_microk_bulldozer-2.c | 117 ++++++++++++ 3 files changed, 327 insertions(+) create mode 100644 kernel/x86_64/dsymv_U.c create mode 100644 kernel/x86_64/dsymv_U_microk_bulldozer-2.c diff --git a/kernel/x86_64/KERNEL.BULLDOZER b/kernel/x86_64/KERNEL.BULLDOZER index 19bf7fd32..03925cc19 100644 --- a/kernel/x86_64/KERNEL.BULLDOZER +++ b/kernel/x86_64/KERNEL.BULLDOZER @@ -1,3 +1,5 @@ +DSYMV_U_KERNEL = dsymv_U.c + SGEMVNKERNEL = sgemv_n.c SGEMVTKERNEL = sgemv_t.c diff --git a/kernel/x86_64/dsymv_U.c b/kernel/x86_64/dsymv_U.c new file mode 100644 index 000000000..1f22abe8d --- /dev/null +++ b/kernel/x86_64/dsymv_U.c @@ -0,0 +1,208 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + + +#include "common.h" + +#if defined(BULLDOZER) +#include "dsymv_U_microk_bulldozer-2.c" +#endif + + +#ifndef HAVE_KERNEL_8x2 + +static void dsymv_kernel_8x2(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *xp, FLOAT *yp, FLOAT *temp1, FLOAT *temp2) +{ + FLOAT at0,at1,at2,at3; + FLOAT tmp2[2] = { 0.0, 0.0 }; + FLOAT tp0; + FLOAT tp1; + BLASLONG i; + + tp0 = temp1[0]; + tp1 = temp1[1]; + + for (i=0; i Date: Mon, 18 Aug 2014 13:52:24 +0200 Subject: [PATCH 004/119] added optimized ssymv_U kernel for bulldozer --- kernel/x86_64/KERNEL.BULLDOZER | 1 + kernel/x86_64/ssymv_U.c | 209 +++++++++++++++++++++ kernel/x86_64/ssymv_U_microk_bulldozer-2.c | 119 ++++++++++++ 3 files changed, 329 insertions(+) create mode 100644 kernel/x86_64/ssymv_U.c create mode 100644 kernel/x86_64/ssymv_U_microk_bulldozer-2.c diff --git a/kernel/x86_64/KERNEL.BULLDOZER b/kernel/x86_64/KERNEL.BULLDOZER index 03925cc19..a078528cd 100644 --- a/kernel/x86_64/KERNEL.BULLDOZER +++ b/kernel/x86_64/KERNEL.BULLDOZER @@ -1,4 +1,5 @@ DSYMV_U_KERNEL = dsymv_U.c +SSYMV_U_KERNEL = ssymv_U.c SGEMVNKERNEL = sgemv_n.c SGEMVTKERNEL = sgemv_t.c diff --git a/kernel/x86_64/ssymv_U.c b/kernel/x86_64/ssymv_U.c new file mode 100644 index 000000000..75b8e2c3e --- /dev/null +++ b/kernel/x86_64/ssymv_U.c @@ -0,0 +1,209 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. 
Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + + +#include "common.h" + + +#if defined(BULLDOZER) +#include "ssymv_U_microk_bulldozer-2.c" +#endif + + +#ifndef HAVE_KERNEL_16x2 + +static void ssymv_kernel_16x2(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *xp, FLOAT *yp, FLOAT *temp1, FLOAT *temp2) +{ + FLOAT at0,at1,at2,at3; + FLOAT tmp2[2] = { 0.0, 0.0 }; + FLOAT tp0; + FLOAT tp1; + BLASLONG i; + + tp0 = temp1[0]; + tp1 = temp1[1]; + + for (i=0; i Date: Tue, 19 Aug 2014 17:09:45 +0200 Subject: [PATCH 005/119] added optimized ssymv_U kernel for nehalem --- kernel/x86_64/KERNEL.NEHALEM | 3 + kernel/x86_64/ssymv_U.c | 134 +++++++++++++++++------ kernel/x86_64/ssymv_U_microk_nehalem-2.c | 130 ++++++++++++++++++++++ 3 files changed, 232 insertions(+), 35 deletions(-) create mode 100644 kernel/x86_64/ssymv_U_microk_nehalem-2.c diff --git a/kernel/x86_64/KERNEL.NEHALEM b/kernel/x86_64/KERNEL.NEHALEM index ca9ff252d..353514449 100644 --- a/kernel/x86_64/KERNEL.NEHALEM +++ b/kernel/x86_64/KERNEL.NEHALEM @@ 
-1,3 +1,6 @@ +#DSYMV_U_KERNEL = dsymv_U.c +SSYMV_U_KERNEL = ssymv_U.c + SGEMVNKERNEL = sgemv_n.c SGEMVTKERNEL = sgemv_t.c diff --git a/kernel/x86_64/ssymv_U.c b/kernel/x86_64/ssymv_U.c index 75b8e2c3e..61127aa3d 100644 --- a/kernel/x86_64/ssymv_U.c +++ b/kernel/x86_64/ssymv_U.c @@ -31,41 +31,94 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if defined(BULLDOZER) #include "ssymv_U_microk_bulldozer-2.c" +#elif defined(NEHALEM) +#include "ssymv_U_microk_nehalem-2.c" #endif +#ifndef HAVE_KERNEL_4x4 -#ifndef HAVE_KERNEL_16x2 - -static void ssymv_kernel_16x2(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *xp, FLOAT *yp, FLOAT *temp1, FLOAT *temp2) +static void ssymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT *a3, FLOAT *xp, FLOAT *yp, FLOAT *temp1, FLOAT *temp2) { FLOAT at0,at1,at2,at3; - FLOAT tmp2[2] = { 0.0, 0.0 }; + FLOAT x; + FLOAT tmp2[4] = { 0.0, 0.0, 0.0, 0.0 }; FLOAT tp0; FLOAT tp1; + FLOAT tp2; + FLOAT tp3; BLASLONG i; tp0 = temp1[0]; tp1 = temp1[1]; + tp2 = temp1[2]; + tp3 = temp1[3]; - for (i=0; i Date: Tue, 19 Aug 2014 19:25:03 +0200 Subject: [PATCH 006/119] updated optimized ssymv_U for bulldozer --- kernel/x86_64/ssymv_U_microk_bulldozer-2.c | 97 ++++++++++------------ 1 file changed, 46 insertions(+), 51 deletions(-) diff --git a/kernel/x86_64/ssymv_U_microk_bulldozer-2.c b/kernel/x86_64/ssymv_U_microk_bulldozer-2.c index b0b0bed65..b8b3b73e9 100644 --- a/kernel/x86_64/ssymv_U_microk_bulldozer-2.c +++ b/kernel/x86_64/ssymv_U_microk_bulldozer-2.c @@ -25,10 +25,10 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*****************************************************************************/ -#define HAVE_KERNEL_16x2 1 -static void ssymv_kernel_16x2( BLASLONG n, FLOAT *a1, FLOAT *a2, FLOAT *x, FLOAT *y, FLOAT *temp1, FLOAT *temp2) __attribute__ ((noinline)); +#define HAVE_KERNEL_4x4 1 +static void ssymv_kernel_4x4( BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT *a3, FLOAT *x, FLOAT *y, FLOAT *temp1, FLOAT *temp2) __attribute__ ((noinline)); -static void ssymv_kernel_16x2(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *x, FLOAT *y, FLOAT *temp1, FLOAT *temp2) +static void ssymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT *a3, FLOAT *x, FLOAT *y, FLOAT *temp1, FLOAT *temp2) { BLASLONG register i = 0; @@ -37,64 +37,57 @@ static void ssymv_kernel_16x2(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *x, FLOAT ( "vxorps %%xmm0 , %%xmm0 , %%xmm0 \n\t" // temp2[0] "vxorps %%xmm1 , %%xmm1 , %%xmm1 \n\t" // temp2[1] - "vbroadcastss (%6), %%xmm2 \n\t" // temp1[0] - "vbroadcastss 4(%6), %%xmm3 \n\t" // temp1[1] + "vxorps %%xmm2 , %%xmm2 , %%xmm2 \n\t" // temp2[2] + "vxorps %%xmm3 , %%xmm3 , %%xmm3 \n\t" // temp2[3] + "vbroadcastss (%8), %%xmm4 \n\t" // temp1[0] + "vbroadcastss 4(%8), %%xmm5 \n\t" // temp1[1] + "vbroadcastss 8(%8), %%xmm6 \n\t" // temp1[1] + "vbroadcastss 12(%8), %%xmm7 \n\t" // temp1[1] "xorq %0,%0 \n\t" - ".align 16 \n\t" - ".L01LOOP%=: \n\t" + ".align 16 \n\t" + ".L01LOOP%=: \n\t" - "prefetcht0 192(%4,%0,4) \n\t" - "vmovups (%4,%0,4), %%xmm4 \n\t" // 2 * a0 - "vmovups 16(%4,%0,4), %%xmm5 \n\t" // 2 * a0 - "prefetcht0 192(%2,%0,4) \n\t" - "vmovups (%2,%0,4), %%xmm8 \n\t" // 2 * x - "vmovups 16(%2,%0,4), %%xmm9 \n\t" // 2 * x - "prefetcht0 192(%3,%0,4) \n\t" - "vmovups 32(%4,%0,4), %%xmm6 \n\t" // 2 * a0 - "vmovups 48(%4,%0,4), %%xmm7 \n\t" // 2 * a0 - "vmovups 32(%2,%0,4), %%xmm10 \n\t" // 2 * x - "vmovups 48(%2,%0,4), %%xmm11 \n\t" // 2 * x + "vmovups (%2,%0,4), %%xmm8 \n\t" // 4 * x + "vmovups (%3,%0,4), %%xmm9 \n\t" // 4 * y - "prefetcht0 192(%5,%0,4) \n\t" - 
"vfmaddps (%3,%0,4), %%xmm2 , %%xmm4 , %%xmm12 \n\t" // y += temp1 * a0 - "vfmaddps %%xmm0 , %%xmm8 , %%xmm4 , %%xmm0 \n\t" // temp2 += a0 * x - "vfmaddps 16(%3,%0,4), %%xmm2 , %%xmm5 , %%xmm13 \n\t" // y += temp1 * a0 - "vmovups (%5,%0,4), %%xmm4 \n\t" // 2 * a1 - "vfmaddps %%xmm0 , %%xmm9 , %%xmm5 , %%xmm0 \n\t" // temp2 += a0 * x - "vfmaddps 32(%3,%0,4), %%xmm2 , %%xmm6 , %%xmm14 \n\t" // y += temp1 * a0 - "vmovups 16(%5,%0,4), %%xmm5 \n\t" // 2 * a1 - "vfmaddps %%xmm0 , %%xmm10, %%xmm6 , %%xmm0 \n\t" // temp2 += a0 * x - "vfmaddps 48(%3,%0,4), %%xmm2 , %%xmm7 , %%xmm15 \n\t" // y += temp1 * a0 - "vmovups 32(%5,%0,4), %%xmm6 \n\t" // 2 * a1 - "vfmaddps %%xmm0 , %%xmm11, %%xmm7 , %%xmm0 \n\t" // temp2 += a0 * x - "vmovups 48(%5,%0,4), %%xmm7 \n\t" // 2 * a1 + "vmovups (%4,%0,4), %%xmm12 \n\t" // 4 * a + "vmovups (%5,%0,4), %%xmm13 \n\t" // 4 * a - "vfmaddps %%xmm12, %%xmm3 , %%xmm4 , %%xmm12 \n\t" // y += temp1 * a1 - "vfmaddps %%xmm13, %%xmm3 , %%xmm5 , %%xmm13 \n\t" // y += temp1 * a1 - "vmovups %%xmm12, (%3,%0,4) \n\t" // 2 * y - "vfmaddps %%xmm14, %%xmm3 , %%xmm6 , %%xmm14 \n\t" // y += temp1 * a1 - "vmovups %%xmm13, 16(%3,%0,4) \n\t" // 2 * y - "vfmaddps %%xmm15, %%xmm3 , %%xmm7 , %%xmm15 \n\t" // y += temp1 * a1 - "vmovups %%xmm14, 32(%3,%0,4) \n\t" // 2 * y + "vfmaddps %%xmm0 , %%xmm8, %%xmm12 , %%xmm0 \n\t" // temp2 += x * a + "vfmaddps %%xmm9 , %%xmm4, %%xmm12 , %%xmm9 \n\t" // y += temp1 * a - "vfmaddps %%xmm1 , %%xmm8 , %%xmm4 , %%xmm1 \n\t" // temp2 += a1 * x - "vfmaddps %%xmm1 , %%xmm9 , %%xmm5 , %%xmm1 \n\t" // temp2 += a1 * x - "vmovups %%xmm15, 48(%3,%0,4) \n\t" // 2 * y - "vfmaddps %%xmm1 , %%xmm10, %%xmm6 , %%xmm1 \n\t" // temp2 += a1 * x - "vfmaddps %%xmm1 , %%xmm11, %%xmm7 , %%xmm1 \n\t" // temp2 += a1 * x + "vfmaddps %%xmm1 , %%xmm8, %%xmm13 , %%xmm1 \n\t" // temp2 += x * a + "vmovups (%6,%0,4), %%xmm14 \n\t" // 4 * a + "vfmaddps %%xmm9 , %%xmm5, %%xmm13 , %%xmm9 \n\t" // y += temp1 * a - "addq $16, %0 \n\t" - "subq $16, %1 \n\t" + "vfmaddps 
%%xmm2 , %%xmm8, %%xmm14 , %%xmm2 \n\t" // temp2 += x * a + "vmovups (%7,%0,4), %%xmm15 \n\t" // 4 * a + "vfmaddps %%xmm9 , %%xmm6, %%xmm14 , %%xmm9 \n\t" // y += temp1 * a + + "vfmaddps %%xmm3 , %%xmm8, %%xmm15 , %%xmm3 \n\t" // temp2 += x * a + "vfmaddps %%xmm9 , %%xmm7, %%xmm15 , %%xmm9 \n\t" // y += temp1 * a + + "vmovups %%xmm9 , (%3,%0,4) \n\t" + + "addq $4 , %0 \n\t" + "subq $4 , %1 \n\t" "jnz .L01LOOP%= \n\t" "vhaddps %%xmm0, %%xmm0, %%xmm0 \n\t" "vhaddps %%xmm1, %%xmm1, %%xmm1 \n\t" + "vhaddps %%xmm2, %%xmm2, %%xmm2 \n\t" + "vhaddps %%xmm3, %%xmm3, %%xmm3 \n\t" "vhaddps %%xmm0, %%xmm0, %%xmm0 \n\t" "vhaddps %%xmm1, %%xmm1, %%xmm1 \n\t" - "vmovss %%xmm0 , (%7) \n\t" // save temp2 - "vmovss %%xmm1 ,4(%7) \n\t" // save temp2 + "vhaddps %%xmm2, %%xmm2, %%xmm2 \n\t" + "vhaddps %%xmm3, %%xmm3, %%xmm3 \n\t" + + "vmovss %%xmm0 , (%9) \n\t" // save temp2 + "vmovss %%xmm1 , 4(%9) \n\t" // save temp2 + "vmovss %%xmm2 , 8(%9) \n\t" // save temp2 + "vmovss %%xmm3 ,12(%9) \n\t" // save temp2 : : @@ -102,10 +95,12 @@ static void ssymv_kernel_16x2(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *x, FLOAT "r" (n), // 1 "r" (x), // 2 "r" (y), // 3 - "r" (a0), // 4 - "r" (a1), // 5 - "r" (temp1), // 6 - "r" (temp2) // 7 + "r" (a0), // 4 + "r" (a1), // 5 + "r" (a2), // 6 + "r" (a3), // 7 + "r" (temp1), // 8 + "r" (temp2) // 9 : "cc", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", From ef6374196d0fbb69d0720c973abad9ef39a89253 Mon Sep 17 00:00:00 2001 From: wernsaar Date: Wed, 20 Aug 2014 09:00:56 +0200 Subject: [PATCH 007/119] updated optimized dsymv_U kernel for bulldozer --- kernel/x86_64/dsymv_U.c | 135 +++++++++++++++------ kernel/x86_64/dsymv_U_microk_bulldozer-2.c | 115 ++++++++++-------- 2 files changed, 164 insertions(+), 86 deletions(-) diff --git a/kernel/x86_64/dsymv_U.c b/kernel/x86_64/dsymv_U.c index 1f22abe8d..267755c2f 100644 --- a/kernel/x86_64/dsymv_U.c +++ b/kernel/x86_64/dsymv_U.c @@ -28,43 +28,97 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF 
THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" + #if defined(BULLDOZER) #include "dsymv_U_microk_bulldozer-2.c" +#elif defined(NEHALEM) +#include "dsymv_U_microk_nehalem-2.c" #endif +#ifndef HAVE_KERNEL_4x4 -#ifndef HAVE_KERNEL_8x2 - -static void dsymv_kernel_8x2(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *xp, FLOAT *yp, FLOAT *temp1, FLOAT *temp2) +static void dsymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT *a3, FLOAT *xp, FLOAT *yp, FLOAT *temp1, FLOAT *temp2) { FLOAT at0,at1,at2,at3; - FLOAT tmp2[2] = { 0.0, 0.0 }; + FLOAT x; + FLOAT tmp2[4] = { 0.0, 0.0, 0.0, 0.0 }; FLOAT tp0; FLOAT tp1; + FLOAT tp2; + FLOAT tp3; BLASLONG i; tp0 = temp1[0]; tp1 = temp1[1]; + tp2 = temp1[2]; + tp3 = temp1[3]; - for (i=0; i Date: Wed, 20 Aug 2014 09:58:04 +0200 Subject: [PATCH 008/119] added optimized dsymv_U kernel for nehalem --- kernel/x86_64/KERNEL.NEHALEM | 2 +- kernel/x86_64/dsymv_U_microk_nehalem-2.c | 125 +++++++++++++++++++++++ 2 files changed, 126 insertions(+), 1 deletion(-) create mode 100644 kernel/x86_64/dsymv_U_microk_nehalem-2.c diff --git a/kernel/x86_64/KERNEL.NEHALEM b/kernel/x86_64/KERNEL.NEHALEM index 353514449..35be648a4 100644 --- a/kernel/x86_64/KERNEL.NEHALEM +++ b/kernel/x86_64/KERNEL.NEHALEM @@ -1,4 +1,4 @@ -#DSYMV_U_KERNEL = dsymv_U.c +DSYMV_U_KERNEL = dsymv_U.c SSYMV_U_KERNEL = ssymv_U.c SGEMVNKERNEL = sgemv_n.c diff --git a/kernel/x86_64/dsymv_U_microk_nehalem-2.c b/kernel/x86_64/dsymv_U_microk_nehalem-2.c new file mode 100644 index 000000000..6aab57500 --- /dev/null +++ b/kernel/x86_64/dsymv_U_microk_nehalem-2.c @@ -0,0 +1,125 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. 
Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#define HAVE_KERNEL_4x4 1 +static void dsymv_kernel_4x4( BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT *a3, FLOAT *x, FLOAT *y, FLOAT *temp1, FLOAT *temp2) __attribute__ ((noinline)); + +static void dsymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT *a3, FLOAT *x, FLOAT *y, FLOAT *temp1, FLOAT *temp2) +{ + + BLASLONG register i = 0; + + __asm__ __volatile__ + ( + "xorpd %%xmm0 , %%xmm0 \n\t" // temp2[0] + "xorpd %%xmm1 , %%xmm1 \n\t" // temp2[1] + "xorpd %%xmm2 , %%xmm2 \n\t" // temp2[2] + "xorpd %%xmm3 , %%xmm3 \n\t" // temp2[3] + "movsd (%8), %%xmm4 \n\t" // temp1[0] + "movsd 8(%8), %%xmm5 \n\t" // temp1[1] + "movsd 16(%8), %%xmm6 \n\t" // temp1[2] + "movsd 24(%8), %%xmm7 \n\t" // temp1[3] + "shufpd $0, %%xmm4, %%xmm4 \n\t" + "shufpd $0, %%xmm5, %%xmm5 \n\t" + "shufpd $0, %%xmm6, %%xmm6 \n\t" + "shufpd $0, %%xmm7, %%xmm7 \n\t" + + "xorq %0,%0 \n\t" + + ".align 16 \n\t" + ".L01LOOP%=: \n\t" + "movups (%4,%0,8), %%xmm12 \n\t" // 2 * a + "movups (%2,%0,8), %%xmm8 \n\t" // 2 * x + "movups %%xmm12 , %%xmm11 \n\t" + "movups (%3,%0,8), %%xmm9 \n\t" // 2 * y + "movups (%5,%0,8), %%xmm13 \n\t" // 2 * a + + "mulpd %%xmm4 , %%xmm11 \n\t" // temp1 * a + "addpd %%xmm11 , %%xmm9 \n\t" // y += temp1 * a + "mulpd %%xmm8 , %%xmm12 \n\t" // a * x + "addpd %%xmm12 , %%xmm0 \n\t" // temp2 += x * a + + "movups (%6,%0,8), %%xmm14 \n\t" // 2 * a + "movups (%7,%0,8), %%xmm15 \n\t" // 2 * a + + "movups %%xmm13 , %%xmm11 \n\t" + "mulpd %%xmm5 , %%xmm11 \n\t" // temp1 * a + "addpd %%xmm11 , %%xmm9 \n\t" // y += temp1 * a + "mulpd %%xmm8 , %%xmm13 \n\t" // a * x + "addpd %%xmm13 , %%xmm1 \n\t" // temp2 += x * a + + "movups %%xmm14 , %%xmm11 \n\t" + "mulpd %%xmm6 , %%xmm11 \n\t" // temp1 * a + "addpd %%xmm11 , %%xmm9 \n\t" // y += temp1 * a + "mulpd %%xmm8 , %%xmm14 \n\t" // a * x + "addpd %%xmm14 , %%xmm2 \n\t" // temp2 += x * a + + "addq $2 , %0 \n\t" + "movups %%xmm15 , %%xmm11 \n\t" + 
"mulpd %%xmm7 , %%xmm11 \n\t" // temp1 * a + "addpd %%xmm11 , %%xmm9 \n\t" // y += temp1 * a + "mulpd %%xmm8 , %%xmm15 \n\t" // a * x + "addpd %%xmm15 , %%xmm3 \n\t" // temp2 += x * a + + "movups %%xmm9,-16(%3,%0,8) \n\t" // 2 * y + + "subq $2 , %1 \n\t" + "jnz .L01LOOP%= \n\t" + + "haddpd %%xmm0, %%xmm0 \n\t" + "haddpd %%xmm1, %%xmm1 \n\t" + "haddpd %%xmm2, %%xmm2 \n\t" + "haddpd %%xmm3, %%xmm3 \n\t" + + "movsd %%xmm0 , (%9) \n\t" // save temp2 + "movsd %%xmm1 , 8(%9) \n\t" // save temp2 + "movsd %%xmm2 , 16(%9) \n\t" // save temp2 + "movsd %%xmm3 , 24(%9) \n\t" // save temp2 + + : + : + "r" (i), // 0 + "r" (n), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (a0), // 4 + "r" (a1), // 5 + "r" (a2), // 6 + "r" (a3), // 7 + "r" (temp1), // 8 + "r" (temp2) // 9 + : "cc", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + +} + + From f6f9122660ee8be175b61dd91abc1a86e9bb4a7e Mon Sep 17 00:00:00 2001 From: wernsaar Date: Thu, 21 Aug 2014 13:02:53 +0200 Subject: [PATCH 009/119] added optimized dsymv_L kernel for bulldozer --- kernel/x86_64/dsymv_L.c | 299 +++++++++++++++++++++ kernel/x86_64/dsymv_L_microk_bulldozer-2.c | 137 ++++++++++ 2 files changed, 436 insertions(+) create mode 100644 kernel/x86_64/dsymv_L.c create mode 100644 kernel/x86_64/dsymv_L_microk_bulldozer-2.c diff --git a/kernel/x86_64/dsymv_L.c b/kernel/x86_64/dsymv_L.c new file mode 100644 index 000000000..29e3aa2f8 --- /dev/null +++ b/kernel/x86_64/dsymv_L.c @@ -0,0 +1,299 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. 
+2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + + +#include "common.h" + +#if defined(BULLDOZER) +#include "dsymv_L_microk_bulldozer-2.c" +#elif defined(NEHALEM) +#include "dsymv_U_microk_nehalem-2.c" +#endif + + +#ifndef HAVE_KERNEL_4x4 + +static void dsymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *tmp1, FLOAT *temp2) +{ + FLOAT tmp2[4] = { 0.0, 0.0, 0.0, 0.0 }; + BLASLONG i; + + for (i=from; i=12 ) + { + BLASLONG m2 = (m/4)*4; + for (i=j+1; i j+4 ) + dsymv_kernel_4x4(j+4,m2,ap,x,y,tmp1,tmp2); + + + for (i=m2; i=8 ) + { + BLASLONG j1 = ((from + 4)/4)*4; + BLASLONG j2 = (m/4)*4; + for (i=from; i Date: Thu, 21 Aug 2014 13:32:06 +0200 Subject: [PATCH 010/119] added optimized ssymv_L kernel for bulldozer --- kernel/x86_64/ssymv_L.c | 299 +++++++++++++++++++++ kernel/x86_64/ssymv_L_microk_bulldozer-2.c | 122 +++++++++ 2 files changed, 421 insertions(+) create mode 100644 kernel/x86_64/ssymv_L.c create mode 100644 kernel/x86_64/ssymv_L_microk_bulldozer-2.c diff --git a/kernel/x86_64/ssymv_L.c b/kernel/x86_64/ssymv_L.c new file mode 100644 index 000000000..352f1c862 --- /dev/null +++ b/kernel/x86_64/ssymv_L.c @@ -0,0 +1,299 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. 
Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + + +#include "common.h" + +#if defined(BULLDOZER) +#include "ssymv_L_microk_bulldozer-2.c" +#elif defined(NEHALEM) +#include "ssymv_U_microk_nehalem-2.c" +#endif + + +#ifndef HAVE_KERNEL_4x4 + +static void ssymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *tmp1, FLOAT *temp2) +{ + FLOAT tmp2[4] = { 0.0, 0.0, 0.0, 0.0 }; + BLASLONG i; + + for (i=from; i=12 ) + { + BLASLONG m2 = (m/4)*4; + for (i=j+1; i j+4 ) + ssymv_kernel_4x4(j+4,m2,ap,x,y,tmp1,tmp2); + + + for (i=m2; i=8 ) + { + BLASLONG j1 = ((from + 4)/4)*4; + BLASLONG j2 = (m/4)*4; + for (i=from; i Date: Thu, 21 Aug 2014 14:27:00 +0200 Subject: [PATCH 011/119] added optimized symv_L kernels for nehalem --- kernel/x86_64/KERNEL.BULLDOZER | 2 + kernel/x86_64/KERNEL.NEHALEM | 2 + kernel/x86_64/dsymv_L.c | 2 +- kernel/x86_64/dsymv_L_microk_nehalem-2.c | 132 ++++++++++++++++++++++ kernel/x86_64/ssymv_L.c | 2 +- kernel/x86_64/ssymv_L_microk_nehalem-2.c | 137 
+++++++++++++++++++++++ 6 files changed, 275 insertions(+), 2 deletions(-) create mode 100644 kernel/x86_64/dsymv_L_microk_nehalem-2.c create mode 100644 kernel/x86_64/ssymv_L_microk_nehalem-2.c diff --git a/kernel/x86_64/KERNEL.BULLDOZER b/kernel/x86_64/KERNEL.BULLDOZER index a078528cd..3ee1978b8 100644 --- a/kernel/x86_64/KERNEL.BULLDOZER +++ b/kernel/x86_64/KERNEL.BULLDOZER @@ -1,5 +1,7 @@ DSYMV_U_KERNEL = dsymv_U.c +DSYMV_L_KERNEL = dsymv_L.c SSYMV_U_KERNEL = ssymv_U.c +SSYMV_L_KERNEL = ssymv_L.c SGEMVNKERNEL = sgemv_n.c SGEMVTKERNEL = sgemv_t.c diff --git a/kernel/x86_64/KERNEL.NEHALEM b/kernel/x86_64/KERNEL.NEHALEM index 35be648a4..b16fd9c49 100644 --- a/kernel/x86_64/KERNEL.NEHALEM +++ b/kernel/x86_64/KERNEL.NEHALEM @@ -1,5 +1,7 @@ DSYMV_U_KERNEL = dsymv_U.c +DSYMV_L_KERNEL = dsymv_L.c SSYMV_U_KERNEL = ssymv_U.c +SSYMV_L_KERNEL = ssymv_L.c SGEMVNKERNEL = sgemv_n.c SGEMVTKERNEL = sgemv_t.c diff --git a/kernel/x86_64/dsymv_L.c b/kernel/x86_64/dsymv_L.c index 29e3aa2f8..8d1337746 100644 --- a/kernel/x86_64/dsymv_L.c +++ b/kernel/x86_64/dsymv_L.c @@ -31,7 +31,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if defined(BULLDOZER) #include "dsymv_L_microk_bulldozer-2.c" #elif defined(NEHALEM) -#include "dsymv_U_microk_nehalem-2.c" +#include "dsymv_L_microk_nehalem-2.c" #endif diff --git a/kernel/x86_64/dsymv_L_microk_nehalem-2.c b/kernel/x86_64/dsymv_L_microk_nehalem-2.c new file mode 100644 index 000000000..3ba596c5e --- /dev/null +++ b/kernel/x86_64/dsymv_L_microk_nehalem-2.c @@ -0,0 +1,132 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#define HAVE_KERNEL_4x4 1 +static void dsymv_kernel_4x4( BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FLOAT *y, FLOAT *temp1, FLOAT *temp2) __attribute__ ((noinline)); + +static void dsymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FLOAT *y, FLOAT *temp1, FLOAT *temp2) +{ + + + __asm__ __volatile__ + ( + "xorpd %%xmm0 , %%xmm0 \n\t" // temp2[0] + "xorpd %%xmm1 , %%xmm1 \n\t" // temp2[1] + "xorpd %%xmm2 , %%xmm2 \n\t" // temp2[2] + "xorpd %%xmm3 , %%xmm3 \n\t" // temp2[3] + "movsd (%8), %%xmm4 \n\t" // temp1[0] + "movsd 8(%8), %%xmm5 \n\t" // temp1[1] + "movsd 16(%8), %%xmm6 \n\t" // temp1[2] + "movsd 24(%8), %%xmm7 \n\t" // temp1[3] + "shufpd $0, %%xmm4, %%xmm4 \n\t" + "shufpd $0, %%xmm5, %%xmm5 \n\t" + "shufpd $0, %%xmm6, %%xmm6 \n\t" + "shufpd $0, %%xmm7, %%xmm7 \n\t" + + ".align 16 \n\t" + ".L01LOOP%=: \n\t" + "movups (%4,%0,8), %%xmm12 \n\t" // 2 * a + "movups (%2,%0,8), %%xmm8 \n\t" // 2 * x + "movups %%xmm12 , %%xmm11 \n\t" + "movups (%3,%0,8), %%xmm9 \n\t" // 2 * y + "movups (%5,%0,8), %%xmm13 \n\t" // 2 * a + + "mulpd %%xmm4 , %%xmm11 \n\t" // temp1 * a + "addpd %%xmm11 , %%xmm9 \n\t" // y += temp1 * a + "mulpd %%xmm8 , %%xmm12 \n\t" // a * x + "addpd %%xmm12 , %%xmm0 \n\t" // temp2 += x * a + + "movups (%6,%0,8), %%xmm14 \n\t" // 2 * a + "movups (%7,%0,8), %%xmm15 \n\t" // 2 * a + + "movups %%xmm13 , %%xmm11 \n\t" + "mulpd %%xmm5 , %%xmm11 \n\t" // temp1 * a + "addpd %%xmm11 , %%xmm9 \n\t" // y += temp1 * a + "mulpd %%xmm8 , %%xmm13 \n\t" // a * x + "addpd %%xmm13 , %%xmm1 \n\t" // temp2 += x * a + + "movups %%xmm14 , %%xmm11 \n\t" + "mulpd %%xmm6 , %%xmm11 \n\t" // temp1 * a + "addpd %%xmm11 , %%xmm9 \n\t" // y += temp1 * a + "mulpd %%xmm8 , %%xmm14 \n\t" // a * x + "addpd %%xmm14 , %%xmm2 \n\t" // temp2 += x * a + + "addq $2 , %0 \n\t" + "movups %%xmm15 , %%xmm11 \n\t" + "mulpd %%xmm7 , %%xmm11 \n\t" // temp1 * a + "addpd %%xmm11 , %%xmm9 \n\t" // y += 
temp1 * a + "mulpd %%xmm8 , %%xmm15 \n\t" // a * x + "addpd %%xmm15 , %%xmm3 \n\t" // temp2 += x * a + + "movups %%xmm9,-16(%3,%0,8) \n\t" // 2 * y + + "cmpq %0 , %1 \n\t" + "jnz .L01LOOP%= \n\t" + + "movsd (%9), %%xmm4 \n\t" // temp1[0] + "movsd 8(%9), %%xmm5 \n\t" // temp1[1] + "movsd 16(%9), %%xmm6 \n\t" // temp1[2] + "movsd 24(%9), %%xmm7 \n\t" // temp1[3] + + "haddpd %%xmm0, %%xmm0 \n\t" + "haddpd %%xmm1, %%xmm1 \n\t" + "haddpd %%xmm2, %%xmm2 \n\t" + "haddpd %%xmm3, %%xmm3 \n\t" + + "addsd %%xmm4, %%xmm0 \n\t" + "addsd %%xmm5, %%xmm1 \n\t" + "addsd %%xmm6, %%xmm2 \n\t" + "addsd %%xmm7, %%xmm3 \n\t" + + "movsd %%xmm0 , (%9) \n\t" // save temp2 + "movsd %%xmm1 , 8(%9) \n\t" // save temp2 + "movsd %%xmm2 , 16(%9) \n\t" // save temp2 + "movsd %%xmm3 , 24(%9) \n\t" // save temp2 + + : + : + "r" (from), // 0 + "r" (to), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (a[0]), // 4 + "r" (a[1]), // 5 + "r" (a[2]), // 6 + "r" (a[3]), // 7 + "r" (temp1), // 8 + "r" (temp2) // 9 + : "cc", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + +} + + diff --git a/kernel/x86_64/ssymv_L.c b/kernel/x86_64/ssymv_L.c index 352f1c862..096adc6ca 100644 --- a/kernel/x86_64/ssymv_L.c +++ b/kernel/x86_64/ssymv_L.c @@ -31,7 +31,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if defined(BULLDOZER) #include "ssymv_L_microk_bulldozer-2.c" #elif defined(NEHALEM) -#include "ssymv_U_microk_nehalem-2.c" +#include "ssymv_L_microk_nehalem-2.c" #endif diff --git a/kernel/x86_64/ssymv_L_microk_nehalem-2.c b/kernel/x86_64/ssymv_L_microk_nehalem-2.c new file mode 100644 index 000000000..a1c62caf6 --- /dev/null +++ b/kernel/x86_64/ssymv_L_microk_nehalem-2.c @@ -0,0 +1,137 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. 
+Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#define HAVE_KERNEL_4x4 1 +static void ssymv_kernel_4x4( BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FLOAT *y, FLOAT *temp1, FLOAT *temp2) __attribute__ ((noinline)); + +static void ssymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FLOAT *y, FLOAT *temp1, FLOAT *temp2) +{ + + + __asm__ __volatile__ + ( + "xorps %%xmm0 , %%xmm0 \n\t" // temp2[0] + "xorps %%xmm1 , %%xmm1 \n\t" // temp2[1] + "xorps %%xmm2 , %%xmm2 \n\t" // temp2[2] + "xorps %%xmm3 , %%xmm3 \n\t" // temp2[3] + "movss (%8), %%xmm4 \n\t" // temp1[0] + "movss 4(%8), %%xmm5 \n\t" // temp1[1] + "movss 8(%8), %%xmm6 \n\t" // temp1[2] + "movss 12(%8), %%xmm7 \n\t" // temp1[3] + "shufps $0, %%xmm4, %%xmm4 \n\t" + "shufps $0, %%xmm5, %%xmm5 \n\t" + "shufps $0, %%xmm6, %%xmm6 \n\t" + "shufps $0, %%xmm7, %%xmm7 \n\t" + + ".align 16 \n\t" + ".L01LOOP%=: \n\t" + "movups (%2,%0,4), %%xmm8 \n\t" // 4 * x + "movups (%3,%0,4), %%xmm9 \n\t" // 4 * y + + "movups (%4,%0,4), %%xmm12 \n\t" // 4 * a + "movups (%5,%0,4), %%xmm13 \n\t" // 4 * a + + "movups %%xmm12 , %%xmm11 \n\t" + "mulps %%xmm4 , %%xmm11 \n\t" // temp1 * a + "addps %%xmm11 , %%xmm9 \n\t" // y += temp1 * a + "mulps %%xmm8 , %%xmm12 \n\t" // a * x + "addps %%xmm12 , %%xmm0 \n\t" // temp2 += x * a + + "movups (%6,%0,4), %%xmm14 \n\t" // 4 * a + "movups (%7,%0,4), %%xmm15 \n\t" // 4 * a + + "movups %%xmm13 , %%xmm11 \n\t" + "mulps %%xmm5 , %%xmm11 \n\t" // temp1 * a + "addps %%xmm11 , %%xmm9 \n\t" // y += temp1 * a + "mulps %%xmm8 , %%xmm13 \n\t" // a * x + "addps %%xmm13 , %%xmm1 \n\t" // temp2 += x * a + + "movups %%xmm14 , %%xmm11 \n\t" + "mulps %%xmm6 , %%xmm11 \n\t" // temp1 * a + "addps %%xmm11 , %%xmm9 \n\t" // y += temp1 * a + "mulps %%xmm8 , %%xmm14 \n\t" // a * x + "addps %%xmm14 , %%xmm2 \n\t" // temp2 += x * a + + "movups %%xmm15 , %%xmm11 \n\t" + "mulps %%xmm7 , %%xmm11 \n\t" // temp1 * a + "addps %%xmm11 , %%xmm9 \n\t" // y += temp1 * a + "mulps 
%%xmm8 , %%xmm15 \n\t" // a * x + "addps %%xmm15 , %%xmm3 \n\t" // temp2 += x * a + + "movups %%xmm9, (%3,%0,4) \n\t" // 4 * y + + "addq $4 , %0 \n\t" + "cmpq %0 , %1 \n\t" + "jnz .L01LOOP%= \n\t" + + "movss (%9), %%xmm4 \n\t" // temp1[0] + "movss 4(%9), %%xmm5 \n\t" // temp1[1] + "movss 8(%9), %%xmm6 \n\t" // temp1[2] + "movss 12(%9), %%xmm7 \n\t" // temp1[3] + + "haddps %%xmm0, %%xmm0 \n\t" + "haddps %%xmm1, %%xmm1 \n\t" + "haddps %%xmm2, %%xmm2 \n\t" + "haddps %%xmm3, %%xmm3 \n\t" + "haddps %%xmm0, %%xmm0 \n\t" + "haddps %%xmm1, %%xmm1 \n\t" + "haddps %%xmm2, %%xmm2 \n\t" + "haddps %%xmm3, %%xmm3 \n\t" + + "addss %%xmm4, %%xmm0 \n\t" + "addss %%xmm5, %%xmm1 \n\t" + "addss %%xmm6, %%xmm2 \n\t" + "addss %%xmm7, %%xmm3 \n\t" + + "movss %%xmm0 , (%9) \n\t" // save temp2 + "movss %%xmm1 , 4(%9) \n\t" // save temp2 + "movss %%xmm2 , 8(%9) \n\t" // save temp2 + "movss %%xmm3 , 12(%9) \n\t" // save temp2 + + : + : + "r" (from), // 0 + "r" (to), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (a[0]), // 4 + "r" (a[1]), // 5 + "r" (a[2]), // 6 + "r" (a[3]), // 7 + "r" (temp1), // 8 + "r" (temp2) // 9 + : "cc", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + +} + + From 6f73ffc114a6d56b85c3023f6e8f7856e4198aec Mon Sep 17 00:00:00 2001 From: wernsaar Date: Thu, 21 Aug 2014 19:33:57 +0200 Subject: [PATCH 012/119] added benchmarks for csymv and zsymv --- benchmark/Makefile | 40 ++++++++++++++++++++++++++++++++++++---- 1 file changed, 36 insertions(+), 4 deletions(-) diff --git a/benchmark/Makefile b/benchmark/Makefile index de94dcc59..3b2d263c8 100644 --- a/benchmark/Makefile +++ b/benchmark/Makefile @@ -35,7 +35,7 @@ goto :: slinpack.goto dlinpack.goto clinpack.goto zlinpack.goto \ ssyrk.goto dsyrk.goto csyrk.goto zsyrk.goto \ ssyr2k.goto dsyr2k.goto csyr2k.goto zsyr2k.goto \ sger.goto dger.goto \ - ssymv.goto dsymv.goto \ + ssymv.goto dsymv.goto 
csymv.goto zsymv.goto \ chemm.goto zhemm.goto \ cherk.goto zherk.goto \ cher2k.goto zher2k.goto \ @@ -53,7 +53,7 @@ acml :: slinpack.acml dlinpack.acml clinpack.acml zlinpack.acml \ ssyrk.acml dsyrk.acml csyrk.acml zsyrk.acml \ ssyr2k.acml dsyr2k.acml csyr2k.acml zsyr2k.acml \ sger.acml dger.acml \ - ssymv.acml dsymv.acml \ + ssymv.acml dsymv.acml csymv.acml zsymv.acml \ chemm.acml zhemm.acml \ cherk.acml zherk.acml \ cher2k.acml zher2k.acml \ @@ -71,7 +71,7 @@ atlas :: slinpack.atlas dlinpack.atlas clinpack.atlas zlinpack.atlas \ ssyrk.atlas dsyrk.atlas csyrk.atlas zsyrk.atlas \ ssyr2k.atlas dsyr2k.atlas csyr2k.atlas zsyr2k.atlas \ sger.atlas dger.atlas \ - ssymv.atlas dsymv.atlas \ + ssymv.atlas dsymv.atlas csymv.atlas zsymv.atlas \ chemm.acml zhemm.acml \ chemm.atlas zhemm.atlas \ cherk.atlas zherk.atlas \ @@ -90,7 +90,7 @@ mkl :: slinpack.mkl dlinpack.mkl clinpack.mkl zlinpack.mkl \ ssyrk.mkl dsyrk.mkl csyrk.mkl zsyrk.mkl \ ssyr2k.mkl dsyr2k.mkl csyr2k.mkl zsyr2k.mkl \ sger.mkl dger.mkl \ - ssymv.mkl dsymv.mkl \ + ssymv.mkl dsymv.mkl csymv.mkl zsymv.mkl \ chemm.mkl zhemm.mkl \ cherk.mkl zherk.mkl \ cher2k.mkl zher2k.mkl \ @@ -732,6 +732,32 @@ dsymv.atlas : dsymv.$(SUFFIX) dsymv.mkl : dsymv.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +##################################### Csymv #################################################### +csymv.goto : csymv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm + +csymv.acml : csymv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +csymv.atlas : csymv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +csymv.mkl : csymv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Dsymv #################################################### +zsymv.goto : zsymv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) 
-o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm + +zsymv.acml : zsymv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zsymv.atlas : zsymv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zsymv.mkl : zsymv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + ##################################### Sgeev #################################################### sgeev.goto : sgeev.$(SUFFIX) ../$(LIBNAME) $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm @@ -1037,6 +1063,12 @@ ssymv.$(SUFFIX) : symv.c dsymv.$(SUFFIX) : symv.c $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ +csymv.$(SUFFIX) : symv.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +zsymv.$(SUFFIX) : symv.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + sgeev.$(SUFFIX) : geev.c $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ From fb0b4552a5dd5e83434f159aaf4d44b8a0377b49 Mon Sep 17 00:00:00 2001 From: wernsaar Date: Fri, 22 Aug 2014 10:00:09 +0200 Subject: [PATCH 013/119] added hemv benchmark --- benchmark/Makefile | 38 +++++++++ benchmark/hemv.c | 208 +++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 246 insertions(+) create mode 100644 benchmark/hemv.c diff --git a/benchmark/Makefile b/benchmark/Makefile index 3b2d263c8..6c364ddfe 100644 --- a/benchmark/Makefile +++ b/benchmark/Makefile @@ -36,6 +36,7 @@ goto :: slinpack.goto dlinpack.goto clinpack.goto zlinpack.goto \ ssyr2k.goto dsyr2k.goto csyr2k.goto zsyr2k.goto \ sger.goto dger.goto \ ssymv.goto dsymv.goto csymv.goto zsymv.goto \ + chemv.goto zhemv.goto \ chemm.goto zhemm.goto \ cherk.goto zherk.goto \ cher2k.goto zher2k.goto \ @@ -54,6 +55,7 @@ acml :: slinpack.acml dlinpack.acml clinpack.acml zlinpack.acml \ ssyr2k.acml dsyr2k.acml csyr2k.acml zsyr2k.acml \ sger.acml dger.acml \ ssymv.acml dsymv.acml csymv.acml zsymv.acml \ + chemv.acml zhemv.acml \ chemm.acml zhemm.acml \ cherk.acml zherk.acml \ 
cher2k.acml zher2k.acml \ @@ -72,6 +74,7 @@ atlas :: slinpack.atlas dlinpack.atlas clinpack.atlas zlinpack.atlas \ ssyr2k.atlas dsyr2k.atlas csyr2k.atlas zsyr2k.atlas \ sger.atlas dger.atlas \ ssymv.atlas dsymv.atlas csymv.atlas zsymv.atlas \ + chemv.atlas zhemv.atlas \ chemm.acml zhemm.acml \ chemm.atlas zhemm.atlas \ cherk.atlas zherk.atlas \ @@ -91,6 +94,7 @@ mkl :: slinpack.mkl dlinpack.mkl clinpack.mkl zlinpack.mkl \ ssyr2k.mkl dsyr2k.mkl csyr2k.mkl zsyr2k.mkl \ sger.mkl dger.mkl \ ssymv.mkl dsymv.mkl csymv.mkl zsymv.mkl \ + chemv.mkl zhemv.mkl \ chemm.mkl zhemm.mkl \ cherk.mkl zherk.mkl \ cher2k.mkl zher2k.mkl \ @@ -922,6 +926,33 @@ zpotrf.atlas : zpotrf.$(SUFFIX) zpotrf.mkl : zpotrf.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +##################################### Chemv #################################################### + +chemv.goto : chemv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm + +chemv.acml : chemv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +chemv.atlas : chemv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +chemv.mkl : chemv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Zhemv #################################################### + +zhemv.goto : zhemv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm + +zhemv.acml : zhemv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zhemv.atlas : zhemv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zhemv.mkl : zhemv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) ################################################################################################### @@ -1105,6 +1136,13 @@ 
cpotrf.$(SUFFIX) : potrf.c zpotrf.$(SUFFIX) : potrf.c $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ +chemv.$(SUFFIX) : hemv.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +zhemv.$(SUFFIX) : hemv.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + + diff --git a/benchmark/hemv.c b/benchmark/hemv.c new file mode 100644 index 000000000..79b7679cc --- /dev/null +++ b/benchmark/hemv.c @@ -0,0 +1,208 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include +#include +#ifdef __CYGWIN32__ +#include +#endif +#include "common.h" + + +#undef HEMV + + +#ifdef DOUBLE +#define HEMV BLASFUNC(zhemv) +#else +#define HEMV BLASFUNC(chemv) +#endif + + +#if defined(__WIN32__) || defined(__WIN64__) + +#ifndef DELTA_EPOCH_IN_MICROSECS +#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL +#endif + +int gettimeofday(struct timeval *tv, void *tz){ + + FILETIME ft; + unsigned __int64 tmpres = 0; + static int tzflag; + + if (NULL != tv) + { + GetSystemTimeAsFileTime(&ft); + + tmpres |= ft.dwHighDateTime; + tmpres <<= 32; + tmpres |= ft.dwLowDateTime; + + /*converting file time to unix epoch*/ + tmpres /= 10; /*convert into microseconds*/ + tmpres -= DELTA_EPOCH_IN_MICROSECS; + tv->tv_sec = (long)(tmpres / 1000000UL); + tv->tv_usec = (long)(tmpres % 1000000UL); + } + + return 0; +} + +#endif + +#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 + +static void *huge_malloc(BLASLONG size){ + int shmid; + void *address; + +#ifndef SHM_HUGETLB +#define SHM_HUGETLB 04000 +#endif + + if ((shmid =shmget(IPC_PRIVATE, + (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), + SHM_HUGETLB | IPC_CREAT |0600)) < 0) { + printf( "Memory allocation failed(shmget).\n"); + exit(1); + } + + address = shmat(shmid, NULL, SHM_RND); + + if ((BLASLONG)address == -1){ + printf( "Memory allocation failed(shmat).\n"); + exit(1); + } + + 
shmctl(shmid, IPC_RMID, 0); + + return address; +} + +#define malloc huge_malloc + +#endif + +int MAIN__(int argc, char *argv[]){ + + FLOAT *a, *x, *y; + FLOAT alpha[] = {1.0, 1.0}; + FLOAT beta [] = {1.0, 1.0}; + char uplo='L'; + blasint m, i, j; + blasint inc_x=1,inc_y=1; + int loops = 1; + int l; + char *p; + + int from = 1; + int to = 200; + int step = 1; + + struct timeval start, stop; + double time1,timeg; + + argc--;argv++; + + if (argc > 0) { from = atol(*argv); argc--; argv++;} + if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} + if (argc > 0) { step = atol(*argv); argc--; argv++;} + + if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); + if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); + if ((p = getenv("OPENBLAS_INCY"))) inc_y = atoi(p); + if ((p = getenv("OPENBLAS_UPLO"))) uplo=*p; + + fprintf(stderr, "From : %3d To : %3d Step = %3d Uplo = '%c' Inc_x = %d Inc_y = %d Loops = %d\n", from, to, step,uplo,inc_x,inc_y,loops); + + if (( a = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + + if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + + if (( y = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + +#ifdef linux + srandom(getpid()); +#endif + + fprintf(stderr, " SIZE Flops\n"); + + for(m = from; m <= to; m += step) + { + + timeg=0; + + fprintf(stderr, " %6dx%d : ", (int)m,(int)m); + + for(j = 0; j < m; j++){ + for(i = 0; i < m * COMPSIZE; i++){ + a[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; + } + } + + + for (l=0; l Date: Fri, 22 Aug 2014 11:42:07 +0200 Subject: [PATCH 014/119] added sdot and ddot benchmarks --- benchmark/Makefile | 37 +++++++++ benchmark/dot.c | 195 +++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 232 insertions(+) create mode 100644 benchmark/dot.c diff --git 
a/benchmark/Makefile b/benchmark/Makefile index 6c364ddfe..1bc631aa4 100644 --- a/benchmark/Makefile +++ b/benchmark/Makefile @@ -35,6 +35,7 @@ goto :: slinpack.goto dlinpack.goto clinpack.goto zlinpack.goto \ ssyrk.goto dsyrk.goto csyrk.goto zsyrk.goto \ ssyr2k.goto dsyr2k.goto csyr2k.goto zsyr2k.goto \ sger.goto dger.goto \ + sdot.goto ddot.goto \ ssymv.goto dsymv.goto csymv.goto zsymv.goto \ chemv.goto zhemv.goto \ chemm.goto zhemm.goto \ @@ -54,6 +55,7 @@ acml :: slinpack.acml dlinpack.acml clinpack.acml zlinpack.acml \ ssyrk.acml dsyrk.acml csyrk.acml zsyrk.acml \ ssyr2k.acml dsyr2k.acml csyr2k.acml zsyr2k.acml \ sger.acml dger.acml \ + sdot.acml ddot.acml \ ssymv.acml dsymv.acml csymv.acml zsymv.acml \ chemv.acml zhemv.acml \ chemm.acml zhemm.acml \ @@ -73,6 +75,7 @@ atlas :: slinpack.atlas dlinpack.atlas clinpack.atlas zlinpack.atlas \ ssyrk.atlas dsyrk.atlas csyrk.atlas zsyrk.atlas \ ssyr2k.atlas dsyr2k.atlas csyr2k.atlas zsyr2k.atlas \ sger.atlas dger.atlas \ + sdot.atlas ddot.atlas \ ssymv.atlas dsymv.atlas csymv.atlas zsymv.atlas \ chemv.atlas zhemv.atlas \ chemm.acml zhemm.acml \ @@ -93,6 +96,7 @@ mkl :: slinpack.mkl dlinpack.mkl clinpack.mkl zlinpack.mkl \ ssyrk.mkl dsyrk.mkl csyrk.mkl zsyrk.mkl \ ssyr2k.mkl dsyr2k.mkl csyr2k.mkl zsyr2k.mkl \ sger.mkl dger.mkl \ + sdot.mkl ddot.mkl \ ssymv.mkl dsymv.mkl csymv.mkl zsymv.mkl \ chemv.mkl zhemv.mkl \ chemm.mkl zhemm.mkl \ @@ -954,6 +958,33 @@ zhemv.atlas : zhemv.$(SUFFIX) zhemv.mkl : zhemv.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +##################################### Sdot #################################################### +sdot.goto : sdot.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm + +sdot.acml : sdot.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +sdot.atlas : sdot.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +sdot.mkl : 
sdot.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Ddot #################################################### +ddot.goto : ddot.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm + +ddot.acml : ddot.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ddot.atlas : ddot.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ddot.mkl : ddot.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + + ################################################################################################### slinpack.$(SUFFIX) : linpack.c @@ -1142,6 +1173,12 @@ chemv.$(SUFFIX) : hemv.c zhemv.$(SUFFIX) : hemv.c $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ +sdot.$(SUFFIX) : dot.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +ddot.$(SUFFIX) : dot.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + diff --git a/benchmark/dot.c b/benchmark/dot.c new file mode 100644 index 000000000..6132ed324 --- /dev/null +++ b/benchmark/dot.c @@ -0,0 +1,195 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. 
Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include +#include +#ifdef __CYGWIN32__ +#include +#endif +#include "common.h" + + +#undef DOT + + +#ifdef DOUBLE +#define DOT BLASFUNC(ddot) +#else +#define DOT BLASFUNC(sdot) +#endif + + +#if defined(__WIN32__) || defined(__WIN64__) + +#ifndef DELTA_EPOCH_IN_MICROSECS +#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL +#endif + +int gettimeofday(struct timeval *tv, void *tz){ + + FILETIME ft; + unsigned __int64 tmpres = 0; + static int tzflag; + + if (NULL != tv) + { + GetSystemTimeAsFileTime(&ft); + + tmpres |= ft.dwHighDateTime; + tmpres <<= 32; + tmpres |= ft.dwLowDateTime; + + /*converting file time to unix epoch*/ + tmpres /= 10; /*convert into microseconds*/ + tmpres -= DELTA_EPOCH_IN_MICROSECS; + tv->tv_sec = (long)(tmpres / 1000000UL); + tv->tv_usec = (long)(tmpres % 1000000UL); + } + + return 0; +} + +#endif + +#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 + +static void *huge_malloc(BLASLONG size){ 
+ int shmid; + void *address; + +#ifndef SHM_HUGETLB +#define SHM_HUGETLB 04000 +#endif + + if ((shmid =shmget(IPC_PRIVATE, + (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), + SHM_HUGETLB | IPC_CREAT |0600)) < 0) { + printf( "Memory allocation failed(shmget).\n"); + exit(1); + } + + address = shmat(shmid, NULL, SHM_RND); + + if ((BLASLONG)address == -1){ + printf( "Memory allocation failed(shmat).\n"); + exit(1); + } + + shmctl(shmid, IPC_RMID, 0); + + return address; +} + +#define malloc huge_malloc + +#endif + +int MAIN__(int argc, char *argv[]){ + + FLOAT *x, *y; + FLOAT result; + blasint m, i; + blasint inc_x=1,inc_y=1; + int loops = 1; + int l; + char *p; + + int from = 1; + int to = 200; + int step = 1; + + struct timeval start, stop; + double time1,timeg; + + argc--;argv++; + + if (argc > 0) { from = atol(*argv); argc--; argv++;} + if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} + if (argc > 0) { step = atol(*argv); argc--; argv++;} + + if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); + if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); + if ((p = getenv("OPENBLAS_INCY"))) inc_y = atoi(p); + + fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Inc_y = %d Loops = %d\n", from, to, step,inc_x,inc_y,loops); + + if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + + if (( y = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + +#ifdef linux + srandom(getpid()); +#endif + + fprintf(stderr, " SIZE Flops\n"); + + for(m = from; m <= to; m += step) + { + + timeg=0; + + fprintf(stderr, " %6d : ", (int)m); + + + for (l=0; l Date: Fri, 22 Aug 2014 11:51:30 +0200 Subject: [PATCH 015/119] bugfix in Makefile --- benchmark/Makefile | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/benchmark/Makefile b/benchmark/Makefile index 1bc631aa4..679a98c32 100644 --- 
a/benchmark/Makefile +++ b/benchmark/Makefile @@ -960,29 +960,29 @@ zhemv.mkl : zhemv.$(SUFFIX) ##################################### Sdot #################################################### sdot.goto : sdot.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm sdot.acml : sdot.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) sdot.atlas : sdot.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) sdot.mkl : sdot.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) ##################################### Ddot #################################################### ddot.goto : ddot.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm ddot.acml : ddot.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) ddot.atlas : ddot.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) ddot.mkl : ddot.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) ################################################################################################### @@ -1174,10 +1174,10 @@ zhemv.$(SUFFIX) : hemv.c $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ sdot.$(SUFFIX) : dot.c - $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + 
$(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ ddot.$(SUFFIX) : dot.c - $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ From 8a9e868919c50c407057f1e79b5a4c94a24e54e8 Mon Sep 17 00:00:00 2001 From: wernsaar Date: Fri, 22 Aug 2014 14:29:17 +0200 Subject: [PATCH 016/119] added optimized sdot for bulldozer --- kernel/x86_64/sdot.c | 107 ++++++++++++++++++++++++ kernel/x86_64/sdot_microk_bulldozer-2.c | 85 +++++++++++++++++++ 2 files changed, 192 insertions(+) create mode 100644 kernel/x86_64/sdot.c create mode 100644 kernel/x86_64/sdot_microk_bulldozer-2.c diff --git a/kernel/x86_64/sdot.c b/kernel/x86_64/sdot.c new file mode 100644 index 000000000..8c60b954a --- /dev/null +++ b/kernel/x86_64/sdot.c @@ -0,0 +1,107 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + + +#include "common.h" + +#if defined(BULLDOZER) || defined(PILEDRIVER) +#include "sdot_microk_bulldozer-2.c" +#endif + + +#ifndef HAVE_KERNEL_16 + +void sdot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d) +{ + BLASLONG register i = 0; + FLOAT dot = 0.0; + + while(i < n) + { + dot += y[i] * x[i] + + y[i+1] * x[i+1] + + y[i+2] * x[i+2] + + y[i+3] * x[i+3] + + y[i+4] * x[i+4] + + y[i+5] * x[i+5] + + y[i+6] * x[i+6] + + y[i+7] * x[i+7] ; + + i+=8 ; + + } + *d += dot; + +} + +#endif + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) +{ + BLASLONG i=0; + BLASLONG ix=0,iy=0; + + FLOAT dot = 0.0 ; + + if ( n <= 0 ) return(dot); + + if ( (inc_x == 1) && (inc_y == 1) ) + { + + int n1 = n & -16; + + if ( n1 ) + sdot_kernel_16(n1, x, y , &dot ); + + + i = n1; + while(i < n) + { + + dot += y[i] * x[i] ; + i++ ; + + } + return(dot); + + + } + + while(i < n) + { + + dot += y[iy] * x[ix] ; + ix += inc_x ; + iy += inc_y ; + i++ ; + + } + return(dot); + +} + + diff --git a/kernel/x86_64/sdot_microk_bulldozer-2.c b/kernel/x86_64/sdot_microk_bulldozer-2.c new file mode 100644 index 000000000..024b2ce6d --- /dev/null +++ b/kernel/x86_64/sdot_microk_bulldozer-2.c @@ -0,0 +1,85 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. 
+Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#define HAVE_KERNEL_16 1 +static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y , FLOAT *dot) __attribute__ ((noinline)); + +static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) +{ + + + BLASLONG register i = 0; + + __asm__ __volatile__ + ( + "vxorps %%xmm4, %%xmm4, %%xmm4 \n\t" + "vxorps %%xmm5, %%xmm5, %%xmm5 \n\t" + "vxorps %%xmm6, %%xmm6, %%xmm6 \n\t" + "vxorps %%xmm7, %%xmm7, %%xmm7 \n\t" + + ".align 16 \n\t" + ".L01LOOP%=: \n\t" + "vmovups (%2,%0,4), %%xmm12 \n\t" // 4 * x + "vmovups 16(%2,%0,4), %%xmm13 \n\t" // 4 * x + "vmovups 32(%2,%0,4), %%xmm14 \n\t" // 4 * x + "vmovups 48(%2,%0,4), %%xmm15 \n\t" // 4 * x + + "vfmaddps %%xmm4, (%3,%0,4), %%xmm12, %%xmm4 \n\t" // 4 * y + "vfmaddps %%xmm5, 16(%3,%0,4), %%xmm13, %%xmm5 \n\t" // 4 * y + "vfmaddps %%xmm6, 32(%3,%0,4), %%xmm14, %%xmm6 \n\t" // 4 * y + "vfmaddps %%xmm7, 48(%3,%0,4), %%xmm15, %%xmm7 \n\t" // 4 * y + + "addq $16, %0 \n\t" + "subq $16, %1 \n\t" + "jnz .L01LOOP%= \n\t" + + "vaddps %%xmm4, %%xmm5, %%xmm4 \n\t" + "vaddps %%xmm6, %%xmm7, %%xmm6 \n\t" + "vaddps %%xmm4, %%xmm6, %%xmm4 \n\t" + + "vhaddps %%xmm4, %%xmm4, %%xmm4 \n\t" + "vhaddps %%xmm4, %%xmm4, %%xmm4 \n\t" + + "vmovss %%xmm4, (%4) \n\t" + + : + : + "r" (i), // 0 + "r" (n), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (dot) // 4 + : "cc", + "%xmm4", "%xmm5", + "%xmm6", "%xmm7", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + +} + + From 5d97b0754c3c1ef975365410b37d9cda360b73eb Mon Sep 17 00:00:00 2001 From: wernsaar Date: Fri, 22 Aug 2014 17:00:26 +0200 Subject: [PATCH 017/119] added optimized sdot kernel for nehalem --- kernel/x86_64/KERNEL.NEHALEM | 2 + kernel/x86_64/sdot.c | 2 + kernel/x86_64/sdot_microk_nehalem-2.c | 94 +++++++++++++++++++++++++++ 3 files changed, 98 insertions(+) create mode 100644 kernel/x86_64/sdot_microk_nehalem-2.c diff --git a/kernel/x86_64/KERNEL.NEHALEM b/kernel/x86_64/KERNEL.NEHALEM index 
b16fd9c49..aee622d0d 100644 --- a/kernel/x86_64/KERNEL.NEHALEM +++ b/kernel/x86_64/KERNEL.NEHALEM @@ -1,3 +1,5 @@ +SDOTKERNEL = sdot.c + DSYMV_U_KERNEL = dsymv_U.c DSYMV_L_KERNEL = dsymv_L.c SSYMV_U_KERNEL = ssymv_U.c diff --git a/kernel/x86_64/sdot.c b/kernel/x86_64/sdot.c index 8c60b954a..a13d65d25 100644 --- a/kernel/x86_64/sdot.c +++ b/kernel/x86_64/sdot.c @@ -30,6 +30,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if defined(BULLDOZER) || defined(PILEDRIVER) #include "sdot_microk_bulldozer-2.c" +#elif defined(NEHALEM) +#include "sdot_microk_nehalem-2.c" #endif diff --git a/kernel/x86_64/sdot_microk_nehalem-2.c b/kernel/x86_64/sdot_microk_nehalem-2.c new file mode 100644 index 000000000..3548ace88 --- /dev/null +++ b/kernel/x86_64/sdot_microk_nehalem-2.c @@ -0,0 +1,94 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define HAVE_KERNEL_16 1 +static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y , FLOAT *dot) __attribute__ ((noinline)); + +static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) +{ + + + BLASLONG register i = 0; + + __asm__ __volatile__ + ( + "xorps %%xmm4, %%xmm4 \n\t" + "xorps %%xmm5, %%xmm5 \n\t" + "xorps %%xmm6, %%xmm6 \n\t" + "xorps %%xmm7, %%xmm7 \n\t" + + ".align 16 \n\t" + ".L01LOOP%=: \n\t" + "movups (%2,%0,4), %%xmm12 \n\t" // 4 * x + "movups (%3,%0,4), %%xmm8 \n\t" // 4 * x + "movups 16(%2,%0,4), %%xmm13 \n\t" // 4 * x + "movups 16(%3,%0,4), %%xmm9 \n\t" // 4 * x + "movups 32(%2,%0,4), %%xmm14 \n\t" // 4 * x + "movups 32(%3,%0,4), %%xmm10 \n\t" // 4 * x + "movups 48(%2,%0,4), %%xmm15 \n\t" // 4 * x + "movups 48(%3,%0,4), %%xmm11 \n\t" // 4 * x + + "mulps %%xmm8 , %%xmm12 \n\t" + "mulps %%xmm9 , %%xmm13 \n\t" + "mulps %%xmm10, %%xmm14 \n\t" + "mulps %%xmm11, %%xmm15 \n\t" + + "addps %%xmm12, %%xmm4 \n\t" + "addps %%xmm13, %%xmm5 \n\t" + "addps %%xmm14, %%xmm6 \n\t" + "addps %%xmm15, %%xmm7 \n\t" + + "addq $16, %0 \n\t" + "subq $16, %1 \n\t" + "jnz .L01LOOP%= \n\t" + + "addps %%xmm5, %%xmm4 \n\t" + "addps %%xmm7, %%xmm6 \n\t" + "addps %%xmm6, %%xmm4 \n\t" + + "haddps %%xmm4, %%xmm4 \n\t" + "haddps %%xmm4, %%xmm4 \n\t" + + "movss %%xmm4, (%4) \n\t" + + : + : + "r" (i), // 0 + "r" (n), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (dot) // 4 + : 
"cc", + "%xmm4", "%xmm5", + "%xmm6", "%xmm7", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + +} + + From 95a707ced3a332221d7539e51e0051a9d07edd91 Mon Sep 17 00:00:00 2001 From: wernsaar Date: Fri, 22 Aug 2014 17:01:27 +0200 Subject: [PATCH 018/119] update of KERNEL.BULLDOZER --- kernel/x86_64/KERNEL.BULLDOZER | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kernel/x86_64/KERNEL.BULLDOZER b/kernel/x86_64/KERNEL.BULLDOZER index 3ee1978b8..7aa597ea0 100644 --- a/kernel/x86_64/KERNEL.BULLDOZER +++ b/kernel/x86_64/KERNEL.BULLDOZER @@ -1,3 +1,5 @@ +SDOTKERNEL = sdot.c + DSYMV_U_KERNEL = dsymv_U.c DSYMV_L_KERNEL = dsymv_L.c SSYMV_U_KERNEL = ssymv_U.c From 53ec5789e2aeeb4bc5eee2de65047ec3a8169c8a Mon Sep 17 00:00:00 2001 From: wernsaar Date: Fri, 22 Aug 2014 17:02:55 +0200 Subject: [PATCH 019/119] bugfix for Makefile --- benchmark/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmark/Makefile b/benchmark/Makefile index 679a98c32..933b4bd44 100644 --- a/benchmark/Makefile +++ b/benchmark/Makefile @@ -108,7 +108,7 @@ mkl :: slinpack.mkl dlinpack.mkl clinpack.mkl zlinpack.mkl \ spotrf.mkl dpotrf.mkl cpotrf.mkl zpotrf.mkl \ ssymm.mkl dsymm.mkl csymm.mkl zsymm.mkl -all :: goto atlas acml mkl +all :: goto mkl atlas acml ##################################### Slinpack #################################################### slinpack.goto : slinpack.$(SUFFIX) ../$(LIBNAME) From 16d6be852dbcb09365fd61e1e11b3cac66901b76 Mon Sep 17 00:00:00 2001 From: wernsaar Date: Fri, 22 Aug 2014 20:34:41 +0200 Subject: [PATCH 020/119] added optimized ddot kernel for nehalem --- kernel/x86_64/KERNEL.NEHALEM | 1 + kernel/x86_64/ddot.c | 110 ++++++++++++++++++++++++++ kernel/x86_64/ddot_microk_nehalem-2.c | 94 ++++++++++++++++++++++ kernel/x86_64/sdot_microk_nehalem-2.c | 4 +- 4 files changed, 207 insertions(+), 2 deletions(-) create mode 100644 kernel/x86_64/ddot.c create mode 100644 kernel/x86_64/ddot_microk_nehalem-2.c diff --git 
a/kernel/x86_64/KERNEL.NEHALEM b/kernel/x86_64/KERNEL.NEHALEM index aee622d0d..e0f3c3336 100644 --- a/kernel/x86_64/KERNEL.NEHALEM +++ b/kernel/x86_64/KERNEL.NEHALEM @@ -1,4 +1,5 @@ SDOTKERNEL = sdot.c +DDOTKERNEL = ddot.c DSYMV_U_KERNEL = dsymv_U.c DSYMV_L_KERNEL = dsymv_L.c diff --git a/kernel/x86_64/ddot.c b/kernel/x86_64/ddot.c new file mode 100644 index 000000000..ee6785f6a --- /dev/null +++ b/kernel/x86_64/ddot.c @@ -0,0 +1,110 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + + +#include "common.h" + + +#if defined(BULLDOZER) || defined(PILEDRIVER) +#include "ddot_microk_bulldozer-2.c" +#elif defined(NEHALEM) +#include "ddot_microk_nehalem-2.c" +#endif + + +#ifndef HAVE_KERNEL_8 + +void ddot_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d) +{ + BLASLONG register i = 0; + FLOAT dot = 0.0; + + while(i < n) + { + dot += y[i] * x[i] + + y[i+1] * x[i+1] + + y[i+2] * x[i+2] + + y[i+3] * x[i+3] + + y[i+4] * x[i+4] + + y[i+5] * x[i+5] + + y[i+6] * x[i+6] + + y[i+7] * x[i+7] ; + + i+=8 ; + + } + *d += dot; + +} + +#endif + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) +{ + BLASLONG i=0; + BLASLONG ix=0,iy=0; + + FLOAT dot = 0.0 ; + + if ( n <= 0 ) return(dot); + + if ( (inc_x == 1) && (inc_y == 1) ) + { + + int n1 = n & -8; + + if ( n1 ) + ddot_kernel_8(n1, x, y , &dot ); + + + i = n1; + while(i < n) + { + + dot += y[i] * x[i] ; + i++ ; + + } + return(dot); + + + } + + while(i < n) + { + + dot += y[iy] * x[ix] ; + ix += inc_x ; + iy += inc_y ; + i++ ; + + } + return(dot); + +} + + diff --git a/kernel/x86_64/ddot_microk_nehalem-2.c b/kernel/x86_64/ddot_microk_nehalem-2.c new file mode 100644 index 000000000..dd05053f7 --- /dev/null +++ b/kernel/x86_64/ddot_microk_nehalem-2.c @@ -0,0 +1,94 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights 
reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#define HAVE_KERNEL_8 1 +static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y , FLOAT *dot) __attribute__ ((noinline)); + +static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) +{ + + + BLASLONG register i = 0; + + __asm__ __volatile__ + ( + "xorpd %%xmm4, %%xmm4 \n\t" + "xorpd %%xmm5, %%xmm5 \n\t" + "xorpd %%xmm6, %%xmm6 \n\t" + "xorpd %%xmm7, %%xmm7 \n\t" + + ".align 16 \n\t" + ".L01LOOP%=: \n\t" + + "movups (%2,%0,8), %%xmm12 \n\t" // 2 * x + "movups (%3,%0,8), %%xmm8 \n\t" // 2 * y + "movups 16(%2,%0,8), %%xmm13 \n\t" // 2 * x + "movups 16(%3,%0,8), %%xmm9 \n\t" // 2 * y + "movups 32(%2,%0,8), %%xmm14 \n\t" // 2 * x + "movups 32(%3,%0,8), %%xmm10 \n\t" // 2 * y + "movups 48(%2,%0,8), %%xmm15 \n\t" // 2 * x + "movups 48(%3,%0,8), %%xmm11 \n\t" // 2 * y + + "mulpd %%xmm8 , %%xmm12 \n\t" + "mulpd %%xmm9 , %%xmm13 \n\t" + "mulpd %%xmm10, %%xmm14 \n\t" + "mulpd %%xmm11, %%xmm15 \n\t" + + "addpd %%xmm12, %%xmm4 \n\t" + "addpd %%xmm13, %%xmm5 \n\t" + "addpd %%xmm14, %%xmm6 \n\t" + "addpd %%xmm15, %%xmm7 \n\t" + + "addq $8 , %0 \n\t" + "subq $8 , %1 \n\t" + "jnz .L01LOOP%= \n\t" + + "addpd %%xmm5, %%xmm4 \n\t" + "addpd %%xmm7, %%xmm6 \n\t" + "addpd %%xmm6, %%xmm4 \n\t" + + "haddpd %%xmm4, %%xmm4 \n\t" + + "movsd %%xmm4, (%4) \n\t" + + : + : + "r" (i), // 0 + "r" (n), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (dot) // 4 + : "cc", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + +} + + diff --git a/kernel/x86_64/sdot_microk_nehalem-2.c b/kernel/x86_64/sdot_microk_nehalem-2.c index 3548ace88..2a918b5ea 100644 --- a/kernel/x86_64/sdot_microk_nehalem-2.c +++ b/kernel/x86_64/sdot_microk_nehalem-2.c @@ -83,8 +83,8 @@ static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) "r" (y), // 3 "r" (dot) // 4 : "cc", - "%xmm4", "%xmm5", - "%xmm6", "%xmm7", + "%xmm4", "%xmm5", 
"%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" ); From f1b96c4846f9fd110b65335600445fae50fb1d26 Mon Sep 17 00:00:00 2001 From: wernsaar Date: Fri, 22 Aug 2014 21:19:29 +0200 Subject: [PATCH 021/119] added optimized ddot kernel for bulldozer --- kernel/x86_64/KERNEL.BULLDOZER | 1 + kernel/x86_64/ddot_microk_bulldozer-2.c | 84 +++++++++++++++++++++++++ 2 files changed, 85 insertions(+) create mode 100644 kernel/x86_64/ddot_microk_bulldozer-2.c diff --git a/kernel/x86_64/KERNEL.BULLDOZER b/kernel/x86_64/KERNEL.BULLDOZER index 7aa597ea0..9e6fc2e28 100644 --- a/kernel/x86_64/KERNEL.BULLDOZER +++ b/kernel/x86_64/KERNEL.BULLDOZER @@ -1,4 +1,5 @@ SDOTKERNEL = sdot.c +DDOTKERNEL = ddot.c DSYMV_U_KERNEL = dsymv_U.c DSYMV_L_KERNEL = dsymv_L.c diff --git a/kernel/x86_64/ddot_microk_bulldozer-2.c b/kernel/x86_64/ddot_microk_bulldozer-2.c new file mode 100644 index 000000000..0c77b6349 --- /dev/null +++ b/kernel/x86_64/ddot_microk_bulldozer-2.c @@ -0,0 +1,84 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define HAVE_KERNEL_8 1 +static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y , FLOAT *dot) __attribute__ ((noinline)); + +static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) +{ + + + BLASLONG register i = 0; + + __asm__ __volatile__ + ( + "vxorpd %%xmm4, %%xmm4, %%xmm4 \n\t" + "vxorpd %%xmm5, %%xmm5, %%xmm5 \n\t" + "vxorpd %%xmm6, %%xmm6, %%xmm6 \n\t" + "vxorpd %%xmm7, %%xmm7, %%xmm7 \n\t" + + ".align 16 \n\t" + ".L01LOOP%=: \n\t" + "vmovups (%2,%0,8), %%xmm12 \n\t" // 2 * x + "vmovups 16(%2,%0,8), %%xmm13 \n\t" // 2 * x + "vmovups 32(%2,%0,8), %%xmm14 \n\t" // 2 * x + "vmovups 48(%2,%0,8), %%xmm15 \n\t" // 2 * x + + "vfmaddpd %%xmm4, (%3,%0,8), %%xmm12, %%xmm4 \n\t" // 2 * y + "vfmaddpd %%xmm5, 16(%3,%0,8), %%xmm13, %%xmm5 \n\t" // 2 * y + "vfmaddpd %%xmm6, 32(%3,%0,8), %%xmm14, %%xmm6 \n\t" // 2 * y + "vfmaddpd %%xmm7, 48(%3,%0,8), %%xmm15, %%xmm7 \n\t" // 2 * y + + "addq $8 , %0 \n\t" + "subq $8 , %1 \n\t" + "jnz .L01LOOP%= \n\t" + + "vaddpd %%xmm4, %%xmm5, %%xmm4 \n\t" + "vaddpd %%xmm6, %%xmm7, %%xmm6 \n\t" + "vaddpd %%xmm4, %%xmm6, %%xmm4 \n\t" + + "vhaddpd %%xmm4, %%xmm4, %%xmm4 \n\t" + + "vmovsd 
%%xmm4, (%4) \n\t" + + : + : + "r" (i), // 0 + "r" (n), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (dot) // 4 + : "cc", + "%xmm4", "%xmm5", + "%xmm6", "%xmm7", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + +} + + From ac76b6267f1d7de2cbbc0b7fb22c825281c36b44 Mon Sep 17 00:00:00 2001 From: wernsaar Date: Sat, 23 Aug 2014 10:40:57 +0200 Subject: [PATCH 022/119] added optimized dgemv_n kernel for nehalem --- kernel/x86_64/KERNEL.NEHALEM | 1 + kernel/x86_64/dgemv_n.c | 2 + kernel/x86_64/dgemv_n_microk_nehalem-2.c | 137 +++++++++++++++++++++++ 3 files changed, 140 insertions(+) create mode 100644 kernel/x86_64/dgemv_n_microk_nehalem-2.c diff --git a/kernel/x86_64/KERNEL.NEHALEM b/kernel/x86_64/KERNEL.NEHALEM index e0f3c3336..5523b4571 100644 --- a/kernel/x86_64/KERNEL.NEHALEM +++ b/kernel/x86_64/KERNEL.NEHALEM @@ -8,6 +8,7 @@ SSYMV_L_KERNEL = ssymv_L.c SGEMVNKERNEL = sgemv_n.c SGEMVTKERNEL = sgemv_t.c +DGEMVNKERNEL = dgemv_n.c SGEMMKERNEL = gemm_kernel_4x8_nehalem.S SGEMMINCOPY = gemm_ncopy_4.S diff --git a/kernel/x86_64/dgemv_n.c b/kernel/x86_64/dgemv_n.c index 5d826dc63..cecb8d3fc 100644 --- a/kernel/x86_64/dgemv_n.c +++ b/kernel/x86_64/dgemv_n.c @@ -31,6 +31,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if defined(HASWELL) #include "dgemv_n_microk_haswell-2.c" +#elif defined(NEHALEM) +#include "dgemv_n_microk_nehalem-2.c" #endif diff --git a/kernel/x86_64/dgemv_n_microk_nehalem-2.c b/kernel/x86_64/dgemv_n_microk_nehalem-2.c new file mode 100644 index 000000000..84b82f805 --- /dev/null +++ b/kernel/x86_64/dgemv_n_microk_nehalem-2.c @@ -0,0 +1,137 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. 
Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#define HAVE_KERNEL_16x4 1 +static void dgemv_kernel_16x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) __attribute__ ((noinline)); + +static void dgemv_kernel_16x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) +{ + + BLASLONG register i = 0; + + __asm__ __volatile__ + ( + "movddup (%2), %%xmm12 \n\t" // x0 + "movddup 8(%2), %%xmm13 \n\t" // x1 + "movddup 16(%2), %%xmm14 \n\t" // x2 + "movddup 24(%2), %%xmm15 \n\t" // x3 + + ".align 16 \n\t" + ".L01LOOP%=: \n\t" + "prefetcht0 192(%3,%0,8) \n\t" + "movups (%3,%0,8), %%xmm4 \n\t" // 2 * y + "movups 16(%3,%0,8), %%xmm5 \n\t" // 2 * y + "movups 32(%3,%0,8), %%xmm6 \n\t" // 2 * y + "movups 48(%3,%0,8), %%xmm7 \n\t" // 2 * y + "movups (%4,%0,8), %%xmm8 \n\t" // 2 * a + "movups 16(%4,%0,8), %%xmm9 \n\t" // 2 * a + "movups 32(%4,%0,8), %%xmm10 \n\t" // 2 * a + "movups 48(%4,%0,8), %%xmm11 \n\t" // 2 * a + + "prefetcht0 192(%4,%0,8) \n\t" + "mulpd %%xmm12 , %%xmm8 \n\t" // a * x + "mulpd %%xmm12 , %%xmm9 \n\t" // a * x + "mulpd %%xmm12 , %%xmm10 \n\t" // a * x + "mulpd %%xmm12 , %%xmm11 \n\t" // a * x + "addpd %%xmm8 , %%xmm4 \n\t" // y += a * x + "addpd %%xmm9 , %%xmm5 \n\t" // y += a * x + "addpd %%xmm10 , %%xmm6 \n\t" // y += a * x + "addpd %%xmm11 , %%xmm7 \n\t" // y += a * x + + "prefetcht0 192(%5,%0,8) \n\t" + "movups (%5,%0,8), %%xmm8 \n\t" // 2 * a + "movups 16(%5,%0,8), %%xmm9 \n\t" // 2 * a + "movups 32(%5,%0,8), %%xmm10 \n\t" // 2 * a + "movups 48(%5,%0,8), %%xmm11 \n\t" // 2 * a + "mulpd %%xmm13 , %%xmm8 \n\t" // a * x + "mulpd %%xmm13 , %%xmm9 \n\t" // a * x + "mulpd %%xmm13 , %%xmm10 \n\t" // a * x + "mulpd %%xmm13 , %%xmm11 \n\t" // a * x + "addpd %%xmm8 , %%xmm4 \n\t" // y += a * x + "addpd %%xmm9 , %%xmm5 \n\t" // y += a * x + "addpd %%xmm10 , %%xmm6 \n\t" // y += a * x + "addpd %%xmm11 , %%xmm7 \n\t" // y += a * x + + "prefetcht0 192(%6,%0,8) \n\t" + "movups (%6,%0,8), %%xmm8 \n\t" // 2 * a + "movups 16(%6,%0,8), %%xmm9 \n\t" 
// 2 * a + "movups 32(%6,%0,8), %%xmm10 \n\t" // 2 * a + "movups 48(%6,%0,8), %%xmm11 \n\t" // 2 * a + "mulpd %%xmm14 , %%xmm8 \n\t" // a * x + "mulpd %%xmm14 , %%xmm9 \n\t" // a * x + "mulpd %%xmm14 , %%xmm10 \n\t" // a * x + "mulpd %%xmm14 , %%xmm11 \n\t" // a * x + "addpd %%xmm8 , %%xmm4 \n\t" // y += a * x + "addpd %%xmm9 , %%xmm5 \n\t" // y += a * x + "addpd %%xmm10 , %%xmm6 \n\t" // y += a * x + "addpd %%xmm11 , %%xmm7 \n\t" // y += a * x + + "prefetcht0 192(%7,%0,8) \n\t" + "movups (%7,%0,8), %%xmm8 \n\t" // 2 * a + "movups 16(%7,%0,8), %%xmm9 \n\t" // 2 * a + "movups 32(%7,%0,8), %%xmm10 \n\t" // 2 * a + "movups 48(%7,%0,8), %%xmm11 \n\t" // 2 * a + "mulpd %%xmm15 , %%xmm8 \n\t" // a * x + "mulpd %%xmm15 , %%xmm9 \n\t" // a * x + "mulpd %%xmm15 , %%xmm10 \n\t" // a * x + "mulpd %%xmm15 , %%xmm11 \n\t" // a * x + "addpd %%xmm8 , %%xmm4 \n\t" // y += a * x + "addpd %%xmm9 , %%xmm5 \n\t" // y += a * x + "addpd %%xmm10 , %%xmm6 \n\t" // y += a * x + "addpd %%xmm11 , %%xmm7 \n\t" // y += a * x + + "movups %%xmm4, (%3,%0,8) \n\t" // 4 * y + "movups %%xmm5, 16(%3,%0,8) \n\t" // 4 * y + "movups %%xmm6, 32(%3,%0,8) \n\t" // 4 * y + "movups %%xmm7, 48(%3,%0,8) \n\t" // 4 * y + + "addq $8 , %0 \n\t" + "subq $8 , %1 \n\t" + "jnz .L01LOOP%= \n\t" + + : + : + "r" (i), // 0 + "r" (n), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (ap[0]), // 4 + "r" (ap[1]), // 5 + "r" (ap[2]), // 6 + "r" (ap[3]) // 7 + : "cc", + "%xmm4", "%xmm5", + "%xmm6", "%xmm7", + "%xmm8", "%xmm9", + "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + +} + + From 55e81da379ebf839b28a100f194e03589959ab91 Mon Sep 17 00:00:00 2001 From: wernsaar Date: Sat, 23 Aug 2014 13:12:44 +0200 Subject: [PATCH 023/119] added axpy benchmark-test --- benchmark/Makefile | 68 +++++++++++++++ benchmark/axpy.c | 201 +++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 269 insertions(+) create mode 100644 benchmark/axpy.c diff --git a/benchmark/Makefile b/benchmark/Makefile index 
933b4bd44..07bf5a792 100644 --- a/benchmark/Makefile +++ b/benchmark/Makefile @@ -36,6 +36,7 @@ goto :: slinpack.goto dlinpack.goto clinpack.goto zlinpack.goto \ ssyr2k.goto dsyr2k.goto csyr2k.goto zsyr2k.goto \ sger.goto dger.goto \ sdot.goto ddot.goto \ + saxpy.goto daxpy.goto caxpy.goto zaxpy.goto \ ssymv.goto dsymv.goto csymv.goto zsymv.goto \ chemv.goto zhemv.goto \ chemm.goto zhemm.goto \ @@ -56,6 +57,7 @@ acml :: slinpack.acml dlinpack.acml clinpack.acml zlinpack.acml \ ssyr2k.acml dsyr2k.acml csyr2k.acml zsyr2k.acml \ sger.acml dger.acml \ sdot.acml ddot.acml \ + saxpy.acml daxpy.acml caxpy.acml zaxpy.acml \ ssymv.acml dsymv.acml csymv.acml zsymv.acml \ chemv.acml zhemv.acml \ chemm.acml zhemm.acml \ @@ -76,6 +78,7 @@ atlas :: slinpack.atlas dlinpack.atlas clinpack.atlas zlinpack.atlas \ ssyr2k.atlas dsyr2k.atlas csyr2k.atlas zsyr2k.atlas \ sger.atlas dger.atlas \ sdot.atlas ddot.atlas \ + saxpy.atlas daxpy.atlas caxpy.atlas zaxpy.atlas \ ssymv.atlas dsymv.atlas csymv.atlas zsymv.atlas \ chemv.atlas zhemv.atlas \ chemm.acml zhemm.acml \ @@ -97,6 +100,7 @@ mkl :: slinpack.mkl dlinpack.mkl clinpack.mkl zlinpack.mkl \ ssyr2k.mkl dsyr2k.mkl csyr2k.mkl zsyr2k.mkl \ sger.mkl dger.mkl \ sdot.mkl ddot.mkl \ + saxpy.mkl daxpy.mkl caxpy.mkl zaxpy.mkl \ ssymv.mkl dsymv.mkl csymv.mkl zsymv.mkl \ chemv.mkl zhemv.mkl \ chemm.mkl zhemm.mkl \ @@ -984,6 +988,61 @@ ddot.atlas : ddot.$(SUFFIX) ddot.mkl : ddot.$(SUFFIX) $(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +##################################### Saxpy #################################################### +saxpy.goto : saxpy.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm + +saxpy.acml : saxpy.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +saxpy.atlas : saxpy.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +saxpy.mkl : saxpy.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ 
$(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Daxpy #################################################### +daxpy.goto : daxpy.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm + +daxpy.acml : daxpy.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +daxpy.atlas : daxpy.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +daxpy.mkl : daxpy.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Caxpy #################################################### + +caxpy.goto : caxpy.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm + +caxpy.acml : caxpy.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +caxpy.atlas : caxpy.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +caxpy.mkl : caxpy.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Zaxpy #################################################### + +zaxpy.goto : zaxpy.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm + +zaxpy.acml : zaxpy.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zaxpy.atlas : zaxpy.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zaxpy.mkl : zaxpy.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + + ################################################################################################### @@ -1179,8 +1238,17 @@ sdot.$(SUFFIX) : dot.c ddot.$(SUFFIX) : dot.c $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ +saxpy.$(SUFFIX) : axpy.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ +daxpy.$(SUFFIX) : axpy.c + 
$(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ +caxpy.$(SUFFIX) : axpy.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +zaxpy.$(SUFFIX) : axpy.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ diff --git a/benchmark/axpy.c b/benchmark/axpy.c new file mode 100644 index 000000000..ef3b5ae4f --- /dev/null +++ b/benchmark/axpy.c @@ -0,0 +1,201 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include +#include +#ifdef __CYGWIN32__ +#include +#endif +#include "common.h" + + +#undef AXPY + +#ifdef COMPLEX +#ifdef DOUBLE +#define AXPY BLASFUNC(zaxpy) +#else +#define AXPY BLASFUNC(caxpy) +#endif +#else +#ifdef DOUBLE +#define AXPY BLASFUNC(daxpy) +#else +#define AXPY BLASFUNC(saxpy) +#endif +#endif + +#if defined(__WIN32__) || defined(__WIN64__) + +#ifndef DELTA_EPOCH_IN_MICROSECS +#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL +#endif + +int gettimeofday(struct timeval *tv, void *tz){ + + FILETIME ft; + unsigned __int64 tmpres = 0; + static int tzflag; + + if (NULL != tv) + { + GetSystemTimeAsFileTime(&ft); + + tmpres |= ft.dwHighDateTime; + tmpres <<= 32; + tmpres |= ft.dwLowDateTime; + + /*converting file time to unix epoch*/ + tmpres /= 10; /*convert into microseconds*/ + tmpres -= DELTA_EPOCH_IN_MICROSECS; + tv->tv_sec = (long)(tmpres / 1000000UL); + tv->tv_usec = (long)(tmpres % 1000000UL); + } + + return 0; +} + +#endif + +#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 + +static void *huge_malloc(BLASLONG size){ + int shmid; + void *address; + +#ifndef SHM_HUGETLB +#define SHM_HUGETLB 04000 +#endif + + if ((shmid =shmget(IPC_PRIVATE, + (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), + SHM_HUGETLB | IPC_CREAT |0600)) < 0) { + printf( "Memory allocation failed(shmget).\n"); + exit(1); + } + + address = shmat(shmid, NULL, SHM_RND); + + if ((BLASLONG)address == -1){ + printf( "Memory allocation failed(shmat).\n"); + exit(1); + } + + shmctl(shmid, IPC_RMID, 0); + + return address; +} + +#define malloc huge_malloc + +#endif + +int MAIN__(int argc, char *argv[]){ + + FLOAT *x, *y; + FLOAT alpha[2] = { 2.0, 2.0 }; + blasint m, i; + blasint inc_x=1,inc_y=1; + int loops = 1; + int l; + char *p; + + int from = 1; + int to = 200; + int step = 1; + + struct timeval start, stop; + double time1,timeg; + + argc--;argv++; + + if (argc > 0) { from = 
atol(*argv); argc--; argv++;} + if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} + if (argc > 0) { step = atol(*argv); argc--; argv++;} + + if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); + if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); + if ((p = getenv("OPENBLAS_INCY"))) inc_y = atoi(p); + + fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Inc_y = %d Loops = %d\n", from, to, step,inc_x,inc_y,loops); + + if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + + if (( y = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + +#ifdef linux + srandom(getpid()); +#endif + + fprintf(stderr, " SIZE Flops\n"); + + for(m = from; m <= to; m += step) + { + + timeg=0; + + fprintf(stderr, " %6d : ", (int)m); + + + for (l=0; l Date: Sat, 23 Aug 2014 17:15:21 +0200 Subject: [PATCH 024/119] added optimized saxpy kernel for nehalem --- kernel/x86_64/KERNEL.NEHALEM | 2 + kernel/x86_64/saxpy.c | 103 +++++++++++++++++++++++++ kernel/x86_64/saxpy_microk_nehalem-2.c | 91 ++++++++++++++++++++++ 3 files changed, 196 insertions(+) create mode 100644 kernel/x86_64/saxpy.c create mode 100644 kernel/x86_64/saxpy_microk_nehalem-2.c diff --git a/kernel/x86_64/KERNEL.NEHALEM b/kernel/x86_64/KERNEL.NEHALEM index 5523b4571..3cbaa471f 100644 --- a/kernel/x86_64/KERNEL.NEHALEM +++ b/kernel/x86_64/KERNEL.NEHALEM @@ -1,3 +1,5 @@ +SAXPYKERNEL = saxpy.c + SDOTKERNEL = sdot.c DDOTKERNEL = ddot.c diff --git a/kernel/x86_64/saxpy.c b/kernel/x86_64/saxpy.c new file mode 100644 index 000000000..e6c016ee3 --- /dev/null +++ b/kernel/x86_64/saxpy.c @@ -0,0 +1,103 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. 
+Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + + +#include "common.h" + + +#if defined(NEHALEM) +#include "saxpy_microk_nehalem-2.c" +#endif + + +#ifndef HAVE_KERNEL_16 + +void saxpy_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) +{ + BLASLONG register i = 0; + FLOAT a = *alpha; + + while(i < n) + { + y[i] += a * x[i]; + y[i+1] += a * x[i+1]; + y[i+2] += a * x[i+2]; + y[i+3] += a * x[i+3]; + y[i+4] += a * x[i+4]; + y[i+5] += a * x[i+5]; + y[i+6] += a * x[i+6]; + y[i+7] += a * x[i+7]; + i+=8 ; + + } + +} + +#endif + +int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) +{ + BLASLONG i=0; + BLASLONG ix=0,iy=0; + + if ( n <= 0 ) return(0); + + if ( (inc_x == 1) && (inc_y == 1) ) + { + + int n1 = n & -16; + + if ( n1 ) + saxpy_kernel_16(n1, x, y , &da ); + + i = n1; + while(i < n) + { + + y[i] += da * x[i] ; + i++ ; + + } + return(0); + + + } + + while(i < n) + { + + y[iy] += da * x[ix] ; + ix += inc_x ; + iy += inc_y ; + i++ ; + + } + return(0); + +} + + diff --git a/kernel/x86_64/saxpy_microk_nehalem-2.c b/kernel/x86_64/saxpy_microk_nehalem-2.c new file mode 100644 index 000000000..14ff51a0d --- /dev/null +++ b/kernel/x86_64/saxpy_microk_nehalem-2.c @@ -0,0 +1,91 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. 
Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define HAVE_KERNEL_16 1 +static void saxpy_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y , FLOAT *alpha) __attribute__ ((noinline)); + +static void saxpy_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) +{ + + + BLASLONG register i = 0; + + __asm__ __volatile__ + ( + "movss (%4), %%xmm0 \n\t" // alpha + "shufps $0, %%xmm0, %%xmm0 \n\t" + + ".align 16 \n\t" + ".L01LOOP%=: \n\t" + // "prefetcht0 192(%2,%0,4) \n\t" + // "prefetcht0 192(%3,%0,4) \n\t" + + "movups (%2,%0,4), %%xmm12 \n\t" // 4 * x + "movups 16(%2,%0,4), %%xmm13 \n\t" // 4 * x + "movups 32(%2,%0,4), %%xmm14 \n\t" // 4 * x + "movups 48(%2,%0,4), %%xmm15 \n\t" // 4 * x + "movups (%3,%0,4), %%xmm8 \n\t" // 4 * y + "movups 16(%3,%0,4), %%xmm9 \n\t" // 4 * y + "movups 32(%3,%0,4), %%xmm10 \n\t" // 4 * y + "movups 48(%3,%0,4), %%xmm11 \n\t" // 4 * y + + "mulps %%xmm0 , %%xmm12 \n\t" // alpha * x + "mulps %%xmm0 , %%xmm13 \n\t" + "mulps %%xmm0 , %%xmm14 \n\t" + "mulps %%xmm0 , %%xmm15 
\n\t" + + "addps %%xmm12, %%xmm8 \n\t" // y += alpha *x + "addps %%xmm13, %%xmm9 \n\t" + "addps %%xmm14, %%xmm10 \n\t" + "addps %%xmm15, %%xmm11 \n\t" + + "movups %%xmm8 , (%3,%0,4) \n\t" + "movups %%xmm9 , 16(%3,%0,4) \n\t" + "movups %%xmm10, 32(%3,%0,4) \n\t" + "movups %%xmm11, 48(%3,%0,4) \n\t" + + "addq $16, %0 \n\t" + "subq $16, %1 \n\t" + "jnz .L01LOOP%= \n\t" + + : + : + "r" (i), // 0 + "r" (n), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (alpha) // 4 + : "cc", + "%xmm0", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + +} + + From 29125864b30e730f7c81abf98818fb3f5734a9c4 Mon Sep 17 00:00:00 2001 From: wernsaar Date: Sat, 23 Aug 2014 17:28:01 +0200 Subject: [PATCH 025/119] updated gemm.c --- benchmark/gemm.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/benchmark/gemm.c b/benchmark/gemm.c index fc482c075..4f9a58825 100644 --- a/benchmark/gemm.c +++ b/benchmark/gemm.c @@ -142,7 +142,9 @@ int MAIN__(int argc, char *argv[]){ if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} if (argc > 0) { step = atol(*argv); argc--; argv++;} - fprintf(stderr, "From : %3d To : %3d Step = %3d\n", from, to, step); + if ((p = getenv("OPENBLAS_TRANS"))) trans=*p; + + fprintf(stderr, "From : %3d To : %3d Step=%d : Trans=%c\n", from, to, step, trans); if (( a = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL){ fprintf(stderr,"Out of Memory!!\n");exit(1); From b55f99730231d7d33eb3bf0ab49a55c5f22c92d0 Mon Sep 17 00:00:00 2001 From: wernsaar Date: Sat, 23 Aug 2014 17:53:07 +0200 Subject: [PATCH 026/119] added optimized daxpy kernel for nehalem --- kernel/x86_64/KERNEL.NEHALEM | 1 + kernel/x86_64/daxpy.c | 103 +++++++++++++++++++++++++ kernel/x86_64/daxpy_microk_nehalem-2.c | 91 ++++++++++++++++++++++ 3 files changed, 195 insertions(+) create mode 100644 kernel/x86_64/daxpy.c create mode 100644 kernel/x86_64/daxpy_microk_nehalem-2.c diff --git a/kernel/x86_64/KERNEL.NEHALEM 
b/kernel/x86_64/KERNEL.NEHALEM index 3cbaa471f..8adb579cf 100644 --- a/kernel/x86_64/KERNEL.NEHALEM +++ b/kernel/x86_64/KERNEL.NEHALEM @@ -1,4 +1,5 @@ SAXPYKERNEL = saxpy.c +DAXPYKERNEL = daxpy.c SDOTKERNEL = sdot.c DDOTKERNEL = ddot.c diff --git a/kernel/x86_64/daxpy.c b/kernel/x86_64/daxpy.c new file mode 100644 index 000000000..feda045d0 --- /dev/null +++ b/kernel/x86_64/daxpy.c @@ -0,0 +1,103 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + + +#include "common.h" + + +#if defined(NEHALEM) +#include "daxpy_microk_nehalem-2.c" +#endif + + +#ifndef HAVE_KERNEL_8 + +void daxpy_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) +{ + BLASLONG register i = 0; + FLOAT a = *alpha; + + while(i < n) + { + y[i] += a * x[i]; + y[i+1] += a * x[i+1]; + y[i+2] += a * x[i+2]; + y[i+3] += a * x[i+3]; + y[i+4] += a * x[i+4]; + y[i+5] += a * x[i+5]; + y[i+6] += a * x[i+6]; + y[i+7] += a * x[i+7]; + i+=8 ; + + } + +} + +#endif + +int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) +{ + BLASLONG i=0; + BLASLONG ix=0,iy=0; + + if ( n <= 0 ) return(0); + + if ( (inc_x == 1) && (inc_y == 1) ) + { + + int n1 = n & -8; + + if ( n1 ) + daxpy_kernel_8(n1, x, y , &da ); + + i = n1; + while(i < n) + { + + y[i] += da * x[i] ; + i++ ; + + } + return(0); + + + } + + while(i < n) + { + + y[iy] += da * x[ix] ; + ix += inc_x ; + iy += inc_y ; + i++ ; + + } + return(0); + +} + + diff --git a/kernel/x86_64/daxpy_microk_nehalem-2.c b/kernel/x86_64/daxpy_microk_nehalem-2.c new file mode 100644 index 000000000..32ed1857c --- /dev/null +++ b/kernel/x86_64/daxpy_microk_nehalem-2.c @@ -0,0 +1,91 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. 
Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define HAVE_KERNEL_8 1 +static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y , FLOAT *alpha) __attribute__ ((noinline)); + +static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) +{ + + + BLASLONG register i = 0; + + __asm__ __volatile__ + ( + "movsd (%4), %%xmm0 \n\t" // alpha + "shufpd $0, %%xmm0, %%xmm0 \n\t" + + ".align 16 \n\t" + ".L01LOOP%=: \n\t" + // "prefetcht0 192(%2,%0,8) \n\t" + // "prefetcht0 192(%3,%0,8) \n\t" + + "movups (%2,%0,8), %%xmm12 \n\t" // 2 * x + "movups 16(%2,%0,8), %%xmm13 \n\t" // 2 * x + "movups 32(%2,%0,8), %%xmm14 \n\t" // 2 * x + "movups 48(%2,%0,8), %%xmm15 \n\t" // 2 * x + "movups (%3,%0,8), %%xmm8 \n\t" // 2 * y + "movups 16(%3,%0,8), %%xmm9 \n\t" // 2 * y + "movups 32(%3,%0,8), %%xmm10 \n\t" // 2 * y + "movups 48(%3,%0,8), %%xmm11 \n\t" // 2 * y + + "mulpd %%xmm0 , %%xmm12 \n\t" // alpha * x + "mulpd %%xmm0 , %%xmm13 \n\t" + "mulpd %%xmm0 , %%xmm14 \n\t" + "mulpd %%xmm0 , %%xmm15 \n\t" 
+ + "addpd %%xmm12, %%xmm8 \n\t" // y += alpha *x + "addpd %%xmm13, %%xmm9 \n\t" + "addpd %%xmm14, %%xmm10 \n\t" + "addpd %%xmm15, %%xmm11 \n\t" + + "movups %%xmm8 , (%3,%0,8) \n\t" + "movups %%xmm9 , 16(%3,%0,8) \n\t" + "movups %%xmm10, 32(%3,%0,8) \n\t" + "movups %%xmm11, 48(%3,%0,8) \n\t" + + "addq $8 , %0 \n\t" + "subq $8 , %1 \n\t" + "jnz .L01LOOP%= \n\t" + + : + : + "r" (i), // 0 + "r" (n), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (alpha) // 4 + : "cc", + "%xmm0", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + +} + + From 9d2ace8bac646d423528b307528043446a9c98ee Mon Sep 17 00:00:00 2001 From: wernsaar Date: Sun, 24 Aug 2014 10:57:12 +0200 Subject: [PATCH 027/119] added optimized daxpy kernel for bulldozer --- kernel/x86_64/daxpy.c | 2 + kernel/x86_64/daxpy_microk_bulldozer-2.c | 82 ++++++++++++++++++++++++ 2 files changed, 84 insertions(+) create mode 100644 kernel/x86_64/daxpy_microk_bulldozer-2.c diff --git a/kernel/x86_64/daxpy.c b/kernel/x86_64/daxpy.c index feda045d0..83754cbd3 100644 --- a/kernel/x86_64/daxpy.c +++ b/kernel/x86_64/daxpy.c @@ -31,6 +31,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if defined(NEHALEM) #include "daxpy_microk_nehalem-2.c" +#elif defined(BULLDOZER) +#include "daxpy_microk_bulldozer-2.c" #endif diff --git a/kernel/x86_64/daxpy_microk_bulldozer-2.c b/kernel/x86_64/daxpy_microk_bulldozer-2.c new file mode 100644 index 000000000..b1ef84a18 --- /dev/null +++ b/kernel/x86_64/daxpy_microk_bulldozer-2.c @@ -0,0 +1,82 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#define HAVE_KERNEL_8 1 +static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y , FLOAT *alpha) __attribute__ ((noinline)); + +static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) +{ + + + BLASLONG register i = 0; + + __asm__ __volatile__ + ( + "vmovddup (%4), %%xmm0 \n\t" // alpha + + ".align 16 \n\t" + ".L01LOOP%=: \n\t" + + "prefetcht0 768(%3,%0,8) \n\t" + "vmovups (%2,%0,8), %%xmm12 \n\t" // 2 * x + "vfmaddpd (%3,%0,8), %%xmm0 , %%xmm12, %%xmm8 \n\t" // y += alpha * x + "vmovups 16(%2,%0,8), %%xmm13 \n\t" // 2 * x + ".align 2 \n\t" + "vmovups %%xmm8 , (%3,%0,8) \n\t" + "vfmaddpd 16(%3,%0,8), %%xmm0 , %%xmm13, %%xmm9 \n\t" // y += alpha * x + ".align 2 \n\t" + "vmovups 32(%2,%0,8), %%xmm14 \n\t" // 2 * x + "vmovups %%xmm9 , 16(%3,%0,8) \n\t" + "prefetcht0 768(%2,%0,8) \n\t" + ".align 2 \n\t" + "vfmaddpd 32(%3,%0,8), %%xmm0 , %%xmm14, %%xmm10 \n\t" // y += alpha * x + "vmovups 48(%2,%0,8), %%xmm15 \n\t" // 2 * x + "vmovups %%xmm10, 32(%3,%0,8) \n\t" + "vfmaddpd 48(%3,%0,8), %%xmm0 , %%xmm15, %%xmm11 \n\t" // y += alpha * x + "vmovups %%xmm11, 48(%3,%0,8) \n\t" + + "addq $8 , %0 \n\t" + "subq $8 , %1 \n\t" + "jnz .L01LOOP%= \n\t" + + : + : + "r" (i), // 0 + "r" (n), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (alpha) // 4 + : "cc", + "%xmm0", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + +} + + From ee744451559d7e41c8bab16d2bd82f6f2ab8c103 Mon Sep 17 00:00:00 2001 From: wernsaar Date: Mon, 25 Aug 2014 14:53:28 +0200 Subject: [PATCH 028/119] added optimized caxpy kernel for bulldozer --- kernel/x86_64/KERNEL.BULLDOZER | 3 + kernel/x86_64/caxpy.c | 131 ++++++++++++++++++++++ kernel/x86_64/caxpy_microk_bulldozer-2.c | 135 +++++++++++++++++++++++ kernel/x86_64/daxpy.c | 2 +- kernel/x86_64/ddot.c | 2 +- kernel/x86_64/saxpy.c | 2 +- kernel/x86_64/sdot.c | 2 +- 7 files changed, 273 insertions(+), 4 deletions(-) 
create mode 100644 kernel/x86_64/caxpy.c create mode 100644 kernel/x86_64/caxpy_microk_bulldozer-2.c diff --git a/kernel/x86_64/KERNEL.BULLDOZER b/kernel/x86_64/KERNEL.BULLDOZER index 9e6fc2e28..701eea310 100644 --- a/kernel/x86_64/KERNEL.BULLDOZER +++ b/kernel/x86_64/KERNEL.BULLDOZER @@ -1,3 +1,6 @@ +DAXPYKERNEL = daxpy.c +CAXPYKERNEL = caxpy.c + SDOTKERNEL = sdot.c DDOTKERNEL = ddot.c diff --git a/kernel/x86_64/caxpy.c b/kernel/x86_64/caxpy.c new file mode 100644 index 000000000..fa8924ae9 --- /dev/null +++ b/kernel/x86_64/caxpy.c @@ -0,0 +1,131 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + + +#include "common.h" + + +#if defined(BULLDOZER) +#include "caxpy_microk_bulldozer-2.c" +#endif + + +#ifndef HAVE_KERNEL_8 + +static void caxpy_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) +{ + BLASLONG register i = 0; + BLASLONG register ix = 0; + FLOAT da_r = alpha[0]; + FLOAT da_i = alpha[1]; + + + while(i < n) + { +#if !defined(CONJ) + y[ix] += ( da_r * x[ix] - da_i * x[ix+1] ) ; + y[ix+1] += ( da_r * x[ix+1] + da_i * x[ix] ) ; + y[ix+2] += ( da_r * x[ix+2] - da_i * x[ix+3] ) ; + y[ix+3] += ( da_r * x[ix+3] + da_i * x[ix+2] ) ; +#else + y[ix] += ( da_r * x[ix] + da_i * x[ix+1] ) ; + y[ix+1] -= ( da_r * x[ix+1] - da_i * x[ix] ) ; + y[ix+2] += ( da_r * x[ix+2] + da_i * x[ix+3] ) ; + y[ix+3] -= ( da_r * x[ix+3] - da_i * x[ix+2] ) ; +#endif + + ix+=4 ; + i+=2 ; + + } + +} + +#endif + +int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) +{ + BLASLONG i=0; + BLASLONG ix=0,iy=0; + FLOAT da[2]; + + if ( n <= 0 ) return(0); + + if ( (inc_x == 1) && (inc_y == 1) ) + { + + int n1 = n & -8; + + if ( n1 ) + { + da[0] = da_r; + da[1] = da_i; + caxpy_kernel_8(n1, x, y , &da ); + ix = 2 * n1; + } + i = n1; + while(i < n) + { +#if !defined(CONJ) + y[ix] += ( da_r * x[ix] - da_i * x[ix+1] ) ; + y[ix+1] += ( da_r * x[ix+1] + da_i * x[ix] ) ; +#else + y[ix] += ( da_r * 
x[ix] + da_i * x[ix+1] ) ; + y[ix+1] -= ( da_r * x[ix+1] - da_i * x[ix] ) ; +#endif + i++ ; + ix += 2; + + } + return(0); + + + } + + inc_x *=2; + inc_y *=2; + + while(i < n) + { + +#if !defined(CONJ) + y[iy] += ( da_r * x[ix] - da_i * x[ix+1] ) ; + y[iy+1] += ( da_r * x[ix+1] + da_i * x[ix] ) ; +#else + y[iy] += ( da_r * x[ix] + da_i * x[ix+1] ) ; + y[iy+1] -= ( da_r * x[ix+1] - da_i * x[ix] ) ; +#endif + ix += inc_x ; + iy += inc_y ; + i++ ; + + } + return(0); + +} + + diff --git a/kernel/x86_64/caxpy_microk_bulldozer-2.c b/kernel/x86_64/caxpy_microk_bulldozer-2.c new file mode 100644 index 000000000..86407028c --- /dev/null +++ b/kernel/x86_64/caxpy_microk_bulldozer-2.c @@ -0,0 +1,135 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define HAVE_KERNEL_8 1 +static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y , FLOAT *alpha) __attribute__ ((noinline)); + +static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) +{ + + + BLASLONG register i = 0; + + __asm__ __volatile__ + ( + "vbroadcastss (%4), %%xmm0 \n\t" // real part of alpha + "vbroadcastss 4(%4), %%xmm1 \n\t" // imag part of alpha + + ".align 16 \n\t" + ".L01LOOP%=: \n\t" + + "prefetcht0 768(%2,%0,4) \n\t" + "vmovups (%2,%0,4), %%xmm5 \n\t" // 2 complex values from x + "vmovups 16(%2,%0,4), %%xmm7 \n\t" // 2 complex values from x + "vmovups 32(%2,%0,4), %%xmm9 \n\t" // 2 complex values from x + "vmovups 48(%2,%0,4), %%xmm11 \n\t" // 2 complex values from x + "prefetcht0 768(%3,%0,4) \n\t" + +#if !defined(CONJ) + "vfmaddps (%3,%0,4), %%xmm0 , %%xmm5, %%xmm12 \n\t" + "vpermilps $0xb1 , %%xmm5 , %%xmm4 \n\t" // exchange real and imag part + "vmulps %%xmm1, %%xmm4 , %%xmm4 \n\t" + + "vfmaddps 16(%3,%0,4), %%xmm0 , %%xmm7, %%xmm13 \n\t" + "vpermilps $0xb1 , %%xmm7 , %%xmm6 \n\t" // exchange real and imag part + "vmulps %%xmm1, %%xmm6 , %%xmm6 \n\t" + + "vfmaddps 32(%3,%0,4), %%xmm0 , %%xmm9, %%xmm14 \n\t" + "vpermilps $0xb1 , %%xmm9 , %%xmm8 \n\t" // exchange real and imag part + "vmulps %%xmm1, %%xmm8 , %%xmm8 \n\t" + + "vfmaddps 48(%3,%0,4), %%xmm0 , %%xmm11,%%xmm15 \n\t" + "vpermilps $0xb1 , %%xmm11, %%xmm10 \n\t" // exchange 
real and imag part + "vmulps %%xmm1, %%xmm10, %%xmm10 \n\t" + + "vaddsubps %%xmm4, %%xmm12, %%xmm12 \n\t" + "vaddsubps %%xmm6, %%xmm13, %%xmm13 \n\t" + "vaddsubps %%xmm8, %%xmm14, %%xmm14 \n\t" + "vaddsubps %%xmm10,%%xmm15, %%xmm15 \n\t" + +#else + + "vmulps %%xmm0, %%xmm5, %%xmm4 \n\t" // a_r*x_r, a_r*x_i + "vmulps %%xmm1, %%xmm5, %%xmm5 \n\t" // a_i*x_r, a_i*x_i + "vmulps %%xmm0, %%xmm7, %%xmm6 \n\t" // a_r*x_r, a_r*x_i + "vmulps %%xmm1, %%xmm7, %%xmm7 \n\t" // a_i*x_r, a_i*x_i + "vmulps %%xmm0, %%xmm9, %%xmm8 \n\t" // a_r*x_r, a_r*x_i + "vmulps %%xmm1, %%xmm9, %%xmm9 \n\t" // a_i*x_r, a_i*x_i + "vmulps %%xmm0, %%xmm11, %%xmm10 \n\t" // a_r*x_r, a_r*x_i + "vmulps %%xmm1, %%xmm11, %%xmm11 \n\t" // a_i*x_r, a_i*x_i + + "vpermilps $0xb1 , %%xmm4 , %%xmm4 \n\t" // exchange real and imag part + "vaddsubps %%xmm4 ,%%xmm5 , %%xmm4 \n\t" + "vpermilps $0xb1 , %%xmm4 , %%xmm4 \n\t" // exchange real and imag part + + "vpermilps $0xb1 , %%xmm6 , %%xmm6 \n\t" // exchange real and imag part + "vaddsubps %%xmm6 ,%%xmm7 , %%xmm6 \n\t" + "vpermilps $0xb1 , %%xmm6 , %%xmm6 \n\t" // exchange real and imag part + + "vpermilps $0xb1 , %%xmm8 , %%xmm8 \n\t" // exchange real and imag part + "vaddsubps %%xmm8 ,%%xmm9 , %%xmm8 \n\t" + "vpermilps $0xb1 , %%xmm8 , %%xmm8 \n\t" // exchange real and imag part + + "vpermilps $0xb1 , %%xmm10, %%xmm10 \n\t" // exchange real and imag part + "vaddsubps %%xmm10,%%xmm11, %%xmm10 \n\t" + "vpermilps $0xb1 , %%xmm10, %%xmm10 \n\t" // exchange real and imag part + + "vaddps (%3,%0,4) ,%%xmm4 , %%xmm12 \n\t" + "vaddps 16(%3,%0,4) ,%%xmm6 , %%xmm13 \n\t" + "vaddps 32(%3,%0,4) ,%%xmm8 , %%xmm14 \n\t" + "vaddps 48(%3,%0,4) ,%%xmm10, %%xmm15 \n\t" + + +#endif + + "vmovups %%xmm12, (%3,%0,4) \n\t" + "vmovups %%xmm13, 16(%3,%0,4) \n\t" + "vmovups %%xmm14, 32(%3,%0,4) \n\t" + "vmovups %%xmm15, 48(%3,%0,4) \n\t" + + "addq $16, %0 \n\t" + "subq $8 , %1 \n\t" + "jnz .L01LOOP%= \n\t" + + : + : + "r" (i), // 0 + "r" (n), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" 
(alpha) // 4 + : "cc", + "%xmm0", "%xmm1", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + +} + + diff --git a/kernel/x86_64/daxpy.c b/kernel/x86_64/daxpy.c index 83754cbd3..f1d50c909 100644 --- a/kernel/x86_64/daxpy.c +++ b/kernel/x86_64/daxpy.c @@ -38,7 +38,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #ifndef HAVE_KERNEL_8 -void daxpy_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) +static void daxpy_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) { BLASLONG register i = 0; FLOAT a = *alpha; diff --git a/kernel/x86_64/ddot.c b/kernel/x86_64/ddot.c index ee6785f6a..b3aad438f 100644 --- a/kernel/x86_64/ddot.c +++ b/kernel/x86_64/ddot.c @@ -38,7 +38,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #ifndef HAVE_KERNEL_8 -void ddot_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d) +static void ddot_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d) { BLASLONG register i = 0; FLOAT dot = 0.0; diff --git a/kernel/x86_64/saxpy.c b/kernel/x86_64/saxpy.c index e6c016ee3..da81f1354 100644 --- a/kernel/x86_64/saxpy.c +++ b/kernel/x86_64/saxpy.c @@ -36,7 +36,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #ifndef HAVE_KERNEL_16 -void saxpy_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) +static void saxpy_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) { BLASLONG register i = 0; FLOAT a = *alpha; diff --git a/kernel/x86_64/sdot.c b/kernel/x86_64/sdot.c index a13d65d25..632d16810 100644 --- a/kernel/x86_64/sdot.c +++ b/kernel/x86_64/sdot.c @@ -37,7 +37,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#ifndef HAVE_KERNEL_16 -void sdot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d) +static void sdot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d) { BLASLONG register i = 0; FLOAT dot = 0.0; From 3885eebdb82787b452b532e6cb7a23d9711a514e Mon Sep 17 00:00:00 2001 From: wernsaar Date: Mon, 25 Aug 2014 15:52:35 +0200 Subject: [PATCH 029/119] added optimized zaxpy bulldozer kernel --- kernel/x86_64/KERNEL.BULLDOZER | 1 + kernel/x86_64/zaxpy.c | 131 ++++++++++++++++++++++ kernel/x86_64/zaxpy_microk_bulldozer-2.c | 135 +++++++++++++++++++++++ 3 files changed, 267 insertions(+) create mode 100644 kernel/x86_64/zaxpy.c create mode 100644 kernel/x86_64/zaxpy_microk_bulldozer-2.c diff --git a/kernel/x86_64/KERNEL.BULLDOZER b/kernel/x86_64/KERNEL.BULLDOZER index 701eea310..6318b202c 100644 --- a/kernel/x86_64/KERNEL.BULLDOZER +++ b/kernel/x86_64/KERNEL.BULLDOZER @@ -1,5 +1,6 @@ DAXPYKERNEL = daxpy.c CAXPYKERNEL = caxpy.c +ZAXPYKERNEL = zaxpy.c SDOTKERNEL = sdot.c DDOTKERNEL = ddot.c diff --git a/kernel/x86_64/zaxpy.c b/kernel/x86_64/zaxpy.c new file mode 100644 index 000000000..ca2f03dd0 --- /dev/null +++ b/kernel/x86_64/zaxpy.c @@ -0,0 +1,131 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + + +#include "common.h" + + +#if defined(BULLDOZER) +#include "zaxpy_microk_bulldozer-2.c" +#endif + + +#ifndef HAVE_KERNEL_4 + +static void zaxpy_kernel_4(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) +{ + BLASLONG register i = 0; + BLASLONG register ix = 0; + FLOAT da_r = alpha[0]; + FLOAT da_i = alpha[1]; + + + while(i < n) + { +#if !defined(CONJ) + y[ix] += ( da_r * x[ix] - da_i * x[ix+1] ) ; + y[ix+1] += ( da_r * x[ix+1] + da_i * x[ix] ) ; + y[ix+2] += ( da_r * x[ix+2] - da_i * x[ix+3] ) ; + y[ix+3] += ( da_r * x[ix+3] + da_i * x[ix+2] ) ; +#else + y[ix] += ( da_r * x[ix] + da_i * x[ix+1] ) ; + y[ix+1] -= ( da_r * x[ix+1] - da_i * x[ix] ) ; + y[ix+2] += ( da_r * x[ix+2] + da_i * x[ix+3] ) ; + y[ix+3] -= ( da_r * x[ix+3] - da_i * x[ix+2] ) ; +#endif + + ix+=4 ; + i+=2 ; + + } + +} + +#endif + +int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) +{ + BLASLONG i=0; + BLASLONG ix=0,iy=0; + FLOAT da[2]; + + if ( n <= 0 ) return(0); + + if ( (inc_x == 1) && (inc_y == 1) ) + { + + int n1 = n & -4; + + if ( n1 ) + { + da[0] = 
da_r; + da[1] = da_i; + zaxpy_kernel_4(n1, x, y , &da ); + ix = 2 * n1; + } + i = n1; + while(i < n) + { +#if !defined(CONJ) + y[ix] += ( da_r * x[ix] - da_i * x[ix+1] ) ; + y[ix+1] += ( da_r * x[ix+1] + da_i * x[ix] ) ; +#else + y[ix] += ( da_r * x[ix] + da_i * x[ix+1] ) ; + y[ix+1] -= ( da_r * x[ix+1] - da_i * x[ix] ) ; +#endif + i++ ; + ix += 2; + + } + return(0); + + + } + + inc_x *=2; + inc_y *=2; + + while(i < n) + { + +#if !defined(CONJ) + y[iy] += ( da_r * x[ix] - da_i * x[ix+1] ) ; + y[iy+1] += ( da_r * x[ix+1] + da_i * x[ix] ) ; +#else + y[iy] += ( da_r * x[ix] + da_i * x[ix+1] ) ; + y[iy+1] -= ( da_r * x[ix+1] - da_i * x[ix] ) ; +#endif + ix += inc_x ; + iy += inc_y ; + i++ ; + + } + return(0); + +} + + diff --git a/kernel/x86_64/zaxpy_microk_bulldozer-2.c b/kernel/x86_64/zaxpy_microk_bulldozer-2.c new file mode 100644 index 000000000..780109b69 --- /dev/null +++ b/kernel/x86_64/zaxpy_microk_bulldozer-2.c @@ -0,0 +1,135 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define HAVE_KERNEL_4 1 +static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y , FLOAT *alpha) __attribute__ ((noinline)); + +static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) +{ + + + BLASLONG register i = 0; + + __asm__ __volatile__ + ( + "vmovddup (%4), %%xmm0 \n\t" // real part of alpha + "vmovddup 8(%4), %%xmm1 \n\t" // imag part of alpha + + ".align 16 \n\t" + ".L01LOOP%=: \n\t" + + "prefetcht0 768(%2,%0,8) \n\t" + "vmovups (%2,%0,8), %%xmm5 \n\t" // 1 complex values from x + "vmovups 16(%2,%0,8), %%xmm7 \n\t" // 1 complex values from x + "vmovups 32(%2,%0,8), %%xmm9 \n\t" // 1 complex values from x + "vmovups 48(%2,%0,8), %%xmm11 \n\t" // 1 complex values from x + "prefetcht0 768(%3,%0,8) \n\t" + +#if !defined(CONJ) + "vfmaddpd (%3,%0,8), %%xmm0 , %%xmm5, %%xmm12 \n\t" + "vpermilpd $0x1 , %%xmm5 , %%xmm4 \n\t" // exchange real and imag part + "vmulpd %%xmm1, %%xmm4 , %%xmm4 \n\t" + + "vfmaddpd 16(%3,%0,8), %%xmm0 , %%xmm7, %%xmm13 \n\t" + "vpermilpd $0x1 , %%xmm7 , %%xmm6 \n\t" // exchange real and imag part + "vmulpd %%xmm1, %%xmm6 , %%xmm6 \n\t" + + "vfmaddpd 32(%3,%0,8), %%xmm0 , %%xmm9, 
%%xmm14 \n\t" + "vpermilpd $0x1 , %%xmm9 , %%xmm8 \n\t" // exchange real and imag part + "vmulpd %%xmm1, %%xmm8 , %%xmm8 \n\t" + + "vfmaddpd 48(%3,%0,8), %%xmm0 , %%xmm11,%%xmm15 \n\t" + "vpermilpd $0x1 , %%xmm11, %%xmm10 \n\t" // exchange real and imag part + "vmulpd %%xmm1, %%xmm10, %%xmm10 \n\t" + + "vaddsubpd %%xmm4, %%xmm12, %%xmm12 \n\t" + "vaddsubpd %%xmm6, %%xmm13, %%xmm13 \n\t" + "vaddsubpd %%xmm8, %%xmm14, %%xmm14 \n\t" + "vaddsubpd %%xmm10,%%xmm15, %%xmm15 \n\t" + +#else + + "vmulpd %%xmm0, %%xmm5, %%xmm4 \n\t" // a_r*x_r, a_r*x_i + "vmulpd %%xmm1, %%xmm5, %%xmm5 \n\t" // a_i*x_r, a_i*x_i + "vmulpd %%xmm0, %%xmm7, %%xmm6 \n\t" // a_r*x_r, a_r*x_i + "vmulpd %%xmm1, %%xmm7, %%xmm7 \n\t" // a_i*x_r, a_i*x_i + "vmulpd %%xmm0, %%xmm9, %%xmm8 \n\t" // a_r*x_r, a_r*x_i + "vmulpd %%xmm1, %%xmm9, %%xmm9 \n\t" // a_i*x_r, a_i*x_i + "vmulpd %%xmm0, %%xmm11, %%xmm10 \n\t" // a_r*x_r, a_r*x_i + "vmulpd %%xmm1, %%xmm11, %%xmm11 \n\t" // a_i*x_r, a_i*x_i + + "vpermilpd $0x1 , %%xmm4 , %%xmm4 \n\t" // exchange real and imag part + "vaddsubpd %%xmm4 ,%%xmm5 , %%xmm4 \n\t" + "vpermilpd $0x1 , %%xmm4 , %%xmm4 \n\t" // exchange real and imag part + + "vpermilpd $0x1 , %%xmm6 , %%xmm6 \n\t" // exchange real and imag part + "vaddsubpd %%xmm6 ,%%xmm7 , %%xmm6 \n\t" + "vpermilpd $0x1 , %%xmm6 , %%xmm6 \n\t" // exchange real and imag part + + "vpermilpd $0x1 , %%xmm8 , %%xmm8 \n\t" // exchange real and imag part + "vaddsubpd %%xmm8 ,%%xmm9 , %%xmm8 \n\t" + "vpermilpd $0x1 , %%xmm8 , %%xmm8 \n\t" // exchange real and imag part + + "vpermilpd $0x1 , %%xmm10, %%xmm10 \n\t" // exchange real and imag part + "vaddsubpd %%xmm10,%%xmm11, %%xmm10 \n\t" + "vpermilpd $0x1 , %%xmm10, %%xmm10 \n\t" // exchange real and imag part + + "vaddpd (%3,%0,8) ,%%xmm4 , %%xmm12 \n\t" + "vaddpd 16(%3,%0,8) ,%%xmm6 , %%xmm13 \n\t" + "vaddpd 32(%3,%0,8) ,%%xmm8 , %%xmm14 \n\t" + "vaddpd 48(%3,%0,8) ,%%xmm10, %%xmm15 \n\t" + + +#endif + + "vmovups %%xmm12, (%3,%0,8) \n\t" + "vmovups %%xmm13, 16(%3,%0,8) 
\n\t" + "vmovups %%xmm14, 32(%3,%0,8) \n\t" + "vmovups %%xmm15, 48(%3,%0,8) \n\t" + + "addq $8 , %0 \n\t" + "subq $4 , %1 \n\t" + "jnz .L01LOOP%= \n\t" + + : + : + "r" (i), // 0 + "r" (n), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (alpha) // 4 + : "cc", + "%xmm0", "%xmm1", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + +} + + From c8cc4a0d22efc56d46f38417f7bd5035f038d571 Mon Sep 17 00:00:00 2001 From: Zhang Xianyi Date: Tue, 26 Aug 2014 16:14:34 +0800 Subject: [PATCH 030/119] Fixed the typo in Changelog.txt --- Changelog.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Changelog.txt b/Changelog.txt index d33cffc7b..4fd849048 100644 --- a/Changelog.txt +++ b/Changelog.txt @@ -1,6 +1,6 @@ OpenBLAS ChangeLog ==================================================================== -Version 0.2.10 +Version 0.2.11 18-Aug-2014 common: * Added some benchmark codes. From 84badf80866d300eecdb62f49689df1b6ca2ccd7 Mon Sep 17 00:00:00 2001 From: wernsaar Date: Tue, 26 Aug 2014 17:36:32 +0200 Subject: [PATCH 031/119] EXPERIMENTAL: added the flag -no-integrated-as for clang compiler in Makefile.system --- Makefile.system | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/Makefile.system b/Makefile.system index ccde8e9ce..385ad47c3 100644 --- a/Makefile.system +++ b/Makefile.system @@ -339,7 +339,7 @@ FCOMMON_OPT += -m128bit-long-double endif ifeq ($(C_COMPILER), CLANG) EXPRECISION = 1 -CCOMMON_OPT += -DEXPRECISION +CCOMMON_OPT += -DEXPRECISION FCOMMON_OPT += -m128bit-long-double endif endif @@ -350,6 +350,12 @@ ifeq ($(C_COMPILER), INTEL) CCOMMON_OPT += -wd981 endif +ifeq ($(ARCH), x86_64) +ifeq ($(C_COMPILER), CLANG) +CCOMMON_OPT += -no-integrated-as +endif +endif + ifeq ($(USE_OPENMP), 1) # ifeq logical or. 
GCC or LSB ifeq ($(C_COMPILER), $(filter $(C_COMPILER),GCC LSB)) From 5fa61587312bd6221fe4050233b7135f3cc73a3f Mon Sep 17 00:00:00 2001 From: wernsaar Date: Tue, 26 Aug 2014 18:29:40 +0200 Subject: [PATCH 032/119] renoved flag no-integrated-as, because not working on macosx --- Makefile.system | 5 ----- 1 file changed, 5 deletions(-) diff --git a/Makefile.system b/Makefile.system index 385ad47c3..d2ff74146 100644 --- a/Makefile.system +++ b/Makefile.system @@ -350,11 +350,6 @@ ifeq ($(C_COMPILER), INTEL) CCOMMON_OPT += -wd981 endif -ifeq ($(ARCH), x86_64) -ifeq ($(C_COMPILER), CLANG) -CCOMMON_OPT += -no-integrated-as -endif -endif ifeq ($(USE_OPENMP), 1) # ifeq logical or. GCC or LSB From 20cd85012509993f2e1ed9b9ab83ae63297e0e6d Mon Sep 17 00:00:00 2001 From: wernsaar Date: Wed, 27 Aug 2014 09:00:20 +0200 Subject: [PATCH 033/119] modification for clang compiler --- kernel/x86_64/cgemm_kernel_8x2_haswell.S | 16 ++++++++-------- kernel/x86_64/sgemm_kernel_16x4_haswell.S | 20 ++++++++++---------- kernel/x86_64/zgemm_kernel_4x2_haswell.S | 12 ++++++------ 3 files changed, 24 insertions(+), 24 deletions(-) diff --git a/kernel/x86_64/cgemm_kernel_8x2_haswell.S b/kernel/x86_64/cgemm_kernel_8x2_haswell.S index 98f40054e..a608071db 100644 --- a/kernel/x86_64/cgemm_kernel_8x2_haswell.S +++ b/kernel/x86_64/cgemm_kernel_8x2_haswell.S @@ -227,8 +227,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. VFMADDPS_I( %ymm7 ,%ymm3,%ymm1 ) - addq $6*SIZE, BO - addq $16*SIZE, AO + addq $ 6*SIZE, BO + addq $ 16*SIZE, AO decq %rax .endm @@ -356,8 +356,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. VFMADDPS_R( %ymm4 ,%ymm2,%ymm0 ) VFMADDPS_I( %ymm5 ,%ymm3,%ymm0 ) - addq $6*SIZE, BO - addq $8*SIZE, AO + addq $ 6*SIZE, BO + addq $ 8*SIZE, AO decq %rax .endm @@ -447,8 +447,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
VFMADDPS_R( %xmm4 ,%xmm2,%xmm0 ) VFMADDPS_I( %xmm5 ,%xmm3,%xmm0 ) - addq $6*SIZE, BO - addq $4*SIZE, AO + addq $ 6*SIZE, BO + addq $ 4*SIZE, AO decq %rax .endm @@ -540,8 +540,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. VFMADDPS_R( %xmm4 ,%xmm2,%xmm0 ) VFMADDPS_I( %xmm5 ,%xmm3,%xmm0 ) - addq $6*SIZE, BO - addq $2*SIZE, AO + addq $ 6*SIZE, BO + addq $ 2*SIZE, AO decq %rax .endm diff --git a/kernel/x86_64/sgemm_kernel_16x4_haswell.S b/kernel/x86_64/sgemm_kernel_16x4_haswell.S index d88add02b..ef156fd27 100644 --- a/kernel/x86_64/sgemm_kernel_16x4_haswell.S +++ b/kernel/x86_64/sgemm_kernel_16x4_haswell.S @@ -181,8 +181,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. VFMADD231PS_( %ymm14,%ymm3,%ymm0 ) VFMADD231PS_( %ymm15,%ymm3,%ymm1 ) - addq $6*SIZE, BO - addq $16*SIZE, AO + addq $ 6*SIZE, BO + addq $ 16*SIZE, AO decq %rax .endm @@ -268,8 +268,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. VFMADD231PS_( %ymm12,%ymm2,%ymm0 ) VFMADD231PS_( %ymm14,%ymm3,%ymm0 ) - addq $6*SIZE, BO - addq $8*SIZE, AO + addq $ 6*SIZE, BO + addq $ 8*SIZE, AO decq %rax .endm @@ -327,8 +327,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. VFMADD231PS_( %xmm12,%xmm2,%xmm0 ) VFMADD231PS_( %xmm14,%xmm3,%xmm0 ) - addq $6*SIZE, BO - addq $4*SIZE, AO + addq $ 6*SIZE, BO + addq $ 4*SIZE, AO decq %rax .endm @@ -392,8 +392,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. VFMADD231SS_( %xmm14,%xmm3,%xmm0 ) VFMADD231SS_( %xmm15,%xmm3,%xmm1 ) - addq $6*SIZE, BO - addq $2*SIZE, AO + addq $ 6*SIZE, BO + addq $ 2*SIZE, AO decq %rax .endm @@ -478,8 +478,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
VFMADD231SS_( %xmm12,%xmm2,%xmm0 ) VFMADD231SS_( %xmm14,%xmm3,%xmm0 ) - addq $6*SIZE, BO - addq $1*SIZE, AO + addq $ 6*SIZE, BO + addq $ 1*SIZE, AO decq %rax .endm diff --git a/kernel/x86_64/zgemm_kernel_4x2_haswell.S b/kernel/x86_64/zgemm_kernel_4x2_haswell.S index e23e09ecc..f91bfa89b 100644 --- a/kernel/x86_64/zgemm_kernel_4x2_haswell.S +++ b/kernel/x86_64/zgemm_kernel_4x2_haswell.S @@ -222,8 +222,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. VFMADDPD_I( %ymm5 ,%ymm3,%ymm0 ) VFMADDPD_I( %ymm7 ,%ymm3,%ymm1 ) - addq $6*SIZE, BO - addq $8*SIZE, AO + addq $ 6*SIZE, BO + addq $ 8*SIZE, AO decq %rax .endm @@ -362,8 +362,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. VFMADDPD_I( %xmm5 ,%xmm3,%xmm0 ) VFMADDPD_I( %xmm7 ,%xmm3,%xmm1 ) - addq $6*SIZE, BO - addq $4*SIZE, AO + addq $ 6*SIZE, BO + addq $ 4*SIZE, AO decq %rax .endm @@ -491,8 +491,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. VFMADDPD_R( %xmm4 ,%xmm2,%xmm0 ) VFMADDPD_I( %xmm5 ,%xmm3,%xmm0 ) - addq $6*SIZE, BO - addq $2*SIZE, AO + addq $ 6*SIZE, BO + addq $ 2*SIZE, AO decq %rax .endm From db7e6366cd86b57cd0712293968938058288bde0 Mon Sep 17 00:00:00 2001 From: Isaac Dunham Date: Thu, 28 Aug 2014 13:05:07 -0700 Subject: [PATCH 034/119] Workaround PIC limitations in cpuid. cpuid uses register ebx, but ebx is reserved in PIC. So save ebx, swap ebx & edi, and return edi. Copied from Igor Pavlov's equivalent fix for 7zip (in CpuArch.c), which is public domain and thus OK license-wise. 
--- cpuid_x86.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/cpuid_x86.c b/cpuid_x86.c index 53016e1e7..f9df7221b 100644 --- a/cpuid_x86.c +++ b/cpuid_x86.c @@ -59,9 +59,16 @@ void cpuid(int op, int *eax, int *ebx, int *ecx, int *edx); #else static inline void cpuid(int op, int *eax, int *ebx, int *ecx, int *edx){ +#if defined(__i386__) && defined(__PIC__) + __asm__ __volatile__ + ("mov %%ebx, %%edi;" + "cpuid;" + "xchgl %%ebx, %%edi;" + : "=a" (*eax), "=D" (*ebx), "=c" (*ecx), "=d" (*edx) : "a" (op) : "cc"); +#else __asm__ __volatile__ ("cpuid": "=a" (*eax), "=b" (*ebx), "=c" (*ecx), "=d" (*edx) : "a" (op) : "cc"); - +#endif } #endif From 53e6dbf6ca6e7c798e0ed0dfd24a78570a814553 Mon Sep 17 00:00:00 2001 From: wernsaar Date: Sat, 30 Aug 2014 13:36:27 +0200 Subject: [PATCH 035/119] optimized sgemv_t kernel for small sizes --- kernel/x86_64/KERNEL.NEHALEM | 2 +- kernel/x86_64/sgemv_t_4.c | 235 +++++++++++++++++++++++ kernel/x86_64/sgemv_t_microk_nehalem-4.c | 99 ++++++++++ 3 files changed, 335 insertions(+), 1 deletion(-) create mode 100644 kernel/x86_64/sgemv_t_4.c create mode 100644 kernel/x86_64/sgemv_t_microk_nehalem-4.c diff --git a/kernel/x86_64/KERNEL.NEHALEM b/kernel/x86_64/KERNEL.NEHALEM index 8adb579cf..00c3b4d15 100644 --- a/kernel/x86_64/KERNEL.NEHALEM +++ b/kernel/x86_64/KERNEL.NEHALEM @@ -10,7 +10,7 @@ SSYMV_U_KERNEL = ssymv_U.c SSYMV_L_KERNEL = ssymv_L.c SGEMVNKERNEL = sgemv_n.c -SGEMVTKERNEL = sgemv_t.c +SGEMVTKERNEL = sgemv_t_4.c DGEMVNKERNEL = dgemv_n.c SGEMMKERNEL = gemm_kernel_4x8_nehalem.S diff --git a/kernel/x86_64/sgemv_t_4.c b/kernel/x86_64/sgemv_t_4.c new file mode 100644 index 000000000..e0eb9220b --- /dev/null +++ b/kernel/x86_64/sgemv_t_4.c @@ -0,0 +1,235 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. 
+Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + + +#include "common.h" + +#if defined(NEHALEM) +#include "sgemv_t_microk_nehalem-4.c" +#endif + +#define NBMAX 4096 + +#ifndef HAVE_KERNEL_4x4 + +static void sgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) +{ + BLASLONG i; + FLOAT *a0,*a1,*a2,*a3; + a0 = ap[0]; + a1 = ap[1]; + a2 = ap[2]; + a3 = ap[3]; + FLOAT temp0 = 0.0; + FLOAT temp1 = 0.0; + FLOAT temp2 = 0.0; + FLOAT temp3 = 0.0; + + for ( i=0; i< n; i+=4 ) + { + temp0 += a0[i]*x[i] + a0[i+1]*x[i+1] + a0[i+2]*x[i+2] + a0[i+3]*x[i+3]; + temp1 += a1[i]*x[i] + a1[i+1]*x[i+1] + a1[i+2]*x[i+2] + a1[i+3]*x[i+3]; + temp2 += a2[i]*x[i] + a2[i+1]*x[i+1] + a2[i+2]*x[i+2] + a2[i+3]*x[i+3]; + temp3 += a3[i]*x[i] + a3[i+1]*x[i+1] + a3[i+2]*x[i+2] + a3[i+3]*x[i+3]; + } + y[0] = temp0; + y[1] = temp1; + y[2] = temp2; + y[3] = temp3; +} + +#endif + +static void sgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) +{ + BLASLONG i; + FLOAT *a0; + a0 = ap; + FLOAT temp = 0.0; + + for ( i=0; i< n; i+=4 ) + { + temp += a0[i]*x[i] + a0[i+1]*x[i+1] + a0[i+2]*x[i+2] + a0[i+3]*x[i+3]; + } + *y = temp; +} + +static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src) +{ + BLASLONG i; + for ( i=0; i> 2 ; + n2 = n & 3 ; + + m3 = m & 3 ; + m1 = m & -4 ; + m2 = (m & (NBMAX-1)) - m3 ; + + + BLASLONG NB = NBMAX; + + while ( NB == NBMAX ) + { + + m1 -= NB; + if ( m1 < 0) + { + if ( m2 == 0 ) break; + NB = m2; + } + + y_ptr = y; + a_ptr = a; + x_ptr = x; + + if ( inc_x == 1 ) + xbuffer = x_ptr; + else + copy_x(NB,x_ptr,xbuffer,inc_x); + + + FLOAT *ap[4]; + BLASLONG register lda4 = 4 * lda; + ap[0] = a_ptr; + ap[1] = a_ptr + lda; + ap[2] = ap[1] + lda; + ap[3] = ap[2] + lda; + + for( i = 0; i < n1 ; i++) + { + sgemv_kernel_4x4(NB,ap,xbuffer,ybuffer); + ap[0] += lda4 ; + ap[1] += lda4 ; + ap[2] += lda4 ; + ap[3] += lda4 ; + a_ptr += lda4 ; + if ( inc_y == 1 ) + { + + __asm__ __volatile__ + ( + "movss (%0) , %%xmm10 \n\t" + "shufps 
$0 , %%xmm10 , %%xmm10 \n\t" + "movups (%1) , %%xmm12 \n\t" + "movups (%2) , %%xmm11 \n\t" + "mulps %%xmm10 , %%xmm12 \n\t" + + "addps %%xmm11 , %%xmm12 \n\t" + "movups %%xmm12, (%2) \n\t" + + : + : + "r" (&alpha), // 0 + "r" (ybuffer), // 1 + "r" (y_ptr) // 2 + : + "%xmm10", "%xmm11", "%xmm12", + "memory" + ); + + y_ptr += 4; + + } + else + { + *y_ptr += ybuffer[0]*alpha; + y_ptr += inc_y; + *y_ptr += ybuffer[1]*alpha; + y_ptr += inc_y; + *y_ptr += ybuffer[2]*alpha; + y_ptr += inc_y; + *y_ptr += ybuffer[3]*alpha; + y_ptr += inc_y; + } + } + + for( i = 0; i < n2 ; i++) + { + sgemv_kernel_4x1(NB,a_ptr,xbuffer,ybuffer); + a_ptr += 1 * lda; + *y_ptr += ybuffer[0]*alpha; + y_ptr += inc_y; + + } + a += NB; + x += NB * inc_x; + } + + if ( m3 == 0 ) return(0); + xbuffer = buffer; + x_ptr = x; + for ( i=0; i< m3; i++ ) + { + xbuffer[i] = *x_ptr; + x_ptr += inc_x; + } + j=0; + a_ptr = a; + y_ptr = y; + while ( j < n) + { + FLOAT temp = 0.0; + for( i = 0; i < m3; i++ ) + { + temp += a_ptr[i] * xbuffer[i]; + } + a_ptr += lda; + y_ptr[0] += alpha * temp; + y_ptr += inc_y; + j++; + } + return(0); +} + + diff --git a/kernel/x86_64/sgemv_t_microk_nehalem-4.c b/kernel/x86_64/sgemv_t_microk_nehalem-4.c new file mode 100644 index 000000000..4a167900e --- /dev/null +++ b/kernel/x86_64/sgemv_t_microk_nehalem-4.c @@ -0,0 +1,99 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. 
Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define HAVE_KERNEL_4x4 1 +static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) __attribute__ ((noinline)); + +static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) +{ + + BLASLONG register i = 0; + + __asm__ __volatile__ + ( + "xorps %%xmm4 , %%xmm4 \n\t" + "xorps %%xmm5 , %%xmm5 \n\t" + "xorps %%xmm6 , %%xmm6 \n\t" + "xorps %%xmm7 , %%xmm7 \n\t" + + ".align 16 \n\t" + ".L01LOOP%=: \n\t" + + "movups (%2,%0,4), %%xmm12 \n\t" // 4 * x + "movups (%4,%0,4), %%xmm8 \n\t" // 4 * a0 + "movups (%5,%0,4), %%xmm9 \n\t" // 4 * a1 + "movups (%6,%0,4), %%xmm10 \n\t" // 4 * a2 + "movups (%7,%0,4), %%xmm11 \n\t" // 4 * a3 + + "mulps %%xmm12, %%xmm8 \n\t" + "mulps %%xmm12, %%xmm9 \n\t" + "mulps %%xmm12, %%xmm10 \n\t" + "mulps %%xmm12, %%xmm11 \n\t" + "addps %%xmm8 , %%xmm4 \n\t" + "addq $4 , %0 \n\t" + "addps %%xmm9 , %%xmm5 \n\t" + "subq $4 , %1 \n\t" + "addps %%xmm10, %%xmm6 \n\t" + "addps %%xmm11, %%xmm7 \n\t" + + 
"jnz .L01LOOP%= \n\t" + + "haddps %%xmm4, %%xmm4 \n\t" + "haddps %%xmm5, %%xmm5 \n\t" + "haddps %%xmm6, %%xmm6 \n\t" + "haddps %%xmm7, %%xmm7 \n\t" + + "haddps %%xmm4, %%xmm4 \n\t" + "haddps %%xmm5, %%xmm5 \n\t" + "haddps %%xmm6, %%xmm6 \n\t" + "haddps %%xmm7, %%xmm7 \n\t" + + "movss %%xmm4, (%3) \n\t" + "movss %%xmm5, 4(%3) \n\t" + "movss %%xmm6, 8(%3) \n\t" + "movss %%xmm7, 12(%3) \n\t" + + : + : + "r" (i), // 0 + "r" (n), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (ap[0]), // 4 + "r" (ap[1]), // 5 + "r" (ap[2]), // 6 + "r" (ap[3]) // 7 + : "cc", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", + "memory" + ); + +} + + From e2fc8c8c2cd490d8774eb0d2b74e3060373a0199 Mon Sep 17 00:00:00 2001 From: wernsaar Date: Sat, 30 Aug 2014 13:58:02 +0200 Subject: [PATCH 036/119] changed 1 test value (bug in lapack-testing?) --- lapack-netlib/TESTING/dstest.in | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lapack-netlib/TESTING/dstest.in b/lapack-netlib/TESTING/dstest.in index 4a31076a6..b5a9f29f4 100644 --- a/lapack-netlib/TESTING/dstest.in +++ b/lapack-netlib/TESTING/dstest.in @@ -1,6 +1,6 @@ Data file for testing DSGESV/DSPOSV LAPACK routines 12 Number of values of M -0 1 2 13 17 45 78 91 101 119 120 132 values of M (row dimension) +0 1 2 13 17 45 78 91 101 119 112 132 values of M (row dimension) 6 Number of values of NRHS 1 2 14 15 16 13 Values of NRHS (number of right hand sides) 30.0 Threshold value of test ratio From 848c0f16f7740563be56dc11f2b6c10ef174024e Mon Sep 17 00:00:00 2001 From: wernsaar Date: Sun, 31 Aug 2014 13:23:44 +0200 Subject: [PATCH 037/119] optimized sgemv_t_4.c for small size --- kernel/x86_64/sgemv_t_4.c | 150 +++++++++++++++++++++++++++----------- 1 file changed, 108 insertions(+), 42 deletions(-) diff --git a/kernel/x86_64/sgemv_t_4.c b/kernel/x86_64/sgemv_t_4.c index e0eb9220b..cefbaccd4 100644 --- a/kernel/x86_64/sgemv_t_4.c +++ b/kernel/x86_64/sgemv_t_4.c @@ -64,6 +64,8 @@ static void 
sgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) #endif +static void sgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) __attribute__ ((noinline)); + static void sgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) { BLASLONG i; @@ -71,11 +73,51 @@ static void sgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) a0 = ap; FLOAT temp = 0.0; + if (n <=0 ) return; +/* for ( i=0; i< n; i+=4 ) { temp += a0[i]*x[i] + a0[i+1]*x[i+1] + a0[i+2]*x[i+2] + a0[i+3]*x[i+3]; } *y = temp; +*/ + + i=0; + + __asm__ __volatile__ + ( + "xorps %%xmm10 , %%xmm10 \n\t" + + ".align 16 \n\t" + ".L01LOOP%=: \n\t" + + "movups (%3,%0,4) , %%xmm12 \n\t" + "movups (%4,%0,4) , %%xmm11 \n\t" + "mulps %%xmm11 , %%xmm12 \n\t" + "addq $4 , %0 \n\t" + "addps %%xmm12 , %%xmm10 \n\t" + "subq $4 , %1 \n\t" + + "jnz .L01LOOP%= \n\t" + + "haddps %%xmm10, %%xmm10 \n\t" + "haddps %%xmm10, %%xmm10 \n\t" + + "movss %%xmm10, (%2) \n\t" + + : + : + "r" (i), // 0 + "r" (n), // 1 + "r" (y), // 2 + "r" (ap), // 3 + "r" (x) // 4 + : "cc", + "%xmm10", "%xmm11", "%xmm12", + "memory" + ); + + } static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src) @@ -89,6 +131,57 @@ static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src) } } +static void add_y(BLASLONG n, FLOAT da , FLOAT *src, FLOAT *dest, BLASLONG inc_dest) __attribute__ ((noinline)); + +static void add_y(BLASLONG n, FLOAT da , FLOAT *src, FLOAT *dest, BLASLONG inc_dest) +{ + + BLASLONG i; + + if ( inc_dest != 1 ) + { + for ( i=0; i> 2 ; n2 = n & 3 ; @@ -140,65 +235,36 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO FLOAT *ap[4]; + FLOAT *yp; BLASLONG register lda4 = 4 * lda; ap[0] = a_ptr; ap[1] = a_ptr + lda; ap[2] = ap[1] + lda; ap[3] = ap[2] + lda; + yp = ytemp; for( i = 0; i < n1 ; i++) { - sgemv_kernel_4x4(NB,ap,xbuffer,ybuffer); + sgemv_kernel_4x4(NB,ap,xbuffer,yp); ap[0] += lda4 ; ap[1] += lda4 ; ap[2] += lda4 ; ap[3] += lda4 ; - a_ptr += lda4 ; - if 
( inc_y == 1 ) - { - - __asm__ __volatile__ - ( - "movss (%0) , %%xmm10 \n\t" - "shufps $0 , %%xmm10 , %%xmm10 \n\t" - "movups (%1) , %%xmm12 \n\t" - "movups (%2) , %%xmm11 \n\t" - "mulps %%xmm10 , %%xmm12 \n\t" - - "addps %%xmm11 , %%xmm12 \n\t" - "movups %%xmm12, (%2) \n\t" - - : - : - "r" (&alpha), // 0 - "r" (ybuffer), // 1 - "r" (y_ptr) // 2 - : - "%xmm10", "%xmm11", "%xmm12", - "memory" - ); - - y_ptr += 4; - - } - else - { - *y_ptr += ybuffer[0]*alpha; - y_ptr += inc_y; - *y_ptr += ybuffer[1]*alpha; - y_ptr += inc_y; - *y_ptr += ybuffer[2]*alpha; - y_ptr += inc_y; - *y_ptr += ybuffer[3]*alpha; - y_ptr += inc_y; - } + yp += 4; + } + if ( n1 > 0 ) + { + add_y(n1*4, alpha, ytemp, y_ptr, inc_y ); + y_ptr += n1 * inc_y * 4; + a_ptr += n1 * lda4 ; } for( i = 0; i < n2 ; i++) { + sgemv_kernel_4x1(NB,a_ptr,xbuffer,ybuffer); - a_ptr += 1 * lda; - *y_ptr += ybuffer[0]*alpha; + a_ptr += lda; + *y_ptr += ybuffer[0] * alpha; y_ptr += inc_y; } From bc99faef1bf2e1a98e99dcf6cfba2ea58ae0a56e Mon Sep 17 00:00:00 2001 From: wernsaar Date: Sun, 31 Aug 2014 14:33:15 +0200 Subject: [PATCH 038/119] optimized sgemv_t_4.c for uneven sizes --- kernel/x86_64/sgemv_t_4.c | 28 ++++++++++++---------------- 1 file changed, 12 insertions(+), 16 deletions(-) diff --git a/kernel/x86_64/sgemv_t_4.c b/kernel/x86_64/sgemv_t_4.c index cefbaccd4..76187b57d 100644 --- a/kernel/x86_64/sgemv_t_4.c +++ b/kernel/x86_64/sgemv_t_4.c @@ -273,28 +273,24 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO } if ( m3 == 0 ) return(0); - xbuffer = buffer; + x_ptr = x; + a_ptr = a; for ( i=0; i< m3; i++ ) { - xbuffer[i] = *x_ptr; - x_ptr += inc_x; - } - j=0; - a_ptr = a; - y_ptr = y; - while ( j < n) - { - FLOAT temp = 0.0; - for( i = 0; i < m3; i++ ) + FLOAT xtemp = *x_ptr * alpha; + FLOAT *aj = a_ptr; + y_ptr = y; + for ( j=0; j Date: Sun, 31 Aug 2014 15:38:18 +0200 Subject: [PATCH 039/119] modified benchmark/gemv.c --- benchmark/gemv.c | 11 +++++++---- 1 file changed, 7 
insertions(+), 4 deletions(-) diff --git a/benchmark/gemv.c b/benchmark/gemv.c index e26a36ac1..c5db09d89 100644 --- a/benchmark/gemv.c +++ b/benchmark/gemv.c @@ -151,23 +151,26 @@ int MAIN__(int argc, char *argv[]){ if ((p = getenv("OPENBLAS_TRANS"))) trans=*p; if ((p = getenv("OPENBLAS_PARAM_N"))) { n = atoi(p); - if ((n>0) && (n<=to)) has_param_n = 1; + if ((n>0)) has_param_n = 1; } + int tomax = to; + if ( n > tomax ) tomax = n; + if ( has_param_n == 1 ) fprintf(stderr, "From : %3d To : %3d Step = %3d Trans = '%c' N = %d Inc_x = %d Inc_y = %d Loops = %d\n", from, to, step,trans,n,inc_x,inc_y,loops); else fprintf(stderr, "From : %3d To : %3d Step = %3d Trans = '%c' Inc_x = %d Inc_y = %d Loops = %d\n", from, to, step,trans,inc_x,inc_y,loops); - if (( a = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL){ + if (( a = (FLOAT *)malloc(sizeof(FLOAT) * tomax * tomax * COMPSIZE)) == NULL){ fprintf(stderr,"Out of Memory!!\n");exit(1); } - if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ + if (( x = (FLOAT *)malloc(sizeof(FLOAT) * tomax * abs(inc_x) * COMPSIZE)) == NULL){ fprintf(stderr,"Out of Memory!!\n");exit(1); } - if (( y = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == NULL){ + if (( y = (FLOAT *)malloc(sizeof(FLOAT) * tomax * abs(inc_y) * COMPSIZE)) == NULL){ fprintf(stderr,"Out of Memory!!\n");exit(1); } From d7f91f8b4f506b0e6071c61164a8e1c7ac8f32e9 Mon Sep 17 00:00:00 2001 From: wernsaar Date: Mon, 1 Sep 2014 15:07:36 +0200 Subject: [PATCH 040/119] extended gemv.c benchmark --- benchmark/gemv.c | 103 ++++++++++++++++++++++++++++++++--------------- 1 file changed, 70 insertions(+), 33 deletions(-) diff --git a/benchmark/gemv.c b/benchmark/gemv.c index c5db09d89..e21868259 100644 --- a/benchmark/gemv.c +++ b/benchmark/gemv.c @@ -128,6 +128,7 @@ int MAIN__(int argc, char *argv[]){ blasint inc_x=1,inc_y=1; blasint n=0; int has_param_n = 0; + int has_param_m = 0; int loops = 1; int l; char *p; @@ -145,6 
+146,9 @@ int MAIN__(int argc, char *argv[]){ if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} if (argc > 0) { step = atol(*argv); argc--; argv++;} + + int tomax = to; + if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); if ((p = getenv("OPENBLAS_INCY"))) inc_y = atoi(p); @@ -152,15 +156,18 @@ int MAIN__(int argc, char *argv[]){ if ((p = getenv("OPENBLAS_PARAM_N"))) { n = atoi(p); if ((n>0)) has_param_n = 1; + if ( n > tomax ) tomax = n; } + if ( has_param_n == 0 ) + if ((p = getenv("OPENBLAS_PARAM_M"))) { + m = atoi(p); + if ((m>0)) has_param_m = 1; + if ( m > tomax ) tomax = m; + } - int tomax = to; - if ( n > tomax ) tomax = n; - if ( has_param_n == 1 ) - fprintf(stderr, "From : %3d To : %3d Step = %3d Trans = '%c' N = %d Inc_x = %d Inc_y = %d Loops = %d\n", from, to, step,trans,n,inc_x,inc_y,loops); - else - fprintf(stderr, "From : %3d To : %3d Step = %3d Trans = '%c' Inc_x = %d Inc_y = %d Loops = %d\n", from, to, step,trans,inc_x,inc_y,loops); + + fprintf(stderr, "From : %3d To : %3d Step = %3d Trans = '%c' Inc_x = %d Inc_y = %d Loops = %d\n", from, to, step,trans,inc_x,inc_y,loops); if (( a = (FLOAT *)malloc(sizeof(FLOAT) * tomax * tomax * COMPSIZE)) == NULL){ fprintf(stderr,"Out of Memory!!\n");exit(1); @@ -180,50 +187,80 @@ int MAIN__(int argc, char *argv[]){ fprintf(stderr, " SIZE Flops\n"); - for(m = from; m <= to; m += step) + if (has_param_m == 0) { - timeg=0; + for(m = from; m <= to; m += step) + { + timeg=0; + if ( has_param_n == 0 ) n = m; + fprintf(stderr, " %6dx%d : ", (int)m,(int)n); + for(j = 0; j < m; j++){ + for(i = 0; i < n * COMPSIZE; i++){ + a[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; + } + } - if ( has_param_n == 0 ) n = m; + for (l=0; l Date: Mon, 1 Sep 2014 15:11:37 +0200 Subject: [PATCH 041/119] optimized sgemv_t_4.c for small sizes --- kernel/x86_64/sgemv_t_4.c | 123 ++++++++++++++++++++++++++++++++------ 1 file changed, 105 insertions(+), 18 
deletions(-) diff --git a/kernel/x86_64/sgemv_t_4.c b/kernel/x86_64/sgemv_t_4.c index 76187b57d..ae1279296 100644 --- a/kernel/x86_64/sgemv_t_4.c +++ b/kernel/x86_64/sgemv_t_4.c @@ -64,23 +64,63 @@ static void sgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) #endif +static void sgemv_kernel_4x2(BLASLONG n, FLOAT *ap0, FLOAT *ap1, FLOAT *x, FLOAT *y) __attribute__ ((noinline)); + +static void sgemv_kernel_4x2(BLASLONG n, FLOAT *ap0, FLOAT *ap1, FLOAT *x, FLOAT *y) +{ + BLASLONG i; + + i=0; + + __asm__ __volatile__ + ( + "xorps %%xmm10 , %%xmm10 \n\t" + "xorps %%xmm11 , %%xmm11 \n\t" + + ".align 16 \n\t" + ".L01LOOP%=: \n\t" + + "movups (%5,%0,4) , %%xmm14 \n\t" // x + "movups (%3,%0,4) , %%xmm12 \n\t" // ap0 + "movups (%4,%0,4) , %%xmm13 \n\t" // ap1 + "mulps %%xmm14 , %%xmm12 \n\t" + "mulps %%xmm14 , %%xmm13 \n\t" + "addq $4 , %0 \n\t" + "addps %%xmm12 , %%xmm10 \n\t" + "subq $4 , %1 \n\t" + "addps %%xmm13 , %%xmm11 \n\t" + + "jnz .L01LOOP%= \n\t" + + "haddps %%xmm10, %%xmm10 \n\t" + "haddps %%xmm11, %%xmm11 \n\t" + "haddps %%xmm10, %%xmm10 \n\t" + "haddps %%xmm11, %%xmm11 \n\t" + + "movss %%xmm10, (%2) \n\t" + "movss %%xmm11,4(%2) \n\t" + + : + : + "r" (i), // 0 + "r" (n), // 1 + "r" (y), // 2 + "r" (ap0), // 3 + "r" (ap1), // 4 + "r" (x) // 5 + : "cc", + "%xmm10", "%xmm11", "%xmm12", + "memory" + ); + + +} + static void sgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) __attribute__ ((noinline)); static void sgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) { BLASLONG i; - FLOAT *a0; - a0 = ap; - FLOAT temp = 0.0; - - if (n <=0 ) return; -/* - for ( i=0; i< n; i+=4 ) - { - temp += a0[i]*x[i] + a0[i+1]*x[i+1] + a0[i+2]*x[i+2] + a0[i+3]*x[i+3]; - } - *y = temp; -*/ i=0; @@ -259,7 +299,19 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO a_ptr += n1 * lda4 ; } - for( i = 0; i < n2 ; i++) + if ( n2 & 2 ) + { + + sgemv_kernel_4x2(NB,ap[0],ap[1],xbuffer,ybuffer); + a_ptr += lda * 2; + *y_ptr += 
ybuffer[0] * alpha; + y_ptr += inc_y; + *y_ptr += ybuffer[1] * alpha; + y_ptr += inc_y; + + } + + if ( n2 & 1 ) { sgemv_kernel_4x1(NB,a_ptr,xbuffer,ybuffer); @@ -276,20 +328,55 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO x_ptr = x; a_ptr = a; - for ( i=0; i< m3; i++ ) + if ( m3 == 3 ) { - FLOAT xtemp = *x_ptr * alpha; + FLOAT xtemp0 = *x_ptr * alpha; + x_ptr += inc_x; + FLOAT xtemp1 = *x_ptr * alpha; + x_ptr += inc_x; + FLOAT xtemp2 = *x_ptr * alpha; + FLOAT *aj = a_ptr; y_ptr = y; for ( j=0; j Date: Tue, 2 Sep 2014 12:42:36 +0200 Subject: [PATCH 042/119] optimized sgemv_t for bulldozer --- kernel/x86_64/KERNEL.BULLDOZER | 2 +- kernel/x86_64/sgemv_t_4.c | 6 +- kernel/x86_64/sgemv_t_microk_bulldozer-4.c | 147 +++++++++++++++++++++ 3 files changed, 152 insertions(+), 3 deletions(-) create mode 100644 kernel/x86_64/sgemv_t_microk_bulldozer-4.c diff --git a/kernel/x86_64/KERNEL.BULLDOZER b/kernel/x86_64/KERNEL.BULLDOZER index 6318b202c..346315aba 100644 --- a/kernel/x86_64/KERNEL.BULLDOZER +++ b/kernel/x86_64/KERNEL.BULLDOZER @@ -11,7 +11,7 @@ SSYMV_U_KERNEL = ssymv_U.c SSYMV_L_KERNEL = ssymv_L.c SGEMVNKERNEL = sgemv_n.c -SGEMVTKERNEL = sgemv_t.c +SGEMVTKERNEL = sgemv_t_4.c ZGEMVNKERNEL = zgemv_n_dup.S ZGEMVTKERNEL = zgemv_t.c diff --git a/kernel/x86_64/sgemv_t_4.c b/kernel/x86_64/sgemv_t_4.c index ae1279296..5568b98cc 100644 --- a/kernel/x86_64/sgemv_t_4.c +++ b/kernel/x86_64/sgemv_t_4.c @@ -30,6 +30,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#if defined(NEHALEM) #include "sgemv_t_microk_nehalem-4.c" +#elif defined(BULLDOZER) +#include "sgemv_t_microk_bulldozer-4.c" #endif #define NBMAX 4096 @@ -202,9 +204,9 @@ static void add_y(BLASLONG n, FLOAT da , FLOAT *src, FLOAT *dest, BLASLONG inc_d "movups (%4,%0,4) , %%xmm11 \n\t" "mulps %%xmm10 , %%xmm12 \n\t" "addq $4 , %0 \n\t" - "addps %%xmm11 , %%xmm12 \n\t" + "addps %%xmm12 , %%xmm11 \n\t" "subq $4 , %1 \n\t" - "movups %%xmm12, -16(%4,%0,4) \n\t" + "movups %%xmm11, -16(%4,%0,4) \n\t" "jnz .L01LOOP%= \n\t" diff --git a/kernel/x86_64/sgemv_t_microk_bulldozer-4.c b/kernel/x86_64/sgemv_t_microk_bulldozer-4.c new file mode 100644 index 000000000..40e318de3 --- /dev/null +++ b/kernel/x86_64/sgemv_t_microk_bulldozer-4.c @@ -0,0 +1,147 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define HAVE_KERNEL_4x4 1 +static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) __attribute__ ((noinline)); + +static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) +{ + + BLASLONG register i = 0; + + __asm__ __volatile__ + ( + "vxorps %%xmm4, %%xmm4, %%xmm4 \n\t" + "vxorps %%xmm5, %%xmm5, %%xmm5 \n\t" + "vxorps %%xmm6, %%xmm6, %%xmm6 \n\t" + "vxorps %%xmm7, %%xmm7, %%xmm7 \n\t" + + "testq $0x04, %1 \n\t" + "jz .L08LABEL%= \n\t" + + "vmovups (%2,%0,4), %%xmm12 \n\t" // 4 * x + "vfmaddps %%xmm4, (%4,%0,4), %%xmm12, %%xmm4 \n\t" + "vfmaddps %%xmm5, (%5,%0,4), %%xmm12, %%xmm5 \n\t" + "vfmaddps %%xmm6, (%6,%0,4), %%xmm12, %%xmm6 \n\t" + "vfmaddps %%xmm7, (%7,%0,4), %%xmm12, %%xmm7 \n\t" + "addq $4 , %0 \n\t" + "subq $4 , %1 \n\t" + + ".L08LABEL%=: \n\t" + + "testq $0x08, %1 \n\t" + "jz .L16LABEL%= \n\t" + + "vmovups (%2,%0,4), %%xmm12 \n\t" // 4 * x + "vmovups 16(%2,%0,4), %%xmm13 \n\t" // 4 * x + "vfmaddps %%xmm4, (%4,%0,4), %%xmm12, %%xmm4 \n\t" + "vfmaddps %%xmm5, (%5,%0,4), %%xmm12, %%xmm5 \n\t" + "vfmaddps %%xmm6, (%6,%0,4), %%xmm12, %%xmm6 \n\t" + "vfmaddps %%xmm7, (%7,%0,4), %%xmm12, %%xmm7 \n\t" + "vfmaddps %%xmm4, 16(%4,%0,4), %%xmm13, %%xmm4 \n\t" + "vfmaddps %%xmm5, 16(%5,%0,4), %%xmm13, %%xmm5 \n\t" + "vfmaddps %%xmm6, 16(%6,%0,4), %%xmm13, %%xmm6 \n\t" + "vfmaddps %%xmm7, 16(%7,%0,4), %%xmm13, %%xmm7 \n\t" + + "addq $8 , %0 
\n\t" + "subq $8 , %1 \n\t" + + ".L16LABEL%=: \n\t" + + "cmpq $0, %1 \n\t" + "je .L16END%= \n\t" + + ".align 16 \n\t" + ".L01LOOP%=: \n\t" + "vmovups (%2,%0,4), %%xmm12 \n\t" // 4 * x + + "prefetcht0 384(%4,%0,4) \n\t" + "vfmaddps %%xmm4, (%4,%0,4), %%xmm12, %%xmm4 \n\t" + "vfmaddps %%xmm5, (%5,%0,4), %%xmm12, %%xmm5 \n\t" + "vmovups 16(%2,%0,4), %%xmm13 \n\t" // 4 * x + "vfmaddps %%xmm6, (%6,%0,4), %%xmm12, %%xmm6 \n\t" + "vfmaddps %%xmm7, (%7,%0,4), %%xmm12, %%xmm7 \n\t" + "prefetcht0 384(%5,%0,4) \n\t" + ".align 2 \n\t" + "vfmaddps %%xmm4, 16(%4,%0,4), %%xmm13, %%xmm4 \n\t" + "vfmaddps %%xmm5, 16(%5,%0,4), %%xmm13, %%xmm5 \n\t" + "vmovups 32(%2,%0,4), %%xmm14 \n\t" // 4 * x + "vfmaddps %%xmm6, 16(%6,%0,4), %%xmm13, %%xmm6 \n\t" + "vfmaddps %%xmm7, 16(%7,%0,4), %%xmm13, %%xmm7 \n\t" + "prefetcht0 384(%6,%0,4) \n\t" + ".align 2 \n\t" + "vfmaddps %%xmm4, 32(%4,%0,4), %%xmm14, %%xmm4 \n\t" + "vfmaddps %%xmm5, 32(%5,%0,4), %%xmm14, %%xmm5 \n\t" + "vmovups 48(%2,%0,4), %%xmm15 \n\t" // 4 * x + "vfmaddps %%xmm6, 32(%6,%0,4), %%xmm14, %%xmm6 \n\t" + "vfmaddps %%xmm7, 32(%7,%0,4), %%xmm14, %%xmm7 \n\t" + "prefetcht0 384(%7,%0,4) \n\t" + "vfmaddps %%xmm4, 48(%4,%0,4), %%xmm15, %%xmm4 \n\t" + "addq $16, %0 \n\t" + "vfmaddps %%xmm5,-16(%5,%0,4), %%xmm15, %%xmm5 \n\t" + "vfmaddps %%xmm6,-16(%6,%0,4), %%xmm15, %%xmm6 \n\t" + "subq $16, %1 \n\t" + "vfmaddps %%xmm7,-16(%7,%0,4), %%xmm15, %%xmm7 \n\t" + + "jnz .L01LOOP%= \n\t" + + ".L16END%=: \n\t" + "vhaddps %%xmm4, %%xmm4, %%xmm4 \n\t" + "vhaddps %%xmm5, %%xmm5, %%xmm5 \n\t" + "vhaddps %%xmm6, %%xmm6, %%xmm6 \n\t" + "vhaddps %%xmm7, %%xmm7, %%xmm7 \n\t" + + "vhaddps %%xmm4, %%xmm4, %%xmm4 \n\t" + "vhaddps %%xmm5, %%xmm5, %%xmm5 \n\t" + "vhaddps %%xmm6, %%xmm6, %%xmm6 \n\t" + "vhaddps %%xmm7, %%xmm7, %%xmm7 \n\t" + + "vmovss %%xmm4, (%3) \n\t" + "vmovss %%xmm5, 4(%3) \n\t" + "vmovss %%xmm6, 8(%3) \n\t" + "vmovss %%xmm7, 12(%3) \n\t" + + : + : + "r" (i), // 0 + "r" (n), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (ap[0]), // 4 + 
"r" (ap[1]), // 5 + "r" (ap[2]), // 6 + "r" (ap[3]) // 7 + : "cc", + "%xmm4", "%xmm5", + "%xmm6", "%xmm7", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + +} + + From f3b50dcf5b1f06b8d778544f70d8e85e0f445090 Mon Sep 17 00:00:00 2001 From: wernsaar Date: Tue, 2 Sep 2014 13:35:41 +0200 Subject: [PATCH 043/119] removed obsolete instructions from sgemv_t_4.c --- kernel/x86_64/sgemv_t_4.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/kernel/x86_64/sgemv_t_4.c b/kernel/x86_64/sgemv_t_4.c index 5568b98cc..2d0648a6c 100644 --- a/kernel/x86_64/sgemv_t_4.c +++ b/kernel/x86_64/sgemv_t_4.c @@ -377,8 +377,6 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO y_ptr += inc_y; aj += lda; } - x_ptr += inc_x; - a_ptr++ ; return(0); } From 210bec9111c5252dfe600795c3ac63baaa060a9c Mon Sep 17 00:00:00 2001 From: wernsaar Date: Tue, 2 Sep 2014 14:11:42 +0200 Subject: [PATCH 044/119] added plot-header to compare multithreading --- benchmark/tplot-header | 42 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 42 insertions(+) create mode 100644 benchmark/tplot-header diff --git a/benchmark/tplot-header b/benchmark/tplot-header new file mode 100644 index 000000000..b7ce7f225 --- /dev/null +++ b/benchmark/tplot-header @@ -0,0 +1,42 @@ +# ********************************************************************************** +# Copyright (c) 2014, The OpenBLAS Project +# All rights reserved. +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: +# 1. Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# 2. Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# 3. 
Neither the name of the OpenBLAS project nor the names of +# its contributors may be used to endorse or promote products +# derived from this software without specific prior written permission. +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +# USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# ********************************************************************************** + +set term x11 font sans; +set ylabel "MFlops"; +set xlabel "Size"; +set grid xtics; +set grid ytics; +set key left; +set timestamp "generated on %Y-%m-%d by `whoami`" +set title "Sgemv\nTRANS=T\nBulldozer" +plot '1-THREAD' smooth bezier, '2-THREADS' smooth bezier, '4-THREADS' smooth bezier; +set output "print.png"; +show title; +show plot; +show output; + + From f4ff889491de5d95d24d9d4edcbd85b0f83ff380 Mon Sep 17 00:00:00 2001 From: wernsaar Date: Tue, 2 Sep 2014 16:30:04 +0200 Subject: [PATCH 045/119] updated interface/gemv.c for multithreading --- interface/gemv.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/interface/gemv.c b/interface/gemv.c index 08553ad21..3bcc099a5 100644 --- a/interface/gemv.c +++ b/interface/gemv.c @@ -215,8 +215,9 @@ void CNAME(enum CBLAS_ORDER order, int nthreads_max = num_cpu_avail(2); int nthreads_avail = nthreads_max; + double MNK = (double) m * (double) n; - if ( 
MNK <= (500.0 * 100.0 * (double) GEMM_MULTITHREAD_THRESHOLD) ) + if ( MNK <= (128.0 * 32.0 * (double) GEMM_MULTITHREAD_THRESHOLD) ) nthreads_max = 1; if ( nthreads_max > nthreads_avail ) From d1800397f592226cd0cb933303c09de325034412 Mon Sep 17 00:00:00 2001 From: wernsaar Date: Tue, 2 Sep 2014 17:36:07 +0200 Subject: [PATCH 046/119] optimized interface/gemv.c for multithreading --- interface/gemv.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/interface/gemv.c b/interface/gemv.c index 3bcc099a5..64dc641d0 100644 --- a/interface/gemv.c +++ b/interface/gemv.c @@ -215,9 +215,8 @@ void CNAME(enum CBLAS_ORDER order, int nthreads_max = num_cpu_avail(2); int nthreads_avail = nthreads_max; - double MNK = (double) m * (double) n; - if ( MNK <= (128.0 * 32.0 * (double) GEMM_MULTITHREAD_THRESHOLD) ) + if ( MNK <= (96.0 * 24.0 * (double) GEMM_MULTITHREAD_THRESHOLD) ) nthreads_max = 1; if ( nthreads_max > nthreads_avail ) From 0fc560ba239767098f05f2f13161b036b2eb805d Mon Sep 17 00:00:00 2001 From: wernsaar Date: Wed, 3 Sep 2014 10:13:47 +0200 Subject: [PATCH 047/119] bugfix for buffer overflow --- kernel/x86_64/sgemv_t_4.c | 30 +++++++++++++++++++++++++++++- 1 file changed, 29 insertions(+), 1 deletion(-) diff --git a/kernel/x86_64/sgemv_t_4.c b/kernel/x86_64/sgemv_t_4.c index 2d0648a6c..fb0ba9741 100644 --- a/kernel/x86_64/sgemv_t_4.c +++ b/kernel/x86_64/sgemv_t_4.c @@ -232,6 +232,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO FLOAT *a_ptr; FLOAT *x_ptr; FLOAT *y_ptr; + BLASLONG n0; BLASLONG n1; BLASLONG m1; BLASLONG m2; @@ -246,7 +247,8 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO xbuffer = buffer; ytemp = buffer + NBMAX; - n1 = n >> 2 ; + n0 = n / NBMAX; + n1 = (n % NBMAX) >> 2 ; n2 = n & 3 ; m3 = m & 3 ; @@ -283,6 +285,32 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO ap[1] = a_ptr + lda; ap[2] = ap[1] + lda; ap[3] = ap[2] + lda; + + if ( 
n0 > 0 ) + { + BLASLONG nb1 = NBMAX / 4; + for( j=0; j Date: Wed, 3 Sep 2014 14:48:45 +0200 Subject: [PATCH 048/119] optimized sgemv_n for small sizes --- kernel/x86_64/KERNEL.NEHALEM | 2 +- kernel/x86_64/sgemv_n_4.c | 319 +++++++++++++++++++++++ kernel/x86_64/sgemv_n_microk_nehalem-4.c | 185 +++++++++++++ 3 files changed, 505 insertions(+), 1 deletion(-) create mode 100644 kernel/x86_64/sgemv_n_4.c create mode 100644 kernel/x86_64/sgemv_n_microk_nehalem-4.c diff --git a/kernel/x86_64/KERNEL.NEHALEM b/kernel/x86_64/KERNEL.NEHALEM index 00c3b4d15..68c741cea 100644 --- a/kernel/x86_64/KERNEL.NEHALEM +++ b/kernel/x86_64/KERNEL.NEHALEM @@ -9,7 +9,7 @@ DSYMV_L_KERNEL = dsymv_L.c SSYMV_U_KERNEL = ssymv_U.c SSYMV_L_KERNEL = ssymv_L.c -SGEMVNKERNEL = sgemv_n.c +SGEMVNKERNEL = sgemv_n_4.c SGEMVTKERNEL = sgemv_t_4.c DGEMVNKERNEL = dgemv_n.c diff --git a/kernel/x86_64/sgemv_n_4.c b/kernel/x86_64/sgemv_n_4.c new file mode 100644 index 000000000..f84016075 --- /dev/null +++ b/kernel/x86_64/sgemv_n_4.c @@ -0,0 +1,319 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + + +#include "common.h" + + +#if defined(BULLDOZER) || defined(PILEDRIVER) +#include "sgemv_n_microk_bulldozer-4.c" +#elif defined(NEHALEM) +#include "sgemv_n_microk_nehalem-4.c" +#endif + + +#define NBMAX 4096 + +#ifndef HAVE_KERNEL_4x8 + +static void sgemv_kernel_4x8(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLONG lda4) +{ + BLASLONG i; + FLOAT *a0,*a1,*a2,*a3; + FLOAT *b0,*b1,*b2,*b3; + FLOAT *x4; + a0 = ap[0]; + a1 = ap[1]; + a2 = ap[2]; + a3 = ap[3]; + b0 = a0 + lda4 ; + b1 = a1 + lda4 ; + b2 = a2 + lda4 ; + b3 = a3 + lda4 ; + x4 = x + 4; + + for ( i=0; i< n; i+=4 ) + { + + y[i] += a0[i]*x[0] + a1[i]*x[1] + a2[i]*x[2] + a3[i]*x[3]; + y[i+1] += a0[i+1]*x[0] + a1[i+1]*x[1] + a2[i+1]*x[2] + a3[i+1]*x[3]; + y[i+2] += a0[i+2]*x[0] + a1[i+2]*x[1] + a2[i+2]*x[2] + a3[i+2]*x[3]; + y[i+3] += a0[i+3]*x[0] + a1[i+3]*x[1] + a2[i+3]*x[2] + a3[i+3]*x[3]; + + y[i] += b0[i]*x4[0] + b1[i]*x4[1] + b2[i]*x4[2] + b3[i]*x4[3]; + y[i+1] += b0[i+1]*x4[0] + b1[i+1]*x4[1] + b2[i+1]*x4[2] + b3[i+1]*x4[3]; + y[i+2] += b0[i+2]*x4[0] + b1[i+2]*x4[1] + b2[i+2]*x4[2] + b3[i+2]*x4[3]; + y[i+3] += b0[i+3]*x4[0] + b1[i+3]*x4[1] + b2[i+3]*x4[2] + 
b3[i+3]*x4[3]; + + } +} + +#endif + + +#ifndef HAVE_KERNEL_4x4 + +static void sgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) +{ + BLASLONG i; + FLOAT *a0,*a1,*a2,*a3; + a0 = ap[0]; + a1 = ap[1]; + a2 = ap[2]; + a3 = ap[3]; + + for ( i=0; i< n; i+=4 ) + { + y[i] += a0[i]*x[0] + a1[i]*x[1] + a2[i]*x[2] + a3[i]*x[3]; + y[i+1] += a0[i+1]*x[0] + a1[i+1]*x[1] + a2[i+1]*x[2] + a3[i+1]*x[3]; + y[i+2] += a0[i+2]*x[0] + a1[i+2]*x[1] + a2[i+2]*x[2] + a3[i+2]*x[3]; + y[i+3] += a0[i+3]*x[0] + a1[i+3]*x[1] + a2[i+3]*x[2] + a3[i+3]*x[3]; + } +} + +#endif + +static void sgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) +{ + BLASLONG i; + FLOAT *a0; + a0 = ap; + + for ( i=0; i< n; i+=4 ) + { + y[i] += a0[i]*x[0]; + y[i+1] += a0[i+1]*x[0]; + y[i+2] += a0[i+2]*x[0]; + y[i+3] += a0[i+3]*x[0]; + } +} + +static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest, FLOAT *alpha) __attribute__ ((noinline)); + +static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest, FLOAT *alpha) +{ + BLASLONG i; + if ( inc_dest != 1 ) + { + FLOAT da = *alpha; + for ( i=0; i Date: Wed, 3 Sep 2014 15:34:30 +0200 Subject: [PATCH 049/119] optimized sgemv_n_4.c --- kernel/x86_64/sgemv_n_4.c | 34 ++++++++++++++---------- kernel/x86_64/sgemv_n_microk_nehalem-4.c | 5 +++- 2 files changed, 24 insertions(+), 15 deletions(-) diff --git a/kernel/x86_64/sgemv_n_4.c b/kernel/x86_64/sgemv_n_4.c index f84016075..31d841ddd 100644 --- a/kernel/x86_64/sgemv_n_4.c +++ b/kernel/x86_64/sgemv_n_4.c @@ -174,9 +174,10 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO BLASLONG n1; BLASLONG m1; BLASLONG m2; + BLASLONG m3; BLASLONG n2; - BLASLONG lda4 = 4 * lda; - BLASLONG lda8 = 8 * lda; + BLASLONG lda4 = lda << 2; + BLASLONG lda8 = lda << 3; FLOAT xbuffer[8],*ybuffer; if ( m < 1 ) return(0); @@ -186,19 +187,21 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO if ( inc_x == 1 ) { - n1 = n / 8 ; - n2 = n % 8 ; + 
n1 = n >> 3 ; + n2 = n & 7 ; } else { - n1 = n / 4 ; - n2 = n % 4 ; + n1 = n >> 2 ; + n2 = n & 3 ; } - m1 = m - ( m % 4 ); - m2 = (m % NBMAX) - (m % 4) ; - + m3 = m & 3 ; + m1 = m & -4 ; + m2 = (m & (NBMAX-1)) - m3 ; + + y_ptr = y; BLASLONG NB = NBMAX; @@ -237,8 +240,8 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO x_ptr += 8; } -/* - for( i = 0; i < n1 ; i++) + + if ( n2 & 4 ) { sgemv_kernel_4x4(NB,ap,x_ptr,ybuffer); ap[0] += lda4; @@ -248,8 +251,8 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO a_ptr += lda4; x_ptr += 4; } -*/ - for( i = 0; i < n2 ; i++) + + for( i = 0; i < ( n2 & 3 ) ; i++) { xbuffer[0] = x_ptr[0]; x_ptr += inc_x; @@ -296,8 +299,11 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO a += NB; y_ptr += NB * inc_y; } + + if ( m3 == 0 ) return; + j=0; - while ( j < (m % 4)) + while ( j < m3 ) { a_ptr = a; x_ptr = x; diff --git a/kernel/x86_64/sgemv_n_microk_nehalem-4.c b/kernel/x86_64/sgemv_n_microk_nehalem-4.c index accc529b3..f87cfa425 100644 --- a/kernel/x86_64/sgemv_n_microk_nehalem-4.c +++ b/kernel/x86_64/sgemv_n_microk_nehalem-4.c @@ -58,13 +58,15 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO ".align 16 \n\t" ".L01LOOP%=: \n\t" - "movups (%3,%0,4), %%xmm4 \n\t" // 4 * y "xorps %%xmm5 , %%xmm5 \n\t" + "movups (%3,%0,4), %%xmm4 \n\t" // 4 * y + ".align 2 \n\t" "movups (%4,%0,4), %%xmm8 \n\t" "movups (%5,%0,4), %%xmm9 \n\t" "movups (%6,%0,4), %%xmm10 \n\t" "movups (%7,%0,4), %%xmm11 \n\t" + ".align 2 \n\t" "mulps %%xmm12, %%xmm8 \n\t" "mulps %%xmm13, %%xmm9 \n\t" "mulps %%xmm14, %%xmm10 \n\t" @@ -78,6 +80,7 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO "movups (%5,%8,4), %%xmm9 \n\t" "movups (%6,%8,4), %%xmm10 \n\t" "movups (%7,%8,4), %%xmm11 \n\t" + ".align 2 \n\t" "mulps %%xmm0 , %%xmm8 \n\t" "mulps %%xmm1 , %%xmm9 \n\t" "mulps %%xmm2 , %%xmm10 \n\t" From 
7f910010a08f84b6ed74149f6cdcaaa71ca7f09b Mon Sep 17 00:00:00 2001 From: wernsaar Date: Thu, 4 Sep 2014 13:09:27 +0200 Subject: [PATCH 050/119] optimized sgemv_n kernel for small sizes --- kernel/x86_64/sgemv_n_4.c | 41 +++- kernel/x86_64/sgemv_n_microk_bulldozer-4.c | 254 +++++++++++++++++++++ 2 files changed, 283 insertions(+), 12 deletions(-) create mode 100644 kernel/x86_64/sgemv_n_microk_bulldozer-4.c diff --git a/kernel/x86_64/sgemv_n_4.c b/kernel/x86_64/sgemv_n_4.c index 31d841ddd..f1573dd30 100644 --- a/kernel/x86_64/sgemv_n_4.c +++ b/kernel/x86_64/sgemv_n_4.c @@ -185,17 +185,8 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO ybuffer = buffer; - if ( inc_x == 1 ) - { - n1 = n >> 3 ; - n2 = n & 7 ; - } - else - { - n1 = n >> 2 ; - n2 = n & 3 ; - - } + n1 = n >> 3 ; + n2 = n & 7 ; m3 = m & 3 ; m1 = m & -4 ; @@ -267,6 +258,32 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO { for( i = 0; i < n1 ; i++) + { + xbuffer[0] = x_ptr[0]; + x_ptr += inc_x; + xbuffer[1] = x_ptr[0]; + x_ptr += inc_x; + xbuffer[2] = x_ptr[0]; + x_ptr += inc_x; + xbuffer[3] = x_ptr[0]; + x_ptr += inc_x; + xbuffer[4] = x_ptr[0]; + x_ptr += inc_x; + xbuffer[5] = x_ptr[0]; + x_ptr += inc_x; + xbuffer[6] = x_ptr[0]; + x_ptr += inc_x; + xbuffer[7] = x_ptr[0]; + x_ptr += inc_x; + sgemv_kernel_4x8(NB,ap,x_ptr,ybuffer,lda4); + ap[0] += lda8; + ap[1] += lda8; + ap[2] += lda8; + ap[3] += lda8; + a_ptr += lda8; + } + + if ( n2 & 4 ) { xbuffer[0] = x_ptr[0]; x_ptr += inc_x; @@ -284,7 +301,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO a_ptr += lda4; } - for( i = 0; i < n2 ; i++) + for( i = 0; i < ( n2 & 3) ; i++) { xbuffer[0] = x_ptr[0]; x_ptr += inc_x; diff --git a/kernel/x86_64/sgemv_n_microk_bulldozer-4.c b/kernel/x86_64/sgemv_n_microk_bulldozer-4.c new file mode 100644 index 000000000..53287df75 --- /dev/null +++ b/kernel/x86_64/sgemv_n_microk_bulldozer-4.c @@ -0,0 +1,254 @@ 
+/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + + + +#define HAVE_KERNEL_4x8 1 +static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLONG lda4) __attribute__ ((noinline)); + +static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLONG lda4) +{ + + BLASLONG register i = 0; + + __asm__ __volatile__ + ( + "vbroadcastss (%2), %%xmm12 \n\t" // x0 + "vbroadcastss 4(%2), %%xmm13 \n\t" // x1 + "vbroadcastss 8(%2), %%xmm14 \n\t" // x2 + "vbroadcastss 12(%2), %%xmm15 \n\t" // x3 + "vbroadcastss 16(%2), %%xmm0 \n\t" // x4 + "vbroadcastss 20(%2), %%xmm1 \n\t" // x5 + "vbroadcastss 24(%2), %%xmm2 \n\t" // x6 + "vbroadcastss 28(%2), %%xmm3 \n\t" // x7 + + "testq $0x04, %1 \n\t" + "jz .L08LABEL%= \n\t" + + "vxorps %%xmm5, %%xmm5 , %%xmm5 \n\t" + "vmovups (%3,%0,4), %%xmm4 \n\t" // 4 * y + + "vfmaddps %%xmm4, (%4,%0,4), %%xmm12, %%xmm4 \n\t" + "vfmaddps %%xmm5, (%5,%0,4), %%xmm13, %%xmm5 \n\t" + "vfmaddps %%xmm4, (%6,%0,4), %%xmm14, %%xmm4 \n\t" + "vfmaddps %%xmm5, (%7,%0,4), %%xmm15, %%xmm5 \n\t" + "addq $4 , %0 \n\t" + + "vfmaddps %%xmm4, (%4,%8,4), %%xmm0 , %%xmm4 \n\t" + "vfmaddps %%xmm5, (%5,%8,4), %%xmm1 , %%xmm5 \n\t" + "vfmaddps %%xmm4, (%6,%8,4), %%xmm2 , %%xmm4 \n\t" + "vfmaddps %%xmm5, (%7,%8,4), %%xmm3 , %%xmm5 \n\t" + "addq $4 , %8 \n\t" + + "vaddps %%xmm4, %%xmm5, %%xmm6 \n\t" + "vmovups %%xmm6, -16(%3,%0,4) \n\t" // 4 * y + + "subq $4 , %1 \n\t" + + ".L08LABEL%=: \n\t" + + "testq $0x08, %1 \n\t" + "jz .L16LABEL%= \n\t" + + "vmovups (%3,%0,4), %%xmm4 \n\t" // 4 * y + "vmovups 16(%3,%0,4), %%xmm5 \n\t" // 4 * y + + "vfmaddps %%xmm4, (%4,%0,4), %%xmm12, %%xmm4 \n\t" + "vfmaddps %%xmm5, 16(%4,%0,4), %%xmm12, %%xmm5 \n\t" + "vfmaddps %%xmm4, (%5,%0,4), %%xmm13, %%xmm4 \n\t" + "vfmaddps %%xmm5, 16(%5,%0,4), %%xmm13, %%xmm5 \n\t" + "vfmaddps %%xmm4, (%6,%0,4), %%xmm14, %%xmm4 \n\t" + "vfmaddps %%xmm5, 16(%6,%0,4), %%xmm14, %%xmm5 \n\t" + "vfmaddps %%xmm4, (%7,%0,4), %%xmm15, %%xmm4 \n\t" + 
"vfmaddps %%xmm5, 16(%7,%0,4), %%xmm15, %%xmm5 \n\t" + + "vfmaddps %%xmm4, (%4,%8,4), %%xmm0 , %%xmm4 \n\t" + "vfmaddps %%xmm5, 16(%4,%8,4), %%xmm0 , %%xmm5 \n\t" + "vfmaddps %%xmm4, (%5,%8,4), %%xmm1 , %%xmm4 \n\t" + "vfmaddps %%xmm5, 16(%5,%8,4), %%xmm1 , %%xmm5 \n\t" + "vfmaddps %%xmm4, (%6,%8,4), %%xmm2 , %%xmm4 \n\t" + "vfmaddps %%xmm5, 16(%6,%8,4), %%xmm2 , %%xmm5 \n\t" + "vfmaddps %%xmm4, (%7,%8,4), %%xmm3 , %%xmm4 \n\t" + "vfmaddps %%xmm5, 16(%7,%8,4), %%xmm3 , %%xmm5 \n\t" + + "vmovups %%xmm4, (%3,%0,4) \n\t" // 4 * y + "vmovups %%xmm5, 16(%3,%0,4) \n\t" // 4 * y + + "addq $8 , %0 \n\t" + "addq $8 , %8 \n\t" + "subq $8 , %1 \n\t" + + + ".L16LABEL%=: \n\t" + + "cmpq $0, %1 \n\t" + "je .L16END%= \n\t" + + ".align 16 \n\t" + ".L01LOOP%=: \n\t" + + "vmovups (%3,%0,4), %%xmm4 \n\t" // 4 * y + ".align 2 \n\t" + "vmovups 16(%3,%0,4), %%xmm5 \n\t" // 4 * y + "vmovups 32(%3,%0,4), %%xmm6 \n\t" // 4 * y + "vmovups 48(%3,%0,4), %%xmm7 \n\t" // 4 * y + + "prefetcht0 192(%4,%0,4) \n\t" + "vfmaddps %%xmm4, (%4,%0,4), %%xmm12, %%xmm4 \n\t" + "vfmaddps %%xmm5, 16(%4,%0,4), %%xmm12, %%xmm5 \n\t" + "prefetcht0 192(%5,%0,4) \n\t" + "vfmaddps %%xmm4, (%5,%0,4), %%xmm13, %%xmm4 \n\t" + "vfmaddps %%xmm5, 16(%5,%0,4), %%xmm13, %%xmm5 \n\t" + "prefetcht0 192(%6,%0,4) \n\t" + "vfmaddps %%xmm4, (%6,%0,4), %%xmm14, %%xmm4 \n\t" + "vfmaddps %%xmm5, 16(%6,%0,4), %%xmm14, %%xmm5 \n\t" + "prefetcht0 192(%7,%0,4) \n\t" + "vfmaddps %%xmm4, (%7,%0,4), %%xmm15, %%xmm4 \n\t" + ".align 2 \n\t" + "vfmaddps %%xmm5, 16(%7,%0,4), %%xmm15, %%xmm5 \n\t" + + "vfmaddps %%xmm6, 32(%4,%0,4), %%xmm12, %%xmm6 \n\t" + "vfmaddps %%xmm7, 48(%4,%0,4), %%xmm12, %%xmm7 \n\t" + "vfmaddps %%xmm6, 32(%5,%0,4), %%xmm13, %%xmm6 \n\t" + "vfmaddps %%xmm7, 48(%5,%0,4), %%xmm13, %%xmm7 \n\t" + "vfmaddps %%xmm6, 32(%6,%0,4), %%xmm14, %%xmm6 \n\t" + "vfmaddps %%xmm7, 48(%6,%0,4), %%xmm14, %%xmm7 \n\t" + "vfmaddps %%xmm6, 32(%7,%0,4), %%xmm15, %%xmm6 \n\t" + "vfmaddps %%xmm7, 48(%7,%0,4), %%xmm15, %%xmm7 \n\t" + + 
"prefetcht0 192(%4,%8,4) \n\t" + "vfmaddps %%xmm4, (%4,%8,4), %%xmm0 , %%xmm4 \n\t" + "vfmaddps %%xmm5, 16(%4,%8,4), %%xmm0 , %%xmm5 \n\t" + "prefetcht0 192(%5,%8,4) \n\t" + "vfmaddps %%xmm4, (%5,%8,4), %%xmm1 , %%xmm4 \n\t" + "vfmaddps %%xmm5, 16(%5,%8,4), %%xmm1 , %%xmm5 \n\t" + "prefetcht0 192(%6,%8,4) \n\t" + "vfmaddps %%xmm4, (%6,%8,4), %%xmm2 , %%xmm4 \n\t" + "vfmaddps %%xmm5, 16(%6,%8,4), %%xmm2 , %%xmm5 \n\t" + "prefetcht0 192(%7,%8,4) \n\t" + "vfmaddps %%xmm4, (%7,%8,4), %%xmm3 , %%xmm4 \n\t" + "vfmaddps %%xmm5, 16(%7,%8,4), %%xmm3 , %%xmm5 \n\t" + + "vfmaddps %%xmm6, 32(%4,%8,4), %%xmm0 , %%xmm6 \n\t" + "vfmaddps %%xmm7, 48(%4,%8,4), %%xmm0 , %%xmm7 \n\t" + "vfmaddps %%xmm6, 32(%5,%8,4), %%xmm1 , %%xmm6 \n\t" + "vfmaddps %%xmm7, 48(%5,%8,4), %%xmm1 , %%xmm7 \n\t" + "addq $16, %0 \n\t" + "vfmaddps %%xmm6, 32(%6,%8,4), %%xmm2 , %%xmm6 \n\t" + "vfmaddps %%xmm7, 48(%6,%8,4), %%xmm2 , %%xmm7 \n\t" + "vfmaddps %%xmm6, 32(%7,%8,4), %%xmm3 , %%xmm6 \n\t" + "vfmaddps %%xmm7, 48(%7,%8,4), %%xmm3 , %%xmm7 \n\t" + + "addq $16, %8 \n\t" + "vmovups %%xmm4,-64(%3,%0,4) \n\t" // 4 * y + "vmovups %%xmm5,-48(%3,%0,4) \n\t" // 4 * y + "subq $16, %1 \n\t" + "vmovups %%xmm6,-32(%3,%0,4) \n\t" // 4 * y + "vmovups %%xmm7,-16(%3,%0,4) \n\t" // 4 * y + + "jnz .L01LOOP%= \n\t" + + ".L16END%=: \n\t" + + : + : + "r" (i), // 0 + "r" (n), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (ap[0]), // 4 + "r" (ap[1]), // 5 + "r" (ap[2]), // 6 + "r" (ap[3]), // 7 + "r" (lda4) // 8 + : "cc", + "%xmm0", "%xmm1", + "%xmm2", "%xmm3", + "%xmm4", "%xmm5", + "%xmm6", "%xmm7", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + +} + + + + +#define HAVE_KERNEL_4x4 1 +static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) __attribute__ ((noinline)); + +static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) +{ + + BLASLONG register i = 0; + + __asm__ __volatile__ + ( + "vbroadcastss (%2), %%xmm12 \n\t" // x0 + "vbroadcastss 4(%2), %%xmm13 \n\t" // x1 + 
"vbroadcastss 8(%2), %%xmm14 \n\t" // x2 + "vbroadcastss 12(%2), %%xmm15 \n\t" // x3 + + ".align 16 \n\t" + ".L01LOOP%=: \n\t" + "vmovups (%3,%0,4), %%xmm4 \n\t" // 4 * y + "vxorps %%xmm5, %%xmm5 , %%xmm5 \n\t" + + "vfmaddps %%xmm4, (%4,%0,4), %%xmm12, %%xmm4 \n\t" + "vfmaddps %%xmm5, (%5,%0,4), %%xmm13, %%xmm5 \n\t" + "vfmaddps %%xmm4, (%6,%0,4), %%xmm14, %%xmm4 \n\t" + "vfmaddps %%xmm5, (%7,%0,4), %%xmm15, %%xmm5 \n\t" + + "vaddps %%xmm4, %%xmm5, %%xmm6 \n\t" + + "vmovups %%xmm6, (%3,%0,4) \n\t" // 4 * y + + "addq $4 , %0 \n\t" + "subq $4 , %1 \n\t" + "jnz .L01LOOP%= \n\t" + + : + : + "r" (i), // 0 + "r" (n), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (ap[0]), // 4 + "r" (ap[1]), // 5 + "r" (ap[2]), // 6 + "r" (ap[3]) // 7 + : "cc", + "%xmm4", "%xmm5", + "%xmm6", "%xmm7", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + +} + + From 53de943690abc2f500ae131627136c9fbd35e541 Mon Sep 17 00:00:00 2001 From: wernsaar Date: Thu, 4 Sep 2014 18:55:52 +0200 Subject: [PATCH 051/119] bugfix for sgemv_n_4.c --- kernel/x86_64/KERNEL.BULLDOZER | 2 +- kernel/x86_64/sgemv_n_4.c | 41 ++++++++++------------------------ 2 files changed, 13 insertions(+), 30 deletions(-) diff --git a/kernel/x86_64/KERNEL.BULLDOZER b/kernel/x86_64/KERNEL.BULLDOZER index 346315aba..0fd7ac35f 100644 --- a/kernel/x86_64/KERNEL.BULLDOZER +++ b/kernel/x86_64/KERNEL.BULLDOZER @@ -10,7 +10,7 @@ DSYMV_L_KERNEL = dsymv_L.c SSYMV_U_KERNEL = ssymv_U.c SSYMV_L_KERNEL = ssymv_L.c -SGEMVNKERNEL = sgemv_n.c +SGEMVNKERNEL = sgemv_n_4.c SGEMVTKERNEL = sgemv_t_4.c ZGEMVNKERNEL = zgemv_n_dup.S diff --git a/kernel/x86_64/sgemv_n_4.c b/kernel/x86_64/sgemv_n_4.c index f1573dd30..31d841ddd 100644 --- a/kernel/x86_64/sgemv_n_4.c +++ b/kernel/x86_64/sgemv_n_4.c @@ -185,8 +185,17 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO ybuffer = buffer; - n1 = n >> 3 ; - n2 = n & 7 ; + if ( inc_x == 1 ) + { + n1 = n >> 3 ; + n2 = n & 7 ; + } + else + { + n1 = n >> 2 ; + n2 = n & 3 ; + + } 
m3 = m & 3 ; m1 = m & -4 ; @@ -258,32 +267,6 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO { for( i = 0; i < n1 ; i++) - { - xbuffer[0] = x_ptr[0]; - x_ptr += inc_x; - xbuffer[1] = x_ptr[0]; - x_ptr += inc_x; - xbuffer[2] = x_ptr[0]; - x_ptr += inc_x; - xbuffer[3] = x_ptr[0]; - x_ptr += inc_x; - xbuffer[4] = x_ptr[0]; - x_ptr += inc_x; - xbuffer[5] = x_ptr[0]; - x_ptr += inc_x; - xbuffer[6] = x_ptr[0]; - x_ptr += inc_x; - xbuffer[7] = x_ptr[0]; - x_ptr += inc_x; - sgemv_kernel_4x8(NB,ap,x_ptr,ybuffer,lda4); - ap[0] += lda8; - ap[1] += lda8; - ap[2] += lda8; - ap[3] += lda8; - a_ptr += lda8; - } - - if ( n2 & 4 ) { xbuffer[0] = x_ptr[0]; x_ptr += inc_x; @@ -301,7 +284,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO a_ptr += lda4; } - for( i = 0; i < ( n2 & 3) ; i++) + for( i = 0; i < n2 ; i++) { xbuffer[0] = x_ptr[0]; x_ptr += inc_x; From 6df7a8893078e2f9878efeb7212fb7030185cf37 Mon Sep 17 00:00:00 2001 From: wernsaar Date: Fri, 5 Sep 2014 10:22:50 +0200 Subject: [PATCH 052/119] optimized sgemv_t for sandybridge --- kernel/x86_64/KERNEL.SANDYBRIDGE | 2 +- kernel/x86_64/sgemv_t_4.c | 2 + kernel/x86_64/sgemv_t_microk_sandy-4.c | 174 +++++++++++++++++++++++++ 3 files changed, 177 insertions(+), 1 deletion(-) create mode 100644 kernel/x86_64/sgemv_t_microk_sandy-4.c diff --git a/kernel/x86_64/KERNEL.SANDYBRIDGE b/kernel/x86_64/KERNEL.SANDYBRIDGE index b654d3564..b70486436 100644 --- a/kernel/x86_64/KERNEL.SANDYBRIDGE +++ b/kernel/x86_64/KERNEL.SANDYBRIDGE @@ -1,5 +1,5 @@ SGEMVNKERNEL = sgemv_n.c -SGEMVTKERNEL = sgemv_t.c +SGEMVTKERNEL = sgemv_t_4.c ZGEMVNKERNEL = zgemv_n.c diff --git a/kernel/x86_64/sgemv_t_4.c b/kernel/x86_64/sgemv_t_4.c index fb0ba9741..b89ec7f7f 100644 --- a/kernel/x86_64/sgemv_t_4.c +++ b/kernel/x86_64/sgemv_t_4.c @@ -32,6 +32,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "sgemv_t_microk_nehalem-4.c" #elif defined(BULLDOZER) #include "sgemv_t_microk_bulldozer-4.c" +#elif defined(SANDYBRIDGE) +#include "sgemv_t_microk_sandy-4.c" #endif #define NBMAX 4096 diff --git a/kernel/x86_64/sgemv_t_microk_sandy-4.c b/kernel/x86_64/sgemv_t_microk_sandy-4.c new file mode 100644 index 000000000..6550518f7 --- /dev/null +++ b/kernel/x86_64/sgemv_t_microk_sandy-4.c @@ -0,0 +1,174 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#define HAVE_KERNEL_4x4 1 +static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) __attribute__ ((noinline)); + +static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) +{ + + BLASLONG register i = 0; + + __asm__ __volatile__ + ( + "vzeroupper \n\t" + "vxorps %%ymm0 , %%ymm0, %%ymm0 \n\t" + "vxorps %%ymm1 , %%ymm1, %%ymm1 \n\t" + "vxorps %%ymm2 , %%ymm2, %%ymm2 \n\t" + "vxorps %%ymm3 , %%ymm3, %%ymm3 \n\t" + "vxorps %%ymm4 , %%ymm4, %%ymm4 \n\t" + "vxorps %%ymm5 , %%ymm5, %%ymm5 \n\t" + "vxorps %%ymm6 , %%ymm6, %%ymm6 \n\t" + "vxorps %%ymm7 , %%ymm7, %%ymm7 \n\t" + + "testq $0x04, %1 \n\t" + "jz .L08LABEL%= \n\t" + + "vmovups (%2,%0,4), %%xmm12 \n\t" // 4 * x + + "vmulps (%4,%0,4), %%xmm12, %%xmm8 \n\t" + "vmulps (%5,%0,4), %%xmm12, %%xmm10 \n\t" + "vmulps (%6,%0,4), %%xmm12, %%xmm9 \n\t" + "vmulps (%7,%0,4), %%xmm12, %%xmm11 \n\t" + "vaddps %%xmm4, %%xmm8 , %%xmm4 \n\t" + "addq $4 , %0 \n\t" + "vaddps %%xmm5, %%xmm10, %%xmm5 \n\t" + "vaddps %%xmm6, %%xmm9 , %%xmm6 \n\t" + "subq $4 , %1 \n\t" + "vaddps %%xmm7, %%xmm11, %%xmm7 \n\t" + + ".L08LABEL%=: \n\t" + + "testq $0x08, %1 \n\t" + "jz .L16LABEL%= \n\t" + + "vmovups (%2,%0,4), %%ymm12 \n\t" // 8 * x + + "vmulps (%4,%0,4), %%ymm12, %%ymm8 \n\t" + "vmulps (%5,%0,4), %%ymm12, %%ymm10 \n\t" + "vmulps (%6,%0,4), %%ymm12, %%ymm9 \n\t" + "vmulps (%7,%0,4), %%ymm12, %%ymm11 \n\t" + "vaddps %%ymm4, %%ymm8 , %%ymm4 \n\t" + "addq $8 , %0 \n\t" + "vaddps %%ymm5, %%ymm10, %%ymm5 \n\t" + "vaddps %%ymm6, %%ymm9 , %%ymm6 \n\t" + "subq $8 , %1 \n\t" + "vaddps %%ymm7, %%ymm11, %%ymm7 \n\t" + + ".L16LABEL%=: \n\t" + + "cmpq $0, %1 \n\t" + "je .L16END%= \n\t" + + + ".align 16 \n\t" + ".L01LOOP%=: \n\t" + "prefetcht0 384(%2,%0,4) \n\t" + "vmovups (%2,%0,4), %%ymm12 \n\t" // 8 * x + "vmovups 32(%2,%0,4), %%ymm13 \n\t" // 8 * x + + "prefetcht0 384(%4,%0,4) \n\t" + "vmulps (%4,%0,4), %%ymm12, %%ymm8 \n\t" + "vmulps 
32(%4,%0,4), %%ymm13, %%ymm9 \n\t" + "prefetcht0 384(%5,%0,4) \n\t" + "vmulps (%5,%0,4), %%ymm12, %%ymm10 \n\t" + "vmulps 32(%5,%0,4), %%ymm13, %%ymm11 \n\t" + "vaddps %%ymm4, %%ymm8 , %%ymm4 \n\t" + "vaddps %%ymm0, %%ymm9 , %%ymm0 \n\t" + "vaddps %%ymm1, %%ymm10, %%ymm1 \n\t" + "vaddps %%ymm5, %%ymm11, %%ymm5 \n\t" + "prefetcht0 384(%6,%0,4) \n\t" + "vmulps (%6,%0,4), %%ymm12, %%ymm8 \n\t" + "vmulps 32(%6,%0,4), %%ymm13, %%ymm9 \n\t" + "prefetcht0 384(%7,%0,4) \n\t" + "vmulps (%7,%0,4), %%ymm12, %%ymm10 \n\t" + "vmulps 32(%7,%0,4), %%ymm13, %%ymm11 \n\t" + "vaddps %%ymm6, %%ymm8 , %%ymm6 \n\t" + "addq $16, %0 \n\t" + "vaddps %%ymm2, %%ymm9 , %%ymm2 \n\t" + "vaddps %%ymm7, %%ymm10, %%ymm7 \n\t" + "subq $16, %1 \n\t" + "vaddps %%ymm3, %%ymm11, %%ymm3 \n\t" + + "jnz .L01LOOP%= \n\t" + + ".L16END%=: \n\t" + + "vaddps %%ymm4, %%ymm0, %%ymm4 \n\t" + "vaddps %%ymm5, %%ymm1, %%ymm5 \n\t" + "vaddps %%ymm6, %%ymm2, %%ymm6 \n\t" + "vaddps %%ymm7, %%ymm3, %%ymm7 \n\t" + + "vextractf128 $1 , %%ymm4, %%xmm12 \n\t" + "vextractf128 $1 , %%ymm5, %%xmm13 \n\t" + "vextractf128 $1 , %%ymm6, %%xmm14 \n\t" + "vextractf128 $1 , %%ymm7, %%xmm15 \n\t" + + "vaddps %%xmm4, %%xmm12, %%xmm4 \n\t" + "vaddps %%xmm5, %%xmm13, %%xmm5 \n\t" + "vaddps %%xmm6, %%xmm14, %%xmm6 \n\t" + "vaddps %%xmm7, %%xmm15, %%xmm7 \n\t" + + "vhaddps %%xmm4, %%xmm4, %%xmm4 \n\t" + "vhaddps %%xmm5, %%xmm5, %%xmm5 \n\t" + "vhaddps %%xmm6, %%xmm6, %%xmm6 \n\t" + "vhaddps %%xmm7, %%xmm7, %%xmm7 \n\t" + + "vhaddps %%xmm4, %%xmm4, %%xmm4 \n\t" + "vhaddps %%xmm5, %%xmm5, %%xmm5 \n\t" + "vhaddps %%xmm6, %%xmm6, %%xmm6 \n\t" + "vhaddps %%xmm7, %%xmm7, %%xmm7 \n\t" + + "vmovss %%xmm4, (%3) \n\t" + "vmovss %%xmm5, 4(%3) \n\t" + "vmovss %%xmm6, 8(%3) \n\t" + "vmovss %%xmm7, 12(%3) \n\t" + + + "vzeroupper \n\t" + + : + : + "r" (i), // 0 + "r" (n), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (ap[0]), // 4 + "r" (ap[1]), // 5 + "r" (ap[2]), // 6 + "r" (ap[3]) // 7 + : "cc", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", 
"%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + +} + + From 2021d0f9d6e155997450ab199d8af7e0a3a8551a Mon Sep 17 00:00:00 2001 From: wernsaar Date: Fri, 5 Sep 2014 15:05:53 +0200 Subject: [PATCH 053/119] experimentally removed expensive function calls --- common_x86_64.h | 9 +++++++++ driver/others/parameter.c | 2 ++ 2 files changed, 11 insertions(+) diff --git a/common_x86_64.h b/common_x86_64.h index 0f842ee94..ae9b88718 100644 --- a/common_x86_64.h +++ b/common_x86_64.h @@ -46,6 +46,7 @@ #define __volatile__ #endif +/* #ifdef HAVE_SSE2 #define MB __asm__ __volatile__ ("mfence"); #define WMB __asm__ __volatile__ ("sfence"); @@ -53,6 +54,10 @@ #define MB #define WMB #endif +*/ + +#define MB +#define WMB static void __inline blas_lock(volatile BLASULONG *address){ @@ -99,6 +104,8 @@ static __inline void cpuid(int op, int *eax, int *ebx, int *ecx, int *edx){ : "0" (op)); } +/* + #define WHEREAMI static inline int WhereAmI(void){ @@ -111,6 +118,8 @@ static inline int WhereAmI(void){ return apicid; } +*/ + #ifdef CORE_BARCELONA #define IFLUSH gotoblas_iflush() #define IFLUSH_HALF gotoblas_iflush_half() diff --git a/driver/others/parameter.c b/driver/others/parameter.c index a0a8b5188..c6c7301e8 100644 --- a/driver/others/parameter.c +++ b/driver/others/parameter.c @@ -251,7 +251,9 @@ void blas_set_parameter(void){ env_var_t p; int factor; +#if !defined(BULLDOZER) int size = get_L2_size(); +#endif #if defined(CORE_KATMAI) || defined(CORE_COPPERMINE) || defined(CORE_BANIAS) size >>= 7; From a64fe9bcc95b5378d47c424f615da95d38a9ec43 Mon Sep 17 00:00:00 2001 From: wernsaar Date: Sat, 6 Sep 2014 08:41:53 +0200 Subject: [PATCH 054/119] added optimized sgemv_n kernel for sandybridge --- driver/others/parameter.c | 4 +- kernel/x86_64/KERNEL.SANDYBRIDGE | 2 +- kernel/x86_64/sgemv_n_4.c | 2 + kernel/x86_64/sgemv_n_microk_sandy-4.c | 322 +++++++++++++++++++++++++ 4 files changed, 328 insertions(+), 2 
deletions(-) create mode 100644 kernel/x86_64/sgemv_n_microk_sandy-4.c diff --git a/driver/others/parameter.c b/driver/others/parameter.c index c6c7301e8..f0f889a15 100644 --- a/driver/others/parameter.c +++ b/driver/others/parameter.c @@ -251,7 +251,9 @@ void blas_set_parameter(void){ env_var_t p; int factor; -#if !defined(BULLDOZER) +#if defined(BULLDOZER) || defined(PILEDRIVER) || defined(SANDYBRIDGE) || defined(NEHALEM) || defined(HASWELL) + int size = 16; +#else int size = get_L2_size(); #endif diff --git a/kernel/x86_64/KERNEL.SANDYBRIDGE b/kernel/x86_64/KERNEL.SANDYBRIDGE index b70486436..dfc2882aa 100644 --- a/kernel/x86_64/KERNEL.SANDYBRIDGE +++ b/kernel/x86_64/KERNEL.SANDYBRIDGE @@ -1,4 +1,4 @@ -SGEMVNKERNEL = sgemv_n.c +SGEMVNKERNEL = sgemv_n_4.c SGEMVTKERNEL = sgemv_t_4.c ZGEMVNKERNEL = zgemv_n.c diff --git a/kernel/x86_64/sgemv_n_4.c b/kernel/x86_64/sgemv_n_4.c index 31d841ddd..617b1788f 100644 --- a/kernel/x86_64/sgemv_n_4.c +++ b/kernel/x86_64/sgemv_n_4.c @@ -33,6 +33,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "sgemv_n_microk_bulldozer-4.c" #elif defined(NEHALEM) #include "sgemv_n_microk_nehalem-4.c" +#elif defined(SANDYBRIDGE) +#include "sgemv_n_microk_sandy-4.c" #endif diff --git a/kernel/x86_64/sgemv_n_microk_sandy-4.c b/kernel/x86_64/sgemv_n_microk_sandy-4.c new file mode 100644 index 000000000..b4caca630 --- /dev/null +++ b/kernel/x86_64/sgemv_n_microk_sandy-4.c @@ -0,0 +1,322 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + + + + +#define HAVE_KERNEL_4x8 1 +static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLONG lda4) __attribute__ ((noinline)); + +static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLONG lda4) +{ + + BLASLONG register i = 0; + + __asm__ __volatile__ + ( + "vzeroupper \n\t" + "vbroadcastss (%2), %%ymm12 \n\t" // x0 + "vbroadcastss 4(%2), %%ymm13 \n\t" // x1 + "vbroadcastss 8(%2), %%ymm14 \n\t" // x2 + "vbroadcastss 12(%2), %%ymm15 \n\t" // x3 + "vbroadcastss 16(%2), %%ymm0 \n\t" // x4 + "vbroadcastss 20(%2), %%ymm1 \n\t" // x5 + "vbroadcastss 24(%2), %%ymm2 \n\t" // x6 + "vbroadcastss 28(%2), %%ymm3 \n\t" // x7 + + "testq $0x04, %1 \n\t" + "jz .L08LABEL%= \n\t" + + "vmovups (%3,%0,4), %%xmm4 \n\t" // 4 * y + + "vmulps (%4,%0,4), %%xmm12, %%xmm8 \n\t" + "vmulps (%5,%0,4), %%xmm13, %%xmm10 \n\t" + "vmulps (%6,%0,4), %%xmm14, %%xmm9 \n\t" + "vmulps (%7,%0,4), %%xmm15, %%xmm11 \n\t" + "vaddps %%xmm4, %%xmm8 , %%xmm4 \n\t" + "vaddps %%xmm4, %%xmm10, %%xmm4 \n\t" + "vaddps %%xmm4, %%xmm9 , %%xmm4 \n\t" + "vaddps %%xmm4, %%xmm11, %%xmm4 \n\t" + + "vmulps (%4,%8,4), %%xmm0 , %%xmm8 \n\t" + "vmulps (%5,%8,4), %%xmm1 , %%xmm10 \n\t" + "vmulps (%6,%8,4), %%xmm2 , %%xmm9 \n\t" + "vmulps (%7,%8,4), %%xmm3 , %%xmm11 \n\t" + "vaddps %%xmm4, %%xmm8 , %%xmm4 \n\t" + "vaddps %%xmm4, %%xmm10, %%xmm4 \n\t" + "vaddps %%xmm4, %%xmm9 , %%xmm4 \n\t" + "vaddps %%xmm4, %%xmm11, %%xmm4 \n\t" + + "vmovups %%xmm4, (%3,%0,4) \n\t" // 4 * y + + "addq $4, %8 \n\t" + "addq $4, %0 \n\t" + "subq $4, %1 \n\t" + + ".L08LABEL%=: \n\t" + + "testq $0x08, %1 \n\t" + "jz .L16LABEL%= \n\t" + + "vmovups (%3,%0,4), %%ymm4 \n\t" // 8 * y + + "vmulps (%4,%0,4), %%ymm12, %%ymm8 \n\t" + "vmulps (%5,%0,4), %%ymm13, %%ymm10 \n\t" + "vmulps (%6,%0,4), %%ymm14, %%ymm9 \n\t" + "vmulps (%7,%0,4), %%ymm15, %%ymm11 \n\t" + "vaddps %%ymm4, %%ymm8 , %%ymm4 \n\t" + "vaddps %%ymm4, %%ymm10, 
%%ymm4 \n\t" + "vaddps %%ymm4, %%ymm9 , %%ymm4 \n\t" + "vaddps %%ymm4, %%ymm11, %%ymm4 \n\t" + + "vmulps (%4,%8,4), %%ymm0 , %%ymm8 \n\t" + "vmulps (%5,%8,4), %%ymm1 , %%ymm10 \n\t" + "vmulps (%6,%8,4), %%ymm2 , %%ymm9 \n\t" + "vmulps (%7,%8,4), %%ymm3 , %%ymm11 \n\t" + "vaddps %%ymm4, %%ymm8 , %%ymm4 \n\t" + "vaddps %%ymm4, %%ymm10, %%ymm4 \n\t" + "vaddps %%ymm4, %%ymm9 , %%ymm4 \n\t" + "vaddps %%ymm4, %%ymm11, %%ymm4 \n\t" + + "vmovups %%ymm4, (%3,%0,4) \n\t" // 8 * y + + "addq $8, %8 \n\t" + "addq $8, %0 \n\t" + "subq $8, %1 \n\t" + + + ".L16LABEL%=: \n\t" + + "cmpq $0, %1 \n\t" + "je .L16END%= \n\t" + + + ".align 16 \n\t" + ".L01LOOP%=: \n\t" + "vmovups (%3,%0,4), %%ymm4 \n\t" // 8 * y + "vmovups 32(%3,%0,4), %%ymm5 \n\t" // 8 * y + + "prefetcht0 192(%4,%0,4) \n\t" + "vmulps (%4,%0,4), %%ymm12, %%ymm8 \n\t" + "vmulps 32(%4,%0,4), %%ymm12, %%ymm9 \n\t" + "prefetcht0 192(%5,%0,4) \n\t" + "vmulps (%5,%0,4), %%ymm13, %%ymm10 \n\t" + "vmulps 32(%5,%0,4), %%ymm13, %%ymm11 \n\t" + "vaddps %%ymm4, %%ymm8 , %%ymm4 \n\t" + "vaddps %%ymm5, %%ymm9 , %%ymm5 \n\t" + "vaddps %%ymm4, %%ymm10, %%ymm4 \n\t" + "vaddps %%ymm5, %%ymm11, %%ymm5 \n\t" + + "prefetcht0 192(%6,%0,4) \n\t" + "vmulps (%6,%0,4), %%ymm14, %%ymm8 \n\t" + "vmulps 32(%6,%0,4), %%ymm14, %%ymm9 \n\t" + "prefetcht0 192(%7,%0,4) \n\t" + "vmulps (%7,%0,4), %%ymm15, %%ymm10 \n\t" + "vmulps 32(%7,%0,4), %%ymm15, %%ymm11 \n\t" + "vaddps %%ymm4, %%ymm8 , %%ymm4 \n\t" + "vaddps %%ymm5, %%ymm9 , %%ymm5 \n\t" + "vaddps %%ymm4, %%ymm10, %%ymm4 \n\t" + "vaddps %%ymm5, %%ymm11, %%ymm5 \n\t" + + "prefetcht0 192(%4,%8,4) \n\t" + "vmulps (%4,%8,4), %%ymm0 , %%ymm8 \n\t" + "vmulps 32(%4,%8,4), %%ymm0 , %%ymm9 \n\t" + "prefetcht0 192(%5,%8,4) \n\t" + "vmulps (%5,%8,4), %%ymm1 , %%ymm10 \n\t" + "vmulps 32(%5,%8,4), %%ymm1 , %%ymm11 \n\t" + "vaddps %%ymm4, %%ymm8 , %%ymm4 \n\t" + "vaddps %%ymm5, %%ymm9 , %%ymm5 \n\t" + "vaddps %%ymm4, %%ymm10, %%ymm4 \n\t" + "vaddps %%ymm5, %%ymm11, %%ymm5 \n\t" + + "prefetcht0 192(%6,%8,4) \n\t" + 
"vmulps (%6,%8,4), %%ymm2 , %%ymm8 \n\t" + "vmulps 32(%6,%8,4), %%ymm2 , %%ymm9 \n\t" + "prefetcht0 192(%7,%8,4) \n\t" + "vmulps (%7,%8,4), %%ymm3 , %%ymm10 \n\t" + "vmulps 32(%7,%8,4), %%ymm3 , %%ymm11 \n\t" + "vaddps %%ymm4, %%ymm8 , %%ymm4 \n\t" + "vaddps %%ymm5, %%ymm9 , %%ymm5 \n\t" + "vaddps %%ymm4, %%ymm10, %%ymm4 \n\t" + "vaddps %%ymm5, %%ymm11, %%ymm5 \n\t" + + "vmovups %%ymm4, (%3,%0,4) \n\t" // 8 * y + "vmovups %%ymm5, 32(%3,%0,4) \n\t" // 8 * y + + "addq $16, %8 \n\t" + "addq $16, %0 \n\t" + "subq $16, %1 \n\t" + "jnz .L01LOOP%= \n\t" + + ".L16END%=: \n\t" + "vzeroupper \n\t" + + : + : + "r" (i), // 0 + "r" (n), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (ap[0]), // 4 + "r" (ap[1]), // 5 + "r" (ap[2]), // 6 + "r" (ap[3]), // 7 + "r" (lda4) // 8 + : "cc", + "%xmm0", "%xmm1", + "%xmm2", "%xmm3", + "%xmm4", "%xmm5", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + +} + + + + +#define HAVE_KERNEL_4x4 1 +static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) __attribute__ ((noinline)); + +static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) +{ + + BLASLONG register i = 0; + + __asm__ __volatile__ + ( + "vzeroupper \n\t" + "vbroadcastss (%2), %%ymm12 \n\t" // x0 + "vbroadcastss 4(%2), %%ymm13 \n\t" // x1 + "vbroadcastss 8(%2), %%ymm14 \n\t" // x2 + "vbroadcastss 12(%2), %%ymm15 \n\t" // x3 + + "testq $0x04, %1 \n\t" + "jz .L08LABEL%= \n\t" + + "vmovups (%3,%0,4), %%xmm4 \n\t" // 4 * y + + "vmulps (%4,%0,4), %%xmm12, %%xmm8 \n\t" + "vmulps (%5,%0,4), %%xmm13, %%xmm10 \n\t" + "vmulps (%6,%0,4), %%xmm14, %%xmm9 \n\t" + "vmulps (%7,%0,4), %%xmm15, %%xmm11 \n\t" + "vaddps %%xmm4, %%xmm8 , %%xmm4 \n\t" + "vaddps %%xmm4, %%xmm10, %%xmm4 \n\t" + "vaddps %%xmm4, %%xmm9 , %%xmm4 \n\t" + "vaddps %%xmm4, %%xmm11, %%xmm4 \n\t" + + "vmovups %%xmm4, (%3,%0,4) \n\t" // 4 * y + + "addq $4, %0 \n\t" + "subq $4, %1 \n\t" + + ".L08LABEL%=: \n\t" + + "testq $0x08, %1 \n\t" + "jz .L16LABEL%= 
\n\t" + + "vmovups (%3,%0,4), %%ymm4 \n\t" // 8 * y + + "vmulps (%4,%0,4), %%ymm12, %%ymm8 \n\t" + "vmulps (%5,%0,4), %%ymm13, %%ymm10 \n\t" + "vmulps (%6,%0,4), %%ymm14, %%ymm9 \n\t" + "vmulps (%7,%0,4), %%ymm15, %%ymm11 \n\t" + "vaddps %%ymm4, %%ymm8 , %%ymm4 \n\t" + "vaddps %%ymm4, %%ymm10, %%ymm4 \n\t" + "vaddps %%ymm4, %%ymm9 , %%ymm4 \n\t" + "vaddps %%ymm4, %%ymm11, %%ymm4 \n\t" + + "vmovups %%ymm4, (%3,%0,4) \n\t" // 8 * y + + "addq $8, %0 \n\t" + "subq $8, %1 \n\t" + + + ".L16LABEL%=: \n\t" + + "cmpq $0, %1 \n\t" + "je .L16END%= \n\t" + + + ".align 16 \n\t" + ".L01LOOP%=: \n\t" + "vmovups (%3,%0,4), %%ymm4 \n\t" // 8 * y + "vmovups 32(%3,%0,4), %%ymm5 \n\t" // 8 * y + + "prefetcht0 192(%4,%0,4) \n\t" + "vmulps (%4,%0,4), %%ymm12, %%ymm8 \n\t" + "vmulps 32(%4,%0,4), %%ymm12, %%ymm9 \n\t" + "prefetcht0 192(%5,%0,4) \n\t" + "vmulps (%5,%0,4), %%ymm13, %%ymm10 \n\t" + "vmulps 32(%5,%0,4), %%ymm13, %%ymm11 \n\t" + "vaddps %%ymm4, %%ymm8 , %%ymm4 \n\t" + "vaddps %%ymm5, %%ymm9 , %%ymm5 \n\t" + "vaddps %%ymm4, %%ymm10, %%ymm4 \n\t" + "vaddps %%ymm5, %%ymm11, %%ymm5 \n\t" + + "prefetcht0 192(%6,%0,4) \n\t" + "vmulps (%6,%0,4), %%ymm14, %%ymm8 \n\t" + "vmulps 32(%6,%0,4), %%ymm14, %%ymm9 \n\t" + "prefetcht0 192(%7,%0,4) \n\t" + "vmulps (%7,%0,4), %%ymm15, %%ymm10 \n\t" + "vmulps 32(%7,%0,4), %%ymm15, %%ymm11 \n\t" + "vaddps %%ymm4, %%ymm8 , %%ymm4 \n\t" + "vaddps %%ymm5, %%ymm9 , %%ymm5 \n\t" + "vaddps %%ymm4, %%ymm10, %%ymm4 \n\t" + "vaddps %%ymm5, %%ymm11, %%ymm5 \n\t" + + "vmovups %%ymm4, (%3,%0,4) \n\t" // 8 * y + "vmovups %%ymm5, 32(%3,%0,4) \n\t" // 8 * y + + "addq $16, %0 \n\t" + "subq $16, %1 \n\t" + "jnz .L01LOOP%= \n\t" + + ".L16END%=: \n\t" + "vzeroupper \n\t" + + : + : + "r" (i), // 0 + "r" (n), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (ap[0]), // 4 + "r" (ap[1]), // 5 + "r" (ap[2]), // 6 + "r" (ap[3]) // 7 + : "cc", + "%xmm4", "%xmm5", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + +} + + From 
77942374759502f740ccd7ec9130e7e790494d3a Mon Sep 17 00:00:00 2001 From: wernsaar Date: Sat, 6 Sep 2014 11:01:42 +0200 Subject: [PATCH 055/119] undef WHEREAMI --- common_x86_64.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/common_x86_64.h b/common_x86_64.h index ae9b88718..547614f74 100644 --- a/common_x86_64.h +++ b/common_x86_64.h @@ -105,8 +105,8 @@ static __inline void cpuid(int op, int *eax, int *ebx, int *ecx, int *edx){ } /* - #define WHEREAMI +*/ static inline int WhereAmI(void){ int eax, ebx, ecx, edx; @@ -118,7 +118,6 @@ static inline int WhereAmI(void){ return apicid; } -*/ #ifdef CORE_BARCELONA #define IFLUSH gotoblas_iflush() From d143f84dd26219e4a8d62e545a5449d47fe80583 Mon Sep 17 00:00:00 2001 From: wernsaar Date: Sat, 6 Sep 2014 12:08:48 +0200 Subject: [PATCH 056/119] added optimized sgemv_n kernel for haswell --- kernel/x86_64/KERNEL.HASWELL | 2 +- kernel/x86_64/sgemv_n_4.c | 2 + kernel/x86_64/sgemv_n_microk_haswell-4.c | 271 +++++++++++++++++++++++ 3 files changed, 274 insertions(+), 1 deletion(-) create mode 100644 kernel/x86_64/sgemv_n_microk_haswell-4.c diff --git a/kernel/x86_64/KERNEL.HASWELL b/kernel/x86_64/KERNEL.HASWELL index d0ac9c72f..c2c64939b 100644 --- a/kernel/x86_64/KERNEL.HASWELL +++ b/kernel/x86_64/KERNEL.HASWELL @@ -1,4 +1,4 @@ -SGEMVNKERNEL = sgemv_n.c +SGEMVNKERNEL = sgemv_n_4.c SGEMVTKERNEL = sgemv_t.c DGEMVNKERNEL = dgemv_n.c diff --git a/kernel/x86_64/sgemv_n_4.c b/kernel/x86_64/sgemv_n_4.c index 617b1788f..943dcdefa 100644 --- a/kernel/x86_64/sgemv_n_4.c +++ b/kernel/x86_64/sgemv_n_4.c @@ -35,6 +35,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "sgemv_n_microk_nehalem-4.c" #elif defined(SANDYBRIDGE) #include "sgemv_n_microk_sandy-4.c" +#elif defined(HASWELL) +#include "sgemv_n_microk_haswell-4.c" #endif diff --git a/kernel/x86_64/sgemv_n_microk_haswell-4.c b/kernel/x86_64/sgemv_n_microk_haswell-4.c new file mode 100644 index 000000000..ed1792245 --- /dev/null +++ b/kernel/x86_64/sgemv_n_microk_haswell-4.c @@ -0,0 +1,271 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + + + +#define HAVE_KERNEL_4x8 1 +static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLONG lda4) __attribute__ ((noinline)); + +static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLONG lda4) +{ + + BLASLONG register i = 0; + + __asm__ __volatile__ + ( + "vzeroupper \n\t" + "vbroadcastss (%2), %%ymm12 \n\t" // x0 + "vbroadcastss 4(%2), %%ymm13 \n\t" // x1 + "vbroadcastss 8(%2), %%ymm14 \n\t" // x2 + "vbroadcastss 12(%2), %%ymm15 \n\t" // x3 + "vbroadcastss 16(%2), %%ymm0 \n\t" // x4 + "vbroadcastss 20(%2), %%ymm1 \n\t" // x5 + "vbroadcastss 24(%2), %%ymm2 \n\t" // x6 + "vbroadcastss 28(%2), %%ymm3 \n\t" // x7 + + "testq $0x04, %1 \n\t" + "jz .L08LABEL%= \n\t" + + "vmovups (%3,%0,4), %%xmm4 \n\t" // 4 * y + "vxorps %%xmm5 , %%xmm5, %%xmm5 \n\t" + + "vfmadd231ps (%4,%0,4), %%xmm12, %%xmm4 \n\t" + "vfmadd231ps (%5,%0,4), %%xmm13, %%xmm5 \n\t" + "vfmadd231ps (%6,%0,4), %%xmm14, %%xmm4 \n\t" + "vfmadd231ps (%7,%0,4), %%xmm15, %%xmm5 \n\t" + + "vfmadd231ps (%4,%8,4), %%xmm0 , %%xmm4 \n\t" + "vfmadd231ps (%5,%8,4), %%xmm1 , %%xmm5 \n\t" + "vfmadd231ps (%6,%8,4), %%xmm2 , %%xmm4 \n\t" + "vfmadd231ps (%7,%8,4), %%xmm3 , %%xmm5 \n\t" + + "vaddps %%xmm4 , %%xmm5 , %%xmm5 \n\t" + + "vmovups %%xmm5, (%3,%0,4) \n\t" // 4 * y + + "addq $4 , %8 \n\t" + "addq $4 , %0 \n\t" + "subq $4 , %1 \n\t" + + ".L08LABEL%=: \n\t" + + "testq $0x08, %1 \n\t" + "jz .L16LABEL%= \n\t" + + "vmovups (%3,%0,4), %%ymm4 \n\t" // 8 * y + "vxorps %%ymm5 , %%ymm5, %%ymm5 \n\t" + + "vfmadd231ps (%4,%0,4), %%ymm12, %%ymm4 \n\t" + "vfmadd231ps (%5,%0,4), %%ymm13, %%ymm5 \n\t" + "vfmadd231ps (%6,%0,4), %%ymm14, %%ymm4 \n\t" + "vfmadd231ps (%7,%0,4), %%ymm15, %%ymm5 \n\t" + + "vfmadd231ps (%4,%8,4), %%ymm0 , %%ymm4 \n\t" + "vfmadd231ps (%5,%8,4), %%ymm1 , %%ymm5 \n\t" + "vfmadd231ps (%6,%8,4), %%ymm2 , %%ymm4 \n\t" + "vfmadd231ps (%7,%8,4), %%ymm3 , %%ymm5 \n\t" + + "vaddps %%ymm4 , 
%%ymm5 , %%ymm5 \n\t" + + "vmovups %%ymm5, (%3,%0,4) \n\t" // 8 * y + + "addq $8 , %8 \n\t" + "addq $8 , %0 \n\t" + "subq $8 , %1 \n\t" + + ".L16LABEL%=: \n\t" + + "cmpq $0, %1 \n\t" + "je .L16END%= \n\t" + + + ".align 16 \n\t" + ".L01LOOP%=: \n\t" + "vmovups (%3,%0,4), %%ymm4 \n\t" // 8 * y + "vmovups 32(%3,%0,4), %%ymm5 \n\t" // 8 * y + + "prefetcht0 192(%4,%0,4) \n\t" + "vfmadd231ps (%4,%0,4), %%ymm12, %%ymm4 \n\t" + "vfmadd231ps 32(%4,%0,4), %%ymm12, %%ymm5 \n\t" + "prefetcht0 192(%5,%0,4) \n\t" + "vfmadd231ps (%5,%0,4), %%ymm13, %%ymm4 \n\t" + "vfmadd231ps 32(%5,%0,4), %%ymm13, %%ymm5 \n\t" + "prefetcht0 192(%6,%0,4) \n\t" + "vfmadd231ps (%6,%0,4), %%ymm14, %%ymm4 \n\t" + "vfmadd231ps 32(%6,%0,4), %%ymm14, %%ymm5 \n\t" + "prefetcht0 192(%7,%0,4) \n\t" + "vfmadd231ps (%7,%0,4), %%ymm15, %%ymm4 \n\t" + "vfmadd231ps 32(%7,%0,4), %%ymm15, %%ymm5 \n\t" + + "prefetcht0 192(%4,%8,4) \n\t" + "vfmadd231ps (%4,%8,4), %%ymm0 , %%ymm4 \n\t" + "vfmadd231ps 32(%4,%8,4), %%ymm0 , %%ymm5 \n\t" + "prefetcht0 192(%5,%8,4) \n\t" + "vfmadd231ps (%5,%8,4), %%ymm1 , %%ymm4 \n\t" + "vfmadd231ps 32(%5,%8,4), %%ymm1 , %%ymm5 \n\t" + "prefetcht0 192(%6,%8,4) \n\t" + "vfmadd231ps (%6,%8,4), %%ymm2 , %%ymm4 \n\t" + "vfmadd231ps 32(%6,%8,4), %%ymm2 , %%ymm5 \n\t" + "prefetcht0 192(%7,%8,4) \n\t" + "vfmadd231ps (%7,%8,4), %%ymm3 , %%ymm4 \n\t" + "vfmadd231ps 32(%7,%8,4), %%ymm3 , %%ymm5 \n\t" + + "vmovups %%ymm4, (%3,%0,4) \n\t" // 8 * y + "vmovups %%ymm5, 32(%3,%0,4) \n\t" // 8 * y + + "addq $16, %8 \n\t" + "addq $16, %0 \n\t" + "subq $16, %1 \n\t" + "jnz .L01LOOP%= \n\t" + + ".L16END%=: \n\t" + "vzeroupper \n\t" + + : + : + "r" (i), // 0 + "r" (n), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (ap[0]), // 4 + "r" (ap[1]), // 5 + "r" (ap[2]), // 6 + "r" (ap[3]), // 7 + "r" (lda4) // 8 + : "cc", + "%xmm0", "%xmm1", + "%xmm2", "%xmm3", + "%xmm4", "%xmm5", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + +} + + + +#define HAVE_KERNEL_4x4 1 +static void sgemv_kernel_4x4( BLASLONG n, 
FLOAT **ap, FLOAT *x, FLOAT *y) __attribute__ ((noinline)); + +static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) +{ + + BLASLONG register i = 0; + + __asm__ __volatile__ + ( + "vzeroupper \n\t" + "vbroadcastss (%2), %%ymm12 \n\t" // x0 + "vbroadcastss 4(%2), %%ymm13 \n\t" // x1 + "vbroadcastss 8(%2), %%ymm14 \n\t" // x2 + "vbroadcastss 12(%2), %%ymm15 \n\t" // x3 + + "testq $0x04, %1 \n\t" + "jz .L08LABEL%= \n\t" + + "vmovups (%3,%0,4), %%xmm4 \n\t" // 4 * y + + "vfmadd231ps (%4,%0,4), %%xmm12, %%xmm4 \n\t" + "vfmadd231ps (%5,%0,4), %%xmm13, %%xmm4 \n\t" + "vfmadd231ps (%6,%0,4), %%xmm14, %%xmm4 \n\t" + "vfmadd231ps (%7,%0,4), %%xmm15, %%xmm4 \n\t" + + "vmovups %%xmm4, (%3,%0,4) \n\t" // 4 * y + + "addq $4 , %0 \n\t" + "subq $4 , %1 \n\t" + + ".L08LABEL%=: \n\t" + + "testq $0x08, %1 \n\t" + "jz .L16LABEL%= \n\t" + + "vmovups (%3,%0,4), %%ymm4 \n\t" // 8 * y + + "vfmadd231ps (%4,%0,4), %%ymm12, %%ymm4 \n\t" + "vfmadd231ps (%5,%0,4), %%ymm13, %%ymm4 \n\t" + "vfmadd231ps (%6,%0,4), %%ymm14, %%ymm4 \n\t" + "vfmadd231ps (%7,%0,4), %%ymm15, %%ymm4 \n\t" + + "vmovups %%ymm4, (%3,%0,4) \n\t" // 8 * y + + "addq $8 , %0 \n\t" + "subq $8 , %1 \n\t" + + ".L16LABEL%=: \n\t" + + "cmpq $0, %1 \n\t" + "je .L16END%= \n\t" + + + ".align 16 \n\t" + ".L01LOOP%=: \n\t" + "vmovups (%3,%0,4), %%ymm4 \n\t" // 8 * y + "vmovups 32(%3,%0,4), %%ymm5 \n\t" // 8 * y + + "prefetcht0 192(%4,%0,4) \n\t" + "vfmadd231ps (%4,%0,4), %%ymm12, %%ymm4 \n\t" + "vfmadd231ps 32(%4,%0,4), %%ymm12, %%ymm5 \n\t" + "prefetcht0 192(%5,%0,4) \n\t" + "vfmadd231ps (%5,%0,4), %%ymm13, %%ymm4 \n\t" + "vfmadd231ps 32(%5,%0,4), %%ymm13, %%ymm5 \n\t" + "prefetcht0 192(%6,%0,4) \n\t" + "vfmadd231ps (%6,%0,4), %%ymm14, %%ymm4 \n\t" + "vfmadd231ps 32(%6,%0,4), %%ymm14, %%ymm5 \n\t" + "prefetcht0 192(%7,%0,4) \n\t" + "vfmadd231ps (%7,%0,4), %%ymm15, %%ymm4 \n\t" + "vfmadd231ps 32(%7,%0,4), %%ymm15, %%ymm5 \n\t" + + "vmovups %%ymm4, (%3,%0,4) \n\t" // 8 * y + "vmovups %%ymm5, 32(%3,%0,4) \n\t" // 8 * 
y + + "addq $16, %0 \n\t" + "subq $16, %1 \n\t" + "jnz .L01LOOP%= \n\t" + + ".L16END%=: \n\t" + "vzeroupper \n\t" + + : + : + "r" (i), // 0 + "r" (n), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (ap[0]), // 4 + "r" (ap[1]), // 5 + "r" (ap[2]), // 6 + "r" (ap[3]) // 7 + : "cc", + "%xmm4", "%xmm5", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + +} + + From cf5544b41750fae89ec2c3e83f6ed70ca2d508dc Mon Sep 17 00:00:00 2001 From: wernsaar Date: Sat, 6 Sep 2014 13:17:56 +0200 Subject: [PATCH 057/119] optimization for small size --- kernel/x86_64/sgemv_n_microk_haswell-4.c | 25 ++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/kernel/x86_64/sgemv_n_microk_haswell-4.c b/kernel/x86_64/sgemv_n_microk_haswell-4.c index ed1792245..a2470a4b7 100644 --- a/kernel/x86_64/sgemv_n_microk_haswell-4.c +++ b/kernel/x86_64/sgemv_n_microk_haswell-4.c @@ -105,41 +105,42 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO ".align 16 \n\t" ".L01LOOP%=: \n\t" + // "prefetcht0 192(%3,%0,4) \n\t" "vmovups (%3,%0,4), %%ymm4 \n\t" // 8 * y "vmovups 32(%3,%0,4), %%ymm5 \n\t" // 8 * y - "prefetcht0 192(%4,%0,4) \n\t" + // "prefetcht0 192(%4,%0,4) \n\t" "vfmadd231ps (%4,%0,4), %%ymm12, %%ymm4 \n\t" "vfmadd231ps 32(%4,%0,4), %%ymm12, %%ymm5 \n\t" - "prefetcht0 192(%5,%0,4) \n\t" + // "prefetcht0 192(%5,%0,4) \n\t" "vfmadd231ps (%5,%0,4), %%ymm13, %%ymm4 \n\t" "vfmadd231ps 32(%5,%0,4), %%ymm13, %%ymm5 \n\t" - "prefetcht0 192(%6,%0,4) \n\t" + // "prefetcht0 192(%6,%0,4) \n\t" "vfmadd231ps (%6,%0,4), %%ymm14, %%ymm4 \n\t" "vfmadd231ps 32(%6,%0,4), %%ymm14, %%ymm5 \n\t" - "prefetcht0 192(%7,%0,4) \n\t" + // "prefetcht0 192(%7,%0,4) \n\t" "vfmadd231ps (%7,%0,4), %%ymm15, %%ymm4 \n\t" "vfmadd231ps 32(%7,%0,4), %%ymm15, %%ymm5 \n\t" - "prefetcht0 192(%4,%8,4) \n\t" + // "prefetcht0 192(%4,%8,4) \n\t" "vfmadd231ps (%4,%8,4), %%ymm0 , %%ymm4 \n\t" + "addq $16, %0 \n\t" "vfmadd231ps 32(%4,%8,4), %%ymm0 , %%ymm5 \n\t" - 
"prefetcht0 192(%5,%8,4) \n\t" + // "prefetcht0 192(%5,%8,4) \n\t" "vfmadd231ps (%5,%8,4), %%ymm1 , %%ymm4 \n\t" "vfmadd231ps 32(%5,%8,4), %%ymm1 , %%ymm5 \n\t" - "prefetcht0 192(%6,%8,4) \n\t" + // "prefetcht0 192(%6,%8,4) \n\t" "vfmadd231ps (%6,%8,4), %%ymm2 , %%ymm4 \n\t" "vfmadd231ps 32(%6,%8,4), %%ymm2 , %%ymm5 \n\t" - "prefetcht0 192(%7,%8,4) \n\t" + // "prefetcht0 192(%7,%8,4) \n\t" "vfmadd231ps (%7,%8,4), %%ymm3 , %%ymm4 \n\t" "vfmadd231ps 32(%7,%8,4), %%ymm3 , %%ymm5 \n\t" - "vmovups %%ymm4, (%3,%0,4) \n\t" // 8 * y - "vmovups %%ymm5, 32(%3,%0,4) \n\t" // 8 * y - "addq $16, %8 \n\t" - "addq $16, %0 \n\t" + "vmovups %%ymm4,-64(%3,%0,4) \n\t" // 8 * y "subq $16, %1 \n\t" + "vmovups %%ymm5,-32(%3,%0,4) \n\t" // 8 * y + "jnz .L01LOOP%= \n\t" ".L16END%=: \n\t" From 3a7ab47ee95a34d113e68003a37c81eb70d74a6b Mon Sep 17 00:00:00 2001 From: wernsaar Date: Sat, 6 Sep 2014 18:34:25 +0200 Subject: [PATCH 058/119] optimized sgemv_t --- kernel/x86_64/sgemv_t_4.c | 69 ++++++++++++++++++++++++++++++++++----- 1 file changed, 61 insertions(+), 8 deletions(-) diff --git a/kernel/x86_64/sgemv_t_4.c b/kernel/x86_64/sgemv_t_4.c index b89ec7f7f..e4476080a 100644 --- a/kernel/x86_64/sgemv_t_4.c +++ b/kernel/x86_64/sgemv_t_4.c @@ -80,9 +80,9 @@ static void sgemv_kernel_4x2(BLASLONG n, FLOAT *ap0, FLOAT *ap1, FLOAT *x, FLOAT ( "xorps %%xmm10 , %%xmm10 \n\t" "xorps %%xmm11 , %%xmm11 \n\t" - - ".align 16 \n\t" - ".L01LOOP%=: \n\t" + + "testq $4 , %1 \n\t" + "jz .L01LABEL%= \n\t" "movups (%5,%0,4) , %%xmm14 \n\t" // x "movups (%3,%0,4) , %%xmm12 \n\t" // ap0 @@ -94,8 +94,36 @@ static void sgemv_kernel_4x2(BLASLONG n, FLOAT *ap0, FLOAT *ap1, FLOAT *x, FLOAT "subq $4 , %1 \n\t" "addps %%xmm13 , %%xmm11 \n\t" + ".L01LABEL%=: \n\t" + + "cmpq $0, %1 \n\t" + "je .L01END%= \n\t" + + ".align 16 \n\t" + ".L01LOOP%=: \n\t" + + "movups (%5,%0,4) , %%xmm14 \n\t" // x + "movups (%3,%0,4) , %%xmm12 \n\t" // ap0 + "movups (%4,%0,4) , %%xmm13 \n\t" // ap1 + "mulps %%xmm14 , %%xmm12 \n\t" + "mulps 
%%xmm14 , %%xmm13 \n\t" + "addps %%xmm12 , %%xmm10 \n\t" + "addps %%xmm13 , %%xmm11 \n\t" + + "movups 16(%5,%0,4) , %%xmm14 \n\t" // x + "movups 16(%3,%0,4) , %%xmm12 \n\t" // ap0 + "movups 16(%4,%0,4) , %%xmm13 \n\t" // ap1 + "mulps %%xmm14 , %%xmm12 \n\t" + "mulps %%xmm14 , %%xmm13 \n\t" + "addps %%xmm12 , %%xmm10 \n\t" + "addps %%xmm13 , %%xmm11 \n\t" + + "addq $8 , %0 \n\t" + "subq $8 , %1 \n\t" "jnz .L01LOOP%= \n\t" + ".L01END%=: \n\t" + "haddps %%xmm10, %%xmm10 \n\t" "haddps %%xmm11, %%xmm11 \n\t" "haddps %%xmm10, %%xmm10 \n\t" @@ -113,7 +141,8 @@ static void sgemv_kernel_4x2(BLASLONG n, FLOAT *ap0, FLOAT *ap1, FLOAT *x, FLOAT "r" (ap1), // 4 "r" (x) // 5 : "cc", - "%xmm10", "%xmm11", "%xmm12", + "%xmm4", "%xmm5", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" ); @@ -130,10 +159,11 @@ static void sgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) __asm__ __volatile__ ( + "xorps %%xmm9 , %%xmm9 \n\t" "xorps %%xmm10 , %%xmm10 \n\t" - - ".align 16 \n\t" - ".L01LOOP%=: \n\t" + + "testq $4 , %1 \n\t" + "jz .L01LABEL%= \n\t" "movups (%3,%0,4) , %%xmm12 \n\t" "movups (%4,%0,4) , %%xmm11 \n\t" @@ -142,8 +172,30 @@ static void sgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) "addps %%xmm12 , %%xmm10 \n\t" "subq $4 , %1 \n\t" + ".L01LABEL%=: \n\t" + + "cmpq $0, %1 \n\t" + "je .L01END%= \n\t" + + ".align 16 \n\t" + ".L01LOOP%=: \n\t" + + "movups (%3,%0,4) , %%xmm12 \n\t" + "movups 16(%3,%0,4) , %%xmm14 \n\t" + "movups (%4,%0,4) , %%xmm11 \n\t" + "movups 16(%4,%0,4) , %%xmm13 \n\t" + "mulps %%xmm11 , %%xmm12 \n\t" + "mulps %%xmm13 , %%xmm14 \n\t" + "addq $8 , %0 \n\t" + "addps %%xmm12 , %%xmm10 \n\t" + "subq $8 , %1 \n\t" + "addps %%xmm14 , %%xmm9 \n\t" + "jnz .L01LOOP%= \n\t" + ".L01END%=: \n\t" + + "addps %%xmm9 , %%xmm10 \n\t" "haddps %%xmm10, %%xmm10 \n\t" "haddps %%xmm10, %%xmm10 \n\t" @@ -157,7 +209,8 @@ static void sgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) "r" (ap), // 3 "r" (x) // 4 : "cc", - "%xmm10", 
"%xmm11", "%xmm12", + "%xmm9", "%xmm10" , + "%xmm11", "%xmm12", "%xmm13", "%xmm14", "memory" ); From c8eaf3ae2d19a60039f55e5579c44329ff2d3000 Mon Sep 17 00:00:00 2001 From: wernsaar Date: Sat, 6 Sep 2014 19:41:57 +0200 Subject: [PATCH 059/119] optimized sgemv_t_4 kernel for very small sizes --- kernel/x86_64/sgemv_t_4.c | 98 +++++++++++++++++++++++++++++++++------ 1 file changed, 84 insertions(+), 14 deletions(-) diff --git a/kernel/x86_64/sgemv_t_4.c b/kernel/x86_64/sgemv_t_4.c index e4476080a..692dd536d 100644 --- a/kernel/x86_64/sgemv_t_4.c +++ b/kernel/x86_64/sgemv_t_4.c @@ -423,14 +423,37 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO FLOAT *aj = a_ptr; y_ptr = y; - for ( j=0; j Date: Sat, 6 Sep 2014 21:28:57 +0200 Subject: [PATCH 060/119] better optimzations for sgemv_t kernel --- kernel/x86_64/sgemv_t_4.c | 113 +++++++++++++++++++++++++++++++++----- 1 file changed, 99 insertions(+), 14 deletions(-) diff --git a/kernel/x86_64/sgemv_t_4.c b/kernel/x86_64/sgemv_t_4.c index 692dd536d..920322c4f 100644 --- a/kernel/x86_64/sgemv_t_4.c +++ b/kernel/x86_64/sgemv_t_4.c @@ -446,12 +446,45 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO } else { - for ( j=0; j Date: Sun, 7 Sep 2014 13:45:03 +0200 Subject: [PATCH 061/119] optimizations for very small sizes --- kernel/x86_64/sgemv_n_4.c | 218 +++++++++++++++++---- kernel/x86_64/sgemv_n_microk_bulldozer-4.c | 65 +++--- 2 files changed, 216 insertions(+), 67 deletions(-) diff --git a/kernel/x86_64/sgemv_n_4.c b/kernel/x86_64/sgemv_n_4.c index 943dcdefa..ee762ffce 100644 --- a/kernel/x86_64/sgemv_n_4.c +++ b/kernel/x86_64/sgemv_n_4.c @@ -44,12 +44,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#ifndef HAVE_KERNEL_4x8 -static void sgemv_kernel_4x8(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLONG lda4) +static void sgemv_kernel_4x8(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, BLASLONG lda4, FLOAT *alpha) { BLASLONG i; FLOAT *a0,*a1,*a2,*a3; FLOAT *b0,*b1,*b2,*b3; FLOAT *x4; + FLOAT x[8]; a0 = ap[0]; a1 = ap[1]; a2 = ap[2]; @@ -60,6 +61,9 @@ static void sgemv_kernel_4x8(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLON b3 = a3 + lda4 ; x4 = x + 4; + for ( i=0; i<8; i++) + x[i] = xo[i] * *alpha; + for ( i=0; i< n; i+=4 ) { @@ -81,15 +85,19 @@ static void sgemv_kernel_4x8(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLON #ifndef HAVE_KERNEL_4x4 -static void sgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) +static void sgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, FLOAT *alpha) { BLASLONG i; FLOAT *a0,*a1,*a2,*a3; + FLOAT x[4]; a0 = ap[0]; a1 = ap[1]; a2 = ap[2]; a3 = ap[3]; + for ( i=0; i<4; i++) + x[i] = xo[i] * *alpha; + for ( i=0; i< n; i+=4 ) { y[i] += a0[i]*x[0] + a1[i]*x[1] + a2[i]*x[2] + a3[i]*x[3]; @@ -101,32 +109,147 @@ static void sgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) #endif -static void sgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) +#ifndef HAVE_KERNEL_4x2 + +static void sgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) __attribute__ ((noinline)); + +static void sgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) { - BLASLONG i; - FLOAT *a0; - a0 = ap; - for ( i=0; i< n; i+=4 ) - { - y[i] += a0[i]*x[0]; - y[i+1] += a0[i+1]*x[0]; - y[i+2] += a0[i+2]*x[0]; - y[i+3] += a0[i+3]*x[0]; - } + BLASLONG register i = 0; + + __asm__ __volatile__ + ( + "movss (%2) , %%xmm12 \n\t" // x0 + "movss (%6) , %%xmm4 \n\t" // alpha + "movss 4(%2) , %%xmm13 \n\t" // x1 + "mulss %%xmm4 , %%xmm12 \n\t" // alpha + "mulss %%xmm4 , %%xmm13 \n\t" // alpha + "shufps $0, %%xmm12, %%xmm12 \n\t" + "shufps $0, %%xmm13, %%xmm13 \n\t" + + ".align 16 \n\t" + 
".L01LOOP%=: \n\t" + "movups (%3,%0,4), %%xmm4 \n\t" // 4 * y + + "movups (%4,%0,4), %%xmm8 \n\t" + "movups (%5,%0,4), %%xmm9 \n\t" + "mulps %%xmm12, %%xmm8 \n\t" + "mulps %%xmm13, %%xmm9 \n\t" + "addps %%xmm8 , %%xmm4 \n\t" + "addq $4 , %0 \n\t" + "addps %%xmm9 , %%xmm4 \n\t" + + "movups %%xmm4 , -16(%3,%0,4) \n\t" // 4 * y + + "subq $4 , %1 \n\t" + "jnz .L01LOOP%= \n\t" + + : + : + "r" (i), // 0 + "r" (n), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (ap[0]), // 4 + "r" (ap[1]), // 5 + "r" (alpha) // 6 + : "cc", + "%xmm4", "%xmm5", + "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + +} + +#endif + +#ifndef HAVE_KERNEL_4x2 + +static void sgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *alpha) __attribute__ ((noinline)); + +static void sgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *alpha) +{ + + BLASLONG register i = 0; + BLASLONG register n1 = n & -8 ; + BLASLONG register n2 = n & 4 ; + + __asm__ __volatile__ + ( + "movss (%2), %%xmm12 \n\t" // x0 + "mulss (%6), %%xmm12 \n\t" // alpha + "shufps $0, %%xmm12, %%xmm12 \n\t" + + "cmpq $0, %1 \n\t" + "je .L16END%= \n\t" + + ".align 16 \n\t" + ".L01LOOP%=: \n\t" + "movups (%3,%0,4), %%xmm4 \n\t" // 4 * y + "movups 16(%3,%0,4), %%xmm5 \n\t" // 4 * y + "movups (%4,%0,4), %%xmm8 \n\t" // 4 * a + "movups 16(%4,%0,4), %%xmm9 \n\t" // 4 * a + "mulps %%xmm12, %%xmm8 \n\t" + "mulps %%xmm12, %%xmm9 \n\t" + "addps %%xmm4 , %%xmm8 \n\t" + "addps %%xmm5 , %%xmm9 \n\t" + + "addq $8 , %0 \n\t" + "movups %%xmm8 , -32(%3,%0,4) \n\t" // 4 * y + "movups %%xmm9 , -16(%3,%0,4) \n\t" // 4 * y + + "subq $8 , %1 \n\t" + + "jnz .L01LOOP%= \n\t" + + ".L16END%=: \n\t" + + "testq $0x04, %5 \n\t" + "jz .L08LABEL%= \n\t" + + "movups (%3,%0,4), %%xmm4 \n\t" // 4 * y + "movups (%4,%0,4), %%xmm8 \n\t" // 4 * a + "mulps %%xmm12, %%xmm8 \n\t" + "addps %%xmm8 , %%xmm4 \n\t" + "movups %%xmm4 , (%3,%0,4) \n\t" // 4 * y + "addq $4 , %0 \n\t" + "subq $4 , 
%1 \n\t" + + ".L08LABEL%=: \n\t" + : + : + "r" (i), // 0 + "r" (n1), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (ap), // 4 + "r" (n2), // 5 + "r" (alpha) // 6 + : "cc", + "%xmm4", "%xmm5", + "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + } - -static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest, FLOAT *alpha) __attribute__ ((noinline)); -static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest, FLOAT *alpha) +#endif + +static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest) __attribute__ ((noinline)); + +static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest) { BLASLONG i; if ( inc_dest != 1 ) { - FLOAT da = *alpha; for ( i=0; i Date: Sun, 7 Sep 2014 18:23:48 +0200 Subject: [PATCH 062/119] optimized sgemv_n for very small size of m --- kernel/x86_64/sgemv_n_4.c | 148 +++++++++++++++++++++++++++++++++++--- 1 file changed, 138 insertions(+), 10 deletions(-) diff --git a/kernel/x86_64/sgemv_n_4.c b/kernel/x86_64/sgemv_n_4.c index ee762ffce..0135306af 100644 --- a/kernel/x86_64/sgemv_n_4.c +++ b/kernel/x86_64/sgemv_n_4.c @@ -438,25 +438,153 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO } - if ( m3 == 0 ) return; + if ( m3 == 0 ) return(0); - j=0; - while ( j < m3 ) + if ( m3 == 3 ) + { + a_ptr = a; + x_ptr = x; + FLOAT temp0 = 0.0; + FLOAT temp1 = 0.0; + FLOAT temp2 = 0.0; + if ( lda == 3 && inc_x ==1 ) + { + + for( i = 0; i < ( n & -4 ); i+=4 ) + { + + temp0 += a_ptr[0] * x_ptr[0] + a_ptr[3] * x_ptr[1]; + temp1 += a_ptr[1] * x_ptr[0] + a_ptr[4] * x_ptr[1]; + temp2 += a_ptr[2] * x_ptr[0] + a_ptr[5] * x_ptr[1]; + + temp0 += a_ptr[6] * x_ptr[2] + a_ptr[9] * x_ptr[3]; + temp1 += a_ptr[7] * x_ptr[2] + a_ptr[10] * x_ptr[3]; + temp2 += a_ptr[8] * x_ptr[2] + a_ptr[11] * x_ptr[3]; + + a_ptr += 12; + x_ptr += 4; + } + + for( ; i < n; i++ ) + { + temp0 += a_ptr[0] * x_ptr[0]; + temp1 += a_ptr[1] * 
x_ptr[0]; + temp2 += a_ptr[2] * x_ptr[0]; + a_ptr += 3; + x_ptr ++; + } + + } + else + { + + for( i = 0; i < n; i++ ) + { + temp0 += a_ptr[0] * x_ptr[0]; + temp1 += a_ptr[1] * x_ptr[0]; + temp2 += a_ptr[2] * x_ptr[0]; + a_ptr += lda; + x_ptr += inc_x; + + + } + + } + y_ptr[0] += alpha * temp0; + y_ptr += inc_y; + y_ptr[0] += alpha * temp1; + y_ptr += inc_y; + y_ptr[0] += alpha * temp2; + return(0); + } + + + if ( m3 == 2 ) + { + a_ptr = a; + x_ptr = x; + FLOAT temp0 = 0.0; + FLOAT temp1 = 0.0; + if ( lda == 2 && inc_x ==1 ) + { + + for( i = 0; i < (n & -4) ; i+=4 ) + { + temp0 += a_ptr[0] * x_ptr[0] + a_ptr[2] * x_ptr[1]; + temp1 += a_ptr[1] * x_ptr[0] + a_ptr[3] * x_ptr[1]; + temp0 += a_ptr[4] * x_ptr[2] + a_ptr[6] * x_ptr[3]; + temp1 += a_ptr[5] * x_ptr[2] + a_ptr[7] * x_ptr[3]; + a_ptr += 8; + x_ptr += 4; + + } + + + for( ; i < n; i++ ) + { + temp0 += a_ptr[0] * x_ptr[0]; + temp1 += a_ptr[1] * x_ptr[0]; + a_ptr += 2; + x_ptr ++; + } + + } + else + { + + for( i = 0; i < n; i++ ) + { + temp0 += a_ptr[0] * x_ptr[0]; + temp1 += a_ptr[1] * x_ptr[0]; + a_ptr += lda; + x_ptr += inc_x; + + + } + + } + y_ptr[0] += alpha * temp0; + y_ptr += inc_y; + y_ptr[0] += alpha * temp1; + return(0); + } + + if ( m3 == 1 ) { a_ptr = a; x_ptr = x; FLOAT temp = 0.0; - for( i = 0; i < n; i++ ) + if ( lda == 1 && inc_x ==1 ) { - temp += a_ptr[0] * x_ptr[0]; - a_ptr += lda; - x_ptr += inc_x; + + for( i = 0; i < (n & -4); i+=4 ) + { + temp += a_ptr[i] * x_ptr[i] + a_ptr[i+1] * x_ptr[i+1] + a_ptr[i+2] * x_ptr[i+2] + a_ptr[i+3] * x_ptr[i+3]; + + } + + for( ; i < n; i++ ) + { + temp += a_ptr[i] * x_ptr[i]; + } + + } + else + { + + for( i = 0; i < n; i++ ) + { + temp += a_ptr[0] * x_ptr[0]; + a_ptr += lda; + x_ptr += inc_x; + } + } y_ptr[0] += alpha * temp; - y_ptr += inc_y; - a++; - j++; + return(0); } + + return(0); } From 7b3932b3f348e88cfd9462463bb5ac1f6a5d3a8e Mon Sep 17 00:00:00 2001 From: wernsaar Date: Sun, 7 Sep 2014 19:20:08 +0200 Subject: [PATCH 063/119] optimized sgemv_n kernel for 
nehalem --- kernel/x86_64/sgemv_n_microk_nehalem-4.c | 42 ++++++++++++++++-------- 1 file changed, 29 insertions(+), 13 deletions(-) diff --git a/kernel/x86_64/sgemv_n_microk_nehalem-4.c b/kernel/x86_64/sgemv_n_microk_nehalem-4.c index f87cfa425..77a1b11aa 100644 --- a/kernel/x86_64/sgemv_n_microk_nehalem-4.c +++ b/kernel/x86_64/sgemv_n_microk_nehalem-4.c @@ -28,9 +28,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define HAVE_KERNEL_4x8 1 -static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLONG lda4) __attribute__ ((noinline)); +static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLONG lda4, FLOAT *alpha) __attribute__ ((noinline)); -static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLONG lda4) +static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLONG lda4, FLOAT *alpha) { BLASLONG register i = 0; @@ -55,11 +55,15 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO "shufps $0, %%xmm2 , %%xmm2 \n\t" "shufps $0, %%xmm3 , %%xmm3 \n\t" + "movss (%9), %%xmm6 \n\t" // alpha + "shufps $0, %%xmm6 , %%xmm6 \n\t" + ".align 16 \n\t" ".L01LOOP%=: \n\t" + "xorps %%xmm4 , %%xmm4 \n\t" "xorps %%xmm5 , %%xmm5 \n\t" - "movups (%3,%0,4), %%xmm4 \n\t" // 4 * y + "movups (%3,%0,4), %%xmm7 \n\t" // 4 * y ".align 2 \n\t" "movups (%4,%0,4), %%xmm8 \n\t" @@ -85,16 +89,19 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO "mulps %%xmm1 , %%xmm9 \n\t" "mulps %%xmm2 , %%xmm10 \n\t" "mulps %%xmm3 , %%xmm11 \n\t" - "addq $4 , %8 \n\t" "addps %%xmm8 , %%xmm4 \n\t" "addps %%xmm9 , %%xmm5 \n\t" - "addq $4 , %0 \n\t" "addps %%xmm10, %%xmm4 \n\t" "addps %%xmm11, %%xmm5 \n\t" - "subq $4 , %1 \n\t" - "addps %%xmm4 , %%xmm5 \n\t" - "movups %%xmm5 , -16(%3,%0,4) \n\t" // 4 * y + "addq $4 , %8 \n\t" + "addps %%xmm5 , %%xmm4 \n\t" + "addq $4 , %0 \n\t" + "mulps %%xmm6 , %%xmm4 \n\t" + "subq $4 , %1 \n\t" + 
"addps %%xmm4 , %%xmm7 \n\t" + + "movups %%xmm7 , -16(%3,%0,4) \n\t" // 4 * y "jnz .L01LOOP%= \n\t" @@ -108,11 +115,13 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO "r" (ap[1]), // 5 "r" (ap[2]), // 6 "r" (ap[3]), // 7 - "r" (lda4) // 8 + "r" (lda4), // 8 + "r" (alpha) // 9 : "cc", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", + "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" @@ -124,9 +133,9 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO #define HAVE_KERNEL_4x4 1 -static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) __attribute__ ((noinline)); +static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) __attribute__ ((noinline)); -static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) +static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) { BLASLONG register i = 0; @@ -142,9 +151,13 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) "shufps $0, %%xmm14, %%xmm14\n\t" "shufps $0, %%xmm15, %%xmm15\n\t" + "movss (%8), %%xmm6 \n\t" // alpha + "shufps $0, %%xmm6 , %%xmm6 \n\t" + ".align 16 \n\t" ".L01LOOP%=: \n\t" - "movups (%3,%0,4), %%xmm4 \n\t" // 4 * y + "xorps %%xmm4 , %%xmm4 \n\t" + "movups (%3,%0,4), %%xmm7 \n\t" // 4 * y "movups (%4,%0,4), %%xmm8 \n\t" "movups (%5,%0,4), %%xmm9 \n\t" @@ -161,6 +174,8 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) "addps %%xmm10 , %%xmm4 \n\t" "addps %%xmm4 , %%xmm11 \n\t" + "mulps %%xmm6 , %%xmm11 \n\t" + "addps %%xmm7 , %%xmm11 \n\t" "movups %%xmm11, -16(%3,%0,4) \n\t" // 4 * y "jnz .L01LOOP%= \n\t" @@ -174,7 +189,8 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) "r" (ap[0]), // 4 "r" (ap[1]), // 5 "r" (ap[2]), // 6 - "r" (ap[3]) // 7 + "r" (ap[3]), // 7 + "r" (alpha) // 8 : "cc", "%xmm4", "%xmm5", 
"%xmm6", "%xmm7", From 553e2754077eb0b2cb8782e8ecdb5e6eb9c8366b Mon Sep 17 00:00:00 2001 From: wernsaar Date: Sun, 7 Sep 2014 20:53:30 +0200 Subject: [PATCH 064/119] optimized sgemv_n kernel for sandybridge --- kernel/x86_64/sgemv_n_microk_sandy-4.c | 112 +++++++++++++++++-------- 1 file changed, 79 insertions(+), 33 deletions(-) diff --git a/kernel/x86_64/sgemv_n_microk_sandy-4.c b/kernel/x86_64/sgemv_n_microk_sandy-4.c index b4caca630..44c2b3f2b 100644 --- a/kernel/x86_64/sgemv_n_microk_sandy-4.c +++ b/kernel/x86_64/sgemv_n_microk_sandy-4.c @@ -29,9 +29,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define HAVE_KERNEL_4x8 1 -static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLONG lda4) __attribute__ ((noinline)); +static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLONG lda4, FLOAT *alpha) __attribute__ ((noinline)); -static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLONG lda4) +static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLONG lda4, FLOAT *alpha) { BLASLONG register i = 0; @@ -48,61 +48,75 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO "vbroadcastss 24(%2), %%ymm2 \n\t" // x6 "vbroadcastss 28(%2), %%ymm3 \n\t" // x7 + "vbroadcastss (%9), %%ymm6 \n\t" // alpha + "testq $0x04, %1 \n\t" "jz .L08LABEL%= \n\t" - "vmovups (%3,%0,4), %%xmm4 \n\t" // 4 * y + "vxorps %%xmm4 , %%xmm4 , %%xmm4 \n\t" + "vxorps %%xmm5 , %%xmm5 , %%xmm5 \n\t" + "vmovups (%3,%0,4), %%xmm7 \n\t" // 4 * y "vmulps (%4,%0,4), %%xmm12, %%xmm8 \n\t" "vmulps (%5,%0,4), %%xmm13, %%xmm10 \n\t" "vmulps (%6,%0,4), %%xmm14, %%xmm9 \n\t" "vmulps (%7,%0,4), %%xmm15, %%xmm11 \n\t" "vaddps %%xmm4, %%xmm8 , %%xmm4 \n\t" - "vaddps %%xmm4, %%xmm10, %%xmm4 \n\t" + "vaddps %%xmm5, %%xmm10, %%xmm5 \n\t" "vaddps %%xmm4, %%xmm9 , %%xmm4 \n\t" - "vaddps %%xmm4, %%xmm11, %%xmm4 \n\t" + "vaddps %%xmm5, %%xmm11, %%xmm5 \n\t" "vmulps 
(%4,%8,4), %%xmm0 , %%xmm8 \n\t" "vmulps (%5,%8,4), %%xmm1 , %%xmm10 \n\t" "vmulps (%6,%8,4), %%xmm2 , %%xmm9 \n\t" "vmulps (%7,%8,4), %%xmm3 , %%xmm11 \n\t" "vaddps %%xmm4, %%xmm8 , %%xmm4 \n\t" - "vaddps %%xmm4, %%xmm10, %%xmm4 \n\t" + "vaddps %%xmm5, %%xmm10, %%xmm5 \n\t" "vaddps %%xmm4, %%xmm9 , %%xmm4 \n\t" - "vaddps %%xmm4, %%xmm11, %%xmm4 \n\t" + "vaddps %%xmm5, %%xmm11, %%xmm5 \n\t" - "vmovups %%xmm4, (%3,%0,4) \n\t" // 4 * y + "vaddps %%xmm5, %%xmm4 , %%xmm4 \n\t" + "vmulps %%xmm6, %%xmm4 , %%xmm5 \n\t" + "vaddps %%xmm5, %%xmm7 , %%xmm5 \n\t" + + "vmovups %%xmm5, (%3,%0,4) \n\t" // 4 * y "addq $4, %8 \n\t" "addq $4, %0 \n\t" "subq $4, %1 \n\t" - ".L08LABEL%=: \n\t" + ".L08LABEL%=: \n\t" "testq $0x08, %1 \n\t" "jz .L16LABEL%= \n\t" - "vmovups (%3,%0,4), %%ymm4 \n\t" // 8 * y + "vxorps %%ymm4 , %%ymm4 , %%ymm4 \n\t" + "vxorps %%ymm5 , %%ymm5 , %%ymm5 \n\t" + "vmovups (%3,%0,4), %%ymm7 \n\t" // 8 * y "vmulps (%4,%0,4), %%ymm12, %%ymm8 \n\t" "vmulps (%5,%0,4), %%ymm13, %%ymm10 \n\t" "vmulps (%6,%0,4), %%ymm14, %%ymm9 \n\t" "vmulps (%7,%0,4), %%ymm15, %%ymm11 \n\t" "vaddps %%ymm4, %%ymm8 , %%ymm4 \n\t" - "vaddps %%ymm4, %%ymm10, %%ymm4 \n\t" + "vaddps %%ymm5, %%ymm10, %%ymm5 \n\t" "vaddps %%ymm4, %%ymm9 , %%ymm4 \n\t" - "vaddps %%ymm4, %%ymm11, %%ymm4 \n\t" + "vaddps %%ymm5, %%ymm11, %%ymm5 \n\t" "vmulps (%4,%8,4), %%ymm0 , %%ymm8 \n\t" "vmulps (%5,%8,4), %%ymm1 , %%ymm10 \n\t" "vmulps (%6,%8,4), %%ymm2 , %%ymm9 \n\t" "vmulps (%7,%8,4), %%ymm3 , %%ymm11 \n\t" "vaddps %%ymm4, %%ymm8 , %%ymm4 \n\t" - "vaddps %%ymm4, %%ymm10, %%ymm4 \n\t" + "vaddps %%ymm5, %%ymm10, %%ymm5 \n\t" "vaddps %%ymm4, %%ymm9 , %%ymm4 \n\t" - "vaddps %%ymm4, %%ymm11, %%ymm4 \n\t" + "vaddps %%ymm5, %%ymm11, %%ymm5 \n\t" - "vmovups %%ymm4, (%3,%0,4) \n\t" // 8 * y + "vaddps %%ymm5, %%ymm4 , %%ymm4 \n\t" + "vmulps %%ymm6, %%ymm4 , %%ymm5 \n\t" + "vaddps %%ymm5, %%ymm7 , %%ymm5 \n\t" + + "vmovups %%ymm5, (%3,%0,4) \n\t" // 8 * y "addq $8, %8 \n\t" "addq $8, %0 \n\t" @@ -117,8 +131,8 @@ static 
void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO ".align 16 \n\t" ".L01LOOP%=: \n\t" - "vmovups (%3,%0,4), %%ymm4 \n\t" // 8 * y - "vmovups 32(%3,%0,4), %%ymm5 \n\t" // 8 * y + "vxorps %%ymm4 , %%ymm4 , %%ymm4 \n\t" + "vxorps %%ymm5 , %%ymm5 , %%ymm5 \n\t" "prefetcht0 192(%4,%0,4) \n\t" "vmulps (%4,%0,4), %%ymm12, %%ymm8 \n\t" @@ -164,6 +178,12 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO "vaddps %%ymm4, %%ymm10, %%ymm4 \n\t" "vaddps %%ymm5, %%ymm11, %%ymm5 \n\t" + "vmulps %%ymm6, %%ymm4 , %%ymm4 \n\t" + "vmulps %%ymm6, %%ymm5 , %%ymm5 \n\t" + + "vaddps (%3,%0,4), %%ymm4 , %%ymm4 \n\t" // 8 * y + "vaddps 32(%3,%0,4), %%ymm5 , %%ymm5 \n\t" // 8 * y + "vmovups %%ymm4, (%3,%0,4) \n\t" // 8 * y "vmovups %%ymm5, 32(%3,%0,4) \n\t" // 8 * y @@ -185,11 +205,13 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO "r" (ap[1]), // 5 "r" (ap[2]), // 6 "r" (ap[3]), // 7 - "r" (lda4) // 8 + "r" (lda4), // 8 + "r" (alpha) // 9 : "cc", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", + "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" @@ -201,9 +223,9 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO #define HAVE_KERNEL_4x4 1 -static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) __attribute__ ((noinline)); +static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) __attribute__ ((noinline)); -static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) +static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) { BLASLONG register i = 0; @@ -216,21 +238,29 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) "vbroadcastss 8(%2), %%ymm14 \n\t" // x2 "vbroadcastss 12(%2), %%ymm15 \n\t" // x3 + "vbroadcastss (%8), %%ymm6 \n\t" // alpha + "testq $0x04, %1 \n\t" "jz .L08LABEL%= \n\t" - 
"vmovups (%3,%0,4), %%xmm4 \n\t" // 4 * y + "vxorps %%ymm4 , %%ymm4 , %%ymm4 \n\t" + "vxorps %%ymm5 , %%ymm5 , %%ymm5 \n\t" + "vmovups (%3,%0,4), %%xmm7 \n\t" // 4 * y "vmulps (%4,%0,4), %%xmm12, %%xmm8 \n\t" "vmulps (%5,%0,4), %%xmm13, %%xmm10 \n\t" "vmulps (%6,%0,4), %%xmm14, %%xmm9 \n\t" "vmulps (%7,%0,4), %%xmm15, %%xmm11 \n\t" "vaddps %%xmm4, %%xmm8 , %%xmm4 \n\t" - "vaddps %%xmm4, %%xmm10, %%xmm4 \n\t" + "vaddps %%xmm5, %%xmm10, %%xmm5 \n\t" "vaddps %%xmm4, %%xmm9 , %%xmm4 \n\t" - "vaddps %%xmm4, %%xmm11, %%xmm4 \n\t" + "vaddps %%xmm5, %%xmm11, %%xmm5 \n\t" - "vmovups %%xmm4, (%3,%0,4) \n\t" // 4 * y + "vaddps %%xmm5, %%xmm4 , %%xmm4 \n\t" + "vmulps %%xmm6, %%xmm4 , %%xmm5 \n\t" + "vaddps %%xmm5, %%xmm7 , %%xmm5 \n\t" + + "vmovups %%xmm5, (%3,%0,4) \n\t" // 4 * y "addq $4, %0 \n\t" "subq $4, %1 \n\t" @@ -240,18 +270,24 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) "testq $0x08, %1 \n\t" "jz .L16LABEL%= \n\t" - "vmovups (%3,%0,4), %%ymm4 \n\t" // 8 * y + "vxorps %%ymm4 , %%ymm4 , %%ymm4 \n\t" + "vxorps %%ymm5 , %%ymm5 , %%ymm5 \n\t" + "vmovups (%3,%0,4), %%ymm7 \n\t" // 8 * y "vmulps (%4,%0,4), %%ymm12, %%ymm8 \n\t" "vmulps (%5,%0,4), %%ymm13, %%ymm10 \n\t" "vmulps (%6,%0,4), %%ymm14, %%ymm9 \n\t" "vmulps (%7,%0,4), %%ymm15, %%ymm11 \n\t" "vaddps %%ymm4, %%ymm8 , %%ymm4 \n\t" - "vaddps %%ymm4, %%ymm10, %%ymm4 \n\t" + "vaddps %%ymm5, %%ymm10, %%ymm5 \n\t" "vaddps %%ymm4, %%ymm9 , %%ymm4 \n\t" - "vaddps %%ymm4, %%ymm11, %%ymm4 \n\t" + "vaddps %%ymm5, %%ymm11, %%ymm5 \n\t" - "vmovups %%ymm4, (%3,%0,4) \n\t" // 8 * y + "vaddps %%ymm5, %%ymm4 , %%ymm4 \n\t" + "vmulps %%ymm6, %%ymm4 , %%ymm5 \n\t" + "vaddps %%ymm5, %%ymm7 , %%ymm5 \n\t" + + "vmovups %%ymm5, (%3,%0,4) \n\t" // 8 * y "addq $8, %0 \n\t" "subq $8, %1 \n\t" @@ -265,8 +301,10 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) ".align 16 \n\t" ".L01LOOP%=: \n\t" - "vmovups (%3,%0,4), %%ymm4 \n\t" // 8 * y - "vmovups 32(%3,%0,4), %%ymm5 \n\t" // 8 * y 
+ "vxorps %%ymm4 , %%ymm4 , %%ymm4 \n\t" + "vxorps %%ymm5 , %%ymm5 , %%ymm5 \n\t" + "vmovups (%3,%0,4), %%ymm0 \n\t" // 8 * y + "vmovups 32(%3,%0,4), %%ymm1 \n\t" // 8 * y "prefetcht0 192(%4,%0,4) \n\t" "vmulps (%4,%0,4), %%ymm12, %%ymm8 \n\t" @@ -290,8 +328,14 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) "vaddps %%ymm4, %%ymm10, %%ymm4 \n\t" "vaddps %%ymm5, %%ymm11, %%ymm5 \n\t" - "vmovups %%ymm4, (%3,%0,4) \n\t" // 8 * y - "vmovups %%ymm5, 32(%3,%0,4) \n\t" // 8 * y + "vmulps %%ymm6, %%ymm4 , %%ymm4 \n\t" + "vmulps %%ymm6, %%ymm5 , %%ymm5 \n\t" + + "vaddps %%ymm4, %%ymm0 , %%ymm0 \n\t" + "vaddps %%ymm5, %%ymm1 , %%ymm1 \n\t" + + "vmovups %%ymm0, (%3,%0,4) \n\t" // 8 * y + "vmovups %%ymm1, 32(%3,%0,4) \n\t" // 8 * y "addq $16, %0 \n\t" "subq $16, %1 \n\t" @@ -309,8 +353,10 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) "r" (ap[0]), // 4 "r" (ap[1]), // 5 "r" (ap[2]), // 6 - "r" (ap[3]) // 7 + "r" (ap[3]), // 7 + "r" (alpha) // 8 : "cc", + "%xmm0", "%xmm1", "%xmm4", "%xmm5", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", From 80f77868758a6e99c2716dd12d8bfb63d6ed015f Mon Sep 17 00:00:00 2001 From: wernsaar Date: Sun, 7 Sep 2014 21:13:57 +0200 Subject: [PATCH 065/119] enabled optimized sgemv kernels for piledriver --- kernel/x86_64/KERNEL.HASWELL | 2 +- kernel/x86_64/KERNEL.PILEDRIVER | 4 ++-- kernel/x86_64/sgemv_t_4.c | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/kernel/x86_64/KERNEL.HASWELL b/kernel/x86_64/KERNEL.HASWELL index c2c64939b..d0ac9c72f 100644 --- a/kernel/x86_64/KERNEL.HASWELL +++ b/kernel/x86_64/KERNEL.HASWELL @@ -1,4 +1,4 @@ -SGEMVNKERNEL = sgemv_n_4.c +SGEMVNKERNEL = sgemv_n.c SGEMVTKERNEL = sgemv_t.c DGEMVNKERNEL = dgemv_n.c diff --git a/kernel/x86_64/KERNEL.PILEDRIVER b/kernel/x86_64/KERNEL.PILEDRIVER index 146a8768b..4f15e5a36 100644 --- a/kernel/x86_64/KERNEL.PILEDRIVER +++ b/kernel/x86_64/KERNEL.PILEDRIVER @@ -1,5 +1,5 @@ 
-SGEMVNKERNEL = sgemv_n.c -SGEMVTKERNEL = sgemv_t.c +SGEMVNKERNEL = sgemv_n_4.c +SGEMVTKERNEL = sgemv_t_4.c ZGEMVNKERNEL = zgemv_n_dup.S ZGEMVTKERNEL = zgemv_t.S diff --git a/kernel/x86_64/sgemv_t_4.c b/kernel/x86_64/sgemv_t_4.c index 920322c4f..3316473af 100644 --- a/kernel/x86_64/sgemv_t_4.c +++ b/kernel/x86_64/sgemv_t_4.c @@ -30,7 +30,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if defined(NEHALEM) #include "sgemv_t_microk_nehalem-4.c" -#elif defined(BULLDOZER) +#elif defined(BULLDOZER) || defined(PILEDRIVER) #include "sgemv_t_microk_bulldozer-4.c" #elif defined(SANDYBRIDGE) #include "sgemv_t_microk_sandy-4.c" From 2be5c7a640488796d98ac3cbb44004a39491da7f Mon Sep 17 00:00:00 2001 From: wernsaar Date: Sun, 7 Sep 2014 21:48:42 +0200 Subject: [PATCH 066/119] bugfix for windows --- kernel/x86_64/sgemv_n_microk_sandy-4.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kernel/x86_64/sgemv_n_microk_sandy-4.c b/kernel/x86_64/sgemv_n_microk_sandy-4.c index 44c2b3f2b..c162eeeb6 100644 --- a/kernel/x86_64/sgemv_n_microk_sandy-4.c +++ b/kernel/x86_64/sgemv_n_microk_sandy-4.c @@ -357,7 +357,9 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT "r" (alpha) // 8 : "cc", "%xmm0", "%xmm1", + "%xmm2", "%xmm3", "%xmm4", "%xmm5", + "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" From cbbc80aad3586900443ed7fef1d0ff1814a80e9a Mon Sep 17 00:00:00 2001 From: wernsaar Date: Mon, 8 Sep 2014 10:13:39 +0200 Subject: [PATCH 067/119] added optimized sgemv_t kernel for haswell --- kernel/x86_64/sgemv_n_microk_haswell-4.c | 275 ++++++++++++----------- kernel/x86_64/sgemv_t_microk_haswell-4.c | 148 ++++++++++++ 2 files changed, 298 insertions(+), 125 deletions(-) create mode 100644 kernel/x86_64/sgemv_t_microk_haswell-4.c diff --git a/kernel/x86_64/sgemv_n_microk_haswell-4.c b/kernel/x86_64/sgemv_n_microk_haswell-4.c index a2470a4b7..1e4498d9e 100644 --- 
a/kernel/x86_64/sgemv_n_microk_haswell-4.c +++ b/kernel/x86_64/sgemv_n_microk_haswell-4.c @@ -28,9 +28,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define HAVE_KERNEL_4x8 1 -static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLONG lda4) __attribute__ ((noinline)); +static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLONG lda4, FLOAT *alpha) __attribute__ ((noinline)); -static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLONG lda4) +static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLONG lda4, FLOAT *alpha) { BLASLONG register i = 0; @@ -47,10 +47,13 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO "vbroadcastss 24(%2), %%ymm2 \n\t" // x6 "vbroadcastss 28(%2), %%ymm3 \n\t" // x7 + "vbroadcastss (%9), %%ymm6 \n\t" // alpha + "testq $0x04, %1 \n\t" "jz .L08LABEL%= \n\t" - "vmovups (%3,%0,4), %%xmm4 \n\t" // 4 * y + "vmovups (%3,%0,4), %%xmm7 \n\t" // 4 * y + "vxorps %%xmm4 , %%xmm4, %%xmm4 \n\t" "vxorps %%xmm5 , %%xmm5, %%xmm5 \n\t" "vfmadd231ps (%4,%0,4), %%xmm12, %%xmm4 \n\t" @@ -64,6 +67,8 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO "vfmadd231ps (%7,%8,4), %%xmm3 , %%xmm5 \n\t" "vaddps %%xmm4 , %%xmm5 , %%xmm5 \n\t" + "vmulps %%xmm6 , %%xmm5 , %%xmm5 \n\t" + "vaddps %%xmm7 , %%xmm5 , %%xmm5 \n\t" "vmovups %%xmm5, (%3,%0,4) \n\t" // 4 * y @@ -76,7 +81,8 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO "testq $0x08, %1 \n\t" "jz .L16LABEL%= \n\t" - "vmovups (%3,%0,4), %%ymm4 \n\t" // 8 * y + "vmovups (%3,%0,4), %%ymm7 \n\t" // 8 * y + "vxorps %%ymm4 , %%ymm4, %%ymm4 \n\t" "vxorps %%ymm5 , %%ymm5, %%ymm5 \n\t" "vfmadd231ps (%4,%0,4), %%ymm12, %%ymm4 \n\t" @@ -90,6 +96,9 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO "vfmadd231ps (%7,%8,4), %%ymm3 , %%ymm5 \n\t" "vaddps %%ymm4 , %%ymm5 , 
%%ymm5 \n\t" + "vmulps %%ymm6 , %%ymm5 , %%ymm5 \n\t" + "vaddps %%ymm7 , %%ymm5 , %%ymm5 \n\t" + "vmovups %%ymm5, (%3,%0,4) \n\t" // 8 * y @@ -105,42 +114,160 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO ".align 16 \n\t" ".L01LOOP%=: \n\t" - // "prefetcht0 192(%3,%0,4) \n\t" - "vmovups (%3,%0,4), %%ymm4 \n\t" // 8 * y - "vmovups 32(%3,%0,4), %%ymm5 \n\t" // 8 * y - // "prefetcht0 192(%4,%0,4) \n\t" + "vxorps %%ymm4 , %%ymm4, %%ymm4 \n\t" + "vxorps %%ymm5 , %%ymm5, %%ymm5 \n\t" + "vmovups (%3,%0,4), %%ymm8 \n\t" // 8 * y + "vmovups 32(%3,%0,4), %%ymm9 \n\t" // 8 * y + "vfmadd231ps (%4,%0,4), %%ymm12, %%ymm4 \n\t" "vfmadd231ps 32(%4,%0,4), %%ymm12, %%ymm5 \n\t" - // "prefetcht0 192(%5,%0,4) \n\t" "vfmadd231ps (%5,%0,4), %%ymm13, %%ymm4 \n\t" "vfmadd231ps 32(%5,%0,4), %%ymm13, %%ymm5 \n\t" - // "prefetcht0 192(%6,%0,4) \n\t" "vfmadd231ps (%6,%0,4), %%ymm14, %%ymm4 \n\t" "vfmadd231ps 32(%6,%0,4), %%ymm14, %%ymm5 \n\t" - // "prefetcht0 192(%7,%0,4) \n\t" "vfmadd231ps (%7,%0,4), %%ymm15, %%ymm4 \n\t" "vfmadd231ps 32(%7,%0,4), %%ymm15, %%ymm5 \n\t" - // "prefetcht0 192(%4,%8,4) \n\t" "vfmadd231ps (%4,%8,4), %%ymm0 , %%ymm4 \n\t" - "addq $16, %0 \n\t" + "addq $16, %0 \n\t" "vfmadd231ps 32(%4,%8,4), %%ymm0 , %%ymm5 \n\t" - // "prefetcht0 192(%5,%8,4) \n\t" "vfmadd231ps (%5,%8,4), %%ymm1 , %%ymm4 \n\t" "vfmadd231ps 32(%5,%8,4), %%ymm1 , %%ymm5 \n\t" - // "prefetcht0 192(%6,%8,4) \n\t" "vfmadd231ps (%6,%8,4), %%ymm2 , %%ymm4 \n\t" "vfmadd231ps 32(%6,%8,4), %%ymm2 , %%ymm5 \n\t" - // "prefetcht0 192(%7,%8,4) \n\t" "vfmadd231ps (%7,%8,4), %%ymm3 , %%ymm4 \n\t" "vfmadd231ps 32(%7,%8,4), %%ymm3 , %%ymm5 \n\t" - "addq $16, %8 \n\t" - "vmovups %%ymm4,-64(%3,%0,4) \n\t" // 8 * y - "subq $16, %1 \n\t" - "vmovups %%ymm5,-32(%3,%0,4) \n\t" // 8 * y + "vfmadd231ps %%ymm6 , %%ymm4 , %%ymm8 \n\t" + "vfmadd231ps %%ymm6 , %%ymm5 , %%ymm9 \n\t" + "addq $16, %8 \n\t" + "vmovups %%ymm8,-64(%3,%0,4) \n\t" // 8 * y + "subq $16, %1 \n\t" + "vmovups 
%%ymm9,-32(%3,%0,4) \n\t" // 8 * y + + "jnz .L01LOOP%= \n\t" + + ".L16END%=: \n\t" + "vzeroupper \n\t" + + : + : + "r" (i), // 0 + "r" (n), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (ap[0]), // 4 + "r" (ap[1]), // 5 + "r" (ap[2]), // 6 + "r" (ap[3]), // 7 + "r" (lda4), // 8 + "r" (alpha) // 9 + : "cc", + "%xmm0", "%xmm1", + "%xmm2", "%xmm3", + "%xmm4", "%xmm5", + "%xmm6", "%xmm7", + "%xmm8", "%xmm9", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + +} + + + +#define HAVE_KERNEL_4x4 1 +static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) __attribute__ ((noinline)); + +static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) +{ + + BLASLONG register i = 0; + + __asm__ __volatile__ + ( + "vzeroupper \n\t" + "vbroadcastss (%2), %%ymm12 \n\t" // x0 + "vbroadcastss 4(%2), %%ymm13 \n\t" // x1 + "vbroadcastss 8(%2), %%ymm14 \n\t" // x2 + "vbroadcastss 12(%2), %%ymm15 \n\t" // x3 + + "vbroadcastss (%8), %%ymm6 \n\t" // alpha + + "testq $0x04, %1 \n\t" + "jz .L08LABEL%= \n\t" + + "vxorps %%ymm4 , %%ymm4, %%ymm4 \n\t" + "vxorps %%ymm5 , %%ymm5, %%ymm5 \n\t" + "vmovups (%3,%0,4), %%xmm7 \n\t" // 4 * y + + "vfmadd231ps (%4,%0,4), %%xmm12, %%xmm4 \n\t" + "vfmadd231ps (%5,%0,4), %%xmm13, %%xmm5 \n\t" + "vfmadd231ps (%6,%0,4), %%xmm14, %%xmm4 \n\t" + "vfmadd231ps (%7,%0,4), %%xmm15, %%xmm5 \n\t" + + "vaddps %%xmm4 , %%xmm5 , %%xmm5 \n\t" + "vmulps %%xmm6 , %%xmm5 , %%xmm5 \n\t" + "vaddps %%xmm7 , %%xmm5 , %%xmm5 \n\t" + + "vmovups %%xmm5, (%3,%0,4) \n\t" // 4 * y + + "addq $4 , %0 \n\t" + "subq $4 , %1 \n\t" + + ".L08LABEL%=: \n\t" + + "testq $0x08, %1 \n\t" + "jz .L16LABEL%= \n\t" + + "vxorps %%ymm4 , %%ymm4, %%ymm4 \n\t" + "vxorps %%ymm5 , %%ymm5, %%ymm5 \n\t" + "vmovups (%3,%0,4), %%ymm7 \n\t" // 8 * y + + "vfmadd231ps (%4,%0,4), %%ymm12, %%ymm4 \n\t" + "vfmadd231ps (%5,%0,4), %%ymm13, %%ymm5 \n\t" + "vfmadd231ps (%6,%0,4), %%ymm14, %%ymm4 \n\t" + "vfmadd231ps (%7,%0,4), %%ymm15, %%ymm5 \n\t" + + 
"vaddps %%ymm4 , %%ymm5 , %%ymm5 \n\t" + "vmulps %%ymm6 , %%ymm5 , %%ymm5 \n\t" + "vaddps %%ymm7 , %%ymm5 , %%ymm5 \n\t" + + "vmovups %%ymm5, (%3,%0,4) \n\t" // 8 * y + + "addq $8 , %0 \n\t" + "subq $8 , %1 \n\t" + + ".L16LABEL%=: \n\t" + + "cmpq $0, %1 \n\t" + "je .L16END%= \n\t" + + + ".align 16 \n\t" + ".L01LOOP%=: \n\t" + "vmovups (%3,%0,4), %%ymm8 \n\t" // 8 * y + "vmovups 32(%3,%0,4), %%ymm9 \n\t" // 8 * y + + "vfmadd231ps (%4,%0,4), %%ymm12, %%ymm4 \n\t" + "vfmadd231ps 32(%4,%0,4), %%ymm12, %%ymm5 \n\t" + "vfmadd231ps (%5,%0,4), %%ymm13, %%ymm4 \n\t" + "vfmadd231ps 32(%5,%0,4), %%ymm13, %%ymm5 \n\t" + "vfmadd231ps (%6,%0,4), %%ymm14, %%ymm4 \n\t" + "vfmadd231ps 32(%6,%0,4), %%ymm14, %%ymm5 \n\t" + "vfmadd231ps (%7,%0,4), %%ymm15, %%ymm4 \n\t" + "vfmadd231ps 32(%7,%0,4), %%ymm15, %%ymm5 \n\t" + + "vfmadd231ps %%ymm6 , %%ymm4 , %%ymm8 \n\t" + "vfmadd231ps %%ymm6 , %%ymm5 , %%ymm9 \n\t" + + "vmovups %%ymm8, (%3,%0,4) \n\t" // 8 * y + "vmovups %%ymm9, 32(%3,%0,4) \n\t" // 8 * y + + "addq $16, %0 \n\t" + "subq $16, %1 \n\t" "jnz .L01LOOP%= \n\t" ".L16END%=: \n\t" @@ -156,113 +283,11 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO "r" (ap[1]), // 5 "r" (ap[2]), // 6 "r" (ap[3]), // 7 - "r" (lda4) // 8 - : "cc", - "%xmm0", "%xmm1", - "%xmm2", "%xmm3", - "%xmm4", "%xmm5", - "%xmm12", "%xmm13", "%xmm14", "%xmm15", - "memory" - ); - -} - - - -#define HAVE_KERNEL_4x4 1 -static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) __attribute__ ((noinline)); - -static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) -{ - - BLASLONG register i = 0; - - __asm__ __volatile__ - ( - "vzeroupper \n\t" - "vbroadcastss (%2), %%ymm12 \n\t" // x0 - "vbroadcastss 4(%2), %%ymm13 \n\t" // x1 - "vbroadcastss 8(%2), %%ymm14 \n\t" // x2 - "vbroadcastss 12(%2), %%ymm15 \n\t" // x3 - - "testq $0x04, %1 \n\t" - "jz .L08LABEL%= \n\t" - - "vmovups (%3,%0,4), %%xmm4 \n\t" // 4 * y - - "vfmadd231ps (%4,%0,4), %%xmm12, %%xmm4 
\n\t" - "vfmadd231ps (%5,%0,4), %%xmm13, %%xmm4 \n\t" - "vfmadd231ps (%6,%0,4), %%xmm14, %%xmm4 \n\t" - "vfmadd231ps (%7,%0,4), %%xmm15, %%xmm4 \n\t" - - "vmovups %%xmm4, (%3,%0,4) \n\t" // 4 * y - - "addq $4 , %0 \n\t" - "subq $4 , %1 \n\t" - - ".L08LABEL%=: \n\t" - - "testq $0x08, %1 \n\t" - "jz .L16LABEL%= \n\t" - - "vmovups (%3,%0,4), %%ymm4 \n\t" // 8 * y - - "vfmadd231ps (%4,%0,4), %%ymm12, %%ymm4 \n\t" - "vfmadd231ps (%5,%0,4), %%ymm13, %%ymm4 \n\t" - "vfmadd231ps (%6,%0,4), %%ymm14, %%ymm4 \n\t" - "vfmadd231ps (%7,%0,4), %%ymm15, %%ymm4 \n\t" - - "vmovups %%ymm4, (%3,%0,4) \n\t" // 8 * y - - "addq $8 , %0 \n\t" - "subq $8 , %1 \n\t" - - ".L16LABEL%=: \n\t" - - "cmpq $0, %1 \n\t" - "je .L16END%= \n\t" - - - ".align 16 \n\t" - ".L01LOOP%=: \n\t" - "vmovups (%3,%0,4), %%ymm4 \n\t" // 8 * y - "vmovups 32(%3,%0,4), %%ymm5 \n\t" // 8 * y - - "prefetcht0 192(%4,%0,4) \n\t" - "vfmadd231ps (%4,%0,4), %%ymm12, %%ymm4 \n\t" - "vfmadd231ps 32(%4,%0,4), %%ymm12, %%ymm5 \n\t" - "prefetcht0 192(%5,%0,4) \n\t" - "vfmadd231ps (%5,%0,4), %%ymm13, %%ymm4 \n\t" - "vfmadd231ps 32(%5,%0,4), %%ymm13, %%ymm5 \n\t" - "prefetcht0 192(%6,%0,4) \n\t" - "vfmadd231ps (%6,%0,4), %%ymm14, %%ymm4 \n\t" - "vfmadd231ps 32(%6,%0,4), %%ymm14, %%ymm5 \n\t" - "prefetcht0 192(%7,%0,4) \n\t" - "vfmadd231ps (%7,%0,4), %%ymm15, %%ymm4 \n\t" - "vfmadd231ps 32(%7,%0,4), %%ymm15, %%ymm5 \n\t" - - "vmovups %%ymm4, (%3,%0,4) \n\t" // 8 * y - "vmovups %%ymm5, 32(%3,%0,4) \n\t" // 8 * y - - "addq $16, %0 \n\t" - "subq $16, %1 \n\t" - "jnz .L01LOOP%= \n\t" - - ".L16END%=: \n\t" - "vzeroupper \n\t" - - : - : - "r" (i), // 0 - "r" (n), // 1 - "r" (x), // 2 - "r" (y), // 3 - "r" (ap[0]), // 4 - "r" (ap[1]), // 5 - "r" (ap[2]), // 6 - "r" (ap[3]) // 7 + "r" (alpha) // 8 : "cc", "%xmm4", "%xmm5", + "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" ); diff --git a/kernel/x86_64/sgemv_t_microk_haswell-4.c b/kernel/x86_64/sgemv_t_microk_haswell-4.c new file mode 100644 index 
000000000..016cb35e7 --- /dev/null +++ b/kernel/x86_64/sgemv_t_microk_haswell-4.c @@ -0,0 +1,148 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#define HAVE_KERNEL_4x4 1 +static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) __attribute__ ((noinline)); + +static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) +{ + + BLASLONG register i = 0; + + __asm__ __volatile__ + ( + "vzeroupper \n\t" + "vxorps %%ymm4 , %%ymm4, %%ymm4 \n\t" + "vxorps %%ymm5 , %%ymm5, %%ymm5 \n\t" + "vxorps %%ymm6 , %%ymm6, %%ymm6 \n\t" + "vxorps %%ymm7 , %%ymm7, %%ymm7 \n\t" + + "testq $0x04, %1 \n\t" + "jz .L08LABEL%= \n\t" + + "vmovups (%2,%0,4), %%xmm12 \n\t" // 4 * x + + "vfmadd231ps (%4,%0,4), %%xmm12, %%xmm4 \n\t" + "vfmadd231ps (%5,%0,4), %%xmm12, %%xmm5 \n\t" + "vfmadd231ps (%6,%0,4), %%xmm12, %%xmm6 \n\t" + "vfmadd231ps (%7,%0,4), %%xmm12, %%xmm7 \n\t" + + "addq $4 , %0 \n\t" + "subq $4 , %1 \n\t" + + ".L08LABEL%=: \n\t" + + "testq $0x08, %1 \n\t" + "jz .L16LABEL%= \n\t" + + "vmovups (%2,%0,4), %%ymm12 \n\t" // 8 * x + + "vfmadd231ps (%4,%0,4), %%ymm12, %%ymm4 \n\t" + "vfmadd231ps (%5,%0,4), %%ymm12, %%ymm5 \n\t" + "vfmadd231ps (%6,%0,4), %%ymm12, %%ymm6 \n\t" + "vfmadd231ps (%7,%0,4), %%ymm12, %%ymm7 \n\t" + + "addq $8 , %0 \n\t" + "subq $8 , %1 \n\t" + + ".L16LABEL%=: \n\t" + + "cmpq $0, %1 \n\t" + "je .L16END%= \n\t" + + + ".align 16 \n\t" + ".L01LOOP%=: \n\t" + "prefetcht0 384(%2,%0,4) \n\t" + "vmovups (%2,%0,4), %%ymm12 \n\t" // 8 * x + "vmovups 32(%2,%0,4), %%ymm13 \n\t" // 8 * x + + "prefetcht0 384(%4,%0,4) \n\t" + "vfmadd231ps (%4,%0,4), %%ymm12, %%ymm4 \n\t" + "vfmadd231ps (%5,%0,4), %%ymm12, %%ymm5 \n\t" + "prefetcht0 384(%5,%0,4) \n\t" + "vfmadd231ps 32(%4,%0,4), %%ymm13, %%ymm4 \n\t" + "vfmadd231ps 32(%5,%0,4), %%ymm13, %%ymm5 \n\t" + "prefetcht0 384(%6,%0,4) \n\t" + "vfmadd231ps (%6,%0,4), %%ymm12, %%ymm6 \n\t" + "vfmadd231ps (%7,%0,4), %%ymm12, %%ymm7 \n\t" + "prefetcht0 384(%7,%0,4) \n\t" + "vfmadd231ps 32(%6,%0,4), %%ymm13, %%ymm6 \n\t" + "vfmadd231ps 32(%7,%0,4), %%ymm13, %%ymm7 \n\t" + + 
"addq $16, %0 \n\t" + "subq $16, %1 \n\t" + "jnz .L01LOOP%= \n\t" + + ".L16END%=: \n\t" + + "vextractf128 $1 , %%ymm4, %%xmm12 \n\t" + "vextractf128 $1 , %%ymm5, %%xmm13 \n\t" + "vextractf128 $1 , %%ymm6, %%xmm14 \n\t" + "vextractf128 $1 , %%ymm7, %%xmm15 \n\t" + + "vaddps %%xmm4, %%xmm12, %%xmm4 \n\t" + "vaddps %%xmm5, %%xmm13, %%xmm5 \n\t" + "vaddps %%xmm6, %%xmm14, %%xmm6 \n\t" + "vaddps %%xmm7, %%xmm15, %%xmm7 \n\t" + + "vhaddps %%xmm4, %%xmm4, %%xmm4 \n\t" + "vhaddps %%xmm5, %%xmm5, %%xmm5 \n\t" + "vhaddps %%xmm6, %%xmm6, %%xmm6 \n\t" + "vhaddps %%xmm7, %%xmm7, %%xmm7 \n\t" + + "vhaddps %%xmm4, %%xmm4, %%xmm4 \n\t" + "vhaddps %%xmm5, %%xmm5, %%xmm5 \n\t" + "vhaddps %%xmm6, %%xmm6, %%xmm6 \n\t" + "vhaddps %%xmm7, %%xmm7, %%xmm7 \n\t" + + "vmovss %%xmm4, (%3) \n\t" + "vmovss %%xmm5, 4(%3) \n\t" + "vmovss %%xmm6, 8(%3) \n\t" + "vmovss %%xmm7, 12(%3) \n\t" + + "vzeroupper \n\t" + + : + : + "r" (i), // 0 + "r" (n), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (ap[0]), // 4 + "r" (ap[1]), // 5 + "r" (ap[2]), // 6 + "r" (ap[3]) // 7 + : "cc", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + +} + + From 7c0a94ff472bcb01d666ca0bd6975c0b24267680 Mon Sep 17 00:00:00 2001 From: wernsaar Date: Mon, 8 Sep 2014 10:54:33 +0200 Subject: [PATCH 068/119] bugfix in sgemv_n_microk_haswell-4.c --- kernel/x86_64/sgemv_n_microk_haswell-4.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kernel/x86_64/sgemv_n_microk_haswell-4.c b/kernel/x86_64/sgemv_n_microk_haswell-4.c index 1e4498d9e..8f56655a9 100644 --- a/kernel/x86_64/sgemv_n_microk_haswell-4.c +++ b/kernel/x86_64/sgemv_n_microk_haswell-4.c @@ -248,6 +248,8 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT ".align 16 \n\t" ".L01LOOP%=: \n\t" + "vxorps %%ymm4 , %%ymm4, %%ymm4 \n\t" + "vxorps %%ymm5 , %%ymm5, %%ymm5 \n\t" "vmovups (%3,%0,4), %%ymm8 \n\t" // 8 * y "vmovups 32(%3,%0,4), %%ymm9 \n\t" // 8 * y From 
c4d9d4e5f8319a743df17564c4bf1a1a0c3670e2 Mon Sep 17 00:00:00 2001 From: wernsaar Date: Mon, 8 Sep 2014 12:25:16 +0200 Subject: [PATCH 069/119] added haswell optimized kernel --- kernel/x86_64/sgemv_t_4.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kernel/x86_64/sgemv_t_4.c b/kernel/x86_64/sgemv_t_4.c index 3316473af..b0e883252 100644 --- a/kernel/x86_64/sgemv_t_4.c +++ b/kernel/x86_64/sgemv_t_4.c @@ -34,6 +34,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "sgemv_t_microk_bulldozer-4.c" #elif defined(SANDYBRIDGE) #include "sgemv_t_microk_sandy-4.c" +#elif defined(HASWELL) +#include "sgemv_t_microk_haswell-4.c" #endif #define NBMAX 4096 From f511807fc07e4e62f07b4a880d3196b860796bec Mon Sep 17 00:00:00 2001 From: wernsaar Date: Mon, 8 Sep 2014 12:27:32 +0200 Subject: [PATCH 070/119] modified multithreading threshold --- interface/gemv.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/interface/gemv.c b/interface/gemv.c index 64dc641d0..2dd82dce5 100644 --- a/interface/gemv.c +++ b/interface/gemv.c @@ -216,7 +216,7 @@ void CNAME(enum CBLAS_ORDER order, int nthreads_avail = nthreads_max; double MNK = (double) m * (double) n; - if ( MNK <= (96.0 * 24.0 * (double) GEMM_MULTITHREAD_THRESHOLD) ) + if ( MNK <= (24.0 * 24.0 * (double) (GEMM_MULTITHREAD_THRESHOLD*GEMM_MULTITHREAD_THRESHOLD) ) ) nthreads_max = 1; if ( nthreads_max > nthreads_avail ) From 658939faaada12ab40334f986a665d28eef2ef19 Mon Sep 17 00:00:00 2001 From: wernsaar Date: Mon, 8 Sep 2014 15:22:35 +0200 Subject: [PATCH 071/119] optimized dgemv_n kernel for small sizes --- kernel/x86_64/KERNEL.NEHALEM | 2 +- kernel/x86_64/dgemv_n_4.c | 546 +++++++++++++++++++++++ kernel/x86_64/dgemv_n_microk_nehalem-4.c | 265 +++++++++++ 3 files changed, 812 insertions(+), 1 deletion(-) create mode 100644 kernel/x86_64/dgemv_n_4.c create mode 100644 kernel/x86_64/dgemv_n_microk_nehalem-4.c diff --git a/kernel/x86_64/KERNEL.NEHALEM b/kernel/x86_64/KERNEL.NEHALEM 
index 68c741cea..8feef5c31 100644 --- a/kernel/x86_64/KERNEL.NEHALEM +++ b/kernel/x86_64/KERNEL.NEHALEM @@ -11,7 +11,7 @@ SSYMV_L_KERNEL = ssymv_L.c SGEMVNKERNEL = sgemv_n_4.c SGEMVTKERNEL = sgemv_t_4.c -DGEMVNKERNEL = dgemv_n.c +DGEMVNKERNEL = dgemv_n_4.c SGEMMKERNEL = gemm_kernel_4x8_nehalem.S SGEMMINCOPY = gemm_ncopy_4.S diff --git a/kernel/x86_64/dgemv_n_4.c b/kernel/x86_64/dgemv_n_4.c new file mode 100644 index 000000000..249df8009 --- /dev/null +++ b/kernel/x86_64/dgemv_n_4.c @@ -0,0 +1,546 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + + +#include "common.h" + + +#if defined(NEHALEM) +#include "dgemv_n_microk_nehalem-4.c" +#endif + + +#define NBMAX 2048 + +#ifndef HAVE_KERNEL_4x8 + +static void dgemv_kernel_4x8(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, BLASLONG lda4, FLOAT *alpha) +{ + BLASLONG i; + FLOAT *a0,*a1,*a2,*a3; + FLOAT *b0,*b1,*b2,*b3; + FLOAT *x4; + FLOAT x[8]; + a0 = ap[0]; + a1 = ap[1]; + a2 = ap[2]; + a3 = ap[3]; + b0 = a0 + lda4 ; + b1 = a1 + lda4 ; + b2 = a2 + lda4 ; + b3 = a3 + lda4 ; + x4 = x + 4; + + for ( i=0; i<8; i++) + x[i] = xo[i] * *alpha; + + for ( i=0; i< n; i+=4 ) + { + + y[i] += a0[i]*x[0] + a1[i]*x[1] + a2[i]*x[2] + a3[i]*x[3]; + y[i+1] += a0[i+1]*x[0] + a1[i+1]*x[1] + a2[i+1]*x[2] + a3[i+1]*x[3]; + y[i+2] += a0[i+2]*x[0] + a1[i+2]*x[1] + a2[i+2]*x[2] + a3[i+2]*x[3]; + y[i+3] += a0[i+3]*x[0] + a1[i+3]*x[1] + a2[i+3]*x[2] + a3[i+3]*x[3]; + + y[i] += b0[i]*x4[0] + b1[i]*x4[1] + b2[i]*x4[2] + b3[i]*x4[3]; + y[i+1] += b0[i+1]*x4[0] + b1[i+1]*x4[1] + b2[i+1]*x4[2] + b3[i+1]*x4[3]; + y[i+2] += b0[i+2]*x4[0] + b1[i+2]*x4[1] + b2[i+2]*x4[2] + b3[i+2]*x4[3]; + y[i+3] += b0[i+3]*x4[0] + b1[i+3]*x4[1] + b2[i+3]*x4[2] + b3[i+3]*x4[3]; + + } +} + +#endif + + +#ifndef HAVE_KERNEL_4x4 + +static void dgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, FLOAT *alpha) +{ + BLASLONG i; + FLOAT *a0,*a1,*a2,*a3; + FLOAT x[4]; + a0 = ap[0]; + a1 = ap[1]; + a2 = ap[2]; + a3 = ap[3]; + 
+ for ( i=0; i<4; i++) + x[i] = xo[i] * *alpha; + + for ( i=0; i< n; i+=4 ) + { + y[i] += a0[i]*x[0] + a1[i]*x[1] + a2[i]*x[2] + a3[i]*x[3]; + y[i+1] += a0[i+1]*x[0] + a1[i+1]*x[1] + a2[i+1]*x[2] + a3[i+1]*x[3]; + y[i+2] += a0[i+2]*x[0] + a1[i+2]*x[1] + a2[i+2]*x[2] + a3[i+2]*x[3]; + y[i+3] += a0[i+3]*x[0] + a1[i+3]*x[1] + a2[i+3]*x[2] + a3[i+3]*x[3]; + } +} + +#endif + +#ifndef HAVE_KERNEL_4x2 + +static void dgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) __attribute__ ((noinline)); + +static void dgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) +{ + + BLASLONG register i = 0; + + __asm__ __volatile__ + ( + "movsd (%2) , %%xmm12 \n\t" // x0 + "movsd (%6) , %%xmm4 \n\t" // alpha + "movsd 8(%2) , %%xmm13 \n\t" // x1 + "mulsd %%xmm4 , %%xmm12 \n\t" // alpha + "mulsd %%xmm4 , %%xmm13 \n\t" // alpha + "shufpd $0, %%xmm12, %%xmm12 \n\t" + "shufpd $0, %%xmm13, %%xmm13 \n\t" + + ".align 16 \n\t" + ".L01LOOP%=: \n\t" + "movups (%3,%0,8), %%xmm4 \n\t" // 2 * y + "movups 16(%3,%0,8), %%xmm5 \n\t" // 2 * y + + "movups (%4,%0,8), %%xmm8 \n\t" + "movups (%5,%0,8), %%xmm9 \n\t" + "mulpd %%xmm12, %%xmm8 \n\t" + "mulpd %%xmm13, %%xmm9 \n\t" + "addpd %%xmm8 , %%xmm4 \n\t" + "addpd %%xmm9 , %%xmm4 \n\t" + + "movups 16(%4,%0,8), %%xmm8 \n\t" + "movups 16(%5,%0,8), %%xmm9 \n\t" + "mulpd %%xmm12, %%xmm8 \n\t" + "mulpd %%xmm13, %%xmm9 \n\t" + "addpd %%xmm8 , %%xmm5 \n\t" + "addpd %%xmm9 , %%xmm5 \n\t" + + "movups %%xmm4 , (%3,%0,8) \n\t" // 2 * y + "movups %%xmm5 , 16(%3,%0,8) \n\t" // 2 * y + + "addq $4 , %0 \n\t" + "subq $4 , %1 \n\t" + "jnz .L01LOOP%= \n\t" + + : + : + "r" (i), // 0 + "r" (n), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (ap[0]), // 4 + "r" (ap[1]), // 5 + "r" (alpha) // 6 + : "cc", + "%xmm4", "%xmm5", + "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + +} + +#endif + +#ifndef HAVE_KERNEL_4x2 + +static void dgemv_kernel_4x1(BLASLONG n, FLOAT *ap, 
FLOAT *x, FLOAT *y, FLOAT *alpha) __attribute__ ((noinline)); + +static void dgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *alpha) +{ + + BLASLONG register i = 0; + + __asm__ __volatile__ + ( + "movsd (%2), %%xmm12 \n\t" // x0 + "mulsd (%5), %%xmm12 \n\t" // alpha + "shufpd $0, %%xmm12, %%xmm12 \n\t" + + ".align 16 \n\t" + ".L01LOOP%=: \n\t" + "movups (%4,%0,8), %%xmm8 \n\t" // 2 * a + "movups 16(%4,%0,8), %%xmm9 \n\t" // 2 * a + "movups (%3,%0,8), %%xmm4 \n\t" // 2 * y + "movups 16(%3,%0,8), %%xmm5 \n\t" // 2 * y + "mulpd %%xmm12, %%xmm8 \n\t" + "mulpd %%xmm12, %%xmm9 \n\t" + "addpd %%xmm8 , %%xmm4 \n\t" + "addpd %%xmm9 , %%xmm5 \n\t" + + "movups %%xmm4 , (%3,%0,8) \n\t" // 2 * y + "movups %%xmm5 , 16(%3,%0,8) \n\t" // 2 * y + + "addq $4 , %0 \n\t" + "subq $4 , %1 \n\t" + + "jnz .L01LOOP%= \n\t" + + : + : + "r" (i), // 0 + "r" (n), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (ap), // 4 + "r" (alpha) // 5 + : "cc", + "%xmm4", "%xmm5", + "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + +} + +#endif + +static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest) __attribute__ ((noinline)); + +static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest) +{ + BLASLONG i; + if ( inc_dest != 1 ) + { + for ( i=0; i> 3 ; + n2 = n & 7 ; + } + else + { + n1 = n >> 2 ; + n2 = n & 3 ; + + } + + m3 = m & 3 ; + m1 = m & -4 ; + m2 = (m & (NBMAX-1)) - m3 ; + + + y_ptr = y; + + BLASLONG NB = NBMAX; + + while ( NB == NBMAX ) + { + + m1 -= NB; + if ( m1 < 0) + { + if ( m2 == 0 ) break; + NB = m2; + } + + a_ptr = a; + x_ptr = x; + + ap[0] = a_ptr; + ap[1] = a_ptr + lda; + ap[2] = ap[1] + lda; + ap[3] = ap[2] + lda; + + if ( inc_y != 1 ) + memset(ybuffer,0,NB*8); + else + ybuffer = y_ptr; + + if ( inc_x == 1 ) + { + + + for( i = 0; i < n1 ; i++) + { + dgemv_kernel_4x8(NB,ap,x_ptr,ybuffer,lda4,&alpha); + ap[0] += lda8; + ap[1] += lda8; + ap[2] += lda8; + ap[3] += lda8; + 
a_ptr += lda8; + x_ptr += 8; + } + + + if ( n2 & 4 ) + { + dgemv_kernel_4x4(NB,ap,x_ptr,ybuffer,&alpha); + ap[0] += lda4; + ap[1] += lda4; + a_ptr += lda4; + x_ptr += 4; + } + + if ( n2 & 2 ) + { + dgemv_kernel_4x2(NB,ap,x_ptr,ybuffer,&alpha); + a_ptr += lda*2; + x_ptr += 2; + } + + + if ( n2 & 1 ) + { + dgemv_kernel_4x1(NB,a_ptr,x_ptr,ybuffer,&alpha); + a_ptr += lda; + x_ptr += 1; + + } + + + } + else + { + + for( i = 0; i < n1 ; i++) + { + xbuffer[0] = x_ptr[0]; + x_ptr += inc_x; + xbuffer[1] = x_ptr[0]; + x_ptr += inc_x; + xbuffer[2] = x_ptr[0]; + x_ptr += inc_x; + xbuffer[3] = x_ptr[0]; + x_ptr += inc_x; + dgemv_kernel_4x4(NB,ap,xbuffer,ybuffer,&alpha); + ap[0] += lda4; + ap[1] += lda4; + ap[2] += lda4; + ap[3] += lda4; + a_ptr += lda4; + } + + for( i = 0; i < n2 ; i++) + { + xbuffer[0] = x_ptr[0]; + x_ptr += inc_x; + dgemv_kernel_4x1(NB,a_ptr,xbuffer,ybuffer,&alpha); + a_ptr += lda; + + } + + } + + a += NB; + if ( inc_y != 1 ) + { + add_y(NB,ybuffer,y_ptr,inc_y); + y_ptr += NB * inc_y; + } + else + y_ptr += NB ; + + } + + if ( m3 == 0 ) return(0); + + if ( m3 == 3 ) + { + a_ptr = a; + x_ptr = x; + FLOAT temp0 = 0.0; + FLOAT temp1 = 0.0; + FLOAT temp2 = 0.0; + if ( lda == 3 && inc_x ==1 ) + { + + for( i = 0; i < ( n & -4 ); i+=4 ) + { + + temp0 += a_ptr[0] * x_ptr[0] + a_ptr[3] * x_ptr[1]; + temp1 += a_ptr[1] * x_ptr[0] + a_ptr[4] * x_ptr[1]; + temp2 += a_ptr[2] * x_ptr[0] + a_ptr[5] * x_ptr[1]; + + temp0 += a_ptr[6] * x_ptr[2] + a_ptr[9] * x_ptr[3]; + temp1 += a_ptr[7] * x_ptr[2] + a_ptr[10] * x_ptr[3]; + temp2 += a_ptr[8] * x_ptr[2] + a_ptr[11] * x_ptr[3]; + + a_ptr += 12; + x_ptr += 4; + } + + for( ; i < n; i++ ) + { + temp0 += a_ptr[0] * x_ptr[0]; + temp1 += a_ptr[1] * x_ptr[0]; + temp2 += a_ptr[2] * x_ptr[0]; + a_ptr += 3; + x_ptr ++; + } + + } + else + { + + for( i = 0; i < n; i++ ) + { + temp0 += a_ptr[0] * x_ptr[0]; + temp1 += a_ptr[1] * x_ptr[0]; + temp2 += a_ptr[2] * x_ptr[0]; + a_ptr += lda; + x_ptr += inc_x; + + + } + + } + y_ptr[0] += alpha * 
temp0; + y_ptr += inc_y; + y_ptr[0] += alpha * temp1; + y_ptr += inc_y; + y_ptr[0] += alpha * temp2; + return(0); + } + + + if ( m3 == 2 ) + { + a_ptr = a; + x_ptr = x; + FLOAT temp0 = 0.0; + FLOAT temp1 = 0.0; + if ( lda == 2 && inc_x ==1 ) + { + + for( i = 0; i < (n & -4) ; i+=4 ) + { + temp0 += a_ptr[0] * x_ptr[0] + a_ptr[2] * x_ptr[1]; + temp1 += a_ptr[1] * x_ptr[0] + a_ptr[3] * x_ptr[1]; + temp0 += a_ptr[4] * x_ptr[2] + a_ptr[6] * x_ptr[3]; + temp1 += a_ptr[5] * x_ptr[2] + a_ptr[7] * x_ptr[3]; + a_ptr += 8; + x_ptr += 4; + + } + + + for( ; i < n; i++ ) + { + temp0 += a_ptr[0] * x_ptr[0]; + temp1 += a_ptr[1] * x_ptr[0]; + a_ptr += 2; + x_ptr ++; + } + + } + else + { + + for( i = 0; i < n; i++ ) + { + temp0 += a_ptr[0] * x_ptr[0]; + temp1 += a_ptr[1] * x_ptr[0]; + a_ptr += lda; + x_ptr += inc_x; + + + } + + } + y_ptr[0] += alpha * temp0; + y_ptr += inc_y; + y_ptr[0] += alpha * temp1; + return(0); + } + + if ( m3 == 1 ) + { + a_ptr = a; + x_ptr = x; + FLOAT temp = 0.0; + if ( lda == 1 && inc_x ==1 ) + { + + for( i = 0; i < (n & -4); i+=4 ) + { + temp += a_ptr[i] * x_ptr[i] + a_ptr[i+1] * x_ptr[i+1] + a_ptr[i+2] * x_ptr[i+2] + a_ptr[i+3] * x_ptr[i+3]; + + } + + for( ; i < n; i++ ) + { + temp += a_ptr[i] * x_ptr[i]; + } + + } + else + { + + for( i = 0; i < n; i++ ) + { + temp += a_ptr[0] * x_ptr[0]; + a_ptr += lda; + x_ptr += inc_x; + } + + } + y_ptr[0] += alpha * temp; + return(0); + } + + + return(0); +} + + diff --git a/kernel/x86_64/dgemv_n_microk_nehalem-4.c b/kernel/x86_64/dgemv_n_microk_nehalem-4.c new file mode 100644 index 000000000..e311326f1 --- /dev/null +++ b/kernel/x86_64/dgemv_n_microk_nehalem-4.c @@ -0,0 +1,265 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. 
Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + + + +#define HAVE_KERNEL_4x8 1 +static void dgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLONG lda4, FLOAT *alpha) __attribute__ ((noinline)); + +static void dgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLONG lda4, FLOAT *alpha) +{ + + BLASLONG register i = 0; + + __asm__ __volatile__ + ( + "movsd (%2), %%xmm12 \n\t" // x0 + "movsd 8(%2), %%xmm13 \n\t" // x1 + "movsd 16(%2), %%xmm14 \n\t" // x2 + "movsd 24(%2), %%xmm15 \n\t" // x3 + "shufpd $0, %%xmm12, %%xmm12\n\t" + "shufpd $0, %%xmm13, %%xmm13\n\t" + "shufpd $0, %%xmm14, %%xmm14\n\t" + "shufpd $0, %%xmm15, %%xmm15\n\t" + + "movsd 32(%2), %%xmm0 \n\t" // x4 + "movsd 40(%2), %%xmm1 \n\t" // x5 + "movsd 48(%2), %%xmm2 \n\t" // x6 + "movsd 56(%2), %%xmm3 \n\t" // x7 + "shufpd $0, %%xmm0 , %%xmm0 \n\t" + "shufpd $0, %%xmm1 , %%xmm1 \n\t" + "shufpd $0, %%xmm2 , %%xmm2 \n\t" + "shufpd $0, %%xmm3 , %%xmm3 \n\t" + + "movsd (%9), %%xmm6 \n\t" // alpha + "shufpd $0, %%xmm6 , %%xmm6 \n\t" + + + ".align 16 \n\t" + ".L01LOOP%=: \n\t" + "xorpd %%xmm4 , %%xmm4 \n\t" + "xorpd %%xmm5 , %%xmm5 \n\t" + "movups (%3,%0,8), %%xmm7 \n\t" // 2 * y + + ".align 2 \n\t" + "movups (%4,%0,8), %%xmm8 \n\t" + "movups (%5,%0,8), %%xmm9 \n\t" + "movups (%6,%0,8), %%xmm10 \n\t" + "movups (%7,%0,8), %%xmm11 \n\t" + ".align 2 \n\t" + "mulpd %%xmm12, %%xmm8 \n\t" + "mulpd %%xmm13, %%xmm9 \n\t" + "mulpd %%xmm14, %%xmm10 \n\t" + "mulpd %%xmm15, %%xmm11 \n\t" + "addpd %%xmm8 , %%xmm4 \n\t" + "addpd %%xmm9 , %%xmm5 \n\t" + "addpd %%xmm10, %%xmm4 \n\t" + "addpd %%xmm11, %%xmm5 \n\t" + + "movups (%4,%8,8), %%xmm8 \n\t" + "movups (%5,%8,8), %%xmm9 \n\t" + "movups (%6,%8,8), %%xmm10 \n\t" + "movups (%7,%8,8), %%xmm11 \n\t" + ".align 2 \n\t" + "mulpd %%xmm0 , %%xmm8 \n\t" + "mulpd %%xmm1 , %%xmm9 \n\t" + "mulpd %%xmm2 , %%xmm10 \n\t" + "mulpd %%xmm3 , %%xmm11 \n\t" + "addpd %%xmm8 , %%xmm4 \n\t" + "addpd %%xmm9 , %%xmm5 \n\t" + "addpd 
%%xmm10, %%xmm4 \n\t" + "addpd %%xmm11, %%xmm5 \n\t" + + "addpd %%xmm5 , %%xmm4 \n\t" + "mulpd %%xmm6 , %%xmm4 \n\t" + "addpd %%xmm4 , %%xmm7 \n\t" + + "movups %%xmm7 , (%3,%0,8) \n\t" // 2 * y + + "xorpd %%xmm4 , %%xmm4 \n\t" + "xorpd %%xmm5 , %%xmm5 \n\t" + "movups 16(%3,%0,8), %%xmm7 \n\t" // 2 * y + + ".align 2 \n\t" + "movups 16(%4,%0,8), %%xmm8 \n\t" + "movups 16(%5,%0,8), %%xmm9 \n\t" + "movups 16(%6,%0,8), %%xmm10 \n\t" + "movups 16(%7,%0,8), %%xmm11 \n\t" + ".align 2 \n\t" + "mulpd %%xmm12, %%xmm8 \n\t" + "mulpd %%xmm13, %%xmm9 \n\t" + "mulpd %%xmm14, %%xmm10 \n\t" + "mulpd %%xmm15, %%xmm11 \n\t" + "addpd %%xmm8 , %%xmm4 \n\t" + "addpd %%xmm9 , %%xmm5 \n\t" + "addpd %%xmm10, %%xmm4 \n\t" + "addpd %%xmm11, %%xmm5 \n\t" + + "movups 16(%4,%8,8), %%xmm8 \n\t" + "movups 16(%5,%8,8), %%xmm9 \n\t" + "movups 16(%6,%8,8), %%xmm10 \n\t" + "movups 16(%7,%8,8), %%xmm11 \n\t" + ".align 2 \n\t" + "mulpd %%xmm0 , %%xmm8 \n\t" + "mulpd %%xmm1 , %%xmm9 \n\t" + "mulpd %%xmm2 , %%xmm10 \n\t" + "mulpd %%xmm3 , %%xmm11 \n\t" + "addpd %%xmm8 , %%xmm4 \n\t" + "addpd %%xmm9 , %%xmm5 \n\t" + "addpd %%xmm10, %%xmm4 \n\t" + "addpd %%xmm11, %%xmm5 \n\t" + + "addq $4 , %8 \n\t" + "addpd %%xmm5 , %%xmm4 \n\t" + "mulpd %%xmm6 , %%xmm4 \n\t" + "addpd %%xmm4 , %%xmm7 \n\t" + + "movups %%xmm7 , 16(%3,%0,8) \n\t" // 2 * y + + "addq $4 , %0 \n\t" + "subq $4 , %1 \n\t" + "jnz .L01LOOP%= \n\t" + + : + : + "r" (i), // 0 + "r" (n), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (ap[0]), // 4 + "r" (ap[1]), // 5 + "r" (ap[2]), // 6 + "r" (ap[3]), // 7 + "r" (lda4), // 8 + "r" (alpha) // 9 + : "cc", + "%xmm0", "%xmm1", + "%xmm2", "%xmm3", + "%xmm4", "%xmm5", + "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + +} + + + + +#define HAVE_KERNEL_4x4 1 +static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) __attribute__ ((noinline)); + +static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT 
*y, FLOAT *alpha) +{ + + BLASLONG register i = 0; + + __asm__ __volatile__ + ( + "movsd (%2), %%xmm12 \n\t" // x0 + "movsd 8(%2), %%xmm13 \n\t" // x1 + "movsd 16(%2), %%xmm14 \n\t" // x2 + "movsd 24(%2), %%xmm15 \n\t" // x3 + "shufpd $0, %%xmm12, %%xmm12\n\t" + "shufpd $0, %%xmm13, %%xmm13\n\t" + "shufpd $0, %%xmm14, %%xmm14\n\t" + "shufpd $0, %%xmm15, %%xmm15\n\t" + + "movsd (%8), %%xmm6 \n\t" // alpha + "shufpd $0, %%xmm6 , %%xmm6 \n\t" + + ".align 16 \n\t" + ".L01LOOP%=: \n\t" + "xorpd %%xmm4 , %%xmm4 \n\t" + "xorpd %%xmm5 , %%xmm5 \n\t" + "movups (%3,%0,8), %%xmm7 \n\t" // 2 * y + + "movups (%4,%0,8), %%xmm8 \n\t" + "movups (%5,%0,8), %%xmm9 \n\t" + "movups (%6,%0,8), %%xmm10 \n\t" + "movups (%7,%0,8), %%xmm11 \n\t" + "mulpd %%xmm12, %%xmm8 \n\t" + "mulpd %%xmm13, %%xmm9 \n\t" + "mulpd %%xmm14, %%xmm10 \n\t" + "mulpd %%xmm15, %%xmm11 \n\t" + "addpd %%xmm8 , %%xmm4 \n\t" + "addpd %%xmm9 , %%xmm4 \n\t" + "addpd %%xmm10 , %%xmm4 \n\t" + "addpd %%xmm4 , %%xmm11 \n\t" + + "mulpd %%xmm6 , %%xmm11 \n\t" + "addpd %%xmm7 , %%xmm11 \n\t" + "movups %%xmm11, (%3,%0,8) \n\t" // 2 * y + + "xorpd %%xmm4 , %%xmm4 \n\t" + "xorpd %%xmm5 , %%xmm5 \n\t" + "movups 16(%3,%0,8), %%xmm7 \n\t" // 2 * y + + "movups 16(%4,%0,8), %%xmm8 \n\t" + "movups 16(%5,%0,8), %%xmm9 \n\t" + "movups 16(%6,%0,8), %%xmm10 \n\t" + "movups 16(%7,%0,8), %%xmm11 \n\t" + "mulpd %%xmm12, %%xmm8 \n\t" + "mulpd %%xmm13, %%xmm9 \n\t" + "mulpd %%xmm14, %%xmm10 \n\t" + "mulpd %%xmm15, %%xmm11 \n\t" + "addpd %%xmm8 , %%xmm4 \n\t" + "addpd %%xmm9 , %%xmm4 \n\t" + "addpd %%xmm10 , %%xmm4 \n\t" + "addpd %%xmm4 , %%xmm11 \n\t" + + "mulpd %%xmm6 , %%xmm11 \n\t" + "addpd %%xmm7 , %%xmm11 \n\t" + "movups %%xmm11, 16(%3,%0,8) \n\t" // 2 * y + + "addq $4 , %0 \n\t" + "subq $4 , %1 \n\t" + "jnz .L01LOOP%= \n\t" + + : + : + "r" (i), // 0 + "r" (n), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (ap[0]), // 4 + "r" (ap[1]), // 5 + "r" (ap[2]), // 6 + "r" (ap[3]), // 7 + "r" (alpha) // 8 + : "cc", + "%xmm4", "%xmm5", + "%xmm6", 
"%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + +} + + From 7794766d3cce1011cc95120bcd8579f4fd62e193 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20K=C3=B6hler?= Date: Mon, 8 Sep 2014 17:57:44 +0200 Subject: [PATCH 072/119] Add cblas_(s/d/c/z)omatcopy in order to have cblas interface for them. --- cblas.h | 10 ++++++++++ cblas_noconst.h | 8 ++++++++ exports/gensymbol | 3 ++- interface/Makefile | 22 ++++++++++++++++++---- interface/omatcopy.c | 21 +++++++++++++++++++++ interface/zomatcopy.c | 21 +++++++++++++++++++++ 6 files changed, 80 insertions(+), 5 deletions(-) diff --git a/cblas.h b/cblas.h index ef072e6ff..0cba45771 100644 --- a/cblas.h +++ b/cblas.h @@ -318,6 +318,16 @@ void cblas_caxpby(OPENBLAS_CONST blasint n, OPENBLAS_CONST float *alpha, OPENBLA void cblas_zaxpby(OPENBLAS_CONST blasint n, OPENBLAS_CONST double *alpha, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx,OPENBLAS_CONST double *beta, double *y, OPENBLAS_CONST blasint incy); +void cblas_somatcopy(OPENBLAS_CONST enum CBLAS_ORDER CORDER, OPENBLAS_CONST enum CBLAS_TRANSPOSE CTRANS, OPENBLAS_CONST blasint crows, OPENBLAS_CONST blasint ccols, OPENBLAS_CONST float calpha, OPENBLAS_CONST float *a, + OPENBLAS_CONST blasint clda, float *b, OPENBLAS_CONST blasint cldb); +void cblas_domatcopy(OPENBLAS_CONST enum CBLAS_ORDER CORDER, OPENBLAS_CONST enum CBLAS_TRANSPOSE CTRANS, OPENBLAS_CONST blasint crows, OPENBLAS_CONST blasint ccols, OPENBLAS_CONST double calpha, OPENBLAS_CONST double *a, + OPENBLAS_CONST blasint clda, double *b, OPENBLAS_CONST blasint cldb); +void cblas_comatcopy(OPENBLAS_CONST enum CBLAS_ORDER CORDER, OPENBLAS_CONST enum CBLAS_TRANSPOSE CTRANS, OPENBLAS_CONST blasint crows, OPENBLAS_CONST blasint ccols, OPENBLAS_CONST float* calpha, OPENBLAS_CONST float* a, + OPENBLAS_CONST blasint clda, float*b, OPENBLAS_CONST blasint cldb); +void cblas_zomatcopy(OPENBLAS_CONST enum CBLAS_ORDER CORDER, OPENBLAS_CONST enum 
CBLAS_TRANSPOSE CTRANS, OPENBLAS_CONST blasint crows, OPENBLAS_CONST blasint ccols, OPENBLAS_CONST double* calpha, OPENBLAS_CONST double* a, + OPENBLAS_CONST blasint clda, double *b, OPENBLAS_CONST blasint cldb); + + #ifdef __cplusplus } #endif /* __cplusplus */ diff --git a/cblas_noconst.h b/cblas_noconst.h index 1f79e8188..1592abc7f 100644 --- a/cblas_noconst.h +++ b/cblas_noconst.h @@ -306,6 +306,14 @@ void cblas_caxpby(blasint n, float *alpha, float *x, blasint incx,float *beta, f void cblas_zaxpby(blasint n, double *alpha, double *x, blasint incx,double *beta, double *y, blasint incy); +void cblas_somatcopy( enum CBLAS_ORDER CORDER, enum CBLAS_TRANSPOSE CTRANS, blasint crows, blasint ccols, float calpha, float *a, + blasint clda, float *b, blasint cldb); +void cblas_domatcopy( enum CBLAS_ORDER CORDER, enum CBLAS_TRANSPOSE CTRANS, blasint crows, blasint ccols, double calpha, double *a, + blasint clda, double *b, blasint cldb); +void cblas_comatcopy( enum CBLAS_ORDER CORDER, enum CBLAS_TRANSPOSE CTRANS, blasint crows, blasint ccols, void* calpha, void* a, + blasint clda, void *b, blasint cldb); +void cblas_zomatcopy( enum CBLAS_ORDER CORDER, enum CBLAS_TRANSPOSE CTRANS, blasint crows, blasint ccols, void* calpha, void* a, + blasint clda, void *b, blasint cldb); #ifdef __cplusplus } diff --git a/exports/gensymbol b/exports/gensymbol index 0769ae0f3..17d8a2b9c 100644 --- a/exports/gensymbol +++ b/exports/gensymbol @@ -52,7 +52,8 @@ cblas_zhpr, cblas_zscal, cblas_zswap, cblas_zsymm, cblas_zsyr2k, cblas_zsyrk, cblas_ztbmv, cblas_ztbsv, cblas_ztpmv, cblas_ztpsv, cblas_ztrmm, cblas_ztrmv, cblas_ztrsm, cblas_ztrsv, cblas_cdotc_sub, cblas_cdotu_sub, cblas_zdotc_sub, cblas_zdotu_sub, - cblas_saxpby,cblas_daxpby,cblas_caxpby,cblas_zaxpby + cblas_saxpby,cblas_daxpby,cblas_caxpby,cblas_zaxpby, + cblas_somatcopy, cblas_domatcopy, cblas_comatcopy, cblas_zomatcopy ); @exblasobjs = ( diff --git a/interface/Makefile b/interface/Makefile index 6992248ba..e4822352f 100644 --- 
a/interface/Makefile +++ b/interface/Makefile @@ -267,7 +267,7 @@ CSBLAS2OBJS = \ CSBLAS3OBJS = \ cblas_sgemm.$(SUFFIX) cblas_ssymm.$(SUFFIX) cblas_strmm.$(SUFFIX) cblas_strsm.$(SUFFIX) \ - cblas_ssyrk.$(SUFFIX) cblas_ssyr2k.$(SUFFIX) + cblas_ssyrk.$(SUFFIX) cblas_ssyr2k.$(SUFFIX) cblas_somatcopy.$(SUFFIX) CDBLAS1OBJS = \ cblas_idamax.$(SUFFIX) cblas_dasum.$(SUFFIX) cblas_daxpy.$(SUFFIX) \ @@ -283,7 +283,7 @@ CDBLAS2OBJS = \ CDBLAS3OBJS += \ cblas_dgemm.$(SUFFIX) cblas_dsymm.$(SUFFIX) cblas_dtrmm.$(SUFFIX) cblas_dtrsm.$(SUFFIX) \ - cblas_dsyrk.$(SUFFIX) cblas_dsyr2k.$(SUFFIX) + cblas_dsyrk.$(SUFFIX) cblas_dsyr2k.$(SUFFIX) cblas_domatcopy.$(SUFFIX) CCBLAS1OBJS = \ cblas_icamax.$(SUFFIX) cblas_scasum.$(SUFFIX) cblas_caxpy.$(SUFFIX) \ @@ -305,7 +305,8 @@ CCBLAS2OBJS = \ CCBLAS3OBJS = \ cblas_cgemm.$(SUFFIX) cblas_csymm.$(SUFFIX) cblas_ctrmm.$(SUFFIX) cblas_ctrsm.$(SUFFIX) \ cblas_csyrk.$(SUFFIX) cblas_csyr2k.$(SUFFIX) \ - cblas_chemm.$(SUFFIX) cblas_cherk.$(SUFFIX) cblas_cher2k.$(SUFFIX) + cblas_chemm.$(SUFFIX) cblas_cherk.$(SUFFIX) cblas_cher2k.$(SUFFIX) \ + cblas_comatcopy.$(SUFFIX) CZBLAS1OBJS = \ cblas_izamax.$(SUFFIX) cblas_dzasum.$(SUFFIX) cblas_zaxpy.$(SUFFIX) \ @@ -327,7 +328,8 @@ CZBLAS2OBJS = \ CZBLAS3OBJS = \ cblas_zgemm.$(SUFFIX) cblas_zsymm.$(SUFFIX) cblas_ztrmm.$(SUFFIX) cblas_ztrsm.$(SUFFIX) \ cblas_zsyrk.$(SUFFIX) cblas_zsyr2k.$(SUFFIX) \ - cblas_zhemm.$(SUFFIX) cblas_zherk.$(SUFFIX) cblas_zher2k.$(SUFFIX) + cblas_zhemm.$(SUFFIX) cblas_zherk.$(SUFFIX) cblas_zher2k.$(SUFFIX)\ + cblas_zomatcopy.$(SUFFIX) ifndef NO_CBLAS @@ -2035,15 +2037,27 @@ cblas_caxpby.$(SUFFIX) cblas_caxpby.$(PSUFFIX) : zaxpby.c domatcopy.$(SUFFIX) domatcopy.$(PSUFFIX) : omatcopy.c $(CC) -c $(CFLAGS) $< -o $(@F) +cblas_domatcopy.$(SUFFIX) cblas_domatcopy.$(PSUFFIX) : omatcopy.c + $(CC) -c $(CFLAGS) -DCBLAS $< -o $(@F) + somatcopy.$(SUFFIX) somatcopy.$(PSUFFIX) : omatcopy.c $(CC) -c $(CFLAGS) $< -o $(@F) +cblas_somatcopy.$(SUFFIX) cblas_somatcopy.$(PSUFFIX) : omatcopy.c + $(CC) -c 
$(CFLAGS) -DCBLAS $< -o $(@F) + comatcopy.$(SUFFIX) comatcopy.$(PSUFFIX) : zomatcopy.c $(CC) -c $(CFLAGS) $< -o $(@F) +cblas_comatcopy.$(SUFFIX) cblas_comatcopy.$(PSUFFIX) : zomatcopy.c + $(CC) -c $(CFLAGS) -DCBLAS $< -o $(@F) + zomatcopy.$(SUFFIX) zomatcopy.$(PSUFFIX) : zomatcopy.c $(CC) -c $(CFLAGS) $< -o $(@F) +cblas_zomatcopy.$(SUFFIX) cblas_zomatcopy.$(PSUFFIX) : zomatcopy.c + $(CC) -c $(CFLAGS) -DCBLAS $< -o $(@F) + dimatcopy.$(SUFFIX) dimatcopy.$(PSUFFIX) : imatcopy.c $(CC) -c $(CFLAGS) $< -o $(@F) diff --git a/interface/omatcopy.c b/interface/omatcopy.c index 0c418b3c9..59650cfa0 100644 --- a/interface/omatcopy.c +++ b/interface/omatcopy.c @@ -47,6 +47,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define BlasNoTrans 0 #define BlasTrans 1 +#ifndef CBLAS void NAME( char* ORDER, char* TRANS, blasint *rows, blasint *cols, FLOAT *alpha, FLOAT *a, blasint *lda, FLOAT *b, blasint *ldb) { @@ -66,7 +67,27 @@ void NAME( char* ORDER, char* TRANS, blasint *rows, blasint *cols, FLOAT *alpha, if ( Trans == 'R' ) trans = BlasNoTrans; if ( Trans == 'T' ) trans = BlasTrans; if ( Trans == 'C' ) trans = BlasTrans; +#else +void CNAME(enum CBLAS_ORDER CORDER, enum CBLAS_TRANSPOSE CTRANS, blasint crows, blasint ccols, FLOAT calpha, FLOAT *a, blasint clda, FLOAT *b, blasint cldb) +{ + blasint *rows, *cols, *lda, *ldb; + FLOAT *alpha; + int order=-1,trans=-1; + blasint info = -1; + if ( CORDER == CblasColMajor ) order = BlasColMajor; + if ( CORDER == CblasRowMajor ) order = BlasRowMajor; + + if ( CTRANS == CblasNoTrans || CTRANS == CblasConjNoTrans ) trans = BlasNoTrans; + if ( CTRANS == CblasTrans || CTRANS == CblasConjTrans ) trans = BlasTrans; + + rows = &crows; + cols = &ccols; + lda = &clda; + ldb = &cldb; + alpha = &calpha; + +#endif if ( order == BlasColMajor) { if ( trans == BlasNoTrans && *ldb < *rows ) info = 9; diff --git a/interface/zomatcopy.c b/interface/zomatcopy.c index eec4d3c1c..7345633a2 100644 --- a/interface/zomatcopy.c +++ 
b/interface/zomatcopy.c @@ -49,6 +49,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define BlasTransConj 2 #define BlasConj 3 +#ifndef CBLAS void NAME( char* ORDER, char* TRANS, blasint *rows, blasint *cols, FLOAT *alpha, FLOAT *a, blasint *lda, FLOAT *b, blasint *ldb) { @@ -69,6 +70,26 @@ void NAME( char* ORDER, char* TRANS, blasint *rows, blasint *cols, FLOAT *alpha, if ( Trans == 'C' ) trans = BlasTransConj; if ( Trans == 'R' ) trans = BlasConj; +#else +void CNAME(enum CBLAS_ORDER CORDER, enum CBLAS_TRANSPOSE CTRANS, blasint crows, blasint ccols, FLOAT *alpha, FLOAT *a, blasint clda, FLOAT*b, blasint cldb) +{ + blasint *rows, *cols, *lda, *ldb; + int order=-1,trans=-1; + blasint info = -1; + + if ( CORDER == CblasColMajor ) order = BlasColMajor; + if ( CORDER == CblasRowMajor ) order = BlasRowMajor; + + if ( CTRANS == CblasNoTrans) trans = BlasNoTrans; + if ( CTRANS == CblasConjNoTrans ) trans = BlasConj; + if ( CTRANS == CblasTrans) trans = BlasTrans; + if ( CTRANS == CblasConjTrans) trans = BlasTransConj; + + rows = &crows; + cols = &ccols; + lda = &clda; + ldb = &cldb; +#endif if ( order == BlasColMajor) { if ( trans == BlasNoTrans && *ldb < *rows ) info = 9; From cd34e9701b552bff542c335943aec70e159037ba Mon Sep 17 00:00:00 2001 From: wernsaar Date: Mon, 8 Sep 2014 19:15:31 +0200 Subject: [PATCH 073/119] removed obsolete files --- kernel/x86_64/sgemv_n_avx.c | 218 ----------- kernel/x86_64/sgemv_n_microk_bulldozer.c | 451 --------------------- kernel/x86_64/sgemv_n_microk_haswell.c | 461 ---------------------- kernel/x86_64/sgemv_n_microk_sandy.c | 473 ----------------------- kernel/x86_64/sgemv_t_avx.c | 232 ----------- kernel/x86_64/sgemv_t_microk_bulldozer.c | 99 ----- kernel/x86_64/sgemv_t_microk_haswell.c | 100 ----- kernel/x86_64/sgemv_t_microk_sandy.c | 106 ----- 8 files changed, 2140 deletions(-) delete mode 100644 kernel/x86_64/sgemv_n_avx.c delete mode 100644 kernel/x86_64/sgemv_n_microk_bulldozer.c delete mode 100644 
kernel/x86_64/sgemv_n_microk_haswell.c delete mode 100644 kernel/x86_64/sgemv_n_microk_sandy.c delete mode 100644 kernel/x86_64/sgemv_t_avx.c delete mode 100644 kernel/x86_64/sgemv_t_microk_bulldozer.c delete mode 100644 kernel/x86_64/sgemv_t_microk_haswell.c delete mode 100644 kernel/x86_64/sgemv_t_microk_sandy.c diff --git a/kernel/x86_64/sgemv_n_avx.c b/kernel/x86_64/sgemv_n_avx.c deleted file mode 100644 index 57aaad4b4..000000000 --- a/kernel/x86_64/sgemv_n_avx.c +++ /dev/null @@ -1,218 +0,0 @@ -/*************************************************************************** -Copyright (c) 2014, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*****************************************************************************/ - - -#include "common.h" - -#if defined(BULLDOZER) || defined(PILEDRIVER) -#include "sgemv_n_microk_bulldozer.c" -#elif defined(HASWELL) -#include "sgemv_n_microk_haswell.c" -#else -#include "sgemv_n_microk_sandy.c" -#endif - -static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src) -{ - BLASLONG i; - for ( i=0; i 0 ) - { - - if ( inc_x == 1 ) - xbuffer = x_ptr; - else - copy_x(n2,x_ptr,xbuffer,inc_x); - - a_ptr = a + n1 * 512 * lda; - y_ptr = y; - - for(i = 0; i rax - "vbroadcastss %1, %%ymm1\n\t" // alpha -> ymm1 - "movq %2, %%rsi\n\t" // adress of a -> rsi - "movq %3, %%rcx\n\t" // value of lda > rcx - "movq %4, %%rdi\n\t" // adress of x -> rdi - "movq %5, %%rdx\n\t" // adress of y -> rdx - "movq %6, %%r8\n\t" // address for prefetch - "prefetcht0 (%%r8)\n\t" // Prefetch - "prefetcht0 64(%%r8)\n\t" // Prefetch - - "vxorps %%ymm8 , %%ymm8 , %%ymm8 \n\t" // set to zero - "vxorps %%ymm9 , %%ymm9 , %%ymm9 \n\t" // set to zero - "vxorps %%ymm10, %%ymm10, %%ymm10\n\t" // set to zero - "vxorps %%ymm11, %%ymm11, %%ymm11\n\t" // set to zero - "vxorps %%ymm12, %%ymm12, %%ymm12\n\t" // set to zero - "vxorps %%ymm13, %%ymm13, %%ymm13\n\t" // set to zero - "vxorps %%ymm14, %%ymm14, %%ymm14\n\t" // set to zero - "vxorps %%ymm15, %%ymm15, %%ymm15\n\t" // set to zero - ".align 16 \n\t" - ".L01LOOP%=: \n\t" - "vbroadcastss (%%rdi), %%ymm0 \n\t" // load values of c - "nop 
\n\t" - "leaq (%%r8 , %%rcx, 4), %%r8 \n\t" // add lda to pointer for prefetch - - "prefetcht0 (%%r8)\n\t" // Prefetch - "vfmaddps %%ymm8 , 0*4(%%rsi), %%ymm0, %%ymm8 \n\t" // multiply a and c and add to temp - "prefetcht0 64(%%r8)\n\t" // Prefetch - "vfmaddps %%ymm9 , 8*4(%%rsi), %%ymm0, %%ymm9 \n\t" // multiply a and c and add to temp - "prefetcht0 128(%%r8)\n\t" // Prefetch - "vfmaddps %%ymm10, 16*4(%%rsi), %%ymm0, %%ymm10\n\t" // multiply a and c and add to temp - "vfmaddps %%ymm11, 24*4(%%rsi), %%ymm0, %%ymm11\n\t" // multiply a and c and add to temp - "prefetcht0 192(%%r8)\n\t" // Prefetch - "vfmaddps %%ymm12, 32*4(%%rsi), %%ymm0, %%ymm12\n\t" // multiply a and c and add to temp - "vfmaddps %%ymm13, 40*4(%%rsi), %%ymm0, %%ymm13\n\t" // multiply a and c and add to temp - "vfmaddps %%ymm14, 48*4(%%rsi), %%ymm0, %%ymm14\n\t" // multiply a and c and add to temp - "vfmaddps %%ymm15, 56*4(%%rsi), %%ymm0, %%ymm15\n\t" // multiply a and c and add to temp - - "addq $4 , %%rdi \n\t" // increment pointer of c - "leaq (%%rsi, %%rcx, 4), %%rsi \n\t" // add lda to pointer of a - - "dec %%rax \n\t" // n = n -1 - "jnz .L01LOOP%= \n\t" - - "vmulps %%ymm8 , %%ymm1, %%ymm8 \n\t" // scale by alpha - "vmulps %%ymm9 , %%ymm1, %%ymm9 \n\t" // scale by alpha - "vmulps %%ymm10, %%ymm1, %%ymm10\n\t" // scale by alpha - "vmulps %%ymm11, %%ymm1, %%ymm11\n\t" // scale by alpha - "vmulps %%ymm12, %%ymm1, %%ymm12\n\t" // scale by alpha - "vmulps %%ymm13, %%ymm1, %%ymm13\n\t" // scale by alpha - "vmulps %%ymm14, %%ymm1, %%ymm14\n\t" // scale by alpha - "vmulps %%ymm15, %%ymm1, %%ymm15\n\t" // scale by alpha - - "vmovups %%ymm8 , (%%rdx) \n\t" // store temp -> y - "vmovups %%ymm9 , 8*4(%%rdx) \n\t" // store temp -> y - "vmovups %%ymm10, 16*4(%%rdx) \n\t" // store temp -> y - "vmovups %%ymm11, 24*4(%%rdx) \n\t" // store temp -> y - "vmovups %%ymm12, 32*4(%%rdx) \n\t" // store temp -> y - "vmovups %%ymm13, 40*4(%%rdx) \n\t" // store temp -> y - "vmovups %%ymm14, 48*4(%%rdx) \n\t" // store temp 
-> y - "vmovups %%ymm15, 56*4(%%rdx) \n\t" // store temp -> y - - : - : - "m" (n), // 0 - "m" (alpha), // 1 - "m" (a), // 2 - "m" (lda), // 3 - "m" (x), // 4 - "m" (y), // 5 - "m" (pre) // 6 - : "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8", - "%xmm0", "%xmm1", - "%xmm8", "%xmm9", "%xmm10", "%xmm11", - "%xmm12", "%xmm13", "%xmm14", "%xmm15", - "memory" - ); - -} - - - -static void sgemv_kernel_32( long n, float alpha, float *a, long lda, float *x, float *y) -{ - - - float *pre = a + lda*3; - - __asm__ __volatile__ - ( - "movq %0, %%rax\n\t" // n -> rax - "vbroadcastss %1, %%xmm1\n\t" // alpha -> xmm1 - "movq %2, %%rsi\n\t" // adress of a -> rsi - "movq %3, %%rcx\n\t" // value of lda > rcx - "movq %4, %%rdi\n\t" // adress of x -> rdi - "movq %5, %%rdx\n\t" // adress of y -> rdx - "movq %6, %%r8\n\t" // address for prefetch - "prefetcht0 (%%r8)\n\t" // Prefetch - "prefetcht0 64(%%r8)\n\t" // Prefetch - - "vxorps %%xmm8 , %%xmm8 , %%xmm8 \n\t" // set to zero - "vxorps %%xmm9 , %%xmm9 , %%xmm9 \n\t" // set to zero - "vxorps %%xmm10, %%xmm10, %%xmm10\n\t" // set to zero - "vxorps %%xmm11, %%xmm11, %%xmm11\n\t" // set to zero - "vxorps %%xmm12, %%xmm12, %%xmm12\n\t" // set to zero - "vxorps %%xmm13, %%xmm13, %%xmm13\n\t" // set to zero - "vxorps %%xmm14, %%xmm14, %%xmm14\n\t" // set to zero - "vxorps %%xmm15, %%xmm15, %%xmm15\n\t" // set to zero - ".align 16 \n\t" - ".L01LOOP%=: \n\t" - "vbroadcastss (%%rdi), %%xmm0 \n\t" // load values of c - "nop \n\t" - "leaq (%%r8 , %%rcx, 4), %%r8 \n\t" // add lda to pointer for prefetch - - "prefetcht0 (%%r8)\n\t" // Prefetch - "vfmaddps %%xmm8 , 0*4(%%rsi), %%xmm0, %%xmm8 \n\t" // multiply a and c and add to temp - "prefetcht0 64(%%r8)\n\t" // Prefetch - "vfmaddps %%xmm9 , 4*4(%%rsi), %%xmm0, %%xmm9 \n\t" // multiply a and c and add to temp - "vfmaddps %%xmm10, 8*4(%%rsi), %%xmm0, %%xmm10\n\t" // multiply a and c and add to temp - "vfmaddps %%xmm11, 12*4(%%rsi), %%xmm0, %%xmm11\n\t" // multiply a and c and add to temp - "vfmaddps 
%%xmm12, 16*4(%%rsi), %%xmm0, %%xmm12\n\t" // multiply a and c and add to temp - "vfmaddps %%xmm13, 20*4(%%rsi), %%xmm0, %%xmm13\n\t" // multiply a and c and add to temp - "vfmaddps %%xmm14, 24*4(%%rsi), %%xmm0, %%xmm14\n\t" // multiply a and c and add to temp - "vfmaddps %%xmm15, 28*4(%%rsi), %%xmm0, %%xmm15\n\t" // multiply a and c and add to temp - - "addq $4 , %%rdi \n\t" // increment pointer of c - "leaq (%%rsi, %%rcx, 4), %%rsi \n\t" // add lda to pointer of a - - "dec %%rax \n\t" // n = n -1 - "jnz .L01LOOP%= \n\t" - - "vmulps %%xmm8 , %%xmm1, %%xmm8 \n\t" // scale by alpha - "vmulps %%xmm9 , %%xmm1, %%xmm9 \n\t" // scale by alpha - "vmulps %%xmm10, %%xmm1, %%xmm10\n\t" // scale by alpha - "vmulps %%xmm11, %%xmm1, %%xmm11\n\t" // scale by alpha - "vmulps %%xmm12, %%xmm1, %%xmm12\n\t" // scale by alpha - "vmulps %%xmm13, %%xmm1, %%xmm13\n\t" // scale by alpha - "vmulps %%xmm14, %%xmm1, %%xmm14\n\t" // scale by alpha - "vmulps %%xmm15, %%xmm1, %%xmm15\n\t" // scale by alpha - - "vmovups %%xmm8 , (%%rdx) \n\t" // store temp -> y - "vmovups %%xmm9 , 4*4(%%rdx) \n\t" // store temp -> y - "vmovups %%xmm10, 8*4(%%rdx) \n\t" // store temp -> y - "vmovups %%xmm11, 12*4(%%rdx) \n\t" // store temp -> y - "vmovups %%xmm12, 16*4(%%rdx) \n\t" // store temp -> y - "vmovups %%xmm13, 20*4(%%rdx) \n\t" // store temp -> y - "vmovups %%xmm14, 24*4(%%rdx) \n\t" // store temp -> y - "vmovups %%xmm15, 28*4(%%rdx) \n\t" // store temp -> y - - : - : - "m" (n), // 0 - "m" (alpha), // 1 - "m" (a), // 2 - "m" (lda), // 3 - "m" (x), // 4 - "m" (y), // 5 - "m" (pre) // 6 - ); - -} - -static void sgemv_kernel_16( long n, float alpha, float *a, long lda, float *x, float *y) -{ - - float *pre = a + lda*3; - - __asm__ __volatile__ - ( - "movq %0, %%rax\n\t" // n -> rax - "vbroadcastss %1, %%ymm1\n\t" // alpha -> ymm1 - "movq %2, %%rsi\n\t" // adress of a -> rsi - "movq %3, %%rcx\n\t" // value of lda > rcx - "movq %4, %%rdi\n\t" // adress of x -> rdi - "movq %5, %%rdx\n\t" // adress of y -> 
rdx - "movq %6, %%r8\n\t" // address for prefetch - "prefetcht0 (%%r8)\n\t" // Prefetch - - "vxorps %%ymm12, %%ymm12, %%ymm12\n\t" // set to zero - "vxorps %%ymm13, %%ymm13, %%ymm13\n\t" // set to zero - - ".L01LOOP%=: \n\t" - "vbroadcastss (%%rdi), %%ymm0 \n\t" // load values of c - "addq $4 , %%rdi \n\t" // increment pointer of c - - "leaq (%%r8 , %%rcx, 4), %%r8 \n\t" // add lda to pointer for prefetch - "prefetcht0 (%%r8)\n\t" // Prefetch - - "vfmaddps %%ymm12, 0*4(%%rsi), %%ymm0, %%ymm12\n\t" // multiply a and c and add to temp - "vfmaddps %%ymm13, 8*4(%%rsi), %%ymm0, %%ymm13\n\t" // multiply a and c and add to temp - - "leaq (%%rsi, %%rcx, 4), %%rsi \n\t" // add lda to pointer of a - - "dec %%rax \n\t" // n = n -1 - "jnz .L01LOOP%= \n\t" - - "vmulps %%ymm12, %%ymm1, %%ymm12\n\t" // scale by alpha - "vmulps %%ymm13, %%ymm1, %%ymm13\n\t" // scale by alpha - - "vmovups %%ymm12, (%%rdx) \n\t" // store temp -> y - "vmovups %%ymm13, 8*4(%%rdx) \n\t" // store temp -> y - - : - : - "m" (n), // 0 - "m" (alpha), // 1 - "m" (a), // 2 - "m" (lda), // 3 - "m" (x), // 4 - "m" (y), // 5 - "m" (pre) // 6 - : "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8", - "%xmm0", "%xmm1", - "%xmm8", "%xmm9", "%xmm10", "%xmm11", - "%xmm12", "%xmm13", "%xmm14", "%xmm15", - "memory" - ); - -} - - -static void sgemv_kernel_8( long n, float alpha, float *a, long lda, float *x, float *y) -{ - - - __asm__ __volatile__ - ( - "movq %0, %%rax\n\t" // n -> rax - "vbroadcastss %1, %%ymm1\n\t" // alpha -> ymm1 - "movq %2, %%rsi\n\t" // adress of a -> rsi - "movq %3, %%rcx\n\t" // value of lda > rcx - "movq %4, %%rdi\n\t" // adress of x -> rdi - "movq %5, %%rdx\n\t" // adress of y -> rdx - - "vxorps %%ymm12, %%ymm12, %%ymm12\n\t" // set to zero - - ".L01LOOP%=: \n\t" - "vbroadcastss (%%rdi), %%ymm0 \n\t" // load values of c - "addq $4 , %%rdi \n\t" // increment pointer of c - - "vfmaddps %%ymm12, 0*4(%%rsi), %%ymm0, %%ymm12\n\t" // multiply a and c and add to temp - - "leaq (%%rsi, %%rcx, 4), %%rsi 
\n\t" // add lda to pointer of a - - "dec %%rax \n\t" // n = n -1 - "jnz .L01LOOP%= \n\t" - - "vmulps %%ymm12, %%ymm1, %%ymm12\n\t" // scale by alpha - - "vmovups %%ymm12, (%%rdx) \n\t" // store temp -> y - - : - : - "m" (n), // 0 - "m" (alpha), // 1 - "m" (a), // 2 - "m" (lda), // 3 - "m" (x), // 4 - "m" (y) // 5 - : "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8", - "%xmm0", "%xmm1", - "%xmm8", "%xmm9", "%xmm10", "%xmm11", - "%xmm12", "%xmm13", "%xmm14", "%xmm15", - "memory" - ); - -} - - -static void sgemv_kernel_4( long n, float alpha, float *a, long lda, float *x, float *y) -{ - - - __asm__ __volatile__ - ( - "movq %0, %%rax\n\t" // n -> rax - "vbroadcastss %1, %%xmm1\n\t" // alpha -> xmm1 - "movq %2, %%rsi\n\t" // adress of a -> rsi - "movq %3, %%rcx\n\t" // value of lda > rcx - "movq %4, %%rdi\n\t" // adress of x -> rdi - "movq %5, %%rdx\n\t" // adress of y -> rdx - - "vxorps %%xmm12, %%xmm12, %%xmm12\n\t" // set to zero - - ".L01LOOP%=: \n\t" - "vbroadcastss (%%rdi), %%xmm0 \n\t" // load values of c - "addq $4 , %%rdi \n\t" // increment pointer of c - - "vfmaddps %%xmm12, 0*4(%%rsi), %%xmm0, %%xmm12\n\t" // multiply a and c and add to temp - - "leaq (%%rsi, %%rcx, 4), %%rsi \n\t" // add lda to pointer of a - - "dec %%rax \n\t" // n = n -1 - "jnz .L01LOOP%= \n\t" - - "vmulps %%xmm12, %%xmm1, %%xmm12\n\t" // scale by alpha - - "vmovups %%xmm12, (%%rdx) \n\t" // store temp -> y - - : - : - "m" (n), // 0 - "m" (alpha), // 1 - "m" (a), // 2 - "m" (lda), // 3 - "m" (x), // 4 - "m" (y) // 5 - : "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8", - "%xmm0", "%xmm1", - "%xmm8", "%xmm9", "%xmm10", "%xmm11", - "%xmm12", "%xmm13", "%xmm14", "%xmm15", - "memory" - ); - -} - -static void sgemv_kernel_2( long n, float alpha, float *a, long lda, float *x, float *y) -{ - - - __asm__ __volatile__ - ( - "movq %0, %%rax\n\t" // n -> rax - "vmovss %1, %%xmm1\n\t" // alpha -> xmm1 - "movq %2, %%rsi\n\t" // adress of a -> rsi - "movq %3, %%rcx\n\t" // value of lda > rcx - "movq %4, 
%%rdi\n\t" // adress of x -> rdi - "movq %5, %%rdx\n\t" // adress of y -> rdx - - "vxorps %%xmm12, %%xmm12, %%xmm12\n\t" // set to zero - "vxorps %%xmm13, %%xmm13, %%xmm13\n\t" // set to zero - - ".L01LOOP%=: \n\t" - "vmovss (%%rdi), %%xmm0 \n\t" // load values of c - "addq $4 , %%rdi \n\t" // increment pointer of c - - "vfmaddss %%xmm12, 0*4(%%rsi), %%xmm0, %%xmm12\n\t" // multiply a and c and add to temp - "vfmaddss %%xmm13, 1*4(%%rsi), %%xmm0, %%xmm13\n\t" // multiply a and c and add to temp - - "leaq (%%rsi, %%rcx, 4), %%rsi \n\t" // add lda to pointer of a - - "dec %%rax \n\t" // n = n -1 - "jnz .L01LOOP%= \n\t" - - "vmulss %%xmm12, %%xmm1, %%xmm12\n\t" // scale by alpha - "vmulss %%xmm13, %%xmm1, %%xmm13\n\t" // scale by alpha - - "vmovss %%xmm12, (%%rdx) \n\t" // store temp -> y - "vmovss %%xmm13, 4(%%rdx) \n\t" // store temp -> y - - : - : - "m" (n), // 0 - "m" (alpha), // 1 - "m" (a), // 2 - "m" (lda), // 3 - "m" (x), // 4 - "m" (y) // 5 - : "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8", - "%xmm0", "%xmm1", - "%xmm8", "%xmm9", "%xmm10", "%xmm11", - "%xmm12", "%xmm13", "%xmm14", "%xmm15", - "memory" - ); - -} - - - -static void sgemv_kernel_1( long n, float alpha, float *a, long lda, float *x, float *y) -{ - - - __asm__ __volatile__ - ( - "movq %0, %%rax\n\t" // n -> rax - "vmovss %1, %%xmm1\n\t" // alpha -> xmm1 - "movq %2, %%rsi\n\t" // adress of a -> rsi - "movq %3, %%rcx\n\t" // value of lda > rcx - "movq %4, %%rdi\n\t" // adress of x -> rdi - "movq %5, %%rdx\n\t" // adress of y -> rdx - - "vxorps %%xmm12, %%xmm12, %%xmm12\n\t" // set to zero - - ".L01LOOP%=: \n\t" - "vmovss (%%rdi), %%xmm0 \n\t" // load values of c - "addq $4 , %%rdi \n\t" // increment pointer of c - - "vfmaddss %%xmm12, 0*4(%%rsi), %%xmm0, %%xmm12\n\t" // multiply a and c and add to temp - - "leaq (%%rsi, %%rcx, 4), %%rsi \n\t" // add lda to pointer of a - - "dec %%rax \n\t" // n = n -1 - "jnz .L01LOOP%= \n\t" - - "vmulss %%xmm12, %%xmm1, %%xmm12\n\t" // scale by alpha - - "vmovss 
%%xmm12, (%%rdx) \n\t" // store temp -> y - - : - : - "m" (n), // 0 - "m" (alpha), // 1 - "m" (a), // 2 - "m" (lda), // 3 - "m" (x), // 4 - "m" (y) // 5 - : "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8", - "%xmm0", "%xmm1", - "%xmm8", "%xmm9", "%xmm10", "%xmm11", - "%xmm12", "%xmm13", "%xmm14", "%xmm15", - "memory" - ); - -} - - diff --git a/kernel/x86_64/sgemv_n_microk_haswell.c b/kernel/x86_64/sgemv_n_microk_haswell.c deleted file mode 100644 index 9db3869d2..000000000 --- a/kernel/x86_64/sgemv_n_microk_haswell.c +++ /dev/null @@ -1,461 +0,0 @@ -/*************************************************************************** -Copyright (c) 2014, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*****************************************************************************/ - -static void sgemv_kernel_64( long n, float alpha, float *a, long lda, float *x, float *y) -{ - - - float *pre = a + lda*2; - - __asm__ __volatile__ - ( - "movq %0, %%rax\n\t" // n -> rax - "vbroadcastss %1, %%ymm1\n\t" // alpha -> ymm1 - "movq %2, %%rsi\n\t" // adress of a -> rsi - "movq %3, %%rcx\n\t" // value of lda > rcx - "movq %4, %%rdi\n\t" // adress of x -> rdi - "movq %5, %%rdx\n\t" // adress of y -> rdx - "movq %6, %%r8\n\t" // address for prefetch - "prefetcht0 (%%r8)\n\t" // Prefetch - "prefetcht0 64(%%r8)\n\t" // Prefetch - - "vxorps %%ymm8 , %%ymm8 , %%ymm8 \n\t" // set to zero - "vxorps %%ymm9 , %%ymm9 , %%ymm9 \n\t" // set to zero - "vxorps %%ymm10, %%ymm10, %%ymm10\n\t" // set to zero - "vxorps %%ymm11, %%ymm11, %%ymm11\n\t" // set to zero - "vxorps %%ymm12, %%ymm12, %%ymm12\n\t" // set to zero - "vxorps %%ymm13, %%ymm13, %%ymm13\n\t" // set to zero - "vxorps %%ymm14, %%ymm14, %%ymm14\n\t" // set to zero - "vxorps %%ymm15, %%ymm15, %%ymm15\n\t" // set to zero - ".align 16 \n\t" - ".L01LOOP%=: \n\t" - "vbroadcastss (%%rdi), %%ymm0 \n\t" // load values of c - "leaq (%%r8 , %%rcx, 4), %%r8 \n\t" // add lda to pointer for prefetch - - "prefetcht0 (%%r8)\n\t" // Prefetch - "vfmadd231ps 0*4(%%rsi), %%ymm0, %%ymm8 \n\t" // multiply a and c and add to temp - "vfmadd231ps 8*4(%%rsi), %%ymm0, %%ymm9 \n\t" // multiply a and c and add to temp - "prefetcht0 
64(%%r8)\n\t" // Prefetch - "vfmadd231ps 16*4(%%rsi), %%ymm0, %%ymm10\n\t" // multiply a and c and add to temp - "vfmadd231ps 24*4(%%rsi), %%ymm0, %%ymm11\n\t" // multiply a and c and add to temp - "prefetcht0 128(%%r8)\n\t" // Prefetch - "vfmadd231ps 32*4(%%rsi), %%ymm0, %%ymm12\n\t" // multiply a and c and add to temp - "vfmadd231ps 40*4(%%rsi), %%ymm0, %%ymm13\n\t" // multiply a and c and add to temp - "prefetcht0 192(%%r8)\n\t" // Prefetch - "vfmadd231ps 48*4(%%rsi), %%ymm0, %%ymm14\n\t" // multiply a and c and add to temp - "vfmadd231ps 56*4(%%rsi), %%ymm0, %%ymm15\n\t" // multiply a and c and add to temp - - "addq $4 , %%rdi \n\t" // increment pointer of c - "leaq (%%rsi, %%rcx, 4), %%rsi \n\t" // add lda to pointer of a - - "dec %%rax \n\t" // n = n -1 - "jnz .L01LOOP%= \n\t" - - "vmulps %%ymm8 , %%ymm1, %%ymm8 \n\t" // scale by alpha - "vmulps %%ymm9 , %%ymm1, %%ymm9 \n\t" // scale by alpha - "vmulps %%ymm10, %%ymm1, %%ymm10\n\t" // scale by alpha - "vmulps %%ymm11, %%ymm1, %%ymm11\n\t" // scale by alpha - "vmulps %%ymm12, %%ymm1, %%ymm12\n\t" // scale by alpha - "vmulps %%ymm13, %%ymm1, %%ymm13\n\t" // scale by alpha - "vmulps %%ymm14, %%ymm1, %%ymm14\n\t" // scale by alpha - "vmulps %%ymm15, %%ymm1, %%ymm15\n\t" // scale by alpha - - "vmovups %%ymm8 , (%%rdx) \n\t" // store temp -> y - "vmovups %%ymm9 , 8*4(%%rdx) \n\t" // store temp -> y - "vmovups %%ymm10, 16*4(%%rdx) \n\t" // store temp -> y - "vmovups %%ymm11, 24*4(%%rdx) \n\t" // store temp -> y - "vmovups %%ymm12, 32*4(%%rdx) \n\t" // store temp -> y - "vmovups %%ymm13, 40*4(%%rdx) \n\t" // store temp -> y - "vmovups %%ymm14, 48*4(%%rdx) \n\t" // store temp -> y - "vmovups %%ymm15, 56*4(%%rdx) \n\t" // store temp -> y - - : - : - "m" (n), // 0 - "m" (alpha), // 1 - "m" (a), // 2 - "m" (lda), // 3 - "m" (x), // 4 - "m" (y), // 5 - "m" (pre) // 6 - : "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8", "cc", - "%xmm0", "%xmm1", - "%xmm4", "%xmm5", "%xmm6", "%xmm7", - "%xmm8", "%xmm9", "%xmm10", "%xmm11", - 
"%xmm12", "%xmm13", "%xmm14", "%xmm15", - "memory" - ); - -} - - - -static void sgemv_kernel_32( long n, float alpha, float *a, long lda, float *x, float *y) -{ - - - float *pre = a + lda*3; - - __asm__ __volatile__ - ( - "movq %0, %%rax\n\t" // n -> rax - "vbroadcastss %1, %%ymm1\n\t" // alpha -> ymm1 - "movq %2, %%rsi\n\t" // adress of a -> rsi - "movq %3, %%rcx\n\t" // value of lda > rcx - "movq %4, %%rdi\n\t" // adress of x -> rdi - "movq %5, %%rdx\n\t" // adress of y -> rdx - "movq %6, %%r8\n\t" // address for prefetch - "prefetcht0 (%%r8)\n\t" // Prefetch - "prefetcht0 64(%%r8)\n\t" // Prefetch - - "vxorps %%ymm8 , %%ymm8 , %%ymm8 \n\t" // set to zero - "vxorps %%ymm9 , %%ymm9 , %%ymm9 \n\t" // set to zero - "vxorps %%ymm10, %%ymm10, %%ymm10\n\t" // set to zero - "vxorps %%ymm11, %%ymm11, %%ymm11\n\t" // set to zero - ".align 16 \n\t" - ".L01LOOP%=: \n\t" - "vbroadcastss (%%rdi), %%ymm0 \n\t" // load values of c - "nop \n\t" - "leaq (%%r8 , %%rcx, 4), %%r8 \n\t" // add lda to pointer for prefetch - - "prefetcht0 (%%r8)\n\t" // Prefetch - "prefetcht0 64(%%r8)\n\t" // Prefetch - - "vmulps 0*4(%%rsi), %%ymm0, %%ymm4 \n\t" // multiply a and c and add to temp - "vmulps 8*4(%%rsi), %%ymm0, %%ymm5 \n\t" // multiply a and c and add to temp - "vmulps 16*4(%%rsi), %%ymm0, %%ymm6 \n\t" // multiply a and c and add to temp - "vmulps 24*4(%%rsi), %%ymm0, %%ymm7 \n\t" // multiply a and c and add to temp - - "vaddps %%ymm8 , %%ymm4, %%ymm8 \n\t" // multiply a and c and add to temp - "vaddps %%ymm9 , %%ymm5, %%ymm9 \n\t" // multiply a and c and add to temp - "vaddps %%ymm10, %%ymm6, %%ymm10\n\t" // multiply a and c and add to temp - "vaddps %%ymm11, %%ymm7, %%ymm11\n\t" // multiply a and c and add to temp - - - - "addq $4 , %%rdi \n\t" // increment pointer of c - "leaq (%%rsi, %%rcx, 4), %%rsi \n\t" // add lda to pointer of a - - "dec %%rax \n\t" // n = n -1 - "jnz .L01LOOP%= \n\t" - - "vmulps %%ymm8 , %%ymm1, %%ymm8 \n\t" // scale by alpha - "vmulps %%ymm9 , %%ymm1, %%ymm9 
\n\t" // scale by alpha - "vmulps %%ymm10, %%ymm1, %%ymm10\n\t" // scale by alpha - "vmulps %%ymm11, %%ymm1, %%ymm11\n\t" // scale by alpha - - "vmovups %%ymm8 , (%%rdx) \n\t" // store temp -> y - "vmovups %%ymm9 , 8*4(%%rdx) \n\t" // store temp -> y - "vmovups %%ymm10, 16*4(%%rdx) \n\t" // store temp -> y - "vmovups %%ymm11, 24*4(%%rdx) \n\t" // store temp -> y - - : - : - "m" (n), // 0 - "m" (alpha), // 1 - "m" (a), // 2 - "m" (lda), // 3 - "m" (x), // 4 - "m" (y), // 5 - "m" (pre) // 6 - : "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8", "cc", - "%xmm0", "%xmm1", - "%xmm4", "%xmm5", "%xmm6", "%xmm7", - "%xmm8", "%xmm9", "%xmm10", "%xmm11", - "memory" - ); - - - -} - -static void sgemv_kernel_16( long n, float alpha, float *a, long lda, float *x, float *y) -{ - - float *pre = a + lda*3; - - __asm__ __volatile__ - ( - "movq %0, %%rax\n\t" // n -> rax - "vbroadcastss %1, %%ymm1\n\t" // alpha -> ymm1 - "movq %2, %%rsi\n\t" // adress of a -> rsi - "movq %3, %%rcx\n\t" // value of lda > rcx - "movq %4, %%rdi\n\t" // adress of x -> rdi - "movq %5, %%rdx\n\t" // adress of y -> rdx - "movq %6, %%r8\n\t" // address for prefetch - "prefetcht0 (%%r8)\n\t" // Prefetch - "prefetcht0 64(%%r8)\n\t" // Prefetch - - "vxorps %%ymm8 , %%ymm8 , %%ymm8 \n\t" // set to zero - "vxorps %%ymm9 , %%ymm9 , %%ymm9 \n\t" // set to zero - ".align 16 \n\t" - ".L01LOOP%=: \n\t" - "vbroadcastss (%%rdi), %%ymm0 \n\t" // load values of c - "nop \n\t" - "leaq (%%r8 , %%rcx, 4), %%r8 \n\t" // add lda to pointer for prefetch - - "prefetcht0 (%%r8)\n\t" // Prefetch - - "vmulps 0*4(%%rsi), %%ymm0, %%ymm4 \n\t" // multiply a and c and add to temp - "vmulps 8*4(%%rsi), %%ymm0, %%ymm5 \n\t" // multiply a and c and add to temp - - "vaddps %%ymm8 , %%ymm4, %%ymm8 \n\t" // multiply a and c and add to temp - "vaddps %%ymm9 , %%ymm5, %%ymm9 \n\t" // multiply a and c and add to temp - - "addq $4 , %%rdi \n\t" // increment pointer of c - "leaq (%%rsi, %%rcx, 4), %%rsi \n\t" // add lda to pointer of a - - "dec 
%%rax \n\t" // n = n -1 - "jnz .L01LOOP%= \n\t" - - "vmulps %%ymm8 , %%ymm1, %%ymm8 \n\t" // scale by alpha - "vmulps %%ymm9 , %%ymm1, %%ymm9 \n\t" // scale by alpha - - "vmovups %%ymm8 , (%%rdx) \n\t" // store temp -> y - "vmovups %%ymm9 , 8*4(%%rdx) \n\t" // store temp -> y - - : - : - "m" (n), // 0 - "m" (alpha), // 1 - "m" (a), // 2 - "m" (lda), // 3 - "m" (x), // 4 - "m" (y), // 5 - "m" (pre) // 6 - : "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8", "cc", - "%xmm0", "%xmm1", - "%xmm4", "%xmm5", "%xmm6", "%xmm7", - "%xmm8", "%xmm9", "%xmm10", "%xmm11", - "memory" - ); - - -} - - -static void sgemv_kernel_8( long n, float alpha, float *a, long lda, float *x, float *y) -{ - - __asm__ __volatile__ - ( - "movq %0, %%rax\n\t" // n -> rax - "vbroadcastss %1, %%ymm1\n\t" // alpha -> ymm1 - "movq %2, %%rsi\n\t" // adress of a -> rsi - "movq %3, %%rcx\n\t" // value of lda > rcx - "movq %4, %%rdi\n\t" // adress of x -> rdi - "movq %5, %%rdx\n\t" // adress of y -> rdx - - "vxorps %%ymm8 , %%ymm8 , %%ymm8 \n\t" // set to zero - ".align 16 \n\t" - ".L01LOOP%=: \n\t" - "vbroadcastss (%%rdi), %%ymm0 \n\t" // load values of c - - "vmulps 0*4(%%rsi), %%ymm0, %%ymm4 \n\t" // multiply a and c and add to temp - "vaddps %%ymm8 , %%ymm4, %%ymm8 \n\t" // multiply a and c and add to temp - - "addq $4 , %%rdi \n\t" // increment pointer of c - "leaq (%%rsi, %%rcx, 4), %%rsi \n\t" // add lda to pointer of a - - "dec %%rax \n\t" // n = n -1 - "jnz .L01LOOP%= \n\t" - - "vmulps %%ymm8 , %%ymm1, %%ymm8 \n\t" // scale by alpha - "vmovups %%ymm8 , (%%rdx) \n\t" // store temp -> y - - : - : - "m" (n), // 0 - "m" (alpha), // 1 - "m" (a), // 2 - "m" (lda), // 3 - "m" (x), // 4 - "m" (y) // 5 - : "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8", "cc", - "%xmm0", "%xmm1", - "%xmm4", "%xmm5", "%xmm6", "%xmm7", - "%xmm8", "%xmm9", "%xmm10", "%xmm11", - "memory" - ); - - -} - - -static void sgemv_kernel_4( long n, float alpha, float *a, long lda, float *x, float *y) -{ - - - __asm__ __volatile__ - ( - 
"movq %0, %%rax\n\t" // n -> rax - "vbroadcastss %1, %%xmm1\n\t" // alpha -> xmm1 - "movq %2, %%rsi\n\t" // adress of a -> rsi - "movq %3, %%rcx\n\t" // value of lda > rcx - "movq %4, %%rdi\n\t" // adress of x -> rdi - "movq %5, %%rdx\n\t" // adress of y -> rdx - - "vxorps %%xmm12, %%xmm12, %%xmm12\n\t" // set to zero - - ".L01LOOP%=: \n\t" - "vbroadcastss (%%rdi), %%xmm0 \n\t" // load values of c - - "vmulps 0*4(%%rsi), %%xmm0, %%xmm4 \n\t" // multiply a and c and add to temp - "vaddps %%xmm12, %%xmm4, %%xmm12 \n\t" // multiply a and c and add to temp - - "addq $4 , %%rdi \n\t" // increment pointer of c - "leaq (%%rsi, %%rcx, 4), %%rsi \n\t" // add lda to pointer of a - - "dec %%rax \n\t" // n = n -1 - "jnz .L01LOOP%= \n\t" - - "vmulps %%xmm12, %%xmm1, %%xmm12\n\t" // scale by alpha - - "vmovups %%xmm12, (%%rdx) \n\t" // store temp -> y - - : - : - "m" (n), // 0 - "m" (alpha), // 1 - "m" (a), // 2 - "m" (lda), // 3 - "m" (x), // 4 - "m" (y) // 5 - : "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8", - "%xmm0", "%xmm1", - "%xmm8", "%xmm9", "%xmm10", "%xmm11", - "%xmm12", "%xmm13", "%xmm14", "%xmm15", - "memory" - ); - -} - -static void sgemv_kernel_2( long n, float alpha, float *a, long lda, float *x, float *y) -{ - - - __asm__ __volatile__ - ( - "movq %0, %%rax\n\t" // n -> rax - "vmovss %1, %%xmm1\n\t" // alpha -> xmm1 - "movq %2, %%rsi\n\t" // adress of a -> rsi - "movq %3, %%rcx\n\t" // value of lda > rcx - "movq %4, %%rdi\n\t" // adress of x -> rdi - "movq %5, %%rdx\n\t" // adress of y -> rdx - - "vxorps %%xmm12, %%xmm12, %%xmm12\n\t" // set to zero - "vxorps %%xmm13, %%xmm13, %%xmm13\n\t" // set to zero - - ".L01LOOP%=: \n\t" - "vmovss (%%rdi), %%xmm0 \n\t" // load values of c - - "vmulps 0*4(%%rsi), %%xmm0, %%xmm4 \n\t" // multiply a and c and add to temp - "vmulps 1*4(%%rsi), %%xmm0, %%xmm5 \n\t" // multiply a and c and add to temp - - "vaddps %%xmm12, %%xmm4, %%xmm12 \n\t" // multiply a and c and add to temp - "vaddps %%xmm13, %%xmm5, %%xmm13 \n\t" // multiply 
a and c and add to temp - - "addq $4 , %%rdi \n\t" // increment pointer of c - "leaq (%%rsi, %%rcx, 4), %%rsi \n\t" // add lda to pointer of a - - "dec %%rax \n\t" // n = n -1 - "jnz .L01LOOP%= \n\t" - - "vmulss %%xmm12, %%xmm1, %%xmm12\n\t" // scale by alpha - "vmulss %%xmm13, %%xmm1, %%xmm13\n\t" // scale by alpha - - "vmovss %%xmm12, (%%rdx) \n\t" // store temp -> y - "vmovss %%xmm13, 4(%%rdx) \n\t" // store temp -> y - - : - : - "m" (n), // 0 - "m" (alpha), // 1 - "m" (a), // 2 - "m" (lda), // 3 - "m" (x), // 4 - "m" (y) // 5 - : "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8", - "%xmm0", "%xmm1", - "%xmm8", "%xmm9", "%xmm10", "%xmm11", - "%xmm12", "%xmm13", "%xmm14", "%xmm15", - "memory" - ); - -} - - - -static void sgemv_kernel_1( long n, float alpha, float *a, long lda, float *x, float *y) -{ - - - __asm__ __volatile__ - ( - "movq %0, %%rax\n\t" // n -> rax - "vmovss %1, %%xmm1\n\t" // alpha -> xmm1 - "movq %2, %%rsi\n\t" // adress of a -> rsi - "movq %3, %%rcx\n\t" // value of lda > rcx - "movq %4, %%rdi\n\t" // adress of x -> rdi - "movq %5, %%rdx\n\t" // adress of y -> rdx - - "vxorps %%xmm12, %%xmm12, %%xmm12\n\t" // set to zero - - ".L01LOOP%=: \n\t" - "vmovss (%%rdi), %%xmm0 \n\t" // load values of c - "addq $4 , %%rdi \n\t" // increment pointer of c - - "vmulss 0*4(%%rsi), %%xmm0, %%xmm4 \n\t" // multiply a and c and add to temp - "vaddss %%xmm12, %%xmm4, %%xmm12 \n\t" // multiply a and c and add to temp - - "leaq (%%rsi, %%rcx, 4), %%rsi \n\t" // add lda to pointer of a - - "dec %%rax \n\t" // n = n -1 - "jnz .L01LOOP%= \n\t" - - "vmulss %%xmm12, %%xmm1, %%xmm12\n\t" // scale by alpha - - "vmovss %%xmm12, (%%rdx) \n\t" // store temp -> y - - : - : - "m" (n), // 0 - "m" (alpha), // 1 - "m" (a), // 2 - "m" (lda), // 3 - "m" (x), // 4 - "m" (y) // 5 - : "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8", - "%xmm0", "%xmm1", - "%xmm8", "%xmm9", "%xmm10", "%xmm11", - "%xmm12", "%xmm13", "%xmm14", "%xmm15", - "memory" - ); - -} - - diff --git 
a/kernel/x86_64/sgemv_n_microk_sandy.c b/kernel/x86_64/sgemv_n_microk_sandy.c deleted file mode 100644 index 9bdb06600..000000000 --- a/kernel/x86_64/sgemv_n_microk_sandy.c +++ /dev/null @@ -1,473 +0,0 @@ -/*************************************************************************** -Copyright (c) 2014, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-*****************************************************************************/ - -static void sgemv_kernel_64( long n, float alpha, float *a, long lda, float *x, float *y) -{ - - - float *pre = a + lda*2; - - __asm__ __volatile__ - ( - "movq %0, %%rax\n\t" // n -> rax - "vbroadcastss %1, %%ymm1\n\t" // alpha -> ymm1 - "movq %2, %%rsi\n\t" // adress of a -> rsi - "movq %3, %%rcx\n\t" // value of lda > rcx - "movq %4, %%rdi\n\t" // adress of x -> rdi - "movq %5, %%rdx\n\t" // adress of y -> rdx - "movq %6, %%r8\n\t" // address for prefetch - "prefetcht0 (%%r8)\n\t" // Prefetch - "prefetcht0 64(%%r8)\n\t" // Prefetch - - "vxorps %%ymm8 , %%ymm8 , %%ymm8 \n\t" // set to zero - "vxorps %%ymm9 , %%ymm9 , %%ymm9 \n\t" // set to zero - "vxorps %%ymm10, %%ymm10, %%ymm10\n\t" // set to zero - "vxorps %%ymm11, %%ymm11, %%ymm11\n\t" // set to zero - "vxorps %%ymm12, %%ymm12, %%ymm12\n\t" // set to zero - "vxorps %%ymm13, %%ymm13, %%ymm13\n\t" // set to zero - "vxorps %%ymm14, %%ymm14, %%ymm14\n\t" // set to zero - "vxorps %%ymm15, %%ymm15, %%ymm15\n\t" // set to zero - ".align 16 \n\t" - ".L01LOOP%=: \n\t" - "vbroadcastss (%%rdi), %%ymm0 \n\t" // load values of c - "nop \n\t" - "leaq (%%r8 , %%rcx, 4), %%r8 \n\t" // add lda to pointer for prefetch - - "prefetcht0 (%%r8)\n\t" // Prefetch - "vmulps 0*4(%%rsi), %%ymm0, %%ymm4 \n\t" // multiply a and c and add to temp - "vmulps 8*4(%%rsi), %%ymm0, %%ymm5 \n\t" // multiply a and c and add to temp - "prefetcht0 64(%%r8)\n\t" // Prefetch - "vmulps 16*4(%%rsi), %%ymm0, %%ymm6 \n\t" // multiply a and c and add to temp - "vmulps 24*4(%%rsi), %%ymm0, %%ymm7 \n\t" // multiply a and c and add to temp - - "vaddps %%ymm8 , %%ymm4, %%ymm8 \n\t" // multiply a and c and add to temp - "vaddps %%ymm9 , %%ymm5, %%ymm9 \n\t" // multiply a and c and add to temp - "prefetcht0 128(%%r8)\n\t" // Prefetch - "vaddps %%ymm10, %%ymm6, %%ymm10\n\t" // multiply a and c and add to temp - "vaddps %%ymm11, %%ymm7, %%ymm11\n\t" // multiply a and c and add to 
temp - - "prefetcht0 192(%%r8)\n\t" // Prefetch - "vmulps 32*4(%%rsi), %%ymm0, %%ymm4 \n\t" // multiply a and c and add to temp - "vmulps 40*4(%%rsi), %%ymm0, %%ymm5 \n\t" // multiply a and c and add to temp - "vmulps 48*4(%%rsi), %%ymm0, %%ymm6 \n\t" // multiply a and c and add to temp - "vmulps 56*4(%%rsi), %%ymm0, %%ymm7 \n\t" // multiply a and c and add to temp - - "vaddps %%ymm12, %%ymm4, %%ymm12\n\t" // multiply a and c and add to temp - "vaddps %%ymm13, %%ymm5, %%ymm13\n\t" // multiply a and c and add to temp - "vaddps %%ymm14, %%ymm6, %%ymm14\n\t" // multiply a and c and add to temp - "vaddps %%ymm15, %%ymm7, %%ymm15\n\t" // multiply a and c and add to temp - - "addq $4 , %%rdi \n\t" // increment pointer of c - "leaq (%%rsi, %%rcx, 4), %%rsi \n\t" // add lda to pointer of a - - "dec %%rax \n\t" // n = n -1 - "jnz .L01LOOP%= \n\t" - - "vmulps %%ymm8 , %%ymm1, %%ymm8 \n\t" // scale by alpha - "vmulps %%ymm9 , %%ymm1, %%ymm9 \n\t" // scale by alpha - "vmulps %%ymm10, %%ymm1, %%ymm10\n\t" // scale by alpha - "vmulps %%ymm11, %%ymm1, %%ymm11\n\t" // scale by alpha - "vmulps %%ymm12, %%ymm1, %%ymm12\n\t" // scale by alpha - "vmulps %%ymm13, %%ymm1, %%ymm13\n\t" // scale by alpha - "vmulps %%ymm14, %%ymm1, %%ymm14\n\t" // scale by alpha - "vmulps %%ymm15, %%ymm1, %%ymm15\n\t" // scale by alpha - - "vmovups %%ymm8 , (%%rdx) \n\t" // store temp -> y - "vmovups %%ymm9 , 8*4(%%rdx) \n\t" // store temp -> y - "vmovups %%ymm10, 16*4(%%rdx) \n\t" // store temp -> y - "vmovups %%ymm11, 24*4(%%rdx) \n\t" // store temp -> y - "vmovups %%ymm12, 32*4(%%rdx) \n\t" // store temp -> y - "vmovups %%ymm13, 40*4(%%rdx) \n\t" // store temp -> y - "vmovups %%ymm14, 48*4(%%rdx) \n\t" // store temp -> y - "vmovups %%ymm15, 56*4(%%rdx) \n\t" // store temp -> y - - : - : - "m" (n), // 0 - "m" (alpha), // 1 - "m" (a), // 2 - "m" (lda), // 3 - "m" (x), // 4 - "m" (y), // 5 - "m" (pre) // 6 - : "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8", "cc", - "%xmm0", "%xmm1", - "%xmm4", "%xmm5", 
"%xmm6", "%xmm7", - "%xmm8", "%xmm9", "%xmm10", "%xmm11", - "%xmm12", "%xmm13", "%xmm14", "%xmm15", - "memory" - ); - -} - - - -static void sgemv_kernel_32( long n, float alpha, float *a, long lda, float *x, float *y) -{ - - - float *pre = a + lda*3; - - __asm__ __volatile__ - ( - "movq %0, %%rax\n\t" // n -> rax - "vbroadcastss %1, %%ymm1\n\t" // alpha -> ymm1 - "movq %2, %%rsi\n\t" // adress of a -> rsi - "movq %3, %%rcx\n\t" // value of lda > rcx - "movq %4, %%rdi\n\t" // adress of x -> rdi - "movq %5, %%rdx\n\t" // adress of y -> rdx - "movq %6, %%r8\n\t" // address for prefetch - "prefetcht0 (%%r8)\n\t" // Prefetch - "prefetcht0 64(%%r8)\n\t" // Prefetch - - "vxorps %%ymm8 , %%ymm8 , %%ymm8 \n\t" // set to zero - "vxorps %%ymm9 , %%ymm9 , %%ymm9 \n\t" // set to zero - "vxorps %%ymm10, %%ymm10, %%ymm10\n\t" // set to zero - "vxorps %%ymm11, %%ymm11, %%ymm11\n\t" // set to zero - ".align 16 \n\t" - ".L01LOOP%=: \n\t" - "vbroadcastss (%%rdi), %%ymm0 \n\t" // load values of c - "nop \n\t" - "leaq (%%r8 , %%rcx, 4), %%r8 \n\t" // add lda to pointer for prefetch - - "prefetcht0 (%%r8)\n\t" // Prefetch - "prefetcht0 64(%%r8)\n\t" // Prefetch - - "vmulps 0*4(%%rsi), %%ymm0, %%ymm4 \n\t" // multiply a and c and add to temp - "vmulps 8*4(%%rsi), %%ymm0, %%ymm5 \n\t" // multiply a and c and add to temp - "vmulps 16*4(%%rsi), %%ymm0, %%ymm6 \n\t" // multiply a and c and add to temp - "vmulps 24*4(%%rsi), %%ymm0, %%ymm7 \n\t" // multiply a and c and add to temp - - "vaddps %%ymm8 , %%ymm4, %%ymm8 \n\t" // multiply a and c and add to temp - "vaddps %%ymm9 , %%ymm5, %%ymm9 \n\t" // multiply a and c and add to temp - "vaddps %%ymm10, %%ymm6, %%ymm10\n\t" // multiply a and c and add to temp - "vaddps %%ymm11, %%ymm7, %%ymm11\n\t" // multiply a and c and add to temp - - - - "addq $4 , %%rdi \n\t" // increment pointer of c - "leaq (%%rsi, %%rcx, 4), %%rsi \n\t" // add lda to pointer of a - - "dec %%rax \n\t" // n = n -1 - "jnz .L01LOOP%= \n\t" - - "vmulps %%ymm8 , %%ymm1, %%ymm8 
\n\t" // scale by alpha - "vmulps %%ymm9 , %%ymm1, %%ymm9 \n\t" // scale by alpha - "vmulps %%ymm10, %%ymm1, %%ymm10\n\t" // scale by alpha - "vmulps %%ymm11, %%ymm1, %%ymm11\n\t" // scale by alpha - - "vmovups %%ymm8 , (%%rdx) \n\t" // store temp -> y - "vmovups %%ymm9 , 8*4(%%rdx) \n\t" // store temp -> y - "vmovups %%ymm10, 16*4(%%rdx) \n\t" // store temp -> y - "vmovups %%ymm11, 24*4(%%rdx) \n\t" // store temp -> y - - : - : - "m" (n), // 0 - "m" (alpha), // 1 - "m" (a), // 2 - "m" (lda), // 3 - "m" (x), // 4 - "m" (y), // 5 - "m" (pre) // 6 - : "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8", "cc", - "%xmm0", "%xmm1", - "%xmm4", "%xmm5", "%xmm6", "%xmm7", - "%xmm8", "%xmm9", "%xmm10", "%xmm11", - "memory" - ); - - - -} - -static void sgemv_kernel_16( long n, float alpha, float *a, long lda, float *x, float *y) -{ - - float *pre = a + lda*3; - - __asm__ __volatile__ - ( - "movq %0, %%rax\n\t" // n -> rax - "vbroadcastss %1, %%ymm1\n\t" // alpha -> ymm1 - "movq %2, %%rsi\n\t" // adress of a -> rsi - "movq %3, %%rcx\n\t" // value of lda > rcx - "movq %4, %%rdi\n\t" // adress of x -> rdi - "movq %5, %%rdx\n\t" // adress of y -> rdx - "movq %6, %%r8\n\t" // address for prefetch - "prefetcht0 (%%r8)\n\t" // Prefetch - "prefetcht0 64(%%r8)\n\t" // Prefetch - - "vxorps %%ymm8 , %%ymm8 , %%ymm8 \n\t" // set to zero - "vxorps %%ymm9 , %%ymm9 , %%ymm9 \n\t" // set to zero - ".align 16 \n\t" - ".L01LOOP%=: \n\t" - "vbroadcastss (%%rdi), %%ymm0 \n\t" // load values of c - "nop \n\t" - "leaq (%%r8 , %%rcx, 4), %%r8 \n\t" // add lda to pointer for prefetch - - "prefetcht0 (%%r8)\n\t" // Prefetch - - "vmulps 0*4(%%rsi), %%ymm0, %%ymm4 \n\t" // multiply a and c and add to temp - "vmulps 8*4(%%rsi), %%ymm0, %%ymm5 \n\t" // multiply a and c and add to temp - - "vaddps %%ymm8 , %%ymm4, %%ymm8 \n\t" // multiply a and c and add to temp - "vaddps %%ymm9 , %%ymm5, %%ymm9 \n\t" // multiply a and c and add to temp - - "addq $4 , %%rdi \n\t" // increment pointer of c - "leaq (%%rsi, 
%%rcx, 4), %%rsi \n\t" // add lda to pointer of a - - "dec %%rax \n\t" // n = n -1 - "jnz .L01LOOP%= \n\t" - - "vmulps %%ymm8 , %%ymm1, %%ymm8 \n\t" // scale by alpha - "vmulps %%ymm9 , %%ymm1, %%ymm9 \n\t" // scale by alpha - - "vmovups %%ymm8 , (%%rdx) \n\t" // store temp -> y - "vmovups %%ymm9 , 8*4(%%rdx) \n\t" // store temp -> y - - : - : - "m" (n), // 0 - "m" (alpha), // 1 - "m" (a), // 2 - "m" (lda), // 3 - "m" (x), // 4 - "m" (y), // 5 - "m" (pre) // 6 - : "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8", "cc", - "%xmm0", "%xmm1", - "%xmm4", "%xmm5", "%xmm6", "%xmm7", - "%xmm8", "%xmm9", "%xmm10", "%xmm11", - "memory" - ); - - -} - - -static void sgemv_kernel_8( long n, float alpha, float *a, long lda, float *x, float *y) -{ - - __asm__ __volatile__ - ( - "movq %0, %%rax\n\t" // n -> rax - "vbroadcastss %1, %%ymm1\n\t" // alpha -> ymm1 - "movq %2, %%rsi\n\t" // adress of a -> rsi - "movq %3, %%rcx\n\t" // value of lda > rcx - "movq %4, %%rdi\n\t" // adress of x -> rdi - "movq %5, %%rdx\n\t" // adress of y -> rdx - - "vxorps %%ymm8 , %%ymm8 , %%ymm8 \n\t" // set to zero - ".align 16 \n\t" - ".L01LOOP%=: \n\t" - "vbroadcastss (%%rdi), %%ymm0 \n\t" // load values of c - - "vmulps 0*4(%%rsi), %%ymm0, %%ymm4 \n\t" // multiply a and c and add to temp - "vaddps %%ymm8 , %%ymm4, %%ymm8 \n\t" // multiply a and c and add to temp - - "addq $4 , %%rdi \n\t" // increment pointer of c - "leaq (%%rsi, %%rcx, 4), %%rsi \n\t" // add lda to pointer of a - - "dec %%rax \n\t" // n = n -1 - "jnz .L01LOOP%= \n\t" - - "vmulps %%ymm8 , %%ymm1, %%ymm8 \n\t" // scale by alpha - "vmovups %%ymm8 , (%%rdx) \n\t" // store temp -> y - - : - : - "m" (n), // 0 - "m" (alpha), // 1 - "m" (a), // 2 - "m" (lda), // 3 - "m" (x), // 4 - "m" (y) // 5 - : "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8", "cc", - "%xmm0", "%xmm1", - "%xmm4", "%xmm5", "%xmm6", "%xmm7", - "%xmm8", "%xmm9", "%xmm10", "%xmm11", - "memory" - ); - - -} - - -static void sgemv_kernel_4( long n, float alpha, float *a, long lda, 
float *x, float *y) -{ - - - __asm__ __volatile__ - ( - "movq %0, %%rax\n\t" // n -> rax - "vbroadcastss %1, %%xmm1\n\t" // alpha -> xmm1 - "movq %2, %%rsi\n\t" // adress of a -> rsi - "movq %3, %%rcx\n\t" // value of lda > rcx - "movq %4, %%rdi\n\t" // adress of x -> rdi - "movq %5, %%rdx\n\t" // adress of y -> rdx - - "vxorps %%xmm12, %%xmm12, %%xmm12\n\t" // set to zero - - ".L01LOOP%=: \n\t" - "vbroadcastss (%%rdi), %%xmm0 \n\t" // load values of c - - "vmulps 0*4(%%rsi), %%xmm0, %%xmm4 \n\t" // multiply a and c and add to temp - "vaddps %%xmm12, %%xmm4, %%xmm12 \n\t" // multiply a and c and add to temp - - "addq $4 , %%rdi \n\t" // increment pointer of c - "leaq (%%rsi, %%rcx, 4), %%rsi \n\t" // add lda to pointer of a - - "dec %%rax \n\t" // n = n -1 - "jnz .L01LOOP%= \n\t" - - "vmulps %%xmm12, %%xmm1, %%xmm12\n\t" // scale by alpha - - "vmovups %%xmm12, (%%rdx) \n\t" // store temp -> y - - : - : - "m" (n), // 0 - "m" (alpha), // 1 - "m" (a), // 2 - "m" (lda), // 3 - "m" (x), // 4 - "m" (y) // 5 - : "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8", - "%xmm0", "%xmm1", - "%xmm8", "%xmm9", "%xmm10", "%xmm11", - "%xmm12", "%xmm13", "%xmm14", "%xmm15", - "memory" - ); - -} - -static void sgemv_kernel_2( long n, float alpha, float *a, long lda, float *x, float *y) -{ - - - __asm__ __volatile__ - ( - "movq %0, %%rax\n\t" // n -> rax - "vmovss %1, %%xmm1\n\t" // alpha -> xmm1 - "movq %2, %%rsi\n\t" // adress of a -> rsi - "movq %3, %%rcx\n\t" // value of lda > rcx - "movq %4, %%rdi\n\t" // adress of x -> rdi - "movq %5, %%rdx\n\t" // adress of y -> rdx - - "vxorps %%xmm12, %%xmm12, %%xmm12\n\t" // set to zero - "vxorps %%xmm13, %%xmm13, %%xmm13\n\t" // set to zero - - ".L01LOOP%=: \n\t" - "vmovss (%%rdi), %%xmm0 \n\t" // load values of c - - "vmulps 0*4(%%rsi), %%xmm0, %%xmm4 \n\t" // multiply a and c and add to temp - "vmulps 1*4(%%rsi), %%xmm0, %%xmm5 \n\t" // multiply a and c and add to temp - - "vaddps %%xmm12, %%xmm4, %%xmm12 \n\t" // multiply a and c and add to 
temp - "vaddps %%xmm13, %%xmm5, %%xmm13 \n\t" // multiply a and c and add to temp - - "addq $4 , %%rdi \n\t" // increment pointer of c - "leaq (%%rsi, %%rcx, 4), %%rsi \n\t" // add lda to pointer of a - - "dec %%rax \n\t" // n = n -1 - "jnz .L01LOOP%= \n\t" - - "vmulss %%xmm12, %%xmm1, %%xmm12\n\t" // scale by alpha - "vmulss %%xmm13, %%xmm1, %%xmm13\n\t" // scale by alpha - - "vmovss %%xmm12, (%%rdx) \n\t" // store temp -> y - "vmovss %%xmm13, 4(%%rdx) \n\t" // store temp -> y - - : - : - "m" (n), // 0 - "m" (alpha), // 1 - "m" (a), // 2 - "m" (lda), // 3 - "m" (x), // 4 - "m" (y) // 5 - : "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8", - "%xmm0", "%xmm1", - "%xmm8", "%xmm9", "%xmm10", "%xmm11", - "%xmm12", "%xmm13", "%xmm14", "%xmm15", - "memory" - ); - -} - - - -static void sgemv_kernel_1( long n, float alpha, float *a, long lda, float *x, float *y) -{ - - - __asm__ __volatile__ - ( - "movq %0, %%rax\n\t" // n -> rax - "vmovss %1, %%xmm1\n\t" // alpha -> xmm1 - "movq %2, %%rsi\n\t" // adress of a -> rsi - "movq %3, %%rcx\n\t" // value of lda > rcx - "movq %4, %%rdi\n\t" // adress of x -> rdi - "movq %5, %%rdx\n\t" // adress of y -> rdx - - "vxorps %%xmm12, %%xmm12, %%xmm12\n\t" // set to zero - - ".L01LOOP%=: \n\t" - "vmovss (%%rdi), %%xmm0 \n\t" // load values of c - "addq $4 , %%rdi \n\t" // increment pointer of c - - "vmulss 0*4(%%rsi), %%xmm0, %%xmm4 \n\t" // multiply a and c and add to temp - "vaddss %%xmm12, %%xmm4, %%xmm12 \n\t" // multiply a and c and add to temp - - "leaq (%%rsi, %%rcx, 4), %%rsi \n\t" // add lda to pointer of a - - "dec %%rax \n\t" // n = n -1 - "jnz .L01LOOP%= \n\t" - - "vmulss %%xmm12, %%xmm1, %%xmm12\n\t" // scale by alpha - - "vmovss %%xmm12, (%%rdx) \n\t" // store temp -> y - - : - : - "m" (n), // 0 - "m" (alpha), // 1 - "m" (a), // 2 - "m" (lda), // 3 - "m" (x), // 4 - "m" (y) // 5 - : "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8", - "%xmm0", "%xmm1", - "%xmm8", "%xmm9", "%xmm10", "%xmm11", - "%xmm12", "%xmm13", "%xmm14", 
"%xmm15", - "memory" - ); - -} - - diff --git a/kernel/x86_64/sgemv_t_avx.c b/kernel/x86_64/sgemv_t_avx.c deleted file mode 100644 index 55fb3d623..000000000 --- a/kernel/x86_64/sgemv_t_avx.c +++ /dev/null @@ -1,232 +0,0 @@ -/*************************************************************************** -Copyright (c) 2014, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-*****************************************************************************/ - - -#include "common.h" - -#if defined(BULLDOZER) || defined(PILEDRIVER) -#include "sgemv_t_microk_bulldozer.c" -#elif defined(HASWELL) -#include "sgemv_t_microk_haswell.c" -#else -#include "sgemv_t_microk_sandy.c" -#endif - -static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src) -{ - BLASLONG i; - for ( i=0; i= 16 ) - { - if ( m2 & Mblock) - { - - if ( inc_x == 1 ) - xbuffer = x_ptr; - else - copy_x(Mblock,x_ptr,xbuffer,inc_x); - - y_ptr = y; - a_ptrl = a_ptr; - - for(i = 0; i rax - "vmovss %1, %%xmm1\n\t" // alpha -> xmm1 - "movq %2, %%rsi\n\t" // adress of a -> rsi - "movq %3, %%rcx\n\t" // value of lda > rcx - "movq %4, %%rdi\n\t" // adress of x -> rdi - "movq %5, %%rdx\n\t" // adress of y -> rdx - - "leaq (, %%rcx,4), %%rcx \n\t" // scale lda by size of float - "leaq (%%rsi,%%rcx,1), %%r8 \n\t" // pointer to next line - - "vxorps %%xmm12, %%xmm12, %%xmm12\n\t" // set to zero - "vxorps %%xmm13, %%xmm13, %%xmm13\n\t" // set to zero - "vxorps %%xmm14, %%xmm14, %%xmm14\n\t" // set to zero - "vxorps %%xmm15, %%xmm15, %%xmm15\n\t" // set to zero - - "sarq $4, %%rax \n\t" // n = n / 16 - - ".align 16 \n\t" - ".L01LOOP%=: \n\t" - // "prefetcht0 512(%%rsi) \n\t" - "prefetcht0 (%%r8) \n\t" //prefetch next line of a - "vmovups (%%rsi), %%xmm4 \n\t" - "vmovups 4*4(%%rsi), %%xmm5 \n\t" - "vmovups 8*4(%%rsi), %%xmm6 \n\t" - "vmovups 12*4(%%rsi), %%xmm7 \n\t" - - "vfmaddps %%xmm12, 0*4(%%rdi), %%xmm4, %%xmm12\n\t" // multiply a and c and add to temp - "vfmaddps %%xmm13, 4*4(%%rdi), %%xmm5, %%xmm13\n\t" // multiply a and c and add to temp - "vfmaddps %%xmm14, 8*4(%%rdi), %%xmm6, %%xmm14\n\t" // multiply a and c and add to temp - "vfmaddps %%xmm15, 12*4(%%rdi), %%xmm7, %%xmm15\n\t" // multiply a and c and add to temp - - "addq $16*4 , %%r8 \n\t" // increment prefetch pointer - "addq $16*4 , %%rsi \n\t" // increment pointer of a - "addq $16*4 , %%rdi \n\t" // increment pointer of 
c - "dec %%rax \n\t" // n = n -1 - "jnz .L01LOOP%= \n\t" - - "vaddps %%xmm12, %%xmm14, %%xmm12\n\t" - "vaddps %%xmm13, %%xmm15, %%xmm13\n\t" - "vaddps %%xmm12, %%xmm13, %%xmm12\n\t" - "vhaddps %%xmm12, %%xmm12, %%xmm12\n\t" - "vhaddps %%xmm12, %%xmm12, %%xmm12\n\t" - - "vfmaddss (%%rdx), %%xmm12, %%xmm1, %%xmm12\n\t" - "vmovss %%xmm12, (%%rdx) \n\t" // store temp -> y - - : - : - "m" (n), // 0 - "m" (alpha), // 1 - "m" (a), // 2 - "m" (lda), // 3 - "m" (x), // 4 - "m" (y) // 5 - : "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8", - "%xmm0", "%xmm1", - "%xmm4", "%xmm5", "%xmm6", "%xmm7", - "%xmm12", "%xmm13", "%xmm14", "%xmm15", - "memory" - ); - -} - - - diff --git a/kernel/x86_64/sgemv_t_microk_haswell.c b/kernel/x86_64/sgemv_t_microk_haswell.c deleted file mode 100644 index ecb9845bb..000000000 --- a/kernel/x86_64/sgemv_t_microk_haswell.c +++ /dev/null @@ -1,100 +0,0 @@ -/*************************************************************************** -Copyright (c) 2014, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*****************************************************************************/ - -static void sgemv_kernel_16( long n, float alpha, float *a, long lda, float *x, float *y) -{ - - //n = n / 16; - - __asm__ __volatile__ - ( - "movq %0, %%rax\n\t" // n -> rax - "vmovss %1, %%xmm1\n\t" // alpha -> xmm1 - "movq %2, %%rsi\n\t" // adress of a -> rsi - "movq %3, %%rcx\n\t" // value of lda > rcx - "movq %4, %%rdi\n\t" // adress of x -> rdi - "movq %5, %%rdx\n\t" // adress of y -> rdx - - "leaq (, %%rcx,4), %%rcx \n\t" // scale lda by size of float - "leaq (%%rsi,%%rcx,1), %%r8 \n\t" // pointer to next line - - "vxorps %%xmm12, %%xmm12, %%xmm12\n\t" // set to zero - "vxorps %%xmm13, %%xmm13, %%xmm13\n\t" // set to zero - "vxorps %%xmm14, %%xmm14, %%xmm14\n\t" // set to zero - "vxorps %%xmm15, %%xmm15, %%xmm15\n\t" // set to zero - - "sarq $4, %%rax \n\t" // n = n / 16 - - ".align 16 \n\t" - ".L01LOOP%=: \n\t" - // "prefetcht0 512(%%rsi) \n\t" - "prefetcht0 (%%r8) \n\t" //prefetch next line of a - "vmovups (%%rsi), %%xmm4 \n\t" - "vmovups 4*4(%%rsi), %%xmm5 \n\t" - "vmovups 8*4(%%rsi), %%xmm6 \n\t" - "vmovups 12*4(%%rsi), %%xmm7 \n\t" - - "vfmadd231ps 0*4(%%rdi), %%xmm4, %%xmm12\n\t" // multiply a and c and add to temp - "vfmadd231ps 4*4(%%rdi), %%xmm5, %%xmm13\n\t" // multiply a and c and add to temp - "vfmadd231ps 8*4(%%rdi), %%xmm6, %%xmm14\n\t" // multiply a and c and add to temp - "vfmadd231ps 12*4(%%rdi), %%xmm7, %%xmm15\n\t" // multiply a and c and add to 
temp - - "addq $16*4 , %%r8 \n\t" // increment prefetch pointer - "addq $16*4 , %%rsi \n\t" // increment pointer of a - "addq $16*4 , %%rdi \n\t" // increment pointer of c - "dec %%rax \n\t" // n = n -1 - "jnz .L01LOOP%= \n\t" - - "vaddps %%xmm12, %%xmm14, %%xmm12\n\t" - "vaddps %%xmm13, %%xmm15, %%xmm13\n\t" - "vaddps %%xmm12, %%xmm13, %%xmm12\n\t" - "vhaddps %%xmm12, %%xmm12, %%xmm12\n\t" - "vhaddps %%xmm12, %%xmm12, %%xmm12\n\t" - - "vmulss %%xmm12, %%xmm1, %%xmm12\n\t" - "vaddss (%%rdx), %%xmm12,%%xmm12\n\t" - "vmovss %%xmm12, (%%rdx) \n\t" // store temp -> y - - : - : - "m" (n), // 0 - "m" (alpha), // 1 - "m" (a), // 2 - "m" (lda), // 3 - "m" (x), // 4 - "m" (y) // 5 - : "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8", - "%xmm0", "%xmm1", - "%xmm4", "%xmm5", "%xmm6", "%xmm7", - "%xmm12", "%xmm13", "%xmm14", "%xmm15", - "memory" - ); - -} - - - diff --git a/kernel/x86_64/sgemv_t_microk_sandy.c b/kernel/x86_64/sgemv_t_microk_sandy.c deleted file mode 100644 index 4ecd6d3d0..000000000 --- a/kernel/x86_64/sgemv_t_microk_sandy.c +++ /dev/null @@ -1,106 +0,0 @@ -/*************************************************************************** -Copyright (c) 2014, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. 
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*****************************************************************************/ - -static void sgemv_kernel_16( long n, float alpha, float *a, long lda, float *x, float *y) -{ - - //n = n / 16; - - __asm__ __volatile__ - ( - "movq %0, %%rax\n\t" // n -> rax - "vmovss %1, %%xmm1\n\t" // alpha -> xmm1 - "movq %2, %%rsi\n\t" // adress of a -> rsi - "movq %3, %%rcx\n\t" // value of lda > rcx - "movq %4, %%rdi\n\t" // adress of x -> rdi - "movq %5, %%rdx\n\t" // adress of y -> rdx - - "leaq (, %%rcx,4), %%rcx \n\t" // scale lda by size of float - "leaq (%%rsi,%%rcx,1), %%r8 \n\t" // pointer to next line - - "vxorps %%xmm12, %%xmm12, %%xmm12\n\t" // set to zero - "vxorps %%xmm13, %%xmm13, %%xmm13\n\t" // set to zero - "vxorps %%xmm14, %%xmm14, %%xmm14\n\t" // set to zero - "vxorps %%xmm15, %%xmm15, %%xmm15\n\t" // set to zero - - "sarq $4, %%rax \n\t" // n = n / 16 - - ".align 16 \n\t" - ".L01LOOP%=: \n\t" - // "prefetcht0 512(%%rsi) \n\t" - "prefetcht0 (%%r8) \n\t" //prefetch next line of a - "vmovups (%%rsi), %%xmm4 \n\t" - "vmovups 4*4(%%rsi), %%xmm5 \n\t" - "vmovups 8*4(%%rsi), %%xmm6 \n\t" - "vmovups 12*4(%%rsi), %%xmm7 \n\t" - - "vmulps 0*4(%%rdi), %%xmm4, %%xmm8 \n\t" // multiply a and c and add to temp - "vmulps 
4*4(%%rdi), %%xmm5, %%xmm9 \n\t" // multiply a and c and add to temp - "vmulps 8*4(%%rdi), %%xmm6, %%xmm10\n\t" // multiply a and c and add to temp - "vmulps 12*4(%%rdi), %%xmm7, %%xmm11\n\t" // multiply a and c and add to temp - - "vaddps %%xmm12, %%xmm8 , %%xmm12\n\t" - "vaddps %%xmm13, %%xmm9 , %%xmm13\n\t" - "vaddps %%xmm14, %%xmm10, %%xmm14\n\t" - "vaddps %%xmm15, %%xmm11, %%xmm15\n\t" - - "addq $16*4 , %%r8 \n\t" // increment prefetch pointer - "addq $16*4 , %%rsi \n\t" // increment pointer of a - "addq $16*4 , %%rdi \n\t" // increment pointer of c - "dec %%rax \n\t" // n = n -1 - "jnz .L01LOOP%= \n\t" - - "vaddps %%xmm12, %%xmm14, %%xmm12\n\t" - "vaddps %%xmm13, %%xmm15, %%xmm13\n\t" - "vaddps %%xmm12, %%xmm13, %%xmm12\n\t" - "vhaddps %%xmm12, %%xmm12, %%xmm12\n\t" - "vhaddps %%xmm12, %%xmm12, %%xmm12\n\t" - - "vmulss %%xmm12, %%xmm1, %%xmm12 \n\t" - "vaddss (%%rdx), %%xmm12, %%xmm12\n\t" - "vmovss %%xmm12, (%%rdx) \n\t" // store temp -> y - - : - : - "m" (n), // 0 - "m" (alpha), // 1 - "m" (a), // 2 - "m" (lda), // 3 - "m" (x), // 4 - "m" (y) // 5 - : "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8", "cc", - "%xmm0", "%xmm1", - "%xmm4", "%xmm5", "%xmm6", "%xmm7", - "%xmm8", "%xmm9", "%xmm10", "%xmm11", - "%xmm12", "%xmm13", "%xmm14", "%xmm15", - "memory" - ); - -} - - - From a057e5434dbdc55141bf90556c7ef100eb2e21a9 Mon Sep 17 00:00:00 2001 From: Martin Koehler Date: Tue, 9 Sep 2014 09:52:13 +0200 Subject: [PATCH 074/119] add CBLAS interface for s/d/c/zimatcopy --- cblas.h | 8 ++++++++ cblas_noconst.h | 8 ++++++++ exports/gensymbol | 3 ++- interface/Makefile | 22 ++++++++++++++++++---- interface/imatcopy.c | 23 +++++++++++++++++++++++ interface/zimatcopy.c | 26 ++++++++++++++++++++++++++ 6 files changed, 85 insertions(+), 5 deletions(-) diff --git a/cblas.h b/cblas.h index 0cba45771..2d46049d2 100644 --- a/cblas.h +++ b/cblas.h @@ -327,6 +327,14 @@ void cblas_comatcopy(OPENBLAS_CONST enum CBLAS_ORDER CORDER, OPENBLAS_CONST enum void 
cblas_zomatcopy(OPENBLAS_CONST enum CBLAS_ORDER CORDER, OPENBLAS_CONST enum CBLAS_TRANSPOSE CTRANS, OPENBLAS_CONST blasint crows, OPENBLAS_CONST blasint ccols, OPENBLAS_CONST double* calpha, OPENBLAS_CONST double* a, OPENBLAS_CONST blasint clda, double *b, OPENBLAS_CONST blasint cldb); +void cblas_simatcopy(OPENBLAS_CONST enum CBLAS_ORDER CORDER, OPENBLAS_CONST enum CBLAS_TRANSPOSE CTRANS, OPENBLAS_CONST blasint crows, OPENBLAS_CONST blasint ccols, OPENBLAS_CONST float calpha, float *a, + OPENBLAS_CONST blasint clda, OPENBLAS_CONST blasint cldb); +void cblas_dimatcopy(OPENBLAS_CONST enum CBLAS_ORDER CORDER, OPENBLAS_CONST enum CBLAS_TRANSPOSE CTRANS, OPENBLAS_CONST blasint crows, OPENBLAS_CONST blasint ccols, OPENBLAS_CONST double calpha, double *a, + OPENBLAS_CONST blasint clda, OPENBLAS_CONST blasint cldb); +void cblas_cimatcopy(OPENBLAS_CONST enum CBLAS_ORDER CORDER, OPENBLAS_CONST enum CBLAS_TRANSPOSE CTRANS, OPENBLAS_CONST blasint crows, OPENBLAS_CONST blasint ccols, OPENBLAS_CONST float* calpha, float* a, + OPENBLAS_CONST blasint clda, OPENBLAS_CONST blasint cldb); +void cblas_zimatcopy(OPENBLAS_CONST enum CBLAS_ORDER CORDER, OPENBLAS_CONST enum CBLAS_TRANSPOSE CTRANS, OPENBLAS_CONST blasint crows, OPENBLAS_CONST blasint ccols, OPENBLAS_CONST double* calpha, double* a, + OPENBLAS_CONST blasint clda, OPENBLAS_CONST blasint cldb); #ifdef __cplusplus } diff --git a/cblas_noconst.h b/cblas_noconst.h index 1592abc7f..884d9f63f 100644 --- a/cblas_noconst.h +++ b/cblas_noconst.h @@ -315,6 +315,14 @@ void cblas_comatcopy( enum CBLAS_ORDER CORDER, enum CBLAS_TRANSPOSE CTRANS, bl void cblas_zomatcopy( enum CBLAS_ORDER CORDER, enum CBLAS_TRANSPOSE CTRANS, blasint crows, blasint ccols, void* calpha, void* a, blasint clda, void *b, blasint cldb); +void cblas_simatcopy( enum CBLAS_ORDER CORDER, enum CBLAS_TRANSPOSE CTRANS, blasint crows, blasint ccols, float calpha, float *a, + blasint clda, blasint cldb); +void cblas_dimatcopy( enum CBLAS_ORDER CORDER, enum 
CBLAS_TRANSPOSE CTRANS, blasint crows, blasint ccols, double calpha, double *a, + blasint clda, blasint cldb); +void cblas_cimatcopy( enum CBLAS_ORDER CORDER, enum CBLAS_TRANSPOSE CTRANS, blasint crows, blasint ccols, float* calpha, float* a, + blasint clda, blasint cldb); +void cblas_zimatcopy( enum CBLAS_ORDER CORDER, enum CBLAS_TRANSPOSE CTRANS, blasint crows, blasint ccols, double* calpha, double* a, + blasint clda, blasint cldb); #ifdef __cplusplus } #endif /* __cplusplus */ diff --git a/exports/gensymbol b/exports/gensymbol index 17d8a2b9c..e5049678a 100644 --- a/exports/gensymbol +++ b/exports/gensymbol @@ -53,7 +53,8 @@ cblas_ztbmv, cblas_ztbsv, cblas_ztpmv, cblas_ztpsv, cblas_ztrmm, cblas_ztrmv, cblas_ztrsm, cblas_ztrsv, cblas_cdotc_sub, cblas_cdotu_sub, cblas_zdotc_sub, cblas_zdotu_sub, cblas_saxpby,cblas_daxpby,cblas_caxpby,cblas_zaxpby, - cblas_somatcopy, cblas_domatcopy, cblas_comatcopy, cblas_zomatcopy + cblas_somatcopy, cblas_domatcopy, cblas_comatcopy, cblas_zomatcopy, + cblas_simatcopy, cblas_dimatcopy, cblas_cimatcopy, cblas_zimatcopy ); @exblasobjs = ( diff --git a/interface/Makefile b/interface/Makefile index e4822352f..cced14fb2 100644 --- a/interface/Makefile +++ b/interface/Makefile @@ -267,7 +267,7 @@ CSBLAS2OBJS = \ CSBLAS3OBJS = \ cblas_sgemm.$(SUFFIX) cblas_ssymm.$(SUFFIX) cblas_strmm.$(SUFFIX) cblas_strsm.$(SUFFIX) \ - cblas_ssyrk.$(SUFFIX) cblas_ssyr2k.$(SUFFIX) cblas_somatcopy.$(SUFFIX) + cblas_ssyrk.$(SUFFIX) cblas_ssyr2k.$(SUFFIX) cblas_somatcopy.$(SUFFIX) cblas_simatcopy.$(SUFFIX) CDBLAS1OBJS = \ cblas_idamax.$(SUFFIX) cblas_dasum.$(SUFFIX) cblas_daxpy.$(SUFFIX) \ @@ -283,7 +283,7 @@ CDBLAS2OBJS = \ CDBLAS3OBJS += \ cblas_dgemm.$(SUFFIX) cblas_dsymm.$(SUFFIX) cblas_dtrmm.$(SUFFIX) cblas_dtrsm.$(SUFFIX) \ - cblas_dsyrk.$(SUFFIX) cblas_dsyr2k.$(SUFFIX) cblas_domatcopy.$(SUFFIX) + cblas_dsyrk.$(SUFFIX) cblas_dsyr2k.$(SUFFIX) cblas_domatcopy.$(SUFFIX) cblas_dimatcopy.$(SUFFIX) CCBLAS1OBJS = \ cblas_icamax.$(SUFFIX) 
cblas_scasum.$(SUFFIX) cblas_caxpy.$(SUFFIX) \ @@ -306,7 +306,8 @@ CCBLAS3OBJS = \ cblas_cgemm.$(SUFFIX) cblas_csymm.$(SUFFIX) cblas_ctrmm.$(SUFFIX) cblas_ctrsm.$(SUFFIX) \ cblas_csyrk.$(SUFFIX) cblas_csyr2k.$(SUFFIX) \ cblas_chemm.$(SUFFIX) cblas_cherk.$(SUFFIX) cblas_cher2k.$(SUFFIX) \ - cblas_comatcopy.$(SUFFIX) + cblas_comatcopy.$(SUFFIX) cblas_cimatcopy.$(SUFFIX) + CZBLAS1OBJS = \ cblas_izamax.$(SUFFIX) cblas_dzasum.$(SUFFIX) cblas_zaxpy.$(SUFFIX) \ @@ -329,7 +330,8 @@ CZBLAS3OBJS = \ cblas_zgemm.$(SUFFIX) cblas_zsymm.$(SUFFIX) cblas_ztrmm.$(SUFFIX) cblas_ztrsm.$(SUFFIX) \ cblas_zsyrk.$(SUFFIX) cblas_zsyr2k.$(SUFFIX) \ cblas_zhemm.$(SUFFIX) cblas_zherk.$(SUFFIX) cblas_zher2k.$(SUFFIX)\ - cblas_zomatcopy.$(SUFFIX) + cblas_zomatcopy.$(SUFFIX) cblas_zimatcopy.$(SUFFIX) + ifndef NO_CBLAS @@ -2061,13 +2063,25 @@ cblas_zomatcopy.$(SUFFIX) cblas_zomatcopy.$(PSUFFIX) : zomatcopy.c dimatcopy.$(SUFFIX) dimatcopy.$(PSUFFIX) : imatcopy.c $(CC) -c $(CFLAGS) $< -o $(@F) +cblas_dimatcopy.$(SUFFIX) cblas_dimatcopy.$(PSUFFIX) : imatcopy.c + $(CC) -c $(CFLAGS) -DCBLAS $< -o $(@F) + simatcopy.$(SUFFIX) simatcopy.$(PSUFFIX) : imatcopy.c $(CC) -c $(CFLAGS) $< -o $(@F) +cblas_simatcopy.$(SUFFIX) cblas_simatcopy.$(PSUFFIX) : imatcopy.c + $(CC) -c $(CFLAGS) -DCBLAS $< -o $(@F) + cimatcopy.$(SUFFIX) cimatcopy.$(PSUFFIX) : zimatcopy.c $(CC) -c $(CFLAGS) $< -o $(@F) +cblas_cimatcopy.$(SUFFIX) cblas_cimatcopy.$(PSUFFIX) : zimatcopy.c + $(CC) -c $(CFLAGS) -DCBLAS $< -o $(@F) + zimatcopy.$(SUFFIX) zimatcopy.$(PSUFFIX) : zimatcopy.c $(CC) -c $(CFLAGS) $< -o $(@F) +cblas_zimatcopy.$(SUFFIX) cblas_zimatcopy.$(PSUFFIX) : zimatcopy.c + $(CC) -c $(CFLAGS) -DCBLAS $< -o $(@F) + diff --git a/interface/imatcopy.c b/interface/imatcopy.c index 3bc886f4f..89f0ec823 100644 --- a/interface/imatcopy.c +++ b/interface/imatcopy.c @@ -50,6 +50,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#undef malloc #undef free +#ifndef CBLAS void NAME( char* ORDER, char* TRANS, blasint *rows, blasint *cols, FLOAT *alpha, FLOAT *a, blasint *lda, blasint *ldb) { @@ -71,6 +72,28 @@ void NAME( char* ORDER, char* TRANS, blasint *rows, blasint *cols, FLOAT *alpha, if ( Trans == 'R' ) trans = BlasNoTrans; if ( Trans == 'T' ) trans = BlasTrans; if ( Trans == 'C' ) trans = BlasTrans; +#else +void CNAME( enum CBLAS_ORDER CORDER, enum CBLAS_TRANSPOSE CTRANS, blasint crows, blasint ccols, FLOAT calpha, FLOAT *a, blasint clda, blasint cldb) +{ + char Order, Trans; + int order=-1,trans=-1; + blasint info = -1; + FLOAT *b; + size_t msize; + blasint *lda, *ldb, *rows, *cols; + FLOAT *alpha; + + if ( CORDER == CblasColMajor) order = BlasColMajor; + if ( CORDER == CblasRowMajor) order = BlasRowMajor; + if ( CTRANS == CblasNoTrans || CTRANS == CblasConjNoTrans) trans = BlasNoTrans; + if ( CTRANS == CblasTrans || CTRANS == CblasConjTrans ) trans = BlasTrans; + + rows = &crows; + cols = &ccols; + alpha = &calpha; + lda = &clda; + ldb = &cldb; +#endif if ( order == BlasColMajor) { diff --git a/interface/zimatcopy.c b/interface/zimatcopy.c index 79af6d760..3f273cf13 100644 --- a/interface/zimatcopy.c +++ b/interface/zimatcopy.c @@ -49,6 +49,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define BlasTransConj 2 #define BlasConj 3 + +#ifndef CBLAS void NAME( char* ORDER, char* TRANS, blasint *rows, blasint *cols, FLOAT *alpha, FLOAT *a, blasint *lda, blasint *ldb) { @@ -71,6 +73,30 @@ void NAME( char* ORDER, char* TRANS, blasint *rows, blasint *cols, FLOAT *alpha, if ( Trans == 'C' ) trans = BlasTransConj; if ( Trans == 'R' ) trans = BlasConj; +#else +void CNAME( enum CBLAS_ORDER CORDER, enum CBLAS_TRANSPOSE CTRANS, blasint crows, blasint ccols, FLOAT *alpha, FLOAT *a, blasint clda, blasint cldb) +{ + + blasint *rows, *cols, *lda, *ldb; + int order=-1,trans=-1; + blasint info = -1; + FLOAT *b; + size_t msize; + + if ( CORDER == CblasColMajor ) order = BlasColMajor; + if ( CORDER == CblasRowMajor ) order = BlasRowMajor; + + if ( CTRANS == CblasNoTrans) trans = BlasNoTrans; + if ( CTRANS == CblasConjNoTrans ) trans = BlasConj; + if ( CTRANS == CblasTrans) trans = BlasTrans; + if ( CTRANS == CblasConjTrans) trans = BlasTransConj; + + rows = &crows; + cols = &ccols; + lda = &clda; + ldb = &cldb; +#endif + if ( order == BlasColMajor) { if ( trans == BlasNoTrans && *ldb < *rows ) info = 9; From 44f2bf9bae7b25356fc0179d6b935de4edadc637 Mon Sep 17 00:00:00 2001 From: wernsaar Date: Tue, 9 Sep 2014 13:34:22 +0200 Subject: [PATCH 075/119] added optimized dgemv_t kernel for haswell --- kernel/x86_64/dgemv_t_4.c | 623 +++++++++++++++++++++++ kernel/x86_64/dgemv_t_microk_haswell-4.c | 127 +++++ 2 files changed, 750 insertions(+) create mode 100644 kernel/x86_64/dgemv_t_4.c create mode 100644 kernel/x86_64/dgemv_t_microk_haswell-4.c diff --git a/kernel/x86_64/dgemv_t_4.c b/kernel/x86_64/dgemv_t_4.c new file mode 100644 index 000000000..0d0409bec --- /dev/null +++ b/kernel/x86_64/dgemv_t_4.c @@ -0,0 +1,623 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. 
+Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + + +#include "common.h" + +/* +#if defined(NEHALEM) +#include "dgemv_t_microk_nehalem-4.c" +#elif defined(BULLDOZER) || defined(PILEDRIVER) +#include "dgemv_t_microk_bulldozer-4.c" +#elif defined(SANDYBRIDGE) +#include "dgemv_t_microk_sandy-4.c" +#elif defined(HASWELL) +#include "dgemv_t_microk_haswell-4.c" +#endif +*/ + +#define NBMAX 2048 + +#ifndef HAVE_KERNEL_4x4 + +static void dgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) +{ + BLASLONG i; + FLOAT *a0,*a1,*a2,*a3; + a0 = ap[0]; + a1 = ap[1]; + a2 = ap[2]; + a3 = ap[3]; + FLOAT temp0 = 0.0; + FLOAT temp1 = 0.0; + FLOAT temp2 = 0.0; + FLOAT temp3 = 0.0; + + for ( i=0; i< n; i+=4 ) + { + temp0 += a0[i]*x[i] + a0[i+1]*x[i+1] + a0[i+2]*x[i+2] + a0[i+3]*x[i+3]; + temp1 += a1[i]*x[i] + a1[i+1]*x[i+1] + a1[i+2]*x[i+2] + a1[i+3]*x[i+3]; + temp2 += a2[i]*x[i] + a2[i+1]*x[i+1] + a2[i+2]*x[i+2] + a2[i+3]*x[i+3]; + temp3 += a3[i]*x[i] + a3[i+1]*x[i+1] + a3[i+2]*x[i+2] + a3[i+3]*x[i+3]; + } + y[0] = temp0; + y[1] = temp1; + y[2] = temp2; + y[3] = temp3; +} + +#endif + +static void dgemv_kernel_4x2(BLASLONG n, FLOAT *ap0, FLOAT *ap1, FLOAT *x, FLOAT *y) __attribute__ ((noinline)); + +static void dgemv_kernel_4x2(BLASLONG n, FLOAT *ap0, FLOAT *ap1, FLOAT *x, FLOAT *y) +{ + BLASLONG i; + + i=0; + + __asm__ __volatile__ + ( + "xorpd %%xmm10 , %%xmm10 \n\t" + "xorpd %%xmm11 , %%xmm11 \n\t" + + "testq $2 , %1 \n\t" + "jz .L01LABEL%= \n\t" + + "movups (%5,%0,8) , %%xmm14 \n\t" // x + "movups (%3,%0,8) , %%xmm12 \n\t" // ap0 + "movups (%4,%0,8) , %%xmm13 \n\t" // ap1 + "mulpd %%xmm14 , %%xmm12 \n\t" + "mulpd %%xmm14 , %%xmm13 \n\t" + "addq $2 , %0 \n\t" + "addpd %%xmm12 , %%xmm10 \n\t" + "subq $2 , %1 \n\t" + "addpd %%xmm13 , %%xmm11 \n\t" + + ".L01LABEL%=: \n\t" + + "cmpq $0, %1 \n\t" + "je .L01END%= \n\t" + + ".align 16 \n\t" + ".L01LOOP%=: \n\t" + + "movups (%5,%0,8) , %%xmm14 \n\t" // x + "movups (%3,%0,8) , %%xmm12 \n\t" // ap0 + 
"movups (%4,%0,8) , %%xmm13 \n\t" // ap1 + "mulpd %%xmm14 , %%xmm12 \n\t" + "mulpd %%xmm14 , %%xmm13 \n\t" + "addpd %%xmm12 , %%xmm10 \n\t" + "addpd %%xmm13 , %%xmm11 \n\t" + + "movups 16(%5,%0,8) , %%xmm14 \n\t" // x + "movups 16(%3,%0,8) , %%xmm12 \n\t" // ap0 + "movups 16(%4,%0,8) , %%xmm13 \n\t" // ap1 + "mulpd %%xmm14 , %%xmm12 \n\t" + "mulpd %%xmm14 , %%xmm13 \n\t" + "addpd %%xmm12 , %%xmm10 \n\t" + "addpd %%xmm13 , %%xmm11 \n\t" + + "addq $4 , %0 \n\t" + "subq $4 , %1 \n\t" + "jnz .L01LOOP%= \n\t" + + ".L01END%=: \n\t" + + "haddpd %%xmm10, %%xmm10 \n\t" + "haddpd %%xmm11, %%xmm11 \n\t" + + "movsd %%xmm10, (%2) \n\t" + "movsd %%xmm11,8(%2) \n\t" + + : + : + "r" (i), // 0 + "r" (n), // 1 + "r" (y), // 2 + "r" (ap0), // 3 + "r" (ap1), // 4 + "r" (x) // 5 + : "cc", + "%xmm4", "%xmm5", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + + +} + +static void dgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) __attribute__ ((noinline)); + +static void dgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) +{ + BLASLONG i; + + i=0; + + __asm__ __volatile__ + ( + "xorpd %%xmm9 , %%xmm9 \n\t" + "xorpd %%xmm10 , %%xmm10 \n\t" + + "testq $2 , %1 \n\t" + "jz .L01LABEL%= \n\t" + + "movups (%3,%0,8) , %%xmm12 \n\t" + "movups (%4,%0,8) , %%xmm11 \n\t" + "mulpd %%xmm11 , %%xmm12 \n\t" + "addq $2 , %0 \n\t" + "addpd %%xmm12 , %%xmm10 \n\t" + "subq $2 , %1 \n\t" + + ".L01LABEL%=: \n\t" + + "cmpq $0, %1 \n\t" + "je .L01END%= \n\t" + + ".align 16 \n\t" + ".L01LOOP%=: \n\t" + + "movups (%3,%0,8) , %%xmm12 \n\t" + "movups 16(%3,%0,8) , %%xmm14 \n\t" + "movups (%4,%0,8) , %%xmm11 \n\t" + "movups 16(%4,%0,8) , %%xmm13 \n\t" + "mulpd %%xmm11 , %%xmm12 \n\t" + "mulpd %%xmm13 , %%xmm14 \n\t" + "addq $4 , %0 \n\t" + "addpd %%xmm12 , %%xmm10 \n\t" + "subq $4 , %1 \n\t" + "addpd %%xmm14 , %%xmm9 \n\t" + + "jnz .L01LOOP%= \n\t" + + ".L01END%=: \n\t" + + "addpd %%xmm9 , %%xmm10 \n\t" + "haddpd %%xmm10, %%xmm10 \n\t" + + "movsd %%xmm10, (%2) \n\t" + + 
: + : + "r" (i), // 0 + "r" (n), // 1 + "r" (y), // 2 + "r" (ap), // 3 + "r" (x) // 4 + : "cc", + "%xmm9", "%xmm10" , + "%xmm11", "%xmm12", "%xmm13", "%xmm14", + "memory" + ); + + +} + +static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src) +{ + BLASLONG i; + for ( i=0; i> 2 ; + n2 = n & 3 ; + + m3 = m & 3 ; + m1 = m & -4 ; + m2 = (m & (NBMAX-1)) - m3 ; + + + BLASLONG NB = NBMAX; + + while ( NB == NBMAX ) + { + + m1 -= NB; + if ( m1 < 0) + { + if ( m2 == 0 ) break; + NB = m2; + } + + y_ptr = y; + a_ptr = a; + x_ptr = x; + + if ( inc_x == 1 ) + xbuffer = x_ptr; + else + copy_x(NB,x_ptr,xbuffer,inc_x); + + + FLOAT *ap[4]; + FLOAT *yp; + BLASLONG register lda4 = 4 * lda; + ap[0] = a_ptr; + ap[1] = a_ptr + lda; + ap[2] = ap[1] + lda; + ap[3] = ap[2] + lda; + + if ( n0 > 0 ) + { + BLASLONG nb1 = NBMAX / 4; + for( j=0; j 0 ) + { + add_y(n1*4, alpha, ytemp, y_ptr, inc_y ); + y_ptr += n1 * inc_y * 4; + a_ptr += n1 * lda4 ; + } + + if ( n2 & 2 ) + { + + dgemv_kernel_4x2(NB,ap[0],ap[1],xbuffer,ybuffer); + a_ptr += lda * 2; + *y_ptr += ybuffer[0] * alpha; + y_ptr += inc_y; + *y_ptr += ybuffer[1] * alpha; + y_ptr += inc_y; + + } + + if ( n2 & 1 ) + { + + dgemv_kernel_4x1(NB,a_ptr,xbuffer,ybuffer); + a_ptr += lda; + *y_ptr += ybuffer[0] * alpha; + y_ptr += inc_y; + + } + a += NB; + x += NB * inc_x; + } + + if ( m3 == 0 ) return(0); + + x_ptr = x; + a_ptr = a; + if ( m3 == 3 ) + { + FLOAT xtemp0 = *x_ptr * alpha; + x_ptr += inc_x; + FLOAT xtemp1 = *x_ptr * alpha; + x_ptr += inc_x; + FLOAT xtemp2 = *x_ptr * alpha; + + FLOAT *aj = a_ptr; + y_ptr = y; + + if ( lda == 3 && inc_y == 1 ) + { + + for ( j=0; j< ( n & -4) ; j+=4 ) + { + + y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1 + aj[2] * xtemp2; + y_ptr[j+1] += aj[3] * xtemp0 + aj[4] * xtemp1 + aj[5] * xtemp2; + y_ptr[j+2] += aj[6] * xtemp0 + aj[7] * xtemp1 + aj[8] * xtemp2; + y_ptr[j+3] += aj[9] * xtemp0 + aj[10] * xtemp1 + aj[11] * xtemp2; + aj += 12; + } + + for ( ; j Date: Tue, 9 Sep 2014 13:54:55 +0200 
Subject: [PATCH 076/119] added optimized gemv kernels --- kernel/x86_64/KERNEL.HASWELL | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/kernel/x86_64/KERNEL.HASWELL b/kernel/x86_64/KERNEL.HASWELL index d0ac9c72f..8aab560c4 100644 --- a/kernel/x86_64/KERNEL.HASWELL +++ b/kernel/x86_64/KERNEL.HASWELL @@ -1,8 +1,8 @@ -SGEMVNKERNEL = sgemv_n.c -SGEMVTKERNEL = sgemv_t.c +SGEMVNKERNEL = sgemv_n_4.c +SGEMVTKERNEL = sgemv_t_4.c -DGEMVNKERNEL = dgemv_n.c -DGEMVTKERNEL = dgemv_t.c +DGEMVNKERNEL = dgemv_n_4.c +DGEMVTKERNEL = dgemv_t_4.c ZGEMVNKERNEL = zgemv_n.c ZGEMVTKERNEL = zgemv_t.c From debc6d1a056f7d2763dfb4de01cf1b4780a2536b Mon Sep 17 00:00:00 2001 From: wernsaar Date: Tue, 9 Sep 2014 14:04:44 +0200 Subject: [PATCH 077/119] bugfix in KERNEL.HASWELL --- kernel/x86_64/KERNEL.HASWELL | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/x86_64/KERNEL.HASWELL b/kernel/x86_64/KERNEL.HASWELL index 8aab560c4..9a5c54ffc 100644 --- a/kernel/x86_64/KERNEL.HASWELL +++ b/kernel/x86_64/KERNEL.HASWELL @@ -1,7 +1,7 @@ SGEMVNKERNEL = sgemv_n_4.c SGEMVTKERNEL = sgemv_t_4.c -DGEMVNKERNEL = dgemv_n_4.c +DGEMVNKERNEL = dgemv_n.c DGEMVTKERNEL = dgemv_t_4.c ZGEMVNKERNEL = zgemv_n.c From 8109d8232c66c4119044fa3111947d88afae9eb6 Mon Sep 17 00:00:00 2001 From: wernsaar Date: Tue, 9 Sep 2014 14:38:08 +0200 Subject: [PATCH 078/119] optimized dgemv_t kernel for haswell --- kernel/x86_64/dgemv_t_4.c | 10 +--------- kernel/x86_64/dgemv_t_microk_haswell-4.c | 22 +++++++++++----------- 2 files changed, 12 insertions(+), 20 deletions(-) diff --git a/kernel/x86_64/dgemv_t_4.c b/kernel/x86_64/dgemv_t_4.c index 0d0409bec..ebec7d2c3 100644 --- a/kernel/x86_64/dgemv_t_4.c +++ b/kernel/x86_64/dgemv_t_4.c @@ -28,17 +28,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" -/* -#if defined(NEHALEM) -#include "dgemv_t_microk_nehalem-4.c" -#elif defined(BULLDOZER) || defined(PILEDRIVER) -#include "dgemv_t_microk_bulldozer-4.c" -#elif defined(SANDYBRIDGE) -#include "dgemv_t_microk_sandy-4.c" -#elif defined(HASWELL) +#if defined(HASWELL) #include "dgemv_t_microk_haswell-4.c" #endif -*/ #define NBMAX 2048 diff --git a/kernel/x86_64/dgemv_t_microk_haswell-4.c b/kernel/x86_64/dgemv_t_microk_haswell-4.c index 410225500..33b43515d 100644 --- a/kernel/x86_64/dgemv_t_microk_haswell-4.c +++ b/kernel/x86_64/dgemv_t_microk_haswell-4.c @@ -61,25 +61,25 @@ static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) ".align 16 \n\t" ".L01LOOP%=: \n\t" - "prefetcht0 384(%2,%0,8) \n\t" + // "prefetcht0 384(%2,%0,8) \n\t" "vmovups (%2,%0,8), %%ymm12 \n\t" // 4 * x "vmovups 32(%2,%0,8), %%ymm13 \n\t" // 4 * x - "prefetcht0 384(%4,%0,8) \n\t" + // "prefetcht0 384(%4,%0,8) \n\t" "vfmadd231pd (%4,%0,8), %%ymm12, %%ymm4 \n\t" "vfmadd231pd (%5,%0,8), %%ymm12, %%ymm5 \n\t" - "prefetcht0 384(%5,%0,8) \n\t" - "vfmadd231pd 32(%4,%0,8), %%ymm13, %%ymm4 \n\t" - "vfmadd231pd 32(%5,%0,8), %%ymm13, %%ymm5 \n\t" - "prefetcht0 384(%6,%0,8) \n\t" + // "prefetcht0 384(%5,%0,8) \n\t" "vfmadd231pd (%6,%0,8), %%ymm12, %%ymm6 \n\t" "vfmadd231pd (%7,%0,8), %%ymm12, %%ymm7 \n\t" - "prefetcht0 384(%7,%0,8) \n\t" - "vfmadd231pd 32(%6,%0,8), %%ymm13, %%ymm6 \n\t" - "vfmadd231pd 32(%7,%0,8), %%ymm13, %%ymm7 \n\t" + // "prefetcht0 384(%6,%0,8) \n\t" + "vfmadd231pd 32(%4,%0,8), %%ymm13, %%ymm4 \n\t" + "vfmadd231pd 32(%5,%0,8), %%ymm13, %%ymm5 \n\t" + "addq $8 , %0 \n\t" + // "prefetcht0 384(%7,%0,8) \n\t" + "vfmadd231pd -32(%6,%0,8), %%ymm13, %%ymm6 \n\t" + "subq $8 , %1 \n\t" + "vfmadd231pd -32(%7,%0,8), %%ymm13, %%ymm7 \n\t" - "addq $8 , %0 \n\t" - "subq $8 , %1 \n\t" "jnz .L01LOOP%= \n\t" ".L16END%=: \n\t" From faab7a181d72023c11b098da9cabc49a2ae3701d Mon Sep 17 00:00:00 2001 From: wernsaar Date: Tue, 9 Sep 2014 15:32:32 +0200 Subject: [PATCH 
079/119] added optimized dgemv_n kernel for haswell --- kernel/x86_64/dgemv_n_microk_haswell-4.c | 247 +++++++++++++++++++++++ 1 file changed, 247 insertions(+) create mode 100644 kernel/x86_64/dgemv_n_microk_haswell-4.c diff --git a/kernel/x86_64/dgemv_n_microk_haswell-4.c b/kernel/x86_64/dgemv_n_microk_haswell-4.c new file mode 100644 index 000000000..2c77f3469 --- /dev/null +++ b/kernel/x86_64/dgemv_n_microk_haswell-4.c @@ -0,0 +1,247 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + + + +#define HAVE_KERNEL_4x8 1 +static void dgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLONG lda4, FLOAT *alpha) __attribute__ ((noinline)); + +static void dgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLONG lda4, FLOAT *alpha) +{ + + BLASLONG register i = 0; + + __asm__ __volatile__ + ( + "vzeroupper \n\t" + "vbroadcastsd (%2), %%ymm12 \n\t" // x0 + "vbroadcastsd 8(%2), %%ymm13 \n\t" // x1 + "vbroadcastsd 16(%2), %%ymm14 \n\t" // x2 + "vbroadcastsd 24(%2), %%ymm15 \n\t" // x3 + "vbroadcastsd 32(%2), %%ymm0 \n\t" // x4 + "vbroadcastsd 40(%2), %%ymm1 \n\t" // x5 + "vbroadcastsd 48(%2), %%ymm2 \n\t" // x6 + "vbroadcastsd 56(%2), %%ymm3 \n\t" // x7 + + "vbroadcastsd (%9), %%ymm6 \n\t" // alpha + + "testq $0x04, %1 \n\t" + "jz .L8LABEL%= \n\t" + + "vmovupd (%3,%0,8), %%ymm7 \n\t" // 4 * y + "vxorpd %%ymm4 , %%ymm4, %%ymm4 \n\t" + "vxorpd %%ymm5 , %%ymm5, %%ymm5 \n\t" + + "vfmadd231pd (%4,%0,8), %%ymm12, %%ymm4 \n\t" + "vfmadd231pd (%5,%0,8), %%ymm13, %%ymm5 \n\t" + "vfmadd231pd (%6,%0,8), %%ymm14, %%ymm4 \n\t" + "vfmadd231pd (%7,%0,8), %%ymm15, %%ymm5 \n\t" + + "vfmadd231pd (%4,%8,8), %%ymm0 , %%ymm4 \n\t" + "vfmadd231pd (%5,%8,8), %%ymm1 , %%ymm5 \n\t" + "vfmadd231pd (%6,%8,8), %%ymm2 , %%ymm4 \n\t" + "vfmadd231pd (%7,%8,8), %%ymm3 , %%ymm5 \n\t" + + "vaddpd %%ymm4 , %%ymm5 , %%ymm5 \n\t" + "vmulpd %%ymm6 , %%ymm5 , %%ymm5 \n\t" + 
"vaddpd %%ymm7 , %%ymm5 , %%ymm5 \n\t" + + + "vmovupd %%ymm5, (%3,%0,8) \n\t" // 4 * y + + "addq $4 , %8 \n\t" + "addq $4 , %0 \n\t" + "subq $4 , %1 \n\t" + + ".L8LABEL%=: \n\t" + + "cmpq $0, %1 \n\t" + "je .L16END%= \n\t" + + + ".align 16 \n\t" + ".L01LOOP%=: \n\t" + + "vxorpd %%ymm4 , %%ymm4, %%ymm4 \n\t" + "vxorpd %%ymm5 , %%ymm5, %%ymm5 \n\t" + "vmovupd (%3,%0,8), %%ymm8 \n\t" // 4 * y + "vmovupd 32(%3,%0,8), %%ymm9 \n\t" // 4 * y + + "vfmadd231pd (%4,%0,8), %%ymm12, %%ymm4 \n\t" + "vfmadd231pd 32(%4,%0,8), %%ymm12, %%ymm5 \n\t" + "vfmadd231pd (%5,%0,8), %%ymm13, %%ymm4 \n\t" + "vfmadd231pd 32(%5,%0,8), %%ymm13, %%ymm5 \n\t" + "vfmadd231pd (%6,%0,8), %%ymm14, %%ymm4 \n\t" + "vfmadd231pd 32(%6,%0,8), %%ymm14, %%ymm5 \n\t" + "vfmadd231pd (%7,%0,8), %%ymm15, %%ymm4 \n\t" + "vfmadd231pd 32(%7,%0,8), %%ymm15, %%ymm5 \n\t" + + "vfmadd231pd (%4,%8,8), %%ymm0 , %%ymm4 \n\t" + "addq $8 , %0 \n\t" + "vfmadd231pd 32(%4,%8,8), %%ymm0 , %%ymm5 \n\t" + "vfmadd231pd (%5,%8,8), %%ymm1 , %%ymm4 \n\t" + "vfmadd231pd 32(%5,%8,8), %%ymm1 , %%ymm5 \n\t" + "vfmadd231pd (%6,%8,8), %%ymm2 , %%ymm4 \n\t" + "vfmadd231pd 32(%6,%8,8), %%ymm2 , %%ymm5 \n\t" + "vfmadd231pd (%7,%8,8), %%ymm3 , %%ymm4 \n\t" + "vfmadd231pd 32(%7,%8,8), %%ymm3 , %%ymm5 \n\t" + + "vfmadd231pd %%ymm6 , %%ymm4 , %%ymm8 \n\t" + "vfmadd231pd %%ymm6 , %%ymm5 , %%ymm9 \n\t" + + "addq $8 , %8 \n\t" + "vmovupd %%ymm8,-64(%3,%0,8) \n\t" // 4 * y + "subq $8 , %1 \n\t" + "vmovupd %%ymm9,-32(%3,%0,8) \n\t" // 4 * y + + "jnz .L01LOOP%= \n\t" + + ".L16END%=: \n\t" + "vzeroupper \n\t" + + : + : + "r" (i), // 0 + "r" (n), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (ap[0]), // 4 + "r" (ap[1]), // 5 + "r" (ap[2]), // 6 + "r" (ap[3]), // 7 + "r" (lda4), // 8 + "r" (alpha) // 9 + : "cc", + "%xmm0", "%xmm1", + "%xmm2", "%xmm3", + "%xmm4", "%xmm5", + "%xmm6", "%xmm7", + "%xmm8", "%xmm9", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + +} + + + +#define HAVE_KERNEL_4x4 1 +static void dgemv_kernel_4x4( BLASLONG n, FLOAT 
**ap, FLOAT *x, FLOAT *y, FLOAT *alpha) __attribute__ ((noinline)); + +static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) +{ + + BLASLONG register i = 0; + + __asm__ __volatile__ + ( + "vzeroupper \n\t" + "vbroadcastsd (%2), %%ymm12 \n\t" // x0 + "vbroadcastsd 8(%2), %%ymm13 \n\t" // x1 + "vbroadcastsd 16(%2), %%ymm14 \n\t" // x2 + "vbroadcastsd 24(%2), %%ymm15 \n\t" // x3 + + "vbroadcastsd (%8), %%ymm6 \n\t" // alpha + + "testq $0x04, %1 \n\t" + "jz .L8LABEL%= \n\t" + + "vxorpd %%ymm4 , %%ymm4, %%ymm4 \n\t" + "vxorpd %%ymm5 , %%ymm5, %%ymm5 \n\t" + "vmovupd (%3,%0,8), %%ymm7 \n\t" // 4 * y + + "vfmadd231pd (%4,%0,8), %%ymm12, %%ymm4 \n\t" + "vfmadd231pd (%5,%0,8), %%ymm13, %%ymm5 \n\t" + "vfmadd231pd (%6,%0,8), %%ymm14, %%ymm4 \n\t" + "vfmadd231pd (%7,%0,8), %%ymm15, %%ymm5 \n\t" + + "vaddpd %%ymm4 , %%ymm5 , %%ymm5 \n\t" + "vmulpd %%ymm6 , %%ymm5 , %%ymm5 \n\t" + "vaddpd %%ymm7 , %%ymm5 , %%ymm5 \n\t" + + "vmovupd %%ymm5, (%3,%0,8) \n\t" // 4 * y + + "addq $4 , %0 \n\t" + "subq $4 , %1 \n\t" + + ".L8LABEL%=: \n\t" + + "cmpq $0, %1 \n\t" + "je .L8END%= \n\t" + + + ".align 16 \n\t" + ".L01LOOP%=: \n\t" + "vxorpd %%ymm4 , %%ymm4, %%ymm4 \n\t" + "vxorpd %%ymm5 , %%ymm5, %%ymm5 \n\t" + "vmovupd (%3,%0,8), %%ymm8 \n\t" // 4 * y + "vmovupd 32(%3,%0,8), %%ymm9 \n\t" // 4 * y + + "vfmadd231pd (%4,%0,8), %%ymm12, %%ymm4 \n\t" + "vfmadd231pd 32(%4,%0,8), %%ymm12, %%ymm5 \n\t" + "vfmadd231pd (%5,%0,8), %%ymm13, %%ymm4 \n\t" + "vfmadd231pd 32(%5,%0,8), %%ymm13, %%ymm5 \n\t" + "vfmadd231pd (%6,%0,8), %%ymm14, %%ymm4 \n\t" + "vfmadd231pd 32(%6,%0,8), %%ymm14, %%ymm5 \n\t" + "vfmadd231pd (%7,%0,8), %%ymm15, %%ymm4 \n\t" + "vfmadd231pd 32(%7,%0,8), %%ymm15, %%ymm5 \n\t" + + "vfmadd231pd %%ymm6 , %%ymm4 , %%ymm8 \n\t" + "vfmadd231pd %%ymm6 , %%ymm5 , %%ymm9 \n\t" + + "vmovupd %%ymm8, (%3,%0,8) \n\t" // 4 * y + "vmovupd %%ymm9, 32(%3,%0,8) \n\t" // 4 * y + + "addq $8 , %0 \n\t" + "subq $8 , %1 \n\t" + "jnz .L01LOOP%= \n\t" + + ".L8END%=: \n\t" + 
"vzeroupper \n\t" + + : + : + "r" (i), // 0 + "r" (n), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (ap[0]), // 4 + "r" (ap[1]), // 5 + "r" (ap[2]), // 6 + "r" (ap[3]), // 7 + "r" (alpha) // 8 + : "cc", + "%xmm4", "%xmm5", + "%xmm6", "%xmm7", + "%xmm8", "%xmm9", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + +} + + From baa46e4fba439a4dea4eed9fe82d0cd164f77a5a Mon Sep 17 00:00:00 2001 From: wernsaar Date: Tue, 9 Sep 2014 16:17:45 +0200 Subject: [PATCH 080/119] added and tested optimized dgemv_n kernel for haswell --- kernel/x86_64/KERNEL.HASWELL | 2 +- kernel/x86_64/dgemv_n_4.c | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/kernel/x86_64/KERNEL.HASWELL b/kernel/x86_64/KERNEL.HASWELL index 9a5c54ffc..8aab560c4 100644 --- a/kernel/x86_64/KERNEL.HASWELL +++ b/kernel/x86_64/KERNEL.HASWELL @@ -1,7 +1,7 @@ SGEMVNKERNEL = sgemv_n_4.c SGEMVTKERNEL = sgemv_t_4.c -DGEMVNKERNEL = dgemv_n.c +DGEMVNKERNEL = dgemv_n_4.c DGEMVTKERNEL = dgemv_t_4.c ZGEMVNKERNEL = zgemv_n.c diff --git a/kernel/x86_64/dgemv_n_4.c b/kernel/x86_64/dgemv_n_4.c index 249df8009..371fd73ee 100644 --- a/kernel/x86_64/dgemv_n_4.c +++ b/kernel/x86_64/dgemv_n_4.c @@ -31,6 +31,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#if defined(NEHALEM) #include "dgemv_n_microk_nehalem-4.c" +#elif defined(HASWELL) +#include "dgemv_n_microk_haswell-4.c" #endif From 4aa534ae93ee50558c07172beaea68964a356926 Mon Sep 17 00:00:00 2001 From: wernsaar Date: Wed, 10 Sep 2014 13:45:13 +0200 Subject: [PATCH 081/119] added cgemv_n kernel, optimized for small sizes --- kernel/x86_64/cgemv_n_4.c | 531 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 531 insertions(+) create mode 100644 kernel/x86_64/cgemv_n_4.c diff --git a/kernel/x86_64/cgemv_n_4.c b/kernel/x86_64/cgemv_n_4.c new file mode 100644 index 000000000..162b7d99f --- /dev/null +++ b/kernel/x86_64/cgemv_n_4.c @@ -0,0 +1,531 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include +#include +#include "common.h" + +#if defined(HASWELL) +#include "cgemv_n_microk_haswell-2.c" +#endif + + +#define NBMAX 2048 + +#ifndef HAVE_KERNEL_4x4 + +static void cgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) +{ + BLASLONG i; + FLOAT *a0,*a1,*a2,*a3; + a0 = ap[0]; + a1 = ap[1]; + a2 = ap[2]; + a3 = ap[3]; + + for ( i=0; i< 2*n; i+=2 ) + { +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + y[i] += a0[i]*x[0] - a0[i+1] * x[1]; + y[i+1] += a0[i]*x[1] + a0[i+1] * x[0]; + y[i] += a1[i]*x[2] - a1[i+1] * x[3]; + y[i+1] += a1[i]*x[3] + a1[i+1] * x[2]; + y[i] += a2[i]*x[4] - a2[i+1] * x[5]; + y[i+1] += a2[i]*x[5] + a2[i+1] * x[4]; + y[i] += a3[i]*x[6] - a3[i+1] * x[7]; + y[i+1] += a3[i]*x[7] + a3[i+1] * x[6]; +#else + y[i] += a0[i]*x[0] + a0[i+1] * x[1]; + y[i+1] += a0[i]*x[1] - a0[i+1] * x[0]; + y[i] += a1[i]*x[2] + a1[i+1] * x[3]; + y[i+1] += a1[i]*x[3] - a1[i+1] * x[2]; + y[i] += a2[i]*x[4] + a2[i+1] * x[5]; + y[i+1] += a2[i]*x[5] - a2[i+1] * x[4]; + y[i] += a3[i]*x[6] + a3[i+1] * x[7]; + y[i+1] += a3[i]*x[7] - a3[i+1] * x[6]; +#endif + } +} + +#endif + +static void cgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) +{ + BLASLONG i; + FLOAT *a0; + a0 = ap; + + for ( i=0; i< 2*n; i+=2 ) + { +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + y[i] += a0[i]*x[0] - a0[i+1] * x[1]; 
+ y[i+1] += a0[i]*x[1] + a0[i+1] * x[0]; +#else + y[i] += a0[i]*x[0] + a0[i+1] * x[1]; + y[i+1] += a0[i]*x[1] - a0[i+1] * x[0]; +#endif + + } +} + + +static void zero_y(BLASLONG n, FLOAT *dest) +{ + BLASLONG i; + for ( i=0; i<2*n; i++ ) + { + *dest = 0.0; + dest++; + } +} + + + +static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest,FLOAT alpha_r, FLOAT alpha_i) +{ + BLASLONG i; + FLOAT temp_r; + FLOAT temp_i; + for ( i=0; i Date: Wed, 10 Sep 2014 14:11:24 +0200 Subject: [PATCH 082/119] added optimized cgemv_kernel for haswell --- kernel/x86_64/cgemv_n_microk_haswell-4.c | 182 +++++++++++++++++++++++ 1 file changed, 182 insertions(+) create mode 100644 kernel/x86_64/cgemv_n_microk_haswell-4.c diff --git a/kernel/x86_64/cgemv_n_microk_haswell-4.c b/kernel/x86_64/cgemv_n_microk_haswell-4.c new file mode 100644 index 000000000..991c1dc59 --- /dev/null +++ b/kernel/x86_64/cgemv_n_microk_haswell-4.c @@ -0,0 +1,182 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define HAVE_KERNEL_4x4 1 +static void cgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) __attribute__ ((noinline)); + +static void cgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) +{ + + BLASLONG register i = 0; + BLASLONG register n1 = n & -8 ; + BLASLONG register n2 = n & 4 ; + + __asm__ __volatile__ + ( + "vzeroupper \n\t" + + "vbroadcastss (%2), %%ymm0 \n\t" // real part x0 + "vbroadcastss 4(%2), %%ymm1 \n\t" // imag part x0 + "vbroadcastss 8(%2), %%ymm2 \n\t" // real part x1 + "vbroadcastss 12(%2), %%ymm3 \n\t" // imag part x1 + "vbroadcastss 16(%2), %%ymm4 \n\t" // real part x2 + "vbroadcastss 20(%2), %%ymm5 \n\t" // imag part x2 + "vbroadcastss 24(%2), %%ymm6 \n\t" // real part x3 + "vbroadcastss 28(%2), %%ymm7 \n\t" // imag part x3 + + "cmpq $0 , %1 \n\t" + "je .L01END%= \n\t" + + ".align 16 \n\t" + ".L01LOOP%=: \n\t" + "prefetcht0 320(%4,%0,4) \n\t" + "vmovups (%4,%0,4), %%ymm8 \n\t" // 4 complex values form a0 + "vmovups 32(%4,%0,4), %%ymm9 \n\t" // 4 complex values form a0 + + "prefetcht0 320(%5,%0,4) \n\t" + "vmovups (%5,%0,4), %%ymm10 \n\t" // 4 complex values form a1 + "vmovups 32(%5,%0,4), 
%%ymm11 \n\t" // 4 complex values form a1 + + "vmulps %%ymm8 , %%ymm0, %%ymm12 \n\t" // a_r[0] * x_r , a_i[0] * x_r, a_r[1] * x_r, a_i[1] * x_r + "vmulps %%ymm8 , %%ymm1, %%ymm13 \n\t" // a_r[0] * x_i , a_i[0] * x_i, a_r[1] * x_i, a_i[1] * x_i + "vmulps %%ymm9 , %%ymm0, %%ymm14 \n\t" // a_r[2] * x_r , a_i[2] * x_r, a_r[3] * x_r, a_i[3] * x_r + "vmulps %%ymm9 , %%ymm1, %%ymm15 \n\t" // a_r[2] * x_i , a_i[2] * x_i, a_r[3] * x_i, a_i[3] * x_i + + "prefetcht0 320(%6,%0,4) \n\t" + "vmovups (%6,%0,4), %%ymm8 \n\t" // 4 complex values form a2 + "vmovups 32(%6,%0,4), %%ymm9 \n\t" // 4 complex values form a2 + + "vfmadd231ps %%ymm10, %%ymm2, %%ymm12 \n\t" // a_r[0] * x_r , a_i[0] * x_r, a_r[1] * x_r, a_i[1] * x_r + "vfmadd231ps %%ymm10, %%ymm3, %%ymm13 \n\t" // a_r[0] * x_i , a_i[0] * x_i, a_r[1] * x_i, a_i[1] * x_i + "vfmadd231ps %%ymm11, %%ymm2, %%ymm14 \n\t" // a_r[2] * x_r , a_i[2] * x_r, a_r[3] * x_r, a_i[3] * x_r + "vfmadd231ps %%ymm11, %%ymm3, %%ymm15 \n\t" // a_r[2] * x_i , a_i[2] * x_i, a_r[3] * x_i, a_i[3] * x_i + + "prefetcht0 320(%7,%0,4) \n\t" + "vmovups (%7,%0,4), %%ymm10 \n\t" // 4 complex values form a3 + "vmovups 32(%7,%0,4), %%ymm11 \n\t" // 4 complex values form a3 + + "vfmadd231ps %%ymm8 , %%ymm4, %%ymm12 \n\t" // a_r[0] * x_r , a_i[0] * x_r, a_r[1] * x_r, a_i[1] * x_r + "vfmadd231ps %%ymm8 , %%ymm5, %%ymm13 \n\t" // a_r[0] * x_i , a_i[0] * x_i, a_r[1] * x_i, a_i[1] * x_i + "vfmadd231ps %%ymm9 , %%ymm4, %%ymm14 \n\t" // a_r[2] * x_r , a_i[2] * x_r, a_r[3] * x_r, a_i[3] * x_r + "vfmadd231ps %%ymm9 , %%ymm5, %%ymm15 \n\t" // a_r[2] * x_i , a_i[2] * x_i, a_r[3] * x_i, a_i[3] * x_i + + "vfmadd231ps %%ymm10, %%ymm6, %%ymm12 \n\t" // a_r[0] * x_r , a_i[0] * x_r, a_r[1] * x_r, a_i[1] * x_r + "vfmadd231ps %%ymm10, %%ymm7, %%ymm13 \n\t" // a_r[0] * x_i , a_i[0] * x_i, a_r[1] * x_i, a_i[1] * x_i + "vfmadd231ps %%ymm11, %%ymm6, %%ymm14 \n\t" // a_r[2] * x_r , a_i[2] * x_r, a_r[3] * x_r, a_i[3] * x_r + "vfmadd231ps %%ymm11, %%ymm7, %%ymm15 \n\t" // a_r[2] * x_i , 
a_i[2] * x_i, a_r[3] * x_i, a_i[3] * x_i + + "prefetcht0 320(%3,%0,4) \n\t" + "vmovups (%3,%0,4), %%ymm10 \n\t" + "vmovups 32(%3,%0,4), %%ymm11 \n\t" + +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + "vpermilps $0xb1 , %%ymm13, %%ymm13 \n\t" + "vpermilps $0xb1 , %%ymm15, %%ymm15 \n\t" + "vaddsubps %%ymm13, %%ymm12, %%ymm8 \n\t" + "vaddsubps %%ymm15, %%ymm14, %%ymm9 \n\t" +#else + "vpermilps $0xb1 , %%ymm12, %%ymm12 \n\t" + "vpermilps $0xb1 , %%ymm14, %%ymm14 \n\t" + "vaddsubps %%ymm12, %%ymm13, %%ymm8 \n\t" + "vaddsubps %%ymm14, %%ymm15, %%ymm9 \n\t" + "vpermilps $0xb1 , %%ymm8 , %%ymm8 \n\t" + "vpermilps $0xb1 , %%ymm9 , %%ymm9 \n\t" +#endif + + "vaddps %%ymm8, %%ymm10, %%ymm12 \n\t" + "vaddps %%ymm9, %%ymm11, %%ymm13 \n\t" + + "vmovups %%ymm12, (%3,%0,4) \n\t" // 4 complex values to y + "vmovups %%ymm13, 32(%3,%0,4) \n\t" + + "addq $16, %0 \n\t" + "subq $8 , %1 \n\t" + "jnz .L01LOOP%= \n\t" + + ".L01END%=: \n\t" + + "cmpq $4, %8 \n\t" + "jne .L02END%= \n\t" + + "vmovups (%4,%0,4), %%ymm8 \n\t" // 4 complex values form a0 + "vmovups (%5,%0,4), %%ymm10 \n\t" // 4 complex values form a1 + + "vmulps %%ymm8 , %%ymm0, %%ymm12 \n\t" // a_r[0] * x_r , a_i[0] * x_r, a_r[1] * x_r, a_i[1] * x_r + "vmulps %%ymm8 , %%ymm1, %%ymm13 \n\t" // a_r[0] * x_i , a_i[0] * x_i, a_r[1] * x_i, a_i[1] * x_i + + "vfmadd231ps %%ymm10, %%ymm2, %%ymm12 \n\t" // a_r[0] * x_r , a_i[0] * x_r, a_r[1] * x_r, a_i[1] * x_r + "vfmadd231ps %%ymm10, %%ymm3, %%ymm13 \n\t" // a_r[0] * x_i , a_i[0] * x_i, a_r[1] * x_i, a_i[1] * x_i + + "vmovups (%6,%0,4), %%ymm8 \n\t" // 4 complex values form a2 + "vmovups (%7,%0,4), %%ymm10 \n\t" // 4 complex values form a3 + + "vfmadd231ps %%ymm8 , %%ymm4, %%ymm12 \n\t" // a_r[0] * x_r , a_i[0] * x_r, a_r[1] * x_r, a_i[1] * x_r + "vfmadd231ps %%ymm8 , %%ymm5, %%ymm13 \n\t" // a_r[0] * x_i , a_i[0] * x_i, a_r[1] * x_i, a_i[1] * x_i + + "vfmadd231ps %%ymm10, %%ymm6, %%ymm12 \n\t" // a_r[0] * x_r , a_i[0] * x_r, a_r[1] * x_r, a_i[1] * 
x_r + "vfmadd231ps %%ymm10, %%ymm7, %%ymm13 \n\t" // a_r[0] * x_i , a_i[0] * x_i, a_r[1] * x_i, a_i[1] * x_i + + "vmovups (%3,%0,4), %%ymm10 \n\t" + +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + "vpermilps $0xb1 , %%ymm13, %%ymm13 \n\t" + "vaddsubps %%ymm13, %%ymm12, %%ymm8 \n\t" +#else + "vpermilps $0xb1 , %%ymm12, %%ymm12 \n\t" + "vaddsubps %%ymm12, %%ymm13, %%ymm8 \n\t" + "vpermilps $0xb1 , %%ymm8 , %%ymm8 \n\t" +#endif + + "vaddps %%ymm8, %%ymm10, %%ymm12 \n\t" + + "vmovups %%ymm12, (%3,%0,4) \n\t" // 4 complex values to y + + ".L02END%=: \n\t" + "vzeroupper \n\t" + + : + : + "r" (i), // 0 + "r" (n1), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (ap[0]), // 4 + "r" (ap[1]), // 5 + "r" (ap[2]), // 6 + "r" (ap[3]), // 7 + "r" (n2) // 8 + : "cc", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + +} + + From f98e1244c45dd446275829064692274c7df5a90a Mon Sep 17 00:00:00 2001 From: wernsaar Date: Wed, 10 Sep 2014 19:26:14 +0200 Subject: [PATCH 083/119] optimized cgemv_n_4.c --- kernel/x86_64/cgemv_n_4.c | 138 +++++++++++++++++++++++++++++++------- 1 file changed, 113 insertions(+), 25 deletions(-) diff --git a/kernel/x86_64/cgemv_n_4.c b/kernel/x86_64/cgemv_n_4.c index 162b7d99f..2a5afd5b3 100644 --- a/kernel/x86_64/cgemv_n_4.c +++ b/kernel/x86_64/cgemv_n_4.c @@ -30,7 +30,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" #if defined(HASWELL) -#include "cgemv_n_microk_haswell-2.c" +#include "cgemv_n_microk_haswell-4.c" #endif @@ -73,6 +73,41 @@ static void cgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) #endif + + +#ifndef HAVE_KERNEL_4x2 + +static void cgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) +{ + BLASLONG i; + FLOAT *a0,*a1; + a0 = ap[0]; + a1 = ap[1]; + + for ( i=0; i< 2*n; i+=2 ) + { +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + y[i] += a0[i]*x[0] - a0[i+1] * x[1]; + y[i+1] += a0[i]*x[1] + a0[i+1] * x[0]; + y[i] += a1[i]*x[2] - a1[i+1] * x[3]; + y[i+1] += a1[i]*x[3] + a1[i+1] * x[2]; +#else + y[i] += a0[i]*x[0] + a0[i+1] * x[1]; + y[i+1] += a0[i]*x[1] - a0[i+1] * x[0]; + y[i] += a1[i]*x[2] + a1[i+1] * x[3]; + y[i+1] += a1[i]*x[3] - a1[i+1] * x[2]; +#endif + } +} + +#endif + + + + +#ifndef HAVE_KERNEL_4x1 + + static void cgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) { BLASLONG i; @@ -91,41 +126,85 @@ static void cgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) } } - - -static void zero_y(BLASLONG n, FLOAT *dest) -{ - BLASLONG i; - for ( i=0; i<2*n; i++ ) - { - *dest = 0.0; - dest++; - } -} +#endif + + +static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest,FLOAT alpha_r, FLOAT alpha_i) __attribute__ ((noinline)); static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest,FLOAT alpha_r, FLOAT alpha_i) { BLASLONG i; - FLOAT temp_r; - FLOAT temp_i; - for ( i=0; i Date: Thu, 11 Sep 2014 10:25:48 +0200 Subject: [PATCH 084/119] more optimizations --- kernel/x86_64/cgemv_n_4.c | 4 + kernel/x86_64/cgemv_n_microk_haswell-4.c | 360 +++++++++++++++++++++++ 2 files changed, 364 insertions(+) diff --git a/kernel/x86_64/cgemv_n_4.c b/kernel/x86_64/cgemv_n_4.c index 2a5afd5b3..ff8058549 100644 --- a/kernel/x86_64/cgemv_n_4.c +++ b/kernel/x86_64/cgemv_n_4.c @@ -131,6 +131,8 @@ static void cgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) 
#endif +#ifndef HAVE_KERNEL_ADDY + static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest,FLOAT alpha_r, FLOAT alpha_i) __attribute__ ((noinline)); static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest,FLOAT alpha_r, FLOAT alpha_i) @@ -207,6 +209,8 @@ static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest,FLOAT a } +#endif + int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r,FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) { BLASLONG i; diff --git a/kernel/x86_64/cgemv_n_microk_haswell-4.c b/kernel/x86_64/cgemv_n_microk_haswell-4.c index 991c1dc59..40a8432c4 100644 --- a/kernel/x86_64/cgemv_n_microk_haswell-4.c +++ b/kernel/x86_64/cgemv_n_microk_haswell-4.c @@ -180,3 +180,363 @@ static void cgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) } +#define HAVE_KERNEL_4x2 1 +static void cgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) __attribute__ ((noinline)); + +static void cgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) +{ + + BLASLONG register i = 0; + BLASLONG register n1 = n & -8 ; + BLASLONG register n2 = n & 4 ; + + __asm__ __volatile__ + ( + "vzeroupper \n\t" + + "vbroadcastss (%2), %%ymm0 \n\t" // real part x0 + "vbroadcastss 4(%2), %%ymm1 \n\t" // imag part x0 + "vbroadcastss 8(%2), %%ymm2 \n\t" // real part x1 + "vbroadcastss 12(%2), %%ymm3 \n\t" // imag part x1 + + "cmpq $0 , %1 \n\t" + "je .L01END%= \n\t" + + ".align 16 \n\t" + ".L01LOOP%=: \n\t" + "prefetcht0 320(%4,%0,4) \n\t" + "vmovups (%4,%0,4), %%ymm8 \n\t" // 4 complex values form a0 + "vmovups 32(%4,%0,4), %%ymm9 \n\t" // 4 complex values form a0 + + "prefetcht0 320(%5,%0,4) \n\t" + "vmovups (%5,%0,4), %%ymm10 \n\t" // 4 complex values form a1 + "vmovups 32(%5,%0,4), %%ymm11 \n\t" // 4 complex values form a1 + + "vmulps %%ymm8 , %%ymm0, %%ymm12 \n\t" // a_r[0] * x_r , a_i[0] * x_r, a_r[1] * x_r, a_i[1] * x_r + "vmulps %%ymm8 , %%ymm1, 
%%ymm13 \n\t" // a_r[0] * x_i , a_i[0] * x_i, a_r[1] * x_i, a_i[1] * x_i + "vmulps %%ymm9 , %%ymm0, %%ymm14 \n\t" // a_r[2] * x_r , a_i[2] * x_r, a_r[3] * x_r, a_i[3] * x_r + "vmulps %%ymm9 , %%ymm1, %%ymm15 \n\t" // a_r[2] * x_i , a_i[2] * x_i, a_r[3] * x_i, a_i[3] * x_i + + "vfmadd231ps %%ymm10, %%ymm2, %%ymm12 \n\t" // a_r[0] * x_r , a_i[0] * x_r, a_r[1] * x_r, a_i[1] * x_r + "vfmadd231ps %%ymm10, %%ymm3, %%ymm13 \n\t" // a_r[0] * x_i , a_i[0] * x_i, a_r[1] * x_i, a_i[1] * x_i + "vfmadd231ps %%ymm11, %%ymm2, %%ymm14 \n\t" // a_r[2] * x_r , a_i[2] * x_r, a_r[3] * x_r, a_i[3] * x_r + "vfmadd231ps %%ymm11, %%ymm3, %%ymm15 \n\t" // a_r[2] * x_i , a_i[2] * x_i, a_r[3] * x_i, a_i[3] * x_i + + "prefetcht0 320(%3,%0,4) \n\t" + "vmovups (%3,%0,4), %%ymm10 \n\t" + "vmovups 32(%3,%0,4), %%ymm11 \n\t" + +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + "vpermilps $0xb1 , %%ymm13, %%ymm13 \n\t" + "vpermilps $0xb1 , %%ymm15, %%ymm15 \n\t" + "vaddsubps %%ymm13, %%ymm12, %%ymm8 \n\t" + "vaddsubps %%ymm15, %%ymm14, %%ymm9 \n\t" +#else + "vpermilps $0xb1 , %%ymm12, %%ymm12 \n\t" + "vpermilps $0xb1 , %%ymm14, %%ymm14 \n\t" + "vaddsubps %%ymm12, %%ymm13, %%ymm8 \n\t" + "vaddsubps %%ymm14, %%ymm15, %%ymm9 \n\t" + "vpermilps $0xb1 , %%ymm8 , %%ymm8 \n\t" + "vpermilps $0xb1 , %%ymm9 , %%ymm9 \n\t" +#endif + + "vaddps %%ymm8, %%ymm10, %%ymm12 \n\t" + "vaddps %%ymm9, %%ymm11, %%ymm13 \n\t" + + "vmovups %%ymm12, (%3,%0,4) \n\t" // 4 complex values to y + "vmovups %%ymm13, 32(%3,%0,4) \n\t" + + "addq $16, %0 \n\t" + "subq $8 , %1 \n\t" + "jnz .L01LOOP%= \n\t" + + ".L01END%=: \n\t" + + "cmpq $4, %6 \n\t" + "jne .L02END%= \n\t" + + "vmovups (%4,%0,4), %%ymm8 \n\t" // 4 complex values form a0 + "vmovups (%5,%0,4), %%ymm10 \n\t" // 4 complex values form a1 + + "vmulps %%ymm8 , %%ymm0, %%ymm12 \n\t" // a_r[0] * x_r , a_i[0] * x_r, a_r[1] * x_r, a_i[1] * x_r + "vmulps %%ymm8 , %%ymm1, %%ymm13 \n\t" // a_r[0] * x_i , a_i[0] * x_i, a_r[1] * x_i, a_i[1] * x_i + + 
"vfmadd231ps %%ymm10, %%ymm2, %%ymm12 \n\t" // a_r[0] * x_r , a_i[0] * x_r, a_r[1] * x_r, a_i[1] * x_r + "vfmadd231ps %%ymm10, %%ymm3, %%ymm13 \n\t" // a_r[0] * x_i , a_i[0] * x_i, a_r[1] * x_i, a_i[1] * x_i + + "vmovups (%3,%0,4), %%ymm10 \n\t" + +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + "vpermilps $0xb1 , %%ymm13, %%ymm13 \n\t" + "vaddsubps %%ymm13, %%ymm12, %%ymm8 \n\t" +#else + "vpermilps $0xb1 , %%ymm12, %%ymm12 \n\t" + "vaddsubps %%ymm12, %%ymm13, %%ymm8 \n\t" + "vpermilps $0xb1 , %%ymm8 , %%ymm8 \n\t" +#endif + + "vaddps %%ymm8, %%ymm10, %%ymm12 \n\t" + + "vmovups %%ymm12, (%3,%0,4) \n\t" // 4 complex values to y + + ".L02END%=: \n\t" + "vzeroupper \n\t" + + : + : + "r" (i), // 0 + "r" (n1), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (ap[0]), // 4 + "r" (ap[1]), // 5 + "r" (n2) // 6 + : "cc", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + +} + + +#define HAVE_KERNEL_4x1 1 +static void cgemv_kernel_4x1( BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) __attribute__ ((noinline)); + +static void cgemv_kernel_4x1( BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) +{ + + BLASLONG register i = 0; + BLASLONG register n1 = n & -8 ; + BLASLONG register n2 = n & 4 ; + + __asm__ __volatile__ + ( + "vzeroupper \n\t" + + "vbroadcastss (%2), %%ymm0 \n\t" // real part x0 + "vbroadcastss 4(%2), %%ymm1 \n\t" // imag part x0 + + "cmpq $0 , %1 \n\t" + "je .L01END%= \n\t" + + ".align 16 \n\t" + ".L01LOOP%=: \n\t" + "prefetcht0 320(%4,%0,4) \n\t" + "vmovups (%4,%0,4), %%ymm8 \n\t" // 4 complex values form a0 + "vmovups 32(%4,%0,4), %%ymm9 \n\t" // 4 complex values form a0 + + "vmulps %%ymm8 , %%ymm0, %%ymm12 \n\t" // a_r[0] * x_r , a_i[0] * x_r, a_r[1] * x_r, a_i[1] * x_r + "vmulps %%ymm8 , %%ymm1, %%ymm13 \n\t" // a_r[0] * x_i , a_i[0] * x_i, a_r[1] * x_i, a_i[1] * x_i + "vmulps %%ymm9 , %%ymm0, %%ymm14 \n\t" // a_r[2] * 
x_r , a_i[2] * x_r, a_r[3] * x_r, a_i[3] * x_r + "vmulps %%ymm9 , %%ymm1, %%ymm15 \n\t" // a_r[2] * x_i , a_i[2] * x_i, a_r[3] * x_i, a_i[3] * x_i + + "prefetcht0 320(%3,%0,4) \n\t" + "vmovups (%3,%0,4), %%ymm10 \n\t" + "vmovups 32(%3,%0,4), %%ymm11 \n\t" + +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + "vpermilps $0xb1 , %%ymm13, %%ymm13 \n\t" + "vpermilps $0xb1 , %%ymm15, %%ymm15 \n\t" + "vaddsubps %%ymm13, %%ymm12, %%ymm8 \n\t" + "vaddsubps %%ymm15, %%ymm14, %%ymm9 \n\t" +#else + "vpermilps $0xb1 , %%ymm12, %%ymm12 \n\t" + "vpermilps $0xb1 , %%ymm14, %%ymm14 \n\t" + "vaddsubps %%ymm12, %%ymm13, %%ymm8 \n\t" + "vaddsubps %%ymm14, %%ymm15, %%ymm9 \n\t" + "vpermilps $0xb1 , %%ymm8 , %%ymm8 \n\t" + "vpermilps $0xb1 , %%ymm9 , %%ymm9 \n\t" +#endif + + "vaddps %%ymm8, %%ymm10, %%ymm12 \n\t" + "vaddps %%ymm9, %%ymm11, %%ymm13 \n\t" + + "vmovups %%ymm12, (%3,%0,4) \n\t" // 4 complex values to y + "vmovups %%ymm13, 32(%3,%0,4) \n\t" + + "addq $16, %0 \n\t" + "subq $8 , %1 \n\t" + "jnz .L01LOOP%= \n\t" + + ".L01END%=: \n\t" + + "cmpq $4, %5 \n\t" + "jne .L02END%= \n\t" + + "vmovups (%4,%0,4), %%ymm8 \n\t" // 4 complex values form a0 + + "vmulps %%ymm8 , %%ymm0, %%ymm12 \n\t" // a_r[0] * x_r , a_i[0] * x_r, a_r[1] * x_r, a_i[1] * x_r + "vmulps %%ymm8 , %%ymm1, %%ymm13 \n\t" // a_r[0] * x_i , a_i[0] * x_i, a_r[1] * x_i, a_i[1] * x_i + + "vmovups (%3,%0,4), %%ymm10 \n\t" + +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + "vpermilps $0xb1 , %%ymm13, %%ymm13 \n\t" + "vaddsubps %%ymm13, %%ymm12, %%ymm8 \n\t" +#else + "vpermilps $0xb1 , %%ymm12, %%ymm12 \n\t" + "vaddsubps %%ymm12, %%ymm13, %%ymm8 \n\t" + "vpermilps $0xb1 , %%ymm8 , %%ymm8 \n\t" +#endif + + "vaddps %%ymm8, %%ymm10, %%ymm12 \n\t" + + "vmovups %%ymm12, (%3,%0,4) \n\t" // 4 complex values to y + + ".L02END%=: \n\t" + "vzeroupper \n\t" + + : + : + "r" (i), // 0 + "r" (n1), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (ap), // 4 + "r" (n2) // 5 + : 
"cc", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + +} + + +#define HAVE_KERNEL_ADDY 1 + +static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest,FLOAT alpha_r, FLOAT alpha_i) __attribute__ ((noinline)); + +static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest,FLOAT alpha_r, FLOAT alpha_i) +{ + BLASLONG i; + + if ( inc_dest != 2 ) + { + + FLOAT temp_r; + FLOAT temp_i; + for ( i=0; i Date: Thu, 11 Sep 2014 11:12:44 +0200 Subject: [PATCH 085/119] bufix in cgemv_n_microk_haswell-4.c --- kernel/x86_64/cgemv_n_microk_haswell-4.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/kernel/x86_64/cgemv_n_microk_haswell-4.c b/kernel/x86_64/cgemv_n_microk_haswell-4.c index 40a8432c4..24417ba36 100644 --- a/kernel/x86_64/cgemv_n_microk_haswell-4.c +++ b/kernel/x86_64/cgemv_n_microk_haswell-4.c @@ -351,14 +351,14 @@ static void cgemv_kernel_4x1( BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) "vpermilps $0xb1 , %%ymm9 , %%ymm9 \n\t" #endif + "addq $16, %0 \n\t" "vaddps %%ymm8, %%ymm10, %%ymm12 \n\t" "vaddps %%ymm9, %%ymm11, %%ymm13 \n\t" - "vmovups %%ymm12, (%3,%0,4) \n\t" // 4 complex values to y - "vmovups %%ymm13, 32(%3,%0,4) \n\t" - - "addq $16, %0 \n\t" "subq $8 , %1 \n\t" + "vmovups %%ymm12,-64(%3,%0,4) \n\t" // 4 complex values to y + "vmovups %%ymm13,-32(%3,%0,4) \n\t" + "jnz .L01LOOP%= \n\t" ".L01END%=: \n\t" @@ -481,14 +481,14 @@ static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest,FLOAT a "vpermilps $0xb1 , %%ymm9 , %%ymm9 \n\t" #endif + "addq $16, %0 \n\t" "vaddps %%ymm8, %%ymm10, %%ymm12 \n\t" "vaddps %%ymm9, %%ymm11, %%ymm13 \n\t" - "vmovups %%ymm12, (%3,%0,4) \n\t" // 4 complex values to y - "vmovups %%ymm13, 32(%3,%0,4) \n\t" - - "addq $16, %0 \n\t" "subq $8 , %1 \n\t" + "vmovups %%ymm12,-64(%3,%0,4) \n\t" // 4 complex values to y + "vmovups 
%%ymm13,-32(%3,%0,4) \n\t" + "jnz .L01LOOP%= \n\t" ".L01END%=: \n\t" @@ -503,7 +503,7 @@ static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest,FLOAT a "vmovups (%3,%0,4), %%ymm10 \n\t" -#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) +#if !defined(XCONJ) "vpermilps $0xb1 , %%ymm13, %%ymm13 \n\t" "vaddsubps %%ymm13, %%ymm12, %%ymm8 \n\t" #else From bced4594bbee999af84a5c99cdf8835fbc612a46 Mon Sep 17 00:00:00 2001 From: wernsaar Date: Thu, 11 Sep 2014 12:34:57 +0200 Subject: [PATCH 086/119] added optimized zgemv_n kernel --- kernel/x86_64/zgemv_n_4.c | 623 +++++++++++++++++++++++ kernel/x86_64/zgemv_n_microk_haswell-4.c | 297 +++++++++++ 2 files changed, 920 insertions(+) create mode 100644 kernel/x86_64/zgemv_n_4.c create mode 100644 kernel/x86_64/zgemv_n_microk_haswell-4.c diff --git a/kernel/x86_64/zgemv_n_4.c b/kernel/x86_64/zgemv_n_4.c new file mode 100644 index 000000000..43adb6c05 --- /dev/null +++ b/kernel/x86_64/zgemv_n_4.c @@ -0,0 +1,623 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include +#include +#include "common.h" + +#if defined(HASWELL) +#include "zgemv_n_microk_haswell-4.c" +#endif + + +#define NBMAX 1024 + +#ifndef HAVE_KERNEL_4x4 + +static void zgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) +{ + BLASLONG i; + FLOAT *a0,*a1,*a2,*a3; + a0 = ap[0]; + a1 = ap[1]; + a2 = ap[2]; + a3 = ap[3]; + + for ( i=0; i< 2*n; i+=2 ) + { +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + y[i] += a0[i]*x[0] - a0[i+1] * x[1]; + y[i+1] += a0[i]*x[1] + a0[i+1] * x[0]; + y[i] += a1[i]*x[2] - a1[i+1] * x[3]; + y[i+1] += a1[i]*x[3] + a1[i+1] * x[2]; + y[i] += a2[i]*x[4] - a2[i+1] * x[5]; + y[i+1] += a2[i]*x[5] + a2[i+1] * x[4]; + y[i] += a3[i]*x[6] - a3[i+1] * x[7]; + y[i+1] += a3[i]*x[7] + a3[i+1] * x[6]; +#else + y[i] += a0[i]*x[0] + a0[i+1] * x[1]; + y[i+1] += a0[i]*x[1] - a0[i+1] * x[0]; + y[i] += a1[i]*x[2] + a1[i+1] * x[3]; + y[i+1] += a1[i]*x[3] - a1[i+1] * x[2]; + y[i] += a2[i]*x[4] + a2[i+1] * x[5]; + y[i+1] += a2[i]*x[5] - a2[i+1] * x[4]; + y[i] += a3[i]*x[6] + a3[i+1] * x[7]; + y[i+1] += a3[i]*x[7] - a3[i+1] * x[6]; +#endif + } +} + +#endif + + + +#ifndef 
HAVE_KERNEL_4x2 + +static void zgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) +{ + BLASLONG i; + FLOAT *a0,*a1; + a0 = ap[0]; + a1 = ap[1]; + + for ( i=0; i< 2*n; i+=2 ) + { +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + y[i] += a0[i]*x[0] - a0[i+1] * x[1]; + y[i+1] += a0[i]*x[1] + a0[i+1] * x[0]; + y[i] += a1[i]*x[2] - a1[i+1] * x[3]; + y[i+1] += a1[i]*x[3] + a1[i+1] * x[2]; +#else + y[i] += a0[i]*x[0] + a0[i+1] * x[1]; + y[i+1] += a0[i]*x[1] - a0[i+1] * x[0]; + y[i] += a1[i]*x[2] + a1[i+1] * x[3]; + y[i+1] += a1[i]*x[3] - a1[i+1] * x[2]; +#endif + } +} + +#endif + + + + +#ifndef HAVE_KERNEL_4x1 + + +static void zgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) +{ + BLASLONG i; + FLOAT *a0; + a0 = ap; + + for ( i=0; i< 2*n; i+=2 ) + { +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + y[i] += a0[i]*x[0] - a0[i+1] * x[1]; + y[i+1] += a0[i]*x[1] + a0[i+1] * x[0]; +#else + y[i] += a0[i]*x[0] + a0[i+1] * x[1]; + y[i+1] += a0[i]*x[1] - a0[i+1] * x[0]; +#endif + + } +} + + +#endif + + +#ifndef HAVE_KERNEL_ADDY + +static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest,FLOAT alpha_r, FLOAT alpha_i) __attribute__ ((noinline)); + +static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest,FLOAT alpha_r, FLOAT alpha_i) +{ + BLASLONG i; + + if ( inc_dest != 2 ) + { + + FLOAT temp_r; + FLOAT temp_i; + for ( i=0; i Date: Thu, 11 Sep 2014 13:18:00 +0200 Subject: [PATCH 087/119] bugfix in zgemv_n_4.c --- kernel/x86_64/KERNEL.HASWELL | 4 ++-- kernel/x86_64/zgemv_n_4.c | 3 ++- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/kernel/x86_64/KERNEL.HASWELL b/kernel/x86_64/KERNEL.HASWELL index 8aab560c4..fc4a4c7f8 100644 --- a/kernel/x86_64/KERNEL.HASWELL +++ b/kernel/x86_64/KERNEL.HASWELL @@ -4,10 +4,10 @@ SGEMVTKERNEL = sgemv_t_4.c DGEMVNKERNEL = dgemv_n_4.c DGEMVTKERNEL = dgemv_t_4.c -ZGEMVNKERNEL = zgemv_n.c +ZGEMVNKERNEL = zgemv_n_4.c 
ZGEMVTKERNEL = zgemv_t.c -CGEMVNKERNEL = cgemv_n.c +CGEMVNKERNEL = cgemv_n_4.c CGEMVTKERNEL = cgemv_t.c SGEMMKERNEL = sgemm_kernel_16x4_haswell.S diff --git a/kernel/x86_64/zgemv_n_4.c b/kernel/x86_64/zgemv_n_4.c index 43adb6c05..c9b5b570e 100644 --- a/kernel/x86_64/zgemv_n_4.c +++ b/kernel/x86_64/zgemv_n_4.c @@ -29,6 +29,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include "common.h" + #if defined(HASWELL) #include "zgemv_n_microk_haswell-4.c" #endif @@ -270,7 +271,7 @@ printf("%s %d %d %.16f %.16f %d %d %d\n","zgemv_n",m,n,alpha_r,alpha_i,lda,inc_x ap[3] = ap[2] + lda; x_ptr = x; //zero_y(NB,ybuffer); - memset(ybuffer,0,NB*8); + memset(ybuffer,0,NB*16); if ( inc_x == 2 ) { From bb8d698335080383e50947f44cd5dac136c130a2 Mon Sep 17 00:00:00 2001 From: wernsaar Date: Thu, 11 Sep 2014 13:44:55 +0200 Subject: [PATCH 088/119] optimized zgemv_n_microk_haswell-4.c for small size --- kernel/x86_64/zgemv_n_microk_haswell-4.c | 103 +++++++++++++++++++++++ 1 file changed, 103 insertions(+) diff --git a/kernel/x86_64/zgemv_n_microk_haswell-4.c b/kernel/x86_64/zgemv_n_microk_haswell-4.c index fd6fb5027..61358508a 100644 --- a/kernel/x86_64/zgemv_n_microk_haswell-4.c +++ b/kernel/x86_64/zgemv_n_microk_haswell-4.c @@ -295,3 +295,106 @@ static void zgemv_kernel_4x1( BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) + +#define HAVE_KERNEL_ADDY 1 + +static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest,FLOAT alpha_r, FLOAT alpha_i) __attribute__ ((noinline)); + +static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest,FLOAT alpha_r, FLOAT alpha_i) +{ + BLASLONG i; + + if ( inc_dest != 2 ) + { + + FLOAT temp_r; + FLOAT temp_i; + for ( i=0; i Date: Fri, 12 Sep 2014 12:35:20 +0200 Subject: [PATCH 089/119] added optimized zgemv_t routine --- kernel/x86_64/zgemv_t_4.c | 492 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 492 insertions(+) create mode 100644 kernel/x86_64/zgemv_t_4.c diff --git 
a/kernel/x86_64/zgemv_t_4.c b/kernel/x86_64/zgemv_t_4.c new file mode 100644 index 000000000..3ffcf64b4 --- /dev/null +++ b/kernel/x86_64/zgemv_t_4.c @@ -0,0 +1,492 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + + +#include "common.h" + +/* +#if defined(BULLDOZER) || defined(PILEDRIVER) +#include "zgemv_t_microk_bulldozer-2.c" +#elif defined(HASWELL) +#include "zgemv_t_microk_haswell-2.c" +#endif +*/ + +#define NBMAX 1028 + +#ifndef HAVE_KERNEL_4x4 + +static void zgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) +{ + BLASLONG i; + FLOAT *a0,*a1,*a2,*a3; + a0 = ap[0]; + a1 = ap[1]; + a2 = ap[2]; + a3 = ap[3]; + FLOAT alpha_r = alpha[0]; + FLOAT alpha_i = alpha[1]; + FLOAT temp_r0 = 0.0; + FLOAT temp_r1 = 0.0; + FLOAT temp_r2 = 0.0; + FLOAT temp_r3 = 0.0; + FLOAT temp_i0 = 0.0; + FLOAT temp_i1 = 0.0; + FLOAT temp_i2 = 0.0; + FLOAT temp_i3 = 0.0; + + + for ( i=0; i< 2*n; i+=2 ) + { +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + temp_r0 += a0[i]*x[i] - a0[i+1]*x[i+1]; + temp_i0 += a0[i]*x[i+1] + a0[i+1]*x[i]; + temp_r1 += a1[i]*x[i] - a1[i+1]*x[i+1]; + temp_i1 += a1[i]*x[i+1] + a1[i+1]*x[i]; + temp_r2 += a2[i]*x[i] - a2[i+1]*x[i+1]; + temp_i2 += a2[i]*x[i+1] + a2[i+1]*x[i]; + temp_r3 += a3[i]*x[i] - a3[i+1]*x[i+1]; + temp_i3 += a3[i]*x[i+1] + a3[i+1]*x[i]; +#else + temp_r0 += a0[i]*x[i] + a0[i+1]*x[i+1]; + temp_i0 += a0[i]*x[i+1] - a0[i+1]*x[i]; + temp_r1 += a1[i]*x[i] + a1[i+1]*x[i+1]; + temp_i1 += a1[i]*x[i+1] - a1[i+1]*x[i]; + temp_r2 += a2[i]*x[i] + a2[i+1]*x[i+1]; + temp_i2 += a2[i]*x[i+1] - a2[i+1]*x[i]; + temp_r3 += a3[i]*x[i] + a3[i+1]*x[i+1]; + temp_i3 += a3[i]*x[i+1] - a3[i+1]*x[i]; +#endif + } + +#if !defined(XCONJ) + + y[0] += alpha_r * temp_r0 - alpha_i * temp_i0; + y[1] += alpha_r * temp_i0 + alpha_i * temp_r0; + y[2] += alpha_r * temp_r1 - alpha_i * temp_i1; + y[3] += alpha_r * temp_i1 + alpha_i * temp_r1; + y[4] += alpha_r * temp_r2 - alpha_i * temp_i2; + y[5] += alpha_r * temp_i2 + alpha_i * temp_r2; + y[6] += alpha_r * temp_r3 - alpha_i * temp_i3; + y[7] += alpha_r * temp_i3 + alpha_i * temp_r3; + +#else + + y[0] += 
alpha_r * temp_r0 + alpha_i * temp_i0; + y[1] -= alpha_r * temp_i0 - alpha_i * temp_r0; + y[2] += alpha_r * temp_r1 + alpha_i * temp_i1; + y[3] -= alpha_r * temp_i1 - alpha_i * temp_r1; + y[4] += alpha_r * temp_r2 + alpha_i * temp_i2; + y[5] -= alpha_r * temp_i2 - alpha_i * temp_r2; + y[6] += alpha_r * temp_r3 + alpha_i * temp_i3; + y[7] -= alpha_r * temp_i3 - alpha_i * temp_r3; + +#endif +} + +#endif + +#ifndef HAVE_KERNEL_4x1 + +static void zgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *alpha) +{ + BLASLONG i; + FLOAT *a0; + a0 = ap; + FLOAT alpha_r = alpha[0]; + FLOAT alpha_i = alpha[1]; + FLOAT temp_r0 = 0.0; + FLOAT temp_i0 = 0.0; + + for ( i=0; i< 2*n; i+=2 ) + { +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + temp_r0 += a0[i]*x[i] - a0[i+1]*x[i+1]; + temp_i0 += a0[i]*x[i+1] + a0[i+1]*x[i]; +#else + temp_r0 += a0[i]*x[i] + a0[i+1]*x[i+1]; + temp_i0 += a0[i]*x[i+1] - a0[i+1]*x[i]; +#endif + } + +#if !defined(XCONJ) + + y[0] += alpha_r * temp_r0 - alpha_i * temp_i0; + y[1] += alpha_r * temp_i0 + alpha_i * temp_r0; + +#else + + y[0] += alpha_r * temp_r0 + alpha_i * temp_i0; + y[1] -= alpha_r * temp_i0 - alpha_i * temp_r0; + +#endif + + +} + +#endif + + +static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src) +{ + BLASLONG i; + for ( i=0; i Date: Fri, 12 Sep 2014 13:38:01 +0200 Subject: [PATCH 090/119] added optimized cgemv_t kernel --- kernel/x86_64/cgemv_t_4.c | 583 ++++++++++++++++++++++++++++++++++++++ kernel/x86_64/zgemv_t_4.c | 171 ++++++++--- 2 files changed, 714 insertions(+), 40 deletions(-) create mode 100644 kernel/x86_64/cgemv_t_4.c diff --git a/kernel/x86_64/cgemv_t_4.c b/kernel/x86_64/cgemv_t_4.c new file mode 100644 index 000000000..f8a471ec3 --- /dev/null +++ b/kernel/x86_64/cgemv_t_4.c @@ -0,0 +1,583 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. 
+Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + + +#include "common.h" + +/* +#if defined(BULLDOZER) || defined(PILEDRIVER) +#include "zgemv_t_microk_bulldozer-2.c" +#elif defined(HASWELL) +#include "zgemv_t_microk_haswell-2.c" +#endif +*/ + +#define NBMAX 1028 + +#ifndef HAVE_KERNEL_4x4 + +static void zgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) +{ + BLASLONG i; + FLOAT *a0,*a1,*a2,*a3; + a0 = ap[0]; + a1 = ap[1]; + a2 = ap[2]; + a3 = ap[3]; + FLOAT alpha_r = alpha[0]; + FLOAT alpha_i = alpha[1]; + FLOAT temp_r0 = 0.0; + FLOAT temp_r1 = 0.0; + FLOAT temp_r2 = 0.0; + FLOAT temp_r3 = 0.0; + FLOAT temp_i0 = 0.0; + FLOAT temp_i1 = 0.0; + FLOAT temp_i2 = 0.0; + FLOAT temp_i3 = 0.0; + + + for ( i=0; i< 2*n; i+=2 ) + { +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + temp_r0 += a0[i]*x[i] - a0[i+1]*x[i+1]; + temp_i0 += a0[i]*x[i+1] + a0[i+1]*x[i]; + temp_r1 += a1[i]*x[i] - a1[i+1]*x[i+1]; + temp_i1 += a1[i]*x[i+1] + a1[i+1]*x[i]; + temp_r2 += a2[i]*x[i] - a2[i+1]*x[i+1]; + temp_i2 += a2[i]*x[i+1] + a2[i+1]*x[i]; + temp_r3 += a3[i]*x[i] - a3[i+1]*x[i+1]; + temp_i3 += a3[i]*x[i+1] + a3[i+1]*x[i]; +#else + temp_r0 += a0[i]*x[i] + a0[i+1]*x[i+1]; + temp_i0 += a0[i]*x[i+1] - a0[i+1]*x[i]; + temp_r1 += a1[i]*x[i] + a1[i+1]*x[i+1]; + temp_i1 += a1[i]*x[i+1] - a1[i+1]*x[i]; + temp_r2 += a2[i]*x[i] + a2[i+1]*x[i+1]; + temp_i2 += a2[i]*x[i+1] - a2[i+1]*x[i]; + temp_r3 += a3[i]*x[i] + a3[i+1]*x[i+1]; + temp_i3 += a3[i]*x[i+1] - a3[i+1]*x[i]; +#endif + } + +#if !defined(XCONJ) + + y[0] += alpha_r * temp_r0 - alpha_i * temp_i0; + y[1] += alpha_r * temp_i0 + alpha_i * temp_r0; + y[2] += alpha_r * temp_r1 - alpha_i * temp_i1; + y[3] += alpha_r * temp_i1 + alpha_i * temp_r1; + y[4] += alpha_r * temp_r2 - alpha_i * temp_i2; + y[5] += alpha_r * temp_i2 + alpha_i * temp_r2; + y[6] += alpha_r * temp_r3 - alpha_i * temp_i3; + y[7] += alpha_r * temp_i3 + alpha_i * temp_r3; + +#else + + y[0] += 
alpha_r * temp_r0 + alpha_i * temp_i0; + y[1] -= alpha_r * temp_i0 - alpha_i * temp_r0; + y[2] += alpha_r * temp_r1 + alpha_i * temp_i1; + y[3] -= alpha_r * temp_i1 - alpha_i * temp_r1; + y[4] += alpha_r * temp_r2 + alpha_i * temp_i2; + y[5] -= alpha_r * temp_i2 - alpha_i * temp_r2; + y[6] += alpha_r * temp_r3 + alpha_i * temp_i3; + y[7] -= alpha_r * temp_i3 - alpha_i * temp_r3; + +#endif +} + +#endif + +#ifndef HAVE_KERNEL_4x2 + +static void zgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) +{ + BLASLONG i; + FLOAT *a0,*a1; + a0 = ap[0]; + a1 = ap[1]; + FLOAT alpha_r = alpha[0]; + FLOAT alpha_i = alpha[1]; + FLOAT temp_r0 = 0.0; + FLOAT temp_r1 = 0.0; + FLOAT temp_i0 = 0.0; + FLOAT temp_i1 = 0.0; + + + for ( i=0; i< 2*n; i+=2 ) + { +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + temp_r0 += a0[i]*x[i] - a0[i+1]*x[i+1]; + temp_i0 += a0[i]*x[i+1] + a0[i+1]*x[i]; + temp_r1 += a1[i]*x[i] - a1[i+1]*x[i+1]; + temp_i1 += a1[i]*x[i+1] + a1[i+1]*x[i]; +#else + temp_r0 += a0[i]*x[i] + a0[i+1]*x[i+1]; + temp_i0 += a0[i]*x[i+1] - a0[i+1]*x[i]; + temp_r1 += a1[i]*x[i] + a1[i+1]*x[i+1]; + temp_i1 += a1[i]*x[i+1] - a1[i+1]*x[i]; +#endif + } + +#if !defined(XCONJ) + + y[0] += alpha_r * temp_r0 - alpha_i * temp_i0; + y[1] += alpha_r * temp_i0 + alpha_i * temp_r0; + y[2] += alpha_r * temp_r1 - alpha_i * temp_i1; + y[3] += alpha_r * temp_i1 + alpha_i * temp_r1; + +#else + + y[0] += alpha_r * temp_r0 + alpha_i * temp_i0; + y[1] -= alpha_r * temp_i0 - alpha_i * temp_r0; + y[2] += alpha_r * temp_r1 + alpha_i * temp_i1; + y[3] -= alpha_r * temp_i1 - alpha_i * temp_r1; + +#endif +} + +#endif + + +#ifndef HAVE_KERNEL_4x1 + +static void zgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *alpha) +{ + BLASLONG i; + FLOAT *a0; + a0 = ap; + FLOAT alpha_r = alpha[0]; + FLOAT alpha_i = alpha[1]; + FLOAT temp_r0 = 0.0; + FLOAT temp_i0 = 0.0; + + for ( i=0; i< 2*n; i+=2 ) + { +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( 
defined(CONJ) && defined(XCONJ) ) + temp_r0 += a0[i]*x[i] - a0[i+1]*x[i+1]; + temp_i0 += a0[i]*x[i+1] + a0[i+1]*x[i]; +#else + temp_r0 += a0[i]*x[i] + a0[i+1]*x[i+1]; + temp_i0 += a0[i]*x[i+1] - a0[i+1]*x[i]; +#endif + } + +#if !defined(XCONJ) + + y[0] += alpha_r * temp_r0 - alpha_i * temp_i0; + y[1] += alpha_r * temp_i0 + alpha_i * temp_r0; + +#else + + y[0] += alpha_r * temp_r0 + alpha_i * temp_i0; + y[1] -= alpha_r * temp_i0 - alpha_i * temp_r0; + +#endif + + +} + +#endif + + +static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src) +{ + BLASLONG i; + for ( i=0; i> 2 ; + n2 = n & 3 ; + + m3 = m & 3 ; + m1 = m - m3; + m2 = (m % (NBMAX-1)) - m3 ; + + alpha[0] = alpha_r; + alpha[1] = alpha_i; + + BLASLONG NB = NBMAX; + + while ( NB == NBMAX ) + { + + m1 -= NB; + if ( m1 < 0) + { + if ( m2 == 0 ) break; + NB = m2; + } + + y_ptr = y; + a_ptr = a; + x_ptr = x; + ap[0] = a_ptr; + ap[1] = a_ptr + lda; + ap[2] = ap[1] + lda; + ap[3] = ap[2] + lda; + if ( inc_x != 2 ) + copy_x(NB,x_ptr,xbuffer,inc_x); + else + xbuffer = x_ptr; + + if ( inc_y == 2 ) + { + + for( i = 0; i < n1 ; i++) + { + zgemv_kernel_4x4(NB,ap,xbuffer,y_ptr,alpha); + ap[0] += lda4; + ap[1] += lda4; + ap[2] += lda4; + ap[3] += lda4; + a_ptr += lda4; + y_ptr += 8; + + } + + if ( n2 & 2 ) + { + zgemv_kernel_4x2(NB,ap,xbuffer,y_ptr,alpha); + a_ptr += lda * 2; + y_ptr += 4; + + } + + if ( n2 & 1 ) + { + zgemv_kernel_4x1(NB,a_ptr,xbuffer,y_ptr,alpha); + a_ptr += lda; + y_ptr += 2; + + } + + } + else + { + + for( i = 0; i < n1 ; i++) + { + memset(ybuffer,0,64); + zgemv_kernel_4x4(NB,ap,xbuffer,ybuffer,alpha); + ap[0] += lda4; + ap[1] += lda4; + ap[2] += lda4; + ap[3] += lda4; + a_ptr += lda4; + + y_ptr[0] += ybuffer[0]; + y_ptr[1] += ybuffer[1]; + y_ptr += inc_y; + y_ptr[0] += ybuffer[2]; + y_ptr[1] += ybuffer[3]; + y_ptr += inc_y; + y_ptr[0] += ybuffer[4]; + y_ptr[1] += ybuffer[5]; + y_ptr += inc_y; + y_ptr[0] += ybuffer[6]; + y_ptr[1] += ybuffer[7]; + y_ptr += inc_y; + + } + + for( i = 0; i < 
n2 ; i++) + { + memset(ybuffer,0,64); + zgemv_kernel_4x1(NB,a_ptr,xbuffer,ybuffer,alpha); + a_ptr += lda; + y_ptr[0] += ybuffer[0]; + y_ptr[1] += ybuffer[1]; + y_ptr += inc_y; + + } + + } + a += 2 * NB; + x += NB * inc_x; + } + + + + if ( m3 == 0 ) return(0); + + x_ptr = x; + j=0; + a_ptr = a; + y_ptr = y; + + if ( m3 == 3 ) + { + + FLOAT temp_r ; + FLOAT temp_i ; + FLOAT x0 = x_ptr[0]; + FLOAT x1 = x_ptr[1]; + x_ptr += inc_x; + FLOAT x2 = x_ptr[0]; + FLOAT x3 = x_ptr[1]; + x_ptr += inc_x; + FLOAT x4 = x_ptr[0]; + FLOAT x5 = x_ptr[1]; + while ( j < n) + { +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; + temp_r += a_ptr[2] * x2 - a_ptr[3] * x3; + temp_i += a_ptr[2] * x3 + a_ptr[3] * x2; + temp_r += a_ptr[4] * x4 - a_ptr[5] * x5; + temp_i += a_ptr[4] * x5 + a_ptr[5] * x4; +#else + + temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; + temp_r += a_ptr[2] * x2 + a_ptr[3] * x3; + temp_i += a_ptr[2] * x3 - a_ptr[3] * x2; + temp_r += a_ptr[4] * x4 + a_ptr[5] * x5; + temp_i += a_ptr[4] * x5 - a_ptr[5] * x4; +#endif + +#if !defined(XCONJ) + y_ptr[0] += alpha_r * temp_r - alpha_i * temp_i; + y_ptr[1] += alpha_r * temp_i + alpha_i * temp_r; +#else + y_ptr[0] += alpha_r * temp_r + alpha_i * temp_i; + y_ptr[1] -= alpha_r * temp_i - alpha_i * temp_r; +#endif + + a_ptr += lda; + y_ptr += inc_y; + j++; + } + return(0); + } + + + if ( m3 == 2 ) + { + + FLOAT temp_r ; + FLOAT temp_i ; + FLOAT temp_r1 ; + FLOAT temp_i1 ; + FLOAT x0 = x_ptr[0]; + FLOAT x1 = x_ptr[1]; + x_ptr += inc_x; + FLOAT x2 = x_ptr[0]; + FLOAT x3 = x_ptr[1]; + FLOAT ar = alpha[0]; + FLOAT ai = alpha[1]; + + while ( j < ( n & -2 )) + { +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; + temp_r += a_ptr[2] * x2 - a_ptr[3] * x3; + temp_i += a_ptr[2] * x3 
+ a_ptr[3] * x2; + a_ptr += lda; + temp_r1 = a_ptr[0] * x0 - a_ptr[1] * x1; + temp_i1 = a_ptr[0] * x1 + a_ptr[1] * x0; + temp_r1 += a_ptr[2] * x2 - a_ptr[3] * x3; + temp_i1 += a_ptr[2] * x3 + a_ptr[3] * x2; +#else + + temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; + temp_r += a_ptr[2] * x2 + a_ptr[3] * x3; + temp_i += a_ptr[2] * x3 - a_ptr[3] * x2; + a_ptr += lda; + temp_r1 = a_ptr[0] * x0 + a_ptr[1] * x1; + temp_i1 = a_ptr[0] * x1 - a_ptr[1] * x0; + temp_r1 += a_ptr[2] * x2 + a_ptr[3] * x3; + temp_i1 += a_ptr[2] * x3 - a_ptr[3] * x2; +#endif + +#if !defined(XCONJ) + y_ptr[0] += ar * temp_r - ai * temp_i; + y_ptr[1] += ar * temp_i + ai * temp_r; + y_ptr += inc_y; + y_ptr[0] += ar * temp_r1 - ai * temp_i1; + y_ptr[1] += ar * temp_i1 + ai * temp_r1; +#else + y_ptr[0] += ar * temp_r + ai * temp_i; + y_ptr[1] -= ar * temp_i - ai * temp_r; + y_ptr += inc_y; + y_ptr[0] += ar * temp_r1 + ai * temp_i1; + y_ptr[1] -= ar * temp_i1 - ai * temp_r1; +#endif + + a_ptr += lda; + y_ptr += inc_y; + j+=2; + } + + + while ( j < n) + { +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; + temp_r += a_ptr[2] * x2 - a_ptr[3] * x3; + temp_i += a_ptr[2] * x3 + a_ptr[3] * x2; +#else + + temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; + temp_r += a_ptr[2] * x2 + a_ptr[3] * x3; + temp_i += a_ptr[2] * x3 - a_ptr[3] * x2; +#endif + +#if !defined(XCONJ) + y_ptr[0] += ar * temp_r - ai * temp_i; + y_ptr[1] += ar * temp_i + ai * temp_r; +#else + y_ptr[0] += ar * temp_r + ai * temp_i; + y_ptr[1] -= ar * temp_i - ai * temp_r; +#endif + + a_ptr += lda; + y_ptr += inc_y; + j++; + } + + return(0); + } + + + if ( m3 == 1 ) + { + + FLOAT temp_r ; + FLOAT temp_i ; + FLOAT temp_r1 ; + FLOAT temp_i1 ; + FLOAT x0 = x_ptr[0]; + FLOAT x1 = x_ptr[1]; + FLOAT ar = alpha[0]; + FLOAT ai = alpha[1]; + + while ( j < ( n & -2 )) + { +#if 
( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; + a_ptr += lda; + temp_r1 = a_ptr[0] * x0 - a_ptr[1] * x1; + temp_i1 = a_ptr[0] * x1 + a_ptr[1] * x0; +#else + + temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; + a_ptr += lda; + temp_r1 = a_ptr[0] * x0 + a_ptr[1] * x1; + temp_i1 = a_ptr[0] * x1 - a_ptr[1] * x0; +#endif + +#if !defined(XCONJ) + y_ptr[0] += ar * temp_r - ai * temp_i; + y_ptr[1] += ar * temp_i + ai * temp_r; + y_ptr += inc_y; + y_ptr[0] += ar * temp_r1 - ai * temp_i1; + y_ptr[1] += ar * temp_i1 + ai * temp_r1; +#else + y_ptr[0] += ar * temp_r + ai * temp_i; + y_ptr[1] -= ar * temp_i - ai * temp_r; + y_ptr += inc_y; + y_ptr[0] += ar * temp_r1 + ai * temp_i1; + y_ptr[1] -= ar * temp_i1 - ai * temp_r1; +#endif + + a_ptr += lda; + y_ptr += inc_y; + j+=2; + } + + while ( j < n) + { +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; +#else + + temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; +#endif + +#if !defined(XCONJ) + y_ptr[0] += ar * temp_r - ai * temp_i; + y_ptr[1] += ar * temp_i + ai * temp_r; +#else + y_ptr[0] += ar * temp_r + ai * temp_i; + y_ptr[1] -= ar * temp_i - ai * temp_r; +#endif + + a_ptr += lda; + y_ptr += inc_y; + j++; + } + return(0); + } + + return(0); + + +} + + diff --git a/kernel/x86_64/zgemv_t_4.c b/kernel/x86_64/zgemv_t_4.c index 3ffcf64b4..f8a471ec3 100644 --- a/kernel/x86_64/zgemv_t_4.c +++ b/kernel/x86_64/zgemv_t_4.c @@ -110,6 +110,57 @@ static void zgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT * #endif +#ifndef HAVE_KERNEL_4x2 + +static void zgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) +{ + BLASLONG i; + FLOAT *a0,*a1; + a0 = ap[0]; + a1 = ap[1]; + FLOAT alpha_r = alpha[0]; + FLOAT 
alpha_i = alpha[1]; + FLOAT temp_r0 = 0.0; + FLOAT temp_r1 = 0.0; + FLOAT temp_i0 = 0.0; + FLOAT temp_i1 = 0.0; + + + for ( i=0; i< 2*n; i+=2 ) + { +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + temp_r0 += a0[i]*x[i] - a0[i+1]*x[i+1]; + temp_i0 += a0[i]*x[i+1] + a0[i+1]*x[i]; + temp_r1 += a1[i]*x[i] - a1[i+1]*x[i+1]; + temp_i1 += a1[i]*x[i+1] + a1[i+1]*x[i]; +#else + temp_r0 += a0[i]*x[i] + a0[i+1]*x[i+1]; + temp_i0 += a0[i]*x[i+1] - a0[i+1]*x[i]; + temp_r1 += a1[i]*x[i] + a1[i+1]*x[i+1]; + temp_i1 += a1[i]*x[i+1] - a1[i+1]*x[i]; +#endif + } + +#if !defined(XCONJ) + + y[0] += alpha_r * temp_r0 - alpha_i * temp_i0; + y[1] += alpha_r * temp_i0 + alpha_i * temp_r0; + y[2] += alpha_r * temp_r1 - alpha_i * temp_i1; + y[3] += alpha_r * temp_i1 + alpha_i * temp_r1; + +#else + + y[0] += alpha_r * temp_r0 + alpha_i * temp_i0; + y[1] -= alpha_r * temp_i0 - alpha_i * temp_r0; + y[2] += alpha_r * temp_r1 + alpha_i * temp_i1; + y[3] -= alpha_r * temp_i1 - alpha_i * temp_r1; + +#endif +} + +#endif + + #ifndef HAVE_KERNEL_4x1 static void zgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *alpha) @@ -184,19 +235,19 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, if ( m < 1 ) return(0); if ( n < 1 ) return(0); - inc_x *= 2; - inc_y *= 2; - lda *= 2; - lda4 = 4 * lda; + inc_x <<= 1; + inc_y <<= 1; + lda <<= 1; + lda4 = lda << 2; xbuffer = buffer; - n1 = n / 4 ; - n2 = n % 4 ; + n1 = n >> 2 ; + n2 = n & 3 ; - m3 = m % 4 ; - m1 = m - ( m % 4 ); - m2 = (m % NBMAX) - (m % 4) ; + m3 = m & 3 ; + m1 = m - m3; + m2 = (m % (NBMAX-1)) - m3 ; alpha[0] = alpha_r; alpha[1] = alpha_i; @@ -220,43 +271,84 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, ap[1] = a_ptr + lda; ap[2] = ap[1] + lda; ap[3] = ap[2] + lda; - copy_x(NB,x_ptr,xbuffer,inc_x); - for( i = 0; i < n1 ; i++) + if ( inc_x != 2 ) + copy_x(NB,x_ptr,xbuffer,inc_x); + else + xbuffer = x_ptr; + + if ( inc_y == 2 ) { 
- memset(ybuffer,0,64); - zgemv_kernel_4x4(NB,ap,xbuffer,ybuffer,alpha); - ap[0] += lda4; - ap[1] += lda4; - ap[2] += lda4; - ap[3] += lda4; - a_ptr += lda4; - y_ptr[0] += ybuffer[0]; - y_ptr[1] += ybuffer[1]; - y_ptr += inc_y; - y_ptr[0] += ybuffer[2]; - y_ptr[1] += ybuffer[3]; - y_ptr += inc_y; - y_ptr[0] += ybuffer[4]; - y_ptr[1] += ybuffer[5]; - y_ptr += inc_y; - y_ptr[0] += ybuffer[6]; - y_ptr[1] += ybuffer[7]; - y_ptr += inc_y; + for( i = 0; i < n1 ; i++) + { + zgemv_kernel_4x4(NB,ap,xbuffer,y_ptr,alpha); + ap[0] += lda4; + ap[1] += lda4; + ap[2] += lda4; + ap[3] += lda4; + a_ptr += lda4; + y_ptr += 8; + + } + + if ( n2 & 2 ) + { + zgemv_kernel_4x2(NB,ap,xbuffer,y_ptr,alpha); + a_ptr += lda * 2; + y_ptr += 4; + + } + + if ( n2 & 1 ) + { + zgemv_kernel_4x1(NB,a_ptr,xbuffer,y_ptr,alpha); + a_ptr += lda; + y_ptr += 2; + + } } - - for( i = 0; i < n2 ; i++) + else { - memset(ybuffer,0,64); - zgemv_kernel_4x1(NB,a_ptr,xbuffer,ybuffer,alpha); - a_ptr += lda; - y_ptr[0] += ybuffer[0]; - y_ptr[1] += ybuffer[1]; - y_ptr += inc_y; + + for( i = 0; i < n1 ; i++) + { + memset(ybuffer,0,64); + zgemv_kernel_4x4(NB,ap,xbuffer,ybuffer,alpha); + ap[0] += lda4; + ap[1] += lda4; + ap[2] += lda4; + ap[3] += lda4; + a_ptr += lda4; + + y_ptr[0] += ybuffer[0]; + y_ptr[1] += ybuffer[1]; + y_ptr += inc_y; + y_ptr[0] += ybuffer[2]; + y_ptr[1] += ybuffer[3]; + y_ptr += inc_y; + y_ptr[0] += ybuffer[4]; + y_ptr[1] += ybuffer[5]; + y_ptr += inc_y; + y_ptr[0] += ybuffer[6]; + y_ptr[1] += ybuffer[7]; + y_ptr += inc_y; + + } + + for( i = 0; i < n2 ; i++) + { + memset(ybuffer,0,64); + zgemv_kernel_4x1(NB,a_ptr,xbuffer,ybuffer,alpha); + a_ptr += lda; + y_ptr[0] += ybuffer[0]; + y_ptr[1] += ybuffer[1]; + y_ptr += inc_y; + + } } - a += 2* NB; + a += 2 * NB; x += NB * inc_x; } @@ -265,7 +357,6 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, if ( m3 == 0 ) return(0); x_ptr = x; - copy_x(m3,x_ptr,xbuffer,inc_x); j=0; a_ptr = a; y_ptr = y; From 
44c11165d5146552fe091066b2db2d86d238a829 Mon Sep 17 00:00:00 2001 From: wernsaar Date: Fri, 12 Sep 2014 14:12:24 +0200 Subject: [PATCH 091/119] bugfix in cgemv_t_4.c --- kernel/x86_64/cgemv_t_4.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/kernel/x86_64/cgemv_t_4.c b/kernel/x86_64/cgemv_t_4.c index f8a471ec3..208279842 100644 --- a/kernel/x86_64/cgemv_t_4.c +++ b/kernel/x86_64/cgemv_t_4.c @@ -36,7 +36,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif */ -#define NBMAX 1028 +#define NBMAX 2048 #ifndef HAVE_KERNEL_4x4 @@ -313,7 +313,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, for( i = 0; i < n1 ; i++) { - memset(ybuffer,0,64); + memset(ybuffer,0,32); zgemv_kernel_4x4(NB,ap,xbuffer,ybuffer,alpha); ap[0] += lda4; ap[1] += lda4; @@ -338,7 +338,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, for( i = 0; i < n2 ; i++) { - memset(ybuffer,0,64); + memset(ybuffer,0,32); zgemv_kernel_4x1(NB,a_ptr,xbuffer,ybuffer,alpha); a_ptr += lda; y_ptr[0] += ybuffer[0]; From a0fb68ab42b4bc9259bba31e366dc2fc3de6d310 Mon Sep 17 00:00:00 2001 From: wernsaar Date: Fri, 12 Sep 2014 17:04:22 +0200 Subject: [PATCH 092/119] added optimized zgemv_t kernel for bulldozer --- kernel/x86_64/cgemv_t_4.c | 2 +- kernel/x86_64/zgemv_t_4.c | 10 +- kernel/x86_64/zgemv_t_microk_bulldozer-4.c | 223 +++++++++++++++++++++ 3 files changed, 229 insertions(+), 6 deletions(-) create mode 100644 kernel/x86_64/zgemv_t_microk_bulldozer-4.c diff --git a/kernel/x86_64/cgemv_t_4.c b/kernel/x86_64/cgemv_t_4.c index 208279842..513c585ec 100644 --- a/kernel/x86_64/cgemv_t_4.c +++ b/kernel/x86_64/cgemv_t_4.c @@ -247,7 +247,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, m3 = m & 3 ; m1 = m - m3; - m2 = (m % (NBMAX-1)) - m3 ; + m2 = (m & (NBMAX-1)) - m3 ; alpha[0] = alpha_r; alpha[1] = alpha_i; diff --git a/kernel/x86_64/zgemv_t_4.c 
b/kernel/x86_64/zgemv_t_4.c index f8a471ec3..a4c5fe76b 100644 --- a/kernel/x86_64/zgemv_t_4.c +++ b/kernel/x86_64/zgemv_t_4.c @@ -28,15 +28,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -/* + #if defined(BULLDOZER) || defined(PILEDRIVER) -#include "zgemv_t_microk_bulldozer-2.c" +#include "zgemv_t_microk_bulldozer-4.c" #elif defined(HASWELL) #include "zgemv_t_microk_haswell-2.c" #endif -*/ -#define NBMAX 1028 + +#define NBMAX 1024 #ifndef HAVE_KERNEL_4x4 @@ -247,7 +247,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, m3 = m & 3 ; m1 = m - m3; - m2 = (m % (NBMAX-1)) - m3 ; + m2 = (m & (NBMAX-1)) - m3 ; alpha[0] = alpha_r; alpha[1] = alpha_i; diff --git a/kernel/x86_64/zgemv_t_microk_bulldozer-4.c b/kernel/x86_64/zgemv_t_microk_bulldozer-4.c new file mode 100644 index 000000000..7d635317d --- /dev/null +++ b/kernel/x86_64/zgemv_t_microk_bulldozer-4.c @@ -0,0 +1,223 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary froms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary from must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define HAVE_KERNEL_4x4 1 +static void zgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) __attribute__ ((noinline)); + +static void zgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) +{ + + if ( n < 4 ) return; + + BLASLONG register i = 0; + + __asm__ __volatile__ + ( + "vzeroupper \n\t" + + "vxorpd %%xmm8 , %%xmm8 , %%xmm8 \n\t" // temp + "vxorpd %%xmm9 , %%xmm9 , %%xmm9 \n\t" // temp + "vxorpd %%xmm10, %%xmm10, %%xmm10 \n\t" // temp + "vxorpd %%xmm11, %%xmm11, %%xmm11 \n\t" // temp + "vxorpd %%xmm12, %%xmm12, %%xmm12 \n\t" // temp + "vxorpd %%xmm13, %%xmm13, %%xmm13 \n\t" + "vxorpd %%xmm14, %%xmm14, %%xmm14 \n\t" + "vxorpd %%xmm15, %%xmm15, %%xmm15 \n\t" + + ".align 16 \n\t" + ".L01LOOP%=: \n\t" + + "vmovddup (%2,%0,8), %%xmm0 \n\t" // real value from x0 + "vmovddup 8(%2,%0,8), %%xmm1 \n\t" // imag value from x0 + + "prefetcht0 192(%4,%0,8) \n\t" + "vmovups (%4,%0,8), %%xmm4 \n\t" // 1 complex values from a0 + "prefetcht0 192(%5,%0,8) \n\t" + "vmovups (%5,%0,8), %%xmm5 \n\t" // 1 complex values from a1 + "prefetcht0 192(%6,%0,8) \n\t" + "vmovups (%6,%0,8), %%xmm6 \n\t" // 1 
complex values from a2 + "prefetcht0 192(%7,%0,8) \n\t" + "vmovups (%7,%0,8), %%xmm7 \n\t" // 1 complex values from a3 + + "vfmaddpd %%xmm8 , %%xmm4 , %%xmm0, %%xmm8 \n\t" // ar0*xr0,al0*xr0 + "vfmaddpd %%xmm9 , %%xmm4 , %%xmm1, %%xmm9 \n\t" // ar0*xl0,al0*xl0 + "vfmaddpd %%xmm10, %%xmm5 , %%xmm0, %%xmm10 \n\t" // ar0*xr0,al0*xr0 + "vfmaddpd %%xmm11, %%xmm5 , %%xmm1, %%xmm11 \n\t" // ar0*xl0,al0*xl0 + "vfmaddpd %%xmm12, %%xmm6 , %%xmm0, %%xmm12 \n\t" // ar0*xr0,al0*xr0 + "vfmaddpd %%xmm13, %%xmm6 , %%xmm1, %%xmm13 \n\t" // ar0*xl0,al0*xl0 + "vfmaddpd %%xmm14, %%xmm7 , %%xmm0, %%xmm14 \n\t" // ar0*xr0,al0*xr0 + "vfmaddpd %%xmm15, %%xmm7 , %%xmm1, %%xmm15 \n\t" // ar0*xl0,al0*xl0 + + "vmovddup 16(%2,%0,8), %%xmm0 \n\t" // real value from x0 + "vmovddup 24(%2,%0,8), %%xmm1 \n\t" // imag value from x0 + + "vmovups 16(%4,%0,8), %%xmm4 \n\t" // 1 complex values from a0 + "vmovups 16(%5,%0,8), %%xmm5 \n\t" // 1 complex values from a1 + "vmovups 16(%6,%0,8), %%xmm6 \n\t" // 1 complex values from a2 + "vmovups 16(%7,%0,8), %%xmm7 \n\t" // 1 complex values from a3 + + "vfmaddpd %%xmm8 , %%xmm4 , %%xmm0, %%xmm8 \n\t" // ar0*xr0,al0*xr0 + "vfmaddpd %%xmm9 , %%xmm4 , %%xmm1, %%xmm9 \n\t" // ar0*xl0,al0*xl0 + "vfmaddpd %%xmm10, %%xmm5 , %%xmm0, %%xmm10 \n\t" // ar0*xr0,al0*xr0 + "vfmaddpd %%xmm11, %%xmm5 , %%xmm1, %%xmm11 \n\t" // ar0*xl0,al0*xl0 + "vfmaddpd %%xmm12, %%xmm6 , %%xmm0, %%xmm12 \n\t" // ar0*xr0,al0*xr0 + "vfmaddpd %%xmm13, %%xmm6 , %%xmm1, %%xmm13 \n\t" // ar0*xl0,al0*xl0 + "vfmaddpd %%xmm14, %%xmm7 , %%xmm0, %%xmm14 \n\t" // ar0*xr0,al0*xr0 + "vfmaddpd %%xmm15, %%xmm7 , %%xmm1, %%xmm15 \n\t" // ar0*xl0,al0*xl0 + + "vmovddup 32(%2,%0,8), %%xmm0 \n\t" // real value from x0 + "vmovddup 40(%2,%0,8), %%xmm1 \n\t" // imag value from x0 + + "vmovups 32(%4,%0,8), %%xmm4 \n\t" // 1 complex values from a0 + "vmovups 32(%5,%0,8), %%xmm5 \n\t" // 1 complex values from a1 + "vmovups 32(%6,%0,8), %%xmm6 \n\t" // 1 complex values from a2 + "vmovups 32(%7,%0,8), %%xmm7 \n\t" // 1 
complex values from a3 + + "vfmaddpd %%xmm8 , %%xmm4 , %%xmm0, %%xmm8 \n\t" // ar0*xr0,al0*xr0 + "vfmaddpd %%xmm9 , %%xmm4 , %%xmm1, %%xmm9 \n\t" // ar0*xl0,al0*xl0 + "vfmaddpd %%xmm10, %%xmm5 , %%xmm0, %%xmm10 \n\t" // ar0*xr0,al0*xr0 + "vfmaddpd %%xmm11, %%xmm5 , %%xmm1, %%xmm11 \n\t" // ar0*xl0,al0*xl0 + "vfmaddpd %%xmm12, %%xmm6 , %%xmm0, %%xmm12 \n\t" // ar0*xr0,al0*xr0 + "vfmaddpd %%xmm13, %%xmm6 , %%xmm1, %%xmm13 \n\t" // ar0*xl0,al0*xl0 + "vfmaddpd %%xmm14, %%xmm7 , %%xmm0, %%xmm14 \n\t" // ar0*xr0,al0*xr0 + "vfmaddpd %%xmm15, %%xmm7 , %%xmm1, %%xmm15 \n\t" // ar0*xl0,al0*xl0 + + "vmovddup 48(%2,%0,8), %%xmm0 \n\t" // real value from x0 + "vmovddup 56(%2,%0,8), %%xmm1 \n\t" // imag value from x0 + + "vmovups 48(%4,%0,8), %%xmm4 \n\t" // 1 complex values from a0 + "vmovups 48(%5,%0,8), %%xmm5 \n\t" // 1 complex values from a1 + "vmovups 48(%6,%0,8), %%xmm6 \n\t" // 1 complex values from a2 + "vmovups 48(%7,%0,8), %%xmm7 \n\t" // 1 complex values from a3 + + "vfmaddpd %%xmm8 , %%xmm4 , %%xmm0, %%xmm8 \n\t" // ar0*xr0,al0*xr0 + "vfmaddpd %%xmm9 , %%xmm4 , %%xmm1, %%xmm9 \n\t" // ar0*xl0,al0*xl0 + "vfmaddpd %%xmm10, %%xmm5 , %%xmm0, %%xmm10 \n\t" // ar0*xr0,al0*xr0 + "vfmaddpd %%xmm11, %%xmm5 , %%xmm1, %%xmm11 \n\t" // ar0*xl0,al0*xl0 + "vfmaddpd %%xmm12, %%xmm6 , %%xmm0, %%xmm12 \n\t" // ar0*xr0,al0*xr0 + "vfmaddpd %%xmm13, %%xmm6 , %%xmm1, %%xmm13 \n\t" // ar0*xl0,al0*xl0 + "vfmaddpd %%xmm14, %%xmm7 , %%xmm0, %%xmm14 \n\t" // ar0*xr0,al0*xr0 + "vfmaddpd %%xmm15, %%xmm7 , %%xmm1, %%xmm15 \n\t" // ar0*xl0,al0*xl0 + + "addq $8 , %0 \n\t" + "subq $4 , %1 \n\t" + "jnz .L01LOOP%= \n\t" + + "vmovddup (%8) , %%xmm0 \n\t" // value from alpha + "vmovddup 8(%8) , %%xmm1 \n\t" // value from alpha + +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + "vpermilpd $0x1 , %%xmm9 , %%xmm9 \n\t" + "vpermilpd $0x1 , %%xmm11, %%xmm11 \n\t" + "vpermilpd $0x1 , %%xmm13, %%xmm13 \n\t" + "vpermilpd $0x1 , %%xmm15, %%xmm15 \n\t" + "vaddsubpd %%xmm9 , 
%%xmm8, %%xmm8 \n\t" + "vaddsubpd %%xmm11, %%xmm10, %%xmm10 \n\t" + "vaddsubpd %%xmm13, %%xmm12, %%xmm12 \n\t" + "vaddsubpd %%xmm15, %%xmm14, %%xmm14 \n\t" +#else + "vpermilpd $0x1 , %%xmm8 , %%xmm8 \n\t" + "vpermilpd $0x1 , %%xmm10, %%xmm10 \n\t" + "vpermilpd $0x1 , %%xmm12, %%xmm12 \n\t" + "vpermilpd $0x1 , %%xmm14, %%xmm14 \n\t" + "vaddsubpd %%xmm8 , %%xmm9 , %%xmm8 \n\t" + "vaddsubpd %%xmm10, %%xmm11, %%xmm10 \n\t" + "vaddsubpd %%xmm12, %%xmm13, %%xmm12 \n\t" + "vaddsubpd %%xmm14, %%xmm15, %%xmm14 \n\t" + "vpermilpd $0x1 , %%xmm8 , %%xmm8 \n\t" + "vpermilpd $0x1 , %%xmm10, %%xmm10 \n\t" + "vpermilpd $0x1 , %%xmm12, %%xmm12 \n\t" + "vpermilpd $0x1 , %%xmm14, %%xmm14 \n\t" +#endif + + "vmulpd %%xmm8 , %%xmm1 , %%xmm9 \n\t" // t_r * alpha_i , t_i * alpha_i + "vmulpd %%xmm8 , %%xmm0 , %%xmm8 \n\t" // t_r * alpha_r , t_i * alpha_r + "vmulpd %%xmm10, %%xmm1 , %%xmm11 \n\t" // t_r * alpha_i , t_i * alpha_i + "vmulpd %%xmm10, %%xmm0 , %%xmm10 \n\t" // t_r * alpha_r , t_i * alpha_r + "vmulpd %%xmm12, %%xmm1 , %%xmm13 \n\t" // t_r * alpha_i , t_i * alpha_i + "vmulpd %%xmm12, %%xmm0 , %%xmm12 \n\t" // t_r * alpha_r , t_i * alpha_r + "vmulpd %%xmm14, %%xmm1 , %%xmm15 \n\t" // t_r * alpha_i , t_i * alpha_i + "vmulpd %%xmm14, %%xmm0 , %%xmm14 \n\t" // t_r * alpha_r , t_i * alpha_r + +#if !defined(XCONJ) + "vpermilpd $0x1 , %%xmm9 , %%xmm9 \n\t" + "vpermilpd $0x1 , %%xmm11, %%xmm11 \n\t" + "vpermilpd $0x1 , %%xmm13, %%xmm13 \n\t" + "vpermilpd $0x1 , %%xmm15, %%xmm15 \n\t" + "vaddsubpd %%xmm9 , %%xmm8, %%xmm8 \n\t" + "vaddsubpd %%xmm11, %%xmm10, %%xmm10 \n\t" + "vaddsubpd %%xmm13, %%xmm12, %%xmm12 \n\t" + "vaddsubpd %%xmm15, %%xmm14, %%xmm14 \n\t" +#else + "vpermilpd $0x1 , %%xmm8 , %%xmm8 \n\t" + "vpermilpd $0x1 , %%xmm10, %%xmm10 \n\t" + "vpermilpd $0x1 , %%xmm12, %%xmm12 \n\t" + "vpermilpd $0x1 , %%xmm14, %%xmm14 \n\t" + "vaddsubpd %%xmm8 , %%xmm9 , %%xmm8 \n\t" + "vaddsubpd %%xmm10, %%xmm11, %%xmm10 \n\t" + "vaddsubpd %%xmm12, %%xmm13, %%xmm12 \n\t" + "vaddsubpd %%xmm14, 
%%xmm15, %%xmm14 \n\t" + "vpermilpd $0x1 , %%xmm8 , %%xmm8 \n\t" + "vpermilpd $0x1 , %%xmm10, %%xmm10 \n\t" + "vpermilpd $0x1 , %%xmm12, %%xmm12 \n\t" + "vpermilpd $0x1 , %%xmm14, %%xmm14 \n\t" +#endif + + "vaddpd (%3) , %%xmm8 , %%xmm8 \n\t" + "vaddpd 16(%3) , %%xmm10, %%xmm10 \n\t" + "vaddpd 32(%3) , %%xmm12, %%xmm12 \n\t" + "vaddpd 48(%3) , %%xmm14, %%xmm14 \n\t" + + "vmovups %%xmm8 , (%3) \n\t" + "vmovups %%xmm10, 16(%3) \n\t" + "vmovups %%xmm12, 32(%3) \n\t" + "vmovups %%xmm14, 48(%3) \n\t" + + "vzeroupper \n\t" + + : + : + "r" (i), // 0 + "r" (n), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (ap[0]), // 4 + "r" (ap[1]), // 5 + "r" (ap[2]), // 6 + "r" (ap[3]), // 7 + "r" (alpha) // 8 + : "cc", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + +} + + From a8b0812feb9d388feb0f3a8499d2520ec4d77599 Mon Sep 17 00:00:00 2001 From: wernsaar Date: Fri, 12 Sep 2014 17:42:25 +0200 Subject: [PATCH 093/119] optimized zgemv_t for bulldozer --- kernel/x86_64/zgemv_t_microk_bulldozer-4.c | 238 ++++++++++++++++++++- 1 file changed, 236 insertions(+), 2 deletions(-) diff --git a/kernel/x86_64/zgemv_t_microk_bulldozer-4.c b/kernel/x86_64/zgemv_t_microk_bulldozer-4.c index 7d635317d..006db226b 100644 --- a/kernel/x86_64/zgemv_t_microk_bulldozer-4.c +++ b/kernel/x86_64/zgemv_t_microk_bulldozer-4.c @@ -31,8 +31,6 @@ static void zgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT static void zgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) { - if ( n < 4 ) return; - BLASLONG register i = 0; __asm__ __volatile__ @@ -220,4 +218,240 @@ static void zgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT } +#define HAVE_KERNEL_4x2 1 +static void zgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) __attribute__ ((noinline)); + +static void zgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT 
*x, FLOAT *y, FLOAT *alpha) +{ + + BLASLONG register i = 0; + + __asm__ __volatile__ + ( + "vzeroupper \n\t" + + "vxorpd %%xmm8 , %%xmm8 , %%xmm8 \n\t" // temp + "vxorpd %%xmm9 , %%xmm9 , %%xmm9 \n\t" // temp + "vxorpd %%xmm10, %%xmm10, %%xmm10 \n\t" // temp + "vxorpd %%xmm11, %%xmm11, %%xmm11 \n\t" // temp + + ".align 16 \n\t" + ".L01LOOP%=: \n\t" + + "vmovddup (%2,%0,8), %%xmm0 \n\t" // real value from x0 + "vmovddup 8(%2,%0,8), %%xmm1 \n\t" // imag value from x0 + + "prefetcht0 192(%4,%0,8) \n\t" + "vmovups (%4,%0,8), %%xmm4 \n\t" // 1 complex values from a0 + "prefetcht0 192(%5,%0,8) \n\t" + "vmovups (%5,%0,8), %%xmm5 \n\t" // 1 complex values from a1 + + "vfmaddpd %%xmm8 , %%xmm4 , %%xmm0, %%xmm8 \n\t" // ar0*xr0,al0*xr0 + "vfmaddpd %%xmm9 , %%xmm4 , %%xmm1, %%xmm9 \n\t" // ar0*xl0,al0*xl0 + "vfmaddpd %%xmm10, %%xmm5 , %%xmm0, %%xmm10 \n\t" // ar0*xr0,al0*xr0 + "vfmaddpd %%xmm11, %%xmm5 , %%xmm1, %%xmm11 \n\t" // ar0*xl0,al0*xl0 + + "vmovddup 16(%2,%0,8), %%xmm0 \n\t" // real value from x0 + "vmovddup 24(%2,%0,8), %%xmm1 \n\t" // imag value from x0 + + "vmovups 16(%4,%0,8), %%xmm4 \n\t" // 1 complex values from a0 + "vmovups 16(%5,%0,8), %%xmm5 \n\t" // 1 complex values from a1 + + "vfmaddpd %%xmm8 , %%xmm4 , %%xmm0, %%xmm8 \n\t" // ar0*xr0,al0*xr0 + "vfmaddpd %%xmm9 , %%xmm4 , %%xmm1, %%xmm9 \n\t" // ar0*xl0,al0*xl0 + "vfmaddpd %%xmm10, %%xmm5 , %%xmm0, %%xmm10 \n\t" // ar0*xr0,al0*xr0 + "vfmaddpd %%xmm11, %%xmm5 , %%xmm1, %%xmm11 \n\t" // ar0*xl0,al0*xl0 + + "vmovddup 32(%2,%0,8), %%xmm0 \n\t" // real value from x0 + "vmovddup 40(%2,%0,8), %%xmm1 \n\t" // imag value from x0 + + "vmovups 32(%4,%0,8), %%xmm4 \n\t" // 1 complex values from a0 + "vmovups 32(%5,%0,8), %%xmm5 \n\t" // 1 complex values from a1 + + "vfmaddpd %%xmm8 , %%xmm4 , %%xmm0, %%xmm8 \n\t" // ar0*xr0,al0*xr0 + "vfmaddpd %%xmm9 , %%xmm4 , %%xmm1, %%xmm9 \n\t" // ar0*xl0,al0*xl0 + "vfmaddpd %%xmm10, %%xmm5 , %%xmm0, %%xmm10 \n\t" // ar0*xr0,al0*xr0 + "vfmaddpd %%xmm11, %%xmm5 , %%xmm1, %%xmm11 
\n\t" // ar0*xl0,al0*xl0 + + "vmovddup 48(%2,%0,8), %%xmm0 \n\t" // real value from x0 + "vmovddup 56(%2,%0,8), %%xmm1 \n\t" // imag value from x0 + + "vmovups 48(%4,%0,8), %%xmm4 \n\t" // 1 complex values from a0 + "vmovups 48(%5,%0,8), %%xmm5 \n\t" // 1 complex values from a1 + + "vfmaddpd %%xmm8 , %%xmm4 , %%xmm0, %%xmm8 \n\t" // ar0*xr0,al0*xr0 + "vfmaddpd %%xmm9 , %%xmm4 , %%xmm1, %%xmm9 \n\t" // ar0*xl0,al0*xl0 + "vfmaddpd %%xmm10, %%xmm5 , %%xmm0, %%xmm10 \n\t" // ar0*xr0,al0*xr0 + "vfmaddpd %%xmm11, %%xmm5 , %%xmm1, %%xmm11 \n\t" // ar0*xl0,al0*xl0 + + "addq $8 , %0 \n\t" + "subq $4 , %1 \n\t" + "jnz .L01LOOP%= \n\t" + + "vmovddup (%6) , %%xmm0 \n\t" // value from alpha + "vmovddup 8(%6) , %%xmm1 \n\t" // value from alpha + +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + "vpermilpd $0x1 , %%xmm9 , %%xmm9 \n\t" + "vpermilpd $0x1 , %%xmm11, %%xmm11 \n\t" + "vaddsubpd %%xmm9 , %%xmm8, %%xmm8 \n\t" + "vaddsubpd %%xmm11, %%xmm10, %%xmm10 \n\t" +#else + "vpermilpd $0x1 , %%xmm8 , %%xmm8 \n\t" + "vpermilpd $0x1 , %%xmm10, %%xmm10 \n\t" + "vaddsubpd %%xmm8 , %%xmm9 , %%xmm8 \n\t" + "vaddsubpd %%xmm10, %%xmm11, %%xmm10 \n\t" + "vpermilpd $0x1 , %%xmm8 , %%xmm8 \n\t" + "vpermilpd $0x1 , %%xmm10, %%xmm10 \n\t" +#endif + + "vmulpd %%xmm8 , %%xmm1 , %%xmm9 \n\t" // t_r * alpha_i , t_i * alpha_i + "vmulpd %%xmm8 , %%xmm0 , %%xmm8 \n\t" // t_r * alpha_r , t_i * alpha_r + "vmulpd %%xmm10, %%xmm1 , %%xmm11 \n\t" // t_r * alpha_i , t_i * alpha_i + "vmulpd %%xmm10, %%xmm0 , %%xmm10 \n\t" // t_r * alpha_r , t_i * alpha_r + +#if !defined(XCONJ) + "vpermilpd $0x1 , %%xmm9 , %%xmm9 \n\t" + "vpermilpd $0x1 , %%xmm11, %%xmm11 \n\t" + "vaddsubpd %%xmm9 , %%xmm8, %%xmm8 \n\t" + "vaddsubpd %%xmm11, %%xmm10, %%xmm10 \n\t" +#else + "vpermilpd $0x1 , %%xmm8 , %%xmm8 \n\t" + "vpermilpd $0x1 , %%xmm10, %%xmm10 \n\t" + "vaddsubpd %%xmm8 , %%xmm9 , %%xmm8 \n\t" + "vaddsubpd %%xmm10, %%xmm11, %%xmm10 \n\t" + "vpermilpd $0x1 , %%xmm8 , %%xmm8 \n\t" + 
"vpermilpd $0x1 , %%xmm10, %%xmm10 \n\t" +#endif + + "vaddpd (%3) , %%xmm8 , %%xmm8 \n\t" + "vaddpd 16(%3) , %%xmm10, %%xmm10 \n\t" + + "vmovups %%xmm8 , (%3) \n\t" + "vmovups %%xmm10, 16(%3) \n\t" + + "vzeroupper \n\t" + + : + : + "r" (i), // 0 + "r" (n), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (ap[0]), // 4 + "r" (ap[1]), // 5 + "r" (alpha) // 6 + : "cc", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + +} + + + +#define HAVE_KERNEL_4x1 1 +static void zgemv_kernel_4x1( BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *alpha) __attribute__ ((noinline)); + +static void zgemv_kernel_4x1( BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *alpha) +{ + + BLASLONG register i = 0; + + __asm__ __volatile__ + ( + "vzeroupper \n\t" + + "vxorpd %%xmm8 , %%xmm8 , %%xmm8 \n\t" // temp + "vxorpd %%xmm9 , %%xmm9 , %%xmm9 \n\t" // temp + + ".align 16 \n\t" + ".L01LOOP%=: \n\t" + + "vmovddup (%2,%0,8), %%xmm0 \n\t" // real value from x0 + "vmovddup 8(%2,%0,8), %%xmm1 \n\t" // imag value from x0 + + "prefetcht0 192(%4,%0,8) \n\t" + "vmovups (%4,%0,8), %%xmm4 \n\t" // 1 complex values from a0 + "vmovups 16(%4,%0,8), %%xmm5 \n\t" // 1 complex values from a0 + + "vmovddup 16(%2,%0,8), %%xmm2 \n\t" // real value from x0 + "vmovddup 24(%2,%0,8), %%xmm3 \n\t" // imag value from x0 + + "vfmaddpd %%xmm8 , %%xmm4 , %%xmm0, %%xmm8 \n\t" // ar0*xr0,al0*xr0 + "vfmaddpd %%xmm9 , %%xmm4 , %%xmm1, %%xmm9 \n\t" // ar0*xl0,al0*xl0 + + "vmovddup 32(%2,%0,8), %%xmm0 \n\t" // real value from x0 + "vmovddup 40(%2,%0,8), %%xmm1 \n\t" // imag value from x0 + + "vfmaddpd %%xmm8 , %%xmm5 , %%xmm2, %%xmm8 \n\t" // ar0*xr0,al0*xr0 + "vfmaddpd %%xmm9 , %%xmm5 , %%xmm3, %%xmm9 \n\t" // ar0*xl0,al0*xl0 + + "vmovups 32(%4,%0,8), %%xmm4 \n\t" // 1 complex values from a0 + "vmovups 48(%4,%0,8), %%xmm5 \n\t" // 1 complex values from a0 + + "vmovddup 48(%2,%0,8), %%xmm2 \n\t" // real value 
from x0 + "vmovddup 56(%2,%0,8), %%xmm3 \n\t" // imag value from x0 + + "addq $8 , %0 \n\t" + "vfmaddpd %%xmm8 , %%xmm4 , %%xmm0, %%xmm8 \n\t" // ar0*xr0,al0*xr0 + "vfmaddpd %%xmm9 , %%xmm4 , %%xmm1, %%xmm9 \n\t" // ar0*xl0,al0*xl0 + + "subq $4 , %1 \n\t" + "vfmaddpd %%xmm8 , %%xmm5 , %%xmm2, %%xmm8 \n\t" // ar0*xr0,al0*xr0 + "vfmaddpd %%xmm9 , %%xmm5 , %%xmm3, %%xmm9 \n\t" // ar0*xl0,al0*xl0 + + "jnz .L01LOOP%= \n\t" + + "vmovddup (%5) , %%xmm0 \n\t" // value from alpha + "vmovddup 8(%5) , %%xmm1 \n\t" // value from alpha + +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + "vpermilpd $0x1 , %%xmm9 , %%xmm9 \n\t" + "vaddsubpd %%xmm9 , %%xmm8, %%xmm8 \n\t" +#else + "vpermilpd $0x1 , %%xmm8 , %%xmm8 \n\t" + "vaddsubpd %%xmm8 , %%xmm9 , %%xmm8 \n\t" + "vpermilpd $0x1 , %%xmm8 , %%xmm8 \n\t" +#endif + + "vmulpd %%xmm8 , %%xmm1 , %%xmm9 \n\t" // t_r * alpha_i , t_i * alpha_i + "vmulpd %%xmm8 , %%xmm0 , %%xmm8 \n\t" // t_r * alpha_r , t_i * alpha_r + +#if !defined(XCONJ) + "vpermilpd $0x1 , %%xmm9 , %%xmm9 \n\t" + "vaddsubpd %%xmm9 , %%xmm8, %%xmm8 \n\t" +#else + "vpermilpd $0x1 , %%xmm8 , %%xmm8 \n\t" + "vaddsubpd %%xmm8 , %%xmm9 , %%xmm8 \n\t" + "vpermilpd $0x1 , %%xmm8 , %%xmm8 \n\t" +#endif + + "vaddpd (%3) , %%xmm8 , %%xmm8 \n\t" + + "vmovups %%xmm8 , (%3) \n\t" + + "vzeroupper \n\t" + + : + : + "r" (i), // 0 + "r" (n), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (ap), // 4 + "r" (alpha) // 5 + : "cc", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + +} + From 0a1390f2d8064921fcd1286667d3f49262bc9f76 Mon Sep 17 00:00:00 2001 From: wernsaar Date: Fri, 12 Sep 2014 17:43:47 +0200 Subject: [PATCH 094/119] enabled optimized zgemv_t kernel for bulldozer --- kernel/x86_64/KERNEL.BULLDOZER | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/x86_64/KERNEL.BULLDOZER b/kernel/x86_64/KERNEL.BULLDOZER 
index 0fd7ac35f..289529772 100644 --- a/kernel/x86_64/KERNEL.BULLDOZER +++ b/kernel/x86_64/KERNEL.BULLDOZER @@ -14,7 +14,7 @@ SGEMVNKERNEL = sgemv_n_4.c SGEMVTKERNEL = sgemv_t_4.c ZGEMVNKERNEL = zgemv_n_dup.S -ZGEMVTKERNEL = zgemv_t.c +ZGEMVTKERNEL = zgemv_t_4.c DGEMVNKERNEL = dgemv_n_bulldozer.S DGEMVTKERNEL = dgemv_t_bulldozer.S From fd2478c9e2eeb2ec26eda6aec6a9293eb2f96151 Mon Sep 17 00:00:00 2001 From: wernsaar Date: Fri, 12 Sep 2014 19:18:23 +0200 Subject: [PATCH 095/119] optimized interface/zgemv.c for multithreading --- interface/zgemv.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/interface/zgemv.c b/interface/zgemv.c index 50513a8e4..704034aaf 100644 --- a/interface/zgemv.c +++ b/interface/zgemv.c @@ -238,7 +238,7 @@ void CNAME(enum CBLAS_ORDER order, int nthreads_avail = nthreads_max; double MNK = (double) m * (double) n; - if ( MNK <= (80.0 * 20.0 * (double) GEMM_MULTITHREAD_THRESHOLD) ) + if ( MNK <= ( 256.0 * (double) (GEMM_MULTITHREAD_THRESHOLD * GEMM_MULTITHREAD_THRESHOLD) )) nthreads_max = 1; if ( nthreads_max > nthreads_avail ) From 8a39cdb1c16940dfb3f1d0c28eb47828c27bb913 Mon Sep 17 00:00:00 2001 From: wernsaar Date: Sat, 13 Sep 2014 09:47:07 +0200 Subject: [PATCH 096/119] added optimized zgemv_t kernel for haswell --- kernel/x86_64/KERNEL.PILEDRIVER | 2 +- kernel/x86_64/zgemv_t_microk_haswell-4.c | 428 +++++++++++++++++++++++ 2 files changed, 429 insertions(+), 1 deletion(-) create mode 100644 kernel/x86_64/zgemv_t_microk_haswell-4.c diff --git a/kernel/x86_64/KERNEL.PILEDRIVER b/kernel/x86_64/KERNEL.PILEDRIVER index 4f15e5a36..fab34efd3 100644 --- a/kernel/x86_64/KERNEL.PILEDRIVER +++ b/kernel/x86_64/KERNEL.PILEDRIVER @@ -2,7 +2,7 @@ SGEMVNKERNEL = sgemv_n_4.c SGEMVTKERNEL = sgemv_t_4.c ZGEMVNKERNEL = zgemv_n_dup.S -ZGEMVTKERNEL = zgemv_t.S +ZGEMVTKERNEL = zgemv_t_4.S DGEMVNKERNEL = dgemv_n_bulldozer.S DGEMVTKERNEL = dgemv_t_bulldozer.S diff --git a/kernel/x86_64/zgemv_t_microk_haswell-4.c 
b/kernel/x86_64/zgemv_t_microk_haswell-4.c new file mode 100644 index 000000000..c87b5ce0f --- /dev/null +++ b/kernel/x86_64/zgemv_t_microk_haswell-4.c @@ -0,0 +1,428 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary froms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary from must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#define HAVE_KERNEL_4x4 1 +static void zgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) __attribute__ ((noinline)); + +static void zgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) +{ + + BLASLONG register i = 0; + + __asm__ __volatile__ + ( + "vzeroupper \n\t" + + "vxorpd %%ymm8 , %%ymm8 , %%ymm8 \n\t" // temp + "vxorpd %%ymm9 , %%ymm9 , %%ymm9 \n\t" // temp + "vxorpd %%ymm10, %%ymm10, %%ymm10 \n\t" // temp + "vxorpd %%ymm11, %%ymm11, %%ymm11 \n\t" // temp + "vxorpd %%ymm12, %%ymm12, %%ymm12 \n\t" // temp + "vxorpd %%ymm13, %%ymm13, %%ymm13 \n\t" + "vxorpd %%ymm14, %%ymm14, %%ymm14 \n\t" + "vxorpd %%ymm15, %%ymm15, %%ymm15 \n\t" + + ".align 16 \n\t" + ".L01LOOP%=: \n\t" + + "prefetcht0 192(%2,%0,8) \n\t" + "vmovddup (%2,%0,8), %%xmm0 \n\t" // real value from x0 + "prefetcht0 192(%4,%0,8) \n\t" + "vmovups (%5,%0,8), %%ymm5 \n\t" // 2 complex values from a1 + "vmovddup 8(%2,%0,8), %%xmm1 \n\t" // imag value from x0 + "vmovups (%4,%0,8), %%ymm4 \n\t" // 2 complex values from a0 + "prefetcht0 192(%5,%0,8) \n\t" + "vmovddup 16(%2,%0,8), %%xmm2 \n\t" // real value from x1 + "prefetcht0 192(%6,%0,8) \n\t" + "vmovups (%6,%0,8), %%ymm6 \n\t" // 2 complex values from a2 + "vmovddup 24(%2,%0,8), %%xmm3 \n\t" // imag value from x1 + "prefetcht0 192(%7,%0,8) \n\t" + "vmovups (%7,%0,8), %%ymm7 \n\t" // 2 complex values from a3 + "vinsertf128 $1, %%xmm2, %%ymm0 , %%ymm0 \n\t" // real values from x0 and x1 + "vinsertf128 $1, %%xmm3, %%ymm1 , %%ymm1 \n\t" // imag values from x0 and x1 + + "vfmadd231pd %%ymm4 , %%ymm0, %%ymm8 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1 + "vfmadd231pd %%ymm4 , %%ymm1, %%ymm9 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1 + "vfmadd231pd %%ymm5 , %%ymm0, %%ymm10 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1 + "vfmadd231pd %%ymm5 , %%ymm1, %%ymm11 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1 + "vfmadd231pd %%ymm6 , %%ymm0, %%ymm12 \n\t" 
// ar0*xr0,al0*xr0,ar1*xr1,al1*xr1 + "vfmadd231pd %%ymm6 , %%ymm1, %%ymm13 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1 + "vfmadd231pd %%ymm7 , %%ymm0, %%ymm14 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1 + "vfmadd231pd %%ymm7 , %%ymm1, %%ymm15 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1 + + "vmovups 32(%4,%0,8), %%ymm4 \n\t" // 2 complex values from a0 + "vmovups 32(%5,%0,8), %%ymm5 \n\t" // 2 complex values from a1 + "vmovddup 32(%2,%0,8), %%xmm0 \n\t" // real value from x0 + "vmovddup 40(%2,%0,8), %%xmm1 \n\t" // imag value from x0 + "vmovddup 48(%2,%0,8), %%xmm2 \n\t" // real value from x1 + "vmovddup 56(%2,%0,8), %%xmm3 \n\t" // imag value from x1 + "vmovups 32(%6,%0,8), %%ymm6 \n\t" // 2 complex values from a2 + "vmovups 32(%7,%0,8), %%ymm7 \n\t" // 2 complex values from a3 + "vinsertf128 $1, %%xmm2, %%ymm0 , %%ymm0 \n\t" // real values from x0 and x1 + "vinsertf128 $1, %%xmm3, %%ymm1 , %%ymm1 \n\t" // imag values from x0 and x1 + + "vfmadd231pd %%ymm4 , %%ymm0, %%ymm8 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1 + "vfmadd231pd %%ymm4 , %%ymm1, %%ymm9 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1 + "vfmadd231pd %%ymm5 , %%ymm0, %%ymm10 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1 + "vfmadd231pd %%ymm5 , %%ymm1, %%ymm11 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1 + "vfmadd231pd %%ymm6 , %%ymm0, %%ymm12 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1 + "vfmadd231pd %%ymm6 , %%ymm1, %%ymm13 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1 + "vfmadd231pd %%ymm7 , %%ymm0, %%ymm14 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1 + "vfmadd231pd %%ymm7 , %%ymm1, %%ymm15 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1 + + "addq $8 , %0 \n\t" + "subq $4 , %1 \n\t" + "jnz .L01LOOP%= \n\t" + + "vmovddup (%8) , %%xmm0 \n\t" // value from alpha + "vmovddup 8(%8) , %%xmm1 \n\t" // value from alpha + +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + "vpermilpd $0x5 , %%ymm9 , %%ymm9 \n\t" + "vpermilpd $0x5 , %%ymm11, %%ymm11 \n\t" + "vpermilpd $0x5 , %%ymm13, %%ymm13 \n\t" + "vpermilpd $0x5 , 
%%ymm15, %%ymm15 \n\t" + "vaddsubpd %%ymm9 , %%ymm8, %%ymm8 \n\t" + "vaddsubpd %%ymm11, %%ymm10, %%ymm10 \n\t" + "vaddsubpd %%ymm13, %%ymm12, %%ymm12 \n\t" + "vaddsubpd %%ymm15, %%ymm14, %%ymm14 \n\t" +#else + "vpermilpd $0x5 , %%ymm8 , %%ymm8 \n\t" + "vpermilpd $0x5 , %%ymm10, %%ymm10 \n\t" + "vpermilpd $0x5 , %%ymm12, %%ymm12 \n\t" + "vpermilpd $0x5 , %%ymm14, %%ymm14 \n\t" + "vaddsubpd %%ymm8 , %%ymm9 , %%ymm8 \n\t" + "vaddsubpd %%ymm10, %%ymm11, %%ymm10 \n\t" + "vaddsubpd %%ymm12, %%ymm13, %%ymm12 \n\t" + "vaddsubpd %%ymm14, %%ymm15, %%ymm14 \n\t" + "vpermilpd $0x5 , %%ymm8 , %%ymm8 \n\t" + "vpermilpd $0x5 , %%ymm10, %%ymm10 \n\t" + "vpermilpd $0x5 , %%ymm12, %%ymm12 \n\t" + "vpermilpd $0x5 , %%ymm14, %%ymm14 \n\t" +#endif + + "vextractf128 $1, %%ymm8 , %%xmm9 \n\t" + "vextractf128 $1, %%ymm10, %%xmm11 \n\t" + "vextractf128 $1, %%ymm12, %%xmm13 \n\t" + "vextractf128 $1, %%ymm14, %%xmm15 \n\t" + + "vaddpd %%xmm8 , %%xmm9 , %%xmm8 \n\t" + "vaddpd %%xmm10, %%xmm11, %%xmm10 \n\t" + "vaddpd %%xmm12, %%xmm13, %%xmm12 \n\t" + "vaddpd %%xmm14, %%xmm15, %%xmm14 \n\t" + + "vmulpd %%xmm8 , %%xmm1 , %%xmm9 \n\t" // t_r * alpha_i , t_i * alpha_i + "vmulpd %%xmm8 , %%xmm0 , %%xmm8 \n\t" // t_r * alpha_r , t_i * alpha_r + "vmulpd %%xmm10, %%xmm1 , %%xmm11 \n\t" // t_r * alpha_i , t_i * alpha_i + "vmulpd %%xmm10, %%xmm0 , %%xmm10 \n\t" // t_r * alpha_r , t_i * alpha_r + "vmulpd %%xmm12, %%xmm1 , %%xmm13 \n\t" // t_r * alpha_i , t_i * alpha_i + "vmulpd %%xmm12, %%xmm0 , %%xmm12 \n\t" // t_r * alpha_r , t_i * alpha_r + "vmulpd %%xmm14, %%xmm1 , %%xmm15 \n\t" // t_r * alpha_i , t_i * alpha_i + "vmulpd %%xmm14, %%xmm0 , %%xmm14 \n\t" // t_r * alpha_r , t_i * alpha_r + +#if !defined(XCONJ) + "vpermilpd $0x1 , %%xmm9 , %%xmm9 \n\t" + "vpermilpd $0x1 , %%xmm11, %%xmm11 \n\t" + "vpermilpd $0x1 , %%xmm13, %%xmm13 \n\t" + "vpermilpd $0x1 , %%xmm15, %%xmm15 \n\t" + "vaddsubpd %%xmm9 , %%xmm8, %%xmm8 \n\t" + "vaddsubpd %%xmm11, %%xmm10, %%xmm10 \n\t" + "vaddsubpd %%xmm13, %%xmm12, %%xmm12 
\n\t" + "vaddsubpd %%xmm15, %%xmm14, %%xmm14 \n\t" +#else + "vpermilpd $0x1 , %%xmm8 , %%xmm8 \n\t" + "vpermilpd $0x1 , %%xmm10, %%xmm10 \n\t" + "vpermilpd $0x1 , %%xmm12, %%xmm12 \n\t" + "vpermilpd $0x1 , %%xmm14, %%xmm14 \n\t" + "vaddsubpd %%xmm8 , %%xmm9 , %%xmm8 \n\t" + "vaddsubpd %%xmm10, %%xmm11, %%xmm10 \n\t" + "vaddsubpd %%xmm12, %%xmm13, %%xmm12 \n\t" + "vaddsubpd %%xmm14, %%xmm15, %%xmm14 \n\t" + "vpermilpd $0x1 , %%xmm8 , %%xmm8 \n\t" + "vpermilpd $0x1 , %%xmm10, %%xmm10 \n\t" + "vpermilpd $0x1 , %%xmm12, %%xmm12 \n\t" + "vpermilpd $0x1 , %%xmm14, %%xmm14 \n\t" +#endif + + "vaddpd (%3) , %%xmm8 , %%xmm8 \n\t" + "vaddpd 16(%3) , %%xmm10, %%xmm10 \n\t" + "vaddpd 32(%3) , %%xmm12, %%xmm12 \n\t" + "vaddpd 48(%3) , %%xmm14, %%xmm14 \n\t" + + "vmovups %%xmm8 , (%3) \n\t" + "vmovups %%xmm10, 16(%3) \n\t" + "vmovups %%xmm12, 32(%3) \n\t" + "vmovups %%xmm14, 48(%3) \n\t" + + "vzeroupper \n\t" + + : + : + "r" (i), // 0 + "r" (n), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (ap[0]), // 4 + "r" (ap[1]), // 5 + "r" (ap[2]), // 6 + "r" (ap[3]), // 7 + "r" (alpha) // 8 + : "cc", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + +} + + +#define HAVE_KERNEL_4x2 1 +static void zgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) __attribute__ ((noinline)); + +static void zgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) +{ + + BLASLONG register i = 0; + + __asm__ __volatile__ + ( + "vzeroupper \n\t" + + "vxorpd %%ymm8 , %%ymm8 , %%ymm8 \n\t" // temp + "vxorpd %%ymm9 , %%ymm9 , %%ymm9 \n\t" // temp + "vxorpd %%ymm10, %%ymm10, %%ymm10 \n\t" // temp + "vxorpd %%ymm11, %%ymm11, %%ymm11 \n\t" // temp + + ".align 16 \n\t" + ".L01LOOP%=: \n\t" + + "prefetcht0 192(%2,%0,8) \n\t" + "vmovddup (%2,%0,8), %%xmm0 \n\t" // real value from x0 + "prefetcht0 192(%4,%0,8) \n\t" + "vmovups (%5,%0,8), %%ymm5 \n\t" // 2 
complex values from a1 + "vmovddup 8(%2,%0,8), %%xmm1 \n\t" // imag value from x0 + "vmovups (%4,%0,8), %%ymm4 \n\t" // 2 complex values from a0 + "prefetcht0 192(%5,%0,8) \n\t" + "vmovddup 16(%2,%0,8), %%xmm2 \n\t" // real value from x1 + "vmovddup 24(%2,%0,8), %%xmm3 \n\t" // imag value from x1 + "vinsertf128 $1, %%xmm2, %%ymm0 , %%ymm0 \n\t" // real values from x0 and x1 + "vinsertf128 $1, %%xmm3, %%ymm1 , %%ymm1 \n\t" // imag values from x0 and x1 + + "vfmadd231pd %%ymm4 , %%ymm0, %%ymm8 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1 + "vfmadd231pd %%ymm4 , %%ymm1, %%ymm9 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1 + "vfmadd231pd %%ymm5 , %%ymm0, %%ymm10 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1 + "vfmadd231pd %%ymm5 , %%ymm1, %%ymm11 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1 + + "vmovups 32(%4,%0,8), %%ymm4 \n\t" // 2 complex values from a0 + "vmovups 32(%5,%0,8), %%ymm5 \n\t" // 2 complex values from a1 + "vmovddup 32(%2,%0,8), %%xmm0 \n\t" // real value from x0 + "vmovddup 40(%2,%0,8), %%xmm1 \n\t" // imag value from x0 + "vmovddup 48(%2,%0,8), %%xmm2 \n\t" // real value from x1 + "vmovddup 56(%2,%0,8), %%xmm3 \n\t" // imag value from x1 + "vinsertf128 $1, %%xmm2, %%ymm0 , %%ymm0 \n\t" // real values from x0 and x1 + "vinsertf128 $1, %%xmm3, %%ymm1 , %%ymm1 \n\t" // imag values from x0 and x1 + + "vfmadd231pd %%ymm4 , %%ymm0, %%ymm8 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1 + "vfmadd231pd %%ymm4 , %%ymm1, %%ymm9 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1 + "vfmadd231pd %%ymm5 , %%ymm0, %%ymm10 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1 + "vfmadd231pd %%ymm5 , %%ymm1, %%ymm11 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1 + + "addq $8 , %0 \n\t" + "subq $4 , %1 \n\t" + "jnz .L01LOOP%= \n\t" + + "vmovddup (%6) , %%xmm0 \n\t" // value from alpha + "vmovddup 8(%6) , %%xmm1 \n\t" // value from alpha + +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + "vpermilpd $0x5 , %%ymm9 , %%ymm9 \n\t" + "vpermilpd $0x5 , %%ymm11, %%ymm11 \n\t" + "vaddsubpd %%ymm9 
, %%ymm8, %%ymm8 \n\t" + "vaddsubpd %%ymm11, %%ymm10, %%ymm10 \n\t" +#else + "vpermilpd $0x5 , %%ymm8 , %%ymm8 \n\t" + "vpermilpd $0x5 , %%ymm10, %%ymm10 \n\t" + "vaddsubpd %%ymm8 , %%ymm9 , %%ymm8 \n\t" + "vaddsubpd %%ymm10, %%ymm11, %%ymm10 \n\t" + "vpermilpd $0x5 , %%ymm8 , %%ymm8 \n\t" + "vpermilpd $0x5 , %%ymm10, %%ymm10 \n\t" +#endif + + "vextractf128 $1, %%ymm8 , %%xmm9 \n\t" + "vextractf128 $1, %%ymm10, %%xmm11 \n\t" + + "vaddpd %%xmm8 , %%xmm9 , %%xmm8 \n\t" + "vaddpd %%xmm10, %%xmm11, %%xmm10 \n\t" + + "vmulpd %%xmm8 , %%xmm1 , %%xmm9 \n\t" // t_r * alpha_i , t_i * alpha_i + "vmulpd %%xmm8 , %%xmm0 , %%xmm8 \n\t" // t_r * alpha_r , t_i * alpha_r + "vmulpd %%xmm10, %%xmm1 , %%xmm11 \n\t" // t_r * alpha_i , t_i * alpha_i + "vmulpd %%xmm10, %%xmm0 , %%xmm10 \n\t" // t_r * alpha_r , t_i * alpha_r + +#if !defined(XCONJ) + "vpermilpd $0x1 , %%xmm9 , %%xmm9 \n\t" + "vpermilpd $0x1 , %%xmm11, %%xmm11 \n\t" + "vaddsubpd %%xmm9 , %%xmm8, %%xmm8 \n\t" + "vaddsubpd %%xmm11, %%xmm10, %%xmm10 \n\t" +#else + "vpermilpd $0x1 , %%xmm8 , %%xmm8 \n\t" + "vpermilpd $0x1 , %%xmm10, %%xmm10 \n\t" + "vaddsubpd %%xmm8 , %%xmm9 , %%xmm8 \n\t" + "vaddsubpd %%xmm10, %%xmm11, %%xmm10 \n\t" + "vpermilpd $0x1 , %%xmm8 , %%xmm8 \n\t" + "vpermilpd $0x1 , %%xmm10, %%xmm10 \n\t" +#endif + + "vaddpd (%3) , %%xmm8 , %%xmm8 \n\t" + "vaddpd 16(%3) , %%xmm10, %%xmm10 \n\t" + + "vmovups %%xmm8 , (%3) \n\t" + "vmovups %%xmm10, 16(%3) \n\t" + + "vzeroupper \n\t" + + : + : + "r" (i), // 0 + "r" (n), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (ap[0]), // 4 + "r" (ap[1]), // 5 + "r" (alpha) // 6 + : "cc", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + +} + +#define HAVE_KERNEL_4x1 1 +static void zgemv_kernel_4x1( BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *alpha) __attribute__ ((noinline)); + +static void zgemv_kernel_4x1( BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, 
FLOAT *alpha) +{ + + BLASLONG register i = 0; + + __asm__ __volatile__ + ( + "vzeroupper \n\t" + + "vxorpd %%ymm8 , %%ymm8 , %%ymm8 \n\t" // temp + "vxorpd %%ymm9 , %%ymm9 , %%ymm9 \n\t" // temp + + ".align 16 \n\t" + ".L01LOOP%=: \n\t" + + "prefetcht0 192(%2,%0,8) \n\t" + "vmovddup (%2,%0,8), %%xmm0 \n\t" // real value from x0 + "prefetcht0 192(%4,%0,8) \n\t" + "vmovddup 8(%2,%0,8), %%xmm1 \n\t" // imag value from x0 + "vmovups (%4,%0,8), %%ymm4 \n\t" // 2 complex values from a0 + "vmovddup 16(%2,%0,8), %%xmm2 \n\t" // real value from x1 + "vmovddup 24(%2,%0,8), %%xmm3 \n\t" // imag value from x1 + "vinsertf128 $1, %%xmm2, %%ymm0 , %%ymm0 \n\t" // real values from x0 and x1 + "vinsertf128 $1, %%xmm3, %%ymm1 , %%ymm1 \n\t" // imag values from x0 and x1 + + "vfmadd231pd %%ymm4 , %%ymm0, %%ymm8 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1 + "vfmadd231pd %%ymm4 , %%ymm1, %%ymm9 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1 + + "vmovups 32(%4,%0,8), %%ymm4 \n\t" // 2 complex values from a0 + "vmovddup 32(%2,%0,8), %%xmm0 \n\t" // real value from x0 + "vmovddup 40(%2,%0,8), %%xmm1 \n\t" // imag value from x0 + "vmovddup 48(%2,%0,8), %%xmm2 \n\t" // real value from x1 + "vmovddup 56(%2,%0,8), %%xmm3 \n\t" // imag value from x1 + "vinsertf128 $1, %%xmm2, %%ymm0 , %%ymm0 \n\t" // real values from x0 and x1 + "vinsertf128 $1, %%xmm3, %%ymm1 , %%ymm1 \n\t" // imag values from x0 and x1 + + "vfmadd231pd %%ymm4 , %%ymm0, %%ymm8 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1 + "vfmadd231pd %%ymm4 , %%ymm1, %%ymm9 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1 + + "addq $8 , %0 \n\t" + "subq $4 , %1 \n\t" + "jnz .L01LOOP%= \n\t" + + "vmovddup (%5) , %%xmm0 \n\t" // value from alpha + "vmovddup 8(%5) , %%xmm1 \n\t" // value from alpha + +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + "vpermilpd $0x5 , %%ymm9 , %%ymm9 \n\t" + "vaddsubpd %%ymm9 , %%ymm8, %%ymm8 \n\t" +#else + "vpermilpd $0x5 , %%ymm8 , %%ymm8 \n\t" + "vaddsubpd %%ymm8 , %%ymm9 , %%ymm8 \n\t" + 
"vpermilpd $0x5 , %%ymm8 , %%ymm8 \n\t" +#endif + + "vextractf128 $1, %%ymm8 , %%xmm9 \n\t" + + "vaddpd %%xmm8 , %%xmm9 , %%xmm8 \n\t" + + "vmulpd %%xmm8 , %%xmm1 , %%xmm9 \n\t" // t_r * alpha_i , t_i * alpha_i + "vmulpd %%xmm8 , %%xmm0 , %%xmm8 \n\t" // t_r * alpha_r , t_i * alpha_r + +#if !defined(XCONJ) + "vpermilpd $0x1 , %%xmm9 , %%xmm9 \n\t" + "vaddsubpd %%xmm9 , %%xmm8, %%xmm8 \n\t" +#else + "vpermilpd $0x1 , %%xmm8 , %%xmm8 \n\t" + "vaddsubpd %%xmm8 , %%xmm9 , %%xmm8 \n\t" + "vpermilpd $0x1 , %%xmm8 , %%xmm8 \n\t" +#endif + + "vaddpd (%3) , %%xmm8 , %%xmm8 \n\t" + + "vmovups %%xmm8 , (%3) \n\t" + + "vzeroupper \n\t" + + : + : + "r" (i), // 0 + "r" (n), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (ap), // 4 + "r" (alpha) // 5 + : "cc", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + +} + + + From 5194818d4b1a2dc7d5a4768eadecb0f6c29432ec Mon Sep 17 00:00:00 2001 From: wernsaar Date: Sat, 13 Sep 2014 09:48:34 +0200 Subject: [PATCH 097/119] updated zgemv_t_4.c --- kernel/x86_64/zgemv_t_4.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/x86_64/zgemv_t_4.c b/kernel/x86_64/zgemv_t_4.c index a4c5fe76b..84cf4e2e8 100644 --- a/kernel/x86_64/zgemv_t_4.c +++ b/kernel/x86_64/zgemv_t_4.c @@ -32,7 +32,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#if defined(BULLDOZER) || defined(PILEDRIVER) #include "zgemv_t_microk_bulldozer-4.c" #elif defined(HASWELL) -#include "zgemv_t_microk_haswell-2.c" +#include "zgemv_t_microk_haswell-4.c" #endif From 1a352b24e64b403a39f7a376cb014da97e287c30 Mon Sep 17 00:00:00 2001 From: wernsaar Date: Sat, 13 Sep 2014 12:23:27 +0200 Subject: [PATCH 098/119] updated KERNEL.HASWELL --- kernel/x86_64/KERNEL.HASWELL | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/x86_64/KERNEL.HASWELL b/kernel/x86_64/KERNEL.HASWELL index fc4a4c7f8..6e7d71e51 100644 --- a/kernel/x86_64/KERNEL.HASWELL +++ b/kernel/x86_64/KERNEL.HASWELL @@ -5,7 +5,7 @@ DGEMVNKERNEL = dgemv_n_4.c DGEMVTKERNEL = dgemv_t_4.c ZGEMVNKERNEL = zgemv_n_4.c -ZGEMVTKERNEL = zgemv_t.c +ZGEMVTKERNEL = zgemv_t_4.c CGEMVNKERNEL = cgemv_n_4.c CGEMVTKERNEL = cgemv_t.c From 53b5726b0495dfe443260c050fc817f39e7f52ea Mon Sep 17 00:00:00 2001 From: wernsaar Date: Sat, 13 Sep 2014 15:14:12 +0200 Subject: [PATCH 099/119] added optimized cgemv_t kernel for haswell --- kernel/x86_64/cgemv_t_4.c | 24 +-- kernel/x86_64/cgemv_t_microk_haswell-4.c | 253 +++++++++++++++++++++++ 2 files changed, 263 insertions(+), 14 deletions(-) create mode 100644 kernel/x86_64/cgemv_t_microk_haswell-4.c diff --git a/kernel/x86_64/cgemv_t_4.c b/kernel/x86_64/cgemv_t_4.c index 513c585ec..b383a4869 100644 --- a/kernel/x86_64/cgemv_t_4.c +++ b/kernel/x86_64/cgemv_t_4.c @@ -28,19 +28,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" -/* -#if defined(BULLDOZER) || defined(PILEDRIVER) -#include "zgemv_t_microk_bulldozer-2.c" -#elif defined(HASWELL) -#include "zgemv_t_microk_haswell-2.c" +#if defined(HASWELL) +#include "cgemv_t_microk_haswell-4.c" #endif -*/ #define NBMAX 2048 #ifndef HAVE_KERNEL_4x4 -static void zgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) +static void cgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) { BLASLONG i; FLOAT *a0,*a1,*a2,*a3; @@ -112,7 +108,7 @@ static void zgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT * #ifndef HAVE_KERNEL_4x2 -static void zgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) +static void cgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) { BLASLONG i; FLOAT *a0,*a1; @@ -163,7 +159,7 @@ static void zgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT * #ifndef HAVE_KERNEL_4x1 -static void zgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *alpha) +static void cgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *alpha) { BLASLONG i; FLOAT *a0; @@ -281,7 +277,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, for( i = 0; i < n1 ; i++) { - zgemv_kernel_4x4(NB,ap,xbuffer,y_ptr,alpha); + cgemv_kernel_4x4(NB,ap,xbuffer,y_ptr,alpha); ap[0] += lda4; ap[1] += lda4; ap[2] += lda4; @@ -293,7 +289,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, if ( n2 & 2 ) { - zgemv_kernel_4x2(NB,ap,xbuffer,y_ptr,alpha); + cgemv_kernel_4x2(NB,ap,xbuffer,y_ptr,alpha); a_ptr += lda * 2; y_ptr += 4; @@ -301,7 +297,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, if ( n2 & 1 ) { - zgemv_kernel_4x1(NB,a_ptr,xbuffer,y_ptr,alpha); + cgemv_kernel_4x1(NB,a_ptr,xbuffer,y_ptr,alpha); a_ptr += lda; y_ptr += 2; @@ -314,7 +310,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, 
FLOAT alpha_i, for( i = 0; i < n1 ; i++) { memset(ybuffer,0,32); - zgemv_kernel_4x4(NB,ap,xbuffer,ybuffer,alpha); + cgemv_kernel_4x4(NB,ap,xbuffer,ybuffer,alpha); ap[0] += lda4; ap[1] += lda4; ap[2] += lda4; @@ -339,7 +335,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, for( i = 0; i < n2 ; i++) { memset(ybuffer,0,32); - zgemv_kernel_4x1(NB,a_ptr,xbuffer,ybuffer,alpha); + cgemv_kernel_4x1(NB,a_ptr,xbuffer,ybuffer,alpha); a_ptr += lda; y_ptr[0] += ybuffer[0]; y_ptr[1] += ybuffer[1]; diff --git a/kernel/x86_64/cgemv_t_microk_haswell-4.c b/kernel/x86_64/cgemv_t_microk_haswell-4.c new file mode 100644 index 000000000..e0e876edd --- /dev/null +++ b/kernel/x86_64/cgemv_t_microk_haswell-4.c @@ -0,0 +1,253 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary froms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary from must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define HAVE_KERNEL_4x4 1 +static void cgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) __attribute__ ((noinline)); + +static void cgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) +{ + + BLASLONG register i = 0; + + __asm__ __volatile__ + ( + "vzeroupper \n\t" + + "vxorps %%ymm8 , %%ymm8 , %%ymm8 \n\t" // temp + "vxorps %%ymm9 , %%ymm9 , %%ymm9 \n\t" // temp + "vxorps %%ymm10, %%ymm10, %%ymm10 \n\t" // temp + "vxorps %%ymm11, %%ymm11, %%ymm11 \n\t" // temp + "vxorps %%ymm12, %%ymm12, %%ymm12 \n\t" // temp + "vxorps %%ymm13, %%ymm13, %%ymm13 \n\t" + "vxorps %%ymm14, %%ymm14, %%ymm14 \n\t" + "vxorps %%ymm15, %%ymm15, %%ymm15 \n\t" + + "testq $0x04, %1 \n\t" + "jz .L08LABEL%= \n\t" + + "vmovups (%4,%0,4), %%ymm4 \n\t" // 4 complex values from a0 + "vmovups (%5,%0,4), %%ymm5 \n\t" // 4 complex values from a1 + + "vmovups (%2,%0,4) , %%ymm6 \n\t" // 4 complex values from x + "vpermilps $0xb1, %%ymm6, %%ymm7 \n\t" // exchange real and imap parts + "vblendps $0x55, %%ymm6, %%ymm7, %%ymm0 \n\t" // only the real parts + "vblendps $0x55, %%ymm7, %%ymm6, %%ymm1 \n\t" // only the imag parts + + "vmovups (%6,%0,4), %%ymm6 \n\t" // 4 complex values from a2 + "vmovups (%7,%0,4), %%ymm7 \n\t" // 4 complex values from a3 + + "vfmadd231ps %%ymm4 , %%ymm0, %%ymm8 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1 + "vfmadd231ps %%ymm4 , %%ymm1, 
%%ymm9 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1 + "vfmadd231ps %%ymm5 , %%ymm0, %%ymm10 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1 + "vfmadd231ps %%ymm5 , %%ymm1, %%ymm11 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1 + "vfmadd231ps %%ymm6 , %%ymm0, %%ymm12 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1 + "vfmadd231ps %%ymm6 , %%ymm1, %%ymm13 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1 + "vfmadd231ps %%ymm7 , %%ymm0, %%ymm14 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1 + "vfmadd231ps %%ymm7 , %%ymm1, %%ymm15 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1 + + "addq $8 , %0 \n\t" + "subq $4 , %1 \n\t" + + ".L08LABEL%=: \n\t" + "cmpq $0, %1 \n\t" + "je .L08END%= \n\t" + + ".align 16 \n\t" + ".L01LOOP%=: \n\t" + "prefetcht0 192(%4,%0,4) \n\t" + "vmovups (%4,%0,4), %%ymm4 \n\t" // 4 complex values from a0 + "prefetcht0 192(%5,%0,4) \n\t" + "vmovups (%5,%0,4), %%ymm5 \n\t" // 4 complex values from a1 + + "prefetcht0 192(%2,%0,4) \n\t" + "vmovups (%2,%0,4) , %%ymm6 \n\t" // 4 complex values from x + "vpermilps $0xb1, %%ymm6, %%ymm7 \n\t" // exchange real and imap parts + "vblendps $0x55, %%ymm6, %%ymm7, %%ymm0 \n\t" // only the real parts + "vblendps $0x55, %%ymm7, %%ymm6, %%ymm1 \n\t" // only the imag parts + + "prefetcht0 192(%6,%0,4) \n\t" + "vmovups (%6,%0,4), %%ymm6 \n\t" // 4 complex values from a2 + "prefetcht0 192(%7,%0,4) \n\t" + "vmovups (%7,%0,4), %%ymm7 \n\t" // 4 complex values from a3 + + "vfmadd231ps %%ymm4 , %%ymm0, %%ymm8 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1 + "vfmadd231ps %%ymm4 , %%ymm1, %%ymm9 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1 + "vfmadd231ps %%ymm5 , %%ymm0, %%ymm10 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1 + "vfmadd231ps %%ymm5 , %%ymm1, %%ymm11 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1 + "vfmadd231ps %%ymm6 , %%ymm0, %%ymm12 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1 + "vfmadd231ps %%ymm6 , %%ymm1, %%ymm13 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1 + "vfmadd231ps %%ymm7 , %%ymm0, %%ymm14 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1 + "vfmadd231ps %%ymm7 , %%ymm1, %%ymm15 
\n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1 + + "vmovups 32(%4,%0,4), %%ymm4 \n\t" // 4 complex values from a0 + "vmovups 32(%5,%0,4), %%ymm5 \n\t" // 4 complex values from a1 + + "vmovups 32(%2,%0,4) , %%ymm6 \n\t" // 4 complex values from x + "vpermilps $0xb1, %%ymm6, %%ymm7 \n\t" // exchange real and imap parts + "vblendps $0x55, %%ymm6, %%ymm7, %%ymm0 \n\t" // only the real parts + "vblendps $0x55, %%ymm7, %%ymm6, %%ymm1 \n\t" // only the imag parts + + "vmovups 32(%6,%0,4), %%ymm6 \n\t" // 4 complex values from a2 + "vmovups 32(%7,%0,4), %%ymm7 \n\t" // 4 complex values from a3 + + "vfmadd231ps %%ymm4 , %%ymm0, %%ymm8 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1 + "vfmadd231ps %%ymm4 , %%ymm1, %%ymm9 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1 + "vfmadd231ps %%ymm5 , %%ymm0, %%ymm10 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1 + "vfmadd231ps %%ymm5 , %%ymm1, %%ymm11 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1 + "vfmadd231ps %%ymm6 , %%ymm0, %%ymm12 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1 + "vfmadd231ps %%ymm6 , %%ymm1, %%ymm13 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1 + "vfmadd231ps %%ymm7 , %%ymm0, %%ymm14 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1 + "vfmadd231ps %%ymm7 , %%ymm1, %%ymm15 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1 + + "addq $16 , %0 \n\t" + "subq $8 , %1 \n\t" + "jnz .L01LOOP%= \n\t" + + ".L08END%=: \n\t" + + "vbroadcastss (%8) , %%xmm0 \n\t" // value from alpha + "vbroadcastss 4(%8) , %%xmm1 \n\t" // value from alpha + + +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + "vpermilps $0xb1 , %%ymm9 , %%ymm9 \n\t" + "vpermilps $0xb1 , %%ymm11, %%ymm11 \n\t" + "vpermilps $0xb1 , %%ymm13, %%ymm13 \n\t" + "vpermilps $0xb1 , %%ymm15, %%ymm15 \n\t" + "vaddsubps %%ymm9 , %%ymm8, %%ymm8 \n\t" + "vaddsubps %%ymm11, %%ymm10, %%ymm10 \n\t" + "vaddsubps %%ymm13, %%ymm12, %%ymm12 \n\t" + "vaddsubps %%ymm15, %%ymm14, %%ymm14 \n\t" +#else + "vpermilps $0xb1 , %%ymm8 , %%ymm8 \n\t" + "vpermilps $0xb1 , %%ymm10, %%ymm10 \n\t" + "vpermilps $0xb1 , 
%%ymm12, %%ymm12 \n\t" + "vpermilps $0xb1 , %%ymm14, %%ymm14 \n\t" + "vaddsubps %%ymm8 , %%ymm9 , %%ymm8 \n\t" + "vaddsubps %%ymm10, %%ymm11, %%ymm10 \n\t" + "vaddsubps %%ymm12, %%ymm13, %%ymm12 \n\t" + "vaddsubps %%ymm14, %%ymm15, %%ymm14 \n\t" + "vpermilps $0xb1 , %%ymm8 , %%ymm8 \n\t" + "vpermilps $0xb1 , %%ymm10, %%ymm10 \n\t" + "vpermilps $0xb1 , %%ymm12, %%ymm12 \n\t" + "vpermilps $0xb1 , %%ymm14, %%ymm14 \n\t" +#endif + + "vmovsd (%3), %%xmm4 \n\t" // read y + "vmovsd 8(%3), %%xmm5 \n\t" + "vmovsd 16(%3), %%xmm6 \n\t" + "vmovsd 24(%3), %%xmm7 \n\t" + + "vextractf128 $1, %%ymm8 , %%xmm9 \n\t" + "vextractf128 $1, %%ymm10, %%xmm11 \n\t" + "vextractf128 $1, %%ymm12, %%xmm13 \n\t" + "vextractf128 $1, %%ymm14, %%xmm15 \n\t" + + "vaddps %%xmm8 , %%xmm9 , %%xmm8 \n\t" + "vaddps %%xmm10, %%xmm11, %%xmm10 \n\t" + "vaddps %%xmm12, %%xmm13, %%xmm12 \n\t" + "vaddps %%xmm14, %%xmm15, %%xmm14 \n\t" + + "vshufpd $0x1, %%xmm8 , %%xmm8 , %%xmm9 \n\t" + "vshufpd $0x1, %%xmm10, %%xmm10, %%xmm11 \n\t" + "vshufpd $0x1, %%xmm12, %%xmm12, %%xmm13 \n\t" + "vshufpd $0x1, %%xmm14, %%xmm14, %%xmm15 \n\t" + + "vaddps %%xmm8 , %%xmm9 , %%xmm8 \n\t" + "vaddps %%xmm10, %%xmm11, %%xmm10 \n\t" + "vaddps %%xmm12, %%xmm13, %%xmm12 \n\t" + "vaddps %%xmm14, %%xmm15, %%xmm14 \n\t" + + + "vmulps %%xmm8 , %%xmm1 , %%xmm9 \n\t" // t_r * alpha_i , t_i * alpha_i + "vmulps %%xmm8 , %%xmm0 , %%xmm8 \n\t" // t_r * alpha_r , t_i * alpha_r + "vmulps %%xmm10, %%xmm1 , %%xmm11 \n\t" // t_r * alpha_i , t_i * alpha_i + "vmulps %%xmm10, %%xmm0 , %%xmm10 \n\t" // t_r * alpha_r , t_i * alpha_r + "vmulps %%xmm12, %%xmm1 , %%xmm13 \n\t" // t_r * alpha_i , t_i * alpha_i + "vmulps %%xmm12, %%xmm0 , %%xmm12 \n\t" // t_r * alpha_r , t_i * alpha_r + "vmulps %%xmm14, %%xmm1 , %%xmm15 \n\t" // t_r * alpha_i , t_i * alpha_i + "vmulps %%xmm14, %%xmm0 , %%xmm14 \n\t" // t_r * alpha_r , t_i * alpha_r + +#if !defined(XCONJ) + "vpermilps $0xb1 , %%xmm9 , %%xmm9 \n\t" + "vpermilps $0xb1 , %%xmm11, %%xmm11 \n\t" + "vpermilps 
$0xb1 , %%xmm13, %%xmm13 \n\t" + "vpermilps $0xb1 , %%xmm15, %%xmm15 \n\t" + "vaddsubps %%xmm9 , %%xmm8, %%xmm8 \n\t" + "vaddsubps %%xmm11, %%xmm10, %%xmm10 \n\t" + "vaddsubps %%xmm13, %%xmm12, %%xmm12 \n\t" + "vaddsubps %%xmm15, %%xmm14, %%xmm14 \n\t" +#else + "vpermilps $0xb1 , %%xmm8 , %%xmm8 \n\t" + "vpermilps $0xb1 , %%xmm10, %%xmm10 \n\t" + "vpermilps $0xb1 , %%xmm12, %%xmm12 \n\t" + "vpermilps $0xb1 , %%xmm14, %%xmm14 \n\t" + "vaddsubps %%xmm8 , %%xmm9 , %%xmm8 \n\t" + "vaddsubps %%xmm10, %%xmm11, %%xmm10 \n\t" + "vaddsubps %%xmm12, %%xmm13, %%xmm12 \n\t" + "vaddsubps %%xmm14, %%xmm15, %%xmm14 \n\t" + "vpermilps $0xb1 , %%xmm8 , %%xmm8 \n\t" + "vpermilps $0xb1 , %%xmm10, %%xmm10 \n\t" + "vpermilps $0xb1 , %%xmm12, %%xmm12 \n\t" + "vpermilps $0xb1 , %%xmm14, %%xmm14 \n\t" +#endif + + + "vaddps %%xmm8 , %%xmm4 , %%xmm8 \n\t" + "vaddps %%xmm10, %%xmm5 , %%xmm10 \n\t" + "vaddps %%xmm12, %%xmm6 , %%xmm12 \n\t" + "vaddps %%xmm14, %%xmm7 , %%xmm14 \n\t" + + "vmovsd %%xmm8 , (%3) \n\t" + "vmovsd %%xmm10, 8(%3) \n\t" + "vmovsd %%xmm12, 16(%3) \n\t" + "vmovsd %%xmm14, 24(%3) \n\t" + + "vzeroupper \n\t" + + : + : + "r" (i), // 0 + "r" (n), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (ap[0]), // 4 + "r" (ap[1]), // 5 + "r" (ap[2]), // 6 + "r" (ap[3]), // 7 + "r" (alpha) // 8 + : "cc", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + +} + + From 8f100a14f2a00ce33e407308d39b36fecdeafd8f Mon Sep 17 00:00:00 2001 From: wernsaar Date: Sat, 13 Sep 2014 16:13:27 +0200 Subject: [PATCH 100/119] optimized cgemv_t kernel for haswell --- kernel/x86_64/KERNEL.HASWELL | 2 +- kernel/x86_64/cgemv_t_microk_haswell-4.c | 286 +++++++++++++++++++++++ 2 files changed, 287 insertions(+), 1 deletion(-) diff --git a/kernel/x86_64/KERNEL.HASWELL b/kernel/x86_64/KERNEL.HASWELL index 6e7d71e51..a621b4484 100644 --- a/kernel/x86_64/KERNEL.HASWELL +++ 
b/kernel/x86_64/KERNEL.HASWELL @@ -8,7 +8,7 @@ ZGEMVNKERNEL = zgemv_n_4.c ZGEMVTKERNEL = zgemv_t_4.c CGEMVNKERNEL = cgemv_n_4.c -CGEMVTKERNEL = cgemv_t.c +CGEMVTKERNEL = cgemv_t_4.c SGEMMKERNEL = sgemm_kernel_16x4_haswell.S SGEMMINCOPY = ../generic/gemm_ncopy_16.c diff --git a/kernel/x86_64/cgemv_t_microk_haswell-4.c b/kernel/x86_64/cgemv_t_microk_haswell-4.c index e0e876edd..2c506c9e9 100644 --- a/kernel/x86_64/cgemv_t_microk_haswell-4.c +++ b/kernel/x86_64/cgemv_t_microk_haswell-4.c @@ -251,3 +251,289 @@ static void cgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT } +#define HAVE_KERNEL_4x2 1 +static void cgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) __attribute__ ((noinline)); + +static void cgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) +{ + + BLASLONG register i = 0; + + __asm__ __volatile__ + ( + "vzeroupper \n\t" + + "vxorps %%ymm8 , %%ymm8 , %%ymm8 \n\t" // temp + "vxorps %%ymm9 , %%ymm9 , %%ymm9 \n\t" // temp + "vxorps %%ymm10, %%ymm10, %%ymm10 \n\t" // temp + "vxorps %%ymm11, %%ymm11, %%ymm11 \n\t" // temp + + "testq $0x04, %1 \n\t" + "jz .L08LABEL%= \n\t" + + "vmovups (%4,%0,4), %%ymm4 \n\t" // 4 complex values from a0 + "vmovups (%5,%0,4), %%ymm5 \n\t" // 4 complex values from a1 + + "vmovups (%2,%0,4) , %%ymm6 \n\t" // 4 complex values from x + "vpermilps $0xb1, %%ymm6, %%ymm7 \n\t" // exchange real and imap parts + "vblendps $0x55, %%ymm6, %%ymm7, %%ymm0 \n\t" // only the real parts + "vblendps $0x55, %%ymm7, %%ymm6, %%ymm1 \n\t" // only the imag parts + + + "vfmadd231ps %%ymm4 , %%ymm0, %%ymm8 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1 + "vfmadd231ps %%ymm4 , %%ymm1, %%ymm9 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1 + "vfmadd231ps %%ymm5 , %%ymm0, %%ymm10 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1 + "vfmadd231ps %%ymm5 , %%ymm1, %%ymm11 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1 + + "addq $8 , %0 \n\t" + "subq $4 , %1 \n\t" + + ".L08LABEL%=: \n\t" + "cmpq $0, %1 \n\t" + "je 
.L08END%= \n\t" + + ".align 16 \n\t" + ".L01LOOP%=: \n\t" + "prefetcht0 192(%4,%0,4) \n\t" + "vmovups (%4,%0,4), %%ymm4 \n\t" // 4 complex values from a0 + "prefetcht0 192(%5,%0,4) \n\t" + "vmovups (%5,%0,4), %%ymm5 \n\t" // 4 complex values from a1 + + "prefetcht0 192(%2,%0,4) \n\t" + "vmovups (%2,%0,4) , %%ymm6 \n\t" // 4 complex values from x + "vpermilps $0xb1, %%ymm6, %%ymm7 \n\t" // exchange real and imap parts + "vblendps $0x55, %%ymm6, %%ymm7, %%ymm0 \n\t" // only the real parts + "vblendps $0x55, %%ymm7, %%ymm6, %%ymm1 \n\t" // only the imag parts + + "vfmadd231ps %%ymm4 , %%ymm0, %%ymm8 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1 + "vfmadd231ps %%ymm4 , %%ymm1, %%ymm9 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1 + "vfmadd231ps %%ymm5 , %%ymm0, %%ymm10 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1 + "vfmadd231ps %%ymm5 , %%ymm1, %%ymm11 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1 + + "vmovups 32(%4,%0,4), %%ymm4 \n\t" // 4 complex values from a0 + "vmovups 32(%5,%0,4), %%ymm5 \n\t" // 4 complex values from a1 + + "vmovups 32(%2,%0,4) , %%ymm6 \n\t" // 4 complex values from x + "vpermilps $0xb1, %%ymm6, %%ymm7 \n\t" // exchange real and imap parts + "vblendps $0x55, %%ymm6, %%ymm7, %%ymm0 \n\t" // only the real parts + "vblendps $0x55, %%ymm7, %%ymm6, %%ymm1 \n\t" // only the imag parts + + "vfmadd231ps %%ymm4 , %%ymm0, %%ymm8 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1 + "vfmadd231ps %%ymm4 , %%ymm1, %%ymm9 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1 + "vfmadd231ps %%ymm5 , %%ymm0, %%ymm10 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1 + "vfmadd231ps %%ymm5 , %%ymm1, %%ymm11 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1 + + "addq $16 , %0 \n\t" + "subq $8 , %1 \n\t" + "jnz .L01LOOP%= \n\t" + + ".L08END%=: \n\t" + + "vbroadcastss (%6) , %%xmm0 \n\t" // value from alpha + "vbroadcastss 4(%6) , %%xmm1 \n\t" // value from alpha + + +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + "vpermilps $0xb1 , %%ymm9 , %%ymm9 \n\t" + "vpermilps $0xb1 , %%ymm11, %%ymm11 
\n\t" + "vaddsubps %%ymm9 , %%ymm8, %%ymm8 \n\t" + "vaddsubps %%ymm11, %%ymm10, %%ymm10 \n\t" +#else + "vpermilps $0xb1 , %%ymm8 , %%ymm8 \n\t" + "vpermilps $0xb1 , %%ymm10, %%ymm10 \n\t" + "vaddsubps %%ymm8 , %%ymm9 , %%ymm8 \n\t" + "vaddsubps %%ymm10, %%ymm11, %%ymm10 \n\t" + "vpermilps $0xb1 , %%ymm8 , %%ymm8 \n\t" + "vpermilps $0xb1 , %%ymm10, %%ymm10 \n\t" +#endif + + "vmovsd (%3), %%xmm4 \n\t" // read y + "vmovsd 8(%3), %%xmm5 \n\t" + + "vextractf128 $1, %%ymm8 , %%xmm9 \n\t" + "vextractf128 $1, %%ymm10, %%xmm11 \n\t" + + "vaddps %%xmm8 , %%xmm9 , %%xmm8 \n\t" + "vaddps %%xmm10, %%xmm11, %%xmm10 \n\t" + + "vshufpd $0x1, %%xmm8 , %%xmm8 , %%xmm9 \n\t" + "vshufpd $0x1, %%xmm10, %%xmm10, %%xmm11 \n\t" + + "vaddps %%xmm8 , %%xmm9 , %%xmm8 \n\t" + "vaddps %%xmm10, %%xmm11, %%xmm10 \n\t" + + "vmulps %%xmm8 , %%xmm1 , %%xmm9 \n\t" // t_r * alpha_i , t_i * alpha_i + "vmulps %%xmm8 , %%xmm0 , %%xmm8 \n\t" // t_r * alpha_r , t_i * alpha_r + "vmulps %%xmm10, %%xmm1 , %%xmm11 \n\t" // t_r * alpha_i , t_i * alpha_i + "vmulps %%xmm10, %%xmm0 , %%xmm10 \n\t" // t_r * alpha_r , t_i * alpha_r + +#if !defined(XCONJ) + "vpermilps $0xb1 , %%xmm9 , %%xmm9 \n\t" + "vpermilps $0xb1 , %%xmm11, %%xmm11 \n\t" + "vaddsubps %%xmm9 , %%xmm8, %%xmm8 \n\t" + "vaddsubps %%xmm11, %%xmm10, %%xmm10 \n\t" +#else + "vpermilps $0xb1 , %%xmm8 , %%xmm8 \n\t" + "vpermilps $0xb1 , %%xmm10, %%xmm10 \n\t" + "vaddsubps %%xmm8 , %%xmm9 , %%xmm8 \n\t" + "vaddsubps %%xmm10, %%xmm11, %%xmm10 \n\t" + "vpermilps $0xb1 , %%xmm8 , %%xmm8 \n\t" + "vpermilps $0xb1 , %%xmm10, %%xmm10 \n\t" +#endif + + + "vaddps %%xmm8 , %%xmm4 , %%xmm8 \n\t" + "vaddps %%xmm10, %%xmm5 , %%xmm10 \n\t" + + "vmovsd %%xmm8 , (%3) \n\t" + "vmovsd %%xmm10, 8(%3) \n\t" + + "vzeroupper \n\t" + + : + : + "r" (i), // 0 + "r" (n), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (ap[0]), // 4 + "r" (ap[1]), // 5 + "r" (alpha) // 6 + : "cc", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", 
"%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + +} + + +#define HAVE_KERNEL_4x1 1 +static void cgemv_kernel_4x1( BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *alpha) __attribute__ ((noinline)); + +static void cgemv_kernel_4x1( BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *alpha) +{ + + BLASLONG register i = 0; + + __asm__ __volatile__ + ( + "vzeroupper \n\t" + + "vxorps %%ymm8 , %%ymm8 , %%ymm8 \n\t" // temp + "vxorps %%ymm9 , %%ymm9 , %%ymm9 \n\t" // temp + + "testq $0x04, %1 \n\t" + "jz .L08LABEL%= \n\t" + + "vmovups (%4,%0,4), %%ymm4 \n\t" // 4 complex values from a0 + + "vmovups (%2,%0,4) , %%ymm6 \n\t" // 4 complex values from x + "vpermilps $0xb1, %%ymm6, %%ymm7 \n\t" // exchange real and imap parts + "vblendps $0x55, %%ymm6, %%ymm7, %%ymm0 \n\t" // only the real parts + "vblendps $0x55, %%ymm7, %%ymm6, %%ymm1 \n\t" // only the imag parts + + + "vfmadd231ps %%ymm4 , %%ymm0, %%ymm8 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1 + "vfmadd231ps %%ymm4 , %%ymm1, %%ymm9 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1 + + "addq $8 , %0 \n\t" + "subq $4 , %1 \n\t" + + ".L08LABEL%=: \n\t" + "cmpq $0, %1 \n\t" + "je .L08END%= \n\t" + + ".align 16 \n\t" + ".L01LOOP%=: \n\t" + "prefetcht0 192(%4,%0,4) \n\t" + "vmovups (%4,%0,4), %%ymm4 \n\t" // 4 complex values from a0 + + "prefetcht0 192(%2,%0,4) \n\t" + "vmovups (%2,%0,4) , %%ymm6 \n\t" // 4 complex values from x + "vpermilps $0xb1, %%ymm6, %%ymm7 \n\t" // exchange real and imap parts + "vblendps $0x55, %%ymm6, %%ymm7, %%ymm0 \n\t" // only the real parts + "vblendps $0x55, %%ymm7, %%ymm6, %%ymm1 \n\t" // only the imag parts + + "vfmadd231ps %%ymm4 , %%ymm0, %%ymm8 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1 + "vfmadd231ps %%ymm4 , %%ymm1, %%ymm9 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1 + + "vmovups 32(%4,%0,4), %%ymm4 \n\t" // 4 complex values from a0 + + "vmovups 32(%2,%0,4) , %%ymm6 \n\t" // 4 complex values from x + "vpermilps $0xb1, %%ymm6, %%ymm7 \n\t" // exchange real and imap parts + "vblendps 
$0x55, %%ymm6, %%ymm7, %%ymm0 \n\t" // only the real parts + "vblendps $0x55, %%ymm7, %%ymm6, %%ymm1 \n\t" // only the imag parts + + "vfmadd231ps %%ymm4 , %%ymm0, %%ymm8 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1 + "vfmadd231ps %%ymm4 , %%ymm1, %%ymm9 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1 + + "addq $16 , %0 \n\t" + "subq $8 , %1 \n\t" + "jnz .L01LOOP%= \n\t" + + ".L08END%=: \n\t" + + "vbroadcastss (%5) , %%xmm0 \n\t" // value from alpha + "vbroadcastss 4(%5) , %%xmm1 \n\t" // value from alpha + + +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + "vpermilps $0xb1 , %%ymm9 , %%ymm9 \n\t" + "vaddsubps %%ymm9 , %%ymm8, %%ymm8 \n\t" +#else + "vpermilps $0xb1 , %%ymm8 , %%ymm8 \n\t" + "vaddsubps %%ymm8 , %%ymm9 , %%ymm8 \n\t" + "vpermilps $0xb1 , %%ymm8 , %%ymm8 \n\t" +#endif + + "vmovsd (%3), %%xmm4 \n\t" // read y + + "vextractf128 $1, %%ymm8 , %%xmm9 \n\t" + + "vaddps %%xmm8 , %%xmm9 , %%xmm8 \n\t" + + "vshufpd $0x1, %%xmm8 , %%xmm8 , %%xmm9 \n\t" + + "vaddps %%xmm8 , %%xmm9 , %%xmm8 \n\t" + + "vmulps %%xmm8 , %%xmm1 , %%xmm9 \n\t" // t_r * alpha_i , t_i * alpha_i + "vmulps %%xmm8 , %%xmm0 , %%xmm8 \n\t" // t_r * alpha_r , t_i * alpha_r + +#if !defined(XCONJ) + "vpermilps $0xb1 , %%xmm9 , %%xmm9 \n\t" + "vaddsubps %%xmm9 , %%xmm8, %%xmm8 \n\t" +#else + "vpermilps $0xb1 , %%xmm8 , %%xmm8 \n\t" + "vaddsubps %%xmm8 , %%xmm9 , %%xmm8 \n\t" + "vpermilps $0xb1 , %%xmm8 , %%xmm8 \n\t" +#endif + + + "vaddps %%xmm8 , %%xmm4 , %%xmm8 \n\t" + + "vmovsd %%xmm8 , (%3) \n\t" + + "vzeroupper \n\t" + + : + : + "r" (i), // 0 + "r" (n), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (ap), // 4 + "r" (alpha) // 5 + : "cc", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + +} + + From 9908b6031cf3b95f61ec377e6969b6328fbd655f Mon Sep 17 00:00:00 2001 From: wernsaar Date: Sat, 13 Sep 2014 16:26:53 +0200 Subject: [PATCH 101/119] bugfix 
in KERNEL.PILEDRIVER --- kernel/x86_64/KERNEL.PILEDRIVER | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/x86_64/KERNEL.PILEDRIVER b/kernel/x86_64/KERNEL.PILEDRIVER index fab34efd3..55285e3d3 100644 --- a/kernel/x86_64/KERNEL.PILEDRIVER +++ b/kernel/x86_64/KERNEL.PILEDRIVER @@ -2,10 +2,11 @@ SGEMVNKERNEL = sgemv_n_4.c SGEMVTKERNEL = sgemv_t_4.c ZGEMVNKERNEL = zgemv_n_dup.S -ZGEMVTKERNEL = zgemv_t_4.S +ZGEMVTKERNEL = zgemv_t_4.c DGEMVNKERNEL = dgemv_n_bulldozer.S DGEMVTKERNEL = dgemv_t_bulldozer.S + DDOTKERNEL = ddot_bulldozer.S DCOPYKERNEL = dcopy_bulldozer.S From 2ac1e076c1060300f4448d767c0796af418f6e12 Mon Sep 17 00:00:00 2001 From: wernsaar Date: Sun, 14 Sep 2014 09:02:05 +0200 Subject: [PATCH 102/119] added optimized zgemv_n kernel for sandybridge --- kernel/x86_64/KERNEL.SANDYBRIDGE | 2 +- kernel/x86_64/zgemv_n_4.c | 2 + kernel/x86_64/zgemv_n_microk_sandy-4.c | 417 +++++++++++++++++++++++++ 3 files changed, 420 insertions(+), 1 deletion(-) create mode 100644 kernel/x86_64/zgemv_n_microk_sandy-4.c diff --git a/kernel/x86_64/KERNEL.SANDYBRIDGE b/kernel/x86_64/KERNEL.SANDYBRIDGE index dfc2882aa..61e13a116 100644 --- a/kernel/x86_64/KERNEL.SANDYBRIDGE +++ b/kernel/x86_64/KERNEL.SANDYBRIDGE @@ -1,7 +1,7 @@ SGEMVNKERNEL = sgemv_n_4.c SGEMVTKERNEL = sgemv_t_4.c -ZGEMVNKERNEL = zgemv_n.c +ZGEMVNKERNEL = zgemv_n_4.c SGEMMKERNEL = sgemm_kernel_16x4_sandy.S diff --git a/kernel/x86_64/zgemv_n_4.c b/kernel/x86_64/zgemv_n_4.c index c9b5b570e..5ace6123b 100644 --- a/kernel/x86_64/zgemv_n_4.c +++ b/kernel/x86_64/zgemv_n_4.c @@ -32,6 +32,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#if defined(HASWELL) #include "zgemv_n_microk_haswell-4.c" +#elif defined(SANDYBRIDGE) +#include "zgemv_n_microk_sandy-4.c" #endif diff --git a/kernel/x86_64/zgemv_n_microk_sandy-4.c b/kernel/x86_64/zgemv_n_microk_sandy-4.c new file mode 100644 index 000000000..5567bae2b --- /dev/null +++ b/kernel/x86_64/zgemv_n_microk_sandy-4.c @@ -0,0 +1,417 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#define HAVE_KERNEL_4x4 1 +static void zgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) __attribute__ ((noinline)); + +static void zgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) +{ + + BLASLONG register i = 0; + + __asm__ __volatile__ + ( + "vzeroupper \n\t" + + "vbroadcastsd (%2), %%ymm0 \n\t" // real part x0 + "vbroadcastsd 8(%2), %%ymm1 \n\t" // imag part x0 + "vbroadcastsd 16(%2), %%ymm2 \n\t" // real part x1 + "vbroadcastsd 24(%2), %%ymm3 \n\t" // imag part x1 + "vbroadcastsd 32(%2), %%ymm4 \n\t" // real part x2 + "vbroadcastsd 40(%2), %%ymm5 \n\t" // imag part x2 + "vbroadcastsd 48(%2), %%ymm6 \n\t" // real part x3 + "vbroadcastsd 56(%2), %%ymm7 \n\t" // imag part x3 + + + ".align 16 \n\t" + ".L01LOOP%=: \n\t" + + "prefetcht0 256(%4,%0,8) \n\t" + "vmovups (%4,%0,8), %%ymm8 \n\t" // 2 complex values form a0 + "vmovups 32(%4,%0,8), %%ymm9 \n\t" // 2 complex values form a0 + + "vmulpd %%ymm8 , %%ymm0 , %%ymm12 \n\t" + "vmulpd %%ymm8 , %%ymm1 , %%ymm13 \n\t" + "prefetcht0 256(%5,%0,8) \n\t" + "vmulpd %%ymm9 , %%ymm0 , %%ymm14 \n\t" + "vmovups (%5,%0,8), %%ymm8 \n\t" // 2 complex values form a0 + "vmulpd %%ymm9 , %%ymm1 , %%ymm15 \n\t" + "vmovups 32(%5,%0,8), %%ymm9 \n\t" // 2 complex values form a0 + + "vmulpd %%ymm8 , %%ymm2 , %%ymm10 \n\t" + "vaddpd %%ymm12, %%ymm10, %%ymm12 \n\t" + "vmulpd %%ymm8 , %%ymm3 , %%ymm11 \n\t" + "vaddpd %%ymm13, %%ymm11, %%ymm13 \n\t" + "prefetcht0 256(%6,%0,8) \n\t" + "vmulpd %%ymm9 , %%ymm2 , %%ymm10 \n\t" + "vaddpd %%ymm14, %%ymm10, %%ymm14 \n\t" + "vmovups (%6,%0,8), %%ymm8 \n\t" // 2 complex values form a0 + "vmulpd %%ymm9 , %%ymm3 , %%ymm11 \n\t" + "vaddpd %%ymm15, %%ymm11, %%ymm15 \n\t" + + "vmovups 32(%6,%0,8), %%ymm9 \n\t" // 2 complex values form a0 + + "vmulpd %%ymm8 , %%ymm4 , %%ymm10 \n\t" + "vaddpd %%ymm12, %%ymm10, %%ymm12 \n\t" + "vmulpd %%ymm8 , %%ymm5 , %%ymm11 \n\t" + "vaddpd %%ymm13, %%ymm11, %%ymm13 
\n\t" + "prefetcht0 256(%7,%0,8) \n\t" + "vmulpd %%ymm9 , %%ymm4 , %%ymm10 \n\t" + "vaddpd %%ymm14, %%ymm10, %%ymm14 \n\t" + "vmovups (%7,%0,8), %%ymm8 \n\t" // 2 complex values form a0 + "vmulpd %%ymm9 , %%ymm5 , %%ymm11 \n\t" + "vaddpd %%ymm15, %%ymm11, %%ymm15 \n\t" + + "vmovups 32(%7,%0,8), %%ymm9 \n\t" // 2 complex values form a0 + + "vmulpd %%ymm8 , %%ymm6 , %%ymm10 \n\t" + "vaddpd %%ymm12, %%ymm10, %%ymm12 \n\t" + "vmulpd %%ymm8 , %%ymm7 , %%ymm11 \n\t" + "vaddpd %%ymm13, %%ymm11, %%ymm13 \n\t" + "vmulpd %%ymm9 , %%ymm6 , %%ymm10 \n\t" + "vaddpd %%ymm14, %%ymm10, %%ymm14 \n\t" + "vmulpd %%ymm9 , %%ymm7 , %%ymm11 \n\t" + "vaddpd %%ymm15, %%ymm11, %%ymm15 \n\t" + + "prefetcht0 256(%3,%0,8) \n\t" + "vmovups (%3,%0,8), %%ymm10 \n\t" + "vmovups 32(%3,%0,8), %%ymm11 \n\t" + +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + "vpermilpd $0x5 , %%ymm13, %%ymm13 \n\t" + "vpermilpd $0x5 , %%ymm15, %%ymm15 \n\t" + "vaddsubpd %%ymm13, %%ymm12, %%ymm8 \n\t" + "vaddsubpd %%ymm15, %%ymm14, %%ymm9 \n\t" +#else + "vpermilpd $0x5 , %%ymm12, %%ymm12 \n\t" + "vpermilpd $0x5 , %%ymm14, %%ymm14 \n\t" + "vaddsubpd %%ymm12, %%ymm13, %%ymm8 \n\t" + "vaddsubpd %%ymm14, %%ymm15, %%ymm9 \n\t" + "vpermilpd $0x5 , %%ymm8 , %%ymm8 \n\t" + "vpermilpd $0x5 , %%ymm9 , %%ymm9 \n\t" +#endif + + "vaddpd %%ymm8, %%ymm10, %%ymm12 \n\t" + "vaddpd %%ymm9, %%ymm11, %%ymm13 \n\t" + + "vmovups %%ymm12, (%3,%0,8) \n\t" // 2 complex values to y + "vmovups %%ymm13, 32(%3,%0,8) \n\t" + + "addq $8 , %0 \n\t" + "subq $4 , %1 \n\t" + "jnz .L01LOOP%= \n\t" + "vzeroupper \n\t" + + : + : + "r" (i), // 0 + "r" (n), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (ap[0]), // 4 + "r" (ap[1]), // 5 + "r" (ap[2]), // 6 + "r" (ap[3]) // 7 + : "cc", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + +} + + +#define HAVE_KERNEL_4x2 1 +static void zgemv_kernel_4x2( 
BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) __attribute__ ((noinline)); + +static void zgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) +{ + + BLASLONG register i = 0; + + __asm__ __volatile__ + ( + "vzeroupper \n\t" + + "vbroadcastsd (%2), %%ymm0 \n\t" // real part x0 + "vbroadcastsd 8(%2), %%ymm1 \n\t" // imag part x0 + "vbroadcastsd 16(%2), %%ymm2 \n\t" // real part x1 + "vbroadcastsd 24(%2), %%ymm3 \n\t" // imag part x1 + + ".align 16 \n\t" + ".L01LOOP%=: \n\t" + + "prefetcht0 256(%4,%0,8) \n\t" + "vmovups (%4,%0,8), %%ymm8 \n\t" // 2 complex values form a0 + "vmovups 32(%4,%0,8), %%ymm9 \n\t" // 2 complex values form a0 + "vmulpd %%ymm8 , %%ymm0 , %%ymm12 \n\t" + "vmulpd %%ymm8 , %%ymm1 , %%ymm13 \n\t" + + "prefetcht0 256(%5,%0,8) \n\t" + "vmulpd %%ymm9 , %%ymm0 , %%ymm14 \n\t" + "vmovups (%5,%0,8), %%ymm8 \n\t" // 2 complex values form a0 + "vmulpd %%ymm9 , %%ymm1 , %%ymm15 \n\t" + "vmovups 32(%5,%0,8), %%ymm9 \n\t" // 2 complex values form a0 + + "vmulpd %%ymm8 , %%ymm2 , %%ymm10 \n\t" + "vaddpd %%ymm12, %%ymm10, %%ymm12 \n\t" + "vmulpd %%ymm8 , %%ymm3 , %%ymm11 \n\t" + "vaddpd %%ymm13, %%ymm11, %%ymm13 \n\t" + + "vmulpd %%ymm9 , %%ymm2 , %%ymm10 \n\t" + "vaddpd %%ymm14, %%ymm10, %%ymm14 \n\t" + "vmulpd %%ymm9 , %%ymm3 , %%ymm11 \n\t" + "vaddpd %%ymm15, %%ymm11, %%ymm15 \n\t" + + + "prefetcht0 256(%3,%0,8) \n\t" + "vmovups (%3,%0,8), %%ymm10 \n\t" + "vmovups 32(%3,%0,8), %%ymm11 \n\t" + +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + "vpermilpd $0x5 , %%ymm13, %%ymm13 \n\t" + "vpermilpd $0x5 , %%ymm15, %%ymm15 \n\t" + "vaddsubpd %%ymm13, %%ymm12, %%ymm8 \n\t" + "vaddsubpd %%ymm15, %%ymm14, %%ymm9 \n\t" +#else + "vpermilpd $0x5 , %%ymm12, %%ymm12 \n\t" + "vpermilpd $0x5 , %%ymm14, %%ymm14 \n\t" + "vaddsubpd %%ymm12, %%ymm13, %%ymm8 \n\t" + "vaddsubpd %%ymm14, %%ymm15, %%ymm9 \n\t" + "vpermilpd $0x5 , %%ymm8 , %%ymm8 \n\t" + "vpermilpd $0x5 , %%ymm9 , %%ymm9 \n\t" +#endif + + "vaddpd %%ymm8, %%ymm10, %%ymm12 
\n\t" + "vaddpd %%ymm9, %%ymm11, %%ymm13 \n\t" + + "vmovups %%ymm12, (%3,%0,8) \n\t" // 2 complex values to y + "vmovups %%ymm13, 32(%3,%0,8) \n\t" + + "addq $8 , %0 \n\t" + "subq $4 , %1 \n\t" + "jnz .L01LOOP%= \n\t" + "vzeroupper \n\t" + + : + : + "r" (i), // 0 + "r" (n), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (ap[0]), // 4 + "r" (ap[1]) // 5 + : "cc", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + +} + + +#define HAVE_KERNEL_4x1 1 +static void zgemv_kernel_4x1( BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) __attribute__ ((noinline)); + +static void zgemv_kernel_4x1( BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) +{ + + BLASLONG register i = 0; + + __asm__ __volatile__ + ( + "vzeroupper \n\t" + + "vbroadcastsd (%2), %%ymm0 \n\t" // real part x0 + "vbroadcastsd 8(%2), %%ymm1 \n\t" // imag part x0 + + ".align 16 \n\t" + ".L01LOOP%=: \n\t" + + "prefetcht0 256(%4,%0,8) \n\t" + "vmovups (%4,%0,8), %%ymm8 \n\t" // 2 complex values form a0 + "vmovups 32(%4,%0,8), %%ymm9 \n\t" // 2 complex values form a0 + "vmulpd %%ymm8 , %%ymm0 , %%ymm12 \n\t" + "vmulpd %%ymm8 , %%ymm1 , %%ymm13 \n\t" + + "vmulpd %%ymm9 , %%ymm0 , %%ymm14 \n\t" + "vmulpd %%ymm9 , %%ymm1 , %%ymm15 \n\t" + + "prefetcht0 256(%3,%0,8) \n\t" + "vmovups (%3,%0,8), %%ymm10 \n\t" + "vmovups 32(%3,%0,8), %%ymm11 \n\t" + +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + "vpermilpd $0x5 , %%ymm13, %%ymm13 \n\t" + "vpermilpd $0x5 , %%ymm15, %%ymm15 \n\t" + "vaddsubpd %%ymm13, %%ymm12, %%ymm8 \n\t" + "vaddsubpd %%ymm15, %%ymm14, %%ymm9 \n\t" +#else + "vpermilpd $0x5 , %%ymm12, %%ymm12 \n\t" + "vpermilpd $0x5 , %%ymm14, %%ymm14 \n\t" + "vaddsubpd %%ymm12, %%ymm13, %%ymm8 \n\t" + "vaddsubpd %%ymm14, %%ymm15, %%ymm9 \n\t" + "vpermilpd $0x5 , %%ymm8 , %%ymm8 \n\t" + "vpermilpd $0x5 , %%ymm9 , %%ymm9 \n\t" +#endif + + "vaddpd %%ymm8, %%ymm10, %%ymm12 \n\t" + 
"vaddpd %%ymm9, %%ymm11, %%ymm13 \n\t" + + "vmovups %%ymm12, (%3,%0,8) \n\t" // 2 complex values to y + "vmovups %%ymm13, 32(%3,%0,8) \n\t" + + "addq $8 , %0 \n\t" + "subq $4 , %1 \n\t" + "jnz .L01LOOP%= \n\t" + "vzeroupper \n\t" + + : + : + "r" (i), // 0 + "r" (n), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (ap) // 4 + : "cc", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + +} + + + +#define HAVE_KERNEL_ADDY 1 + +static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest,FLOAT alpha_r, FLOAT alpha_i) __attribute__ ((noinline)); + +static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest,FLOAT alpha_r, FLOAT alpha_i) +{ + BLASLONG i; + + if ( inc_dest != 2 ) + { + + FLOAT temp_r; + FLOAT temp_i; + for ( i=0; i Date: Sun, 14 Sep 2014 10:21:22 +0200 Subject: [PATCH 103/119] optimized zgemv_n_microk_sandy-4.c --- kernel/x86_64/zgemv_n_microk_sandy-4.c | 56 +++++++++++++------------- 1 file changed, 28 insertions(+), 28 deletions(-) diff --git a/kernel/x86_64/zgemv_n_microk_sandy-4.c b/kernel/x86_64/zgemv_n_microk_sandy-4.c index 5567bae2b..009e4801e 100644 --- a/kernel/x86_64/zgemv_n_microk_sandy-4.c +++ b/kernel/x86_64/zgemv_n_microk_sandy-4.c @@ -50,54 +50,54 @@ static void zgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) ".align 16 \n\t" ".L01LOOP%=: \n\t" - "prefetcht0 256(%4,%0,8) \n\t" + //"prefetcht0 256(%4,%0,8) \n\t" "vmovups (%4,%0,8), %%ymm8 \n\t" // 2 complex values form a0 "vmovups 32(%4,%0,8), %%ymm9 \n\t" // 2 complex values form a0 "vmulpd %%ymm8 , %%ymm0 , %%ymm12 \n\t" "vmulpd %%ymm8 , %%ymm1 , %%ymm13 \n\t" - "prefetcht0 256(%5,%0,8) \n\t" + //"prefetcht0 256(%5,%0,8) \n\t" "vmulpd %%ymm9 , %%ymm0 , %%ymm14 \n\t" "vmovups (%5,%0,8), %%ymm8 \n\t" // 2 complex values form a0 "vmulpd %%ymm9 , %%ymm1 , %%ymm15 \n\t" "vmovups 32(%5,%0,8), %%ymm9 \n\t" // 2 complex values form a0 "vmulpd 
%%ymm8 , %%ymm2 , %%ymm10 \n\t" - "vaddpd %%ymm12, %%ymm10, %%ymm12 \n\t" "vmulpd %%ymm8 , %%ymm3 , %%ymm11 \n\t" + "vaddpd %%ymm12, %%ymm10, %%ymm12 \n\t" "vaddpd %%ymm13, %%ymm11, %%ymm13 \n\t" - "prefetcht0 256(%6,%0,8) \n\t" + //"prefetcht0 256(%6,%0,8) \n\t" "vmulpd %%ymm9 , %%ymm2 , %%ymm10 \n\t" - "vaddpd %%ymm14, %%ymm10, %%ymm14 \n\t" "vmovups (%6,%0,8), %%ymm8 \n\t" // 2 complex values form a0 + "vaddpd %%ymm14, %%ymm10, %%ymm14 \n\t" "vmulpd %%ymm9 , %%ymm3 , %%ymm11 \n\t" - "vaddpd %%ymm15, %%ymm11, %%ymm15 \n\t" "vmovups 32(%6,%0,8), %%ymm9 \n\t" // 2 complex values form a0 + "vaddpd %%ymm15, %%ymm11, %%ymm15 \n\t" "vmulpd %%ymm8 , %%ymm4 , %%ymm10 \n\t" - "vaddpd %%ymm12, %%ymm10, %%ymm12 \n\t" "vmulpd %%ymm8 , %%ymm5 , %%ymm11 \n\t" + "vaddpd %%ymm12, %%ymm10, %%ymm12 \n\t" "vaddpd %%ymm13, %%ymm11, %%ymm13 \n\t" - "prefetcht0 256(%7,%0,8) \n\t" + // "prefetcht0 256(%7,%0,8) \n\t" "vmulpd %%ymm9 , %%ymm4 , %%ymm10 \n\t" - "vaddpd %%ymm14, %%ymm10, %%ymm14 \n\t" "vmovups (%7,%0,8), %%ymm8 \n\t" // 2 complex values form a0 + "vaddpd %%ymm14, %%ymm10, %%ymm14 \n\t" "vmulpd %%ymm9 , %%ymm5 , %%ymm11 \n\t" - "vaddpd %%ymm15, %%ymm11, %%ymm15 \n\t" "vmovups 32(%7,%0,8), %%ymm9 \n\t" // 2 complex values form a0 - - "vmulpd %%ymm8 , %%ymm6 , %%ymm10 \n\t" - "vaddpd %%ymm12, %%ymm10, %%ymm12 \n\t" - "vmulpd %%ymm8 , %%ymm7 , %%ymm11 \n\t" - "vaddpd %%ymm13, %%ymm11, %%ymm13 \n\t" - "vmulpd %%ymm9 , %%ymm6 , %%ymm10 \n\t" - "vaddpd %%ymm14, %%ymm10, %%ymm14 \n\t" - "vmulpd %%ymm9 , %%ymm7 , %%ymm11 \n\t" "vaddpd %%ymm15, %%ymm11, %%ymm15 \n\t" - "prefetcht0 256(%3,%0,8) \n\t" + "vmulpd %%ymm8 , %%ymm6 , %%ymm10 \n\t" + "vmulpd %%ymm8 , %%ymm7 , %%ymm11 \n\t" + "vaddpd %%ymm12, %%ymm10, %%ymm12 \n\t" + "vaddpd %%ymm13, %%ymm11, %%ymm13 \n\t" + "vmulpd %%ymm9 , %%ymm6 , %%ymm10 \n\t" + "vmulpd %%ymm9 , %%ymm7 , %%ymm11 \n\t" + "vaddpd %%ymm14, %%ymm10, %%ymm14 \n\t" + "vaddpd %%ymm15, %%ymm11, %%ymm15 \n\t" + + // "prefetcht0 256(%3,%0,8) \n\t" "vmovups 
(%3,%0,8), %%ymm10 \n\t" "vmovups 32(%3,%0,8), %%ymm11 \n\t" @@ -167,30 +167,30 @@ static void zgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) ".align 16 \n\t" ".L01LOOP%=: \n\t" - "prefetcht0 256(%4,%0,8) \n\t" + // "prefetcht0 256(%4,%0,8) \n\t" "vmovups (%4,%0,8), %%ymm8 \n\t" // 2 complex values form a0 "vmovups 32(%4,%0,8), %%ymm9 \n\t" // 2 complex values form a0 "vmulpd %%ymm8 , %%ymm0 , %%ymm12 \n\t" "vmulpd %%ymm8 , %%ymm1 , %%ymm13 \n\t" - "prefetcht0 256(%5,%0,8) \n\t" + // "prefetcht0 256(%5,%0,8) \n\t" "vmulpd %%ymm9 , %%ymm0 , %%ymm14 \n\t" "vmovups (%5,%0,8), %%ymm8 \n\t" // 2 complex values form a0 "vmulpd %%ymm9 , %%ymm1 , %%ymm15 \n\t" "vmovups 32(%5,%0,8), %%ymm9 \n\t" // 2 complex values form a0 "vmulpd %%ymm8 , %%ymm2 , %%ymm10 \n\t" - "vaddpd %%ymm12, %%ymm10, %%ymm12 \n\t" "vmulpd %%ymm8 , %%ymm3 , %%ymm11 \n\t" + "vaddpd %%ymm12, %%ymm10, %%ymm12 \n\t" "vaddpd %%ymm13, %%ymm11, %%ymm13 \n\t" "vmulpd %%ymm9 , %%ymm2 , %%ymm10 \n\t" - "vaddpd %%ymm14, %%ymm10, %%ymm14 \n\t" "vmulpd %%ymm9 , %%ymm3 , %%ymm11 \n\t" + "vaddpd %%ymm14, %%ymm10, %%ymm14 \n\t" "vaddpd %%ymm15, %%ymm11, %%ymm15 \n\t" - "prefetcht0 256(%3,%0,8) \n\t" + // "prefetcht0 256(%3,%0,8) \n\t" "vmovups (%3,%0,8), %%ymm10 \n\t" "vmovups 32(%3,%0,8), %%ymm11 \n\t" @@ -256,7 +256,7 @@ static void zgemv_kernel_4x1( BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) ".align 16 \n\t" ".L01LOOP%=: \n\t" - "prefetcht0 256(%4,%0,8) \n\t" + // "prefetcht0 256(%4,%0,8) \n\t" "vmovups (%4,%0,8), %%ymm8 \n\t" // 2 complex values form a0 "vmovups 32(%4,%0,8), %%ymm9 \n\t" // 2 complex values form a0 "vmulpd %%ymm8 , %%ymm0 , %%ymm12 \n\t" @@ -265,7 +265,7 @@ static void zgemv_kernel_4x1( BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) "vmulpd %%ymm9 , %%ymm0 , %%ymm14 \n\t" "vmulpd %%ymm9 , %%ymm1 , %%ymm15 \n\t" - "prefetcht0 256(%3,%0,8) \n\t" + // "prefetcht0 256(%3,%0,8) \n\t" "vmovups (%3,%0,8), %%ymm10 \n\t" "vmovups 32(%3,%0,8), %%ymm11 \n\t" @@ -357,7 +357,7 @@ static void 
add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest,FLOAT a ".align 16 \n\t" ".L01LOOP%=: \n\t" - "prefetcht0 192(%2,%0,8) \n\t" + // "prefetcht0 192(%2,%0,8) \n\t" "vmovups (%2,%0,8), %%ymm8 \n\t" // 2 complex values from src "vmovups 32(%2,%0,8), %%ymm9 \n\t" @@ -366,7 +366,7 @@ static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest,FLOAT a "vmulpd %%ymm9 , %%ymm0, %%ymm14 \n\t" // a_r[2] * x_r , a_i[2] * x_r, a_r[3] * x_r, a_i[3] * x_r "vmulpd %%ymm9 , %%ymm1, %%ymm15 \n\t" // a_r[2] * x_i , a_i[2] * x_i, a_r[3] * x_i, a_i[3] * x_i - "prefetcht0 192(%3,%0,8) \n\t" + // "prefetcht0 192(%3,%0,8) \n\t" "vmovups (%3,%0,8), %%ymm10 \n\t" // 2 complex values from dest "vmovups 32(%3,%0,8), %%ymm11 \n\t" From b7c9566eea9b19ae1834ba8f8b5ddb0517e8749b Mon Sep 17 00:00:00 2001 From: wernsaar Date: Sun, 14 Sep 2014 11:00:53 +0200 Subject: [PATCH 104/119] removed obsolete gemv kernel files --- kernel/x86_64/cgemv_n.c | 255 ------------------- kernel/x86_64/cgemv_n_microk_haswell-2.c | 137 ----------- kernel/x86_64/cgemv_t.c | 265 -------------------- kernel/x86_64/cgemv_t_microk_haswell-2.c | 171 ------------- kernel/x86_64/dgemv_n.c | 208 ---------------- kernel/x86_64/dgemv_n_microk_haswell-2.c | 89 ------- kernel/x86_64/dgemv_n_microk_nehalem-2.c | 137 ----------- kernel/x86_64/dgemv_t.c | 191 --------------- kernel/x86_64/dgemv_t_microk_haswell-2.c | 107 -------- kernel/x86_64/sgemv_n.c | 11 - kernel/x86_64/sgemv_n_microk_bulldozer-2.c | 99 -------- kernel/x86_64/sgemv_n_microk_haswell-2.c | 88 ------- kernel/x86_64/sgemv_n_microk_nehalem-2.c | 144 ----------- kernel/x86_64/sgemv_n_microk_sandy-2.c | 97 -------- kernel/x86_64/sgemv_t.c | 10 - kernel/x86_64/sgemv_t_microk_bulldozer-2.c | 109 --------- kernel/x86_64/sgemv_t_microk_haswell-2.c | 112 --------- kernel/x86_64/sgemv_t_microk_nehalem-2.c | 159 ------------ kernel/x86_64/sgemv_t_microk_sandy-2.c | 132 ---------- kernel/x86_64/zgemv_n.c | 258 ------------------- 
kernel/x86_64/zgemv_n_microk_haswell-2.c | 137 ----------- kernel/x86_64/zgemv_n_microk_sandy-2.c | 149 ----------- kernel/x86_64/zgemv_t.c | 272 --------------------- kernel/x86_64/zgemv_t_microk_bulldozer-2.c | 180 -------------- kernel/x86_64/zgemv_t_microk_haswell-2.c | 162 ------------ 25 files changed, 3679 deletions(-) delete mode 100644 kernel/x86_64/cgemv_n.c delete mode 100644 kernel/x86_64/cgemv_n_microk_haswell-2.c delete mode 100644 kernel/x86_64/cgemv_t.c delete mode 100644 kernel/x86_64/cgemv_t_microk_haswell-2.c delete mode 100644 kernel/x86_64/dgemv_n.c delete mode 100644 kernel/x86_64/dgemv_n_microk_haswell-2.c delete mode 100644 kernel/x86_64/dgemv_n_microk_nehalem-2.c delete mode 100644 kernel/x86_64/dgemv_t.c delete mode 100644 kernel/x86_64/dgemv_t_microk_haswell-2.c delete mode 100644 kernel/x86_64/sgemv_n_microk_bulldozer-2.c delete mode 100644 kernel/x86_64/sgemv_n_microk_haswell-2.c delete mode 100644 kernel/x86_64/sgemv_n_microk_nehalem-2.c delete mode 100644 kernel/x86_64/sgemv_n_microk_sandy-2.c delete mode 100644 kernel/x86_64/sgemv_t_microk_bulldozer-2.c delete mode 100644 kernel/x86_64/sgemv_t_microk_haswell-2.c delete mode 100644 kernel/x86_64/sgemv_t_microk_nehalem-2.c delete mode 100644 kernel/x86_64/sgemv_t_microk_sandy-2.c delete mode 100644 kernel/x86_64/zgemv_n.c delete mode 100644 kernel/x86_64/zgemv_n_microk_haswell-2.c delete mode 100644 kernel/x86_64/zgemv_n_microk_sandy-2.c delete mode 100644 kernel/x86_64/zgemv_t.c delete mode 100644 kernel/x86_64/zgemv_t_microk_bulldozer-2.c delete mode 100644 kernel/x86_64/zgemv_t_microk_haswell-2.c diff --git a/kernel/x86_64/cgemv_n.c b/kernel/x86_64/cgemv_n.c deleted file mode 100644 index 47ef0d447..000000000 --- a/kernel/x86_64/cgemv_n.c +++ /dev/null @@ -1,255 +0,0 @@ -/*************************************************************************** -Copyright (c) 2014, The OpenBLAS Project -All rights reserved. 
-Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-*****************************************************************************/ - -#include -#include -#include "common.h" - -#if defined(HASWELL) -#include "cgemv_n_microk_haswell-2.c" -#endif - - -#define NBMAX 2048 - -#ifndef HAVE_KERNEL_16x4 - -static void cgemv_kernel_16x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) -{ - BLASLONG i; - FLOAT *a0,*a1,*a2,*a3; - a0 = ap[0]; - a1 = ap[1]; - a2 = ap[2]; - a3 = ap[3]; - - for ( i=0; i< 2*n; i+=2 ) - { -#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - y[i] += a0[i]*x[0] - a0[i+1] * x[1]; - y[i+1] += a0[i]*x[1] + a0[i+1] * x[0]; - y[i] += a1[i]*x[2] - a1[i+1] * x[3]; - y[i+1] += a1[i]*x[3] + a1[i+1] * x[2]; - y[i] += a2[i]*x[4] - a2[i+1] * x[5]; - y[i+1] += a2[i]*x[5] + a2[i+1] * x[4]; - y[i] += a3[i]*x[6] - a3[i+1] * x[7]; - y[i+1] += a3[i]*x[7] + a3[i+1] * x[6]; -#else - y[i] += a0[i]*x[0] + a0[i+1] * x[1]; - y[i+1] += a0[i]*x[1] - a0[i+1] * x[0]; - y[i] += a1[i]*x[2] + a1[i+1] * x[3]; - y[i+1] += a1[i]*x[3] - a1[i+1] * x[2]; - y[i] += a2[i]*x[4] + a2[i+1] * x[5]; - y[i+1] += a2[i]*x[5] - a2[i+1] * x[4]; - y[i] += a3[i]*x[6] + a3[i+1] * x[7]; - y[i+1] += a3[i]*x[7] - a3[i+1] * x[6]; -#endif - } -} - -#endif - -static void cgemv_kernel_16x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) -{ - BLASLONG i; - FLOAT *a0; - a0 = ap; - - for ( i=0; i< 2*n; i+=2 ) - { -#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - y[i] += a0[i]*x[0] - a0[i+1] * x[1]; - y[i+1] += a0[i]*x[1] + a0[i+1] * x[0]; -#else - y[i] += a0[i]*x[0] + a0[i+1] * x[1]; - y[i+1] += a0[i]*x[1] - a0[i+1] * x[0]; -#endif - - } -} - - -static void zero_y(BLASLONG n, FLOAT *dest) -{ - BLASLONG i; - for ( i=0; i<2*n; i++ ) - { - *dest = 0.0; - dest++; - } -} - - - -static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest,FLOAT alpha_r, FLOAT alpha_i) -{ - BLASLONG i; - FLOAT temp_r; - FLOAT temp_i; - for ( i=0; i -#include -#include "common.h" - -#if defined(HASWELL) -#include 
"zgemv_n_microk_haswell-2.c" -#elif defined(SANDYBRIDGE) -#include "zgemv_n_microk_sandy-2.c" -#endif - - - -#define NBMAX 1024 - -#ifndef HAVE_KERNEL_16x4 - -static void zgemv_kernel_16x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) -{ - BLASLONG i; - FLOAT *a0,*a1,*a2,*a3; - a0 = ap[0]; - a1 = ap[1]; - a2 = ap[2]; - a3 = ap[3]; - - for ( i=0; i< 2*n; i+=2 ) - { -#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - y[i] += a0[i]*x[0] - a0[i+1] * x[1]; - y[i+1] += a0[i]*x[1] + a0[i+1] * x[0]; - y[i] += a1[i]*x[2] - a1[i+1] * x[3]; - y[i+1] += a1[i]*x[3] + a1[i+1] * x[2]; - y[i] += a2[i]*x[4] - a2[i+1] * x[5]; - y[i+1] += a2[i]*x[5] + a2[i+1] * x[4]; - y[i] += a3[i]*x[6] - a3[i+1] * x[7]; - y[i+1] += a3[i]*x[7] + a3[i+1] * x[6]; -#else - y[i] += a0[i]*x[0] + a0[i+1] * x[1]; - y[i+1] += a0[i]*x[1] - a0[i+1] * x[0]; - y[i] += a1[i]*x[2] + a1[i+1] * x[3]; - y[i+1] += a1[i]*x[3] - a1[i+1] * x[2]; - y[i] += a2[i]*x[4] + a2[i+1] * x[5]; - y[i+1] += a2[i]*x[5] - a2[i+1] * x[4]; - y[i] += a3[i]*x[6] + a3[i+1] * x[7]; - y[i+1] += a3[i]*x[7] - a3[i+1] * x[6]; -#endif - } -} - -#endif - -static void zgemv_kernel_16x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) -{ - BLASLONG i; - FLOAT *a0; - a0 = ap; - - for ( i=0; i< 2*n; i+=2 ) - { -#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - y[i] += a0[i]*x[0] - a0[i+1] * x[1]; - y[i+1] += a0[i]*x[1] + a0[i+1] * x[0]; -#else - y[i] += a0[i]*x[0] + a0[i+1] * x[1]; - y[i+1] += a0[i]*x[1] - a0[i+1] * x[0]; -#endif - - } -} - - -static void zero_y(BLASLONG n, FLOAT *dest) -{ - BLASLONG i; - for ( i=0; i<2*n; i++ ) - { - *dest = 0.0; - dest++; - } -} - - - -static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest,FLOAT alpha_r, FLOAT alpha_i) -{ - BLASLONG i; - FLOAT temp_r; - FLOAT temp_i; - for ( i=0; i Date: Mon, 15 Sep 2014 11:38:25 +0200 Subject: [PATCH 105/119] optimized multithreading lower limits --- interface/gemm.c | 41 
++--------------------------------------- 1 file changed, 2 insertions(+), 39 deletions(-) diff --git a/interface/gemm.c b/interface/gemm.c index 74908e842..a5a2b4724 100644 --- a/interface/gemm.c +++ b/interface/gemm.c @@ -405,49 +405,12 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS #ifndef COMPLEX double MNK = (double) args.m * (double) args.n * (double) args.k; - if ( MNK <= (16.0 * 1024.0 * (double) GEMM_MULTITHREAD_THRESHOLD) ) + if ( MNK <= (65536.0 * (double) GEMM_MULTITHREAD_THRESHOLD) ) nthreads_max = 1; - else - { - if ( MNK <= (2.0 * 65536.0 * (double) GEMM_MULTITHREAD_THRESHOLD) ) - { - nthreads_max = 4; - if ( args.m < 16 * GEMM_MULTITHREAD_THRESHOLD ) - { - nthreads_max = 2; - if ( args.m < 3 * GEMM_MULTITHREAD_THRESHOLD ) nthreads_max = 1; - if ( args.n < 1 * GEMM_MULTITHREAD_THRESHOLD ) nthreads_max = 1; - if ( args.k < 3 * GEMM_MULTITHREAD_THRESHOLD ) nthreads_max = 1; - } - else - { - if ( args.n <= 1 * GEMM_MULTITHREAD_THRESHOLD ) nthreads_max = 2; - } - } - } #else double MNK = (double) args.m * (double) args.n * (double) args.k; - if ( MNK <= (256.0 * (double) GEMM_MULTITHREAD_THRESHOLD) ) + if ( MNK <= (8192.0 * (double) GEMM_MULTITHREAD_THRESHOLD) ) nthreads_max = 1; - else - { - if ( MNK <= (16384.0 * (double) GEMM_MULTITHREAD_THRESHOLD) ) - { - nthreads_max = 4; - if ( args.m < 3 * GEMM_MULTITHREAD_THRESHOLD ) - { - nthreads_max = 2; - if ( args.m <= 1 * GEMM_MULTITHREAD_THRESHOLD ) nthreads_max = 1; - if ( args.n < 1 * GEMM_MULTITHREAD_THRESHOLD ) nthreads_max = 1; - if ( args.k < 1 * GEMM_MULTITHREAD_THRESHOLD ) nthreads_max = 1; - } - else - { - if ( args.n < 2 * GEMM_MULTITHREAD_THRESHOLD ) nthreads_max = 2; - } - } - } - #endif args.common = NULL; From 9912dbbcf9e011bc00431cab0ffa7978904578b7 Mon Sep 17 00:00:00 2001 From: Eliot Eshelman Date: Tue, 16 Sep 2014 18:26:45 -0400 Subject: [PATCH 106/119] Add HASWELL to TargetList.txt The Intel "Haswell" architecture is missing from the list of build 
targets. --- TargetList.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/TargetList.txt b/TargetList.txt index ce35a3faa..8c9746672 100644 --- a/TargetList.txt +++ b/TargetList.txt @@ -19,6 +19,7 @@ PENRYN DUNNINGTON NEHALEM SANDYBRIDGE +HASWELL ATOM b)AMD CPU: From 70d1ba09b23c2acfcf049f79075e66eb08d60aa3 Mon Sep 17 00:00:00 2001 From: Zhang Xianyi Date: Wed, 17 Sep 2014 14:29:21 +0800 Subject: [PATCH 107/119] Update the doc for target list. --- TargetList.txt | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/TargetList.txt b/TargetList.txt index 8c9746672..97661fdcf 100644 --- a/TargetList.txt +++ b/TargetList.txt @@ -31,6 +31,7 @@ SHANGHAI ISTANBUL BOBCAT BULLDOZER +PILEDRIVER c)VIA CPU: SSE_GENERIC @@ -60,3 +61,7 @@ ITANIUM2 SPARC SPARCV7 +6.ARM CPU: +ARMV7 +ARMV6 +ARMV5 From 466bfb8b86f36b7403b1e8bc361fd94716ba30c6 Mon Sep 17 00:00:00 2001 From: wernsaar Date: Wed, 17 Sep 2014 16:01:07 +0200 Subject: [PATCH 108/119] updated README.md --- README.md | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index 2e85117c9..f4c547701 100644 --- a/README.md +++ b/README.md @@ -55,16 +55,23 @@ Please read GotoBLAS_01Readme.txt #### x86/x86-64: - **Intel Xeon 56xx (Westmere)**: Used GotoBLAS2 Nehalem codes. -- **Intel Sandy Bridge**: Optimized Level-3 BLAS with AVX on x86-64. -- **Intel Haswell**: Optimized Level-3 BLAS with AVX on x86-64 (identical to Sandy Bridge). +- **Intel Sandy Bridge**: Optimized Level-3 and Level-2 BLAS with AVX on x86-64. +- **Intel Haswell**: Optimized Level-3 and Level-2 BLAS with AVX2 and FMA on x86-64. - **AMD Bobcat**: Used GotoBLAS2 Barcelona codes. -- **AMD Bulldozer**: x86-64 S/DGEMM AVX kernels. (Thank Werner Saar) -- **AMD PILEDRIVER**: Used Bulldozer codes. +- **AMD Bulldozer**: x86-64 ?GEMM FMA4 kernels. (Thank Werner Saar) +- **AMD PILEDRIVER**: Uses Bulldozer codes with some optimizations. 
#### MIPS64: - **ICT Loongson 3A**: Optimized Level-3 BLAS and the part of Level-1,2. - **ICT Loongson 3B**: Experimental +#### ARM: +- **ARMV6**: Optimized BLAS for vfpv2 and vfpv3-d16 ( e.g. BCM2835, Cortex M0+ ) +- **ARMV7**: Optimized BLAS for vfpv3-d32 ( e.g. Cortex A8, A9 and A15 ) + +#### ARM64: +- **ARMV8**: Experimental + ### Support OS: - **GNU/Linux** - **MingWin/Windows**: Please read . @@ -116,8 +123,8 @@ Please see Changelog.txt to obtain the differences between GotoBLAS2 1.13 BSD ve * Please read [Faq](https://github.com/xianyi/OpenBLAS/wiki/Faq) at first. * Please use gcc version 4.6 and above to compile Sandy Bridge AVX kernels on Linux/MingW/BSD. * Please use Clang version 3.1 and above to compile the library on Sandy Bridge microarchitecture. The Clang 3.0 will generate the wrong AVX binary code. -* The number of CPUs/Cores should less than or equal to 256. -* On Linux, OpenBLAS sets the processor affinity by default. This may cause [the conflict with R parallel](https://stat.ethz.ch/pipermail/r-sig-hpc/2012-April/001348.html). You can build the library with NO_AFFINITY=1. +* The number of CPUs/Cores should less than or equal to 256. On Linux x86_64(amd64), there is experimental support for up to 1024 CPUs/Cores and 128 numa nodes if you build the library with BIGNUMA=1. +* OpenBLAS does not set processor affinity by default. On Linux, you can enable processor affinity by commenting the line NO_AFFINITY=1 in Makefile.rule. But this may cause [the conflict with R parallel](https://stat.ethz.ch/pipermail/r-sig-hpc/2012-April/001348.html). * On Loongson 3A. make test would be failed because of pthread_create error. The error code is EAGAIN. However, it will be OK when you run the same testcase on shell. 
## Contributing From 7a911569b8502d97075bb63ebcbfed18bc13bc97 Mon Sep 17 00:00:00 2001 From: wernsaar Date: Sat, 20 Sep 2014 14:21:42 +0200 Subject: [PATCH 109/119] added test for GEMM3M functions --- test/Makefile | 25 + test/zblat3_3m.dat | 23 + test/zblat3_3m.f | 3448 ++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 3496 insertions(+) create mode 100644 test/zblat3_3m.dat create mode 100644 test/zblat3_3m.f diff --git a/test/Makefile b/test/Makefile index 801efe244..dce18c824 100644 --- a/test/Makefile +++ b/test/Makefile @@ -88,6 +88,25 @@ else endif endif + +level3_3m : zblat3_3m + rm -f ?BLAT3_3M.SUMM + OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 ./zblat3_3m < ./zblat3_3m.dat + @$(GREP) -q FATAL ZBLAT3_3M.SUMM && cat ZBLAT3_3M.SUMM || exit 0 +ifdef SMP + rm -f ?BLAT3_3M.SUMM +ifeq ($(USE_OPENMP), 1) + OMP_NUM_THREADS=2 ./zblat3_3m < ./zblat3_3m.dat + @$(GREP) -q FATAL ZBLAT3_3M.SUMM && cat ZBLAT3_3M.SUMM || exit 0 +else + OPENBLAS_NUM_THREADS=2 ./zblat3_3m < ./zblat3_3m.dat + @$(GREP) -q FATAL ZBLAT3_3M.SUMM && cat ZBLAT3_3M.SUMM || exit 0 +endif +endif + + + + FLDFLAGS = $(FFLAGS:-fPIC=) $(LDFLAGS) CEXTRALIB = @@ -131,6 +150,11 @@ cblat3 : cblat3.$(SUFFIX) ../$(LIBNAME) zblat3 : zblat3.$(SUFFIX) ../$(LIBNAME) $(FC) $(FLDFLAGS) -o zblat3 zblat3.$(SUFFIX) ../$(LIBNAME) $(EXTRALIB) $(CEXTRALIB) +zblat3_3m : zblat3_3m.$(SUFFIX) ../$(LIBNAME) + $(FC) $(FLDFLAGS) -o zblat3_3m zblat3_3m.$(SUFFIX) ../$(LIBNAME) $(EXTRALIB) $(CEXTRALIB) + + + clean: @rm -f *.$(SUFFIX) *.$(PSUFFIX) gmon.$(SUFFIX)ut *.SUMM *.cxml *.exe *.pdb *.dwf \ sblat1 dblat1 cblat1 zblat1 \ @@ -139,6 +163,7 @@ clean: sblat1p dblat1p cblat1p zblat1p \ sblat2p dblat2p cblat2p zblat2p \ sblat3p dblat3p cblat3p zblat3p \ + zblat3_3m zblat3_3mp \ *.stackdump *.dll libs: diff --git a/test/zblat3_3m.dat b/test/zblat3_3m.dat new file mode 100644 index 000000000..629b5974a --- /dev/null +++ b/test/zblat3_3m.dat @@ -0,0 +1,23 @@ +'ZBLAT3_3M.SUMM' NAME OF SUMMARY OUTPUT FILE +6 UNIT NUMBER OF 
SUMMARY FILE +'ZBLAT3.SNAP' NAME OF SNAPSHOT OUTPUT FILE +-1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) +F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. +F LOGICAL FLAG, T TO STOP ON FAILURES. +F LOGICAL FLAG, T TO TEST ERROR EXITS. +16.0 THRESHOLD VALUE OF TEST RATIO +6 NUMBER OF VALUES OF N +0 1 2 3 7 31 63 VALUES OF N +3 NUMBER OF VALUES OF ALPHA +(0.0,0.0) (1.0,0.0) (0.7,-0.9) VALUES OF ALPHA +3 NUMBER OF VALUES OF BETA +(0.0,0.0) (1.0,0.0) (1.3,-1.1) VALUES OF BETA +ZGEMM3M T PUT F FOR NO TEST. SAME COLUMNS. +ZHEMM3M F PUT F FOR NO TEST. SAME COLUMNS. +ZSYMM3M F PUT F FOR NO TEST. SAME COLUMNS. +ZTRMM F PUT F FOR NO TEST. SAME COLUMNS. +ZTRSM F PUT F FOR NO TEST. SAME COLUMNS. +ZHERK F PUT F FOR NO TEST. SAME COLUMNS. +ZSYRK F PUT F FOR NO TEST. SAME COLUMNS. +ZHER2K F PUT F FOR NO TEST. SAME COLUMNS. +ZSYR2K F PUT F FOR NO TEST. SAME COLUMNS. diff --git a/test/zblat3_3m.f b/test/zblat3_3m.f new file mode 100644 index 000000000..9bc412a53 --- /dev/null +++ b/test/zblat3_3m.f @@ -0,0 +1,3448 @@ + PROGRAM ZBLAT3 +* +* Test program for the COMPLEX*16 Level 3 Blas. +* +* The program must be driven by a short data file. The first 14 records +* of the file are read using list-directed input, the last 9 records +* are read using the format ( A6, L2 ). An annotated example of a data +* file can be obtained by deleting the first 3 characters from the +* following 23 lines: +* 'ZBLAT3.SUMM' NAME OF SUMMARY OUTPUT FILE +* 6 UNIT NUMBER OF SUMMARY FILE +* 'ZBLAT3.SNAP' NAME OF SNAPSHOT OUTPUT FILE +* -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) +* F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. +* F LOGICAL FLAG, T TO STOP ON FAILURES. +* T LOGICAL FLAG, T TO TEST ERROR EXITS. 
+* 16.0 THRESHOLD VALUE OF TEST RATIO +* 6 NUMBER OF VALUES OF N +* 0 1 2 3 5 9 VALUES OF N +* 3 NUMBER OF VALUES OF ALPHA +* (0.0,0.0) (1.0,0.0) (0.7,-0.9) VALUES OF ALPHA +* 3 NUMBER OF VALUES OF BETA +* (0.0,0.0) (1.0,0.0) (1.3,-1.1) VALUES OF BETA +* ZGEMM3M T PUT F FOR NO TEST. SAME COLUMNS. +* ZHEMM3M T PUT F FOR NO TEST. SAME COLUMNS. +* ZSYMM3M T PUT F FOR NO TEST. SAME COLUMNS. +* ZTRMM T PUT F FOR NO TEST. SAME COLUMNS. +* ZTRSM T PUT F FOR NO TEST. SAME COLUMNS. +* ZHERK T PUT F FOR NO TEST. SAME COLUMNS. +* ZSYRK T PUT F FOR NO TEST. SAME COLUMNS. +* ZHER2K T PUT F FOR NO TEST. SAME COLUMNS. +* ZSYR2K T PUT F FOR NO TEST. SAME COLUMNS. +* +* See: +* +* Dongarra J. J., Du Croz J. J., Duff I. S. and Hammarling S. +* A Set of Level 3 Basic Linear Algebra Subprograms. +* +* Technical Memorandum No.88 (Revision 1), Mathematics and +* Computer Science Division, Argonne National Laboratory, 9700 +* South Cass Avenue, Argonne, Illinois 60439, US. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Parameters .. + INTEGER NIN + PARAMETER ( NIN = 5 ) + INTEGER NSUBS + PARAMETER ( NSUBS = 9 ) + COMPLEX*16 ZERO, ONE + PARAMETER ( ZERO = ( 0.0D0, 0.0D0 ), + $ ONE = ( 1.0D0, 0.0D0 ) ) + DOUBLE PRECISION RZERO, RHALF, RONE + PARAMETER ( RZERO = 0.0D0, RHALF = 0.5D0, RONE = 1.0D0 ) + INTEGER NMAX + PARAMETER ( NMAX = 65 ) + INTEGER NIDMAX, NALMAX, NBEMAX + PARAMETER ( NIDMAX = 9, NALMAX = 7, NBEMAX = 7 ) +* .. Local Scalars .. + DOUBLE PRECISION EPS, ERR, THRESH + INTEGER I, ISNUM, J, N, NALF, NBET, NIDIM, NOUT, NTRA + LOGICAL FATAL, LTESTT, REWI, SAME, SFATAL, TRACE, + $ TSTERR + CHARACTER*1 TRANSA, TRANSB + CHARACTER*8 SNAMET + CHARACTER*32 SNAPS, SUMMRY +* .. Local Arrays .. 
+ COMPLEX*16 AA( NMAX*NMAX ), AB( NMAX, 2*NMAX ), + $ ALF( NALMAX ), AS( NMAX*NMAX ), + $ BB( NMAX*NMAX ), BET( NBEMAX ), + $ BS( NMAX*NMAX ), C( NMAX, NMAX ), + $ CC( NMAX*NMAX ), CS( NMAX*NMAX ), CT( NMAX ), + $ W( 2*NMAX ) + DOUBLE PRECISION G( NMAX ) + INTEGER IDIM( NIDMAX ) + LOGICAL LTEST( NSUBS ) + CHARACTER*8 SNAMES( NSUBS ) +* .. External Functions .. + DOUBLE PRECISION DDIFF + LOGICAL LZE + EXTERNAL DDIFF, LZE +* .. External Subroutines .. + EXTERNAL ZCHK1, ZCHK2, ZCHK3, ZCHK4, ZCHK5, ZCHKE, ZMMCH +* .. Intrinsic Functions .. + INTRINSIC MAX, MIN +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL LERR, OK + CHARACTER*8 SRNAMT +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK, LERR + COMMON /SRNAMC/SRNAMT +* .. Data statements .. + DATA SNAMES/'ZGEMM3M ', 'ZHEMM3M ', 'ZSYMM3M ', + $ 'ZTRMM ', + $ 'ZTRSM ', 'ZHERK ', 'ZSYRK ', 'ZHER2K', + $ 'ZSYR2K'/ +* .. Executable Statements .. +* +* Read name and unit number for summary output file and open file. +* + READ( NIN, FMT = * )SUMMRY + READ( NIN, FMT = * )NOUT + OPEN( NOUT, FILE = SUMMRY, STATUS = 'NEW' ) + NOUTC = NOUT +* +* Read name and unit number for snapshot output file and open file. +* + READ( NIN, FMT = * )SNAPS + READ( NIN, FMT = * )NTRA + TRACE = NTRA.GE.0 + IF( TRACE )THEN + OPEN( NTRA, FILE = SNAPS, STATUS = 'NEW' ) + END IF +* Read the flag that directs rewinding of the snapshot file. + READ( NIN, FMT = * )REWI + REWI = REWI.AND.TRACE +* Read the flag that directs stopping on any failure. + READ( NIN, FMT = * )SFATAL +* Read the flag that indicates whether error exits are to be tested. + READ( NIN, FMT = * )TSTERR +* Read the threshold value of the test ratio + READ( NIN, FMT = * )THRESH +* +* Read and check the parameter values for the tests. 
+* +* Values of N + READ( NIN, FMT = * )NIDIM + IF( NIDIM.LT.1.OR.NIDIM.GT.NIDMAX )THEN + WRITE( NOUT, FMT = 9997 )'N', NIDMAX + GO TO 220 + END IF + READ( NIN, FMT = * )( IDIM( I ), I = 1, NIDIM ) + DO 10 I = 1, NIDIM + IF( IDIM( I ).LT.0.OR.IDIM( I ).GT.NMAX )THEN + WRITE( NOUT, FMT = 9996 )NMAX + GO TO 220 + END IF + 10 CONTINUE +* Values of ALPHA + READ( NIN, FMT = * )NALF + IF( NALF.LT.1.OR.NALF.GT.NALMAX )THEN + WRITE( NOUT, FMT = 9997 )'ALPHA', NALMAX + GO TO 220 + END IF + READ( NIN, FMT = * )( ALF( I ), I = 1, NALF ) +* Values of BETA + READ( NIN, FMT = * )NBET + IF( NBET.LT.1.OR.NBET.GT.NBEMAX )THEN + WRITE( NOUT, FMT = 9997 )'BETA', NBEMAX + GO TO 220 + END IF + READ( NIN, FMT = * )( BET( I ), I = 1, NBET ) +* +* Report values of parameters. +* + WRITE( NOUT, FMT = 9995 ) + WRITE( NOUT, FMT = 9994 )( IDIM( I ), I = 1, NIDIM ) + WRITE( NOUT, FMT = 9993 )( ALF( I ), I = 1, NALF ) + WRITE( NOUT, FMT = 9992 )( BET( I ), I = 1, NBET ) + IF( .NOT.TSTERR )THEN + WRITE( NOUT, FMT = * ) + WRITE( NOUT, FMT = 9984 ) + END IF + WRITE( NOUT, FMT = * ) + WRITE( NOUT, FMT = 9999 )THRESH + WRITE( NOUT, FMT = * ) +* +* Read names of subroutines and flags which indicate +* whether they are to be tested. +* + DO 20 I = 1, NSUBS + LTEST( I ) = .FALSE. + 20 CONTINUE + 30 READ( NIN, FMT = 9988, END = 60 )SNAMET, LTESTT + DO 40 I = 1, NSUBS + IF( SNAMET.EQ.SNAMES( I ) ) + $ GO TO 50 + 40 CONTINUE + WRITE( NOUT, FMT = 9990 )SNAMET + STOP + 50 LTEST( I ) = LTESTT + GO TO 30 +* + 60 CONTINUE + CLOSE ( NIN ) +* +* Compute EPS (the machine precision). +* + EPS = RONE + 70 CONTINUE + IF( DDIFF( RONE + EPS, RONE ).EQ.RZERO ) + $ GO TO 80 + EPS = RHALF*EPS + GO TO 70 + 80 CONTINUE + EPS = EPS + EPS + WRITE( NOUT, FMT = 9998 )EPS +* +* Check the reliability of ZMMCH using exact data. 
+* + N = MIN( 32, NMAX ) + DO 100 J = 1, N + DO 90 I = 1, N + AB( I, J ) = MAX( I - J + 1, 0 ) + 90 CONTINUE + AB( J, NMAX + 1 ) = J + AB( 1, NMAX + J ) = J + C( J, 1 ) = ZERO + 100 CONTINUE + DO 110 J = 1, N + CC( J ) = J*( ( J + 1 )*J )/2 - ( ( J + 1 )*J*( J - 1 ) )/3 + 110 CONTINUE +* CC holds the exact result. On exit from ZMMCH CT holds +* the result computed by ZMMCH. + TRANSA = 'N' + TRANSB = 'N' + CALL ZMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX, + $ AB( 1, NMAX + 1 ), NMAX, ZERO, C, NMAX, CT, G, CC, + $ NMAX, EPS, ERR, FATAL, NOUT, .TRUE. ) + SAME = LZE( CC, CT, N ) + IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN + WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR + STOP + END IF + TRANSB = 'C' + CALL ZMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX, + $ AB( 1, NMAX + 1 ), NMAX, ZERO, C, NMAX, CT, G, CC, + $ NMAX, EPS, ERR, FATAL, NOUT, .TRUE. ) + SAME = LZE( CC, CT, N ) + IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN + WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR + STOP + END IF + DO 120 J = 1, N + AB( J, NMAX + 1 ) = N - J + 1 + AB( 1, NMAX + J ) = N - J + 1 + 120 CONTINUE + DO 130 J = 1, N + CC( N - J + 1 ) = J*( ( J + 1 )*J )/2 - + $ ( ( J + 1 )*J*( J - 1 ) )/3 + 130 CONTINUE + TRANSA = 'C' + TRANSB = 'N' + CALL ZMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX, + $ AB( 1, NMAX + 1 ), NMAX, ZERO, C, NMAX, CT, G, CC, + $ NMAX, EPS, ERR, FATAL, NOUT, .TRUE. ) + SAME = LZE( CC, CT, N ) + IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN + WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR + STOP + END IF + TRANSB = 'C' + CALL ZMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX, + $ AB( 1, NMAX + 1 ), NMAX, ZERO, C, NMAX, CT, G, CC, + $ NMAX, EPS, ERR, FATAL, NOUT, .TRUE. ) + SAME = LZE( CC, CT, N ) + IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN + WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR + STOP + END IF +* +* Test each subroutine in turn. +* + DO 200 ISNUM = 1, NSUBS + WRITE( NOUT, FMT = * ) + IF( .NOT.LTEST( ISNUM ) )THEN +* Subprogram is not to be tested. 
+ WRITE( NOUT, FMT = 9987 )SNAMES( ISNUM ) + ELSE + SRNAMT = SNAMES( ISNUM ) +* Test error exits. + IF( TSTERR )THEN + CALL ZCHKE( ISNUM, SNAMES( ISNUM ), NOUT ) + WRITE( NOUT, FMT = * ) + END IF +* Test computations. + INFOT = 0 + OK = .TRUE. + FATAL = .FALSE. + GO TO ( 140, 150, 150, 160, 160, 170, 170, + $ 180, 180 )ISNUM +* Test ZGEMM3M, 01. + 140 CALL ZCHK1( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, + $ NMAX, AB, AA, AS, AB( 1, NMAX + 1 ), BB, BS, C, + $ CC, CS, CT, G ) + GO TO 190 +* Test ZHEMM3M, 02, ZSYMM3M, 03. + 150 CALL ZCHK2( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, + $ NMAX, AB, AA, AS, AB( 1, NMAX + 1 ), BB, BS, C, + $ CC, CS, CT, G ) + GO TO 190 +* Test ZTRMM, 04, ZTRSM, 05. + 160 CALL ZCHK3( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NMAX, AB, + $ AA, AS, AB( 1, NMAX + 1 ), BB, BS, CT, G, C ) + GO TO 190 +* Test ZHERK, 06, ZSYRK, 07. + 170 CALL ZCHK4( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, + $ NMAX, AB, AA, AS, AB( 1, NMAX + 1 ), BB, BS, C, + $ CC, CS, CT, G ) + GO TO 190 +* Test ZHER2K, 08, ZSYR2K, 09. 
+ 180 CALL ZCHK5( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, + $ NMAX, AB, AA, AS, BB, BS, C, CC, CS, CT, G, W ) + GO TO 190 +* + 190 IF( FATAL.AND.SFATAL ) + $ GO TO 210 + END IF + 200 CONTINUE + WRITE( NOUT, FMT = 9986 ) + GO TO 230 +* + 210 CONTINUE + WRITE( NOUT, FMT = 9985 ) + GO TO 230 +* + 220 CONTINUE + WRITE( NOUT, FMT = 9991 ) +* + 230 CONTINUE + IF( TRACE ) + $ CLOSE ( NTRA ) + CLOSE ( NOUT ) + STOP +* + 9999 FORMAT( ' ROUTINES PASS COMPUTATIONAL TESTS IF TEST RATIO IS LES', + $ 'S THAN', F8.2 ) + 9998 FORMAT( ' RELATIVE MACHINE PRECISION IS TAKEN TO BE', 1P, D9.1 ) + 9997 FORMAT( ' NUMBER OF VALUES OF ', A, ' IS LESS THAN 1 OR GREATER ', + $ 'THAN ', I2 ) + 9996 FORMAT( ' VALUE OF N IS LESS THAN 0 OR GREATER THAN ', I2 ) + 9995 FORMAT( ' TESTS OF THE COMPLEX*16 LEVEL 3 BLAS', //' THE F', + $ 'OLLOWING PARAMETER VALUES WILL BE USED:' ) + 9994 FORMAT( ' FOR N ', 9I6 ) + 9993 FORMAT( ' FOR ALPHA ', + $ 7( '(', F4.1, ',', F4.1, ') ', : ) ) + 9992 FORMAT( ' FOR BETA ', + $ 7( '(', F4.1, ',', F4.1, ') ', : ) ) + 9991 FORMAT( ' AMEND DATA FILE OR INCREASE ARRAY SIZES IN PROGRAM', + $ /' ******* TESTS ABANDONED *******' ) + 9990 FORMAT( ' SUBPROGRAM NAME ', A6, ' NOT RECOGNIZED', /' ******* T', + $ 'ESTS ABANDONED *******' ) + 9989 FORMAT( ' ERROR IN ZMMCH - IN-LINE DOT PRODUCTS ARE BEING EVALU', + $ 'ATED WRONGLY.', /' ZMMCH WAS CALLED WITH TRANSA = ', A1, + $ ' AND TRANSB = ', A1, /' AND RETURNED SAME = ', L1, ' AND ', + $ 'ERR = ', F12.3, '.', /' THIS MAY BE DUE TO FAULTS IN THE ', + $ 'ARITHMETIC OR THE COMPILER.', /' ******* TESTS ABANDONED ', + $ '*******' ) + 9988 FORMAT( A8, L2 ) + 9987 FORMAT( 1X, A8, ' WAS NOT TESTED' ) + 9986 FORMAT( /' END OF TESTS' ) + 9985 FORMAT( /' ******* FATAL ERROR - TESTS ABANDONED *******' ) + 9984 FORMAT( ' ERROR-EXITS WILL NOT BE TESTED' ) +* +* End of ZBLAT3. 
+* + END + SUBROUTINE ZCHK1( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, + $ FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, NMAX, + $ A, AA, AS, B, BB, BS, C, CC, CS, CT, G ) +* +* Tests ZGEMM3M. +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Parameters .. + COMPLEX*16 ZERO + PARAMETER ( ZERO = ( 0.0D0, 0.0D0 ) ) + DOUBLE PRECISION RZERO + PARAMETER ( RZERO = 0.0D0 ) +* .. Scalar Arguments .. + DOUBLE PRECISION EPS, THRESH + INTEGER NALF, NBET, NIDIM, NMAX, NOUT, NTRA + LOGICAL FATAL, REWI, TRACE + CHARACTER*8 SNAME +* .. Array Arguments .. + COMPLEX*16 A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), + $ AS( NMAX*NMAX ), B( NMAX, NMAX ), + $ BB( NMAX*NMAX ), BET( NBET ), BS( NMAX*NMAX ), + $ C( NMAX, NMAX ), CC( NMAX*NMAX ), + $ CS( NMAX*NMAX ), CT( NMAX ) + DOUBLE PRECISION G( NMAX ) + INTEGER IDIM( NIDIM ) +* .. Local Scalars .. + COMPLEX*16 ALPHA, ALS, BETA, BLS + DOUBLE PRECISION ERR, ERRMAX + INTEGER I, IA, IB, ICA, ICB, IK, IM, IN, K, KS, LAA, + $ LBB, LCC, LDA, LDAS, LDB, LDBS, LDC, LDCS, M, + $ MA, MB, MS, N, NA, NARGS, NB, NC, NS + LOGICAL NULL, RESET, SAME, TRANA, TRANB + CHARACTER*1 TRANAS, TRANBS, TRANSA, TRANSB + CHARACTER*3 ICH +* .. Local Arrays .. + LOGICAL ISAME( 13 ) +* .. External Functions .. + LOGICAL LZE, LZERES + EXTERNAL LZE, LZERES +* .. External Subroutines .. + EXTERNAL ZGEMM3M, ZMAKE, ZMMCH +* .. Intrinsic Functions .. + INTRINSIC MAX +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL LERR, OK +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK, LERR +* .. Data statements .. + DATA ICH/'NTC'/ +* .. Executable Statements .. +* + NARGS = 13 + NC = 0 + RESET = .TRUE. 
+ ERRMAX = RZERO +* + DO 110 IM = 1, NIDIM + M = IDIM( IM ) +* + DO 100 IN = 1, NIDIM + N = IDIM( IN ) +* Set LDC to 1 more than minimum value if room. + LDC = M + IF( LDC.LT.NMAX ) + $ LDC = LDC + 1 +* Skip tests if not enough room. + IF( LDC.GT.NMAX ) + $ GO TO 100 + LCC = LDC*N + NULL = N.LE.0.OR.M.LE.0 +* + DO 90 IK = 1, NIDIM + K = IDIM( IK ) +* + DO 80 ICA = 1, 3 + TRANSA = ICH( ICA: ICA ) + TRANA = TRANSA.EQ.'T'.OR.TRANSA.EQ.'C' +* + IF( TRANA )THEN + MA = K + NA = M + ELSE + MA = M + NA = K + END IF +* Set LDA to 1 more than minimum value if room. + LDA = MA + IF( LDA.LT.NMAX ) + $ LDA = LDA + 1 +* Skip tests if not enough room. + IF( LDA.GT.NMAX ) + $ GO TO 80 + LAA = LDA*NA +* +* Generate the matrix A. +* + CALL ZMAKE( 'GE', ' ', ' ', MA, NA, A, NMAX, AA, LDA, + $ RESET, ZERO ) +* + DO 70 ICB = 1, 3 + TRANSB = ICH( ICB: ICB ) + TRANB = TRANSB.EQ.'T'.OR.TRANSB.EQ.'C' +* + IF( TRANB )THEN + MB = N + NB = K + ELSE + MB = K + NB = N + END IF +* Set LDB to 1 more than minimum value if room. + LDB = MB + IF( LDB.LT.NMAX ) + $ LDB = LDB + 1 +* Skip tests if not enough room. + IF( LDB.GT.NMAX ) + $ GO TO 70 + LBB = LDB*NB +* +* Generate the matrix B. +* + CALL ZMAKE( 'GE', ' ', ' ', MB, NB, B, NMAX, BB, + $ LDB, RESET, ZERO ) +* + DO 60 IA = 1, NALF + ALPHA = ALF( IA ) +* + DO 50 IB = 1, NBET + BETA = BET( IB ) +* +* Generate the matrix C. +* + CALL ZMAKE( 'GE', ' ', ' ', M, N, C, NMAX, + $ CC, LDC, RESET, ZERO ) +* + NC = NC + 1 +* +* Save every datum before calling the +* subroutine. +* + TRANAS = TRANSA + TRANBS = TRANSB + MS = M + NS = N + KS = K + ALS = ALPHA + DO 10 I = 1, LAA + AS( I ) = AA( I ) + 10 CONTINUE + LDAS = LDA + DO 20 I = 1, LBB + BS( I ) = BB( I ) + 20 CONTINUE + LDBS = LDB + BLS = BETA + DO 30 I = 1, LCC + CS( I ) = CC( I ) + 30 CONTINUE + LDCS = LDC +* +* Call the subroutine. 
+* + IF( TRACE ) + $ WRITE( NTRA, FMT = 9995 )NC, SNAME, + $ TRANSA, TRANSB, M, N, K, ALPHA, LDA, LDB, + $ BETA, LDC + IF( REWI ) + $ REWIND NTRA + CALL ZGEMM3M( TRANSA, TRANSB, M, N, K, ALPHA, + $ AA, LDA, BB, LDB, BETA, CC, LDC ) +* +* Check if error-exit was taken incorrectly. +* + IF( .NOT.OK )THEN + WRITE( NOUT, FMT = 9994 ) + FATAL = .TRUE. + GO TO 120 + END IF +* +* See what data changed inside subroutines. +* + ISAME( 1 ) = TRANSA.EQ.TRANAS + ISAME( 2 ) = TRANSB.EQ.TRANBS + ISAME( 3 ) = MS.EQ.M + ISAME( 4 ) = NS.EQ.N + ISAME( 5 ) = KS.EQ.K + ISAME( 6 ) = ALS.EQ.ALPHA + ISAME( 7 ) = LZE( AS, AA, LAA ) + ISAME( 8 ) = LDAS.EQ.LDA + ISAME( 9 ) = LZE( BS, BB, LBB ) + ISAME( 10 ) = LDBS.EQ.LDB + ISAME( 11 ) = BLS.EQ.BETA + IF( NULL )THEN + ISAME( 12 ) = LZE( CS, CC, LCC ) + ELSE + ISAME( 12 ) = LZERES( 'GE', ' ', M, N, CS, + $ CC, LDC ) + END IF + ISAME( 13 ) = LDCS.EQ.LDC +* +* If data was incorrectly changed, report +* and return. +* + SAME = .TRUE. + DO 40 I = 1, NARGS + SAME = SAME.AND.ISAME( I ) + IF( .NOT.ISAME( I ) ) + $ WRITE( NOUT, FMT = 9998 )I + 40 CONTINUE + IF( .NOT.SAME )THEN + FATAL = .TRUE. + GO TO 120 + END IF +* + IF( .NOT.NULL )THEN +* +* Check the result. +* + CALL ZMMCH( TRANSA, TRANSB, M, N, K, + $ ALPHA, A, NMAX, B, NMAX, BETA, + $ C, NMAX, CT, G, CC, LDC, EPS, + $ ERR, FATAL, NOUT, .TRUE. ) + ERRMAX = MAX( ERRMAX, ERR ) +* If got really bad answer, report and +* return. + IF( FATAL ) + $ GO TO 120 + END IF +* + 50 CONTINUE +* + 60 CONTINUE +* + 70 CONTINUE +* + 80 CONTINUE +* + 90 CONTINUE +* + 100 CONTINUE +* + 110 CONTINUE +* +* Report result. 
+* + IF( ERRMAX.LT.THRESH )THEN + WRITE( NOUT, FMT = 9999 )SNAME, NC + ELSE + WRITE( NOUT, FMT = 9997 )SNAME, NC, ERRMAX + END IF + GO TO 130 +* + 120 CONTINUE + WRITE( NOUT, FMT = 9996 )SNAME + WRITE( NOUT, FMT = 9995 )NC, SNAME, TRANSA, TRANSB, M, N, K, + $ ALPHA, LDA, LDB, BETA, LDC +* + 130 CONTINUE + RETURN +* + 9999 FORMAT( ' ', A8, ' PASSED THE COMPUTATIONAL TESTS (', I6, ' CALL', + $ 'S)' ) + 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', + $ 'ANGED INCORRECTLY *******' ) + 9997 FORMAT( ' ', A8, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C', + $ 'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2, + $ ' - SUSPECT *******' ) + 9996 FORMAT( ' ******* ', A8, ' FAILED ON CALL NUMBER:' ) + 9995 FORMAT( 1X, I6, ': ', A8, '(''', A1, ''',''', A1, ''',', + $ 3( I3, ',' ), '(', F4.1, ',', F4.1, '), A,', I3, ', B,', I3, + $ ',(', F4.1, ',', F4.1, '), C,', I3, ').' ) + 9994 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', + $ '******' ) +* +* End of ZCHK1. +* + END + SUBROUTINE ZCHK2( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, + $ FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, NMAX, + $ A, AA, AS, B, BB, BS, C, CC, CS, CT, G ) +* +* Tests ZHEMM3M and ZSYMM3M. +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Parameters .. + COMPLEX*16 ZERO + PARAMETER ( ZERO = ( 0.0D0, 0.0D0 ) ) + DOUBLE PRECISION RZERO + PARAMETER ( RZERO = 0.0D0 ) +* .. Scalar Arguments .. + DOUBLE PRECISION EPS, THRESH + INTEGER NALF, NBET, NIDIM, NMAX, NOUT, NTRA + LOGICAL FATAL, REWI, TRACE + CHARACTER*8 SNAME +* .. Array Arguments .. 
+ COMPLEX*16 A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), + $ AS( NMAX*NMAX ), B( NMAX, NMAX ), + $ BB( NMAX*NMAX ), BET( NBET ), BS( NMAX*NMAX ), + $ C( NMAX, NMAX ), CC( NMAX*NMAX ), + $ CS( NMAX*NMAX ), CT( NMAX ) + DOUBLE PRECISION G( NMAX ) + INTEGER IDIM( NIDIM ) +* .. Local Scalars .. + COMPLEX*16 ALPHA, ALS, BETA, BLS + DOUBLE PRECISION ERR, ERRMAX + INTEGER I, IA, IB, ICS, ICU, IM, IN, LAA, LBB, LCC, + $ LDA, LDAS, LDB, LDBS, LDC, LDCS, M, MS, N, NA, + $ NARGS, NC, NS + LOGICAL CONJ, LEFT, NULL, RESET, SAME + CHARACTER*1 SIDE, SIDES, UPLO, UPLOS + CHARACTER*2 ICHS, ICHU +* .. Local Arrays .. + LOGICAL ISAME( 13 ) +* .. External Functions .. + LOGICAL LZE, LZERES + EXTERNAL LZE, LZERES +* .. External Subroutines .. + EXTERNAL ZHEMM3M, ZMAKE, ZMMCH, ZSYMM3M +* .. Intrinsic Functions .. + INTRINSIC MAX +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL LERR, OK +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK, LERR +* .. Data statements .. + DATA ICHS/'LR'/, ICHU/'UL'/ +* .. Executable Statements .. + CONJ = SNAME( 2: 3 ).EQ.'HE' +* + NARGS = 12 + NC = 0 + RESET = .TRUE. + ERRMAX = RZERO +* + DO 100 IM = 1, NIDIM + M = IDIM( IM ) +* + DO 90 IN = 1, NIDIM + N = IDIM( IN ) +* Set LDC to 1 more than minimum value if room. + LDC = M + IF( LDC.LT.NMAX ) + $ LDC = LDC + 1 +* Skip tests if not enough room. + IF( LDC.GT.NMAX ) + $ GO TO 90 + LCC = LDC*N + NULL = N.LE.0.OR.M.LE.0 +* Set LDB to 1 more than minimum value if room. + LDB = M + IF( LDB.LT.NMAX ) + $ LDB = LDB + 1 +* Skip tests if not enough room. + IF( LDB.GT.NMAX ) + $ GO TO 90 + LBB = LDB*N +* +* Generate the matrix B. +* + CALL ZMAKE( 'GE', ' ', ' ', M, N, B, NMAX, BB, LDB, RESET, + $ ZERO ) +* + DO 80 ICS = 1, 2 + SIDE = ICHS( ICS: ICS ) + LEFT = SIDE.EQ.'L' +* + IF( LEFT )THEN + NA = M + ELSE + NA = N + END IF +* Set LDA to 1 more than minimum value if room. + LDA = NA + IF( LDA.LT.NMAX ) + $ LDA = LDA + 1 +* Skip tests if not enough room. 
+ IF( LDA.GT.NMAX ) + $ GO TO 80 + LAA = LDA*NA +* + DO 70 ICU = 1, 2 + UPLO = ICHU( ICU: ICU ) +* +* Generate the hermitian or symmetric matrix A. +* + CALL ZMAKE( SNAME( 2: 3 ), UPLO, ' ', NA, NA, A, NMAX, + $ AA, LDA, RESET, ZERO ) +* + DO 60 IA = 1, NALF + ALPHA = ALF( IA ) +* + DO 50 IB = 1, NBET + BETA = BET( IB ) +* +* Generate the matrix C. +* + CALL ZMAKE( 'GE', ' ', ' ', M, N, C, NMAX, CC, + $ LDC, RESET, ZERO ) +* + NC = NC + 1 +* +* Save every datum before calling the +* subroutine. +* + SIDES = SIDE + UPLOS = UPLO + MS = M + NS = N + ALS = ALPHA + DO 10 I = 1, LAA + AS( I ) = AA( I ) + 10 CONTINUE + LDAS = LDA + DO 20 I = 1, LBB + BS( I ) = BB( I ) + 20 CONTINUE + LDBS = LDB + BLS = BETA + DO 30 I = 1, LCC + CS( I ) = CC( I ) + 30 CONTINUE + LDCS = LDC +* +* Call the subroutine. +* + IF( TRACE ) + $ WRITE( NTRA, FMT = 9995 )NC, SNAME, SIDE, + $ UPLO, M, N, ALPHA, LDA, LDB, BETA, LDC + IF( REWI ) + $ REWIND NTRA + IF( CONJ )THEN + CALL ZHEMM3M( SIDE, UPLO, M, N, ALPHA, AA, LDA, + $ BB, LDB, BETA, CC, LDC ) + ELSE + CALL ZSYMM3M( SIDE, UPLO, M, N, ALPHA, AA, LDA, + $ BB, LDB, BETA, CC, LDC ) + END IF +* +* Check if error-exit was taken incorrectly. +* + IF( .NOT.OK )THEN + WRITE( NOUT, FMT = 9994 ) + FATAL = .TRUE. + GO TO 110 + END IF +* +* See what data changed inside subroutines. +* + ISAME( 1 ) = SIDES.EQ.SIDE + ISAME( 2 ) = UPLOS.EQ.UPLO + ISAME( 3 ) = MS.EQ.M + ISAME( 4 ) = NS.EQ.N + ISAME( 5 ) = ALS.EQ.ALPHA + ISAME( 6 ) = LZE( AS, AA, LAA ) + ISAME( 7 ) = LDAS.EQ.LDA + ISAME( 8 ) = LZE( BS, BB, LBB ) + ISAME( 9 ) = LDBS.EQ.LDB + ISAME( 10 ) = BLS.EQ.BETA + IF( NULL )THEN + ISAME( 11 ) = LZE( CS, CC, LCC ) + ELSE + ISAME( 11 ) = LZERES( 'GE', ' ', M, N, CS, + $ CC, LDC ) + END IF + ISAME( 12 ) = LDCS.EQ.LDC +* +* If data was incorrectly changed, report and +* return. +* + SAME = .TRUE. + DO 40 I = 1, NARGS + SAME = SAME.AND.ISAME( I ) + IF( .NOT.ISAME( I ) ) + $ WRITE( NOUT, FMT = 9998 )I + 40 CONTINUE + IF( .NOT.SAME )THEN + FATAL = .TRUE. 
+ GO TO 110 + END IF +* + IF( .NOT.NULL )THEN +* +* Check the result. +* + IF( LEFT )THEN + CALL ZMMCH( 'N', 'N', M, N, M, ALPHA, A, + $ NMAX, B, NMAX, BETA, C, NMAX, + $ CT, G, CC, LDC, EPS, ERR, + $ FATAL, NOUT, .TRUE. ) + ELSE + CALL ZMMCH( 'N', 'N', M, N, N, ALPHA, B, + $ NMAX, A, NMAX, BETA, C, NMAX, + $ CT, G, CC, LDC, EPS, ERR, + $ FATAL, NOUT, .TRUE. ) + END IF + ERRMAX = MAX( ERRMAX, ERR ) +* If got really bad answer, report and +* return. + IF( FATAL ) + $ GO TO 110 + END IF +* + 50 CONTINUE +* + 60 CONTINUE +* + 70 CONTINUE +* + 80 CONTINUE +* + 90 CONTINUE +* + 100 CONTINUE +* +* Report result. +* + IF( ERRMAX.LT.THRESH )THEN + WRITE( NOUT, FMT = 9999 )SNAME, NC + ELSE + WRITE( NOUT, FMT = 9997 )SNAME, NC, ERRMAX + END IF + GO TO 120 +* + 110 CONTINUE + WRITE( NOUT, FMT = 9996 )SNAME + WRITE( NOUT, FMT = 9995 )NC, SNAME, SIDE, UPLO, M, N, ALPHA, LDA, + $ LDB, BETA, LDC +* + 120 CONTINUE + RETURN +* + 9999 FORMAT( ' ', A6, ' PASSED THE COMPUTATIONAL TESTS (', I6, ' CALL', + $ 'S)' ) + 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', + $ 'ANGED INCORRECTLY *******' ) + 9997 FORMAT( ' ', A6, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C', + $ 'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2, + $ ' - SUSPECT *******' ) + 9996 FORMAT( ' ******* ', A6, ' FAILED ON CALL NUMBER:' ) + 9995 FORMAT( 1X, I6, ': ', A6, '(', 2( '''', A1, ''',' ), 2( I3, ',' ), + $ '(', F4.1, ',', F4.1, '), A,', I3, ', B,', I3, ',(', F4.1, + $ ',', F4.1, '), C,', I3, ') .' ) + 9994 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', + $ '******' ) +* +* End of ZCHK2. +* + END + SUBROUTINE ZCHK3( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, + $ FATAL, NIDIM, IDIM, NALF, ALF, NMAX, A, AA, AS, + $ B, BB, BS, CT, G, C ) +* +* Tests ZTRMM and ZTRSM. +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. 
+* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Parameters .. + COMPLEX*16 ZERO, ONE + PARAMETER ( ZERO = ( 0.0D0, 0.0D0 ), + $ ONE = ( 1.0D0, 0.0D0 ) ) + DOUBLE PRECISION RZERO + PARAMETER ( RZERO = 0.0D0 ) +* .. Scalar Arguments .. + DOUBLE PRECISION EPS, THRESH + INTEGER NALF, NIDIM, NMAX, NOUT, NTRA + LOGICAL FATAL, REWI, TRACE + CHARACTER*8 SNAME +* .. Array Arguments .. + COMPLEX*16 A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), + $ AS( NMAX*NMAX ), B( NMAX, NMAX ), + $ BB( NMAX*NMAX ), BS( NMAX*NMAX ), + $ C( NMAX, NMAX ), CT( NMAX ) + DOUBLE PRECISION G( NMAX ) + INTEGER IDIM( NIDIM ) +* .. Local Scalars .. + COMPLEX*16 ALPHA, ALS + DOUBLE PRECISION ERR, ERRMAX + INTEGER I, IA, ICD, ICS, ICT, ICU, IM, IN, J, LAA, LBB, + $ LDA, LDAS, LDB, LDBS, M, MS, N, NA, NARGS, NC, + $ NS + LOGICAL LEFT, NULL, RESET, SAME + CHARACTER*1 DIAG, DIAGS, SIDE, SIDES, TRANAS, TRANSA, UPLO, + $ UPLOS + CHARACTER*2 ICHD, ICHS, ICHU + CHARACTER*3 ICHT +* .. Local Arrays .. + LOGICAL ISAME( 13 ) +* .. External Functions .. + LOGICAL LZE, LZERES + EXTERNAL LZE, LZERES +* .. External Subroutines .. + EXTERNAL ZMAKE, ZMMCH, ZTRMM, ZTRSM +* .. Intrinsic Functions .. + INTRINSIC MAX +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL LERR, OK +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK, LERR +* .. Data statements .. + DATA ICHU/'UL'/, ICHT/'NTC'/, ICHD/'UN'/, ICHS/'LR'/ +* .. Executable Statements .. +* + NARGS = 11 + NC = 0 + RESET = .TRUE. + ERRMAX = RZERO +* Set up zero matrix for ZMMCH. + DO 20 J = 1, NMAX + DO 10 I = 1, NMAX + C( I, J ) = ZERO + 10 CONTINUE + 20 CONTINUE +* + DO 140 IM = 1, NIDIM + M = IDIM( IM ) +* + DO 130 IN = 1, NIDIM + N = IDIM( IN ) +* Set LDB to 1 more than minimum value if room. + LDB = M + IF( LDB.LT.NMAX ) + $ LDB = LDB + 1 +* Skip tests if not enough room. 
+ IF( LDB.GT.NMAX ) + $ GO TO 130 + LBB = LDB*N + NULL = M.LE.0.OR.N.LE.0 +* + DO 120 ICS = 1, 2 + SIDE = ICHS( ICS: ICS ) + LEFT = SIDE.EQ.'L' + IF( LEFT )THEN + NA = M + ELSE + NA = N + END IF +* Set LDA to 1 more than minimum value if room. + LDA = NA + IF( LDA.LT.NMAX ) + $ LDA = LDA + 1 +* Skip tests if not enough room. + IF( LDA.GT.NMAX ) + $ GO TO 130 + LAA = LDA*NA +* + DO 110 ICU = 1, 2 + UPLO = ICHU( ICU: ICU ) +* + DO 100 ICT = 1, 3 + TRANSA = ICHT( ICT: ICT ) +* + DO 90 ICD = 1, 2 + DIAG = ICHD( ICD: ICD ) +* + DO 80 IA = 1, NALF + ALPHA = ALF( IA ) +* +* Generate the matrix A. +* + CALL ZMAKE( 'TR', UPLO, DIAG, NA, NA, A, + $ NMAX, AA, LDA, RESET, ZERO ) +* +* Generate the matrix B. +* + CALL ZMAKE( 'GE', ' ', ' ', M, N, B, NMAX, + $ BB, LDB, RESET, ZERO ) +* + NC = NC + 1 +* +* Save every datum before calling the +* subroutine. +* + SIDES = SIDE + UPLOS = UPLO + TRANAS = TRANSA + DIAGS = DIAG + MS = M + NS = N + ALS = ALPHA + DO 30 I = 1, LAA + AS( I ) = AA( I ) + 30 CONTINUE + LDAS = LDA + DO 40 I = 1, LBB + BS( I ) = BB( I ) + 40 CONTINUE + LDBS = LDB +* +* Call the subroutine. +* + IF( SNAME( 4: 5 ).EQ.'MM' )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9995 )NC, SNAME, + $ SIDE, UPLO, TRANSA, DIAG, M, N, ALPHA, + $ LDA, LDB + IF( REWI ) + $ REWIND NTRA + CALL ZTRMM( SIDE, UPLO, TRANSA, DIAG, M, + $ N, ALPHA, AA, LDA, BB, LDB ) + ELSE IF( SNAME( 4: 5 ).EQ.'SM' )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9995 )NC, SNAME, + $ SIDE, UPLO, TRANSA, DIAG, M, N, ALPHA, + $ LDA, LDB + IF( REWI ) + $ REWIND NTRA + CALL ZTRSM( SIDE, UPLO, TRANSA, DIAG, M, + $ N, ALPHA, AA, LDA, BB, LDB ) + END IF +* +* Check if error-exit was taken incorrectly. +* + IF( .NOT.OK )THEN + WRITE( NOUT, FMT = 9994 ) + FATAL = .TRUE. + GO TO 150 + END IF +* +* See what data changed inside subroutines. 
+* + ISAME( 1 ) = SIDES.EQ.SIDE + ISAME( 2 ) = UPLOS.EQ.UPLO + ISAME( 3 ) = TRANAS.EQ.TRANSA + ISAME( 4 ) = DIAGS.EQ.DIAG + ISAME( 5 ) = MS.EQ.M + ISAME( 6 ) = NS.EQ.N + ISAME( 7 ) = ALS.EQ.ALPHA + ISAME( 8 ) = LZE( AS, AA, LAA ) + ISAME( 9 ) = LDAS.EQ.LDA + IF( NULL )THEN + ISAME( 10 ) = LZE( BS, BB, LBB ) + ELSE + ISAME( 10 ) = LZERES( 'GE', ' ', M, N, BS, + $ BB, LDB ) + END IF + ISAME( 11 ) = LDBS.EQ.LDB +* +* If data was incorrectly changed, report and +* return. +* + SAME = .TRUE. + DO 50 I = 1, NARGS + SAME = SAME.AND.ISAME( I ) + IF( .NOT.ISAME( I ) ) + $ WRITE( NOUT, FMT = 9998 )I + 50 CONTINUE + IF( .NOT.SAME )THEN + FATAL = .TRUE. + GO TO 150 + END IF +* + IF( .NOT.NULL )THEN + IF( SNAME( 4: 5 ).EQ.'MM' )THEN +* +* Check the result. +* + IF( LEFT )THEN + CALL ZMMCH( TRANSA, 'N', M, N, M, + $ ALPHA, A, NMAX, B, NMAX, + $ ZERO, C, NMAX, CT, G, + $ BB, LDB, EPS, ERR, + $ FATAL, NOUT, .TRUE. ) + ELSE + CALL ZMMCH( 'N', TRANSA, M, N, N, + $ ALPHA, B, NMAX, A, NMAX, + $ ZERO, C, NMAX, CT, G, + $ BB, LDB, EPS, ERR, + $ FATAL, NOUT, .TRUE. ) + END IF + ELSE IF( SNAME( 4: 5 ).EQ.'SM' )THEN +* +* Compute approximation to original +* matrix. +* + DO 70 J = 1, N + DO 60 I = 1, M + C( I, J ) = BB( I + ( J - 1 )* + $ LDB ) + BB( I + ( J - 1 )*LDB ) = ALPHA* + $ B( I, J ) + 60 CONTINUE + 70 CONTINUE +* + IF( LEFT )THEN + CALL ZMMCH( TRANSA, 'N', M, N, M, + $ ONE, A, NMAX, C, NMAX, + $ ZERO, B, NMAX, CT, G, + $ BB, LDB, EPS, ERR, + $ FATAL, NOUT, .FALSE. ) + ELSE + CALL ZMMCH( 'N', TRANSA, M, N, N, + $ ONE, C, NMAX, A, NMAX, + $ ZERO, B, NMAX, CT, G, + $ BB, LDB, EPS, ERR, + $ FATAL, NOUT, .FALSE. ) + END IF + END IF + ERRMAX = MAX( ERRMAX, ERR ) +* If got really bad answer, report and +* return. + IF( FATAL ) + $ GO TO 150 + END IF +* + 80 CONTINUE +* + 90 CONTINUE +* + 100 CONTINUE +* + 110 CONTINUE +* + 120 CONTINUE +* + 130 CONTINUE +* + 140 CONTINUE +* +* Report result. 
+* + IF( ERRMAX.LT.THRESH )THEN + WRITE( NOUT, FMT = 9999 )SNAME, NC + ELSE + WRITE( NOUT, FMT = 9997 )SNAME, NC, ERRMAX + END IF + GO TO 160 +* + 150 CONTINUE + WRITE( NOUT, FMT = 9996 )SNAME + WRITE( NOUT, FMT = 9995 )NC, SNAME, SIDE, UPLO, TRANSA, DIAG, M, + $ N, ALPHA, LDA, LDB +* + 160 CONTINUE + RETURN +* + 9999 FORMAT( ' ', A6, ' PASSED THE COMPUTATIONAL TESTS (', I6, ' CALL', + $ 'S)' ) + 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', + $ 'ANGED INCORRECTLY *******' ) + 9997 FORMAT( ' ', A6, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C', + $ 'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2, + $ ' - SUSPECT *******' ) + 9996 FORMAT( ' ******* ', A6, ' FAILED ON CALL NUMBER:' ) + 9995 FORMAT( 1X, I6, ': ', A6, '(', 4( '''', A1, ''',' ), 2( I3, ',' ), + $ '(', F4.1, ',', F4.1, '), A,', I3, ', B,', I3, ') ', + $ ' .' ) + 9994 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', + $ '******' ) +* +* End of ZCHK3. +* + END + SUBROUTINE ZCHK4( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, + $ FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, NMAX, + $ A, AA, AS, B, BB, BS, C, CC, CS, CT, G ) +* +* Tests ZHERK and ZSYRK. +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Parameters .. + COMPLEX*16 ZERO + PARAMETER ( ZERO = ( 0.0D0, 0.0D0 ) ) + DOUBLE PRECISION RONE, RZERO + PARAMETER ( RONE = 1.0D0, RZERO = 0.0D0 ) +* .. Scalar Arguments .. + DOUBLE PRECISION EPS, THRESH + INTEGER NALF, NBET, NIDIM, NMAX, NOUT, NTRA + LOGICAL FATAL, REWI, TRACE + CHARACTER*8 SNAME +* .. Array Arguments .. 
+ COMPLEX*16 A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), + $ AS( NMAX*NMAX ), B( NMAX, NMAX ), + $ BB( NMAX*NMAX ), BET( NBET ), BS( NMAX*NMAX ), + $ C( NMAX, NMAX ), CC( NMAX*NMAX ), + $ CS( NMAX*NMAX ), CT( NMAX ) + DOUBLE PRECISION G( NMAX ) + INTEGER IDIM( NIDIM ) +* .. Local Scalars .. + COMPLEX*16 ALPHA, ALS, BETA, BETS + DOUBLE PRECISION ERR, ERRMAX, RALPHA, RALS, RBETA, RBETS + INTEGER I, IA, IB, ICT, ICU, IK, IN, J, JC, JJ, K, KS, + $ LAA, LCC, LDA, LDAS, LDC, LDCS, LJ, MA, N, NA, + $ NARGS, NC, NS + LOGICAL CONJ, NULL, RESET, SAME, TRAN, UPPER + CHARACTER*1 TRANS, TRANSS, TRANST, UPLO, UPLOS + CHARACTER*2 ICHT, ICHU +* .. Local Arrays .. + LOGICAL ISAME( 13 ) +* .. External Functions .. + LOGICAL LZE, LZERES + EXTERNAL LZE, LZERES +* .. External Subroutines .. + EXTERNAL ZHERK, ZMAKE, ZMMCH, ZSYRK +* .. Intrinsic Functions .. + INTRINSIC DCMPLX, MAX, DBLE +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL LERR, OK +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK, LERR +* .. Data statements .. + DATA ICHT/'NC'/, ICHU/'UL'/ +* .. Executable Statements .. + CONJ = SNAME( 2: 3 ).EQ.'HE' +* + NARGS = 10 + NC = 0 + RESET = .TRUE. + ERRMAX = RZERO + RALS = RONE + RBETS = RONE +* + DO 100 IN = 1, NIDIM + N = IDIM( IN ) +* Set LDC to 1 more than minimum value if room. + LDC = N + IF( LDC.LT.NMAX ) + $ LDC = LDC + 1 +* Skip tests if not enough room. + IF( LDC.GT.NMAX ) + $ GO TO 100 + LCC = LDC*N +* + DO 90 IK = 1, NIDIM + K = IDIM( IK ) +* + DO 80 ICT = 1, 2 + TRANS = ICHT( ICT: ICT ) + TRAN = TRANS.EQ.'C' + IF( TRAN.AND..NOT.CONJ ) + $ TRANS = 'T' + IF( TRAN )THEN + MA = K + NA = N + ELSE + MA = N + NA = K + END IF +* Set LDA to 1 more than minimum value if room. + LDA = MA + IF( LDA.LT.NMAX ) + $ LDA = LDA + 1 +* Skip tests if not enough room. + IF( LDA.GT.NMAX ) + $ GO TO 80 + LAA = LDA*NA +* +* Generate the matrix A. 
+* + CALL ZMAKE( 'GE', ' ', ' ', MA, NA, A, NMAX, AA, LDA, + $ RESET, ZERO ) +* + DO 70 ICU = 1, 2 + UPLO = ICHU( ICU: ICU ) + UPPER = UPLO.EQ.'U' +* + DO 60 IA = 1, NALF + ALPHA = ALF( IA ) + IF( CONJ )THEN + RALPHA = DBLE( ALPHA ) + ALPHA = DCMPLX( RALPHA, RZERO ) + END IF +* + DO 50 IB = 1, NBET + BETA = BET( IB ) + IF( CONJ )THEN + RBETA = DBLE( BETA ) + BETA = DCMPLX( RBETA, RZERO ) + END IF + NULL = N.LE.0 + IF( CONJ ) + $ NULL = NULL.OR.( ( K.LE.0.OR.RALPHA.EQ. + $ RZERO ).AND.RBETA.EQ.RONE ) +* +* Generate the matrix C. +* + CALL ZMAKE( SNAME( 2: 3 ), UPLO, ' ', N, N, C, + $ NMAX, CC, LDC, RESET, ZERO ) +* + NC = NC + 1 +* +* Save every datum before calling the subroutine. +* + UPLOS = UPLO + TRANSS = TRANS + NS = N + KS = K + IF( CONJ )THEN + RALS = RALPHA + ELSE + ALS = ALPHA + END IF + DO 10 I = 1, LAA + AS( I ) = AA( I ) + 10 CONTINUE + LDAS = LDA + IF( CONJ )THEN + RBETS = RBETA + ELSE + BETS = BETA + END IF + DO 20 I = 1, LCC + CS( I ) = CC( I ) + 20 CONTINUE + LDCS = LDC +* +* Call the subroutine. +* + IF( CONJ )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9994 )NC, SNAME, UPLO, + $ TRANS, N, K, RALPHA, LDA, RBETA, LDC + IF( REWI ) + $ REWIND NTRA + CALL ZHERK( UPLO, TRANS, N, K, RALPHA, AA, + $ LDA, RBETA, CC, LDC ) + ELSE + IF( TRACE ) + $ WRITE( NTRA, FMT = 9993 )NC, SNAME, UPLO, + $ TRANS, N, K, ALPHA, LDA, BETA, LDC + IF( REWI ) + $ REWIND NTRA + CALL ZSYRK( UPLO, TRANS, N, K, ALPHA, AA, + $ LDA, BETA, CC, LDC ) + END IF +* +* Check if error-exit was taken incorrectly. +* + IF( .NOT.OK )THEN + WRITE( NOUT, FMT = 9992 ) + FATAL = .TRUE. + GO TO 120 + END IF +* +* See what data changed inside subroutines. 
+* + ISAME( 1 ) = UPLOS.EQ.UPLO + ISAME( 2 ) = TRANSS.EQ.TRANS + ISAME( 3 ) = NS.EQ.N + ISAME( 4 ) = KS.EQ.K + IF( CONJ )THEN + ISAME( 5 ) = RALS.EQ.RALPHA + ELSE + ISAME( 5 ) = ALS.EQ.ALPHA + END IF + ISAME( 6 ) = LZE( AS, AA, LAA ) + ISAME( 7 ) = LDAS.EQ.LDA + IF( CONJ )THEN + ISAME( 8 ) = RBETS.EQ.RBETA + ELSE + ISAME( 8 ) = BETS.EQ.BETA + END IF + IF( NULL )THEN + ISAME( 9 ) = LZE( CS, CC, LCC ) + ELSE + ISAME( 9 ) = LZERES( SNAME( 2: 3 ), UPLO, N, + $ N, CS, CC, LDC ) + END IF + ISAME( 10 ) = LDCS.EQ.LDC +* +* If data was incorrectly changed, report and +* return. +* + SAME = .TRUE. + DO 30 I = 1, NARGS + SAME = SAME.AND.ISAME( I ) + IF( .NOT.ISAME( I ) ) + $ WRITE( NOUT, FMT = 9998 )I + 30 CONTINUE + IF( .NOT.SAME )THEN + FATAL = .TRUE. + GO TO 120 + END IF +* + IF( .NOT.NULL )THEN +* +* Check the result column by column. +* + IF( CONJ )THEN + TRANST = 'C' + ELSE + TRANST = 'T' + END IF + JC = 1 + DO 40 J = 1, N + IF( UPPER )THEN + JJ = 1 + LJ = J + ELSE + JJ = J + LJ = N - J + 1 + END IF + IF( TRAN )THEN + CALL ZMMCH( TRANST, 'N', LJ, 1, K, + $ ALPHA, A( 1, JJ ), NMAX, + $ A( 1, J ), NMAX, BETA, + $ C( JJ, J ), NMAX, CT, G, + $ CC( JC ), LDC, EPS, ERR, + $ FATAL, NOUT, .TRUE. ) + ELSE + CALL ZMMCH( 'N', TRANST, LJ, 1, K, + $ ALPHA, A( JJ, 1 ), NMAX, + $ A( J, 1 ), NMAX, BETA, + $ C( JJ, J ), NMAX, CT, G, + $ CC( JC ), LDC, EPS, ERR, + $ FATAL, NOUT, .TRUE. ) + END IF + IF( UPPER )THEN + JC = JC + LDC + ELSE + JC = JC + LDC + 1 + END IF + ERRMAX = MAX( ERRMAX, ERR ) +* If got really bad answer, report and +* return. + IF( FATAL ) + $ GO TO 110 + 40 CONTINUE + END IF +* + 50 CONTINUE +* + 60 CONTINUE +* + 70 CONTINUE +* + 80 CONTINUE +* + 90 CONTINUE +* + 100 CONTINUE +* +* Report result. 
+* + IF( ERRMAX.LT.THRESH )THEN + WRITE( NOUT, FMT = 9999 )SNAME, NC + ELSE + WRITE( NOUT, FMT = 9997 )SNAME, NC, ERRMAX + END IF + GO TO 130 +* + 110 CONTINUE + IF( N.GT.1 ) + $ WRITE( NOUT, FMT = 9995 )J +* + 120 CONTINUE + WRITE( NOUT, FMT = 9996 )SNAME + IF( CONJ )THEN + WRITE( NOUT, FMT = 9994 )NC, SNAME, UPLO, TRANS, N, K, RALPHA, + $ LDA, RBETA, LDC + ELSE + WRITE( NOUT, FMT = 9993 )NC, SNAME, UPLO, TRANS, N, K, ALPHA, + $ LDA, BETA, LDC + END IF +* + 130 CONTINUE + RETURN +* + 9999 FORMAT( ' ', A6, ' PASSED THE COMPUTATIONAL TESTS (', I6, ' CALL', + $ 'S)' ) + 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', + $ 'ANGED INCORRECTLY *******' ) + 9997 FORMAT( ' ', A6, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C', + $ 'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2, + $ ' - SUSPECT *******' ) + 9996 FORMAT( ' ******* ', A6, ' FAILED ON CALL NUMBER:' ) + 9995 FORMAT( ' THESE ARE THE RESULTS FOR COLUMN ', I3 ) + 9994 FORMAT( 1X, I6, ': ', A6, '(', 2( '''', A1, ''',' ), 2( I3, ',' ), + $ F4.1, ', A,', I3, ',', F4.1, ', C,', I3, ') ', + $ ' .' ) + 9993 FORMAT( 1X, I6, ': ', A6, '(', 2( '''', A1, ''',' ), 2( I3, ',' ), + $ '(', F4.1, ',', F4.1, ') , A,', I3, ',(', F4.1, ',', F4.1, + $ '), C,', I3, ') .' ) + 9992 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', + $ '******' ) +* +* End of ZCHK4. +* + END + SUBROUTINE ZCHK5( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, + $ FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, NMAX, + $ AB, AA, AS, BB, BS, C, CC, CS, CT, G, W ) +* +* Tests ZHER2K and ZSYR2K. +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Parameters .. 
+ COMPLEX*16 ZERO, ONE + PARAMETER ( ZERO = ( 0.0D0, 0.0D0 ), + $ ONE = ( 1.0D0, 0.0D0 ) ) + DOUBLE PRECISION RONE, RZERO + PARAMETER ( RONE = 1.0D0, RZERO = 0.0D0 ) +* .. Scalar Arguments .. + DOUBLE PRECISION EPS, THRESH + INTEGER NALF, NBET, NIDIM, NMAX, NOUT, NTRA + LOGICAL FATAL, REWI, TRACE + CHARACTER*8 SNAME +* .. Array Arguments .. + COMPLEX*16 AA( NMAX*NMAX ), AB( 2*NMAX*NMAX ), + $ ALF( NALF ), AS( NMAX*NMAX ), BB( NMAX*NMAX ), + $ BET( NBET ), BS( NMAX*NMAX ), C( NMAX, NMAX ), + $ CC( NMAX*NMAX ), CS( NMAX*NMAX ), CT( NMAX ), + $ W( 2*NMAX ) + DOUBLE PRECISION G( NMAX ) + INTEGER IDIM( NIDIM ) +* .. Local Scalars .. + COMPLEX*16 ALPHA, ALS, BETA, BETS + DOUBLE PRECISION ERR, ERRMAX, RBETA, RBETS + INTEGER I, IA, IB, ICT, ICU, IK, IN, J, JC, JJ, JJAB, + $ K, KS, LAA, LBB, LCC, LDA, LDAS, LDB, LDBS, + $ LDC, LDCS, LJ, MA, N, NA, NARGS, NC, NS + LOGICAL CONJ, NULL, RESET, SAME, TRAN, UPPER + CHARACTER*1 TRANS, TRANSS, TRANST, UPLO, UPLOS + CHARACTER*2 ICHT, ICHU +* .. Local Arrays .. + LOGICAL ISAME( 13 ) +* .. External Functions .. + LOGICAL LZE, LZERES + EXTERNAL LZE, LZERES +* .. External Subroutines .. + EXTERNAL ZHER2K, ZMAKE, ZMMCH, ZSYR2K +* .. Intrinsic Functions .. + INTRINSIC DCMPLX, DCONJG, MAX, DBLE +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL LERR, OK +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK, LERR +* .. Data statements .. + DATA ICHT/'NC'/, ICHU/'UL'/ +* .. Executable Statements .. + CONJ = SNAME( 2: 3 ).EQ.'HE' +* + NARGS = 12 + NC = 0 + RESET = .TRUE. + ERRMAX = RZERO +* + DO 130 IN = 1, NIDIM + N = IDIM( IN ) +* Set LDC to 1 more than minimum value if room. + LDC = N + IF( LDC.LT.NMAX ) + $ LDC = LDC + 1 +* Skip tests if not enough room. 
+ IF( LDC.GT.NMAX ) + $ GO TO 130 + LCC = LDC*N +* + DO 120 IK = 1, NIDIM + K = IDIM( IK ) +* + DO 110 ICT = 1, 2 + TRANS = ICHT( ICT: ICT ) + TRAN = TRANS.EQ.'C' + IF( TRAN.AND..NOT.CONJ ) + $ TRANS = 'T' + IF( TRAN )THEN + MA = K + NA = N + ELSE + MA = N + NA = K + END IF +* Set LDA to 1 more than minimum value if room. + LDA = MA + IF( LDA.LT.NMAX ) + $ LDA = LDA + 1 +* Skip tests if not enough room. + IF( LDA.GT.NMAX ) + $ GO TO 110 + LAA = LDA*NA +* +* Generate the matrix A. +* + IF( TRAN )THEN + CALL ZMAKE( 'GE', ' ', ' ', MA, NA, AB, 2*NMAX, AA, + $ LDA, RESET, ZERO ) + ELSE + CALL ZMAKE( 'GE', ' ', ' ', MA, NA, AB, NMAX, AA, LDA, + $ RESET, ZERO ) + END IF +* +* Generate the matrix B. +* + LDB = LDA + LBB = LAA + IF( TRAN )THEN + CALL ZMAKE( 'GE', ' ', ' ', MA, NA, AB( K + 1 ), + $ 2*NMAX, BB, LDB, RESET, ZERO ) + ELSE + CALL ZMAKE( 'GE', ' ', ' ', MA, NA, AB( K*NMAX + 1 ), + $ NMAX, BB, LDB, RESET, ZERO ) + END IF +* + DO 100 ICU = 1, 2 + UPLO = ICHU( ICU: ICU ) + UPPER = UPLO.EQ.'U' +* + DO 90 IA = 1, NALF + ALPHA = ALF( IA ) +* + DO 80 IB = 1, NBET + BETA = BET( IB ) + IF( CONJ )THEN + RBETA = DBLE( BETA ) + BETA = DCMPLX( RBETA, RZERO ) + END IF + NULL = N.LE.0 + IF( CONJ ) + $ NULL = NULL.OR.( ( K.LE.0.OR.ALPHA.EQ. + $ ZERO ).AND.RBETA.EQ.RONE ) +* +* Generate the matrix C. +* + CALL ZMAKE( SNAME( 2: 3 ), UPLO, ' ', N, N, C, + $ NMAX, CC, LDC, RESET, ZERO ) +* + NC = NC + 1 +* +* Save every datum before calling the subroutine. +* + UPLOS = UPLO + TRANSS = TRANS + NS = N + KS = K + ALS = ALPHA + DO 10 I = 1, LAA + AS( I ) = AA( I ) + 10 CONTINUE + LDAS = LDA + DO 20 I = 1, LBB + BS( I ) = BB( I ) + 20 CONTINUE + LDBS = LDB + IF( CONJ )THEN + RBETS = RBETA + ELSE + BETS = BETA + END IF + DO 30 I = 1, LCC + CS( I ) = CC( I ) + 30 CONTINUE + LDCS = LDC +* +* Call the subroutine. 
+* + IF( CONJ )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9994 )NC, SNAME, UPLO, + $ TRANS, N, K, ALPHA, LDA, LDB, RBETA, LDC + IF( REWI ) + $ REWIND NTRA + CALL ZHER2K( UPLO, TRANS, N, K, ALPHA, AA, + $ LDA, BB, LDB, RBETA, CC, LDC ) + ELSE + IF( TRACE ) + $ WRITE( NTRA, FMT = 9993 )NC, SNAME, UPLO, + $ TRANS, N, K, ALPHA, LDA, LDB, BETA, LDC + IF( REWI ) + $ REWIND NTRA + CALL ZSYR2K( UPLO, TRANS, N, K, ALPHA, AA, + $ LDA, BB, LDB, BETA, CC, LDC ) + END IF +* +* Check if error-exit was taken incorrectly. +* + IF( .NOT.OK )THEN + WRITE( NOUT, FMT = 9992 ) + FATAL = .TRUE. + GO TO 150 + END IF +* +* See what data changed inside subroutines. +* + ISAME( 1 ) = UPLOS.EQ.UPLO + ISAME( 2 ) = TRANSS.EQ.TRANS + ISAME( 3 ) = NS.EQ.N + ISAME( 4 ) = KS.EQ.K + ISAME( 5 ) = ALS.EQ.ALPHA + ISAME( 6 ) = LZE( AS, AA, LAA ) + ISAME( 7 ) = LDAS.EQ.LDA + ISAME( 8 ) = LZE( BS, BB, LBB ) + ISAME( 9 ) = LDBS.EQ.LDB + IF( CONJ )THEN + ISAME( 10 ) = RBETS.EQ.RBETA + ELSE + ISAME( 10 ) = BETS.EQ.BETA + END IF + IF( NULL )THEN + ISAME( 11 ) = LZE( CS, CC, LCC ) + ELSE + ISAME( 11 ) = LZERES( 'HE', UPLO, N, N, CS, + $ CC, LDC ) + END IF + ISAME( 12 ) = LDCS.EQ.LDC +* +* If data was incorrectly changed, report and +* return. +* + SAME = .TRUE. + DO 40 I = 1, NARGS + SAME = SAME.AND.ISAME( I ) + IF( .NOT.ISAME( I ) ) + $ WRITE( NOUT, FMT = 9998 )I + 40 CONTINUE + IF( .NOT.SAME )THEN + FATAL = .TRUE. + GO TO 150 + END IF +* + IF( .NOT.NULL )THEN +* +* Check the result column by column. 
+* + IF( CONJ )THEN + TRANST = 'C' + ELSE + TRANST = 'T' + END IF + JJAB = 1 + JC = 1 + DO 70 J = 1, N + IF( UPPER )THEN + JJ = 1 + LJ = J + ELSE + JJ = J + LJ = N - J + 1 + END IF + IF( TRAN )THEN + DO 50 I = 1, K + W( I ) = ALPHA*AB( ( J - 1 )*2* + $ NMAX + K + I ) + IF( CONJ )THEN + W( K + I ) = DCONJG( ALPHA )* + $ AB( ( J - 1 )*2* + $ NMAX + I ) + ELSE + W( K + I ) = ALPHA* + $ AB( ( J - 1 )*2* + $ NMAX + I ) + END IF + 50 CONTINUE + CALL ZMMCH( TRANST, 'N', LJ, 1, 2*K, + $ ONE, AB( JJAB ), 2*NMAX, W, + $ 2*NMAX, BETA, C( JJ, J ), + $ NMAX, CT, G, CC( JC ), LDC, + $ EPS, ERR, FATAL, NOUT, + $ .TRUE. ) + ELSE + DO 60 I = 1, K + IF( CONJ )THEN + W( I ) = ALPHA*DCONJG( AB( ( K + + $ I - 1 )*NMAX + J ) ) + W( K + I ) = DCONJG( ALPHA* + $ AB( ( I - 1 )*NMAX + + $ J ) ) + ELSE + W( I ) = ALPHA*AB( ( K + I - 1 )* + $ NMAX + J ) + W( K + I ) = ALPHA* + $ AB( ( I - 1 )*NMAX + + $ J ) + END IF + 60 CONTINUE + CALL ZMMCH( 'N', 'N', LJ, 1, 2*K, ONE, + $ AB( JJ ), NMAX, W, 2*NMAX, + $ BETA, C( JJ, J ), NMAX, CT, + $ G, CC( JC ), LDC, EPS, ERR, + $ FATAL, NOUT, .TRUE. ) + END IF + IF( UPPER )THEN + JC = JC + LDC + ELSE + JC = JC + LDC + 1 + IF( TRAN ) + $ JJAB = JJAB + 2*NMAX + END IF + ERRMAX = MAX( ERRMAX, ERR ) +* If got really bad answer, report and +* return. + IF( FATAL ) + $ GO TO 140 + 70 CONTINUE + END IF +* + 80 CONTINUE +* + 90 CONTINUE +* + 100 CONTINUE +* + 110 CONTINUE +* + 120 CONTINUE +* + 130 CONTINUE +* +* Report result. 
+* + IF( ERRMAX.LT.THRESH )THEN + WRITE( NOUT, FMT = 9999 )SNAME, NC + ELSE + WRITE( NOUT, FMT = 9997 )SNAME, NC, ERRMAX + END IF + GO TO 160 +* + 140 CONTINUE + IF( N.GT.1 ) + $ WRITE( NOUT, FMT = 9995 )J +* + 150 CONTINUE + WRITE( NOUT, FMT = 9996 )SNAME + IF( CONJ )THEN + WRITE( NOUT, FMT = 9994 )NC, SNAME, UPLO, TRANS, N, K, ALPHA, + $ LDA, LDB, RBETA, LDC + ELSE + WRITE( NOUT, FMT = 9993 )NC, SNAME, UPLO, TRANS, N, K, ALPHA, + $ LDA, LDB, BETA, LDC + END IF +* + 160 CONTINUE + RETURN +* + 9999 FORMAT( ' ', A6, ' PASSED THE COMPUTATIONAL TESTS (', I6, ' CALL', + $ 'S)' ) + 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', + $ 'ANGED INCORRECTLY *******' ) + 9997 FORMAT( ' ', A6, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C', + $ 'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2, + $ ' - SUSPECT *******' ) + 9996 FORMAT( ' ******* ', A6, ' FAILED ON CALL NUMBER:' ) + 9995 FORMAT( ' THESE ARE THE RESULTS FOR COLUMN ', I3 ) + 9994 FORMAT( 1X, I6, ': ', A6, '(', 2( '''', A1, ''',' ), 2( I3, ',' ), + $ '(', F4.1, ',', F4.1, '), A,', I3, ', B,', I3, ',', F4.1, + $ ', C,', I3, ') .' ) + 9993 FORMAT( 1X, I6, ': ', A6, '(', 2( '''', A1, ''',' ), 2( I3, ',' ), + $ '(', F4.1, ',', F4.1, '), A,', I3, ', B,', I3, ',(', F4.1, + $ ',', F4.1, '), C,', I3, ') .' ) + 9992 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', + $ '******' ) +* +* End of ZCHK5. +* + END + SUBROUTINE ZCHKE( ISNUM, SRNAMT, NOUT ) +* +* Tests the error exits from the Level 3 Blas. +* Requires a special version of the error-handling routine XERBLA. +* ALPHA, RALPHA, BETA, RBETA, A, B and C should not need to be defined. +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Scalar Arguments .. 
+ INTEGER ISNUM, NOUT + CHARACTER*8 SRNAMT +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL LERR, OK +* .. Local Scalars .. + COMPLEX*16 ALPHA, BETA + DOUBLE PRECISION RALPHA, RBETA +* .. Local Arrays .. + COMPLEX*16 A( 2, 1 ), B( 2, 1 ), C( 2, 1 ) +* .. External Subroutines .. + EXTERNAL ZGEMM3M, ZHEMM3M, ZHER2K, ZHERK, CHKXER, ZSYMM3M, + $ ZSYR2K, ZSYRK, ZTRMM, ZTRSM +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK, LERR +* .. Executable Statements .. +* OK is set to .FALSE. by the special version of XERBLA or by CHKXER +* if anything is wrong. + OK = .TRUE. +* LERR is set to .TRUE. by the special version of XERBLA each time +* it is called, and is then tested and re-set by CHKXER. + LERR = .FALSE. + GO TO ( 10, 20, 30, 40, 50, 60, 70, 80, + $ 90 )ISNUM + 10 INFOT = 1 + CALL ZGEMM3M( '/', 'N', 0, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 1 + CALL ZGEMM3M( '/', 'C', 0, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 1 + CALL ZGEMM3M( '/', 'T', 0, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL ZGEMM3M( 'N', '/', 0, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL ZGEMM3M( 'C', '/', 0, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL ZGEMM3M( 'T', '/', 0, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL ZGEMM3M( 'N', 'N', -1, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL ZGEMM3M( 'N', 'C', -1, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL ZGEMM3M( 'N', 'T', -1, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL ZGEMM3M( 'C', 'N', -1, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 
1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL ZGEMM3M( 'C', 'C', -1, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL ZGEMM3M( 'C', 'T', -1, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL ZGEMM3M( 'T', 'N', -1, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL ZGEMM3M( 'T', 'C', -1, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL ZGEMM3M( 'T', 'T', -1, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL ZGEMM3M( 'N', 'N', 0, -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL ZGEMM3M( 'N', 'C', 0, -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL ZGEMM3M( 'N', 'T', 0, -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL ZGEMM3M( 'C', 'N', 0, -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL ZGEMM3M( 'C', 'C', 0, -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL ZGEMM3M( 'C', 'T', 0, -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL ZGEMM3M( 'T', 'N', 0, -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL ZGEMM3M( 'T', 'C', 0, -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL ZGEMM3M( 'T', 'T', 0, -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL ZGEMM3M( 'N', 'N', 0, 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL ZGEMM3M( 'N', 'C', 0, 0, 
-1, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL ZGEMM3M( 'N', 'T', 0, 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL ZGEMM3M( 'C', 'N', 0, 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL ZGEMM3M( 'C', 'C', 0, 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL ZGEMM3M( 'C', 'T', 0, 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL ZGEMM3M( 'T', 'N', 0, 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL ZGEMM3M( 'T', 'C', 0, 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL ZGEMM3M( 'T', 'T', 0, 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 8 + CALL ZGEMM3M( 'N', 'N', 2, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 2 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 8 + CALL ZGEMM3M( 'N', 'C', 2, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 2 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 8 + CALL ZGEMM3M( 'N', 'T', 2, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 2 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 8 + CALL ZGEMM3M( 'C', 'N', 0, 0, 2, ALPHA, A, 1, B, 2, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 8 + CALL ZGEMM3M( 'C', 'C', 0, 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 8 + CALL ZGEMM3M( 'C', 'T', 0, 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 8 + CALL ZGEMM3M( 'T', 'N', 0, 0, 2, ALPHA, A, 1, B, 2, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 8 + CALL ZGEMM3M( 'T', 'C', 0, 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 8 + CALL 
ZGEMM3M( 'T', 'T', 0, 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 10 + CALL ZGEMM3M( 'N', 'N', 0, 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 10 + CALL ZGEMM3M( 'C', 'N', 0, 0, 2, ALPHA, A, 2, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 10 + CALL ZGEMM3M( 'T', 'N', 0, 0, 2, ALPHA, A, 2, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 10 + CALL ZGEMM3M( 'N', 'C', 0, 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 10 + CALL ZGEMM3M( 'C', 'C', 0, 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 10 + CALL ZGEMM3M( 'T', 'C', 0, 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 10 + CALL ZGEMM3M( 'N', 'T', 0, 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 10 + CALL ZGEMM3M( 'C', 'T', 0, 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 10 + CALL ZGEMM3M( 'T', 'T', 0, 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 13 + CALL ZGEMM3M( 'N', 'N', 2, 0, 0, ALPHA, A, 2, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 13 + CALL ZGEMM3M( 'N', 'C', 2, 0, 0, ALPHA, A, 2, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 13 + CALL ZGEMM3M( 'N', 'T', 2, 0, 0, ALPHA, A, 2, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 13 + CALL ZGEMM3M( 'C', 'N', 2, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 13 + CALL ZGEMM3M( 'C', 'C', 2, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 13 + CALL ZGEMM3M( 'C', 'T', 2, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, 
NOUT, LERR, OK ) + INFOT = 13 + CALL ZGEMM3M( 'T', 'N', 2, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 13 + CALL ZGEMM3M( 'T', 'C', 2, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 13 + CALL ZGEMM3M( 'T', 'T', 2, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + GO TO 100 + 20 INFOT = 1 + CALL ZHEMM3M( '/', 'U', 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL ZHEMM3M( 'L', '/', 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL ZHEMM3M( 'L', 'U', -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL ZHEMM3M( 'R', 'U', -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL ZHEMM3M( 'L', 'L', -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL ZHEMM3M( 'R', 'L', -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL ZHEMM3M( 'L', 'U', 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL ZHEMM3M( 'R', 'U', 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL ZHEMM3M( 'L', 'L', 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL ZHEMM3M( 'R', 'L', 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL ZHEMM3M( 'L', 'U', 2, 0, ALPHA, A, 1, B, 2, BETA, C, 2 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL ZHEMM3M( 'R', 'U', 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL ZHEMM3M( 'L', 'L', 2, 0, ALPHA, A, 1, B, 2, BETA, C, 2 ) + CALL CHKXER( SRNAMT, 
INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL ZHEMM3M( 'R', 'L', 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL ZHEMM3M( 'L', 'U', 2, 0, ALPHA, A, 2, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL ZHEMM3M( 'R', 'U', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL ZHEMM3M( 'L', 'L', 2, 0, ALPHA, A, 2, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL ZHEMM3M( 'R', 'L', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 12 + CALL ZHEMM3M( 'L', 'U', 2, 0, ALPHA, A, 2, B, 2, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 12 + CALL ZHEMM3M( 'R', 'U', 2, 0, ALPHA, A, 1, B, 2, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 12 + CALL ZHEMM3M( 'L', 'L', 2, 0, ALPHA, A, 2, B, 2, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 12 + CALL ZHEMM3M( 'R', 'L', 2, 0, ALPHA, A, 1, B, 2, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + GO TO 100 + 30 INFOT = 1 + CALL ZSYMM3M( '/', 'U', 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL ZSYMM3M( 'L', '/', 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL ZSYMM3M( 'L', 'U', -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL ZSYMM3M( 'R', 'U', -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL ZSYMM3M( 'L', 'L', -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL ZSYMM3M( 'R', 'L', -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL ZSYMM3M( 'L', 'U', 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, 
NOUT, LERR, OK ) + INFOT = 4 + CALL ZSYMM3M( 'R', 'U', 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL ZSYMM3M( 'L', 'L', 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL ZSYMM3M( 'R', 'L', 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL ZSYMM3M( 'L', 'U', 2, 0, ALPHA, A, 1, B, 2, BETA, C, 2 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL ZSYMM3M( 'R', 'U', 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL ZSYMM3M( 'L', 'L', 2, 0, ALPHA, A, 1, B, 2, BETA, C, 2 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL ZSYMM3M( 'R', 'L', 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL ZSYMM3M( 'L', 'U', 2, 0, ALPHA, A, 2, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL ZSYMM3M( 'R', 'U', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL ZSYMM3M( 'L', 'L', 2, 0, ALPHA, A, 2, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL ZSYMM3M( 'R', 'L', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 12 + CALL ZSYMM3M( 'L', 'U', 2, 0, ALPHA, A, 2, B, 2, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 12 + CALL ZSYMM3M( 'R', 'U', 2, 0, ALPHA, A, 1, B, 2, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 12 + CALL ZSYMM3M( 'L', 'L', 2, 0, ALPHA, A, 2, B, 2, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 12 + CALL ZSYMM3M( 'R', 'L', 2, 0, ALPHA, A, 1, B, 2, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + GO TO 100 + 40 INFOT = 1 + CALL ZTRMM( '/', 'U', 'N', 'N', 0, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, 
OK ) + INFOT = 2 + CALL ZTRMM( 'L', '/', 'N', 'N', 0, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL ZTRMM( 'L', 'U', '/', 'N', 0, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL ZTRMM( 'L', 'U', 'N', '/', 0, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL ZTRMM( 'L', 'U', 'N', 'N', -1, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL ZTRMM( 'L', 'U', 'C', 'N', -1, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL ZTRMM( 'L', 'U', 'T', 'N', -1, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL ZTRMM( 'R', 'U', 'N', 'N', -1, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL ZTRMM( 'R', 'U', 'C', 'N', -1, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL ZTRMM( 'R', 'U', 'T', 'N', -1, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL ZTRMM( 'L', 'L', 'N', 'N', -1, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL ZTRMM( 'L', 'L', 'C', 'N', -1, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL ZTRMM( 'L', 'L', 'T', 'N', -1, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL ZTRMM( 'R', 'L', 'N', 'N', -1, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL ZTRMM( 'R', 'L', 'C', 'N', -1, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL ZTRMM( 'R', 'L', 'T', 'N', -1, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL ZTRMM( 'L', 'U', 'N', 'N', 0, -1, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL ZTRMM( 'L', 'U', 'C', 'N', 0, -1, ALPHA, A, 1, B, 1 ) + 
CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL ZTRMM( 'L', 'U', 'T', 'N', 0, -1, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL ZTRMM( 'R', 'U', 'N', 'N', 0, -1, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL ZTRMM( 'R', 'U', 'C', 'N', 0, -1, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL ZTRMM( 'R', 'U', 'T', 'N', 0, -1, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL ZTRMM( 'L', 'L', 'N', 'N', 0, -1, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL ZTRMM( 'L', 'L', 'C', 'N', 0, -1, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL ZTRMM( 'L', 'L', 'T', 'N', 0, -1, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL ZTRMM( 'R', 'L', 'N', 'N', 0, -1, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL ZTRMM( 'R', 'L', 'C', 'N', 0, -1, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL ZTRMM( 'R', 'L', 'T', 'N', 0, -1, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL ZTRMM( 'L', 'U', 'N', 'N', 2, 0, ALPHA, A, 1, B, 2 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL ZTRMM( 'L', 'U', 'C', 'N', 2, 0, ALPHA, A, 1, B, 2 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL ZTRMM( 'L', 'U', 'T', 'N', 2, 0, ALPHA, A, 1, B, 2 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL ZTRMM( 'R', 'U', 'N', 'N', 0, 2, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL ZTRMM( 'R', 'U', 'C', 'N', 0, 2, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL ZTRMM( 'R', 'U', 'T', 'N', 0, 2, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL ZTRMM( 'L', 'L', 'N', 
'N', 2, 0, ALPHA, A, 1, B, 2 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL ZTRMM( 'L', 'L', 'C', 'N', 2, 0, ALPHA, A, 1, B, 2 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL ZTRMM( 'L', 'L', 'T', 'N', 2, 0, ALPHA, A, 1, B, 2 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL ZTRMM( 'R', 'L', 'N', 'N', 0, 2, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL ZTRMM( 'R', 'L', 'C', 'N', 0, 2, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL ZTRMM( 'R', 'L', 'T', 'N', 0, 2, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 11 + CALL ZTRMM( 'L', 'U', 'N', 'N', 2, 0, ALPHA, A, 2, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 11 + CALL ZTRMM( 'L', 'U', 'C', 'N', 2, 0, ALPHA, A, 2, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 11 + CALL ZTRMM( 'L', 'U', 'T', 'N', 2, 0, ALPHA, A, 2, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 11 + CALL ZTRMM( 'R', 'U', 'N', 'N', 2, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 11 + CALL ZTRMM( 'R', 'U', 'C', 'N', 2, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 11 + CALL ZTRMM( 'R', 'U', 'T', 'N', 2, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 11 + CALL ZTRMM( 'L', 'L', 'N', 'N', 2, 0, ALPHA, A, 2, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 11 + CALL ZTRMM( 'L', 'L', 'C', 'N', 2, 0, ALPHA, A, 2, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 11 + CALL ZTRMM( 'L', 'L', 'T', 'N', 2, 0, ALPHA, A, 2, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 11 + CALL ZTRMM( 'R', 'L', 'N', 'N', 2, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 11 + CALL ZTRMM( 'R', 'L', 'C', 'N', 2, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + 
INFOT = 11 + CALL ZTRMM( 'R', 'L', 'T', 'N', 2, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + GO TO 100 + 50 INFOT = 1 + CALL ZTRSM( '/', 'U', 'N', 'N', 0, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL ZTRSM( 'L', '/', 'N', 'N', 0, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL ZTRSM( 'L', 'U', '/', 'N', 0, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL ZTRSM( 'L', 'U', 'N', '/', 0, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL ZTRSM( 'L', 'U', 'N', 'N', -1, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL ZTRSM( 'L', 'U', 'C', 'N', -1, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL ZTRSM( 'L', 'U', 'T', 'N', -1, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL ZTRSM( 'R', 'U', 'N', 'N', -1, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL ZTRSM( 'R', 'U', 'C', 'N', -1, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL ZTRSM( 'R', 'U', 'T', 'N', -1, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL ZTRSM( 'L', 'L', 'N', 'N', -1, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL ZTRSM( 'L', 'L', 'C', 'N', -1, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL ZTRSM( 'L', 'L', 'T', 'N', -1, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL ZTRSM( 'R', 'L', 'N', 'N', -1, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL ZTRSM( 'R', 'L', 'C', 'N', -1, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL ZTRSM( 'R', 'L', 'T', 'N', -1, 0, ALPHA, A, 1, B, 1 
) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL ZTRSM( 'L', 'U', 'N', 'N', 0, -1, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL ZTRSM( 'L', 'U', 'C', 'N', 0, -1, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL ZTRSM( 'L', 'U', 'T', 'N', 0, -1, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL ZTRSM( 'R', 'U', 'N', 'N', 0, -1, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL ZTRSM( 'R', 'U', 'C', 'N', 0, -1, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL ZTRSM( 'R', 'U', 'T', 'N', 0, -1, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL ZTRSM( 'L', 'L', 'N', 'N', 0, -1, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL ZTRSM( 'L', 'L', 'C', 'N', 0, -1, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL ZTRSM( 'L', 'L', 'T', 'N', 0, -1, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL ZTRSM( 'R', 'L', 'N', 'N', 0, -1, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL ZTRSM( 'R', 'L', 'C', 'N', 0, -1, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL ZTRSM( 'R', 'L', 'T', 'N', 0, -1, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL ZTRSM( 'L', 'U', 'N', 'N', 2, 0, ALPHA, A, 1, B, 2 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL ZTRSM( 'L', 'U', 'C', 'N', 2, 0, ALPHA, A, 1, B, 2 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL ZTRSM( 'L', 'U', 'T', 'N', 2, 0, ALPHA, A, 1, B, 2 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL ZTRSM( 'R', 'U', 'N', 'N', 0, 2, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL ZTRSM( 'R', 
'U', 'C', 'N', 0, 2, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL ZTRSM( 'R', 'U', 'T', 'N', 0, 2, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL ZTRSM( 'L', 'L', 'N', 'N', 2, 0, ALPHA, A, 1, B, 2 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL ZTRSM( 'L', 'L', 'C', 'N', 2, 0, ALPHA, A, 1, B, 2 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL ZTRSM( 'L', 'L', 'T', 'N', 2, 0, ALPHA, A, 1, B, 2 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL ZTRSM( 'R', 'L', 'N', 'N', 0, 2, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL ZTRSM( 'R', 'L', 'C', 'N', 0, 2, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL ZTRSM( 'R', 'L', 'T', 'N', 0, 2, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 11 + CALL ZTRSM( 'L', 'U', 'N', 'N', 2, 0, ALPHA, A, 2, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 11 + CALL ZTRSM( 'L', 'U', 'C', 'N', 2, 0, ALPHA, A, 2, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 11 + CALL ZTRSM( 'L', 'U', 'T', 'N', 2, 0, ALPHA, A, 2, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 11 + CALL ZTRSM( 'R', 'U', 'N', 'N', 2, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 11 + CALL ZTRSM( 'R', 'U', 'C', 'N', 2, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 11 + CALL ZTRSM( 'R', 'U', 'T', 'N', 2, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 11 + CALL ZTRSM( 'L', 'L', 'N', 'N', 2, 0, ALPHA, A, 2, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 11 + CALL ZTRSM( 'L', 'L', 'C', 'N', 2, 0, ALPHA, A, 2, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 11 + CALL ZTRSM( 'L', 'L', 'T', 'N', 2, 0, ALPHA, A, 2, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK 
) + INFOT = 11 + CALL ZTRSM( 'R', 'L', 'N', 'N', 2, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 11 + CALL ZTRSM( 'R', 'L', 'C', 'N', 2, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 11 + CALL ZTRSM( 'R', 'L', 'T', 'N', 2, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + GO TO 100 + 60 INFOT = 1 + CALL ZHERK( '/', 'N', 0, 0, RALPHA, A, 1, RBETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL ZHERK( 'U', 'T', 0, 0, RALPHA, A, 1, RBETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL ZHERK( 'U', 'N', -1, 0, RALPHA, A, 1, RBETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL ZHERK( 'U', 'C', -1, 0, RALPHA, A, 1, RBETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL ZHERK( 'L', 'N', -1, 0, RALPHA, A, 1, RBETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL ZHERK( 'L', 'C', -1, 0, RALPHA, A, 1, RBETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL ZHERK( 'U', 'N', 0, -1, RALPHA, A, 1, RBETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL ZHERK( 'U', 'C', 0, -1, RALPHA, A, 1, RBETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL ZHERK( 'L', 'N', 0, -1, RALPHA, A, 1, RBETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL ZHERK( 'L', 'C', 0, -1, RALPHA, A, 1, RBETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL ZHERK( 'U', 'N', 2, 0, RALPHA, A, 1, RBETA, C, 2 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL ZHERK( 'U', 'C', 0, 2, RALPHA, A, 1, RBETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL ZHERK( 'L', 'N', 2, 0, RALPHA, A, 1, RBETA, C, 2 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL ZHERK( 'L', 'C', 0, 2, RALPHA, A, 1, RBETA, C, 1 ) + CALL CHKXER( SRNAMT, 
INFOT, NOUT, LERR, OK ) + INFOT = 10 + CALL ZHERK( 'U', 'N', 2, 0, RALPHA, A, 2, RBETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 10 + CALL ZHERK( 'U', 'C', 2, 0, RALPHA, A, 1, RBETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 10 + CALL ZHERK( 'L', 'N', 2, 0, RALPHA, A, 2, RBETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 10 + CALL ZHERK( 'L', 'C', 2, 0, RALPHA, A, 1, RBETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + GO TO 100 + 70 INFOT = 1 + CALL ZSYRK( '/', 'N', 0, 0, ALPHA, A, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL ZSYRK( 'U', 'C', 0, 0, ALPHA, A, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL ZSYRK( 'U', 'N', -1, 0, ALPHA, A, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL ZSYRK( 'U', 'T', -1, 0, ALPHA, A, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL ZSYRK( 'L', 'N', -1, 0, ALPHA, A, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL ZSYRK( 'L', 'T', -1, 0, ALPHA, A, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL ZSYRK( 'U', 'N', 0, -1, ALPHA, A, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL ZSYRK( 'U', 'T', 0, -1, ALPHA, A, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL ZSYRK( 'L', 'N', 0, -1, ALPHA, A, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL ZSYRK( 'L', 'T', 0, -1, ALPHA, A, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL ZSYRK( 'U', 'N', 2, 0, ALPHA, A, 1, BETA, C, 2 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL ZSYRK( 'U', 'T', 0, 2, ALPHA, A, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL ZSYRK( 'L', 'N', 2, 0, ALPHA, A, 1, BETA, C, 2 ) + CALL CHKXER( SRNAMT, INFOT, 
NOUT, LERR, OK ) + INFOT = 7 + CALL ZSYRK( 'L', 'T', 0, 2, ALPHA, A, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 10 + CALL ZSYRK( 'U', 'N', 2, 0, ALPHA, A, 2, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 10 + CALL ZSYRK( 'U', 'T', 2, 0, ALPHA, A, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 10 + CALL ZSYRK( 'L', 'N', 2, 0, ALPHA, A, 2, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 10 + CALL ZSYRK( 'L', 'T', 2, 0, ALPHA, A, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + GO TO 100 + 80 INFOT = 1 + CALL ZHER2K( '/', 'N', 0, 0, ALPHA, A, 1, B, 1, RBETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL ZHER2K( 'U', 'T', 0, 0, ALPHA, A, 1, B, 1, RBETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL ZHER2K( 'U', 'N', -1, 0, ALPHA, A, 1, B, 1, RBETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL ZHER2K( 'U', 'C', -1, 0, ALPHA, A, 1, B, 1, RBETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL ZHER2K( 'L', 'N', -1, 0, ALPHA, A, 1, B, 1, RBETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL ZHER2K( 'L', 'C', -1, 0, ALPHA, A, 1, B, 1, RBETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL ZHER2K( 'U', 'N', 0, -1, ALPHA, A, 1, B, 1, RBETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL ZHER2K( 'U', 'C', 0, -1, ALPHA, A, 1, B, 1, RBETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL ZHER2K( 'L', 'N', 0, -1, ALPHA, A, 1, B, 1, RBETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL ZHER2K( 'L', 'C', 0, -1, ALPHA, A, 1, B, 1, RBETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL ZHER2K( 'U', 'N', 2, 0, ALPHA, A, 1, B, 1, RBETA, C, 2 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL ZHER2K( 
'U', 'C', 0, 2, ALPHA, A, 1, B, 1, RBETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL ZHER2K( 'L', 'N', 2, 0, ALPHA, A, 1, B, 1, RBETA, C, 2 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL ZHER2K( 'L', 'C', 0, 2, ALPHA, A, 1, B, 1, RBETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL ZHER2K( 'U', 'N', 2, 0, ALPHA, A, 2, B, 1, RBETA, C, 2 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL ZHER2K( 'U', 'C', 0, 2, ALPHA, A, 2, B, 1, RBETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL ZHER2K( 'L', 'N', 2, 0, ALPHA, A, 2, B, 1, RBETA, C, 2 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL ZHER2K( 'L', 'C', 0, 2, ALPHA, A, 2, B, 1, RBETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 12 + CALL ZHER2K( 'U', 'N', 2, 0, ALPHA, A, 2, B, 2, RBETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 12 + CALL ZHER2K( 'U', 'C', 2, 0, ALPHA, A, 1, B, 1, RBETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 12 + CALL ZHER2K( 'L', 'N', 2, 0, ALPHA, A, 2, B, 2, RBETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 12 + CALL ZHER2K( 'L', 'C', 2, 0, ALPHA, A, 1, B, 1, RBETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + GO TO 100 + 90 INFOT = 1 + CALL ZSYR2K( '/', 'N', 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL ZSYR2K( 'U', 'C', 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL ZSYR2K( 'U', 'N', -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL ZSYR2K( 'U', 'T', -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL ZSYR2K( 'L', 'N', -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL ZSYR2K( 'L', 'T', -1, 
0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL ZSYR2K( 'U', 'N', 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL ZSYR2K( 'U', 'T', 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL ZSYR2K( 'L', 'N', 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL ZSYR2K( 'L', 'T', 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL ZSYR2K( 'U', 'N', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 2 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL ZSYR2K( 'U', 'T', 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL ZSYR2K( 'L', 'N', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 2 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL ZSYR2K( 'L', 'T', 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL ZSYR2K( 'U', 'N', 2, 0, ALPHA, A, 2, B, 1, BETA, C, 2 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL ZSYR2K( 'U', 'T', 0, 2, ALPHA, A, 2, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL ZSYR2K( 'L', 'N', 2, 0, ALPHA, A, 2, B, 1, BETA, C, 2 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL ZSYR2K( 'L', 'T', 0, 2, ALPHA, A, 2, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 12 + CALL ZSYR2K( 'U', 'N', 2, 0, ALPHA, A, 2, B, 2, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 12 + CALL ZSYR2K( 'U', 'T', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 12 + CALL ZSYR2K( 'L', 'N', 2, 0, ALPHA, A, 2, B, 2, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 12 + CALL ZSYR2K( 'L', 'T', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + 
CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) +* + 100 IF( OK )THEN + WRITE( NOUT, FMT = 9999 )SRNAMT + ELSE + WRITE( NOUT, FMT = 9998 )SRNAMT + END IF + RETURN +* + 9999 FORMAT( ' ', A6, ' PASSED THE TESTS OF ERROR-EXITS' ) + 9998 FORMAT( ' ******* ', A6, ' FAILED THE TESTS OF ERROR-EXITS *****', + $ '**' ) +* +* End of ZCHKE. +* + END + SUBROUTINE ZMAKE( TYPE, UPLO, DIAG, M, N, A, NMAX, AA, LDA, RESET, + $ TRANSL ) +* +* Generates values for an M by N matrix A. +* Stores the values in the array AA in the data structure required +* by the routine, with unwanted elements set to rogue value. +* +* TYPE is 'GE', 'HE', 'SY' or 'TR'. +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Parameters .. + COMPLEX*16 ZERO, ONE + PARAMETER ( ZERO = ( 0.0D0, 0.0D0 ), + $ ONE = ( 1.0D0, 0.0D0 ) ) + COMPLEX*16 ROGUE + PARAMETER ( ROGUE = ( -1.0D10, 1.0D10 ) ) + DOUBLE PRECISION RZERO + PARAMETER ( RZERO = 0.0D0 ) + DOUBLE PRECISION RROGUE + PARAMETER ( RROGUE = -1.0D10 ) +* .. Scalar Arguments .. + COMPLEX*16 TRANSL + INTEGER LDA, M, N, NMAX + LOGICAL RESET + CHARACTER*1 DIAG, UPLO + CHARACTER*2 TYPE +* .. Array Arguments .. + COMPLEX*16 A( NMAX, * ), AA( * ) +* .. Local Scalars .. + INTEGER I, IBEG, IEND, J, JJ + LOGICAL GEN, HER, LOWER, SYM, TRI, UNIT, UPPER +* .. External Functions .. + COMPLEX*16 ZBEG + EXTERNAL ZBEG +* .. Intrinsic Functions .. + INTRINSIC DCMPLX, DCONJG, DBLE +* .. Executable Statements .. + GEN = TYPE.EQ.'GE' + HER = TYPE.EQ.'HE' + SYM = TYPE.EQ.'SY' + TRI = TYPE.EQ.'TR' + UPPER = ( HER.OR.SYM.OR.TRI ).AND.UPLO.EQ.'U' + LOWER = ( HER.OR.SYM.OR.TRI ).AND.UPLO.EQ.'L' + UNIT = TRI.AND.DIAG.EQ.'U' +* +* Generate data in array A. 
+* + DO 20 J = 1, N + DO 10 I = 1, M + IF( GEN.OR.( UPPER.AND.I.LE.J ).OR.( LOWER.AND.I.GE.J ) ) + $ THEN + A( I, J ) = ZBEG( RESET ) + TRANSL + IF( I.NE.J )THEN +* Set some elements to zero + IF( N.GT.3.AND.J.EQ.N/2 ) + $ A( I, J ) = ZERO + IF( HER )THEN + A( J, I ) = DCONJG( A( I, J ) ) + ELSE IF( SYM )THEN + A( J, I ) = A( I, J ) + ELSE IF( TRI )THEN + A( J, I ) = ZERO + END IF + END IF + END IF + 10 CONTINUE + IF( HER ) + $ A( J, J ) = DCMPLX( DBLE( A( J, J ) ), RZERO ) + IF( TRI ) + $ A( J, J ) = A( J, J ) + ONE + IF( UNIT ) + $ A( J, J ) = ONE + 20 CONTINUE +* +* Store elements in array AS in data structure required by routine. +* + IF( TYPE.EQ.'GE' )THEN + DO 50 J = 1, N + DO 30 I = 1, M + AA( I + ( J - 1 )*LDA ) = A( I, J ) + 30 CONTINUE + DO 40 I = M + 1, LDA + AA( I + ( J - 1 )*LDA ) = ROGUE + 40 CONTINUE + 50 CONTINUE + ELSE IF( TYPE.EQ.'HE'.OR.TYPE.EQ.'SY'.OR.TYPE.EQ.'TR' )THEN + DO 90 J = 1, N + IF( UPPER )THEN + IBEG = 1 + IF( UNIT )THEN + IEND = J - 1 + ELSE + IEND = J + END IF + ELSE + IF( UNIT )THEN + IBEG = J + 1 + ELSE + IBEG = J + END IF + IEND = N + END IF + DO 60 I = 1, IBEG - 1 + AA( I + ( J - 1 )*LDA ) = ROGUE + 60 CONTINUE + DO 70 I = IBEG, IEND + AA( I + ( J - 1 )*LDA ) = A( I, J ) + 70 CONTINUE + DO 80 I = IEND + 1, LDA + AA( I + ( J - 1 )*LDA ) = ROGUE + 80 CONTINUE + IF( HER )THEN + JJ = J + ( J - 1 )*LDA + AA( JJ ) = DCMPLX( DBLE( AA( JJ ) ), RROGUE ) + END IF + 90 CONTINUE + END IF + RETURN +* +* End of ZMAKE. +* + END + SUBROUTINE ZMMCH( TRANSA, TRANSB, M, N, KK, ALPHA, A, LDA, B, LDB, + $ BETA, C, LDC, CT, G, CC, LDCC, EPS, ERR, FATAL, + $ NOUT, MV ) +* +* Checks the results of the computational tests. +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Parameters .. 
+ COMPLEX*16 ZERO + PARAMETER ( ZERO = ( 0.0D0, 0.0D0 ) ) + DOUBLE PRECISION RZERO, RONE + PARAMETER ( RZERO = 0.0D0, RONE = 1.0D0 ) +* .. Scalar Arguments .. + COMPLEX*16 ALPHA, BETA + DOUBLE PRECISION EPS, ERR + INTEGER KK, LDA, LDB, LDC, LDCC, M, N, NOUT + LOGICAL FATAL, MV + CHARACTER*1 TRANSA, TRANSB +* .. Array Arguments .. + COMPLEX*16 A( LDA, * ), B( LDB, * ), C( LDC, * ), + $ CC( LDCC, * ), CT( * ) + DOUBLE PRECISION G( * ) +* .. Local Scalars .. + COMPLEX*16 CL + DOUBLE PRECISION ERRI + INTEGER I, J, K + LOGICAL CTRANA, CTRANB, TRANA, TRANB +* .. Intrinsic Functions .. + INTRINSIC ABS, DIMAG, DCONJG, MAX, DBLE, SQRT +* .. Statement Functions .. + DOUBLE PRECISION ABS1 +* .. Statement Function definitions .. + ABS1( CL ) = ABS( DBLE( CL ) ) + ABS( DIMAG( CL ) ) +* .. Executable Statements .. + TRANA = TRANSA.EQ.'T'.OR.TRANSA.EQ.'C' + TRANB = TRANSB.EQ.'T'.OR.TRANSB.EQ.'C' + CTRANA = TRANSA.EQ.'C' + CTRANB = TRANSB.EQ.'C' +* +* Compute expected result, one column at a time, in CT using data +* in A, B and C. +* Compute gauges in G. 
+* + DO 220 J = 1, N +* + DO 10 I = 1, M + CT( I ) = ZERO + G( I ) = RZERO + 10 CONTINUE + IF( .NOT.TRANA.AND..NOT.TRANB )THEN + DO 30 K = 1, KK + DO 20 I = 1, M + CT( I ) = CT( I ) + A( I, K )*B( K, J ) + G( I ) = G( I ) + ABS1( A( I, K ) )*ABS1( B( K, J ) ) + 20 CONTINUE + 30 CONTINUE + ELSE IF( TRANA.AND..NOT.TRANB )THEN + IF( CTRANA )THEN + DO 50 K = 1, KK + DO 40 I = 1, M + CT( I ) = CT( I ) + DCONJG( A( K, I ) )*B( K, J ) + G( I ) = G( I ) + ABS1( A( K, I ) )* + $ ABS1( B( K, J ) ) + 40 CONTINUE + 50 CONTINUE + ELSE + DO 70 K = 1, KK + DO 60 I = 1, M + CT( I ) = CT( I ) + A( K, I )*B( K, J ) + G( I ) = G( I ) + ABS1( A( K, I ) )* + $ ABS1( B( K, J ) ) + 60 CONTINUE + 70 CONTINUE + END IF + ELSE IF( .NOT.TRANA.AND.TRANB )THEN + IF( CTRANB )THEN + DO 90 K = 1, KK + DO 80 I = 1, M + CT( I ) = CT( I ) + A( I, K )*DCONJG( B( J, K ) ) + G( I ) = G( I ) + ABS1( A( I, K ) )* + $ ABS1( B( J, K ) ) + 80 CONTINUE + 90 CONTINUE + ELSE + DO 110 K = 1, KK + DO 100 I = 1, M + CT( I ) = CT( I ) + A( I, K )*B( J, K ) + G( I ) = G( I ) + ABS1( A( I, K ) )* + $ ABS1( B( J, K ) ) + 100 CONTINUE + 110 CONTINUE + END IF + ELSE IF( TRANA.AND.TRANB )THEN + IF( CTRANA )THEN + IF( CTRANB )THEN + DO 130 K = 1, KK + DO 120 I = 1, M + CT( I ) = CT( I ) + DCONJG( A( K, I ) )* + $ DCONJG( B( J, K ) ) + G( I ) = G( I ) + ABS1( A( K, I ) )* + $ ABS1( B( J, K ) ) + 120 CONTINUE + 130 CONTINUE + ELSE + DO 150 K = 1, KK + DO 140 I = 1, M + CT( I ) = CT( I ) + DCONJG( A( K, I ) )* + $ B( J, K ) + G( I ) = G( I ) + ABS1( A( K, I ) )* + $ ABS1( B( J, K ) ) + 140 CONTINUE + 150 CONTINUE + END IF + ELSE + IF( CTRANB )THEN + DO 170 K = 1, KK + DO 160 I = 1, M + CT( I ) = CT( I ) + A( K, I )* + $ DCONJG( B( J, K ) ) + G( I ) = G( I ) + ABS1( A( K, I ) )* + $ ABS1( B( J, K ) ) + 160 CONTINUE + 170 CONTINUE + ELSE + DO 190 K = 1, KK + DO 180 I = 1, M + CT( I ) = CT( I ) + A( K, I )*B( J, K ) + G( I ) = G( I ) + ABS1( A( K, I ) )* + $ ABS1( B( J, K ) ) + 180 CONTINUE + 190 CONTINUE + END IF + END IF + 
END IF + DO 200 I = 1, M + CT( I ) = ALPHA*CT( I ) + BETA*C( I, J ) + G( I ) = ABS1( ALPHA )*G( I ) + + $ ABS1( BETA )*ABS1( C( I, J ) ) + 200 CONTINUE +* +* Compute the error ratio for this result. +* + ERR = ZERO + DO 210 I = 1, M + ERRI = ABS1( CT( I ) - CC( I, J ) )/EPS + IF( G( I ).NE.RZERO ) + $ ERRI = ERRI/G( I ) + ERR = MAX( ERR, ERRI ) + IF( ERR*SQRT( EPS ).GE.RONE ) + $ GO TO 230 + 210 CONTINUE +* + 220 CONTINUE +* +* If the loop completes, all results are at least half accurate. + GO TO 250 +* +* Report fatal error. +* + 230 FATAL = .TRUE. + WRITE( NOUT, FMT = 9999 ) + DO 240 I = 1, M + IF( MV )THEN + WRITE( NOUT, FMT = 9998 )I, CT( I ), CC( I, J ) + ELSE + WRITE( NOUT, FMT = 9998 )I, CC( I, J ), CT( I ) + END IF + 240 CONTINUE + IF( N.GT.1 ) + $ WRITE( NOUT, FMT = 9997 )J +* + 250 CONTINUE + RETURN +* + 9999 FORMAT( ' ******* FATAL ERROR - COMPUTED RESULT IS LESS THAN HAL', + $ 'F ACCURATE *******', /' EXPECTED RE', + $ 'SULT COMPUTED RESULT' ) + 9998 FORMAT( 1X, I7, 2( ' (', G15.6, ',', G15.6, ')' ) ) + 9997 FORMAT( ' THESE ARE THE RESULTS FOR COLUMN ', I3 ) +* +* End of ZMMCH. +* + END + LOGICAL FUNCTION LZE( RI, RJ, LR ) +* +* Tests if two arrays are identical. +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Scalar Arguments .. + INTEGER LR +* .. Array Arguments .. + COMPLEX*16 RI( * ), RJ( * ) +* .. Local Scalars .. + INTEGER I +* .. Executable Statements .. + DO 10 I = 1, LR + IF( RI( I ).NE.RJ( I ) ) + $ GO TO 20 + 10 CONTINUE + LZE = .TRUE. + GO TO 30 + 20 CONTINUE + LZE = .FALSE. + 30 RETURN +* +* End of LZE. +* + END + LOGICAL FUNCTION LZERES( TYPE, UPLO, M, N, AA, AS, LDA ) +* +* Tests if selected elements in two arrays are equal. +* +* TYPE is 'GE' or 'HE' or 'SY'. 
+* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Scalar Arguments .. + INTEGER LDA, M, N + CHARACTER*1 UPLO + CHARACTER*2 TYPE +* .. Array Arguments .. + COMPLEX*16 AA( LDA, * ), AS( LDA, * ) +* .. Local Scalars .. + INTEGER I, IBEG, IEND, J + LOGICAL UPPER +* .. Executable Statements .. + UPPER = UPLO.EQ.'U' + IF( TYPE.EQ.'GE' )THEN + DO 20 J = 1, N + DO 10 I = M + 1, LDA + IF( AA( I, J ).NE.AS( I, J ) ) + $ GO TO 70 + 10 CONTINUE + 20 CONTINUE + ELSE IF( TYPE.EQ.'HE'.OR.TYPE.EQ.'SY' )THEN + DO 50 J = 1, N + IF( UPPER )THEN + IBEG = 1 + IEND = J + ELSE + IBEG = J + IEND = N + END IF + DO 30 I = 1, IBEG - 1 + IF( AA( I, J ).NE.AS( I, J ) ) + $ GO TO 70 + 30 CONTINUE + DO 40 I = IEND + 1, LDA + IF( AA( I, J ).NE.AS( I, J ) ) + $ GO TO 70 + 40 CONTINUE + 50 CONTINUE + END IF +* + 60 CONTINUE + LZERES = .TRUE. + GO TO 80 + 70 CONTINUE + LZERES = .FALSE. + 80 RETURN +* +* End of LZERES. +* + END + COMPLEX*16 FUNCTION ZBEG( RESET ) +* +* Generates complex numbers as pairs of random numbers uniformly +* distributed between -0.5 and 0.5. +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Scalar Arguments .. + LOGICAL RESET +* .. Local Scalars .. + INTEGER I, IC, J, MI, MJ +* .. Save statement .. + SAVE I, IC, J, MI, MJ +* .. Intrinsic Functions .. + INTRINSIC DCMPLX +* .. Executable Statements .. + IF( RESET )THEN +* Initialize local variables. + MI = 891 + MJ = 457 + I = 7 + J = 7 + IC = 0 + RESET = .FALSE. + END IF +* +* The sequence of values of I or J is bounded between 1 and 999. 
+* If initial I or J = 1,2,3,6,7 or 9, the period will be 50. +* If initial I or J = 4 or 8, the period will be 25. +* If initial I or J = 5, the period will be 10. +* IC is used to break up the period by skipping 1 value of I or J +* in 6. +* + IC = IC + 1 + 10 I = I*MI + J = J*MJ + I = I - 1000*( I/1000 ) + J = J - 1000*( J/1000 ) + IF( IC.GE.5 )THEN + IC = 0 + GO TO 10 + END IF + ZBEG = DCMPLX( ( I - 500 )/1001.0D0, ( J - 500 )/1001.0D0 ) + RETURN +* +* End of ZBEG. +* + END + DOUBLE PRECISION FUNCTION DDIFF( X, Y ) +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Scalar Arguments .. + DOUBLE PRECISION X, Y +* .. Executable Statements .. + DDIFF = X - Y + RETURN +* +* End of DDIFF. +* + END + SUBROUTINE CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) +* +* Tests whether XERBLA has detected an error when it should. +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Scalar Arguments .. + INTEGER INFOT, NOUT + LOGICAL LERR, OK + CHARACTER*8 SRNAMT +* .. Executable Statements .. + IF( .NOT.LERR )THEN + WRITE( NOUT, FMT = 9999 )INFOT, SRNAMT + OK = .FALSE. + END IF + LERR = .FALSE. + RETURN +* + 9999 FORMAT( ' ***** ILLEGAL VALUE OF PARAMETER NUMBER ', I2, ' NOT D', + $ 'ETECTED BY ', A6, ' *****' ) +* +* End of CHKXER. +* + END + SUBROUTINE XERBLA( SRNAME, INFO ) +* +* This is a special version of XERBLA to be used only as part of +* the test program for testing error exits from the Level 3 BLAS +* routines. +* +* XERBLA is an error handler for the Level 3 BLAS routines. 
+* +* It is called by the Level 3 BLAS routines if an input parameter is +* invalid. +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Scalar Arguments .. + INTEGER INFO + CHARACTER*8 SRNAME +* .. Scalars in Common .. + INTEGER INFOT, NOUT + LOGICAL LERR, OK + CHARACTER*8 SRNAMT +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUT, OK, LERR + COMMON /SRNAMC/SRNAMT +* .. Executable Statements .. + LERR = .TRUE. + IF( INFO.NE.INFOT )THEN + IF( INFOT.NE.0 )THEN + WRITE( NOUT, FMT = 9999 )INFO, INFOT + ELSE + WRITE( NOUT, FMT = 9997 )INFO + END IF + OK = .FALSE. + END IF + IF( SRNAME.NE.SRNAMT )THEN + WRITE( NOUT, FMT = 9998 )SRNAME, SRNAMT + OK = .FALSE. + END IF + RETURN +* + 9999 FORMAT( ' ******* XERBLA WAS CALLED WITH INFO = ', I6, ' INSTEAD', + $ ' OF ', I2, ' *******' ) + 9998 FORMAT( ' ******* XERBLA WAS CALLED WITH SRNAME = ', A6, ' INSTE', + $ 'AD OF ', A6, ' *******' ) + 9997 FORMAT( ' ******* XERBLA WAS CALLED WITH INFO = ', I6, + $ ' *******' ) +* +* End of XERBLA +* + END + From 7aae4a62e78daa586774248aa31679311d7bd5cc Mon Sep 17 00:00:00 2001 From: wernsaar Date: Sat, 20 Sep 2014 14:27:10 +0200 Subject: [PATCH 110/119] enabled use of GEMM3M functions --- common_param.h | 109 +++++++++++++++++++++++++++++++++++------ driver/level3/Makefile | 4 +- exports/gensymbol | 4 +- interface/Makefile | 4 +- kernel/setparam-ref.c | 90 ++++++++++++++++++++++++++++++++++ param.h | 48 ++++++++++++++++-- 6 files changed, 235 insertions(+), 24 deletions(-) diff --git a/common_param.h b/common_param.h index 1c362e8cb..e9f35c033 100644 --- a/common_param.h +++ b/common_param.h @@ -435,6 +435,9 @@ BLASLONG (*icamin_k)(BLASLONG, float *, BLASLONG); int (*chemm_outcopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); int 
(*chemm_oltcopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); + int cgemm3m_p, cgemm3m_q, cgemm3m_r; + int cgemm3m_unroll_m, cgemm3m_unroll_n, cgemm3m_unroll_mn; + int (*cgemm3m_kernel)(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG); int (*cgemm3m_incopyb)(BLASLONG, BLASLONG, float *, BLASLONG, float *); @@ -595,6 +598,9 @@ BLASLONG (*izamin_k)(BLASLONG, double *, BLASLONG); int (*zhemm_outcopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); int (*zhemm_oltcopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); + int zgemm3m_p, zgemm3m_q, zgemm3m_r; + int zgemm3m_unroll_m, zgemm3m_unroll_n, zgemm3m_unroll_mn; + int (*zgemm3m_kernel)(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG); int (*zgemm3m_incopyb)(BLASLONG, BLASLONG, double *, BLASLONG, double *); @@ -757,6 +763,9 @@ BLASLONG (*ixamin_k)(BLASLONG, xdouble *, BLASLONG); int (*xhemm_outcopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); int (*xhemm_oltcopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); + int xgemm3m_p, xgemm3m_q, xgemm3m_r; + int xgemm3m_unroll_m, xgemm3m_unroll_n, xgemm3m_unroll_mn; + int (*xgemm3m_kernel)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG); int (*xgemm3m_incopyb)(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *); @@ -900,6 +909,27 @@ extern gotoblas_t *gotoblas; #define XGEMM_UNROLL_N gotoblas -> xgemm_unroll_n #define XGEMM_UNROLL_MN gotoblas -> xgemm_unroll_mn +#define CGEMM3M_P gotoblas -> cgemm3m_p +#define CGEMM3M_Q gotoblas -> cgemm3m_q +#define CGEMM3M_R gotoblas -> cgemm3m_r +#define CGEMM3M_UNROLL_M gotoblas -> cgemm3m_unroll_m +#define CGEMM3M_UNROLL_N gotoblas -> cgemm3m_unroll_n +#define CGEMM3M_UNROLL_MN gotoblas -> cgemm3m_unroll_mn + +#define ZGEMM3M_P gotoblas -> zgemm3m_p +#define ZGEMM3M_Q gotoblas -> zgemm3m_q +#define 
ZGEMM3M_R gotoblas -> zgemm3m_r +#define ZGEMM3M_UNROLL_M gotoblas -> zgemm3m_unroll_m +#define ZGEMM3M_UNROLL_N gotoblas -> zgemm3m_unroll_n +#define ZGEMM3M_UNROLL_MN gotoblas -> zgemm3m_unroll_mn + +#define XGEMM3M_P gotoblas -> xgemm3m_p +#define XGEMM3M_Q gotoblas -> xgemm3m_q +#define XGEMM3M_R gotoblas -> xgemm3m_r +#define XGEMM3M_UNROLL_M gotoblas -> xgemm3m_unroll_m +#define XGEMM3M_UNROLL_N gotoblas -> xgemm3m_unroll_n +#define XGEMM3M_UNROLL_MN gotoblas -> xgemm3m_unroll_mn + #else #define DTB_ENTRIES DTB_DEFAULT_ENTRIES @@ -972,6 +1002,55 @@ extern gotoblas_t *gotoblas; #define XGEMM_UNROLL_N XGEMM_DEFAULT_UNROLL_N #define XGEMM_UNROLL_MN MAX((XGEMM_UNROLL_M), (XGEMM_UNROLL_N)) +#ifdef CGEMM_DEFAULT_UNROLL_N + +#define CGEMM3M_P CGEMM3M_DEFAULT_P +#define CGEMM3M_Q CGEMM3M_DEFAULT_Q +#define CGEMM3M_R CGEMM3M_DEFAULT_R +#define CGEMM3M_UNROLL_M CGEMM3M_DEFAULT_UNROLL_M +#define CGEMM3M_UNROLL_N CGEMM3M_DEFAULT_UNROLL_N +#define CGEMM3M_UNROLL_MN MAX((CGEMM3M_UNROLL_M), (CGEMM3M_UNROLL_N)) + +#else + +#define CGEMM3M_P SGEMM_DEFAULT_P +#define CGEMM3M_Q SGEMM_DEFAULT_Q +#define CGEMM3M_R SGEMM_DEFAULT_R +#define CGEMM3M_UNROLL_M SGEMM_DEFAULT_UNROLL_M +#define CGEMM3M_UNROLL_N SGEMM_DEFAULT_UNROLL_N +#define CGEMM3M_UNROLL_MN MAX((CGEMM_UNROLL_M), (CGEMM_UNROLL_N)) + +#endif + + +#ifdef ZGEMM_DEFAULT_UNROLL_N + +#define ZGEMM3M_P ZGEMM3M_DEFAULT_P +#define ZGEMM3M_Q ZGEMM3M_DEFAULT_Q +#define ZGEMM3M_R ZGEMM3M_DEFAULT_R +#define ZGEMM3M_UNROLL_M ZGEMM3M_DEFAULT_UNROLL_M +#define ZGEMM3M_UNROLL_N ZGEMM3M_DEFAULT_UNROLL_N +#define ZGEMM3M_UNROLL_MN MAX((ZGEMM_UNROLL_M), (ZGEMM_UNROLL_N)) + +#else + +#define ZGEMM3M_P DGEMM_DEFAULT_P +#define ZGEMM3M_Q DGEMM_DEFAULT_Q +#define ZGEMM3M_R DGEMM_DEFAULT_R +#define ZGEMM3M_UNROLL_M DGEMM_DEFAULT_UNROLL_M +#define ZGEMM3M_UNROLL_N DGEMM_DEFAULT_UNROLL_N +#define ZGEMM3M_UNROLL_MN MAX((ZGEMM_UNROLL_M), (ZGEMM_UNROLL_N)) + +#endif + +#define XGEMM3M_P QGEMM_DEFAULT_P +#define XGEMM3M_Q QGEMM_DEFAULT_Q +#define 
XGEMM3M_R QGEMM_DEFAULT_R +#define XGEMM3M_UNROLL_M QGEMM_DEFAULT_UNROLL_M +#define XGEMM3M_UNROLL_N QGEMM_DEFAULT_UNROLL_N +#define XGEMM3M_UNROLL_MN MAX((QGEMM_UNROLL_M), (QGEMM_UNROLL_N)) + + #endif #endif @@ -1054,14 +1133,14 @@ extern gotoblas_t *gotoblas; #endif #ifdef XDOUBLE -#define GEMM3M_UNROLL_M QGEMM_UNROLL_M -#define GEMM3M_UNROLL_N QGEMM_UNROLL_N +#define GEMM3M_UNROLL_M XGEMM3M_UNROLL_M +#define GEMM3M_UNROLL_N XGEMM3M_UNROLL_N #elif defined(DOUBLE) -#define GEMM3M_UNROLL_M DGEMM_UNROLL_M -#define GEMM3M_UNROLL_N DGEMM_UNROLL_N +#define GEMM3M_UNROLL_M ZGEMM3M_UNROLL_M +#define GEMM3M_UNROLL_N ZGEMM3M_UNROLL_N #else -#define GEMM3M_UNROLL_M SGEMM_UNROLL_M -#define GEMM3M_UNROLL_N SGEMM_UNROLL_N +#define GEMM3M_UNROLL_M CGEMM3M_UNROLL_M +#define GEMM3M_UNROLL_N CGEMM3M_UNROLL_N #endif @@ -1123,31 +1202,31 @@ extern gotoblas_t *gotoblas; #ifndef GEMM3M_P #ifdef XDOUBLE -#define GEMM3M_P QGEMM_P +#define GEMM3M_P XGEMM3M_P #elif defined(DOUBLE) -#define GEMM3M_P DGEMM_P +#define GEMM3M_P ZGEMM3M_P #else -#define GEMM3M_P SGEMM_P +#define GEMM3M_P CGEMM3M_P #endif #endif #ifndef GEMM3M_Q #ifdef XDOUBLE -#define GEMM3M_Q QGEMM_Q +#define GEMM3M_Q XGEMM3M_Q #elif defined(DOUBLE) -#define GEMM3M_Q DGEMM_Q +#define GEMM3M_Q ZGEMM3M_Q #else -#define GEMM3M_Q SGEMM_Q +#define GEMM3M_Q CGEMM3M_Q #endif #endif #ifndef GEMM3M_R #ifdef XDOUBLE -#define GEMM3M_R QGEMM_R +#define GEMM3M_R XGEMM3M_R #elif defined(DOUBLE) -#define GEMM3M_R DGEMM_R +#define GEMM3M_R ZGEMM3M_R #else -#define GEMM3M_R SGEMM_R +#define GEMM3M_R CGEMM3M_R #endif #endif diff --git a/driver/level3/Makefile b/driver/level3/Makefile index d62921e84..352225206 100644 --- a/driver/level3/Makefile +++ b/driver/level3/Makefile @@ -4,11 +4,11 @@ include ../../Makefile.system USE_GEMM3M = 0 ifeq ($(ARCH), x86) -USE_GEMM3M = 0 +USE_GEMM3M = 1 endif ifeq ($(ARCH), x86_64) -USE_GEMM3M = 0 +USE_GEMM3M = 1 endif ifeq ($(ARCH), ia64) diff --git a/exports/gensymbol b/exports/gensymbol index 
e5049678a..69454d71b 100644 --- a/exports/gensymbol +++ b/exports/gensymbol @@ -75,7 +75,9 @@ ); @gemm3mobjs = ( - + cgemm3m,zgemm3m, + chemm3m,zhemm3m, + csymm3m,zsymm3m ); diff --git a/interface/Makefile b/interface/Makefile index cced14fb2..567224119 100644 --- a/interface/Makefile +++ b/interface/Makefile @@ -4,11 +4,11 @@ include $(TOPDIR)/Makefile.system SUPPORT_GEMM3M = 0 ifeq ($(ARCH), x86) -SUPPORT_GEMM3M = 0 +SUPPORT_GEMM3M = 1 endif ifeq ($(ARCH), x86_64) -SUPPORT_GEMM3M = 0 +SUPPORT_GEMM3M = 1 endif ifeq ($(ARCH), ia64) diff --git a/kernel/setparam-ref.c b/kernel/setparam-ref.c index b1beeae5c..0d7bbd4ac 100644 --- a/kernel/setparam-ref.c +++ b/kernel/setparam-ref.c @@ -293,6 +293,14 @@ gotoblas_t TABLE_NAME = { #endif chemm_outcopyTS, chemm_oltcopyTS, + 0, 0, 0, +#ifdef CGEMM3M_DEFAULT_UNROLL_M + CGEMM3M_DEFAULT_UNROLL_M, CGEMM3M_DEFAULT_UNROLL_N, MAX(CGEMM3M_DEFAULT_UNROLL_M, CGEMM3M_DEFAULT_UNROLL_N), +#else + SGEMM_DEFAULT_UNROLL_M, SGEMM_DEFAULT_UNROLL_N, MAX(SGEMM_DEFAULT_UNROLL_M, SGEMM_DEFAULT_UNROLL_N), +#endif + + cgemm3m_kernelTS, cgemm3m_incopybTS, cgemm3m_incopyrTS, @@ -391,6 +399,14 @@ gotoblas_t TABLE_NAME = { #endif zhemm_outcopyTS, zhemm_oltcopyTS, + 0, 0, 0, +#ifdef ZGEMM3M_DEFAULT_UNROLL_M + ZGEMM3M_DEFAULT_UNROLL_M, ZGEMM3M_DEFAULT_UNROLL_N, MAX(ZGEMM3M_DEFAULT_UNROLL_M, ZGEMM3M_DEFAULT_UNROLL_N), +#else + DGEMM_DEFAULT_UNROLL_M, DGEMM_DEFAULT_UNROLL_N, MAX(DGEMM_DEFAULT_UNROLL_M, DGEMM_DEFAULT_UNROLL_N), +#endif + + zgemm3m_kernelTS, zgemm3m_incopybTS, zgemm3m_incopyrTS, @@ -486,6 +502,9 @@ gotoblas_t TABLE_NAME = { #endif xhemm_outcopyTS, xhemm_oltcopyTS, + 0, 0, 0, + QGEMM_DEFAULT_UNROLL_M, QGEMM_DEFAULT_UNROLL_N, MAX(QGEMM_DEFAULT_UNROLL_M, QGEMM_DEFAULT_UNROLL_N), + xgemm3m_kernelTS, xgemm3m_incopybTS, xgemm3m_incopyrTS, @@ -661,9 +680,23 @@ static void init_parameter(void) { TABLE_NAME.dgemm_q = DGEMM_DEFAULT_Q; TABLE_NAME.cgemm_q = CGEMM_DEFAULT_Q; TABLE_NAME.zgemm_q = ZGEMM_DEFAULT_Q; + +#ifdef CGEMM3M_DEFAULT_Q + 
TABLE_NAME.cgemm3m_q = CGEMM3M_DEFAULT_Q; +#else + TABLE_NAME.cgemm3m_q = SGEMM_DEFAULT_Q; +#endif + +#ifdef ZGEMM3M_DEFAULT_Q + TABLE_NAME.zgemm3m_q = ZGEMM3M_DEFAULT_Q; +#else + TABLE_NAME.zgemm3m_q = DGEMM_DEFAULT_Q; +#endif + #ifdef EXPRECISION TABLE_NAME.qgemm_q = QGEMM_DEFAULT_Q; TABLE_NAME.xgemm_q = XGEMM_DEFAULT_Q; + TABLE_NAME.xgemm3m_q = QGEMM_DEFAULT_Q; #endif #if defined(CORE_KATMAI) || defined(CORE_COPPERMINE) || defined(CORE_BANIAS) || defined(CORE_YONAH) || defined(CORE_ATHLON) @@ -918,20 +951,56 @@ static void init_parameter(void) { TABLE_NAME.dgemm_p = DGEMM_DEFAULT_P; TABLE_NAME.cgemm_p = CGEMM_DEFAULT_P; TABLE_NAME.zgemm_p = ZGEMM_DEFAULT_P; + + + #ifdef EXPRECISION TABLE_NAME.qgemm_p = QGEMM_DEFAULT_P; TABLE_NAME.xgemm_p = XGEMM_DEFAULT_P; #endif + #endif +#ifdef CGEMM3M_DEFAULT_P + TABLE_NAME.cgemm3m_p = CGEMM3M_DEFAULT_P; +#else + TABLE_NAME.cgemm3m_p = TABLE_NAME.sgemm_p; +#endif + +#ifdef ZGEMM3M_DEFAULT_P + TABLE_NAME.zgemm3m_p = ZGEMM3M_DEFAULT_P; +#else + TABLE_NAME.zgemm3m_p = TABLE_NAME.dgemm_p; +#endif + +#ifdef EXPRECISION + TABLE_NAME.xgemm3m_p = TABLE_NAME.qgemm_p; +#endif + + + TABLE_NAME.sgemm_p = (TABLE_NAME.sgemm_p + SGEMM_DEFAULT_UNROLL_M - 1) & ~(SGEMM_DEFAULT_UNROLL_M - 1); TABLE_NAME.dgemm_p = (TABLE_NAME.dgemm_p + DGEMM_DEFAULT_UNROLL_M - 1) & ~(DGEMM_DEFAULT_UNROLL_M - 1); TABLE_NAME.cgemm_p = (TABLE_NAME.cgemm_p + CGEMM_DEFAULT_UNROLL_M - 1) & ~(CGEMM_DEFAULT_UNROLL_M - 1); TABLE_NAME.zgemm_p = (TABLE_NAME.zgemm_p + ZGEMM_DEFAULT_UNROLL_M - 1) & ~(ZGEMM_DEFAULT_UNROLL_M - 1); + +#ifdef CGEMM3M_DEFAULT_UNROLL_M + TABLE_NAME.cgemm3m_p = (TABLE_NAME.cgemm3m_p + CGEMM3M_DEFAULT_UNROLL_M - 1) & ~(CGEMM3M_DEFAULT_UNROLL_M - 1); +#else + TABLE_NAME.cgemm3m_p = (TABLE_NAME.cgemm3m_p + SGEMM_DEFAULT_UNROLL_M - 1) & ~(SGEMM_DEFAULT_UNROLL_M - 1); +#endif + +#ifdef ZGEMM3M_DEFAULT_UNROLL_M + TABLE_NAME.zgemm3m_p = (TABLE_NAME.zgemm3m_p + ZGEMM3M_DEFAULT_UNROLL_M - 1) & ~(ZGEMM3M_DEFAULT_UNROLL_M - 1); +#else + TABLE_NAME.zgemm3m_p = 
(TABLE_NAME.zgemm3m_p + DGEMM_DEFAULT_UNROLL_M - 1) & ~(DGEMM_DEFAULT_UNROLL_M - 1); +#endif + #ifdef QUAD_PRECISION TABLE_NAME.qgemm_p = (TABLE_NAME.qgemm_p + QGEMM_DEFAULT_UNROLL_M - 1) & ~(QGEMM_DEFAULT_UNROLL_M - 1); TABLE_NAME.xgemm_p = (TABLE_NAME.xgemm_p + XGEMM_DEFAULT_UNROLL_M - 1) & ~(XGEMM_DEFAULT_UNROLL_M - 1); + TABLE_NAME.xgemm3m_p = (TABLE_NAME.xgemm3m_p + QGEMM_DEFAULT_UNROLL_M - 1) & ~(QGEMM_DEFAULT_UNROLL_M - 1); #endif #ifdef DEBUG @@ -965,11 +1034,32 @@ static void init_parameter(void) { + TABLE_NAME.align) & ~TABLE_NAME.align) ) / (TABLE_NAME.zgemm_q * 16) - 15) & ~15); + TABLE_NAME.cgemm3m_r = (((BUFFER_SIZE - + ((TABLE_NAME.cgemm3m_p * TABLE_NAME.cgemm3m_q * 8 + TABLE_NAME.offsetA + + TABLE_NAME.align) & ~TABLE_NAME.align) + ) / (TABLE_NAME.cgemm3m_q * 8) - 15) & ~15); + + TABLE_NAME.zgemm3m_r = (((BUFFER_SIZE - + ((TABLE_NAME.zgemm3m_p * TABLE_NAME.zgemm3m_q * 16 + TABLE_NAME.offsetA + + TABLE_NAME.align) & ~TABLE_NAME.align) + ) / (TABLE_NAME.zgemm3m_q * 16) - 15) & ~15); + + + + #ifdef EXPRECISION TABLE_NAME.xgemm_r = (((BUFFER_SIZE - ((TABLE_NAME.xgemm_p * TABLE_NAME.xgemm_q * 32 + TABLE_NAME.offsetA + TABLE_NAME.align) & ~TABLE_NAME.align) ) / (TABLE_NAME.xgemm_q * 32) - 15) & ~15); + + TABLE_NAME.xgemm3m_r = (((BUFFER_SIZE - + ((TABLE_NAME.xgemm3m_p * TABLE_NAME.xgemm3m_q * 32 + TABLE_NAME.offsetA + + TABLE_NAME.align) & ~TABLE_NAME.align) + ) / (TABLE_NAME.xgemm3m_q * 32) - 15) & ~15); + #endif + + } diff --git a/param.h b/param.h index 82f4ad842..4adb0a1de 100644 --- a/param.h +++ b/param.h @@ -289,6 +289,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define ZGEMM_DEFAULT_Q 224 #define XGEMM_DEFAULT_Q 224 +#define CGEMM3M_DEFAULT_P 448 +#define ZGEMM3M_DEFAULT_P 224 +#define XGEMM3M_DEFAULT_P 112 +#define CGEMM3M_DEFAULT_Q 224 +#define ZGEMM3M_DEFAULT_Q 224 +#define XGEMM3M_DEFAULT_Q 224 +#define CGEMM3M_DEFAULT_R 12288 +#define ZGEMM3M_DEFAULT_R 12288 +#define XGEMM3M_DEFAULT_R 12288 + #define SGEMM_DEFAULT_R sgemm_r #define QGEMM_DEFAULT_R qgemm_r #define DGEMM_DEFAULT_R dgemm_r @@ -371,6 +381,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define QGEMM_DEFAULT_Q 224 #define XGEMM_DEFAULT_Q 224 +#define CGEMM3M_DEFAULT_P 448 +#define ZGEMM3M_DEFAULT_P 224 +#define XGEMM3M_DEFAULT_P 112 +#define CGEMM3M_DEFAULT_Q 224 +#define ZGEMM3M_DEFAULT_Q 224 +#define XGEMM3M_DEFAULT_Q 224 +#define CGEMM3M_DEFAULT_R 12288 +#define ZGEMM3M_DEFAULT_R 12288 +#define XGEMM3M_DEFAULT_R 12288 + #define SGEMM_DEFAULT_R 12288 #define QGEMM_DEFAULT_R qgemm_r #define DGEMM_DEFAULT_R 12288 @@ -1073,10 +1093,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GETRF_FACTOR 0.72 -#define CGEMM3M_DEFAULT_UNROLL_N 4 -#define CGEMM3M_DEFAULT_UNROLL_M 8 -#define ZGEMM3M_DEFAULT_UNROLL_N 2 -#define ZGEMM3M_DEFAULT_UNROLL_M 8 #endif @@ -1157,6 +1173,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define ZGEMM3M_DEFAULT_UNROLL_N 2 #define ZGEMM3M_DEFAULT_UNROLL_M 8 +#define CGEMM3M_DEFAULT_P 448 +#define ZGEMM3M_DEFAULT_P 224 +#define XGEMM3M_DEFAULT_P 112 +#define CGEMM3M_DEFAULT_Q 224 +#define ZGEMM3M_DEFAULT_Q 224 +#define XGEMM3M_DEFAULT_Q 224 +#define CGEMM3M_DEFAULT_R 12288 +#define ZGEMM3M_DEFAULT_R 12288 +#define XGEMM3M_DEFAULT_R 12288 + + + #define GETRF_FACTOR 0.72 #endif @@ -1263,6 +1291,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define CGEMM3M_DEFAULT_UNROLL_M 8 #define ZGEMM3M_DEFAULT_UNROLL_N 2 #define ZGEMM3M_DEFAULT_UNROLL_M 8 + + +#define CGEMM3M_DEFAULT_P 448 +#define ZGEMM3M_DEFAULT_P 224 +#define XGEMM3M_DEFAULT_P 112 +#define CGEMM3M_DEFAULT_Q 224 +#define ZGEMM3M_DEFAULT_Q 224 +#define XGEMM3M_DEFAULT_Q 224 +#define CGEMM3M_DEFAULT_R 12288 +#define ZGEMM3M_DEFAULT_R 12288 +#define XGEMM3M_DEFAULT_R 12288 + #endif From f0f9b25bb67129d5dbde1bae1f9b9be5c5a657a7 Mon Sep 17 00:00:00 2001 From: wernsaar Date: Sat, 20 Sep 2014 14:53:30 +0200 Subject: [PATCH 111/119] added test for CGEMM3M function --- test/Makefile | 13 +- test/cblat3_3m.dat | 23 + test/cblat3_3m.f | 3442 ++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 3477 insertions(+), 1 deletion(-) create mode 100644 test/cblat3_3m.dat create mode 100644 test/cblat3_3m.f diff --git a/test/Makefile b/test/Makefile index dce18c824..75ea6de60 100644 --- a/test/Makefile +++ b/test/Makefile @@ -89,16 +89,22 @@ endif endif -level3_3m : zblat3_3m +level3_3m : zblat3_3m cblat3_3m rm -f ?BLAT3_3M.SUMM + OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 ./cblat3_3m < ./cblat3_3m.dat + @$(GREP) -q FATAL CBLAT3_3M.SUMM && cat CBLAT3_3M.SUMM || exit 0 OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 ./zblat3_3m < ./zblat3_3m.dat @$(GREP) -q FATAL ZBLAT3_3M.SUMM && cat ZBLAT3_3M.SUMM || exit 0 ifdef SMP rm -f ?BLAT3_3M.SUMM ifeq ($(USE_OPENMP), 1) + OMP_NUM_THREADS=2 ./cblat3_3m < ./cblat3_3m.dat + @$(GREP) -q FATAL CBLAT3_3M.SUMM && cat CBLAT3_3M.SUMM || exit 0 OMP_NUM_THREADS=2 ./zblat3_3m < ./zblat3_3m.dat @$(GREP) -q FATAL ZBLAT3_3M.SUMM && cat ZBLAT3_3M.SUMM || exit 0 else + OPENBLAS_NUM_THREADS=2 ./cblat3_3m < ./cblat3_3m.dat + @$(GREP) -q FATAL CBLAT3_3M.SUMM && cat CBLAT3_3M.SUMM || exit 0 OPENBLAS_NUM_THREADS=2 ./zblat3_3m < ./zblat3_3m.dat @$(GREP) -q FATAL ZBLAT3_3M.SUMM && cat ZBLAT3_3M.SUMM || exit 0 endif @@ -150,11 +156,15 @@ cblat3 : cblat3.$(SUFFIX) ../$(LIBNAME) zblat3 : zblat3.$(SUFFIX) ../$(LIBNAME) $(FC) $(FLDFLAGS) -o zblat3 
zblat3.$(SUFFIX) ../$(LIBNAME) $(EXTRALIB) $(CEXTRALIB) +cblat3_3m : cblat3_3m.$(SUFFIX) ../$(LIBNAME) + $(FC) $(FLDFLAGS) -o cblat3_3m cblat3_3m.$(SUFFIX) ../$(LIBNAME) $(EXTRALIB) $(CEXTRALIB) + zblat3_3m : zblat3_3m.$(SUFFIX) ../$(LIBNAME) $(FC) $(FLDFLAGS) -o zblat3_3m zblat3_3m.$(SUFFIX) ../$(LIBNAME) $(EXTRALIB) $(CEXTRALIB) + clean: @rm -f *.$(SUFFIX) *.$(PSUFFIX) gmon.$(SUFFIX)ut *.SUMM *.cxml *.exe *.pdb *.dwf \ sblat1 dblat1 cblat1 zblat1 \ @@ -164,6 +174,7 @@ clean: sblat2p dblat2p cblat2p zblat2p \ sblat3p dblat3p cblat3p zblat3p \ zblat3_3m zblat3_3mp \ + cblat3_3m cblat3_3mp \ *.stackdump *.dll libs: diff --git a/test/cblat3_3m.dat b/test/cblat3_3m.dat new file mode 100644 index 000000000..fa7b96ff3 --- /dev/null +++ b/test/cblat3_3m.dat @@ -0,0 +1,23 @@ +'CBLAT3_3M.SUMM' NAME OF SUMMARY OUTPUT FILE +6 UNIT NUMBER OF SUMMARY FILE +'CBLAT3.SNAP' NAME OF SNAPSHOT OUTPUT FILE +-1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) +F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. +F LOGICAL FLAG, T TO STOP ON FAILURES. +F LOGICAL FLAG, T TO TEST ERROR EXITS. +16.0 THRESHOLD VALUE OF TEST RATIO +6 NUMBER OF VALUES OF N +0 1 2 3 7 31 63 VALUES OF N +3 NUMBER OF VALUES OF ALPHA +(0.0,0.0) (1.0,0.0) (0.7,-0.9) VALUES OF ALPHA +3 NUMBER OF VALUES OF BETA +(0.0,0.0) (1.0,0.0) (1.3,-1.1) VALUES OF BETA +CGEMM3M T PUT F FOR NO TEST. SAME COLUMNS. +CHEMM3M F PUT F FOR NO TEST. SAME COLUMNS. +CSYMM3M F PUT F FOR NO TEST. SAME COLUMNS. +CTRMM F PUT F FOR NO TEST. SAME COLUMNS. +CTRSM F PUT F FOR NO TEST. SAME COLUMNS. +CHERK F PUT F FOR NO TEST. SAME COLUMNS. +CSYRK F PUT F FOR NO TEST. SAME COLUMNS. +CHER2K F PUT F FOR NO TEST. SAME COLUMNS. +CSYR2K F PUT F FOR NO TEST. SAME COLUMNS. diff --git a/test/cblat3_3m.f b/test/cblat3_3m.f new file mode 100644 index 000000000..6cb366d64 --- /dev/null +++ b/test/cblat3_3m.f @@ -0,0 +1,3442 @@ + PROGRAM CBLAT3 +* +* Test program for the COMPLEX Level 3 Blas. +* +* The program must be driven by a short data file. 
The first 14 records +* of the file are read using list-directed input, the last 9 records +* are read using the format ( A8, L2 ). An annotated example of a data +* file can be obtained by deleting the first 3 characters from the +* following 23 lines: +* 'CBLAT3.SUMM' NAME OF SUMMARY OUTPUT FILE +* 6 UNIT NUMBER OF SUMMARY FILE +* 'CBLAT3.SNAP' NAME OF SNAPSHOT OUTPUT FILE +* -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) +* F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. +* F LOGICAL FLAG, T TO STOP ON FAILURES. +* T LOGICAL FLAG, T TO TEST ERROR EXITS. +* 16.0 THRESHOLD VALUE OF TEST RATIO +* 6 NUMBER OF VALUES OF N +* 0 1 2 3 5 9 VALUES OF N +* 3 NUMBER OF VALUES OF ALPHA +* (0.0,0.0) (1.0,0.0) (0.7,-0.9) VALUES OF ALPHA +* 3 NUMBER OF VALUES OF BETA +* (0.0,0.0) (1.0,0.0) (1.3,-1.1) VALUES OF BETA +* CGEMM3M T PUT F FOR NO TEST. SAME COLUMNS. +* CHEMM3M T PUT F FOR NO TEST. SAME COLUMNS. +* CSYMM3M T PUT F FOR NO TEST. SAME COLUMNS. +* CTRMM T PUT F FOR NO TEST. SAME COLUMNS. +* CTRSM T PUT F FOR NO TEST. SAME COLUMNS. +* CHERK T PUT F FOR NO TEST. SAME COLUMNS. +* CSYRK T PUT F FOR NO TEST. SAME COLUMNS. +* CHER2K T PUT F FOR NO TEST. SAME COLUMNS. +* CSYR2K T PUT F FOR NO TEST. SAME COLUMNS. +* +* See: +* +* Dongarra J. J., Du Croz J. J., Duff I. S. and Hammarling S. +* A Set of Level 3 Basic Linear Algebra Subprograms. +* +* Technical Memorandum No.88 (Revision 1), Mathematics and +* Computer Science Division, Argonne National Laboratory, 9700 +* South Cass Avenue, Argonne, Illinois 60439, US. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Parameters .. 
+ INTEGER NIN + PARAMETER ( NIN = 5 ) + INTEGER NSUBS + PARAMETER ( NSUBS = 9 ) + COMPLEX ZERO, ONE + PARAMETER ( ZERO = ( 0.0, 0.0 ), ONE = ( 1.0, 0.0 ) ) + REAL RZERO, RHALF, RONE + PARAMETER ( RZERO = 0.0, RHALF = 0.5, RONE = 1.0 ) + INTEGER NMAX + PARAMETER ( NMAX = 65 ) + INTEGER NIDMAX, NALMAX, NBEMAX + PARAMETER ( NIDMAX = 9, NALMAX = 7, NBEMAX = 7 ) +* .. Local Scalars .. + REAL EPS, ERR, THRESH + INTEGER I, ISNUM, J, N, NALF, NBET, NIDIM, NOUT, NTRA + LOGICAL FATAL, LTESTT, REWI, SAME, SFATAL, TRACE, + $ TSTERR + CHARACTER*1 TRANSA, TRANSB + CHARACTER*8 SNAMET + CHARACTER*32 SNAPS, SUMMRY +* .. Local Arrays .. + COMPLEX AA( NMAX*NMAX ), AB( NMAX, 2*NMAX ), + $ ALF( NALMAX ), AS( NMAX*NMAX ), + $ BB( NMAX*NMAX ), BET( NBEMAX ), + $ BS( NMAX*NMAX ), C( NMAX, NMAX ), + $ CC( NMAX*NMAX ), CS( NMAX*NMAX ), CT( NMAX ), + $ W( 2*NMAX ) + REAL G( NMAX ) + INTEGER IDIM( NIDMAX ) + LOGICAL LTEST( NSUBS ) + CHARACTER*8 SNAMES( NSUBS ) +* .. External Functions .. + REAL SDIFF + LOGICAL LCE + EXTERNAL SDIFF, LCE +* .. External Subroutines .. + EXTERNAL CCHK1, CCHK2, CCHK3, CCHK4, CCHK5, CCHKE, CMMCH +* .. Intrinsic Functions .. + INTRINSIC MAX, MIN +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL LERR, OK + CHARACTER*8 SRNAMT +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK, LERR + COMMON /SRNAMC/SRNAMT +* .. Data statements .. + DATA SNAMES/'CGEMM3M ', 'CHEMM3M ', 'CSYMM3M ', + $ 'CTRMM ', + $ 'CTRSM ', 'CHERK ', 'CSYRK ', 'CHER2K', + $ 'CSYR2K'/ +* .. Executable Statements .. +* +* Read name and unit number for summary output file and open file. +* + READ( NIN, FMT = * )SUMMRY + READ( NIN, FMT = * )NOUT + OPEN( NOUT, FILE = SUMMRY, STATUS = 'NEW' ) + NOUTC = NOUT +* +* Read name and unit number for snapshot output file and open file. +* + READ( NIN, FMT = * )SNAPS + READ( NIN, FMT = * )NTRA + TRACE = NTRA.GE.0 + IF( TRACE )THEN + OPEN( NTRA, FILE = SNAPS, STATUS = 'NEW' ) + END IF +* Read the flag that directs rewinding of the snapshot file. 
+ READ( NIN, FMT = * )REWI + REWI = REWI.AND.TRACE +* Read the flag that directs stopping on any failure. + READ( NIN, FMT = * )SFATAL +* Read the flag that indicates whether error exits are to be tested. + READ( NIN, FMT = * )TSTERR +* Read the threshold value of the test ratio + READ( NIN, FMT = * )THRESH +* +* Read and check the parameter values for the tests. +* +* Values of N + READ( NIN, FMT = * )NIDIM + IF( NIDIM.LT.1.OR.NIDIM.GT.NIDMAX )THEN + WRITE( NOUT, FMT = 9997 )'N', NIDMAX + GO TO 220 + END IF + READ( NIN, FMT = * )( IDIM( I ), I = 1, NIDIM ) + DO 10 I = 1, NIDIM + IF( IDIM( I ).LT.0.OR.IDIM( I ).GT.NMAX )THEN + WRITE( NOUT, FMT = 9996 )NMAX + GO TO 220 + END IF + 10 CONTINUE +* Values of ALPHA + READ( NIN, FMT = * )NALF + IF( NALF.LT.1.OR.NALF.GT.NALMAX )THEN + WRITE( NOUT, FMT = 9997 )'ALPHA', NALMAX + GO TO 220 + END IF + READ( NIN, FMT = * )( ALF( I ), I = 1, NALF ) +* Values of BETA + READ( NIN, FMT = * )NBET + IF( NBET.LT.1.OR.NBET.GT.NBEMAX )THEN + WRITE( NOUT, FMT = 9997 )'BETA', NBEMAX + GO TO 220 + END IF + READ( NIN, FMT = * )( BET( I ), I = 1, NBET ) +* +* Report values of parameters. +* + WRITE( NOUT, FMT = 9995 ) + WRITE( NOUT, FMT = 9994 )( IDIM( I ), I = 1, NIDIM ) + WRITE( NOUT, FMT = 9993 )( ALF( I ), I = 1, NALF ) + WRITE( NOUT, FMT = 9992 )( BET( I ), I = 1, NBET ) + IF( .NOT.TSTERR )THEN + WRITE( NOUT, FMT = * ) + WRITE( NOUT, FMT = 9984 ) + END IF + WRITE( NOUT, FMT = * ) + WRITE( NOUT, FMT = 9999 )THRESH + WRITE( NOUT, FMT = * ) +* +* Read names of subroutines and flags which indicate +* whether they are to be tested. +* + DO 20 I = 1, NSUBS + LTEST( I ) = .FALSE. + 20 CONTINUE + 30 READ( NIN, FMT = 9988, END = 60 )SNAMET, LTESTT + DO 40 I = 1, NSUBS + IF( SNAMET.EQ.SNAMES( I ) ) + $ GO TO 50 + 40 CONTINUE + WRITE( NOUT, FMT = 9990 )SNAMET + STOP + 50 LTEST( I ) = LTESTT + GO TO 30 +* + 60 CONTINUE + CLOSE ( NIN ) +* +* Compute EPS (the machine precision). 
+* + EPS = RONE + 70 CONTINUE + IF( SDIFF( RONE + EPS, RONE ).EQ.RZERO ) + $ GO TO 80 + EPS = RHALF*EPS + GO TO 70 + 80 CONTINUE + EPS = EPS + EPS + WRITE( NOUT, FMT = 9998 )EPS +* +* Check the reliability of CMMCH using exact data. +* + N = MIN( 32, NMAX ) + DO 100 J = 1, N + DO 90 I = 1, N + AB( I, J ) = MAX( I - J + 1, 0 ) + 90 CONTINUE + AB( J, NMAX + 1 ) = J + AB( 1, NMAX + J ) = J + C( J, 1 ) = ZERO + 100 CONTINUE + DO 110 J = 1, N + CC( J ) = J*( ( J + 1 )*J )/2 - ( ( J + 1 )*J*( J - 1 ) )/3 + 110 CONTINUE +* CC holds the exact result. On exit from CMMCH CT holds +* the result computed by CMMCH. + TRANSA = 'N' + TRANSB = 'N' + CALL CMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX, + $ AB( 1, NMAX + 1 ), NMAX, ZERO, C, NMAX, CT, G, CC, + $ NMAX, EPS, ERR, FATAL, NOUT, .TRUE. ) + SAME = LCE( CC, CT, N ) + IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN + WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR + STOP + END IF + TRANSB = 'C' + CALL CMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX, + $ AB( 1, NMAX + 1 ), NMAX, ZERO, C, NMAX, CT, G, CC, + $ NMAX, EPS, ERR, FATAL, NOUT, .TRUE. ) + SAME = LCE( CC, CT, N ) + IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN + WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR + STOP + END IF + DO 120 J = 1, N + AB( J, NMAX + 1 ) = N - J + 1 + AB( 1, NMAX + J ) = N - J + 1 + 120 CONTINUE + DO 130 J = 1, N + CC( N - J + 1 ) = J*( ( J + 1 )*J )/2 - + $ ( ( J + 1 )*J*( J - 1 ) )/3 + 130 CONTINUE + TRANSA = 'C' + TRANSB = 'N' + CALL CMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX, + $ AB( 1, NMAX + 1 ), NMAX, ZERO, C, NMAX, CT, G, CC, + $ NMAX, EPS, ERR, FATAL, NOUT, .TRUE. ) + SAME = LCE( CC, CT, N ) + IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN + WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR + STOP + END IF + TRANSB = 'C' + CALL CMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX, + $ AB( 1, NMAX + 1 ), NMAX, ZERO, C, NMAX, CT, G, CC, + $ NMAX, EPS, ERR, FATAL, NOUT, .TRUE. 
) + SAME = LCE( CC, CT, N ) + IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN + WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR + STOP + END IF +* +* Test each subroutine in turn. +* + DO 200 ISNUM = 1, NSUBS + WRITE( NOUT, FMT = * ) + IF( .NOT.LTEST( ISNUM ) )THEN +* Subprogram is not to be tested. + WRITE( NOUT, FMT = 9987 )SNAMES( ISNUM ) + ELSE + SRNAMT = SNAMES( ISNUM ) +* Test error exits. + IF( TSTERR )THEN + CALL CCHKE( ISNUM, SNAMES( ISNUM ), NOUT ) + WRITE( NOUT, FMT = * ) + END IF +* Test computations. + INFOT = 0 + OK = .TRUE. + FATAL = .FALSE. + GO TO ( 140, 150, 150, 160, 160, 170, 170, + $ 180, 180 )ISNUM +* Test CGEMM3M, 01. + 140 CALL CCHK1( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, + $ NMAX, AB, AA, AS, AB( 1, NMAX + 1 ), BB, BS, C, + $ CC, CS, CT, G ) + GO TO 190 +* Test CHEMM3M, 02, CSYMM3M, 03. + 150 CALL CCHK2( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, + $ NMAX, AB, AA, AS, AB( 1, NMAX + 1 ), BB, BS, C, + $ CC, CS, CT, G ) + GO TO 190 +* Test CTRMM, 04, CTRSM, 05. + 160 CALL CCHK3( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NMAX, AB, + $ AA, AS, AB( 1, NMAX + 1 ), BB, BS, CT, G, C ) + GO TO 190 +* Test CHERK, 06, CSYRK, 07. + 170 CALL CCHK4( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, + $ NMAX, AB, AA, AS, AB( 1, NMAX + 1 ), BB, BS, C, + $ CC, CS, CT, G ) + GO TO 190 +* Test CHER2K, 08, CSYR2K, 09. 
+ 180 CALL CCHK5( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, + $ NMAX, AB, AA, AS, BB, BS, C, CC, CS, CT, G, W ) + GO TO 190 +* + 190 IF( FATAL.AND.SFATAL ) + $ GO TO 210 + END IF + 200 CONTINUE + WRITE( NOUT, FMT = 9986 ) + GO TO 230 +* + 210 CONTINUE + WRITE( NOUT, FMT = 9985 ) + GO TO 230 +* + 220 CONTINUE + WRITE( NOUT, FMT = 9991 ) +* + 230 CONTINUE + IF( TRACE ) + $ CLOSE ( NTRA ) + CLOSE ( NOUT ) + STOP +* + 9999 FORMAT( ' ROUTINES PASS COMPUTATIONAL TESTS IF TEST RATIO IS LES', + $ 'S THAN', F8.2 ) + 9998 FORMAT( ' RELATIVE MACHINE PRECISION IS TAKEN TO BE', 1P, E9.1 ) + 9997 FORMAT( ' NUMBER OF VALUES OF ', A, ' IS LESS THAN 1 OR GREATER ', + $ 'THAN ', I2 ) + 9996 FORMAT( ' VALUE OF N IS LESS THAN 0 OR GREATER THAN ', I2 ) + 9995 FORMAT( ' TESTS OF THE COMPLEX LEVEL 3 BLAS', //' THE F', + $ 'OLLOWING PARAMETER VALUES WILL BE USED:' ) + 9994 FORMAT( ' FOR N ', 9I6 ) + 9993 FORMAT( ' FOR ALPHA ', + $ 7( '(', F4.1, ',', F4.1, ') ', : ) ) + 9992 FORMAT( ' FOR BETA ', + $ 7( '(', F4.1, ',', F4.1, ') ', : ) ) + 9991 FORMAT( ' AMEND DATA FILE OR INCREASE ARRAY SIZES IN PROGRAM', + $ /' ******* TESTS ABANDONED *******' ) + 9990 FORMAT( ' SUBPROGRAM NAME ', A8, ' NOT RECOGNIZED', /' ******* T', + $ 'ESTS ABANDONED *******' ) + 9989 FORMAT( ' ERROR IN CMMCH - IN-LINE DOT PRODUCTS ARE BEING EVALU', + $ 'ATED WRONGLY.', /' CMMCH WAS CALLED WITH TRANSA = ', A1, + $ ' AND TRANSB = ', A1, /' AND RETURNED SAME = ', L1, ' AND ', + $ 'ERR = ', F12.3, '.', /' THIS MAY BE DUE TO FAULTS IN THE ', + $ 'ARITHMETIC OR THE COMPILER.', /' ******* TESTS ABANDONED ', + $ '*******' ) + 9988 FORMAT( A8, L2 ) + 9987 FORMAT( 1X, A8, ' WAS NOT TESTED' ) + 9986 FORMAT( /' END OF TESTS' ) + 9985 FORMAT( /' ******* FATAL ERROR - TESTS ABANDONED *******' ) + 9984 FORMAT( ' ERROR-EXITS WILL NOT BE TESTED' ) +* +* End of CBLAT3. 
+* + END + SUBROUTINE CCHK1( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, + $ FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, NMAX, + $ A, AA, AS, B, BB, BS, C, CC, CS, CT, G ) +* +* Tests CGEMM3M. +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Parameters .. + COMPLEX ZERO + PARAMETER ( ZERO = ( 0.0, 0.0 ) ) + REAL RZERO + PARAMETER ( RZERO = 0.0 ) +* .. Scalar Arguments .. + REAL EPS, THRESH + INTEGER NALF, NBET, NIDIM, NMAX, NOUT, NTRA + LOGICAL FATAL, REWI, TRACE + CHARACTER*8 SNAME +* .. Array Arguments .. + COMPLEX A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), + $ AS( NMAX*NMAX ), B( NMAX, NMAX ), + $ BB( NMAX*NMAX ), BET( NBET ), BS( NMAX*NMAX ), + $ C( NMAX, NMAX ), CC( NMAX*NMAX ), + $ CS( NMAX*NMAX ), CT( NMAX ) + REAL G( NMAX ) + INTEGER IDIM( NIDIM ) +* .. Local Scalars .. + COMPLEX ALPHA, ALS, BETA, BLS + REAL ERR, ERRMAX + INTEGER I, IA, IB, ICA, ICB, IK, IM, IN, K, KS, LAA, + $ LBB, LCC, LDA, LDAS, LDB, LDBS, LDC, LDCS, M, + $ MA, MB, MS, N, NA, NARGS, NB, NC, NS + LOGICAL NULL, RESET, SAME, TRANA, TRANB + CHARACTER*1 TRANAS, TRANBS, TRANSA, TRANSB + CHARACTER*3 ICH +* .. Local Arrays .. + LOGICAL ISAME( 13 ) +* .. External Functions .. + LOGICAL LCE, LCERES + EXTERNAL LCE, LCERES +* .. External Subroutines .. + EXTERNAL CGEMM3M, CMAKE, CMMCH +* .. Intrinsic Functions .. + INTRINSIC MAX +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL LERR, OK +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK, LERR +* .. Data statements .. + DATA ICH/'NTC'/ +* .. Executable Statements .. +* + NARGS = 13 + NC = 0 + RESET = .TRUE. + ERRMAX = RZERO +* + DO 110 IM = 1, NIDIM + M = IDIM( IM ) +* + DO 100 IN = 1, NIDIM + N = IDIM( IN ) +* Set LDC to 1 more than minimum value if room. 
+ LDC = M + IF( LDC.LT.NMAX ) + $ LDC = LDC + 1 +* Skip tests if not enough room. + IF( LDC.GT.NMAX ) + $ GO TO 100 + LCC = LDC*N + NULL = N.LE.0.OR.M.LE.0 +* + DO 90 IK = 1, NIDIM + K = IDIM( IK ) +* + DO 80 ICA = 1, 3 + TRANSA = ICH( ICA: ICA ) + TRANA = TRANSA.EQ.'T'.OR.TRANSA.EQ.'C' +* + IF( TRANA )THEN + MA = K + NA = M + ELSE + MA = M + NA = K + END IF +* Set LDA to 1 more than minimum value if room. + LDA = MA + IF( LDA.LT.NMAX ) + $ LDA = LDA + 1 +* Skip tests if not enough room. + IF( LDA.GT.NMAX ) + $ GO TO 80 + LAA = LDA*NA +* +* Generate the matrix A. +* + CALL CMAKE( 'GE', ' ', ' ', MA, NA, A, NMAX, AA, LDA, + $ RESET, ZERO ) +* + DO 70 ICB = 1, 3 + TRANSB = ICH( ICB: ICB ) + TRANB = TRANSB.EQ.'T'.OR.TRANSB.EQ.'C' +* + IF( TRANB )THEN + MB = N + NB = K + ELSE + MB = K + NB = N + END IF +* Set LDB to 1 more than minimum value if room. + LDB = MB + IF( LDB.LT.NMAX ) + $ LDB = LDB + 1 +* Skip tests if not enough room. + IF( LDB.GT.NMAX ) + $ GO TO 70 + LBB = LDB*NB +* +* Generate the matrix B. +* + CALL CMAKE( 'GE', ' ', ' ', MB, NB, B, NMAX, BB, + $ LDB, RESET, ZERO ) +* + DO 60 IA = 1, NALF + ALPHA = ALF( IA ) +* + DO 50 IB = 1, NBET + BETA = BET( IB ) +* +* Generate the matrix C. +* + CALL CMAKE( 'GE', ' ', ' ', M, N, C, NMAX, + $ CC, LDC, RESET, ZERO ) +* + NC = NC + 1 +* +* Save every datum before calling the +* subroutine. +* + TRANAS = TRANSA + TRANBS = TRANSB + MS = M + NS = N + KS = K + ALS = ALPHA + DO 10 I = 1, LAA + AS( I ) = AA( I ) + 10 CONTINUE + LDAS = LDA + DO 20 I = 1, LBB + BS( I ) = BB( I ) + 20 CONTINUE + LDBS = LDB + BLS = BETA + DO 30 I = 1, LCC + CS( I ) = CC( I ) + 30 CONTINUE + LDCS = LDC +* +* Call the subroutine. +* + IF( TRACE ) + $ WRITE( NTRA, FMT = 9995 )NC, SNAME, + $ TRANSA, TRANSB, M, N, K, ALPHA, LDA, LDB, + $ BETA, LDC + IF( REWI ) + $ REWIND NTRA + CALL CGEMM3M( TRANSA, TRANSB, M, N, K, ALPHA, + $ AA, LDA, BB, LDB, BETA, CC, LDC ) +* +* Check if error-exit was taken incorrectly. 
+* + IF( .NOT.OK )THEN + WRITE( NOUT, FMT = 9994 ) + FATAL = .TRUE. + GO TO 120 + END IF +* +* See what data changed inside subroutines. +* + ISAME( 1 ) = TRANSA.EQ.TRANAS + ISAME( 2 ) = TRANSB.EQ.TRANBS + ISAME( 3 ) = MS.EQ.M + ISAME( 4 ) = NS.EQ.N + ISAME( 5 ) = KS.EQ.K + ISAME( 6 ) = ALS.EQ.ALPHA + ISAME( 7 ) = LCE( AS, AA, LAA ) + ISAME( 8 ) = LDAS.EQ.LDA + ISAME( 9 ) = LCE( BS, BB, LBB ) + ISAME( 10 ) = LDBS.EQ.LDB + ISAME( 11 ) = BLS.EQ.BETA + IF( NULL )THEN + ISAME( 12 ) = LCE( CS, CC, LCC ) + ELSE + ISAME( 12 ) = LCERES( 'GE', ' ', M, N, CS, + $ CC, LDC ) + END IF + ISAME( 13 ) = LDCS.EQ.LDC +* +* If data was incorrectly changed, report +* and return. +* + SAME = .TRUE. + DO 40 I = 1, NARGS + SAME = SAME.AND.ISAME( I ) + IF( .NOT.ISAME( I ) ) + $ WRITE( NOUT, FMT = 9998 )I + 40 CONTINUE + IF( .NOT.SAME )THEN + FATAL = .TRUE. + GO TO 120 + END IF +* + IF( .NOT.NULL )THEN +* +* Check the result. +* + CALL CMMCH( TRANSA, TRANSB, M, N, K, + $ ALPHA, A, NMAX, B, NMAX, BETA, + $ C, NMAX, CT, G, CC, LDC, EPS, + $ ERR, FATAL, NOUT, .TRUE. ) + ERRMAX = MAX( ERRMAX, ERR ) +* If got really bad answer, report and +* return. + IF( FATAL ) + $ GO TO 120 + END IF +* + 50 CONTINUE +* + 60 CONTINUE +* + 70 CONTINUE +* + 80 CONTINUE +* + 90 CONTINUE +* + 100 CONTINUE +* + 110 CONTINUE +* +* Report result. 
+* + IF( ERRMAX.LT.THRESH )THEN + WRITE( NOUT, FMT = 9999 )SNAME, NC + ELSE + WRITE( NOUT, FMT = 9997 )SNAME, NC, ERRMAX + END IF + GO TO 130 +* + 120 CONTINUE + WRITE( NOUT, FMT = 9996 )SNAME + WRITE( NOUT, FMT = 9995 )NC, SNAME, TRANSA, TRANSB, M, N, K, + $ ALPHA, LDA, LDB, BETA, LDC +* + 130 CONTINUE + RETURN +* + 9999 FORMAT( ' ', A8, ' PASSED THE COMPUTATIONAL TESTS (', I6, ' CALL', + $ 'S)' ) + 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', + $ 'ANGED INCORRECTLY *******' ) + 9997 FORMAT( ' ', A8, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C', + $ 'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2, + $ ' - SUSPECT *******' ) + 9996 FORMAT( ' ******* ', A8, ' FAILED ON CALL NUMBER:' ) + 9995 FORMAT( 1X, I6, ': ', A8, '(''', A1, ''',''', A1, ''',', + $ 3( I3, ',' ), '(', F4.1, ',', F4.1, '), A,', I3, ', B,', I3, + $ ',(', F4.1, ',', F4.1, '), C,', I3, ').' ) + 9994 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', + $ '******' ) +* +* End of CCHK1. +* + END + SUBROUTINE CCHK2( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, + $ FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, NMAX, + $ A, AA, AS, B, BB, BS, C, CC, CS, CT, G ) +* +* Tests CHEMM3M and CSYMM3M. +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Parameters .. + COMPLEX ZERO + PARAMETER ( ZERO = ( 0.0, 0.0 ) ) + REAL RZERO + PARAMETER ( RZERO = 0.0 ) +* .. Scalar Arguments .. + REAL EPS, THRESH + INTEGER NALF, NBET, NIDIM, NMAX, NOUT, NTRA + LOGICAL FATAL, REWI, TRACE + CHARACTER*8 SNAME +* .. Array Arguments .. 
+ COMPLEX A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), + $ AS( NMAX*NMAX ), B( NMAX, NMAX ), + $ BB( NMAX*NMAX ), BET( NBET ), BS( NMAX*NMAX ), + $ C( NMAX, NMAX ), CC( NMAX*NMAX ), + $ CS( NMAX*NMAX ), CT( NMAX ) + REAL G( NMAX ) + INTEGER IDIM( NIDIM ) +* .. Local Scalars .. + COMPLEX ALPHA, ALS, BETA, BLS + REAL ERR, ERRMAX + INTEGER I, IA, IB, ICS, ICU, IM, IN, LAA, LBB, LCC, + $ LDA, LDAS, LDB, LDBS, LDC, LDCS, M, MS, N, NA, + $ NARGS, NC, NS + LOGICAL CONJ, LEFT, NULL, RESET, SAME + CHARACTER*1 SIDE, SIDES, UPLO, UPLOS + CHARACTER*2 ICHS, ICHU +* .. Local Arrays .. + LOGICAL ISAME( 13 ) +* .. External Functions .. + LOGICAL LCE, LCERES + EXTERNAL LCE, LCERES +* .. External Subroutines .. + EXTERNAL CHEMM3M, CMAKE, CMMCH, CSYMM3M +* .. Intrinsic Functions .. + INTRINSIC MAX +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL LERR, OK +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK, LERR +* .. Data statements .. + DATA ICHS/'LR'/, ICHU/'UL'/ +* .. Executable Statements .. + CONJ = SNAME( 2: 3 ).EQ.'HE' +* + NARGS = 12 + NC = 0 + RESET = .TRUE. + ERRMAX = RZERO +* + DO 100 IM = 1, NIDIM + M = IDIM( IM ) +* + DO 90 IN = 1, NIDIM + N = IDIM( IN ) +* Set LDC to 1 more than minimum value if room. + LDC = M + IF( LDC.LT.NMAX ) + $ LDC = LDC + 1 +* Skip tests if not enough room. + IF( LDC.GT.NMAX ) + $ GO TO 90 + LCC = LDC*N + NULL = N.LE.0.OR.M.LE.0 +* Set LDB to 1 more than minimum value if room. + LDB = M + IF( LDB.LT.NMAX ) + $ LDB = LDB + 1 +* Skip tests if not enough room. + IF( LDB.GT.NMAX ) + $ GO TO 90 + LBB = LDB*N +* +* Generate the matrix B. +* + CALL CMAKE( 'GE', ' ', ' ', M, N, B, NMAX, BB, LDB, RESET, + $ ZERO ) +* + DO 80 ICS = 1, 2 + SIDE = ICHS( ICS: ICS ) + LEFT = SIDE.EQ.'L' +* + IF( LEFT )THEN + NA = M + ELSE + NA = N + END IF +* Set LDA to 1 more than minimum value if room. + LDA = NA + IF( LDA.LT.NMAX ) + $ LDA = LDA + 1 +* Skip tests if not enough room. 
+ IF( LDA.GT.NMAX ) + $ GO TO 80 + LAA = LDA*NA +* + DO 70 ICU = 1, 2 + UPLO = ICHU( ICU: ICU ) +* +* Generate the hermitian or symmetric matrix A. +* + CALL CMAKE( SNAME( 2: 3 ), UPLO, ' ', NA, NA, A, NMAX, + $ AA, LDA, RESET, ZERO ) +* + DO 60 IA = 1, NALF + ALPHA = ALF( IA ) +* + DO 50 IB = 1, NBET + BETA = BET( IB ) +* +* Generate the matrix C. +* + CALL CMAKE( 'GE', ' ', ' ', M, N, C, NMAX, CC, + $ LDC, RESET, ZERO ) +* + NC = NC + 1 +* +* Save every datum before calling the +* subroutine. +* + SIDES = SIDE + UPLOS = UPLO + MS = M + NS = N + ALS = ALPHA + DO 10 I = 1, LAA + AS( I ) = AA( I ) + 10 CONTINUE + LDAS = LDA + DO 20 I = 1, LBB + BS( I ) = BB( I ) + 20 CONTINUE + LDBS = LDB + BLS = BETA + DO 30 I = 1, LCC + CS( I ) = CC( I ) + 30 CONTINUE + LDCS = LDC +* +* Call the subroutine. +* + IF( TRACE ) + $ WRITE( NTRA, FMT = 9995 )NC, SNAME, SIDE, + $ UPLO, M, N, ALPHA, LDA, LDB, BETA, LDC + IF( REWI ) + $ REWIND NTRA + IF( CONJ )THEN + CALL CHEMM3M( SIDE, UPLO, M, N, ALPHA, AA, LDA, + $ BB, LDB, BETA, CC, LDC ) + ELSE + CALL CSYMM3M( SIDE, UPLO, M, N, ALPHA, AA, LDA, + $ BB, LDB, BETA, CC, LDC ) + END IF +* +* Check if error-exit was taken incorrectly. +* + IF( .NOT.OK )THEN + WRITE( NOUT, FMT = 9994 ) + FATAL = .TRUE. + GO TO 110 + END IF +* +* See what data changed inside subroutines. +* + ISAME( 1 ) = SIDES.EQ.SIDE + ISAME( 2 ) = UPLOS.EQ.UPLO + ISAME( 3 ) = MS.EQ.M + ISAME( 4 ) = NS.EQ.N + ISAME( 5 ) = ALS.EQ.ALPHA + ISAME( 6 ) = LCE( AS, AA, LAA ) + ISAME( 7 ) = LDAS.EQ.LDA + ISAME( 8 ) = LCE( BS, BB, LBB ) + ISAME( 9 ) = LDBS.EQ.LDB + ISAME( 10 ) = BLS.EQ.BETA + IF( NULL )THEN + ISAME( 11 ) = LCE( CS, CC, LCC ) + ELSE + ISAME( 11 ) = LCERES( 'GE', ' ', M, N, CS, + $ CC, LDC ) + END IF + ISAME( 12 ) = LDCS.EQ.LDC +* +* If data was incorrectly changed, report and +* return. +* + SAME = .TRUE. + DO 40 I = 1, NARGS + SAME = SAME.AND.ISAME( I ) + IF( .NOT.ISAME( I ) ) + $ WRITE( NOUT, FMT = 9998 )I + 40 CONTINUE + IF( .NOT.SAME )THEN + FATAL = .TRUE. 
+ GO TO 110 + END IF +* + IF( .NOT.NULL )THEN +* +* Check the result. +* + IF( LEFT )THEN + CALL CMMCH( 'N', 'N', M, N, M, ALPHA, A, + $ NMAX, B, NMAX, BETA, C, NMAX, + $ CT, G, CC, LDC, EPS, ERR, + $ FATAL, NOUT, .TRUE. ) + ELSE + CALL CMMCH( 'N', 'N', M, N, N, ALPHA, B, + $ NMAX, A, NMAX, BETA, C, NMAX, + $ CT, G, CC, LDC, EPS, ERR, + $ FATAL, NOUT, .TRUE. ) + END IF + ERRMAX = MAX( ERRMAX, ERR ) +* If got really bad answer, report and +* return. + IF( FATAL ) + $ GO TO 110 + END IF +* + 50 CONTINUE +* + 60 CONTINUE +* + 70 CONTINUE +* + 80 CONTINUE +* + 90 CONTINUE +* + 100 CONTINUE +* +* Report result. +* + IF( ERRMAX.LT.THRESH )THEN + WRITE( NOUT, FMT = 9999 )SNAME, NC + ELSE + WRITE( NOUT, FMT = 9997 )SNAME, NC, ERRMAX + END IF + GO TO 120 +* + 110 CONTINUE + WRITE( NOUT, FMT = 9996 )SNAME + WRITE( NOUT, FMT = 9995 )NC, SNAME, SIDE, UPLO, M, N, ALPHA, LDA, + $ LDB, BETA, LDC +* + 120 CONTINUE + RETURN +* + 9999 FORMAT( ' ', A8, ' PASSED THE COMPUTATIONAL TESTS (', I6, ' CALL', + $ 'S)' ) + 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', + $ 'ANGED INCORRECTLY *******' ) + 9997 FORMAT( ' ', A8, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C', + $ 'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2, + $ ' - SUSPECT *******' ) + 9996 FORMAT( ' ******* ', A8, ' FAILED ON CALL NUMBER:' ) + 9995 FORMAT( 1X, I6, ': ', A8, '(', 2( '''', A1, ''',' ), 2( I3, ',' ), + $ '(', F4.1, ',', F4.1, '), A,', I3, ', B,', I3, ',(', F4.1, + $ ',', F4.1, '), C,', I3, ') .' ) + 9994 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', + $ '******' ) +* +* End of CCHK2. +* + END + SUBROUTINE CCHK3( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, + $ FATAL, NIDIM, IDIM, NALF, ALF, NMAX, A, AA, AS, + $ B, BB, BS, CT, G, C ) +* +* Tests CTRMM and CTRSM. +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. 
+* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Parameters .. + COMPLEX ZERO, ONE + PARAMETER ( ZERO = ( 0.0, 0.0 ), ONE = ( 1.0, 0.0 ) ) + REAL RZERO + PARAMETER ( RZERO = 0.0 ) +* .. Scalar Arguments .. + REAL EPS, THRESH + INTEGER NALF, NIDIM, NMAX, NOUT, NTRA + LOGICAL FATAL, REWI, TRACE + CHARACTER*8 SNAME +* .. Array Arguments .. + COMPLEX A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), + $ AS( NMAX*NMAX ), B( NMAX, NMAX ), + $ BB( NMAX*NMAX ), BS( NMAX*NMAX ), + $ C( NMAX, NMAX ), CT( NMAX ) + REAL G( NMAX ) + INTEGER IDIM( NIDIM ) +* .. Local Scalars .. + COMPLEX ALPHA, ALS + REAL ERR, ERRMAX + INTEGER I, IA, ICD, ICS, ICT, ICU, IM, IN, J, LAA, LBB, + $ LDA, LDAS, LDB, LDBS, M, MS, N, NA, NARGS, NC, + $ NS + LOGICAL LEFT, NULL, RESET, SAME + CHARACTER*1 DIAG, DIAGS, SIDE, SIDES, TRANAS, TRANSA, UPLO, + $ UPLOS + CHARACTER*2 ICHD, ICHS, ICHU + CHARACTER*3 ICHT +* .. Local Arrays .. + LOGICAL ISAME( 13 ) +* .. External Functions .. + LOGICAL LCE, LCERES + EXTERNAL LCE, LCERES +* .. External Subroutines .. + EXTERNAL CMAKE, CMMCH, CTRMM, CTRSM +* .. Intrinsic Functions .. + INTRINSIC MAX +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL LERR, OK +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK, LERR +* .. Data statements .. + DATA ICHU/'UL'/, ICHT/'NTC'/, ICHD/'UN'/, ICHS/'LR'/ +* .. Executable Statements .. +* + NARGS = 11 + NC = 0 + RESET = .TRUE. + ERRMAX = RZERO +* Set up zero matrix for CMMCH. + DO 20 J = 1, NMAX + DO 10 I = 1, NMAX + C( I, J ) = ZERO + 10 CONTINUE + 20 CONTINUE +* + DO 140 IM = 1, NIDIM + M = IDIM( IM ) +* + DO 130 IN = 1, NIDIM + N = IDIM( IN ) +* Set LDB to 1 more than minimum value if room. + LDB = M + IF( LDB.LT.NMAX ) + $ LDB = LDB + 1 +* Skip tests if not enough room. 
+ IF( LDB.GT.NMAX ) + $ GO TO 130 + LBB = LDB*N + NULL = M.LE.0.OR.N.LE.0 +* + DO 120 ICS = 1, 2 + SIDE = ICHS( ICS: ICS ) + LEFT = SIDE.EQ.'L' + IF( LEFT )THEN + NA = M + ELSE + NA = N + END IF +* Set LDA to 1 more than minimum value if room. + LDA = NA + IF( LDA.LT.NMAX ) + $ LDA = LDA + 1 +* Skip tests if not enough room. + IF( LDA.GT.NMAX ) + $ GO TO 130 + LAA = LDA*NA +* + DO 110 ICU = 1, 2 + UPLO = ICHU( ICU: ICU ) +* + DO 100 ICT = 1, 3 + TRANSA = ICHT( ICT: ICT ) +* + DO 90 ICD = 1, 2 + DIAG = ICHD( ICD: ICD ) +* + DO 80 IA = 1, NALF + ALPHA = ALF( IA ) +* +* Generate the matrix A. +* + CALL CMAKE( 'TR', UPLO, DIAG, NA, NA, A, + $ NMAX, AA, LDA, RESET, ZERO ) +* +* Generate the matrix B. +* + CALL CMAKE( 'GE', ' ', ' ', M, N, B, NMAX, + $ BB, LDB, RESET, ZERO ) +* + NC = NC + 1 +* +* Save every datum before calling the +* subroutine. +* + SIDES = SIDE + UPLOS = UPLO + TRANAS = TRANSA + DIAGS = DIAG + MS = M + NS = N + ALS = ALPHA + DO 30 I = 1, LAA + AS( I ) = AA( I ) + 30 CONTINUE + LDAS = LDA + DO 40 I = 1, LBB + BS( I ) = BB( I ) + 40 CONTINUE + LDBS = LDB +* +* Call the subroutine. +* + IF( SNAME( 4: 5 ).EQ.'MM' )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9995 )NC, SNAME, + $ SIDE, UPLO, TRANSA, DIAG, M, N, ALPHA, + $ LDA, LDB + IF( REWI ) + $ REWIND NTRA + CALL CTRMM( SIDE, UPLO, TRANSA, DIAG, M, + $ N, ALPHA, AA, LDA, BB, LDB ) + ELSE IF( SNAME( 4: 5 ).EQ.'SM' )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9995 )NC, SNAME, + $ SIDE, UPLO, TRANSA, DIAG, M, N, ALPHA, + $ LDA, LDB + IF( REWI ) + $ REWIND NTRA + CALL CTRSM( SIDE, UPLO, TRANSA, DIAG, M, + $ N, ALPHA, AA, LDA, BB, LDB ) + END IF +* +* Check if error-exit was taken incorrectly. +* + IF( .NOT.OK )THEN + WRITE( NOUT, FMT = 9994 ) + FATAL = .TRUE. + GO TO 150 + END IF +* +* See what data changed inside subroutines. 
+* + ISAME( 1 ) = SIDES.EQ.SIDE + ISAME( 2 ) = UPLOS.EQ.UPLO + ISAME( 3 ) = TRANAS.EQ.TRANSA + ISAME( 4 ) = DIAGS.EQ.DIAG + ISAME( 5 ) = MS.EQ.M + ISAME( 6 ) = NS.EQ.N + ISAME( 7 ) = ALS.EQ.ALPHA + ISAME( 8 ) = LCE( AS, AA, LAA ) + ISAME( 9 ) = LDAS.EQ.LDA + IF( NULL )THEN + ISAME( 10 ) = LCE( BS, BB, LBB ) + ELSE + ISAME( 10 ) = LCERES( 'GE', ' ', M, N, BS, + $ BB, LDB ) + END IF + ISAME( 11 ) = LDBS.EQ.LDB +* +* If data was incorrectly changed, report and +* return. +* + SAME = .TRUE. + DO 50 I = 1, NARGS + SAME = SAME.AND.ISAME( I ) + IF( .NOT.ISAME( I ) ) + $ WRITE( NOUT, FMT = 9998 )I + 50 CONTINUE + IF( .NOT.SAME )THEN + FATAL = .TRUE. + GO TO 150 + END IF +* + IF( .NOT.NULL )THEN + IF( SNAME( 4: 5 ).EQ.'MM' )THEN +* +* Check the result. +* + IF( LEFT )THEN + CALL CMMCH( TRANSA, 'N', M, N, M, + $ ALPHA, A, NMAX, B, NMAX, + $ ZERO, C, NMAX, CT, G, + $ BB, LDB, EPS, ERR, + $ FATAL, NOUT, .TRUE. ) + ELSE + CALL CMMCH( 'N', TRANSA, M, N, N, + $ ALPHA, B, NMAX, A, NMAX, + $ ZERO, C, NMAX, CT, G, + $ BB, LDB, EPS, ERR, + $ FATAL, NOUT, .TRUE. ) + END IF + ELSE IF( SNAME( 4: 5 ).EQ.'SM' )THEN +* +* Compute approximation to original +* matrix. +* + DO 70 J = 1, N + DO 60 I = 1, M + C( I, J ) = BB( I + ( J - 1 )* + $ LDB ) + BB( I + ( J - 1 )*LDB ) = ALPHA* + $ B( I, J ) + 60 CONTINUE + 70 CONTINUE +* + IF( LEFT )THEN + CALL CMMCH( TRANSA, 'N', M, N, M, + $ ONE, A, NMAX, C, NMAX, + $ ZERO, B, NMAX, CT, G, + $ BB, LDB, EPS, ERR, + $ FATAL, NOUT, .FALSE. ) + ELSE + CALL CMMCH( 'N', TRANSA, M, N, N, + $ ONE, C, NMAX, A, NMAX, + $ ZERO, B, NMAX, CT, G, + $ BB, LDB, EPS, ERR, + $ FATAL, NOUT, .FALSE. ) + END IF + END IF + ERRMAX = MAX( ERRMAX, ERR ) +* If got really bad answer, report and +* return. + IF( FATAL ) + $ GO TO 150 + END IF +* + 80 CONTINUE +* + 90 CONTINUE +* + 100 CONTINUE +* + 110 CONTINUE +* + 120 CONTINUE +* + 130 CONTINUE +* + 140 CONTINUE +* +* Report result. 
+* + IF( ERRMAX.LT.THRESH )THEN + WRITE( NOUT, FMT = 9999 )SNAME, NC + ELSE + WRITE( NOUT, FMT = 9997 )SNAME, NC, ERRMAX + END IF + GO TO 160 +* + 150 CONTINUE + WRITE( NOUT, FMT = 9996 )SNAME + WRITE( NOUT, FMT = 9995 )NC, SNAME, SIDE, UPLO, TRANSA, DIAG, M, + $ N, ALPHA, LDA, LDB +* + 160 CONTINUE + RETURN +* + 9999 FORMAT( ' ', A8, ' PASSED THE COMPUTATIONAL TESTS (', I6, ' CALL', + $ 'S)' ) + 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', + $ 'ANGED INCORRECTLY *******' ) + 9997 FORMAT( ' ', A8, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C', + $ 'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2, + $ ' - SUSPECT *******' ) + 9996 FORMAT( ' ******* ', A8, ' FAILED ON CALL NUMBER:' ) + 9995 FORMAT( 1X, I6, ': ', A8, '(', 4( '''', A1, ''',' ), 2( I3, ',' ), + $ '(', F4.1, ',', F4.1, '), A,', I3, ', B,', I3, ') ', + $ ' .' ) + 9994 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', + $ '******' ) +* +* End of CCHK3. +* + END + SUBROUTINE CCHK4( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, + $ FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, NMAX, + $ A, AA, AS, B, BB, BS, C, CC, CS, CT, G ) +* +* Tests CHERK and CSYRK. +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Parameters .. + COMPLEX ZERO + PARAMETER ( ZERO = ( 0.0, 0.0 ) ) + REAL RONE, RZERO + PARAMETER ( RONE = 1.0, RZERO = 0.0 ) +* .. Scalar Arguments .. + REAL EPS, THRESH + INTEGER NALF, NBET, NIDIM, NMAX, NOUT, NTRA + LOGICAL FATAL, REWI, TRACE + CHARACTER*8 SNAME +* .. Array Arguments .. 
+ COMPLEX A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), + $ AS( NMAX*NMAX ), B( NMAX, NMAX ), + $ BB( NMAX*NMAX ), BET( NBET ), BS( NMAX*NMAX ), + $ C( NMAX, NMAX ), CC( NMAX*NMAX ), + $ CS( NMAX*NMAX ), CT( NMAX ) + REAL G( NMAX ) + INTEGER IDIM( NIDIM ) +* .. Local Scalars .. + COMPLEX ALPHA, ALS, BETA, BETS + REAL ERR, ERRMAX, RALPHA, RALS, RBETA, RBETS + INTEGER I, IA, IB, ICT, ICU, IK, IN, J, JC, JJ, K, KS, + $ LAA, LCC, LDA, LDAS, LDC, LDCS, LJ, MA, N, NA, + $ NARGS, NC, NS + LOGICAL CONJ, NULL, RESET, SAME, TRAN, UPPER + CHARACTER*1 TRANS, TRANSS, TRANST, UPLO, UPLOS + CHARACTER*2 ICHT, ICHU +* .. Local Arrays .. + LOGICAL ISAME( 13 ) +* .. External Functions .. + LOGICAL LCE, LCERES + EXTERNAL LCE, LCERES +* .. External Subroutines .. + EXTERNAL CHERK, CMAKE, CMMCH, CSYRK +* .. Intrinsic Functions .. + INTRINSIC CMPLX, MAX, REAL +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL LERR, OK +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK, LERR +* .. Data statements .. + DATA ICHT/'NC'/, ICHU/'UL'/ +* .. Executable Statements .. + CONJ = SNAME( 2: 3 ).EQ.'HE' +* + NARGS = 10 + NC = 0 + RESET = .TRUE. + ERRMAX = RZERO + RALS = RONE + RBETS = RONE +* + DO 100 IN = 1, NIDIM + N = IDIM( IN ) +* Set LDC to 1 more than minimum value if room. + LDC = N + IF( LDC.LT.NMAX ) + $ LDC = LDC + 1 +* Skip tests if not enough room. + IF( LDC.GT.NMAX ) + $ GO TO 100 + LCC = LDC*N +* + DO 90 IK = 1, NIDIM + K = IDIM( IK ) +* + DO 80 ICT = 1, 2 + TRANS = ICHT( ICT: ICT ) + TRAN = TRANS.EQ.'C' + IF( TRAN.AND..NOT.CONJ ) + $ TRANS = 'T' + IF( TRAN )THEN + MA = K + NA = N + ELSE + MA = N + NA = K + END IF +* Set LDA to 1 more than minimum value if room. + LDA = MA + IF( LDA.LT.NMAX ) + $ LDA = LDA + 1 +* Skip tests if not enough room. + IF( LDA.GT.NMAX ) + $ GO TO 80 + LAA = LDA*NA +* +* Generate the matrix A. 
+* + CALL CMAKE( 'GE', ' ', ' ', MA, NA, A, NMAX, AA, LDA, + $ RESET, ZERO ) +* + DO 70 ICU = 1, 2 + UPLO = ICHU( ICU: ICU ) + UPPER = UPLO.EQ.'U' +* + DO 60 IA = 1, NALF + ALPHA = ALF( IA ) + IF( CONJ )THEN + RALPHA = REAL( ALPHA ) + ALPHA = CMPLX( RALPHA, RZERO ) + END IF +* + DO 50 IB = 1, NBET + BETA = BET( IB ) + IF( CONJ )THEN + RBETA = REAL( BETA ) + BETA = CMPLX( RBETA, RZERO ) + END IF + NULL = N.LE.0 + IF( CONJ ) + $ NULL = NULL.OR.( ( K.LE.0.OR.RALPHA.EQ. + $ RZERO ).AND.RBETA.EQ.RONE ) +* +* Generate the matrix C. +* + CALL CMAKE( SNAME( 2: 3 ), UPLO, ' ', N, N, C, + $ NMAX, CC, LDC, RESET, ZERO ) +* + NC = NC + 1 +* +* Save every datum before calling the subroutine. +* + UPLOS = UPLO + TRANSS = TRANS + NS = N + KS = K + IF( CONJ )THEN + RALS = RALPHA + ELSE + ALS = ALPHA + END IF + DO 10 I = 1, LAA + AS( I ) = AA( I ) + 10 CONTINUE + LDAS = LDA + IF( CONJ )THEN + RBETS = RBETA + ELSE + BETS = BETA + END IF + DO 20 I = 1, LCC + CS( I ) = CC( I ) + 20 CONTINUE + LDCS = LDC +* +* Call the subroutine. +* + IF( CONJ )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9994 )NC, SNAME, UPLO, + $ TRANS, N, K, RALPHA, LDA, RBETA, LDC + IF( REWI ) + $ REWIND NTRA + CALL CHERK( UPLO, TRANS, N, K, RALPHA, AA, + $ LDA, RBETA, CC, LDC ) + ELSE + IF( TRACE ) + $ WRITE( NTRA, FMT = 9993 )NC, SNAME, UPLO, + $ TRANS, N, K, ALPHA, LDA, BETA, LDC + IF( REWI ) + $ REWIND NTRA + CALL CSYRK( UPLO, TRANS, N, K, ALPHA, AA, + $ LDA, BETA, CC, LDC ) + END IF +* +* Check if error-exit was taken incorrectly. +* + IF( .NOT.OK )THEN + WRITE( NOUT, FMT = 9992 ) + FATAL = .TRUE. + GO TO 120 + END IF +* +* See what data changed inside subroutines. 
+* + ISAME( 1 ) = UPLOS.EQ.UPLO + ISAME( 2 ) = TRANSS.EQ.TRANS + ISAME( 3 ) = NS.EQ.N + ISAME( 4 ) = KS.EQ.K + IF( CONJ )THEN + ISAME( 5 ) = RALS.EQ.RALPHA + ELSE + ISAME( 5 ) = ALS.EQ.ALPHA + END IF + ISAME( 6 ) = LCE( AS, AA, LAA ) + ISAME( 7 ) = LDAS.EQ.LDA + IF( CONJ )THEN + ISAME( 8 ) = RBETS.EQ.RBETA + ELSE + ISAME( 8 ) = BETS.EQ.BETA + END IF + IF( NULL )THEN + ISAME( 9 ) = LCE( CS, CC, LCC ) + ELSE + ISAME( 9 ) = LCERES( SNAME( 2: 3 ), UPLO, N, + $ N, CS, CC, LDC ) + END IF + ISAME( 10 ) = LDCS.EQ.LDC +* +* If data was incorrectly changed, report and +* return. +* + SAME = .TRUE. + DO 30 I = 1, NARGS + SAME = SAME.AND.ISAME( I ) + IF( .NOT.ISAME( I ) ) + $ WRITE( NOUT, FMT = 9998 )I + 30 CONTINUE + IF( .NOT.SAME )THEN + FATAL = .TRUE. + GO TO 120 + END IF +* + IF( .NOT.NULL )THEN +* +* Check the result column by column. +* + IF( CONJ )THEN + TRANST = 'C' + ELSE + TRANST = 'T' + END IF + JC = 1 + DO 40 J = 1, N + IF( UPPER )THEN + JJ = 1 + LJ = J + ELSE + JJ = J + LJ = N - J + 1 + END IF + IF( TRAN )THEN + CALL CMMCH( TRANST, 'N', LJ, 1, K, + $ ALPHA, A( 1, JJ ), NMAX, + $ A( 1, J ), NMAX, BETA, + $ C( JJ, J ), NMAX, CT, G, + $ CC( JC ), LDC, EPS, ERR, + $ FATAL, NOUT, .TRUE. ) + ELSE + CALL CMMCH( 'N', TRANST, LJ, 1, K, + $ ALPHA, A( JJ, 1 ), NMAX, + $ A( J, 1 ), NMAX, BETA, + $ C( JJ, J ), NMAX, CT, G, + $ CC( JC ), LDC, EPS, ERR, + $ FATAL, NOUT, .TRUE. ) + END IF + IF( UPPER )THEN + JC = JC + LDC + ELSE + JC = JC + LDC + 1 + END IF + ERRMAX = MAX( ERRMAX, ERR ) +* If got really bad answer, report and +* return. + IF( FATAL ) + $ GO TO 110 + 40 CONTINUE + END IF +* + 50 CONTINUE +* + 60 CONTINUE +* + 70 CONTINUE +* + 80 CONTINUE +* + 90 CONTINUE +* + 100 CONTINUE +* +* Report result. 
+* + IF( ERRMAX.LT.THRESH )THEN + WRITE( NOUT, FMT = 9999 )SNAME, NC + ELSE + WRITE( NOUT, FMT = 9997 )SNAME, NC, ERRMAX + END IF + GO TO 130 +* + 110 CONTINUE + IF( N.GT.1 ) + $ WRITE( NOUT, FMT = 9995 )J +* + 120 CONTINUE + WRITE( NOUT, FMT = 9996 )SNAME + IF( CONJ )THEN + WRITE( NOUT, FMT = 9994 )NC, SNAME, UPLO, TRANS, N, K, RALPHA, + $ LDA, RBETA, LDC + ELSE + WRITE( NOUT, FMT = 9993 )NC, SNAME, UPLO, TRANS, N, K, ALPHA, + $ LDA, BETA, LDC + END IF +* + 130 CONTINUE + RETURN +* + 9999 FORMAT( ' ', A8, ' PASSED THE COMPUTATIONAL TESTS (', I6, ' CALL', + $ 'S)' ) + 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', + $ 'ANGED INCORRECTLY *******' ) + 9997 FORMAT( ' ', A8, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C', + $ 'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2, + $ ' - SUSPECT *******' ) + 9996 FORMAT( ' ******* ', A8, ' FAILED ON CALL NUMBER:' ) + 9995 FORMAT( ' THESE ARE THE RESULTS FOR COLUMN ', I3 ) + 9994 FORMAT( 1X, I6, ': ', A8, '(', 2( '''', A1, ''',' ), 2( I3, ',' ), + $ F4.1, ', A,', I3, ',', F4.1, ', C,', I3, ') ', + $ ' .' ) + 9993 FORMAT( 1X, I6, ': ', A8, '(', 2( '''', A1, ''',' ), 2( I3, ',' ), + $ '(', F4.1, ',', F4.1, ') , A,', I3, ',(', F4.1, ',', F4.1, + $ '), C,', I3, ') .' ) + 9992 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', + $ '******' ) +* +* End of CCHK4. +* + END + SUBROUTINE CCHK5( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, + $ FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, NMAX, + $ AB, AA, AS, BB, BS, C, CC, CS, CT, G, W ) +* +* Tests CHER2K and CSYR2K. +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Parameters .. 
+ COMPLEX ZERO, ONE + PARAMETER ( ZERO = ( 0.0, 0.0 ), ONE = ( 1.0, 0.0 ) ) + REAL RONE, RZERO + PARAMETER ( RONE = 1.0, RZERO = 0.0 ) +* .. Scalar Arguments .. + REAL EPS, THRESH + INTEGER NALF, NBET, NIDIM, NMAX, NOUT, NTRA + LOGICAL FATAL, REWI, TRACE + CHARACTER*8 SNAME +* .. Array Arguments .. + COMPLEX AA( NMAX*NMAX ), AB( 2*NMAX*NMAX ), + $ ALF( NALF ), AS( NMAX*NMAX ), BB( NMAX*NMAX ), + $ BET( NBET ), BS( NMAX*NMAX ), C( NMAX, NMAX ), + $ CC( NMAX*NMAX ), CS( NMAX*NMAX ), CT( NMAX ), + $ W( 2*NMAX ) + REAL G( NMAX ) + INTEGER IDIM( NIDIM ) +* .. Local Scalars .. + COMPLEX ALPHA, ALS, BETA, BETS + REAL ERR, ERRMAX, RBETA, RBETS + INTEGER I, IA, IB, ICT, ICU, IK, IN, J, JC, JJ, JJAB, + $ K, KS, LAA, LBB, LCC, LDA, LDAS, LDB, LDBS, + $ LDC, LDCS, LJ, MA, N, NA, NARGS, NC, NS + LOGICAL CONJ, NULL, RESET, SAME, TRAN, UPPER + CHARACTER*1 TRANS, TRANSS, TRANST, UPLO, UPLOS + CHARACTER*2 ICHT, ICHU +* .. Local Arrays .. + LOGICAL ISAME( 13 ) +* .. External Functions .. + LOGICAL LCE, LCERES + EXTERNAL LCE, LCERES +* .. External Subroutines .. + EXTERNAL CHER2K, CMAKE, CMMCH, CSYR2K +* .. Intrinsic Functions .. + INTRINSIC CMPLX, CONJG, MAX, REAL +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL LERR, OK +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK, LERR +* .. Data statements .. + DATA ICHT/'NC'/, ICHU/'UL'/ +* .. Executable Statements .. + CONJ = SNAME( 2: 3 ).EQ.'HE' +* + NARGS = 12 + NC = 0 + RESET = .TRUE. + ERRMAX = RZERO +* + DO 130 IN = 1, NIDIM + N = IDIM( IN ) +* Set LDC to 1 more than minimum value if room. + LDC = N + IF( LDC.LT.NMAX ) + $ LDC = LDC + 1 +* Skip tests if not enough room. + IF( LDC.GT.NMAX ) + $ GO TO 130 + LCC = LDC*N +* + DO 120 IK = 1, NIDIM + K = IDIM( IK ) +* + DO 110 ICT = 1, 2 + TRANS = ICHT( ICT: ICT ) + TRAN = TRANS.EQ.'C' + IF( TRAN.AND..NOT.CONJ ) + $ TRANS = 'T' + IF( TRAN )THEN + MA = K + NA = N + ELSE + MA = N + NA = K + END IF +* Set LDA to 1 more than minimum value if room. 
+ LDA = MA + IF( LDA.LT.NMAX ) + $ LDA = LDA + 1 +* Skip tests if not enough room. + IF( LDA.GT.NMAX ) + $ GO TO 110 + LAA = LDA*NA +* +* Generate the matrix A. +* + IF( TRAN )THEN + CALL CMAKE( 'GE', ' ', ' ', MA, NA, AB, 2*NMAX, AA, + $ LDA, RESET, ZERO ) + ELSE + CALL CMAKE( 'GE', ' ', ' ', MA, NA, AB, NMAX, AA, LDA, + $ RESET, ZERO ) + END IF +* +* Generate the matrix B. +* + LDB = LDA + LBB = LAA + IF( TRAN )THEN + CALL CMAKE( 'GE', ' ', ' ', MA, NA, AB( K + 1 ), + $ 2*NMAX, BB, LDB, RESET, ZERO ) + ELSE + CALL CMAKE( 'GE', ' ', ' ', MA, NA, AB( K*NMAX + 1 ), + $ NMAX, BB, LDB, RESET, ZERO ) + END IF +* + DO 100 ICU = 1, 2 + UPLO = ICHU( ICU: ICU ) + UPPER = UPLO.EQ.'U' +* + DO 90 IA = 1, NALF + ALPHA = ALF( IA ) +* + DO 80 IB = 1, NBET + BETA = BET( IB ) + IF( CONJ )THEN + RBETA = REAL( BETA ) + BETA = CMPLX( RBETA, RZERO ) + END IF + NULL = N.LE.0 + IF( CONJ ) + $ NULL = NULL.OR.( ( K.LE.0.OR.ALPHA.EQ. + $ ZERO ).AND.RBETA.EQ.RONE ) +* +* Generate the matrix C. +* + CALL CMAKE( SNAME( 2: 3 ), UPLO, ' ', N, N, C, + $ NMAX, CC, LDC, RESET, ZERO ) +* + NC = NC + 1 +* +* Save every datum before calling the subroutine. +* + UPLOS = UPLO + TRANSS = TRANS + NS = N + KS = K + ALS = ALPHA + DO 10 I = 1, LAA + AS( I ) = AA( I ) + 10 CONTINUE + LDAS = LDA + DO 20 I = 1, LBB + BS( I ) = BB( I ) + 20 CONTINUE + LDBS = LDB + IF( CONJ )THEN + RBETS = RBETA + ELSE + BETS = BETA + END IF + DO 30 I = 1, LCC + CS( I ) = CC( I ) + 30 CONTINUE + LDCS = LDC +* +* Call the subroutine. 
+* + IF( CONJ )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9994 )NC, SNAME, UPLO, + $ TRANS, N, K, ALPHA, LDA, LDB, RBETA, LDC + IF( REWI ) + $ REWIND NTRA + CALL CHER2K( UPLO, TRANS, N, K, ALPHA, AA, + $ LDA, BB, LDB, RBETA, CC, LDC ) + ELSE + IF( TRACE ) + $ WRITE( NTRA, FMT = 9993 )NC, SNAME, UPLO, + $ TRANS, N, K, ALPHA, LDA, LDB, BETA, LDC + IF( REWI ) + $ REWIND NTRA + CALL CSYR2K( UPLO, TRANS, N, K, ALPHA, AA, + $ LDA, BB, LDB, BETA, CC, LDC ) + END IF +* +* Check if error-exit was taken incorrectly. +* + IF( .NOT.OK )THEN + WRITE( NOUT, FMT = 9992 ) + FATAL = .TRUE. + GO TO 150 + END IF +* +* See what data changed inside subroutines. +* + ISAME( 1 ) = UPLOS.EQ.UPLO + ISAME( 2 ) = TRANSS.EQ.TRANS + ISAME( 3 ) = NS.EQ.N + ISAME( 4 ) = KS.EQ.K + ISAME( 5 ) = ALS.EQ.ALPHA + ISAME( 6 ) = LCE( AS, AA, LAA ) + ISAME( 7 ) = LDAS.EQ.LDA + ISAME( 8 ) = LCE( BS, BB, LBB ) + ISAME( 9 ) = LDBS.EQ.LDB + IF( CONJ )THEN + ISAME( 10 ) = RBETS.EQ.RBETA + ELSE + ISAME( 10 ) = BETS.EQ.BETA + END IF + IF( NULL )THEN + ISAME( 11 ) = LCE( CS, CC, LCC ) + ELSE + ISAME( 11 ) = LCERES( 'HE', UPLO, N, N, CS, + $ CC, LDC ) + END IF + ISAME( 12 ) = LDCS.EQ.LDC +* +* If data was incorrectly changed, report and +* return. +* + SAME = .TRUE. + DO 40 I = 1, NARGS + SAME = SAME.AND.ISAME( I ) + IF( .NOT.ISAME( I ) ) + $ WRITE( NOUT, FMT = 9998 )I + 40 CONTINUE + IF( .NOT.SAME )THEN + FATAL = .TRUE. + GO TO 150 + END IF +* + IF( .NOT.NULL )THEN +* +* Check the result column by column. 
+* + IF( CONJ )THEN + TRANST = 'C' + ELSE + TRANST = 'T' + END IF + JJAB = 1 + JC = 1 + DO 70 J = 1, N + IF( UPPER )THEN + JJ = 1 + LJ = J + ELSE + JJ = J + LJ = N - J + 1 + END IF + IF( TRAN )THEN + DO 50 I = 1, K + W( I ) = ALPHA*AB( ( J - 1 )*2* + $ NMAX + K + I ) + IF( CONJ )THEN + W( K + I ) = CONJG( ALPHA )* + $ AB( ( J - 1 )*2* + $ NMAX + I ) + ELSE + W( K + I ) = ALPHA* + $ AB( ( J - 1 )*2* + $ NMAX + I ) + END IF + 50 CONTINUE + CALL CMMCH( TRANST, 'N', LJ, 1, 2*K, + $ ONE, AB( JJAB ), 2*NMAX, W, + $ 2*NMAX, BETA, C( JJ, J ), + $ NMAX, CT, G, CC( JC ), LDC, + $ EPS, ERR, FATAL, NOUT, + $ .TRUE. ) + ELSE + DO 60 I = 1, K + IF( CONJ )THEN + W( I ) = ALPHA*CONJG( AB( ( K + + $ I - 1 )*NMAX + J ) ) + W( K + I ) = CONJG( ALPHA* + $ AB( ( I - 1 )*NMAX + + $ J ) ) + ELSE + W( I ) = ALPHA*AB( ( K + I - 1 )* + $ NMAX + J ) + W( K + I ) = ALPHA* + $ AB( ( I - 1 )*NMAX + + $ J ) + END IF + 60 CONTINUE + CALL CMMCH( 'N', 'N', LJ, 1, 2*K, ONE, + $ AB( JJ ), NMAX, W, 2*NMAX, + $ BETA, C( JJ, J ), NMAX, CT, + $ G, CC( JC ), LDC, EPS, ERR, + $ FATAL, NOUT, .TRUE. ) + END IF + IF( UPPER )THEN + JC = JC + LDC + ELSE + JC = JC + LDC + 1 + IF( TRAN ) + $ JJAB = JJAB + 2*NMAX + END IF + ERRMAX = MAX( ERRMAX, ERR ) +* If got really bad answer, report and +* return. + IF( FATAL ) + $ GO TO 140 + 70 CONTINUE + END IF +* + 80 CONTINUE +* + 90 CONTINUE +* + 100 CONTINUE +* + 110 CONTINUE +* + 120 CONTINUE +* + 130 CONTINUE +* +* Report result. 
+* + IF( ERRMAX.LT.THRESH )THEN + WRITE( NOUT, FMT = 9999 )SNAME, NC + ELSE + WRITE( NOUT, FMT = 9997 )SNAME, NC, ERRMAX + END IF + GO TO 160 +* + 140 CONTINUE + IF( N.GT.1 ) + $ WRITE( NOUT, FMT = 9995 )J +* + 150 CONTINUE + WRITE( NOUT, FMT = 9996 )SNAME + IF( CONJ )THEN + WRITE( NOUT, FMT = 9994 )NC, SNAME, UPLO, TRANS, N, K, ALPHA, + $ LDA, LDB, RBETA, LDC + ELSE + WRITE( NOUT, FMT = 9993 )NC, SNAME, UPLO, TRANS, N, K, ALPHA, + $ LDA, LDB, BETA, LDC + END IF +* + 160 CONTINUE + RETURN +* + 9999 FORMAT( ' ', A8, ' PASSED THE COMPUTATIONAL TESTS (', I6, ' CALL', + $ 'S)' ) + 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', + $ 'ANGED INCORRECTLY *******' ) + 9997 FORMAT( ' ', A8, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C', + $ 'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2, + $ ' - SUSPECT *******' ) + 9996 FORMAT( ' ******* ', A8, ' FAILED ON CALL NUMBER:' ) + 9995 FORMAT( ' THESE ARE THE RESULTS FOR COLUMN ', I3 ) + 9994 FORMAT( 1X, I6, ': ', A8, '(', 2( '''', A1, ''',' ), 2( I3, ',' ), + $ '(', F4.1, ',', F4.1, '), A,', I3, ', B,', I3, ',', F4.1, + $ ', C,', I3, ') .' ) + 9993 FORMAT( 1X, I6, ': ', A8, '(', 2( '''', A1, ''',' ), 2( I3, ',' ), + $ '(', F4.1, ',', F4.1, '), A,', I3, ', B,', I3, ',(', F4.1, + $ ',', F4.1, '), C,', I3, ') .' ) + 9992 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', + $ '******' ) +* +* End of CCHK5. +* + END + SUBROUTINE CCHKE( ISNUM, SRNAMT, NOUT ) +* +* Tests the error exits from the Level 3 Blas. +* Requires a special version of the error-handling routine XERBLA. +* ALPHA, RALPHA, BETA, RBETA, A, B and C should not need to be defined. +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Scalar Arguments .. 
+ INTEGER ISNUM, NOUT + CHARACTER*8 SRNAMT +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL LERR, OK +* .. Local Scalars .. + COMPLEX ALPHA, BETA + REAL RALPHA, RBETA +* .. Local Arrays .. + COMPLEX A( 2, 1 ), B( 2, 1 ), C( 2, 1 ) +* .. External Subroutines .. + EXTERNAL CGEMM3M, CHEMM3M, CHER2K, CHERK, CHKXER, CSYMM3M, + $ CSYR2K, CSYRK, CTRMM, CTRSM +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK, LERR +* .. Executable Statements .. +* OK is set to .FALSE. by the special version of XERBLA or by CHKXER +* if anything is wrong. + OK = .TRUE. +* LERR is set to .TRUE. by the special version of XERBLA each time +* it is called, and is then tested and re-set by CHKXER. + LERR = .FALSE. + GO TO ( 10, 20, 30, 40, 50, 60, 70, 80, + $ 90 )ISNUM + 10 INFOT = 1 + CALL CGEMM3M( '/', 'N', 0, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 1 + CALL CGEMM3M( '/', 'C', 0, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 1 + CALL CGEMM3M( '/', 'T', 0, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL CGEMM3M( 'N', '/', 0, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL CGEMM3M( 'C', '/', 0, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL CGEMM3M( 'T', '/', 0, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL CGEMM3M( 'N', 'N', -1, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL CGEMM3M( 'N', 'C', -1, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL CGEMM3M( 'N', 'T', -1, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL CGEMM3M( 'C', 'N', -1, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL 
CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL CGEMM3M( 'C', 'C', -1, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL CGEMM3M( 'C', 'T', -1, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL CGEMM3M( 'T', 'N', -1, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL CGEMM3M( 'T', 'C', -1, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL CGEMM3M( 'T', 'T', -1, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL CGEMM3M( 'N', 'N', 0, -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL CGEMM3M( 'N', 'C', 0, -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL CGEMM3M( 'N', 'T', 0, -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL CGEMM3M( 'C', 'N', 0, -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL CGEMM3M( 'C', 'C', 0, -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL CGEMM3M( 'C', 'T', 0, -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL CGEMM3M( 'T', 'N', 0, -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL CGEMM3M( 'T', 'C', 0, -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL CGEMM3M( 'T', 'T', 0, -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL CGEMM3M( 'N', 'N', 0, 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL CGEMM3M( 'N', 'C', 0, 0, -1, ALPHA, 
A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL CGEMM3M( 'N', 'T', 0, 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL CGEMM3M( 'C', 'N', 0, 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL CGEMM3M( 'C', 'C', 0, 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL CGEMM3M( 'C', 'T', 0, 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL CGEMM3M( 'T', 'N', 0, 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL CGEMM3M( 'T', 'C', 0, 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL CGEMM3M( 'T', 'T', 0, 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 8 + CALL CGEMM3M( 'N', 'N', 2, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 2 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 8 + CALL CGEMM3M( 'N', 'C', 2, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 2 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 8 + CALL CGEMM3M( 'N', 'T', 2, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 2 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 8 + CALL CGEMM3M( 'C', 'N', 0, 0, 2, ALPHA, A, 1, B, 2, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 8 + CALL CGEMM3M( 'C', 'C', 0, 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 8 + CALL CGEMM3M( 'C', 'T', 0, 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 8 + CALL CGEMM3M( 'T', 'N', 0, 0, 2, ALPHA, A, 1, B, 2, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 8 + CALL CGEMM3M( 'T', 'C', 0, 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 8 + CALL CGEMM3M( 
'T', 'T', 0, 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 10 + CALL CGEMM3M( 'N', 'N', 0, 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 10 + CALL CGEMM3M( 'C', 'N', 0, 0, 2, ALPHA, A, 2, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 10 + CALL CGEMM3M( 'T', 'N', 0, 0, 2, ALPHA, A, 2, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 10 + CALL CGEMM3M( 'N', 'C', 0, 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 10 + CALL CGEMM3M( 'C', 'C', 0, 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 10 + CALL CGEMM3M( 'T', 'C', 0, 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 10 + CALL CGEMM3M( 'N', 'T', 0, 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 10 + CALL CGEMM3M( 'C', 'T', 0, 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 10 + CALL CGEMM3M( 'T', 'T', 0, 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 13 + CALL CGEMM3M( 'N', 'N', 2, 0, 0, ALPHA, A, 2, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 13 + CALL CGEMM3M( 'N', 'C', 2, 0, 0, ALPHA, A, 2, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 13 + CALL CGEMM3M( 'N', 'T', 2, 0, 0, ALPHA, A, 2, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 13 + CALL CGEMM3M( 'C', 'N', 2, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 13 + CALL CGEMM3M( 'C', 'C', 2, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 13 + CALL CGEMM3M( 'C', 'T', 2, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, 
OK ) + INFOT = 13 + CALL CGEMM3M( 'T', 'N', 2, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 13 + CALL CGEMM3M( 'T', 'C', 2, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 13 + CALL CGEMM3M( 'T', 'T', 2, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + GO TO 100 + 20 INFOT = 1 + CALL CHEMM3M( '/', 'U', 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL CHEMM3M( 'L', '/', 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL CHEMM3M( 'L', 'U', -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL CHEMM3M( 'R', 'U', -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL CHEMM3M( 'L', 'L', -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL CHEMM3M( 'R', 'L', -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL CHEMM3M( 'L', 'U', 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL CHEMM3M( 'R', 'U', 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL CHEMM3M( 'L', 'L', 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL CHEMM3M( 'R', 'L', 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL CHEMM3M( 'L', 'U', 2, 0, ALPHA, A, 1, B, 2, BETA, C, 2 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL CHEMM3M( 'R', 'U', 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL CHEMM3M( 'L', 'L', 2, 0, ALPHA, A, 1, B, 2, BETA, C, 2 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, 
LERR, OK ) + INFOT = 7 + CALL CHEMM3M( 'R', 'L', 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL CHEMM3M( 'L', 'U', 2, 0, ALPHA, A, 2, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL CHEMM3M( 'R', 'U', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL CHEMM3M( 'L', 'L', 2, 0, ALPHA, A, 2, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL CHEMM3M( 'R', 'L', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 12 + CALL CHEMM3M( 'L', 'U', 2, 0, ALPHA, A, 2, B, 2, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 12 + CALL CHEMM3M( 'R', 'U', 2, 0, ALPHA, A, 1, B, 2, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 12 + CALL CHEMM3M( 'L', 'L', 2, 0, ALPHA, A, 2, B, 2, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 12 + CALL CHEMM3M( 'R', 'L', 2, 0, ALPHA, A, 1, B, 2, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + GO TO 100 + 30 INFOT = 1 + CALL CSYMM3M( '/', 'U', 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL CSYMM3M( 'L', '/', 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL CSYMM3M( 'L', 'U', -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL CSYMM3M( 'R', 'U', -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL CSYMM3M( 'L', 'L', -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL CSYMM3M( 'R', 'L', -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL CSYMM3M( 'L', 'U', 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, 
OK ) + INFOT = 4 + CALL CSYMM3M( 'R', 'U', 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL CSYMM3M( 'L', 'L', 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL CSYMM3M( 'R', 'L', 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL CSYMM3M( 'L', 'U', 2, 0, ALPHA, A, 1, B, 2, BETA, C, 2 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL CSYMM3M( 'R', 'U', 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL CSYMM3M( 'L', 'L', 2, 0, ALPHA, A, 1, B, 2, BETA, C, 2 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL CSYMM3M( 'R', 'L', 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL CSYMM3M( 'L', 'U', 2, 0, ALPHA, A, 2, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL CSYMM3M( 'R', 'U', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL CSYMM3M( 'L', 'L', 2, 0, ALPHA, A, 2, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL CSYMM3M( 'R', 'L', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 12 + CALL CSYMM3M( 'L', 'U', 2, 0, ALPHA, A, 2, B, 2, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 12 + CALL CSYMM3M( 'R', 'U', 2, 0, ALPHA, A, 1, B, 2, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 12 + CALL CSYMM3M( 'L', 'L', 2, 0, ALPHA, A, 2, B, 2, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 12 + CALL CSYMM3M( 'R', 'L', 2, 0, ALPHA, A, 1, B, 2, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + GO TO 100 + 40 INFOT = 1 + CALL CTRMM( '/', 'U', 'N', 'N', 0, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT 
= 2 + CALL CTRMM( 'L', '/', 'N', 'N', 0, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL CTRMM( 'L', 'U', '/', 'N', 0, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL CTRMM( 'L', 'U', 'N', '/', 0, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL CTRMM( 'L', 'U', 'N', 'N', -1, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL CTRMM( 'L', 'U', 'C', 'N', -1, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL CTRMM( 'L', 'U', 'T', 'N', -1, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL CTRMM( 'R', 'U', 'N', 'N', -1, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL CTRMM( 'R', 'U', 'C', 'N', -1, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL CTRMM( 'R', 'U', 'T', 'N', -1, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL CTRMM( 'L', 'L', 'N', 'N', -1, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL CTRMM( 'L', 'L', 'C', 'N', -1, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL CTRMM( 'L', 'L', 'T', 'N', -1, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL CTRMM( 'R', 'L', 'N', 'N', -1, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL CTRMM( 'R', 'L', 'C', 'N', -1, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL CTRMM( 'R', 'L', 'T', 'N', -1, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL CTRMM( 'L', 'U', 'N', 'N', 0, -1, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL CTRMM( 'L', 'U', 'C', 'N', 0, -1, ALPHA, A, 1, B, 1 ) + CALL CHKXER( 
SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL CTRMM( 'L', 'U', 'T', 'N', 0, -1, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL CTRMM( 'R', 'U', 'N', 'N', 0, -1, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL CTRMM( 'R', 'U', 'C', 'N', 0, -1, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL CTRMM( 'R', 'U', 'T', 'N', 0, -1, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL CTRMM( 'L', 'L', 'N', 'N', 0, -1, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL CTRMM( 'L', 'L', 'C', 'N', 0, -1, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL CTRMM( 'L', 'L', 'T', 'N', 0, -1, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL CTRMM( 'R', 'L', 'N', 'N', 0, -1, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL CTRMM( 'R', 'L', 'C', 'N', 0, -1, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL CTRMM( 'R', 'L', 'T', 'N', 0, -1, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL CTRMM( 'L', 'U', 'N', 'N', 2, 0, ALPHA, A, 1, B, 2 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL CTRMM( 'L', 'U', 'C', 'N', 2, 0, ALPHA, A, 1, B, 2 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL CTRMM( 'L', 'U', 'T', 'N', 2, 0, ALPHA, A, 1, B, 2 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL CTRMM( 'R', 'U', 'N', 'N', 0, 2, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL CTRMM( 'R', 'U', 'C', 'N', 0, 2, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL CTRMM( 'R', 'U', 'T', 'N', 0, 2, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL CTRMM( 'L', 'L', 'N', 'N', 2, 0, 
ALPHA, A, 1, B, 2 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL CTRMM( 'L', 'L', 'C', 'N', 2, 0, ALPHA, A, 1, B, 2 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL CTRMM( 'L', 'L', 'T', 'N', 2, 0, ALPHA, A, 1, B, 2 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL CTRMM( 'R', 'L', 'N', 'N', 0, 2, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL CTRMM( 'R', 'L', 'C', 'N', 0, 2, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL CTRMM( 'R', 'L', 'T', 'N', 0, 2, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 11 + CALL CTRMM( 'L', 'U', 'N', 'N', 2, 0, ALPHA, A, 2, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 11 + CALL CTRMM( 'L', 'U', 'C', 'N', 2, 0, ALPHA, A, 2, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 11 + CALL CTRMM( 'L', 'U', 'T', 'N', 2, 0, ALPHA, A, 2, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 11 + CALL CTRMM( 'R', 'U', 'N', 'N', 2, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 11 + CALL CTRMM( 'R', 'U', 'C', 'N', 2, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 11 + CALL CTRMM( 'R', 'U', 'T', 'N', 2, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 11 + CALL CTRMM( 'L', 'L', 'N', 'N', 2, 0, ALPHA, A, 2, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 11 + CALL CTRMM( 'L', 'L', 'C', 'N', 2, 0, ALPHA, A, 2, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 11 + CALL CTRMM( 'L', 'L', 'T', 'N', 2, 0, ALPHA, A, 2, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 11 + CALL CTRMM( 'R', 'L', 'N', 'N', 2, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 11 + CALL CTRMM( 'R', 'L', 'C', 'N', 2, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 11 + 
CALL CTRMM( 'R', 'L', 'T', 'N', 2, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + GO TO 100 + 50 INFOT = 1 + CALL CTRSM( '/', 'U', 'N', 'N', 0, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL CTRSM( 'L', '/', 'N', 'N', 0, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL CTRSM( 'L', 'U', '/', 'N', 0, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL CTRSM( 'L', 'U', 'N', '/', 0, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL CTRSM( 'L', 'U', 'N', 'N', -1, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL CTRSM( 'L', 'U', 'C', 'N', -1, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL CTRSM( 'L', 'U', 'T', 'N', -1, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL CTRSM( 'R', 'U', 'N', 'N', -1, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL CTRSM( 'R', 'U', 'C', 'N', -1, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL CTRSM( 'R', 'U', 'T', 'N', -1, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL CTRSM( 'L', 'L', 'N', 'N', -1, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL CTRSM( 'L', 'L', 'C', 'N', -1, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL CTRSM( 'L', 'L', 'T', 'N', -1, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL CTRSM( 'R', 'L', 'N', 'N', -1, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL CTRSM( 'R', 'L', 'C', 'N', -1, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL CTRSM( 'R', 'L', 'T', 'N', -1, 0, ALPHA, A, 1, B, 1 ) + CALL 
CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL CTRSM( 'L', 'U', 'N', 'N', 0, -1, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL CTRSM( 'L', 'U', 'C', 'N', 0, -1, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL CTRSM( 'L', 'U', 'T', 'N', 0, -1, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL CTRSM( 'R', 'U', 'N', 'N', 0, -1, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL CTRSM( 'R', 'U', 'C', 'N', 0, -1, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL CTRSM( 'R', 'U', 'T', 'N', 0, -1, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL CTRSM( 'L', 'L', 'N', 'N', 0, -1, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL CTRSM( 'L', 'L', 'C', 'N', 0, -1, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL CTRSM( 'L', 'L', 'T', 'N', 0, -1, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL CTRSM( 'R', 'L', 'N', 'N', 0, -1, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL CTRSM( 'R', 'L', 'C', 'N', 0, -1, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL CTRSM( 'R', 'L', 'T', 'N', 0, -1, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL CTRSM( 'L', 'U', 'N', 'N', 2, 0, ALPHA, A, 1, B, 2 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL CTRSM( 'L', 'U', 'C', 'N', 2, 0, ALPHA, A, 1, B, 2 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL CTRSM( 'L', 'U', 'T', 'N', 2, 0, ALPHA, A, 1, B, 2 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL CTRSM( 'R', 'U', 'N', 'N', 0, 2, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL CTRSM( 'R', 'U', 'C', 
'N', 0, 2, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL CTRSM( 'R', 'U', 'T', 'N', 0, 2, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL CTRSM( 'L', 'L', 'N', 'N', 2, 0, ALPHA, A, 1, B, 2 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL CTRSM( 'L', 'L', 'C', 'N', 2, 0, ALPHA, A, 1, B, 2 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL CTRSM( 'L', 'L', 'T', 'N', 2, 0, ALPHA, A, 1, B, 2 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL CTRSM( 'R', 'L', 'N', 'N', 0, 2, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL CTRSM( 'R', 'L', 'C', 'N', 0, 2, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL CTRSM( 'R', 'L', 'T', 'N', 0, 2, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 11 + CALL CTRSM( 'L', 'U', 'N', 'N', 2, 0, ALPHA, A, 2, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 11 + CALL CTRSM( 'L', 'U', 'C', 'N', 2, 0, ALPHA, A, 2, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 11 + CALL CTRSM( 'L', 'U', 'T', 'N', 2, 0, ALPHA, A, 2, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 11 + CALL CTRSM( 'R', 'U', 'N', 'N', 2, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 11 + CALL CTRSM( 'R', 'U', 'C', 'N', 2, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 11 + CALL CTRSM( 'R', 'U', 'T', 'N', 2, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 11 + CALL CTRSM( 'L', 'L', 'N', 'N', 2, 0, ALPHA, A, 2, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 11 + CALL CTRSM( 'L', 'L', 'C', 'N', 2, 0, ALPHA, A, 2, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 11 + CALL CTRSM( 'L', 'L', 'T', 'N', 2, 0, ALPHA, A, 2, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT 
= 11 + CALL CTRSM( 'R', 'L', 'N', 'N', 2, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 11 + CALL CTRSM( 'R', 'L', 'C', 'N', 2, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 11 + CALL CTRSM( 'R', 'L', 'T', 'N', 2, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + GO TO 100 + 60 INFOT = 1 + CALL CHERK( '/', 'N', 0, 0, RALPHA, A, 1, RBETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL CHERK( 'U', 'T', 0, 0, RALPHA, A, 1, RBETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL CHERK( 'U', 'N', -1, 0, RALPHA, A, 1, RBETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL CHERK( 'U', 'C', -1, 0, RALPHA, A, 1, RBETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL CHERK( 'L', 'N', -1, 0, RALPHA, A, 1, RBETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL CHERK( 'L', 'C', -1, 0, RALPHA, A, 1, RBETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL CHERK( 'U', 'N', 0, -1, RALPHA, A, 1, RBETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL CHERK( 'U', 'C', 0, -1, RALPHA, A, 1, RBETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL CHERK( 'L', 'N', 0, -1, RALPHA, A, 1, RBETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL CHERK( 'L', 'C', 0, -1, RALPHA, A, 1, RBETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL CHERK( 'U', 'N', 2, 0, RALPHA, A, 1, RBETA, C, 2 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL CHERK( 'U', 'C', 0, 2, RALPHA, A, 1, RBETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL CHERK( 'L', 'N', 2, 0, RALPHA, A, 1, RBETA, C, 2 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL CHERK( 'L', 'C', 0, 2, RALPHA, A, 1, RBETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, 
NOUT, LERR, OK ) + INFOT = 10 + CALL CHERK( 'U', 'N', 2, 0, RALPHA, A, 2, RBETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 10 + CALL CHERK( 'U', 'C', 2, 0, RALPHA, A, 1, RBETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 10 + CALL CHERK( 'L', 'N', 2, 0, RALPHA, A, 2, RBETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 10 + CALL CHERK( 'L', 'C', 2, 0, RALPHA, A, 1, RBETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + GO TO 100 + 70 INFOT = 1 + CALL CSYRK( '/', 'N', 0, 0, ALPHA, A, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL CSYRK( 'U', 'C', 0, 0, ALPHA, A, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL CSYRK( 'U', 'N', -1, 0, ALPHA, A, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL CSYRK( 'U', 'T', -1, 0, ALPHA, A, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL CSYRK( 'L', 'N', -1, 0, ALPHA, A, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL CSYRK( 'L', 'T', -1, 0, ALPHA, A, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL CSYRK( 'U', 'N', 0, -1, ALPHA, A, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL CSYRK( 'U', 'T', 0, -1, ALPHA, A, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL CSYRK( 'L', 'N', 0, -1, ALPHA, A, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL CSYRK( 'L', 'T', 0, -1, ALPHA, A, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL CSYRK( 'U', 'N', 2, 0, ALPHA, A, 1, BETA, C, 2 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL CSYRK( 'U', 'T', 0, 2, ALPHA, A, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL CSYRK( 'L', 'N', 2, 0, ALPHA, A, 1, BETA, C, 2 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, 
LERR, OK ) + INFOT = 7 + CALL CSYRK( 'L', 'T', 0, 2, ALPHA, A, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 10 + CALL CSYRK( 'U', 'N', 2, 0, ALPHA, A, 2, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 10 + CALL CSYRK( 'U', 'T', 2, 0, ALPHA, A, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 10 + CALL CSYRK( 'L', 'N', 2, 0, ALPHA, A, 2, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 10 + CALL CSYRK( 'L', 'T', 2, 0, ALPHA, A, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + GO TO 100 + 80 INFOT = 1 + CALL CHER2K( '/', 'N', 0, 0, ALPHA, A, 1, B, 1, RBETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL CHER2K( 'U', 'T', 0, 0, ALPHA, A, 1, B, 1, RBETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL CHER2K( 'U', 'N', -1, 0, ALPHA, A, 1, B, 1, RBETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL CHER2K( 'U', 'C', -1, 0, ALPHA, A, 1, B, 1, RBETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL CHER2K( 'L', 'N', -1, 0, ALPHA, A, 1, B, 1, RBETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL CHER2K( 'L', 'C', -1, 0, ALPHA, A, 1, B, 1, RBETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL CHER2K( 'U', 'N', 0, -1, ALPHA, A, 1, B, 1, RBETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL CHER2K( 'U', 'C', 0, -1, ALPHA, A, 1, B, 1, RBETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL CHER2K( 'L', 'N', 0, -1, ALPHA, A, 1, B, 1, RBETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL CHER2K( 'L', 'C', 0, -1, ALPHA, A, 1, B, 1, RBETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL CHER2K( 'U', 'N', 2, 0, ALPHA, A, 1, B, 1, RBETA, C, 2 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL CHER2K( 'U', 
'C', 0, 2, ALPHA, A, 1, B, 1, RBETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL CHER2K( 'L', 'N', 2, 0, ALPHA, A, 1, B, 1, RBETA, C, 2 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL CHER2K( 'L', 'C', 0, 2, ALPHA, A, 1, B, 1, RBETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL CHER2K( 'U', 'N', 2, 0, ALPHA, A, 2, B, 1, RBETA, C, 2 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL CHER2K( 'U', 'C', 0, 2, ALPHA, A, 2, B, 1, RBETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL CHER2K( 'L', 'N', 2, 0, ALPHA, A, 2, B, 1, RBETA, C, 2 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL CHER2K( 'L', 'C', 0, 2, ALPHA, A, 2, B, 1, RBETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 12 + CALL CHER2K( 'U', 'N', 2, 0, ALPHA, A, 2, B, 2, RBETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 12 + CALL CHER2K( 'U', 'C', 2, 0, ALPHA, A, 1, B, 1, RBETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 12 + CALL CHER2K( 'L', 'N', 2, 0, ALPHA, A, 2, B, 2, RBETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 12 + CALL CHER2K( 'L', 'C', 2, 0, ALPHA, A, 1, B, 1, RBETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + GO TO 100 + 90 INFOT = 1 + CALL CSYR2K( '/', 'N', 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL CSYR2K( 'U', 'C', 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL CSYR2K( 'U', 'N', -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL CSYR2K( 'U', 'T', -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL CSYR2K( 'L', 'N', -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL CSYR2K( 'L', 'T', -1, 0, 
ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL CSYR2K( 'U', 'N', 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL CSYR2K( 'U', 'T', 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL CSYR2K( 'L', 'N', 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL CSYR2K( 'L', 'T', 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL CSYR2K( 'U', 'N', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 2 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL CSYR2K( 'U', 'T', 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL CSYR2K( 'L', 'N', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 2 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL CSYR2K( 'L', 'T', 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL CSYR2K( 'U', 'N', 2, 0, ALPHA, A, 2, B, 1, BETA, C, 2 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL CSYR2K( 'U', 'T', 0, 2, ALPHA, A, 2, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL CSYR2K( 'L', 'N', 2, 0, ALPHA, A, 2, B, 1, BETA, C, 2 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL CSYR2K( 'L', 'T', 0, 2, ALPHA, A, 2, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 12 + CALL CSYR2K( 'U', 'N', 2, 0, ALPHA, A, 2, B, 2, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 12 + CALL CSYR2K( 'U', 'T', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 12 + CALL CSYR2K( 'L', 'N', 2, 0, ALPHA, A, 2, B, 2, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 12 + CALL CSYR2K( 'L', 'T', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL 
CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) +* + 100 IF( OK )THEN + WRITE( NOUT, FMT = 9999 )SRNAMT + ELSE + WRITE( NOUT, FMT = 9998 )SRNAMT + END IF + RETURN +* + 9999 FORMAT( ' ', A8, ' PASSED THE TESTS OF ERROR-EXITS' ) + 9998 FORMAT( ' ******* ', A8, ' FAILED THE TESTS OF ERROR-EXITS *****', + $ '**' ) +* +* End of CCHKE. +* + END + SUBROUTINE CMAKE( TYPE, UPLO, DIAG, M, N, A, NMAX, AA, LDA, RESET, + $ TRANSL ) +* +* Generates values for an M by N matrix A. +* Stores the values in the array AA in the data structure required +* by the routine, with unwanted elements set to rogue value. +* +* TYPE is 'GE', 'HE', 'SY' or 'TR'. +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Parameters .. + COMPLEX ZERO, ONE + PARAMETER ( ZERO = ( 0.0, 0.0 ), ONE = ( 1.0, 0.0 ) ) + COMPLEX ROGUE + PARAMETER ( ROGUE = ( -1.0E10, 1.0E10 ) ) + REAL RZERO + PARAMETER ( RZERO = 0.0 ) + REAL RROGUE + PARAMETER ( RROGUE = -1.0E10 ) +* .. Scalar Arguments .. + COMPLEX TRANSL + INTEGER LDA, M, N, NMAX + LOGICAL RESET + CHARACTER*1 DIAG, UPLO + CHARACTER*2 TYPE +* .. Array Arguments .. + COMPLEX A( NMAX, * ), AA( * ) +* .. Local Scalars .. + INTEGER I, IBEG, IEND, J, JJ + LOGICAL GEN, HER, LOWER, SYM, TRI, UNIT, UPPER +* .. External Functions .. + COMPLEX CBEG + EXTERNAL CBEG +* .. Intrinsic Functions .. + INTRINSIC CMPLX, CONJG, REAL +* .. Executable Statements .. + GEN = TYPE.EQ.'GE' + HER = TYPE.EQ.'HE' + SYM = TYPE.EQ.'SY' + TRI = TYPE.EQ.'TR' + UPPER = ( HER.OR.SYM.OR.TRI ).AND.UPLO.EQ.'U' + LOWER = ( HER.OR.SYM.OR.TRI ).AND.UPLO.EQ.'L' + UNIT = TRI.AND.DIAG.EQ.'U' +* +* Generate data in array A. 
+* + DO 20 J = 1, N + DO 10 I = 1, M + IF( GEN.OR.( UPPER.AND.I.LE.J ).OR.( LOWER.AND.I.GE.J ) ) + $ THEN + A( I, J ) = CBEG( RESET ) + TRANSL + IF( I.NE.J )THEN +* Set some elements to zero + IF( N.GT.3.AND.J.EQ.N/2 ) + $ A( I, J ) = ZERO + IF( HER )THEN + A( J, I ) = CONJG( A( I, J ) ) + ELSE IF( SYM )THEN + A( J, I ) = A( I, J ) + ELSE IF( TRI )THEN + A( J, I ) = ZERO + END IF + END IF + END IF + 10 CONTINUE + IF( HER ) + $ A( J, J ) = CMPLX( REAL( A( J, J ) ), RZERO ) + IF( TRI ) + $ A( J, J ) = A( J, J ) + ONE + IF( UNIT ) + $ A( J, J ) = ONE + 20 CONTINUE +* +* Store elements in array AS in data structure required by routine. +* + IF( TYPE.EQ.'GE' )THEN + DO 50 J = 1, N + DO 30 I = 1, M + AA( I + ( J - 1 )*LDA ) = A( I, J ) + 30 CONTINUE + DO 40 I = M + 1, LDA + AA( I + ( J - 1 )*LDA ) = ROGUE + 40 CONTINUE + 50 CONTINUE + ELSE IF( TYPE.EQ.'HE'.OR.TYPE.EQ.'SY'.OR.TYPE.EQ.'TR' )THEN + DO 90 J = 1, N + IF( UPPER )THEN + IBEG = 1 + IF( UNIT )THEN + IEND = J - 1 + ELSE + IEND = J + END IF + ELSE + IF( UNIT )THEN + IBEG = J + 1 + ELSE + IBEG = J + END IF + IEND = N + END IF + DO 60 I = 1, IBEG - 1 + AA( I + ( J - 1 )*LDA ) = ROGUE + 60 CONTINUE + DO 70 I = IBEG, IEND + AA( I + ( J - 1 )*LDA ) = A( I, J ) + 70 CONTINUE + DO 80 I = IEND + 1, LDA + AA( I + ( J - 1 )*LDA ) = ROGUE + 80 CONTINUE + IF( HER )THEN + JJ = J + ( J - 1 )*LDA + AA( JJ ) = CMPLX( REAL( AA( JJ ) ), RROGUE ) + END IF + 90 CONTINUE + END IF + RETURN +* +* End of CMAKE. +* + END + SUBROUTINE CMMCH( TRANSA, TRANSB, M, N, KK, ALPHA, A, LDA, B, LDB, + $ BETA, C, LDC, CT, G, CC, LDCC, EPS, ERR, FATAL, + $ NOUT, MV ) +* +* Checks the results of the computational tests. +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Parameters .. 
+ COMPLEX ZERO + PARAMETER ( ZERO = ( 0.0, 0.0 ) ) + REAL RZERO, RONE + PARAMETER ( RZERO = 0.0, RONE = 1.0 ) +* .. Scalar Arguments .. + COMPLEX ALPHA, BETA + REAL EPS, ERR + INTEGER KK, LDA, LDB, LDC, LDCC, M, N, NOUT + LOGICAL FATAL, MV + CHARACTER*1 TRANSA, TRANSB +* .. Array Arguments .. + COMPLEX A( LDA, * ), B( LDB, * ), C( LDC, * ), + $ CC( LDCC, * ), CT( * ) + REAL G( * ) +* .. Local Scalars .. + COMPLEX CL + REAL ERRI + INTEGER I, J, K + LOGICAL CTRANA, CTRANB, TRANA, TRANB +* .. Intrinsic Functions .. + INTRINSIC ABS, AIMAG, CONJG, MAX, REAL, SQRT +* .. Statement Functions .. + REAL ABS1 +* .. Statement Function definitions .. + ABS1( CL ) = ABS( REAL( CL ) ) + ABS( AIMAG( CL ) ) +* .. Executable Statements .. + TRANA = TRANSA.EQ.'T'.OR.TRANSA.EQ.'C' + TRANB = TRANSB.EQ.'T'.OR.TRANSB.EQ.'C' + CTRANA = TRANSA.EQ.'C' + CTRANB = TRANSB.EQ.'C' +* +* Compute expected result, one column at a time, in CT using data +* in A, B and C. +* Compute gauges in G. +* + DO 220 J = 1, N +* + DO 10 I = 1, M + CT( I ) = ZERO + G( I ) = RZERO + 10 CONTINUE + IF( .NOT.TRANA.AND..NOT.TRANB )THEN + DO 30 K = 1, KK + DO 20 I = 1, M + CT( I ) = CT( I ) + A( I, K )*B( K, J ) + G( I ) = G( I ) + ABS1( A( I, K ) )*ABS1( B( K, J ) ) + 20 CONTINUE + 30 CONTINUE + ELSE IF( TRANA.AND..NOT.TRANB )THEN + IF( CTRANA )THEN + DO 50 K = 1, KK + DO 40 I = 1, M + CT( I ) = CT( I ) + CONJG( A( K, I ) )*B( K, J ) + G( I ) = G( I ) + ABS1( A( K, I ) )* + $ ABS1( B( K, J ) ) + 40 CONTINUE + 50 CONTINUE + ELSE + DO 70 K = 1, KK + DO 60 I = 1, M + CT( I ) = CT( I ) + A( K, I )*B( K, J ) + G( I ) = G( I ) + ABS1( A( K, I ) )* + $ ABS1( B( K, J ) ) + 60 CONTINUE + 70 CONTINUE + END IF + ELSE IF( .NOT.TRANA.AND.TRANB )THEN + IF( CTRANB )THEN + DO 90 K = 1, KK + DO 80 I = 1, M + CT( I ) = CT( I ) + A( I, K )*CONJG( B( J, K ) ) + G( I ) = G( I ) + ABS1( A( I, K ) )* + $ ABS1( B( J, K ) ) + 80 CONTINUE + 90 CONTINUE + ELSE + DO 110 K = 1, KK + DO 100 I = 1, M + CT( I ) = CT( I ) + A( I, K )*B( J, K ) + G( 
I ) = G( I ) + ABS1( A( I, K ) )* + $ ABS1( B( J, K ) ) + 100 CONTINUE + 110 CONTINUE + END IF + ELSE IF( TRANA.AND.TRANB )THEN + IF( CTRANA )THEN + IF( CTRANB )THEN + DO 130 K = 1, KK + DO 120 I = 1, M + CT( I ) = CT( I ) + CONJG( A( K, I ) )* + $ CONJG( B( J, K ) ) + G( I ) = G( I ) + ABS1( A( K, I ) )* + $ ABS1( B( J, K ) ) + 120 CONTINUE + 130 CONTINUE + ELSE + DO 150 K = 1, KK + DO 140 I = 1, M + CT( I ) = CT( I ) + CONJG( A( K, I ) )*B( J, K ) + G( I ) = G( I ) + ABS1( A( K, I ) )* + $ ABS1( B( J, K ) ) + 140 CONTINUE + 150 CONTINUE + END IF + ELSE + IF( CTRANB )THEN + DO 170 K = 1, KK + DO 160 I = 1, M + CT( I ) = CT( I ) + A( K, I )*CONJG( B( J, K ) ) + G( I ) = G( I ) + ABS1( A( K, I ) )* + $ ABS1( B( J, K ) ) + 160 CONTINUE + 170 CONTINUE + ELSE + DO 190 K = 1, KK + DO 180 I = 1, M + CT( I ) = CT( I ) + A( K, I )*B( J, K ) + G( I ) = G( I ) + ABS1( A( K, I ) )* + $ ABS1( B( J, K ) ) + 180 CONTINUE + 190 CONTINUE + END IF + END IF + END IF + DO 200 I = 1, M + CT( I ) = ALPHA*CT( I ) + BETA*C( I, J ) + G( I ) = ABS1( ALPHA )*G( I ) + + $ ABS1( BETA )*ABS1( C( I, J ) ) + 200 CONTINUE +* +* Compute the error ratio for this result. +* + ERR = ZERO + DO 210 I = 1, M + ERRI = ABS1( CT( I ) - CC( I, J ) )/EPS + IF( G( I ).NE.RZERO ) + $ ERRI = ERRI/G( I ) + ERR = MAX( ERR, ERRI ) + IF( ERR*SQRT( EPS ).GE.RONE ) + $ GO TO 230 + 210 CONTINUE +* + 220 CONTINUE +* +* If the loop completes, all results are at least half accurate. + GO TO 250 +* +* Report fatal error. +* + 230 FATAL = .TRUE. 
+ WRITE( NOUT, FMT = 9999 ) + DO 240 I = 1, M + IF( MV )THEN + WRITE( NOUT, FMT = 9998 )I, CT( I ), CC( I, J ) + ELSE + WRITE( NOUT, FMT = 9998 )I, CC( I, J ), CT( I ) + END IF + 240 CONTINUE + IF( N.GT.1 ) + $ WRITE( NOUT, FMT = 9997 )J +* + 250 CONTINUE + RETURN +* + 9999 FORMAT( ' ******* FATAL ERROR - COMPUTED RESULT IS LESS THAN HAL', + $ 'F ACCURATE *******', /' EXPECTED RE', + $ 'SULT COMPUTED RESULT' ) + 9998 FORMAT( 1X, I7, 2( ' (', G15.6, ',', G15.6, ')' ) ) + 9997 FORMAT( ' THESE ARE THE RESULTS FOR COLUMN ', I3 ) +* +* End of CMMCH. +* + END + LOGICAL FUNCTION LCE( RI, RJ, LR ) +* +* Tests if two arrays are identical. +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Scalar Arguments .. + INTEGER LR +* .. Array Arguments .. + COMPLEX RI( * ), RJ( * ) +* .. Local Scalars .. + INTEGER I +* .. Executable Statements .. + DO 10 I = 1, LR + IF( RI( I ).NE.RJ( I ) ) + $ GO TO 20 + 10 CONTINUE + LCE = .TRUE. + GO TO 30 + 20 CONTINUE + LCE = .FALSE. + 30 RETURN +* +* End of LCE. +* + END + LOGICAL FUNCTION LCERES( TYPE, UPLO, M, N, AA, AS, LDA ) +* +* Tests if selected elements in two arrays are equal. +* +* TYPE is 'GE' or 'HE' or 'SY'. +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Scalar Arguments .. + INTEGER LDA, M, N + CHARACTER*1 UPLO + CHARACTER*2 TYPE +* .. Array Arguments .. + COMPLEX AA( LDA, * ), AS( LDA, * ) +* .. Local Scalars .. + INTEGER I, IBEG, IEND, J + LOGICAL UPPER +* .. Executable Statements .. 
+ UPPER = UPLO.EQ.'U' + IF( TYPE.EQ.'GE' )THEN + DO 20 J = 1, N + DO 10 I = M + 1, LDA + IF( AA( I, J ).NE.AS( I, J ) ) + $ GO TO 70 + 10 CONTINUE + 20 CONTINUE + ELSE IF( TYPE.EQ.'HE'.OR.TYPE.EQ.'SY' )THEN + DO 50 J = 1, N + IF( UPPER )THEN + IBEG = 1 + IEND = J + ELSE + IBEG = J + IEND = N + END IF + DO 30 I = 1, IBEG - 1 + IF( AA( I, J ).NE.AS( I, J ) ) + $ GO TO 70 + 30 CONTINUE + DO 40 I = IEND + 1, LDA + IF( AA( I, J ).NE.AS( I, J ) ) + $ GO TO 70 + 40 CONTINUE + 50 CONTINUE + END IF +* + 60 CONTINUE + LCERES = .TRUE. + GO TO 80 + 70 CONTINUE + LCERES = .FALSE. + 80 RETURN +* +* End of LCERES. +* + END + COMPLEX FUNCTION CBEG( RESET ) +* +* Generates complex numbers as pairs of random numbers uniformly +* distributed between -0.5 and 0.5. +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Scalar Arguments .. + LOGICAL RESET +* .. Local Scalars .. + INTEGER I, IC, J, MI, MJ +* .. Save statement .. + SAVE I, IC, J, MI, MJ +* .. Intrinsic Functions .. + INTRINSIC CMPLX +* .. Executable Statements .. + IF( RESET )THEN +* Initialize local variables. + MI = 891 + MJ = 457 + I = 7 + J = 7 + IC = 0 + RESET = .FALSE. + END IF +* +* The sequence of values of I or J is bounded between 1 and 999. +* If initial I or J = 1,2,3,6,7 or 9, the period will be 50. +* If initial I or J = 4 or 8, the period will be 25. +* If initial I or J = 5, the period will be 10. +* IC is used to break up the period by skipping 1 value of I or J +* in 6. +* + IC = IC + 1 + 10 I = I*MI + J = J*MJ + I = I - 1000*( I/1000 ) + J = J - 1000*( J/1000 ) + IF( IC.GE.5 )THEN + IC = 0 + GO TO 10 + END IF + CBEG = CMPLX( ( I - 500 )/1001.0, ( J - 500 )/1001.0 ) + RETURN +* +* End of CBEG. 
+* + END + REAL FUNCTION SDIFF( X, Y ) +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Scalar Arguments .. + REAL X, Y +* .. Executable Statements .. + SDIFF = X - Y + RETURN +* +* End of SDIFF. +* + END + SUBROUTINE CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) +* +* Tests whether XERBLA has detected an error when it should. +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Scalar Arguments .. + INTEGER INFOT, NOUT + LOGICAL LERR, OK + CHARACTER*8 SRNAMT +* .. Executable Statements .. + IF( .NOT.LERR )THEN + WRITE( NOUT, FMT = 9999 )INFOT, SRNAMT + OK = .FALSE. + END IF + LERR = .FALSE. + RETURN +* + 9999 FORMAT( ' ***** ILLEGAL VALUE OF PARAMETER NUMBER ', I2, ' NOT D', + $ 'ETECTED BY ', A8, ' *****' ) +* +* End of CHKXER. +* + END + SUBROUTINE XERBLA( SRNAME, INFO ) +* +* This is a special version of XERBLA to be used only as part of +* the test program for testing error exits from the Level 3 BLAS +* routines. +* +* XERBLA is an error handler for the Level 3 BLAS routines. +* +* It is called by the Level 3 BLAS routines if an input parameter is +* invalid. +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Scalar Arguments .. + INTEGER INFO + CHARACTER*8 SRNAME +* .. Scalars in Common .. + INTEGER INFOT, NOUT + LOGICAL LERR, OK + CHARACTER*8 SRNAMT +* .. Common blocks .. 
+ COMMON /INFOC/INFOT, NOUT, OK, LERR + COMMON /SRNAMC/SRNAMT +* .. Executable Statements .. + LERR = .TRUE. + IF( INFO.NE.INFOT )THEN + IF( INFOT.NE.0 )THEN + WRITE( NOUT, FMT = 9999 )INFO, INFOT + ELSE + WRITE( NOUT, FMT = 9997 )INFO + END IF + OK = .FALSE. + END IF + IF( SRNAME.NE.SRNAMT )THEN + WRITE( NOUT, FMT = 9998 )SRNAME, SRNAMT + OK = .FALSE. + END IF + RETURN +* + 9999 FORMAT( ' ******* XERBLA WAS CALLED WITH INFO = ', I6, ' INSTEAD', + $ ' OF ', I2, ' *******' ) + 9998 FORMAT( ' ******* XERBLA WAS CALLED WITH SRNAME = ', A8, ' INSTE', + $ 'AD OF ', A8, ' *******' ) + 9997 FORMAT( ' ******* XERBLA WAS CALLED WITH INFO = ', I6, + $ ' *******' ) +* +* End of XERBLA +* + END + From d49fd33885d7f06ca545f3061dbb815be704cf58 Mon Sep 17 00:00:00 2001 From: wernsaar Date: Sat, 20 Sep 2014 15:27:40 +0200 Subject: [PATCH 112/119] disabled SYMM3M and HEMM3M functions because segment violations --- exports/gensymbol | 4 +- interface/Makefile | 6 ++- test/cblat3_3m.dat | 4 +- test/cblat3_3m.f | 106 ++++++++++++++++++++++----------------------- test/zblat3_3m.dat | 4 +- test/zblat3_3m.f | 106 ++++++++++++++++++++++----------------------- 6 files changed, 115 insertions(+), 115 deletions(-) diff --git a/exports/gensymbol b/exports/gensymbol index 69454d71b..2a50a8df2 100644 --- a/exports/gensymbol +++ b/exports/gensymbol @@ -75,9 +75,7 @@ ); @gemm3mobjs = ( - cgemm3m,zgemm3m, - chemm3m,zhemm3m, - csymm3m,zsymm3m + cgemm3m,zgemm3m ); diff --git a/interface/Makefile b/interface/Makefile index 567224119..65f4e446d 100644 --- a/interface/Makefile +++ b/interface/Makefile @@ -128,9 +128,11 @@ ZBLAS3OBJS = \ ifeq ($(SUPPORT_GEMM3M), 1) -CBLAS3OBJS += cgemm3m.$(SUFFIX) csymm3m.$(SUFFIX) chemm3m.$(SUFFIX) +# CBLAS3OBJS += cgemm3m.$(SUFFIX) csymm3m.$(SUFFIX) chemm3m.$(SUFFIX) +CBLAS3OBJS += cgemm3m.$(SUFFIX) -ZBLAS3OBJS += zgemm3m.$(SUFFIX) zsymm3m.$(SUFFIX) zhemm3m.$(SUFFIX) +# ZBLAS3OBJS += zgemm3m.$(SUFFIX) zsymm3m.$(SUFFIX) zhemm3m.$(SUFFIX) +ZBLAS3OBJS += zgemm3m.$(SUFFIX) 
endif diff --git a/test/cblat3_3m.dat b/test/cblat3_3m.dat index fa7b96ff3..cc1a2cef4 100644 --- a/test/cblat3_3m.dat +++ b/test/cblat3_3m.dat @@ -13,8 +13,8 @@ F LOGICAL FLAG, T TO TEST ERROR EXITS. 3 NUMBER OF VALUES OF BETA (0.0,0.0) (1.0,0.0) (1.3,-1.1) VALUES OF BETA CGEMM3M T PUT F FOR NO TEST. SAME COLUMNS. -CHEMM3M F PUT F FOR NO TEST. SAME COLUMNS. -CSYMM3M F PUT F FOR NO TEST. SAME COLUMNS. +CHEMM F PUT F FOR NO TEST. SAME COLUMNS. +CSYMM F PUT F FOR NO TEST. SAME COLUMNS. CTRMM F PUT F FOR NO TEST. SAME COLUMNS. CTRSM F PUT F FOR NO TEST. SAME COLUMNS. CHERK F PUT F FOR NO TEST. SAME COLUMNS. diff --git a/test/cblat3_3m.f b/test/cblat3_3m.f index 6cb366d64..19f7830be 100644 --- a/test/cblat3_3m.f +++ b/test/cblat3_3m.f @@ -22,8 +22,8 @@ * 3 NUMBER OF VALUES OF BETA * (0.0,0.0) (1.0,0.0) (1.3,-1.1) VALUES OF BETA * CGEMM3M T PUT F FOR NO TEST. SAME COLUMNS. -* CHEMM3M T PUT F FOR NO TEST. SAME COLUMNS. -* CSYMM3M T PUT F FOR NO TEST. SAME COLUMNS. +* CHEMM T PUT F FOR NO TEST. SAME COLUMNS. +* CSYMM T PUT F FOR NO TEST. SAME COLUMNS. * CTRMM T PUT F FOR NO TEST. SAME COLUMNS. * CTRSM T PUT F FOR NO TEST. SAME COLUMNS. * CHERK T PUT F FOR NO TEST. SAME COLUMNS. @@ -94,7 +94,7 @@ COMMON /INFOC/INFOT, NOUTC, OK, LERR COMMON /SRNAMC/SRNAMT * .. Data statements .. - DATA SNAMES/'CGEMM3M ', 'CHEMM3M ', 'CSYMM3M ', + DATA SNAMES/'CGEMM3M ', 'CHEMM ', 'CSYMM ', $ 'CTRMM ', $ 'CTRSM ', 'CHERK ', 'CSYRK ', 'CHER2K', $ 'CSYR2K'/ @@ -289,7 +289,7 @@ $ NMAX, AB, AA, AS, AB( 1, NMAX + 1 ), BB, BS, C, $ CC, CS, CT, G ) GO TO 190 -* Test CHEMM3M, 02, CSYMM3M, 03. +* Test CHEMM, 02, CSYMM, 03. 150 CALL CCHK2( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, $ NMAX, AB, AA, AS, AB( 1, NMAX + 1 ), BB, BS, C, @@ -653,7 +653,7 @@ $ FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, NMAX, $ A, AA, AS, B, BB, BS, C, CC, CS, CT, G ) * -* Tests CHEMM3M and CSYMM3M. +* Tests CHEMM and CSYMM. 
* * Auxiliary routine for test program for Level 3 Blas. * @@ -696,7 +696,7 @@ LOGICAL LCE, LCERES EXTERNAL LCE, LCERES * .. External Subroutines .. - EXTERNAL CHEMM3M, CMAKE, CMMCH, CSYMM3M + EXTERNAL CHEMM, CMAKE, CMMCH, CSYMM * .. Intrinsic Functions .. INTRINSIC MAX * .. Scalars in Common .. @@ -811,10 +811,10 @@ IF( REWI ) $ REWIND NTRA IF( CONJ )THEN - CALL CHEMM3M( SIDE, UPLO, M, N, ALPHA, AA, LDA, + CALL CHEMM( SIDE, UPLO, M, N, ALPHA, AA, LDA, $ BB, LDB, BETA, CC, LDC ) ELSE - CALL CSYMM3M( SIDE, UPLO, M, N, ALPHA, AA, LDA, + CALL CSYMM( SIDE, UPLO, M, N, ALPHA, AA, LDA, $ BB, LDB, BETA, CC, LDC ) END IF * @@ -1971,7 +1971,7 @@ * .. Local Arrays .. COMPLEX A( 2, 1 ), B( 2, 1 ), C( 2, 1 ) * .. External Subroutines .. - EXTERNAL CGEMM3M, CHEMM3M, CHER2K, CHERK, CHKXER, CSYMM3M, + EXTERNAL CGEMM3M, CHEMM, CHER2K, CHERK, CHKXER, CSYMM, $ CSYR2K, CSYRK, CTRMM, CTRSM * .. Common blocks .. COMMON /INFOC/INFOT, NOUTC, OK, LERR @@ -2166,137 +2166,137 @@ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) GO TO 100 20 INFOT = 1 - CALL CHEMM3M( '/', 'U', 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHEMM( '/', 'U', 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 2 - CALL CHEMM3M( 'L', '/', 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHEMM( 'L', '/', 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 3 - CALL CHEMM3M( 'L', 'U', -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHEMM( 'L', 'U', -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 3 - CALL CHEMM3M( 'R', 'U', -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHEMM( 'R', 'U', -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 3 - CALL CHEMM3M( 'L', 'L', -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHEMM( 'L', 'L', -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 3 - CALL CHEMM3M( 'R', 'L', -1, 0, ALPHA, A, 1, B, 1, 
BETA, C, 1 ) + CALL CHEMM( 'R', 'L', -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 4 - CALL CHEMM3M( 'L', 'U', 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHEMM( 'L', 'U', 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 4 - CALL CHEMM3M( 'R', 'U', 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHEMM( 'R', 'U', 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 4 - CALL CHEMM3M( 'L', 'L', 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHEMM( 'L', 'L', 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 4 - CALL CHEMM3M( 'R', 'L', 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHEMM( 'R', 'L', 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 7 - CALL CHEMM3M( 'L', 'U', 2, 0, ALPHA, A, 1, B, 2, BETA, C, 2 ) + CALL CHEMM( 'L', 'U', 2, 0, ALPHA, A, 1, B, 2, BETA, C, 2 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 7 - CALL CHEMM3M( 'R', 'U', 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHEMM( 'R', 'U', 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 7 - CALL CHEMM3M( 'L', 'L', 2, 0, ALPHA, A, 1, B, 2, BETA, C, 2 ) + CALL CHEMM( 'L', 'L', 2, 0, ALPHA, A, 1, B, 2, BETA, C, 2 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 7 - CALL CHEMM3M( 'R', 'L', 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHEMM( 'R', 'L', 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 - CALL CHEMM3M( 'L', 'U', 2, 0, ALPHA, A, 2, B, 1, BETA, C, 1 ) + CALL CHEMM( 'L', 'U', 2, 0, ALPHA, A, 2, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 - CALL CHEMM3M( 'R', 'U', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHEMM( 'R', 'U', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 - CALL CHEMM3M( 'L', 'L', 2, 0, ALPHA, A, 2, B, 1, 
BETA, C, 1 ) + CALL CHEMM( 'L', 'L', 2, 0, ALPHA, A, 2, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 - CALL CHEMM3M( 'R', 'L', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHEMM( 'R', 'L', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 12 - CALL CHEMM3M( 'L', 'U', 2, 0, ALPHA, A, 2, B, 2, BETA, C, 1 ) + CALL CHEMM( 'L', 'U', 2, 0, ALPHA, A, 2, B, 2, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 12 - CALL CHEMM3M( 'R', 'U', 2, 0, ALPHA, A, 1, B, 2, BETA, C, 1 ) + CALL CHEMM( 'R', 'U', 2, 0, ALPHA, A, 1, B, 2, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 12 - CALL CHEMM3M( 'L', 'L', 2, 0, ALPHA, A, 2, B, 2, BETA, C, 1 ) + CALL CHEMM( 'L', 'L', 2, 0, ALPHA, A, 2, B, 2, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 12 - CALL CHEMM3M( 'R', 'L', 2, 0, ALPHA, A, 1, B, 2, BETA, C, 1 ) + CALL CHEMM( 'R', 'L', 2, 0, ALPHA, A, 1, B, 2, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) GO TO 100 30 INFOT = 1 - CALL CSYMM3M( '/', 'U', 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CSYMM( '/', 'U', 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 2 - CALL CSYMM3M( 'L', '/', 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CSYMM( 'L', '/', 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 3 - CALL CSYMM3M( 'L', 'U', -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CSYMM( 'L', 'U', -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 3 - CALL CSYMM3M( 'R', 'U', -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CSYMM( 'R', 'U', -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 3 - CALL CSYMM3M( 'L', 'L', -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CSYMM( 'L', 'L', -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 3 - CALL CSYMM3M( 'R', 'L', -1, 0, 
ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CSYMM( 'R', 'L', -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 4 - CALL CSYMM3M( 'L', 'U', 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CSYMM( 'L', 'U', 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 4 - CALL CSYMM3M( 'R', 'U', 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CSYMM( 'R', 'U', 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 4 - CALL CSYMM3M( 'L', 'L', 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CSYMM( 'L', 'L', 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 4 - CALL CSYMM3M( 'R', 'L', 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CSYMM( 'R', 'L', 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 7 - CALL CSYMM3M( 'L', 'U', 2, 0, ALPHA, A, 1, B, 2, BETA, C, 2 ) + CALL CSYMM( 'L', 'U', 2, 0, ALPHA, A, 1, B, 2, BETA, C, 2 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 7 - CALL CSYMM3M( 'R', 'U', 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CSYMM( 'R', 'U', 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 7 - CALL CSYMM3M( 'L', 'L', 2, 0, ALPHA, A, 1, B, 2, BETA, C, 2 ) + CALL CSYMM( 'L', 'L', 2, 0, ALPHA, A, 1, B, 2, BETA, C, 2 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 7 - CALL CSYMM3M( 'R', 'L', 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CSYMM( 'R', 'L', 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 - CALL CSYMM3M( 'L', 'U', 2, 0, ALPHA, A, 2, B, 1, BETA, C, 1 ) + CALL CSYMM( 'L', 'U', 2, 0, ALPHA, A, 2, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 - CALL CSYMM3M( 'R', 'U', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CSYMM( 'R', 'U', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 - CALL CSYMM3M( 'L', 'L', 2, 0, 
ALPHA, A, 2, B, 1, BETA, C, 1 ) + CALL CSYMM( 'L', 'L', 2, 0, ALPHA, A, 2, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 - CALL CSYMM3M( 'R', 'L', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CSYMM( 'R', 'L', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 12 - CALL CSYMM3M( 'L', 'U', 2, 0, ALPHA, A, 2, B, 2, BETA, C, 1 ) + CALL CSYMM( 'L', 'U', 2, 0, ALPHA, A, 2, B, 2, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 12 - CALL CSYMM3M( 'R', 'U', 2, 0, ALPHA, A, 1, B, 2, BETA, C, 1 ) + CALL CSYMM( 'R', 'U', 2, 0, ALPHA, A, 1, B, 2, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 12 - CALL CSYMM3M( 'L', 'L', 2, 0, ALPHA, A, 2, B, 2, BETA, C, 1 ) + CALL CSYMM( 'L', 'L', 2, 0, ALPHA, A, 2, B, 2, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 12 - CALL CSYMM3M( 'R', 'L', 2, 0, ALPHA, A, 1, B, 2, BETA, C, 1 ) + CALL CSYMM( 'R', 'L', 2, 0, ALPHA, A, 1, B, 2, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) GO TO 100 40 INFOT = 1 diff --git a/test/zblat3_3m.dat b/test/zblat3_3m.dat index 629b5974a..f48cc19db 100644 --- a/test/zblat3_3m.dat +++ b/test/zblat3_3m.dat @@ -13,8 +13,8 @@ F LOGICAL FLAG, T TO TEST ERROR EXITS. 3 NUMBER OF VALUES OF BETA (0.0,0.0) (1.0,0.0) (1.3,-1.1) VALUES OF BETA ZGEMM3M T PUT F FOR NO TEST. SAME COLUMNS. -ZHEMM3M F PUT F FOR NO TEST. SAME COLUMNS. -ZSYMM3M F PUT F FOR NO TEST. SAME COLUMNS. +ZHEMM F PUT F FOR NO TEST. SAME COLUMNS. +ZSYMM F PUT F FOR NO TEST. SAME COLUMNS. ZTRMM F PUT F FOR NO TEST. SAME COLUMNS. ZTRSM F PUT F FOR NO TEST. SAME COLUMNS. ZHERK F PUT F FOR NO TEST. SAME COLUMNS. diff --git a/test/zblat3_3m.f b/test/zblat3_3m.f index 9bc412a53..8ec0396c5 100644 --- a/test/zblat3_3m.f +++ b/test/zblat3_3m.f @@ -22,8 +22,8 @@ * 3 NUMBER OF VALUES OF BETA * (0.0,0.0) (1.0,0.0) (1.3,-1.1) VALUES OF BETA * ZGEMM3M T PUT F FOR NO TEST. SAME COLUMNS. -* ZHEMM3M T PUT F FOR NO TEST. SAME COLUMNS. 
-* ZSYMM3M T PUT F FOR NO TEST. SAME COLUMNS. +* ZHEMM T PUT F FOR NO TEST. SAME COLUMNS. +* ZSYMM T PUT F FOR NO TEST. SAME COLUMNS. * ZTRMM T PUT F FOR NO TEST. SAME COLUMNS. * ZTRSM T PUT F FOR NO TEST. SAME COLUMNS. * ZHERK T PUT F FOR NO TEST. SAME COLUMNS. @@ -95,7 +95,7 @@ COMMON /INFOC/INFOT, NOUTC, OK, LERR COMMON /SRNAMC/SRNAMT * .. Data statements .. - DATA SNAMES/'ZGEMM3M ', 'ZHEMM3M ', 'ZSYMM3M ', + DATA SNAMES/'ZGEMM3M ', 'ZHEMM ', 'ZSYMM ', $ 'ZTRMM ', $ 'ZTRSM ', 'ZHERK ', 'ZSYRK ', 'ZHER2K', $ 'ZSYR2K'/ @@ -290,7 +290,7 @@ $ NMAX, AB, AA, AS, AB( 1, NMAX + 1 ), BB, BS, C, $ CC, CS, CT, G ) GO TO 190 -* Test ZHEMM3M, 02, ZSYMM3M, 03. +* Test ZHEMM, 02, ZSYMM, 03. 150 CALL ZCHK2( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, $ NMAX, AB, AA, AS, AB( 1, NMAX + 1 ), BB, BS, C, @@ -654,7 +654,7 @@ $ FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, NMAX, $ A, AA, AS, B, BB, BS, C, CC, CS, CT, G ) * -* Tests ZHEMM3M and ZSYMM3M. +* Tests ZHEMM and ZSYMM. * * Auxiliary routine for test program for Level 3 Blas. * @@ -697,7 +697,7 @@ LOGICAL LZE, LZERES EXTERNAL LZE, LZERES * .. External Subroutines .. - EXTERNAL ZHEMM3M, ZMAKE, ZMMCH, ZSYMM3M + EXTERNAL ZHEMM, ZMAKE, ZMMCH, ZSYMM * .. Intrinsic Functions .. INTRINSIC MAX * .. Scalars in Common .. @@ -812,10 +812,10 @@ IF( REWI ) $ REWIND NTRA IF( CONJ )THEN - CALL ZHEMM3M( SIDE, UPLO, M, N, ALPHA, AA, LDA, + CALL ZHEMM( SIDE, UPLO, M, N, ALPHA, AA, LDA, $ BB, LDB, BETA, CC, LDC ) ELSE - CALL ZSYMM3M( SIDE, UPLO, M, N, ALPHA, AA, LDA, + CALL ZSYMM( SIDE, UPLO, M, N, ALPHA, AA, LDA, $ BB, LDB, BETA, CC, LDC ) END IF * @@ -1974,7 +1974,7 @@ * .. Local Arrays .. COMPLEX*16 A( 2, 1 ), B( 2, 1 ), C( 2, 1 ) * .. External Subroutines .. - EXTERNAL ZGEMM3M, ZHEMM3M, ZHER2K, ZHERK, CHKXER, ZSYMM3M, + EXTERNAL ZGEMM3M, ZHEMM, ZHER2K, ZHERK, CHKXER, ZSYMM, $ ZSYR2K, ZSYRK, ZTRMM, ZTRSM * .. Common blocks .. 
COMMON /INFOC/INFOT, NOUTC, OK, LERR @@ -2169,137 +2169,137 @@ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) GO TO 100 20 INFOT = 1 - CALL ZHEMM3M( '/', 'U', 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL ZHEMM( '/', 'U', 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 2 - CALL ZHEMM3M( 'L', '/', 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL ZHEMM( 'L', '/', 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 3 - CALL ZHEMM3M( 'L', 'U', -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL ZHEMM( 'L', 'U', -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 3 - CALL ZHEMM3M( 'R', 'U', -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL ZHEMM( 'R', 'U', -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 3 - CALL ZHEMM3M( 'L', 'L', -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL ZHEMM( 'L', 'L', -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 3 - CALL ZHEMM3M( 'R', 'L', -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL ZHEMM( 'R', 'L', -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 4 - CALL ZHEMM3M( 'L', 'U', 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL ZHEMM( 'L', 'U', 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 4 - CALL ZHEMM3M( 'R', 'U', 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL ZHEMM( 'R', 'U', 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 4 - CALL ZHEMM3M( 'L', 'L', 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL ZHEMM( 'L', 'L', 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 4 - CALL ZHEMM3M( 'R', 'L', 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL ZHEMM( 'R', 'L', 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 7 - CALL ZHEMM3M( 'L', 'U', 2, 0, ALPHA, A, 
1, B, 2, BETA, C, 2 ) + CALL ZHEMM( 'L', 'U', 2, 0, ALPHA, A, 1, B, 2, BETA, C, 2 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 7 - CALL ZHEMM3M( 'R', 'U', 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL ZHEMM( 'R', 'U', 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 7 - CALL ZHEMM3M( 'L', 'L', 2, 0, ALPHA, A, 1, B, 2, BETA, C, 2 ) + CALL ZHEMM( 'L', 'L', 2, 0, ALPHA, A, 1, B, 2, BETA, C, 2 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 7 - CALL ZHEMM3M( 'R', 'L', 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL ZHEMM( 'R', 'L', 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 - CALL ZHEMM3M( 'L', 'U', 2, 0, ALPHA, A, 2, B, 1, BETA, C, 1 ) + CALL ZHEMM( 'L', 'U', 2, 0, ALPHA, A, 2, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 - CALL ZHEMM3M( 'R', 'U', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL ZHEMM( 'R', 'U', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 - CALL ZHEMM3M( 'L', 'L', 2, 0, ALPHA, A, 2, B, 1, BETA, C, 1 ) + CALL ZHEMM( 'L', 'L', 2, 0, ALPHA, A, 2, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 - CALL ZHEMM3M( 'R', 'L', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL ZHEMM( 'R', 'L', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 12 - CALL ZHEMM3M( 'L', 'U', 2, 0, ALPHA, A, 2, B, 2, BETA, C, 1 ) + CALL ZHEMM( 'L', 'U', 2, 0, ALPHA, A, 2, B, 2, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 12 - CALL ZHEMM3M( 'R', 'U', 2, 0, ALPHA, A, 1, B, 2, BETA, C, 1 ) + CALL ZHEMM( 'R', 'U', 2, 0, ALPHA, A, 1, B, 2, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 12 - CALL ZHEMM3M( 'L', 'L', 2, 0, ALPHA, A, 2, B, 2, BETA, C, 1 ) + CALL ZHEMM( 'L', 'L', 2, 0, ALPHA, A, 2, B, 2, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 12 - CALL ZHEMM3M( 'R', 'L', 2, 0, ALPHA, A, 1, 
B, 2, BETA, C, 1 ) + CALL ZHEMM( 'R', 'L', 2, 0, ALPHA, A, 1, B, 2, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) GO TO 100 30 INFOT = 1 - CALL ZSYMM3M( '/', 'U', 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL ZSYMM( '/', 'U', 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 2 - CALL ZSYMM3M( 'L', '/', 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL ZSYMM( 'L', '/', 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 3 - CALL ZSYMM3M( 'L', 'U', -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL ZSYMM( 'L', 'U', -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 3 - CALL ZSYMM3M( 'R', 'U', -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL ZSYMM( 'R', 'U', -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 3 - CALL ZSYMM3M( 'L', 'L', -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL ZSYMM( 'L', 'L', -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 3 - CALL ZSYMM3M( 'R', 'L', -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL ZSYMM( 'R', 'L', -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 4 - CALL ZSYMM3M( 'L', 'U', 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL ZSYMM( 'L', 'U', 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 4 - CALL ZSYMM3M( 'R', 'U', 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL ZSYMM( 'R', 'U', 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 4 - CALL ZSYMM3M( 'L', 'L', 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL ZSYMM( 'L', 'L', 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 4 - CALL ZSYMM3M( 'R', 'L', 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL ZSYMM( 'R', 'L', 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 7 - CALL ZSYMM3M( 'L', 
'U', 2, 0, ALPHA, A, 1, B, 2, BETA, C, 2 ) + CALL ZSYMM( 'L', 'U', 2, 0, ALPHA, A, 1, B, 2, BETA, C, 2 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 7 - CALL ZSYMM3M( 'R', 'U', 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL ZSYMM( 'R', 'U', 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 7 - CALL ZSYMM3M( 'L', 'L', 2, 0, ALPHA, A, 1, B, 2, BETA, C, 2 ) + CALL ZSYMM( 'L', 'L', 2, 0, ALPHA, A, 1, B, 2, BETA, C, 2 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 7 - CALL ZSYMM3M( 'R', 'L', 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL ZSYMM( 'R', 'L', 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 - CALL ZSYMM3M( 'L', 'U', 2, 0, ALPHA, A, 2, B, 1, BETA, C, 1 ) + CALL ZSYMM( 'L', 'U', 2, 0, ALPHA, A, 2, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 - CALL ZSYMM3M( 'R', 'U', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL ZSYMM( 'R', 'U', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 - CALL ZSYMM3M( 'L', 'L', 2, 0, ALPHA, A, 2, B, 1, BETA, C, 1 ) + CALL ZSYMM( 'L', 'L', 2, 0, ALPHA, A, 2, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 - CALL ZSYMM3M( 'R', 'L', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL ZSYMM( 'R', 'L', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 12 - CALL ZSYMM3M( 'L', 'U', 2, 0, ALPHA, A, 2, B, 2, BETA, C, 1 ) + CALL ZSYMM( 'L', 'U', 2, 0, ALPHA, A, 2, B, 2, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 12 - CALL ZSYMM3M( 'R', 'U', 2, 0, ALPHA, A, 1, B, 2, BETA, C, 1 ) + CALL ZSYMM( 'R', 'U', 2, 0, ALPHA, A, 1, B, 2, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 12 - CALL ZSYMM3M( 'L', 'L', 2, 0, ALPHA, A, 2, B, 2, BETA, C, 1 ) + CALL ZSYMM( 'L', 'L', 2, 0, ALPHA, A, 2, B, 2, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 12 - CALL ZSYMM3M( 'R', 'L', 
2, 0, ALPHA, A, 1, B, 2, BETA, C, 1 ) + CALL ZSYMM( 'R', 'L', 2, 0, ALPHA, A, 1, B, 2, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) GO TO 100 40 INFOT = 1 From 9e829ce98f18937b5abad9f0e050e73572f32be4 Mon Sep 17 00:00:00 2001 From: wernsaar Date: Sat, 20 Sep 2014 17:20:02 +0200 Subject: [PATCH 113/119] enabled cblas gemm3m functions --- interface/Makefile | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/interface/Makefile b/interface/Makefile index 65f4e446d..54699b7e3 100644 --- a/interface/Makefile +++ b/interface/Makefile @@ -334,6 +334,16 @@ CZBLAS3OBJS = \ cblas_zhemm.$(SUFFIX) cblas_zherk.$(SUFFIX) cblas_zher2k.$(SUFFIX)\ cblas_zomatcopy.$(SUFFIX) cblas_zimatcopy.$(SUFFIX) +ifeq ($(SUPPORT_GEMM3M), 1) + +# CBLAS3OBJS += cgemm3m.$(SUFFIX) csymm3m.$(SUFFIX) chemm3m.$(SUFFIX) +CCBLAS3OBJS += cblas_cgemm3m.$(SUFFIX) + +# ZBLAS3OBJS += zgemm3m.$(SUFFIX) zsymm3m.$(SUFFIX) zhemm3m.$(SUFFIX) +CZBLAS3OBJS += cblas_zgemm3m.$(SUFFIX) + +endif + ifndef NO_CBLAS @@ -1777,6 +1787,13 @@ cblas_cher2k.$(SUFFIX) cblas_cher2k.$(PSUFFIX) : syr2k.c cblas_zher2k.$(SUFFIX) cblas_zher2k.$(PSUFFIX) : syr2k.c $(CC) -DCBLAS -c $(CFLAGS) -DHEMM $< -o $(@F) +cblas_cgemm3m.$(SUFFIX) cblas_cgemm3m.$(PSUFFIX) : gemm.c + $(CC) -DCBLAS -c $(CFLAGS) -DGEMM3M $< -o $(@F) + +cblas_zgemm3m.$(SUFFIX) cblas_zgemm3m.$(PSUFFIX) : gemm.c + $(CC) -DCBLAS -c $(CFLAGS) -DGEMM3M $< -o $(@F) + + sgetf2.$(SUFFIX) sgetf2.$(PSUFFIX) : lapack/getf2.c $(CC) -c $(CFLAGS) $< -o $(@F) From 7f234f8ed14133896577920c9d501ea4ea2d0a64 Mon Sep 17 00:00:00 2001 From: wernsaar Date: Sun, 21 Sep 2014 10:55:08 +0200 Subject: [PATCH 114/119] added GEMM3M tests --- cblas.h | 3 + ctest/Makefile | 20 + ctest/c_c3chke.c | 234 +++- ctest/c_cblas3.c | 82 ++ ctest/c_cblat3_3m.f | 2786 ++++++++++++++++++++++++++++++++++++++++++ ctest/c_z3chke.c | 238 +++- ctest/c_zblas3.c | 79 ++ ctest/c_zblat3_3m.f | 2791 +++++++++++++++++++++++++++++++++++++++++++ ctest/cblas_test.h | 6 + ctest/cin3_3m | 22 + 
ctest/zin3_3m | 22 + exports/gensymbol | 2 +- test/zblat3_3m.dat | 4 +- test/zblat3_3m.f | 50 +- 14 files changed, 6307 insertions(+), 32 deletions(-) create mode 100644 ctest/c_cblat3_3m.f create mode 100644 ctest/c_zblat3_3m.f create mode 100644 ctest/cin3_3m create mode 100644 ctest/zin3_3m diff --git a/cblas.h b/cblas.h index 2d46049d2..d772fde3f 100644 --- a/cblas.h +++ b/cblas.h @@ -245,6 +245,9 @@ void cblas_cgemm(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLA OPENBLAS_CONST float *alpha, OPENBLAS_CONST float *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST float *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST float *beta, float *C, OPENBLAS_CONST blasint ldc); void cblas_zgemm(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransB, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K, OPENBLAS_CONST double *alpha, OPENBLAS_CONST double *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST double *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST double *beta, double *C, OPENBLAS_CONST blasint ldc); +void cblas_zgemm3m(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransB, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K, + OPENBLAS_CONST double *alpha, OPENBLAS_CONST double *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST double *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST double *beta, double *C, OPENBLAS_CONST blasint ldc); + void cblas_ssymm(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_SIDE Side, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, OPENBLAS_CONST float alpha, OPENBLAS_CONST float *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST float *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST float beta, float *C, OPENBLAS_CONST blasint ldc); diff --git a/ctest/Makefile b/ctest/Makefile index 
70d3f9712..1d9567150 100644 --- a/ctest/Makefile +++ b/ctest/Makefile @@ -74,6 +74,18 @@ else OPENBLAS_NUM_THREADS=2 ./xzcblat3 < zin3 endif +all3_3m: xzcblat3_3m xccblat3_3m +ifeq ($(USE_OPENMP), 1) + OMP_NUM_THREADS=2 ./xccblat3_3m < cin3_3m + OMP_NUM_THREADS=2 ./xzcblat3_3m < zin3_3m +else + OPENBLAS_NUM_THREADS=2 ./xccblat3_3m < cin3_3m + OPENBLAS_NUM_THREADS=2 ./xzcblat3_3m < zin3_3m +endif + + + + clean :: rm -f x* @@ -103,6 +115,9 @@ xccblat2: $(ctestl2o) c_cblat2.o $(TOPDIR)/$(LIBNAME) xccblat3: $(ctestl3o) c_cblat3.o $(TOPDIR)/$(LIBNAME) $(FC) $(FLDFLAGS) -o xccblat3 c_cblat3.o $(ctestl3o) $(LIB) $(EXTRALIB) $(CEXTRALIB) +xccblat3_3m: $(ctestl3o) c_cblat3_3m.o $(TOPDIR)/$(LIBNAME) + $(FC) $(FLDFLAGS) -o xccblat3_3m c_cblat3_3m.o $(ctestl3o) $(LIB) $(EXTRALIB) $(CEXTRALIB) + # Double complex xzcblat1: $(ztestl1o) c_zblat1.o $(TOPDIR)/$(LIBNAME) $(FC) $(FLDFLAGS) -o xzcblat1 c_zblat1.o $(ztestl1o) $(LIB) $(EXTRALIB) $(CEXTRALIB) @@ -111,4 +126,9 @@ xzcblat2: $(ztestl2o) c_zblat2.o $(TOPDIR)/$(LIBNAME) xzcblat3: $(ztestl3o) c_zblat3.o $(TOPDIR)/$(LIBNAME) $(FC) $(FLDFLAGS) -o xzcblat3 c_zblat3.o $(ztestl3o) $(LIB) $(EXTRALIB) $(CEXTRALIB) + +xzcblat3_3m: $(ztestl3o) c_zblat3_3m.o $(TOPDIR)/$(LIBNAME) + $(FC) $(FLDFLAGS) -o xzcblat3_3m c_zblat3_3m.o $(ztestl3o) $(LIB) $(EXTRALIB) $(CEXTRALIB) + + include $(TOPDIR)/Makefile.tail diff --git a/ctest/c_c3chke.c b/ctest/c_c3chke.c index 1c133fb9b..4d5de5150 100644 --- a/ctest/c_c3chke.c +++ b/ctest/c_c3chke.c @@ -45,8 +45,238 @@ void F77_c3chke(char * rout) { F77_xerbla(cblas_rout,&cblas_info); } - if (strncmp( sf,"cblas_cgemm" ,11)==0) { - cblas_rout = "cblas_cgemm" ; + + if (strncmp( sf,"cblas_cgemm3m" ,13)==0) { + cblas_rout = "cblas_cgemm3" ; + + cblas_info = 1; + cblas_cgemm3m( INVALID, CblasNoTrans, CblasNoTrans, 0, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 1; + cblas_cgemm3m( INVALID, CblasNoTrans, CblasTrans, 0, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 1; 
+ cblas_cgemm3m( INVALID, CblasTrans, CblasNoTrans, 0, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 1; + cblas_cgemm3m( INVALID, CblasTrans, CblasTrans, 0, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_cgemm3m( CblasColMajor, INVALID, CblasNoTrans, 0, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_cgemm3m( CblasColMajor, INVALID, CblasTrans, 0, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_cgemm3m( CblasColMajor, CblasNoTrans, INVALID, 0, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_cgemm3m( CblasColMajor, CblasTrans, INVALID, 0, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_cgemm3m( CblasColMajor, CblasNoTrans, CblasNoTrans, INVALID, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_cgemm3m( CblasColMajor, CblasNoTrans, CblasTrans, INVALID, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_cgemm3m( CblasColMajor, CblasTrans, CblasNoTrans, INVALID, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_cgemm3m( CblasColMajor, CblasTrans, CblasTrans, INVALID, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_cgemm3m( CblasColMajor, CblasNoTrans, CblasNoTrans, 0, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_cgemm3m( CblasColMajor, CblasNoTrans, CblasTrans, 0, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_cgemm3m( CblasColMajor, CblasTrans, CblasNoTrans, 0, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + 
cblas_cgemm3m( CblasColMajor, CblasTrans, CblasTrans, 0, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_cgemm3m( CblasColMajor, CblasNoTrans, CblasNoTrans, 0, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_cgemm3m( CblasColMajor, CblasNoTrans, CblasTrans, 0, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_cgemm3m( CblasColMajor, CblasTrans, CblasNoTrans, 0, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_cgemm3m( CblasColMajor, CblasTrans, CblasTrans, 0, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 9; RowMajorStrg = FALSE; + cblas_cgemm3m( CblasColMajor, CblasNoTrans, CblasNoTrans, 2, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 2 ); + chkxer(); + cblas_info = 9; RowMajorStrg = FALSE; + cblas_cgemm3m( CblasColMajor, CblasNoTrans, CblasTrans, 2, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 2 ); + chkxer(); + cblas_info = 9; RowMajorStrg = FALSE; + cblas_cgemm3m( CblasColMajor, CblasTrans, CblasNoTrans, 0, 0, 2, + ALPHA, A, 1, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 9; RowMajorStrg = FALSE; + cblas_cgemm3m( CblasColMajor, CblasTrans, CblasTrans, 0, 0, 2, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = FALSE; + cblas_cgemm3m( CblasColMajor, CblasNoTrans, CblasNoTrans, 0, 0, 2, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = FALSE; + cblas_cgemm3m( CblasColMajor, CblasTrans, CblasNoTrans, 0, 0, 2, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = FALSE; + cblas_cgemm3m( CblasColMajor, CblasNoTrans, CblasTrans, 0, 2, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = FALSE; + cblas_cgemm3m( CblasColMajor, CblasTrans, CblasTrans, 0, 2, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + 
cblas_info = 14; RowMajorStrg = FALSE; + cblas_cgemm3m( CblasColMajor, CblasNoTrans, CblasNoTrans, 2, 0, 0, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 14; RowMajorStrg = FALSE; + cblas_cgemm3m( CblasColMajor, CblasNoTrans, CblasTrans, 2, 0, 0, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 14; RowMajorStrg = FALSE; + cblas_cgemm3m( CblasColMajor, CblasTrans, CblasNoTrans, 2, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 14; RowMajorStrg = FALSE; + cblas_cgemm3m( CblasColMajor, CblasTrans, CblasTrans, 2, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_cgemm3m( CblasRowMajor, CblasNoTrans, CblasNoTrans, INVALID, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_cgemm3m( CblasRowMajor, CblasNoTrans, CblasTrans, INVALID, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_cgemm3m( CblasRowMajor, CblasTrans, CblasNoTrans, INVALID, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_cgemm3m( CblasRowMajor, CblasTrans, CblasTrans, INVALID, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = TRUE; + cblas_cgemm3m( CblasRowMajor, CblasNoTrans, CblasNoTrans, 0, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = TRUE; + cblas_cgemm3m( CblasRowMajor, CblasNoTrans, CblasTrans, 0, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = TRUE; + cblas_cgemm3m( CblasRowMajor, CblasTrans, CblasNoTrans, 0, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = TRUE; + cblas_cgemm3m( CblasRowMajor, CblasTrans, CblasTrans, 0, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_cgemm3m( CblasRowMajor, CblasNoTrans, CblasNoTrans, 0, 0, 
INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_cgemm3m( CblasRowMajor, CblasNoTrans, CblasTrans, 0, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_cgemm3m( CblasRowMajor, CblasTrans, CblasNoTrans, 0, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_cgemm3m( CblasRowMajor, CblasTrans, CblasTrans, 0, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 9; RowMajorStrg = TRUE; + cblas_cgemm3m( CblasRowMajor, CblasNoTrans, CblasNoTrans, 0, 0, 2, + ALPHA, A, 1, B, 1, BETA, C, 2 ); + chkxer(); + cblas_info = 9; RowMajorStrg = TRUE; + cblas_cgemm3m( CblasRowMajor, CblasNoTrans, CblasTrans, 0, 0, 2, + ALPHA, A, 1, B, 2, BETA, C, 2 ); + chkxer(); + cblas_info = 9; RowMajorStrg = TRUE; + cblas_cgemm3m( CblasRowMajor, CblasTrans, CblasNoTrans, 2, 0, 0, + ALPHA, A, 1, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 9; RowMajorStrg = TRUE; + cblas_cgemm3m( CblasRowMajor, CblasTrans, CblasTrans, 2, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = TRUE; + cblas_cgemm3m( CblasRowMajor, CblasNoTrans, CblasNoTrans, 0, 2, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = TRUE; + cblas_cgemm3m( CblasRowMajor, CblasTrans, CblasNoTrans, 0, 2, 0, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = TRUE; + cblas_cgemm3m( CblasRowMajor, CblasNoTrans, CblasTrans, 0, 0, 2, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = TRUE; + cblas_cgemm3m( CblasRowMajor, CblasTrans, CblasTrans, 0, 0, 2, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 14; RowMajorStrg = TRUE; + cblas_cgemm3m( CblasRowMajor, CblasNoTrans, CblasNoTrans, 0, 2, 0, + ALPHA, A, 1, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 14; RowMajorStrg = TRUE; + cblas_cgemm3m( CblasRowMajor, 
CblasNoTrans, CblasTrans, 0, 2, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 14; RowMajorStrg = TRUE; + cblas_cgemm3m( CblasRowMajor, CblasTrans, CblasNoTrans, 0, 2, 0, + ALPHA, A, 1, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 14; RowMajorStrg = TRUE; + cblas_cgemm3m( CblasRowMajor, CblasTrans, CblasTrans, 0, 2, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + + } else if (strncmp( sf,"cblas_cgemm" ,11)==0) { + cblas_rout = "cblas_cgemm" ; + cblas_info = 1; cblas_cgemm( INVALID, CblasNoTrans, CblasNoTrans, 0, 0, 0, diff --git a/ctest/c_cblas3.c b/ctest/c_cblas3.c index 0b2f6b966..f1b108c64 100644 --- a/ctest/c_cblas3.c +++ b/ctest/c_cblas3.c @@ -88,6 +88,7 @@ void F77_cgemm(int *order, char *transpa, char *transpb, int *m, int *n, cblas_cgemm( UNDEFINED, transa, transb, *m, *n, *k, alpha, a, *lda, b, *ldb, beta, c, *ldc ); } + void F77_chemm(int *order, char *rtlf, char *uplow, int *m, int *n, CBLAS_TEST_COMPLEX *alpha, CBLAS_TEST_COMPLEX *a, int *lda, CBLAS_TEST_COMPLEX *b, int *ldb, CBLAS_TEST_COMPLEX *beta, @@ -563,3 +564,84 @@ void F77_ctrsm(int *order, char *rtlf, char *uplow, char *transp, char *diagn, cblas_ctrsm(UNDEFINED, side, uplo, trans, diag, *m, *n, alpha, a, *lda, b, *ldb); } + + + +void F77_cgemm3m(int *order, char *transpa, char *transpb, int *m, int *n, + int *k, CBLAS_TEST_COMPLEX *alpha, CBLAS_TEST_COMPLEX *a, int *lda, + CBLAS_TEST_COMPLEX *b, int *ldb, CBLAS_TEST_COMPLEX *beta, + CBLAS_TEST_COMPLEX *c, int *ldc ) { + + CBLAS_TEST_COMPLEX *A, *B, *C; + int i,j,LDA, LDB, LDC; + enum CBLAS_TRANSPOSE transa, transb; + + get_transpose_type(transpa, &transa); + get_transpose_type(transpb, &transb); + + if (*order == TEST_ROW_MJR) { + if (transa == CblasNoTrans) { + LDA = *k+1; + A=(CBLAS_TEST_COMPLEX*)malloc((*m)*LDA*sizeof(CBLAS_TEST_COMPLEX)); + for( i=0; i<*m; i++ ) + for( j=0; j<*k; j++ ) { + A[i*LDA+j].real=a[j*(*lda)+i].real; + A[i*LDA+j].imag=a[j*(*lda)+i].imag; + } + } + else { + LDA = *m+1; + 
A=(CBLAS_TEST_COMPLEX* )malloc(LDA*(*k)*sizeof(CBLAS_TEST_COMPLEX)); + for( i=0; i<*k; i++ ) + for( j=0; j<*m; j++ ) { + A[i*LDA+j].real=a[j*(*lda)+i].real; + A[i*LDA+j].imag=a[j*(*lda)+i].imag; + } + } + + if (transb == CblasNoTrans) { + LDB = *n+1; + B=(CBLAS_TEST_COMPLEX* )malloc((*k)*LDB*sizeof(CBLAS_TEST_COMPLEX) ); + for( i=0; i<*k; i++ ) + for( j=0; j<*n; j++ ) { + B[i*LDB+j].real=b[j*(*ldb)+i].real; + B[i*LDB+j].imag=b[j*(*ldb)+i].imag; + } + } + else { + LDB = *k+1; + B=(CBLAS_TEST_COMPLEX* )malloc(LDB*(*n)*sizeof(CBLAS_TEST_COMPLEX)); + for( i=0; i<*n; i++ ) + for( j=0; j<*k; j++ ) { + B[i*LDB+j].real=b[j*(*ldb)+i].real; + B[i*LDB+j].imag=b[j*(*ldb)+i].imag; + } + } + + LDC = *n+1; + C=(CBLAS_TEST_COMPLEX* )malloc((*m)*LDC*sizeof(CBLAS_TEST_COMPLEX)); + for( j=0; j<*n; j++ ) + for( i=0; i<*m; i++ ) { + C[i*LDC+j].real=c[j*(*ldc)+i].real; + C[i*LDC+j].imag=c[j*(*ldc)+i].imag; + } + cblas_cgemm3m( CblasRowMajor, transa, transb, *m, *n, *k, alpha, A, LDA, + B, LDB, beta, C, LDC ); + for( j=0; j<*n; j++ ) + for( i=0; i<*m; i++ ) { + c[j*(*ldc)+i].real=C[i*LDC+j].real; + c[j*(*ldc)+i].imag=C[i*LDC+j].imag; + } + free(A); + free(B); + free(C); + } + else if (*order == TEST_COL_MJR) + cblas_cgemm3m( CblasColMajor, transa, transb, *m, *n, *k, alpha, a, *lda, + b, *ldb, beta, c, *ldc ); + else + cblas_cgemm3m( UNDEFINED, transa, transb, *m, *n, *k, alpha, a, *lda, + b, *ldb, beta, c, *ldc ); +} + + diff --git a/ctest/c_cblat3_3m.f b/ctest/c_cblat3_3m.f new file mode 100644 index 000000000..68dd49859 --- /dev/null +++ b/ctest/c_cblat3_3m.f @@ -0,0 +1,2786 @@ + PROGRAM CBLAT3 +* +* Test program for the COMPLEX Level 3 Blas. +* +* The program must be driven by a short data file. The first 13 records +* of the file are read using list-directed input, the last 9 records +* are read using the format ( A13, L2 ). 
An annotated example of a data +* file can be obtained by deleting the first 3 characters from the +* following 22 lines: +* 'CBLAT3.SNAP' NAME OF SNAPSHOT OUTPUT FILE +* -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) +* F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. +* F LOGICAL FLAG, T TO STOP ON FAILURES. +* T LOGICAL FLAG, T TO TEST ERROR EXITS. +* 2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH +* 16.0 THRESHOLD VALUE OF TEST RATIO +* 6 NUMBER OF VALUES OF N +* 0 1 2 3 5 9 VALUES OF N +* 3 NUMBER OF VALUES OF ALPHA +* (0.0,0.0) (1.0,0.0) (0.7,-0.9) VALUES OF ALPHA +* 3 NUMBER OF VALUES OF BETA +* (0.0,0.0) (1.0,0.0) (1.3,-1.1) VALUES OF BETA +* cblas_cgemm T PUT F FOR NO TEST. SAME COLUMNS. +* cblas_chemm T PUT F FOR NO TEST. SAME COLUMNS. +* cblas_csymm T PUT F FOR NO TEST. SAME COLUMNS. +* cblas_ctrmm T PUT F FOR NO TEST. SAME COLUMNS. +* cblas_ctrsm T PUT F FOR NO TEST. SAME COLUMNS. +* cblas_cherk T PUT F FOR NO TEST. SAME COLUMNS. +* cblas_csyrk T PUT F FOR NO TEST. SAME COLUMNS. +* cblas_cher2k T PUT F FOR NO TEST. SAME COLUMNS. +* cblas_csyr2k T PUT F FOR NO TEST. SAME COLUMNS. +* +* See: +* +* Dongarra J. J., Du Croz J. J., Duff I. S. and Hammarling S. +* A Set of Level 3 Basic Linear Algebra Subprograms. +* +* Technical Memorandum No.88 (Revision 1), Mathematics and +* Computer Science Division, Argonne National Laboratory, 9700 +* South Cass Avenue, Argonne, Illinois 60439, US. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Parameters .. 
+ INTEGER NIN, NOUT + PARAMETER ( NIN = 5, NOUT = 6 ) + INTEGER NSUBS + PARAMETER ( NSUBS = 9 ) + COMPLEX ZERO, ONE + PARAMETER ( ZERO = ( 0.0, 0.0 ), ONE = ( 1.0, 0.0 ) ) + REAL RZERO, RHALF, RONE + PARAMETER ( RZERO = 0.0, RHALF = 0.5, RONE = 1.0 ) + INTEGER NMAX + PARAMETER ( NMAX = 65 ) + INTEGER NIDMAX, NALMAX, NBEMAX + PARAMETER ( NIDMAX = 9, NALMAX = 7, NBEMAX = 7 ) +* .. Local Scalars .. + REAL EPS, ERR, THRESH + INTEGER I, ISNUM, J, N, NALF, NBET, NIDIM, NTRA, + $ LAYOUT + LOGICAL FATAL, LTESTT, REWI, SAME, SFATAL, TRACE, + $ TSTERR, CORDER, RORDER + CHARACTER*1 TRANSA, TRANSB + CHARACTER*13 SNAMET + CHARACTER*32 SNAPS +* .. Local Arrays .. + COMPLEX AA( NMAX*NMAX ), AB( NMAX, 2*NMAX ), + $ ALF( NALMAX ), AS( NMAX*NMAX ), + $ BB( NMAX*NMAX ), BET( NBEMAX ), + $ BS( NMAX*NMAX ), C( NMAX, NMAX ), + $ CC( NMAX*NMAX ), CS( NMAX*NMAX ), CT( NMAX ), + $ W( 2*NMAX ) + REAL G( NMAX ) + INTEGER IDIM( NIDMAX ) + LOGICAL LTEST( NSUBS ) + CHARACTER*13 SNAMES( NSUBS ) +* .. External Functions .. + REAL SDIFF + LOGICAL LCE + EXTERNAL SDIFF, LCE +* .. External Subroutines .. + EXTERNAL CCHK1, CCHK2, CCHK3, CCHK4, CCHK5, CMMCH +* .. Intrinsic Functions .. + INTRINSIC MAX, MIN +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL LERR, OK + CHARACTER*13 SRNAMT +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK, LERR + COMMON /SRNAMC/SRNAMT +* .. Data statements .. + DATA SNAMES/'cblas_cgemm3m ', 'cblas_chemm ', + $ 'cblas_csymm ', 'cblas_ctrmm ', 'cblas_ctrsm ', + $ 'cblas_cherk ', 'cblas_csyrk ', 'cblas_cher2k', + $ 'cblas_csyr2k'/ +* .. Executable Statements .. +* + NOUTC = NOUT +* +* Read name and unit number for snapshot output file and open file. +* + READ( NIN, FMT = * )SNAPS + READ( NIN, FMT = * )NTRA + TRACE = NTRA.GE.0 + IF( TRACE )THEN + OPEN( NTRA, FILE = SNAPS ) + END IF +* Read the flag that directs rewinding of the snapshot file. + READ( NIN, FMT = * )REWI + REWI = REWI.AND.TRACE +* Read the flag that directs stopping on any failure. 
+ READ( NIN, FMT = * )SFATAL +* Read the flag that indicates whether error exits are to be tested. + READ( NIN, FMT = * )TSTERR +* Read the flag that indicates whether row-major data layout to be tested. + READ( NIN, FMT = * )LAYOUT +* Read the threshold value of the test ratio + READ( NIN, FMT = * )THRESH +* +* Read and check the parameter values for the tests. +* +* Values of N + READ( NIN, FMT = * )NIDIM + IF( NIDIM.LT.1.OR.NIDIM.GT.NIDMAX )THEN + WRITE( NOUT, FMT = 9997 )'N', NIDMAX + GO TO 220 + END IF + READ( NIN, FMT = * )( IDIM( I ), I = 1, NIDIM ) + DO 10 I = 1, NIDIM + IF( IDIM( I ).LT.0.OR.IDIM( I ).GT.NMAX )THEN + WRITE( NOUT, FMT = 9996 )NMAX + GO TO 220 + END IF + 10 CONTINUE +* Values of ALPHA + READ( NIN, FMT = * )NALF + IF( NALF.LT.1.OR.NALF.GT.NALMAX )THEN + WRITE( NOUT, FMT = 9997 )'ALPHA', NALMAX + GO TO 220 + END IF + READ( NIN, FMT = * )( ALF( I ), I = 1, NALF ) +* Values of BETA + READ( NIN, FMT = * )NBET + IF( NBET.LT.1.OR.NBET.GT.NBEMAX )THEN + WRITE( NOUT, FMT = 9997 )'BETA', NBEMAX + GO TO 220 + END IF + READ( NIN, FMT = * )( BET( I ), I = 1, NBET ) +* +* Report values of parameters. +* + WRITE( NOUT, FMT = 9995 ) + WRITE( NOUT, FMT = 9994 )( IDIM( I ), I = 1, NIDIM ) + WRITE( NOUT, FMT = 9993 )( ALF( I ), I = 1, NALF ) + WRITE( NOUT, FMT = 9992 )( BET( I ), I = 1, NBET ) + IF( .NOT.TSTERR )THEN + WRITE( NOUT, FMT = * ) + WRITE( NOUT, FMT = 9984 ) + END IF + WRITE( NOUT, FMT = * ) + WRITE( NOUT, FMT = 9999 )THRESH + WRITE( NOUT, FMT = * ) + + RORDER = .FALSE. + CORDER = .FALSE. + IF (LAYOUT.EQ.2) THEN + RORDER = .TRUE. + CORDER = .TRUE. + WRITE( *, FMT = 10002 ) + ELSE IF (LAYOUT.EQ.1) THEN + RORDER = .TRUE. + WRITE( *, FMT = 10001 ) + ELSE IF (LAYOUT.EQ.0) THEN + CORDER = .TRUE. + WRITE( *, FMT = 10000 ) + END IF + WRITE( *, FMT = * ) + +* +* Read names of subroutines and flags which indicate +* whether they are to be tested. +* + DO 20 I = 1, NSUBS + LTEST( I ) = .FALSE. 
+ 20 CONTINUE + 30 READ( NIN, FMT = 9988, END = 60 )SNAMET, LTESTT + DO 40 I = 1, NSUBS + IF( SNAMET.EQ.SNAMES( I ) ) + $ GO TO 50 + 40 CONTINUE + WRITE( NOUT, FMT = 9990 )SNAMET + STOP + 50 LTEST( I ) = LTESTT + GO TO 30 +* + 60 CONTINUE + CLOSE ( NIN ) +* +* Compute EPS (the machine precision). +* + EPS = RONE + 70 CONTINUE + IF( SDIFF( RONE + EPS, RONE ).EQ.RZERO ) + $ GO TO 80 + EPS = RHALF*EPS + GO TO 70 + 80 CONTINUE + EPS = EPS + EPS + WRITE( NOUT, FMT = 9998 )EPS +* +* Check the reliability of CMMCH using exact data. +* + N = MIN( 32, NMAX ) + DO 100 J = 1, N + DO 90 I = 1, N + AB( I, J ) = MAX( I - J + 1, 0 ) + 90 CONTINUE + AB( J, NMAX + 1 ) = J + AB( 1, NMAX + J ) = J + C( J, 1 ) = ZERO + 100 CONTINUE + DO 110 J = 1, N + CC( J ) = J*( ( J + 1 )*J )/2 - ( ( J + 1 )*J*( J - 1 ) )/3 + 110 CONTINUE +* CC holds the exact result. On exit from CMMCH CT holds +* the result computed by CMMCH. + TRANSA = 'N' + TRANSB = 'N' + CALL CMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX, + $ AB( 1, NMAX + 1 ), NMAX, ZERO, C, NMAX, CT, G, CC, + $ NMAX, EPS, ERR, FATAL, NOUT, .TRUE. ) + SAME = LCE( CC, CT, N ) + IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN + WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR + STOP + END IF + TRANSB = 'C' + CALL CMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX, + $ AB( 1, NMAX + 1 ), NMAX, ZERO, C, NMAX, CT, G, CC, + $ NMAX, EPS, ERR, FATAL, NOUT, .TRUE. ) + SAME = LCE( CC, CT, N ) + IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN + WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR + STOP + END IF + DO 120 J = 1, N + AB( J, NMAX + 1 ) = N - J + 1 + AB( 1, NMAX + J ) = N - J + 1 + 120 CONTINUE + DO 130 J = 1, N + CC( N - J + 1 ) = J*( ( J + 1 )*J )/2 - + $ ( ( J + 1 )*J*( J - 1 ) )/3 + 130 CONTINUE + TRANSA = 'C' + TRANSB = 'N' + CALL CMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX, + $ AB( 1, NMAX + 1 ), NMAX, ZERO, C, NMAX, CT, G, CC, + $ NMAX, EPS, ERR, FATAL, NOUT, .TRUE. 
) + SAME = LCE( CC, CT, N ) + IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN + WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR + STOP + END IF + TRANSB = 'C' + CALL CMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX, + $ AB( 1, NMAX + 1 ), NMAX, ZERO, C, NMAX, CT, G, CC, + $ NMAX, EPS, ERR, FATAL, NOUT, .TRUE. ) + SAME = LCE( CC, CT, N ) + IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN + WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR + STOP + END IF +* +* Test each subroutine in turn. +* + DO 200 ISNUM = 1, NSUBS + WRITE( NOUT, FMT = * ) + IF( .NOT.LTEST( ISNUM ) )THEN +* Subprogram is not to be tested. + WRITE( NOUT, FMT = 9987 )SNAMES( ISNUM ) + ELSE + SRNAMT = SNAMES( ISNUM ) +* Test error exits. + IF( TSTERR )THEN + CALL CC3CHKE( SNAMES( ISNUM ) ) + WRITE( NOUT, FMT = * ) + END IF +* Test computations. + INFOT = 0 + OK = .TRUE. + FATAL = .FALSE. + GO TO ( 140, 150, 150, 160, 160, 170, 170, + $ 180, 180 )ISNUM +* Test CGEMM, 01. + 140 IF (CORDER) THEN + CALL CCHK1(SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, + $ NMAX, AB, AA, AS, AB( 1, NMAX + 1 ), BB, BS, C, + $ CC, CS, CT, G, 0 ) + END IF + IF (RORDER) THEN + CALL CCHK1(SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, + $ NMAX, AB, AA, AS, AB( 1, NMAX + 1 ), BB, BS, C, + $ CC, CS, CT, G, 1 ) + END IF + GO TO 190 +* Test CHEMM, 02, CSYMM, 03. + 150 IF (CORDER) THEN + CALL CCHK2(SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, + $ NMAX, AB, AA, AS, AB( 1, NMAX + 1 ), BB, BS, C, + $ CC, CS, CT, G, 0 ) + END IF + IF (RORDER) THEN + CALL CCHK2(SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, + $ NMAX, AB, AA, AS, AB( 1, NMAX + 1 ), BB, BS, C, + $ CC, CS, CT, G, 1 ) + END IF + GO TO 190 +* Test CTRMM, 04, CTRSM, 05. 
+ 160 IF (CORDER) THEN + CALL CCHK3(SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NMAX, AB, + $ AA, AS, AB( 1, NMAX + 1 ), BB, BS, CT, G, C, + $ 0 ) + END IF + IF (RORDER) THEN + CALL CCHK3(SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NMAX, AB, + $ AA, AS, AB( 1, NMAX + 1 ), BB, BS, CT, G, C, + $ 1 ) + END IF + GO TO 190 +* Test CHERK, 06, CSYRK, 07. + 170 IF (CORDER) THEN + CALL CCHK4(SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, + $ NMAX, AB, AA, AS, AB( 1, NMAX + 1 ), BB, BS, C, + $ CC, CS, CT, G, 0 ) + END IF + IF (RORDER) THEN + CALL CCHK4(SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, + $ NMAX, AB, AA, AS, AB( 1, NMAX + 1 ), BB, BS, C, + $ CC, CS, CT, G, 1 ) + END IF + GO TO 190 +* Test CHER2K, 08, CSYR2K, 09. + 180 IF (CORDER) THEN + CALL CCHK5(SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, + $ NMAX, AB, AA, AS, BB, BS, C, CC, CS, CT, G, W, + $ 0 ) + END IF + IF (RORDER) THEN + CALL CCHK5(SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, + $ NMAX, AB, AA, AS, BB, BS, C, CC, CS, CT, G, W, + $ 1 ) + END IF + GO TO 190 +* + 190 IF( FATAL.AND.SFATAL ) + $ GO TO 210 + END IF + 200 CONTINUE + WRITE( NOUT, FMT = 9986 ) + GO TO 230 +* + 210 CONTINUE + WRITE( NOUT, FMT = 9985 ) + GO TO 230 +* + 220 CONTINUE + WRITE( NOUT, FMT = 9991 ) +* + 230 CONTINUE + IF( TRACE ) + $ CLOSE ( NTRA ) + CLOSE ( NOUT ) + STOP +* +10002 FORMAT( ' COLUMN-MAJOR AND ROW-MAJOR DATA LAYOUTS ARE TESTED' ) +10001 FORMAT(' ROW-MAJOR DATA LAYOUT IS TESTED' ) +10000 FORMAT(' COLUMN-MAJOR DATA LAYOUT IS TESTED' ) + 9999 FORMAT(' ROUTINES PASS COMPUTATIONAL TESTS IF TEST RATIO IS LES', + $ 'S THAN', F8.2 ) + 9998 FORMAT(' RELATIVE MACHINE PRECISION IS TAKEN TO BE', 1P, E9.1 ) + 9997 FORMAT(' NUMBER 
OF VALUES OF ', A, ' IS LESS THAN 1 OR GREATER ', + $ 'THAN ', I2 ) + 9996 FORMAT( ' VALUE OF N IS LESS THAN 0 OR GREATER THAN ', I2 ) + 9995 FORMAT(' TESTS OF THE COMPLEX LEVEL 3 BLAS', //' THE F', + $ 'OLLOWING PARAMETER VALUES WILL BE USED:' ) + 9994 FORMAT( ' FOR N ', 9I6 ) + 9993 FORMAT( ' FOR ALPHA ', + $ 7( '(', F4.1, ',', F4.1, ') ', : ) ) + 9992 FORMAT( ' FOR BETA ', + $ 7( '(', F4.1, ',', F4.1, ') ', : ) ) + 9991 FORMAT( ' AMEND DATA FILE OR INCREASE ARRAY SIZES IN PROGRAM', + $ /' ******* TESTS ABANDONED *******' ) + 9990 FORMAT(' SUBPROGRAM NAME ', A13,' NOT RECOGNIZED', /' ******* T', + $ 'ESTS ABANDONED *******' ) + 9989 FORMAT(' ERROR IN CMMCH - IN-LINE DOT PRODUCTS ARE BEING EVALU', + $ 'ATED WRONGLY.', /' CMMCH WAS CALLED WITH TRANSA = ', A1, + $ 'AND TRANSB = ', A1, /' AND RETURNED SAME = ', L1, ' AND ', + $ ' ERR = ', F12.3, '.', /' THIS MAY BE DUE TO FAULTS IN THE ', + $ 'ARITHMETIC OR THE COMPILER.', /' ******* TESTS ABANDONED ', + $ '*******' ) + 9988 FORMAT( A13,L2 ) + 9987 FORMAT( 1X, A13,' WAS NOT TESTED' ) + 9986 FORMAT( /' END OF TESTS' ) + 9985 FORMAT( /' ******* FATAL ERROR - TESTS ABANDONED *******' ) + 9984 FORMAT( ' ERROR-EXITS WILL NOT BE TESTED' ) +* +* End of CBLAT3. +* + END + SUBROUTINE CCHK1( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, + $ FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, NMAX, + $ A, AA, AS, B, BB, BS, C, CC, CS, CT, G, + $ IORDER ) +* +* Tests CGEMM. +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Parameters .. + COMPLEX ZERO + PARAMETER ( ZERO = ( 0.0, 0.0 ) ) + REAL RZERO + PARAMETER ( RZERO = 0.0 ) +* .. Scalar Arguments .. + REAL EPS, THRESH + INTEGER NALF, NBET, NIDIM, NMAX, NOUT, NTRA, IORDER + LOGICAL FATAL, REWI, TRACE + CHARACTER*13 SNAME +* .. Array Arguments .. 
+ COMPLEX A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), + $ AS( NMAX*NMAX ), B( NMAX, NMAX ), + $ BB( NMAX*NMAX ), BET( NBET ), BS( NMAX*NMAX ), + $ C( NMAX, NMAX ), CC( NMAX*NMAX ), + $ CS( NMAX*NMAX ), CT( NMAX ) + REAL G( NMAX ) + INTEGER IDIM( NIDIM ) +* .. Local Scalars .. + COMPLEX ALPHA, ALS, BETA, BLS + REAL ERR, ERRMAX + INTEGER I, IA, IB, ICA, ICB, IK, IM, IN, K, KS, LAA, + $ LBB, LCC, LDA, LDAS, LDB, LDBS, LDC, LDCS, M, + $ MA, MB, MS, N, NA, NARGS, NB, NC, NS + LOGICAL NULL, RESET, SAME, TRANA, TRANB + CHARACTER*1 TRANAS, TRANBS, TRANSA, TRANSB + CHARACTER*3 ICH +* .. Local Arrays .. + LOGICAL ISAME( 13 ) +* .. External Functions .. + LOGICAL LCE, LCERES + EXTERNAL LCE, LCERES +* .. External Subroutines .. + EXTERNAL CCGEMM3M, CMAKE, CMMCH +* .. Intrinsic Functions .. + INTRINSIC MAX +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL LERR, OK +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK, LERR +* .. Data statements .. + DATA ICH/'NTC'/ +* .. Executable Statements .. +* + NARGS = 13 + NC = 0 + RESET = .TRUE. + ERRMAX = RZERO +* + DO 110 IM = 1, NIDIM + M = IDIM( IM ) +* + DO 100 IN = 1, NIDIM + N = IDIM( IN ) +* Set LDC to 1 more than minimum value if room. + LDC = M + IF( LDC.LT.NMAX ) + $ LDC = LDC + 1 +* Skip tests if not enough room. + IF( LDC.GT.NMAX ) + $ GO TO 100 + LCC = LDC*N + NULL = N.LE.0.OR.M.LE.0 +* + DO 90 IK = 1, NIDIM + K = IDIM( IK ) +* + DO 80 ICA = 1, 3 + TRANSA = ICH( ICA: ICA ) + TRANA = TRANSA.EQ.'T'.OR.TRANSA.EQ.'C' +* + IF( TRANA )THEN + MA = K + NA = M + ELSE + MA = M + NA = K + END IF +* Set LDA to 1 more than minimum value if room. + LDA = MA + IF( LDA.LT.NMAX ) + $ LDA = LDA + 1 +* Skip tests if not enough room. + IF( LDA.GT.NMAX ) + $ GO TO 80 + LAA = LDA*NA +* +* Generate the matrix A. 
+* + CALL CMAKE( 'ge', ' ', ' ', MA, NA, A, NMAX, AA, LDA, + $ RESET, ZERO ) +* + DO 70 ICB = 1, 3 + TRANSB = ICH( ICB: ICB ) + TRANB = TRANSB.EQ.'T'.OR.TRANSB.EQ.'C' +* + IF( TRANB )THEN + MB = N + NB = K + ELSE + MB = K + NB = N + END IF +* Set LDB to 1 more than minimum value if room. + LDB = MB + IF( LDB.LT.NMAX ) + $ LDB = LDB + 1 +* Skip tests if not enough room. + IF( LDB.GT.NMAX ) + $ GO TO 70 + LBB = LDB*NB +* +* Generate the matrix B. +* + CALL CMAKE( 'ge', ' ', ' ', MB, NB, B, NMAX, BB, + $ LDB, RESET, ZERO ) +* + DO 60 IA = 1, NALF + ALPHA = ALF( IA ) +* + DO 50 IB = 1, NBET + BETA = BET( IB ) +* +* Generate the matrix C. +* + CALL CMAKE( 'ge', ' ', ' ', M, N, C, NMAX, + $ CC, LDC, RESET, ZERO ) +* + NC = NC + 1 +* +* Save every datum before calling the +* subroutine. +* + TRANAS = TRANSA + TRANBS = TRANSB + MS = M + NS = N + KS = K + ALS = ALPHA + DO 10 I = 1, LAA + AS( I ) = AA( I ) + 10 CONTINUE + LDAS = LDA + DO 20 I = 1, LBB + BS( I ) = BB( I ) + 20 CONTINUE + LDBS = LDB + BLS = BETA + DO 30 I = 1, LCC + CS( I ) = CC( I ) + 30 CONTINUE + LDCS = LDC +* +* Call the subroutine. +* + IF( TRACE ) + $ CALL CPRCN1(NTRA, NC, SNAME, IORDER, + $ TRANSA, TRANSB, M, N, K, ALPHA, LDA, + $ LDB, BETA, LDC) + IF( REWI ) + $ REWIND NTRA + CALL CCGEMM3M( IORDER, TRANSA, TRANSB, M, N, + $ K, ALPHA, AA, LDA, BB, LDB, + $ BETA, CC, LDC ) +* +* Check if error-exit was taken incorrectly. +* + IF( .NOT.OK )THEN + WRITE( NOUT, FMT = 9994 ) + FATAL = .TRUE. + GO TO 120 + END IF +* +* See what data changed inside subroutines. 
+* + ISAME( 1 ) = TRANSA.EQ.TRANAS + ISAME( 2 ) = TRANSB.EQ.TRANBS + ISAME( 3 ) = MS.EQ.M + ISAME( 4 ) = NS.EQ.N + ISAME( 5 ) = KS.EQ.K + ISAME( 6 ) = ALS.EQ.ALPHA + ISAME( 7 ) = LCE( AS, AA, LAA ) + ISAME( 8 ) = LDAS.EQ.LDA + ISAME( 9 ) = LCE( BS, BB, LBB ) + ISAME( 10 ) = LDBS.EQ.LDB + ISAME( 11 ) = BLS.EQ.BETA + IF( NULL )THEN + ISAME( 12 ) = LCE( CS, CC, LCC ) + ELSE + ISAME( 12 ) = LCERES( 'ge', ' ', M, N, CS, + $ CC, LDC ) + END IF + ISAME( 13 ) = LDCS.EQ.LDC +* +* If data was incorrectly changed, report +* and return. +* + SAME = .TRUE. + DO 40 I = 1, NARGS + SAME = SAME.AND.ISAME( I ) + IF( .NOT.ISAME( I ) ) + $ WRITE( NOUT, FMT = 9998 )I + 40 CONTINUE + IF( .NOT.SAME )THEN + FATAL = .TRUE. + GO TO 120 + END IF +* + IF( .NOT.NULL )THEN +* +* Check the result. +* + CALL CMMCH( TRANSA, TRANSB, M, N, K, + $ ALPHA, A, NMAX, B, NMAX, BETA, + $ C, NMAX, CT, G, CC, LDC, EPS, + $ ERR, FATAL, NOUT, .TRUE. ) + ERRMAX = MAX( ERRMAX, ERR ) +* If got really bad answer, report and +* return. + IF( FATAL ) + $ GO TO 120 + END IF +* + 50 CONTINUE +* + 60 CONTINUE +* + 70 CONTINUE +* + 80 CONTINUE +* + 90 CONTINUE +* + 100 CONTINUE +* + 110 CONTINUE +* +* Report result. 
+* + IF( ERRMAX.LT.THRESH )THEN + IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10000 )SNAME, NC + IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10001 )SNAME, NC + ELSE + IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10002 )SNAME, NC, ERRMAX + IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10003 )SNAME, NC, ERRMAX + END IF + GO TO 130 +* + 120 CONTINUE + WRITE( NOUT, FMT = 9996 )SNAME + CALL CPRCN1(NOUT, NC, SNAME, IORDER, TRANSA, TRANSB, + $ M, N, K, ALPHA, LDA, LDB, BETA, LDC) +* + 130 CONTINUE + RETURN +* +10003 FORMAT( ' ', A13,' COMPLETED THE ROW-MAJOR COMPUTATIONAL ', + $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', + $ 'RATIO ', F8.2, ' - SUSPECT *******' ) +10002 FORMAT( ' ', A13,' COMPLETED THE COLUMN-MAJOR COMPUTATIONAL ', + $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', + $ 'RATIO ', F8.2, ' - SUSPECT *******' ) +10001 FORMAT( ' ', A13,' PASSED THE ROW-MAJOR COMPUTATIONAL TESTS', + $ ' (', I6, ' CALL', 'S)' ) +10000 FORMAT( ' ', A13,' PASSED THE COLUMN-MAJOR COMPUTATIONAL TESTS', + $ ' (', I6, ' CALL', 'S)' ) + 9998 FORMAT(' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', + $ 'ANGED INCORRECTLY *******' ) + 9996 FORMAT( ' ******* ', A13,' FAILED ON CALL NUMBER:' ) + 9995 FORMAT( 1X, I6, ': ', A13,'(''', A1, ''',''', A1, ''',', + $ 3( I3, ',' ), '(', F4.1, ',', F4.1, '), A,', I3, ', B,', I3, + $ ',(', F4.1, ',', F4.1, '), C,', I3, ').' ) + 9994 FORMAT(' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', + $ '******' ) +* +* End of CCHK1. 
+* + END +* + SUBROUTINE CPRCN1(NOUT, NC, SNAME, IORDER, TRANSA, TRANSB, M, N, + $ K, ALPHA, LDA, LDB, BETA, LDC) + INTEGER NOUT, NC, IORDER, M, N, K, LDA, LDB, LDC + COMPLEX ALPHA, BETA + CHARACTER*1 TRANSA, TRANSB + CHARACTER*13 SNAME + CHARACTER*14 CRC, CTA,CTB + + IF (TRANSA.EQ.'N')THEN + CTA = ' CblasNoTrans' + ELSE IF (TRANSA.EQ.'T')THEN + CTA = ' CblasTrans' + ELSE + CTA = 'CblasConjTrans' + END IF + IF (TRANSB.EQ.'N')THEN + CTB = ' CblasNoTrans' + ELSE IF (TRANSB.EQ.'T')THEN + CTB = ' CblasTrans' + ELSE + CTB = 'CblasConjTrans' + END IF + IF (IORDER.EQ.1)THEN + CRC = ' CblasRowMajor' + ELSE + CRC = ' CblasColMajor' + END IF + WRITE(NOUT, FMT = 9995)NC,SNAME,CRC, CTA,CTB + WRITE(NOUT, FMT = 9994)M, N, K, ALPHA, LDA, LDB, BETA, LDC + + 9995 FORMAT( 1X, I6, ': ', A13,'(', A14, ',', A14, ',', A14, ',') + 9994 FORMAT( 10X, 3( I3, ',' ) ,' (', F4.1,',',F4.1,') , A,', + $ I3, ', B,', I3, ', (', F4.1,',',F4.1,') , C,', I3, ').' ) + END +* + SUBROUTINE CCHK2( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, + $ FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, NMAX, + $ A, AA, AS, B, BB, BS, C, CC, CS, CT, G, + $ IORDER ) +* +* Tests CHEMM and CSYMM. +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Parameters .. + COMPLEX ZERO + PARAMETER ( ZERO = ( 0.0, 0.0 ) ) + REAL RZERO + PARAMETER ( RZERO = 0.0 ) +* .. Scalar Arguments .. + REAL EPS, THRESH + INTEGER NALF, NBET, NIDIM, NMAX, NOUT, NTRA, IORDER + LOGICAL FATAL, REWI, TRACE + CHARACTER*13 SNAME +* .. Array Arguments .. + COMPLEX A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), + $ AS( NMAX*NMAX ), B( NMAX, NMAX ), + $ BB( NMAX*NMAX ), BET( NBET ), BS( NMAX*NMAX ), + $ C( NMAX, NMAX ), CC( NMAX*NMAX ), + $ CS( NMAX*NMAX ), CT( NMAX ) + REAL G( NMAX ) + INTEGER IDIM( NIDIM ) +* .. Local Scalars .. 
+ COMPLEX ALPHA, ALS, BETA, BLS + REAL ERR, ERRMAX + INTEGER I, IA, IB, ICS, ICU, IM, IN, LAA, LBB, LCC, + $ LDA, LDAS, LDB, LDBS, LDC, LDCS, M, MS, N, NA, + $ NARGS, NC, NS + LOGICAL CONJ, LEFT, NULL, RESET, SAME + CHARACTER*1 SIDE, SIDES, UPLO, UPLOS + CHARACTER*2 ICHS, ICHU +* .. Local Arrays .. + LOGICAL ISAME( 13 ) +* .. External Functions .. + LOGICAL LCE, LCERES + EXTERNAL LCE, LCERES +* .. External Subroutines .. + EXTERNAL CCHEMM, CMAKE, CMMCH, CCSYMM +* .. Intrinsic Functions .. + INTRINSIC MAX +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL LERR, OK +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK, LERR +* .. Data statements .. + DATA ICHS/'LR'/, ICHU/'UL'/ +* .. Executable Statements .. + CONJ = SNAME( 8: 9 ).EQ.'he' +* + NARGS = 12 + NC = 0 + RESET = .TRUE. + ERRMAX = RZERO +* + DO 100 IM = 1, NIDIM + M = IDIM( IM ) +* + DO 90 IN = 1, NIDIM + N = IDIM( IN ) +* Set LDC to 1 more than minimum value if room. + LDC = M + IF( LDC.LT.NMAX ) + $ LDC = LDC + 1 +* Skip tests if not enough room. + IF( LDC.GT.NMAX ) + $ GO TO 90 + LCC = LDC*N + NULL = N.LE.0.OR.M.LE.0 +* Set LDB to 1 more than minimum value if room. + LDB = M + IF( LDB.LT.NMAX ) + $ LDB = LDB + 1 +* Skip tests if not enough room. + IF( LDB.GT.NMAX ) + $ GO TO 90 + LBB = LDB*N +* +* Generate the matrix B. +* + CALL CMAKE( 'ge', ' ', ' ', M, N, B, NMAX, BB, LDB, RESET, + $ ZERO ) +* + DO 80 ICS = 1, 2 + SIDE = ICHS( ICS: ICS ) + LEFT = SIDE.EQ.'L' +* + IF( LEFT )THEN + NA = M + ELSE + NA = N + END IF +* Set LDA to 1 more than minimum value if room. + LDA = NA + IF( LDA.LT.NMAX ) + $ LDA = LDA + 1 +* Skip tests if not enough room. + IF( LDA.GT.NMAX ) + $ GO TO 80 + LAA = LDA*NA +* + DO 70 ICU = 1, 2 + UPLO = ICHU( ICU: ICU ) +* +* Generate the hermitian or symmetric matrix A. +* + CALL CMAKE(SNAME( 8: 9 ), UPLO, ' ', NA, NA, A, NMAX, + $ AA, LDA, RESET, ZERO ) +* + DO 60 IA = 1, NALF + ALPHA = ALF( IA ) +* + DO 50 IB = 1, NBET + BETA = BET( IB ) +* +* Generate the matrix C. 
+* + CALL CMAKE( 'ge', ' ', ' ', M, N, C, NMAX, CC, + $ LDC, RESET, ZERO ) +* + NC = NC + 1 +* +* Save every datum before calling the +* subroutine. +* + SIDES = SIDE + UPLOS = UPLO + MS = M + NS = N + ALS = ALPHA + DO 10 I = 1, LAA + AS( I ) = AA( I ) + 10 CONTINUE + LDAS = LDA + DO 20 I = 1, LBB + BS( I ) = BB( I ) + 20 CONTINUE + LDBS = LDB + BLS = BETA + DO 30 I = 1, LCC + CS( I ) = CC( I ) + 30 CONTINUE + LDCS = LDC +* +* Call the subroutine. +* + IF( TRACE ) + $ CALL CPRCN2(NTRA, NC, SNAME, IORDER, + $ SIDE, UPLO, M, N, ALPHA, LDA, LDB, + $ BETA, LDC) + IF( REWI ) + $ REWIND NTRA + IF( CONJ )THEN + CALL CCHEMM( IORDER, SIDE, UPLO, M, N, + $ ALPHA, AA, LDA, BB, LDB, BETA, + $ CC, LDC ) + ELSE + CALL CCSYMM( IORDER, SIDE, UPLO, M, N, + $ ALPHA, AA, LDA, BB, LDB, BETA, + $ CC, LDC ) + END IF +* +* Check if error-exit was taken incorrectly. +* + IF( .NOT.OK )THEN + WRITE( NOUT, FMT = 9994 ) + FATAL = .TRUE. + GO TO 110 + END IF +* +* See what data changed inside subroutines. +* + ISAME( 1 ) = SIDES.EQ.SIDE + ISAME( 2 ) = UPLOS.EQ.UPLO + ISAME( 3 ) = MS.EQ.M + ISAME( 4 ) = NS.EQ.N + ISAME( 5 ) = ALS.EQ.ALPHA + ISAME( 6 ) = LCE( AS, AA, LAA ) + ISAME( 7 ) = LDAS.EQ.LDA + ISAME( 8 ) = LCE( BS, BB, LBB ) + ISAME( 9 ) = LDBS.EQ.LDB + ISAME( 10 ) = BLS.EQ.BETA + IF( NULL )THEN + ISAME( 11 ) = LCE( CS, CC, LCC ) + ELSE + ISAME( 11 ) = LCERES( 'ge', ' ', M, N, CS, + $ CC, LDC ) + END IF + ISAME( 12 ) = LDCS.EQ.LDC +* +* If data was incorrectly changed, report and +* return. +* + SAME = .TRUE. + DO 40 I = 1, NARGS + SAME = SAME.AND.ISAME( I ) + IF( .NOT.ISAME( I ) ) + $ WRITE( NOUT, FMT = 9998 )I + 40 CONTINUE + IF( .NOT.SAME )THEN + FATAL = .TRUE. + GO TO 110 + END IF +* + IF( .NOT.NULL )THEN +* +* Check the result. +* + IF( LEFT )THEN + CALL CMMCH( 'N', 'N', M, N, M, ALPHA, A, + $ NMAX, B, NMAX, BETA, C, NMAX, + $ CT, G, CC, LDC, EPS, ERR, + $ FATAL, NOUT, .TRUE. 
) + ELSE + CALL CMMCH( 'N', 'N', M, N, N, ALPHA, B, + $ NMAX, A, NMAX, BETA, C, NMAX, + $ CT, G, CC, LDC, EPS, ERR, + $ FATAL, NOUT, .TRUE. ) + END IF + ERRMAX = MAX( ERRMAX, ERR ) +* If got really bad answer, report and +* return. + IF( FATAL ) + $ GO TO 110 + END IF +* + 50 CONTINUE +* + 60 CONTINUE +* + 70 CONTINUE +* + 80 CONTINUE +* + 90 CONTINUE +* + 100 CONTINUE +* +* Report result. +* + IF( ERRMAX.LT.THRESH )THEN + IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10000 )SNAME, NC + IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10001 )SNAME, NC + ELSE + IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10002 )SNAME, NC, ERRMAX + IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10003 )SNAME, NC, ERRMAX + END IF + GO TO 120 +* + 110 CONTINUE + WRITE( NOUT, FMT = 9996 )SNAME + CALL CPRCN2(NOUT, NC, SNAME, IORDER, SIDE, UPLO, M, N, ALPHA, LDA, + $ LDB, BETA, LDC) +* + 120 CONTINUE + RETURN +* +10003 FORMAT( ' ', A13,' COMPLETED THE ROW-MAJOR COMPUTATIONAL ', + $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', + $ 'RATIO ', F8.2, ' - SUSPECT *******' ) +10002 FORMAT( ' ', A13,' COMPLETED THE COLUMN-MAJOR COMPUTATIONAL ', + $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', + $ 'RATIO ', F8.2, ' - SUSPECT *******' ) +10001 FORMAT( ' ', A13,' PASSED THE ROW-MAJOR COMPUTATIONAL TESTS', + $ ' (', I6, ' CALL', 'S)' ) +10000 FORMAT( ' ', A13,' PASSED THE COLUMN-MAJOR COMPUTATIONAL TESTS', + $ ' (', I6, ' CALL', 'S)' ) + 9998 FORMAT(' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', + $ 'ANGED INCORRECTLY *******' ) + 9996 FORMAT( ' ******* ', A13,' FAILED ON CALL NUMBER:' ) + 9995 FORMAT(1X, I6, ': ', A13,'(', 2( '''', A1, ''',' ), 2( I3, ',' ), + $ '(', F4.1, ',', F4.1, '), A,', I3, ', B,', I3, ',(', F4.1, + $ ',', F4.1, '), C,', I3, ') .' ) + 9994 FORMAT(' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', + $ '******' ) +* +* End of CCHK2. 
+* + END +* + SUBROUTINE CPRCN2(NOUT, NC, SNAME, IORDER, SIDE, UPLO, M, N, + $ ALPHA, LDA, LDB, BETA, LDC) + INTEGER NOUT, NC, IORDER, M, N, LDA, LDB, LDC + COMPLEX ALPHA, BETA + CHARACTER*1 SIDE, UPLO + CHARACTER*13 SNAME + CHARACTER*14 CRC, CS,CU + + IF (SIDE.EQ.'L')THEN + CS = ' CblasLeft' + ELSE + CS = ' CblasRight' + END IF + IF (UPLO.EQ.'U')THEN + CU = ' CblasUpper' + ELSE + CU = ' CblasLower' + END IF + IF (IORDER.EQ.1)THEN + CRC = ' CblasRowMajor' + ELSE + CRC = ' CblasColMajor' + END IF + WRITE(NOUT, FMT = 9995)NC,SNAME,CRC, CS,CU + WRITE(NOUT, FMT = 9994)M, N, ALPHA, LDA, LDB, BETA, LDC + + 9995 FORMAT( 1X, I6, ': ', A13,'(', A14, ',', A14, ',', A14, ',') + 9994 FORMAT( 10X, 2( I3, ',' ),' (',F4.1,',',F4.1, '), A,', I3, + $ ', B,', I3, ', (',F4.1,',',F4.1, '), ', 'C,', I3, ').' ) + END +* + SUBROUTINE CCHK3( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, + $ FATAL, NIDIM, IDIM, NALF, ALF, NMAX, A, AA, AS, + $ B, BB, BS, CT, G, C, IORDER ) +* +* Tests CTRMM and CTRSM. +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Parameters .. + COMPLEX ZERO, ONE + PARAMETER ( ZERO = ( 0.0, 0.0 ), ONE = ( 1.0, 0.0 ) ) + REAL RZERO + PARAMETER ( RZERO = 0.0 ) +* .. Scalar Arguments .. + REAL EPS, THRESH + INTEGER NALF, NIDIM, NMAX, NOUT, NTRA, IORDER + LOGICAL FATAL, REWI, TRACE + CHARACTER*13 SNAME +* .. Array Arguments .. + COMPLEX A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), + $ AS( NMAX*NMAX ), B( NMAX, NMAX ), + $ BB( NMAX*NMAX ), BS( NMAX*NMAX ), + $ C( NMAX, NMAX ), CT( NMAX ) + REAL G( NMAX ) + INTEGER IDIM( NIDIM ) +* .. Local Scalars .. 
+ COMPLEX ALPHA, ALS + REAL ERR, ERRMAX + INTEGER I, IA, ICD, ICS, ICT, ICU, IM, IN, J, LAA, LBB, + $ LDA, LDAS, LDB, LDBS, M, MS, N, NA, NARGS, NC, + $ NS + LOGICAL LEFT, NULL, RESET, SAME + CHARACTER*1 DIAG, DIAGS, SIDE, SIDES, TRANAS, TRANSA, UPLO, + $ UPLOS + CHARACTER*2 ICHD, ICHS, ICHU + CHARACTER*3 ICHT +* .. Local Arrays .. + LOGICAL ISAME( 13 ) +* .. External Functions .. + LOGICAL LCE, LCERES + EXTERNAL LCE, LCERES +* .. External Subroutines .. + EXTERNAL CMAKE, CMMCH, CCTRMM, CCTRSM +* .. Intrinsic Functions .. + INTRINSIC MAX +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL LERR, OK +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK, LERR +* .. Data statements .. + DATA ICHU/'UL'/, ICHT/'NTC'/, ICHD/'UN'/, ICHS/'LR'/ +* .. Executable Statements .. +* + NARGS = 11 + NC = 0 + RESET = .TRUE. + ERRMAX = RZERO +* Set up zero matrix for CMMCH. + DO 20 J = 1, NMAX + DO 10 I = 1, NMAX + C( I, J ) = ZERO + 10 CONTINUE + 20 CONTINUE +* + DO 140 IM = 1, NIDIM + M = IDIM( IM ) +* + DO 130 IN = 1, NIDIM + N = IDIM( IN ) +* Set LDB to 1 more than minimum value if room. + LDB = M + IF( LDB.LT.NMAX ) + $ LDB = LDB + 1 +* Skip tests if not enough room. + IF( LDB.GT.NMAX ) + $ GO TO 130 + LBB = LDB*N + NULL = M.LE.0.OR.N.LE.0 +* + DO 120 ICS = 1, 2 + SIDE = ICHS( ICS: ICS ) + LEFT = SIDE.EQ.'L' + IF( LEFT )THEN + NA = M + ELSE + NA = N + END IF +* Set LDA to 1 more than minimum value if room. + LDA = NA + IF( LDA.LT.NMAX ) + $ LDA = LDA + 1 +* Skip tests if not enough room. + IF( LDA.GT.NMAX ) + $ GO TO 130 + LAA = LDA*NA +* + DO 110 ICU = 1, 2 + UPLO = ICHU( ICU: ICU ) +* + DO 100 ICT = 1, 3 + TRANSA = ICHT( ICT: ICT ) +* + DO 90 ICD = 1, 2 + DIAG = ICHD( ICD: ICD ) +* + DO 80 IA = 1, NALF + ALPHA = ALF( IA ) +* +* Generate the matrix A. +* + CALL CMAKE( 'tr', UPLO, DIAG, NA, NA, A, + $ NMAX, AA, LDA, RESET, ZERO ) +* +* Generate the matrix B. 
+* + CALL CMAKE( 'ge', ' ', ' ', M, N, B, NMAX, + $ BB, LDB, RESET, ZERO ) +* + NC = NC + 1 +* +* Save every datum before calling the +* subroutine. +* + SIDES = SIDE + UPLOS = UPLO + TRANAS = TRANSA + DIAGS = DIAG + MS = M + NS = N + ALS = ALPHA + DO 30 I = 1, LAA + AS( I ) = AA( I ) + 30 CONTINUE + LDAS = LDA + DO 40 I = 1, LBB + BS( I ) = BB( I ) + 40 CONTINUE + LDBS = LDB +* +* Call the subroutine. +* + IF( SNAME( 10: 11 ).EQ.'mm' )THEN + IF( TRACE ) + $ CALL CPRCN3( NTRA, NC, SNAME, IORDER, + $ SIDE, UPLO, TRANSA, DIAG, M, N, ALPHA, + $ LDA, LDB) + IF( REWI ) + $ REWIND NTRA + CALL CCTRMM(IORDER, SIDE, UPLO, TRANSA, + $ DIAG, M, N, ALPHA, AA, LDA, + $ BB, LDB ) + ELSE IF( SNAME( 10: 11 ).EQ.'sm' )THEN + IF( TRACE ) + $ CALL CPRCN3( NTRA, NC, SNAME, IORDER, + $ SIDE, UPLO, TRANSA, DIAG, M, N, ALPHA, + $ LDA, LDB) + IF( REWI ) + $ REWIND NTRA + CALL CCTRSM(IORDER, SIDE, UPLO, TRANSA, + $ DIAG, M, N, ALPHA, AA, LDA, + $ BB, LDB ) + END IF +* +* Check if error-exit was taken incorrectly. +* + IF( .NOT.OK )THEN + WRITE( NOUT, FMT = 9994 ) + FATAL = .TRUE. + GO TO 150 + END IF +* +* See what data changed inside subroutines. +* + ISAME( 1 ) = SIDES.EQ.SIDE + ISAME( 2 ) = UPLOS.EQ.UPLO + ISAME( 3 ) = TRANAS.EQ.TRANSA + ISAME( 4 ) = DIAGS.EQ.DIAG + ISAME( 5 ) = MS.EQ.M + ISAME( 6 ) = NS.EQ.N + ISAME( 7 ) = ALS.EQ.ALPHA + ISAME( 8 ) = LCE( AS, AA, LAA ) + ISAME( 9 ) = LDAS.EQ.LDA + IF( NULL )THEN + ISAME( 10 ) = LCE( BS, BB, LBB ) + ELSE + ISAME( 10 ) = LCERES( 'ge', ' ', M, N, BS, + $ BB, LDB ) + END IF + ISAME( 11 ) = LDBS.EQ.LDB +* +* If data was incorrectly changed, report and +* return. +* + SAME = .TRUE. + DO 50 I = 1, NARGS + SAME = SAME.AND.ISAME( I ) + IF( .NOT.ISAME( I ) ) + $ WRITE( NOUT, FMT = 9998 )I + 50 CONTINUE + IF( .NOT.SAME )THEN + FATAL = .TRUE. + GO TO 150 + END IF +* + IF( .NOT.NULL )THEN + IF( SNAME( 10: 11 ).EQ.'mm' )THEN +* +* Check the result. 
+* + IF( LEFT )THEN + CALL CMMCH( TRANSA, 'N', M, N, M, + $ ALPHA, A, NMAX, B, NMAX, + $ ZERO, C, NMAX, CT, G, + $ BB, LDB, EPS, ERR, + $ FATAL, NOUT, .TRUE. ) + ELSE + CALL CMMCH( 'N', TRANSA, M, N, N, + $ ALPHA, B, NMAX, A, NMAX, + $ ZERO, C, NMAX, CT, G, + $ BB, LDB, EPS, ERR, + $ FATAL, NOUT, .TRUE. ) + END IF + ELSE IF( SNAME( 10: 11 ).EQ.'sm' )THEN +* +* Compute approximation to original +* matrix. +* + DO 70 J = 1, N + DO 60 I = 1, M + C( I, J ) = BB( I + ( J - 1 )* + $ LDB ) + BB( I + ( J - 1 )*LDB ) = ALPHA* + $ B( I, J ) + 60 CONTINUE + 70 CONTINUE +* + IF( LEFT )THEN + CALL CMMCH( TRANSA, 'N', M, N, M, + $ ONE, A, NMAX, C, NMAX, + $ ZERO, B, NMAX, CT, G, + $ BB, LDB, EPS, ERR, + $ FATAL, NOUT, .FALSE. ) + ELSE + CALL CMMCH( 'N', TRANSA, M, N, N, + $ ONE, C, NMAX, A, NMAX, + $ ZERO, B, NMAX, CT, G, + $ BB, LDB, EPS, ERR, + $ FATAL, NOUT, .FALSE. ) + END IF + END IF + ERRMAX = MAX( ERRMAX, ERR ) +* If got really bad answer, report and +* return. + IF( FATAL ) + $ GO TO 150 + END IF +* + 80 CONTINUE +* + 90 CONTINUE +* + 100 CONTINUE +* + 110 CONTINUE +* + 120 CONTINUE +* + 130 CONTINUE +* + 140 CONTINUE +* +* Report result. 
+* + IF( ERRMAX.LT.THRESH )THEN + IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10000 )SNAME, NC + IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10001 )SNAME, NC + ELSE + IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10002 )SNAME, NC, ERRMAX + IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10003 )SNAME, NC, ERRMAX + END IF + GO TO 160 +* + 150 CONTINUE + WRITE( NOUT, FMT = 9996 )SNAME + CALL CPRCN3( NTRA, NC, SNAME, IORDER, SIDE, UPLO, TRANSA, DIAG, + $ M, N, ALPHA, LDA, LDB) +* + 160 CONTINUE + RETURN +* +10003 FORMAT( ' ', A13,' COMPLETED THE ROW-MAJOR COMPUTATIONAL ', + $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', + $ 'RATIO ', F8.2, ' - SUSPECT *******' ) +10002 FORMAT( ' ', A13,' COMPLETED THE COLUMN-MAJOR COMPUTATIONAL ', + $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', + $ 'RATIO ', F8.2, ' - SUSPECT *******' ) +10001 FORMAT( ' ', A13,' PASSED THE ROW-MAJOR COMPUTATIONAL TESTS', + $ ' (', I6, ' CALL', 'S)' ) +10000 FORMAT( ' ', A13,' PASSED THE COLUMN-MAJOR COMPUTATIONAL TESTS', + $ ' (', I6, ' CALL', 'S)' ) + 9998 FORMAT(' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', + $ 'ANGED INCORRECTLY *******' ) + 9996 FORMAT(' ******* ', A13,' FAILED ON CALL NUMBER:' ) + 9995 FORMAT(1X, I6, ': ', A13,'(', 4( '''', A1, ''',' ), 2( I3, ',' ), + $ '(', F4.1, ',', F4.1, '), A,', I3, ', B,', I3, ') ', + $ ' .' ) + 9994 FORMAT(' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', + $ '******' ) +* +* End of CCHK3. 
+* + END +* + SUBROUTINE CPRCN3(NOUT, NC, SNAME, IORDER, SIDE, UPLO, TRANSA, + $ DIAG, M, N, ALPHA, LDA, LDB) + INTEGER NOUT, NC, IORDER, M, N, LDA, LDB + COMPLEX ALPHA + CHARACTER*1 SIDE, UPLO, TRANSA, DIAG + CHARACTER*13 SNAME + CHARACTER*14 CRC, CS, CU, CA, CD + + IF (SIDE.EQ.'L')THEN + CS = ' CblasLeft' + ELSE + CS = ' CblasRight' + END IF + IF (UPLO.EQ.'U')THEN + CU = ' CblasUpper' + ELSE + CU = ' CblasLower' + END IF + IF (TRANSA.EQ.'N')THEN + CA = ' CblasNoTrans' + ELSE IF (TRANSA.EQ.'T')THEN + CA = ' CblasTrans' + ELSE + CA = 'CblasConjTrans' + END IF + IF (DIAG.EQ.'N')THEN + CD = ' CblasNonUnit' + ELSE + CD = ' CblasUnit' + END IF + IF (IORDER.EQ.1)THEN + CRC = ' CblasRowMajor' + ELSE + CRC = ' CblasColMajor' + END IF + WRITE(NOUT, FMT = 9995)NC,SNAME,CRC, CS,CU + WRITE(NOUT, FMT = 9994)CA, CD, M, N, ALPHA, LDA, LDB + + 9995 FORMAT( 1X, I6, ': ', A13,'(', A14, ',', A14, ',', A14, ',') + 9994 FORMAT( 10X, 2( A14, ',') , 2( I3, ',' ), ' (', F4.1, ',', + $ F4.1, '), A,', I3, ', B,', I3, ').' ) + END +* + SUBROUTINE CCHK4( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, + $ FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, NMAX, + $ A, AA, AS, B, BB, BS, C, CC, CS, CT, G, + $ IORDER ) +* +* Tests CHERK and CSYRK. +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Parameters .. + COMPLEX ZERO + PARAMETER ( ZERO = ( 0.0, 0.0 ) ) + REAL RONE, RZERO + PARAMETER ( RONE = 1.0, RZERO = 0.0 ) +* .. Scalar Arguments .. + REAL EPS, THRESH + INTEGER NALF, NBET, NIDIM, NMAX, NOUT, NTRA, IORDER + LOGICAL FATAL, REWI, TRACE + CHARACTER*13 SNAME +* .. Array Arguments .. 
+ COMPLEX A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), + $ AS( NMAX*NMAX ), B( NMAX, NMAX ), + $ BB( NMAX*NMAX ), BET( NBET ), BS( NMAX*NMAX ), + $ C( NMAX, NMAX ), CC( NMAX*NMAX ), + $ CS( NMAX*NMAX ), CT( NMAX ) + REAL G( NMAX ) + INTEGER IDIM( NIDIM ) +* .. Local Scalars .. + COMPLEX ALPHA, ALS, BETA, BETS + REAL ERR, ERRMAX, RALPHA, RALS, RBETA, RBETS + INTEGER I, IA, IB, ICT, ICU, IK, IN, J, JC, JJ, K, KS, + $ LAA, LCC, LDA, LDAS, LDC, LDCS, LJ, MA, N, NA, + $ NARGS, NC, NS + LOGICAL CONJ, NULL, RESET, SAME, TRAN, UPPER + CHARACTER*1 TRANS, TRANSS, TRANST, UPLO, UPLOS + CHARACTER*2 ICHT, ICHU +* .. Local Arrays .. + LOGICAL ISAME( 13 ) +* .. External Functions .. + LOGICAL LCE, LCERES + EXTERNAL LCE, LCERES +* .. External Subroutines .. + EXTERNAL CCHERK, CMAKE, CMMCH, CCSYRK +* .. Intrinsic Functions .. + INTRINSIC CMPLX, MAX, REAL +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL LERR, OK +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK, LERR +* .. Data statements .. + DATA ICHT/'NC'/, ICHU/'UL'/ +* .. Executable Statements .. + CONJ = SNAME( 8: 9 ).EQ.'he' +* + NARGS = 10 + NC = 0 + RESET = .TRUE. + ERRMAX = RZERO +* + DO 100 IN = 1, NIDIM + N = IDIM( IN ) +* Set LDC to 1 more than minimum value if room. + LDC = N + IF( LDC.LT.NMAX ) + $ LDC = LDC + 1 +* Skip tests if not enough room. + IF( LDC.GT.NMAX ) + $ GO TO 100 + LCC = LDC*N +* + DO 90 IK = 1, NIDIM + K = IDIM( IK ) +* + DO 80 ICT = 1, 2 + TRANS = ICHT( ICT: ICT ) + TRAN = TRANS.EQ.'C' + IF( TRAN.AND..NOT.CONJ ) + $ TRANS = 'T' + IF( TRAN )THEN + MA = K + NA = N + ELSE + MA = N + NA = K + END IF +* Set LDA to 1 more than minimum value if room. + LDA = MA + IF( LDA.LT.NMAX ) + $ LDA = LDA + 1 +* Skip tests if not enough room. + IF( LDA.GT.NMAX ) + $ GO TO 80 + LAA = LDA*NA +* +* Generate the matrix A. 
+* + CALL CMAKE( 'ge', ' ', ' ', MA, NA, A, NMAX, AA, LDA, + $ RESET, ZERO ) +* + DO 70 ICU = 1, 2 + UPLO = ICHU( ICU: ICU ) + UPPER = UPLO.EQ.'U' +* + DO 60 IA = 1, NALF + ALPHA = ALF( IA ) + IF( CONJ )THEN + RALPHA = REAL( ALPHA ) + ALPHA = CMPLX( RALPHA, RZERO ) + END IF +* + DO 50 IB = 1, NBET + BETA = BET( IB ) + IF( CONJ )THEN + RBETA = REAL( BETA ) + BETA = CMPLX( RBETA, RZERO ) + END IF + NULL = N.LE.0 + IF( CONJ ) + $ NULL = NULL.OR.( ( K.LE.0.OR.RALPHA.EQ. + $ RZERO ).AND.RBETA.EQ.RONE ) +* +* Generate the matrix C. +* + CALL CMAKE( SNAME( 8: 9 ), UPLO, ' ', N, N, C, + $ NMAX, CC, LDC, RESET, ZERO ) +* + NC = NC + 1 +* +* Save every datum before calling the subroutine. +* + UPLOS = UPLO + TRANSS = TRANS + NS = N + KS = K + IF( CONJ )THEN + RALS = RALPHA + ELSE + ALS = ALPHA + END IF + DO 10 I = 1, LAA + AS( I ) = AA( I ) + 10 CONTINUE + LDAS = LDA + IF( CONJ )THEN + RBETS = RBETA + ELSE + BETS = BETA + END IF + DO 20 I = 1, LCC + CS( I ) = CC( I ) + 20 CONTINUE + LDCS = LDC +* +* Call the subroutine. +* + IF( CONJ )THEN + IF( TRACE ) + $ CALL CPRCN6( NTRA, NC, SNAME, IORDER, + $ UPLO, TRANS, N, K, RALPHA, LDA, RBETA, + $ LDC) + IF( REWI ) + $ REWIND NTRA + CALL CCHERK( IORDER, UPLO, TRANS, N, K, + $ RALPHA, AA, LDA, RBETA, CC, + $ LDC ) + ELSE + IF( TRACE ) + $ CALL CPRCN4( NTRA, NC, SNAME, IORDER, + $ UPLO, TRANS, N, K, ALPHA, LDA, BETA, LDC) + IF( REWI ) + $ REWIND NTRA + CALL CCSYRK( IORDER, UPLO, TRANS, N, K, + $ ALPHA, AA, LDA, BETA, CC, LDC ) + END IF +* +* Check if error-exit was taken incorrectly. +* + IF( .NOT.OK )THEN + WRITE( NOUT, FMT = 9992 ) + FATAL = .TRUE. + GO TO 120 + END IF +* +* See what data changed inside subroutines. 
+* + ISAME( 1 ) = UPLOS.EQ.UPLO + ISAME( 2 ) = TRANSS.EQ.TRANS + ISAME( 3 ) = NS.EQ.N + ISAME( 4 ) = KS.EQ.K + IF( CONJ )THEN + ISAME( 5 ) = RALS.EQ.RALPHA + ELSE + ISAME( 5 ) = ALS.EQ.ALPHA + END IF + ISAME( 6 ) = LCE( AS, AA, LAA ) + ISAME( 7 ) = LDAS.EQ.LDA + IF( CONJ )THEN + ISAME( 8 ) = RBETS.EQ.RBETA + ELSE + ISAME( 8 ) = BETS.EQ.BETA + END IF + IF( NULL )THEN + ISAME( 9 ) = LCE( CS, CC, LCC ) + ELSE + ISAME( 9 ) = LCERES( SNAME( 8: 9 ), UPLO, N, + $ N, CS, CC, LDC ) + END IF + ISAME( 10 ) = LDCS.EQ.LDC +* +* If data was incorrectly changed, report and +* return. +* + SAME = .TRUE. + DO 30 I = 1, NARGS + SAME = SAME.AND.ISAME( I ) + IF( .NOT.ISAME( I ) ) + $ WRITE( NOUT, FMT = 9998 )I + 30 CONTINUE + IF( .NOT.SAME )THEN + FATAL = .TRUE. + GO TO 120 + END IF +* + IF( .NOT.NULL )THEN +* +* Check the result column by column. +* + IF( CONJ )THEN + TRANST = 'C' + ELSE + TRANST = 'T' + END IF + JC = 1 + DO 40 J = 1, N + IF( UPPER )THEN + JJ = 1 + LJ = J + ELSE + JJ = J + LJ = N - J + 1 + END IF + IF( TRAN )THEN + CALL CMMCH( TRANST, 'N', LJ, 1, K, + $ ALPHA, A( 1, JJ ), NMAX, + $ A( 1, J ), NMAX, BETA, + $ C( JJ, J ), NMAX, CT, G, + $ CC( JC ), LDC, EPS, ERR, + $ FATAL, NOUT, .TRUE. ) + ELSE + CALL CMMCH( 'N', TRANST, LJ, 1, K, + $ ALPHA, A( JJ, 1 ), NMAX, + $ A( J, 1 ), NMAX, BETA, + $ C( JJ, J ), NMAX, CT, G, + $ CC( JC ), LDC, EPS, ERR, + $ FATAL, NOUT, .TRUE. ) + END IF + IF( UPPER )THEN + JC = JC + LDC + ELSE + JC = JC + LDC + 1 + END IF + ERRMAX = MAX( ERRMAX, ERR ) +* If got really bad answer, report and +* return. + IF( FATAL ) + $ GO TO 110 + 40 CONTINUE + END IF +* + 50 CONTINUE +* + 60 CONTINUE +* + 70 CONTINUE +* + 80 CONTINUE +* + 90 CONTINUE +* + 100 CONTINUE +* +* Report result. 
+* + IF( ERRMAX.LT.THRESH )THEN + IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10000 )SNAME, NC + IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10001 )SNAME, NC + ELSE + IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10002 )SNAME, NC, ERRMAX + IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10003 )SNAME, NC, ERRMAX + END IF + GO TO 130 +* + 110 CONTINUE + IF( N.GT.1 ) + $ WRITE( NOUT, FMT = 9995 )J +* + 120 CONTINUE + WRITE( NOUT, FMT = 9996 )SNAME + IF( CONJ )THEN + CALL CPRCN6( NOUT, NC, SNAME, IORDER, UPLO, TRANS, N, K, RALPHA, + $ LDA, rBETA, LDC) + ELSE + CALL CPRCN4( NOUT, NC, SNAME, IORDER, UPLO, TRANS, N, K, ALPHA, + $ LDA, BETA, LDC) + END IF +* + 130 CONTINUE + RETURN +* +10003 FORMAT( ' ', A13,' COMPLETED THE ROW-MAJOR COMPUTATIONAL ', + $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', + $ 'RATIO ', F8.2, ' - SUSPECT *******' ) +10002 FORMAT( ' ', A13,' COMPLETED THE COLUMN-MAJOR COMPUTATIONAL ', + $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', + $ 'RATIO ', F8.2, ' - SUSPECT *******' ) +10001 FORMAT( ' ', A13,' PASSED THE ROW-MAJOR COMPUTATIONAL TESTS', + $ ' (', I6, ' CALL', 'S)' ) +10000 FORMAT( ' ', A13,' PASSED THE COLUMN-MAJOR COMPUTATIONAL TESTS', + $ ' (', I6, ' CALL', 'S)' ) + 9998 FORMAT(' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', + $ 'ANGED INCORRECTLY *******' ) + 9996 FORMAT( ' ******* ', A13,' FAILED ON CALL NUMBER:' ) + 9995 FORMAT( ' THESE ARE THE RESULTS FOR COLUMN ', I3 ) + 9994 FORMAT(1X, I6, ': ', A13,'(', 2( '''', A1, ''',' ), 2( I3, ',' ), + $ F4.1, ', A,', I3, ',', F4.1, ', C,', I3, ') ', + $ ' .' ) + 9993 FORMAT(1X, I6, ': ', A13,'(', 2( '''', A1, ''',' ), 2( I3, ',' ), + $ '(', F4.1, ',', F4.1, ') , A,', I3, ',(', F4.1, ',', F4.1, + $ '), C,', I3, ') .' ) + 9992 FORMAT(' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', + $ '******' ) +* +* End of CCHK4. 
+* + END +* + SUBROUTINE CPRCN4(NOUT, NC, SNAME, IORDER, UPLO, TRANSA, + $ N, K, ALPHA, LDA, BETA, LDC) + INTEGER NOUT, NC, IORDER, N, K, LDA, LDC + COMPLEX ALPHA, BETA + CHARACTER*1 UPLO, TRANSA + CHARACTER*13 SNAME + CHARACTER*14 CRC, CU, CA + + IF (UPLO.EQ.'U')THEN + CU = ' CblasUpper' + ELSE + CU = ' CblasLower' + END IF + IF (TRANSA.EQ.'N')THEN + CA = ' CblasNoTrans' + ELSE IF (TRANSA.EQ.'T')THEN + CA = ' CblasTrans' + ELSE + CA = 'CblasConjTrans' + END IF + IF (IORDER.EQ.1)THEN + CRC = ' CblasRowMajor' + ELSE + CRC = ' CblasColMajor' + END IF + WRITE(NOUT, FMT = 9995)NC, SNAME, CRC, CU, CA + WRITE(NOUT, FMT = 9994)N, K, ALPHA, LDA, BETA, LDC + + 9995 FORMAT( 1X, I6, ': ', A13,'(', 3( A14, ',') ) + 9994 FORMAT( 10X, 2( I3, ',' ), ' (', F4.1, ',', F4.1 ,'), A,', + $ I3, ', (', F4.1,',', F4.1, '), C,', I3, ').' ) + END +* +* + SUBROUTINE CPRCN6(NOUT, NC, SNAME, IORDER, UPLO, TRANSA, + $ N, K, ALPHA, LDA, BETA, LDC) + INTEGER NOUT, NC, IORDER, N, K, LDA, LDC + REAL ALPHA, BETA + CHARACTER*1 UPLO, TRANSA + CHARACTER*13 SNAME + CHARACTER*14 CRC, CU, CA + + IF (UPLO.EQ.'U')THEN + CU = ' CblasUpper' + ELSE + CU = ' CblasLower' + END IF + IF (TRANSA.EQ.'N')THEN + CA = ' CblasNoTrans' + ELSE IF (TRANSA.EQ.'T')THEN + CA = ' CblasTrans' + ELSE + CA = 'CblasConjTrans' + END IF + IF (IORDER.EQ.1)THEN + CRC = ' CblasRowMajor' + ELSE + CRC = ' CblasColMajor' + END IF + WRITE(NOUT, FMT = 9995)NC, SNAME, CRC, CU, CA + WRITE(NOUT, FMT = 9994)N, K, ALPHA, LDA, BETA, LDC + + 9995 FORMAT( 1X, I6, ': ', A13,'(', 3( A14, ',') ) + 9994 FORMAT( 10X, 2( I3, ',' ), + $ F4.1, ', A,', I3, ',', F4.1, ', C,', I3, ').' ) + END +* + SUBROUTINE CCHK5( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, + $ FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, NMAX, + $ AB, AA, AS, BB, BS, C, CC, CS, CT, G, W, + $ IORDER ) +* +* Tests CHER2K and CSYR2K. +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. 
+* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Parameters .. + COMPLEX ZERO, ONE + PARAMETER ( ZERO = ( 0.0, 0.0 ), ONE = ( 1.0, 0.0 ) ) + REAL RONE, RZERO + PARAMETER ( RONE = 1.0, RZERO = 0.0 ) +* .. Scalar Arguments .. + REAL EPS, THRESH + INTEGER NALF, NBET, NIDIM, NMAX, NOUT, NTRA, IORDER + LOGICAL FATAL, REWI, TRACE + CHARACTER*13 SNAME +* .. Array Arguments .. + COMPLEX AA( NMAX*NMAX ), AB( 2*NMAX*NMAX ), + $ ALF( NALF ), AS( NMAX*NMAX ), BB( NMAX*NMAX ), + $ BET( NBET ), BS( NMAX*NMAX ), C( NMAX, NMAX ), + $ CC( NMAX*NMAX ), CS( NMAX*NMAX ), CT( NMAX ), + $ W( 2*NMAX ) + REAL G( NMAX ) + INTEGER IDIM( NIDIM ) +* .. Local Scalars .. + COMPLEX ALPHA, ALS, BETA, BETS + REAL ERR, ERRMAX, RBETA, RBETS + INTEGER I, IA, IB, ICT, ICU, IK, IN, J, JC, JJ, JJAB, + $ K, KS, LAA, LBB, LCC, LDA, LDAS, LDB, LDBS, + $ LDC, LDCS, LJ, MA, N, NA, NARGS, NC, NS + LOGICAL CONJ, NULL, RESET, SAME, TRAN, UPPER + CHARACTER*1 TRANS, TRANSS, TRANST, UPLO, UPLOS + CHARACTER*2 ICHT, ICHU +* .. Local Arrays .. + LOGICAL ISAME( 13 ) +* .. External Functions .. + LOGICAL LCE, LCERES + EXTERNAL LCE, LCERES +* .. External Subroutines .. + EXTERNAL CCHER2K, CMAKE, CMMCH, CCSYR2K +* .. Intrinsic Functions .. + INTRINSIC CMPLX, CONJG, MAX, REAL +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL LERR, OK +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK, LERR +* .. Data statements .. + DATA ICHT/'NC'/, ICHU/'UL'/ +* .. Executable Statements .. + CONJ = SNAME( 8: 9 ).EQ.'he' +* + NARGS = 12 + NC = 0 + RESET = .TRUE. + ERRMAX = RZERO +* + DO 130 IN = 1, NIDIM + N = IDIM( IN ) +* Set LDC to 1 more than minimum value if room. + LDC = N + IF( LDC.LT.NMAX ) + $ LDC = LDC + 1 +* Skip tests if not enough room. 
+ IF( LDC.GT.NMAX ) + $ GO TO 130 + LCC = LDC*N +* + DO 120 IK = 1, NIDIM + K = IDIM( IK ) +* + DO 110 ICT = 1, 2 + TRANS = ICHT( ICT: ICT ) + TRAN = TRANS.EQ.'C' + IF( TRAN.AND..NOT.CONJ ) + $ TRANS = 'T' + IF( TRAN )THEN + MA = K + NA = N + ELSE + MA = N + NA = K + END IF +* Set LDA to 1 more than minimum value if room. + LDA = MA + IF( LDA.LT.NMAX ) + $ LDA = LDA + 1 +* Skip tests if not enough room. + IF( LDA.GT.NMAX ) + $ GO TO 110 + LAA = LDA*NA +* +* Generate the matrix A. +* + IF( TRAN )THEN + CALL CMAKE( 'ge', ' ', ' ', MA, NA, AB, 2*NMAX, AA, + $ LDA, RESET, ZERO ) + ELSE + CALL CMAKE( 'ge', ' ', ' ', MA, NA, AB, NMAX, AA, LDA, + $ RESET, ZERO ) + END IF +* +* Generate the matrix B. +* + LDB = LDA + LBB = LAA + IF( TRAN )THEN + CALL CMAKE( 'ge', ' ', ' ', MA, NA, AB( K + 1 ), + $ 2*NMAX, BB, LDB, RESET, ZERO ) + ELSE + CALL CMAKE( 'ge', ' ', ' ', MA, NA, AB( K*NMAX + 1 ), + $ NMAX, BB, LDB, RESET, ZERO ) + END IF +* + DO 100 ICU = 1, 2 + UPLO = ICHU( ICU: ICU ) + UPPER = UPLO.EQ.'U' +* + DO 90 IA = 1, NALF + ALPHA = ALF( IA ) +* + DO 80 IB = 1, NBET + BETA = BET( IB ) + IF( CONJ )THEN + RBETA = REAL( BETA ) + BETA = CMPLX( RBETA, RZERO ) + END IF + NULL = N.LE.0 + IF( CONJ ) + $ NULL = NULL.OR.( ( K.LE.0.OR.ALPHA.EQ. + $ ZERO ).AND.RBETA.EQ.RONE ) +* +* Generate the matrix C. +* + CALL CMAKE( SNAME( 8: 9 ), UPLO, ' ', N, N, C, + $ NMAX, CC, LDC, RESET, ZERO ) +* + NC = NC + 1 +* +* Save every datum before calling the subroutine. +* + UPLOS = UPLO + TRANSS = TRANS + NS = N + KS = K + ALS = ALPHA + DO 10 I = 1, LAA + AS( I ) = AA( I ) + 10 CONTINUE + LDAS = LDA + DO 20 I = 1, LBB + BS( I ) = BB( I ) + 20 CONTINUE + LDBS = LDB + IF( CONJ )THEN + RBETS = RBETA + ELSE + BETS = BETA + END IF + DO 30 I = 1, LCC + CS( I ) = CC( I ) + 30 CONTINUE + LDCS = LDC +* +* Call the subroutine. 
+* + IF( CONJ )THEN + IF( TRACE ) + $ CALL CPRCN7( NTRA, NC, SNAME, IORDER, + $ UPLO, TRANS, N, K, ALPHA, LDA, LDB, + $ RBETA, LDC) + IF( REWI ) + $ REWIND NTRA + CALL CCHER2K( IORDER, UPLO, TRANS, N, K, + $ ALPHA, AA, LDA, BB, LDB, RBETA, + $ CC, LDC ) + ELSE + IF( TRACE ) + $ CALL CPRCN5( NTRA, NC, SNAME, IORDER, + $ UPLO, TRANS, N, K, ALPHA, LDA, LDB, + $ BETA, LDC) + IF( REWI ) + $ REWIND NTRA + CALL CCSYR2K( IORDER, UPLO, TRANS, N, K, + $ ALPHA, AA, LDA, BB, LDB, BETA, + $ CC, LDC ) + END IF +* +* Check if error-exit was taken incorrectly. +* + IF( .NOT.OK )THEN + WRITE( NOUT, FMT = 9992 ) + FATAL = .TRUE. + GO TO 150 + END IF +* +* See what data changed inside subroutines. +* + ISAME( 1 ) = UPLOS.EQ.UPLO + ISAME( 2 ) = TRANSS.EQ.TRANS + ISAME( 3 ) = NS.EQ.N + ISAME( 4 ) = KS.EQ.K + ISAME( 5 ) = ALS.EQ.ALPHA + ISAME( 6 ) = LCE( AS, AA, LAA ) + ISAME( 7 ) = LDAS.EQ.LDA + ISAME( 8 ) = LCE( BS, BB, LBB ) + ISAME( 9 ) = LDBS.EQ.LDB + IF( CONJ )THEN + ISAME( 10 ) = RBETS.EQ.RBETA + ELSE + ISAME( 10 ) = BETS.EQ.BETA + END IF + IF( NULL )THEN + ISAME( 11 ) = LCE( CS, CC, LCC ) + ELSE + ISAME( 11 ) = LCERES( 'he', UPLO, N, N, CS, + $ CC, LDC ) + END IF + ISAME( 12 ) = LDCS.EQ.LDC +* +* If data was incorrectly changed, report and +* return. +* + SAME = .TRUE. + DO 40 I = 1, NARGS + SAME = SAME.AND.ISAME( I ) + IF( .NOT.ISAME( I ) ) + $ WRITE( NOUT, FMT = 9998 )I + 40 CONTINUE + IF( .NOT.SAME )THEN + FATAL = .TRUE. + GO TO 150 + END IF +* + IF( .NOT.NULL )THEN +* +* Check the result column by column. 
+* + IF( CONJ )THEN + TRANST = 'C' + ELSE + TRANST = 'T' + END IF + JJAB = 1 + JC = 1 + DO 70 J = 1, N + IF( UPPER )THEN + JJ = 1 + LJ = J + ELSE + JJ = J + LJ = N - J + 1 + END IF + IF( TRAN )THEN + DO 50 I = 1, K + W( I ) = ALPHA*AB( ( J - 1 )*2* + $ NMAX + K + I ) + IF( CONJ )THEN + W( K + I ) = CONJG( ALPHA )* + $ AB( ( J - 1 )*2* + $ NMAX + I ) + ELSE + W( K + I ) = ALPHA* + $ AB( ( J - 1 )*2* + $ NMAX + I ) + END IF + 50 CONTINUE + CALL CMMCH( TRANST, 'N', LJ, 1, 2*K, + $ ONE, AB( JJAB ), 2*NMAX, W, + $ 2*NMAX, BETA, C( JJ, J ), + $ NMAX, CT, G, CC( JC ), LDC, + $ EPS, ERR, FATAL, NOUT, + $ .TRUE. ) + ELSE + DO 60 I = 1, K + IF( CONJ )THEN + W( I ) = ALPHA*CONJG( AB( ( K + + $ I - 1 )*NMAX + J ) ) + W( K + I ) = CONJG( ALPHA* + $ AB( ( I - 1 )*NMAX + + $ J ) ) + ELSE + W( I ) = ALPHA*AB( ( K + I - 1 )* + $ NMAX + J ) + W( K + I ) = ALPHA* + $ AB( ( I - 1 )*NMAX + + $ J ) + END IF + 60 CONTINUE + CALL CMMCH( 'N', 'N', LJ, 1, 2*K, ONE, + $ AB( JJ ), NMAX, W, 2*NMAX, + $ BETA, C( JJ, J ), NMAX, CT, + $ G, CC( JC ), LDC, EPS, ERR, + $ FATAL, NOUT, .TRUE. ) + END IF + IF( UPPER )THEN + JC = JC + LDC + ELSE + JC = JC + LDC + 1 + IF( TRAN ) + $ JJAB = JJAB + 2*NMAX + END IF + ERRMAX = MAX( ERRMAX, ERR ) +* If got really bad answer, report and +* return. + IF( FATAL ) + $ GO TO 140 + 70 CONTINUE + END IF +* + 80 CONTINUE +* + 90 CONTINUE +* + 100 CONTINUE +* + 110 CONTINUE +* + 120 CONTINUE +* + 130 CONTINUE +* +* Report result. 
+* + IF( ERRMAX.LT.THRESH )THEN + IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10000 )SNAME, NC + IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10001 )SNAME, NC + ELSE + IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10002 )SNAME, NC, ERRMAX + IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10003 )SNAME, NC, ERRMAX + END IF + GO TO 160 +* + 140 CONTINUE + IF( N.GT.1 ) + $ WRITE( NOUT, FMT = 9995 )J +* + 150 CONTINUE + WRITE( NOUT, FMT = 9996 )SNAME + IF( CONJ )THEN + CALL CPRCN7( NOUT, NC, SNAME, IORDER, UPLO, TRANS, N, K, + $ ALPHA, LDA, LDB, RBETA, LDC) + ELSE + CALL CPRCN5( NOUT, NC, SNAME, IORDER, UPLO, TRANS, N, K, + $ ALPHA, LDA, LDB, BETA, LDC) + END IF +* + 160 CONTINUE + RETURN +* +10003 FORMAT( ' ', A13,' COMPLETED THE ROW-MAJOR COMPUTATIONAL ', + $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', + $ 'RATIO ', F8.2, ' - SUSPECT *******' ) +10002 FORMAT( ' ', A13,' COMPLETED THE COLUMN-MAJOR COMPUTATIONAL ', + $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', + $ 'RATIO ', F8.2, ' - SUSPECT *******' ) +10001 FORMAT( ' ', A13,' PASSED THE ROW-MAJOR COMPUTATIONAL TESTS', + $ ' (', I6, ' CALL', 'S)' ) +10000 FORMAT( ' ', A13,' PASSED THE COLUMN-MAJOR COMPUTATIONAL TESTS', + $ ' (', I6, ' CALL', 'S)' ) + 9998 FORMAT(' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', + $ 'ANGED INCORRECTLY *******' ) + 9996 FORMAT( ' ******* ', A13,' FAILED ON CALL NUMBER:' ) + 9995 FORMAT( ' THESE ARE THE RESULTS FOR COLUMN ', I3 ) + 9994 FORMAT(1X, I6, ': ', A13,'(', 2( '''', A1, ''',' ), 2( I3, ',' ), + $ '(', F4.1, ',', F4.1, '), A,', I3, ', B,', I3, ',', F4.1, + $ ', C,', I3, ') .' ) + 9993 FORMAT(1X, I6, ': ', A13,'(', 2( '''', A1, ''',' ), 2( I3, ',' ), + $ '(', F4.1, ',', F4.1, '), A,', I3, ', B,', I3, ',(', F4.1, + $ ',', F4.1, '), C,', I3, ') .' ) + 9992 FORMAT(' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', + $ '******' ) +* +* End of CCHK5. 
+* + END +* + SUBROUTINE CPRCN5(NOUT, NC, SNAME, IORDER, UPLO, TRANSA, + $ N, K, ALPHA, LDA, LDB, BETA, LDC) + INTEGER NOUT, NC, IORDER, N, K, LDA, LDB, LDC + COMPLEX ALPHA, BETA + CHARACTER*1 UPLO, TRANSA + CHARACTER*13 SNAME + CHARACTER*14 CRC, CU, CA + + IF (UPLO.EQ.'U')THEN + CU = ' CblasUpper' + ELSE + CU = ' CblasLower' + END IF + IF (TRANSA.EQ.'N')THEN + CA = ' CblasNoTrans' + ELSE IF (TRANSA.EQ.'T')THEN + CA = ' CblasTrans' + ELSE + CA = 'CblasConjTrans' + END IF + IF (IORDER.EQ.1)THEN + CRC = ' CblasRowMajor' + ELSE + CRC = ' CblasColMajor' + END IF + WRITE(NOUT, FMT = 9995)NC, SNAME, CRC, CU, CA + WRITE(NOUT, FMT = 9994)N, K, ALPHA, LDA, LDB, BETA, LDC + + 9995 FORMAT( 1X, I6, ': ', A13,'(', 3( A14, ',') ) + 9994 FORMAT( 10X, 2( I3, ',' ), ' (', F4.1, ',', F4.1, '), A,', + $ I3, ', B', I3, ', (', F4.1, ',', F4.1, '), C,', I3, ').' ) + END +* +* + SUBROUTINE CPRCN7(NOUT, NC, SNAME, IORDER, UPLO, TRANSA, + $ N, K, ALPHA, LDA, LDB, BETA, LDC) + INTEGER NOUT, NC, IORDER, N, K, LDA, LDB, LDC + COMPLEX ALPHA + REAL BETA + CHARACTER*1 UPLO, TRANSA + CHARACTER*13 SNAME + CHARACTER*14 CRC, CU, CA + + IF (UPLO.EQ.'U')THEN + CU = ' CblasUpper' + ELSE + CU = ' CblasLower' + END IF + IF (TRANSA.EQ.'N')THEN + CA = ' CblasNoTrans' + ELSE IF (TRANSA.EQ.'T')THEN + CA = ' CblasTrans' + ELSE + CA = 'CblasConjTrans' + END IF + IF (IORDER.EQ.1)THEN + CRC = ' CblasRowMajor' + ELSE + CRC = ' CblasColMajor' + END IF + WRITE(NOUT, FMT = 9995)NC, SNAME, CRC, CU, CA + WRITE(NOUT, FMT = 9994)N, K, ALPHA, LDA, LDB, BETA, LDC + + 9995 FORMAT( 1X, I6, ': ', A13,'(', 3( A14, ',') ) + 9994 FORMAT( 10X, 2( I3, ',' ), ' (', F4.1, ',', F4.1, '), A,', + $ I3, ', B', I3, ',', F4.1, ', C,', I3, ').' ) + END +* + SUBROUTINE CMAKE(TYPE, UPLO, DIAG, M, N, A, NMAX, AA, LDA, RESET, + $ TRANSL ) +* +* Generates values for an M by N matrix A. +* Stores the values in the array AA in the data structure required +* by the routine, with unwanted elements set to rogue value. 
+* +* TYPE is 'ge', 'he', 'sy' or 'tr'. +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Parameters .. + COMPLEX ZERO, ONE + PARAMETER ( ZERO = ( 0.0, 0.0 ), ONE = ( 1.0, 0.0 ) ) + COMPLEX ROGUE + PARAMETER ( ROGUE = ( -1.0E10, 1.0E10 ) ) + REAL RZERO + PARAMETER ( RZERO = 0.0 ) + REAL RROGUE + PARAMETER ( RROGUE = -1.0E10 ) +* .. Scalar Arguments .. + COMPLEX TRANSL + INTEGER LDA, M, N, NMAX + LOGICAL RESET + CHARACTER*1 DIAG, UPLO + CHARACTER*2 TYPE +* .. Array Arguments .. + COMPLEX A( NMAX, * ), AA( * ) +* .. Local Scalars .. + INTEGER I, IBEG, IEND, J, JJ + LOGICAL GEN, HER, LOWER, SYM, TRI, UNIT, UPPER +* .. External Functions .. + COMPLEX CBEG + EXTERNAL CBEG +* .. Intrinsic Functions .. + INTRINSIC CMPLX, CONJG, REAL +* .. Executable Statements .. + GEN = TYPE.EQ.'ge' + HER = TYPE.EQ.'he' + SYM = TYPE.EQ.'sy' + TRI = TYPE.EQ.'tr' + UPPER = ( HER.OR.SYM.OR.TRI ).AND.UPLO.EQ.'U' + LOWER = ( HER.OR.SYM.OR.TRI ).AND.UPLO.EQ.'L' + UNIT = TRI.AND.DIAG.EQ.'U' +* +* Generate data in array A. +* + DO 20 J = 1, N + DO 10 I = 1, M + IF( GEN.OR.( UPPER.AND.I.LE.J ).OR.( LOWER.AND.I.GE.J ) ) + $ THEN + A( I, J ) = CBEG( RESET ) + TRANSL + IF( I.NE.J )THEN +* Set some elements to zero + IF( N.GT.3.AND.J.EQ.N/2 ) + $ A( I, J ) = ZERO + IF( HER )THEN + A( J, I ) = CONJG( A( I, J ) ) + ELSE IF( SYM )THEN + A( J, I ) = A( I, J ) + ELSE IF( TRI )THEN + A( J, I ) = ZERO + END IF + END IF + END IF + 10 CONTINUE + IF( HER ) + $ A( J, J ) = CMPLX( REAL( A( J, J ) ), RZERO ) + IF( TRI ) + $ A( J, J ) = A( J, J ) + ONE + IF( UNIT ) + $ A( J, J ) = ONE + 20 CONTINUE +* +* Store elements in array AS in data structure required by routine. 
+* + IF( TYPE.EQ.'ge' )THEN + DO 50 J = 1, N + DO 30 I = 1, M + AA( I + ( J - 1 )*LDA ) = A( I, J ) + 30 CONTINUE + DO 40 I = M + 1, LDA + AA( I + ( J - 1 )*LDA ) = ROGUE + 40 CONTINUE + 50 CONTINUE + ELSE IF( TYPE.EQ.'he'.OR.TYPE.EQ.'sy'.OR.TYPE.EQ.'tr' )THEN + DO 90 J = 1, N + IF( UPPER )THEN + IBEG = 1 + IF( UNIT )THEN + IEND = J - 1 + ELSE + IEND = J + END IF + ELSE + IF( UNIT )THEN + IBEG = J + 1 + ELSE + IBEG = J + END IF + IEND = N + END IF + DO 60 I = 1, IBEG - 1 + AA( I + ( J - 1 )*LDA ) = ROGUE + 60 CONTINUE + DO 70 I = IBEG, IEND + AA( I + ( J - 1 )*LDA ) = A( I, J ) + 70 CONTINUE + DO 80 I = IEND + 1, LDA + AA( I + ( J - 1 )*LDA ) = ROGUE + 80 CONTINUE + IF( HER )THEN + JJ = J + ( J - 1 )*LDA + AA( JJ ) = CMPLX( REAL( AA( JJ ) ), RROGUE ) + END IF + 90 CONTINUE + END IF + RETURN +* +* End of CMAKE. +* + END + SUBROUTINE CMMCH(TRANSA, TRANSB, M, N, KK, ALPHA, A, LDA, B, LDB, + $ BETA, C, LDC, CT, G, CC, LDCC, EPS, ERR, FATAL, + $ NOUT, MV ) +* +* Checks the results of the computational tests. +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Parameters .. + COMPLEX ZERO + PARAMETER ( ZERO = ( 0.0, 0.0 ) ) + REAL RZERO, RONE + PARAMETER ( RZERO = 0.0, RONE = 1.0 ) +* .. Scalar Arguments .. + COMPLEX ALPHA, BETA + REAL EPS, ERR + INTEGER KK, LDA, LDB, LDC, LDCC, M, N, NOUT + LOGICAL FATAL, MV + CHARACTER*1 TRANSA, TRANSB +* .. Array Arguments .. + COMPLEX A( LDA, * ), B( LDB, * ), C( LDC, * ), + $ CC( LDCC, * ), CT( * ) + REAL G( * ) +* .. Local Scalars .. + COMPLEX CL + REAL ERRI + INTEGER I, J, K + LOGICAL CTRANA, CTRANB, TRANA, TRANB +* .. Intrinsic Functions .. + INTRINSIC ABS, AIMAG, CONJG, MAX, REAL, SQRT +* .. Statement Functions .. + REAL ABS1 +* .. Statement Function definitions .. 
+ ABS1( CL ) = ABS( REAL( CL ) ) + ABS( AIMAG( CL ) ) +* .. Executable Statements .. + TRANA = TRANSA.EQ.'T'.OR.TRANSA.EQ.'C' + TRANB = TRANSB.EQ.'T'.OR.TRANSB.EQ.'C' + CTRANA = TRANSA.EQ.'C' + CTRANB = TRANSB.EQ.'C' +* +* Compute expected result, one column at a time, in CT using data +* in A, B and C. +* Compute gauges in G. +* + DO 220 J = 1, N +* + DO 10 I = 1, M + CT( I ) = ZERO + G( I ) = RZERO + 10 CONTINUE + IF( .NOT.TRANA.AND..NOT.TRANB )THEN + DO 30 K = 1, KK + DO 20 I = 1, M + CT( I ) = CT( I ) + A( I, K )*B( K, J ) + G( I ) = G( I ) + ABS1( A( I, K ) )*ABS1( B( K, J ) ) + 20 CONTINUE + 30 CONTINUE + ELSE IF( TRANA.AND..NOT.TRANB )THEN + IF( CTRANA )THEN + DO 50 K = 1, KK + DO 40 I = 1, M + CT( I ) = CT( I ) + CONJG( A( K, I ) )*B( K, J ) + G( I ) = G( I ) + ABS1( A( K, I ) )* + $ ABS1( B( K, J ) ) + 40 CONTINUE + 50 CONTINUE + ELSE + DO 70 K = 1, KK + DO 60 I = 1, M + CT( I ) = CT( I ) + A( K, I )*B( K, J ) + G( I ) = G( I ) + ABS1( A( K, I ) )* + $ ABS1( B( K, J ) ) + 60 CONTINUE + 70 CONTINUE + END IF + ELSE IF( .NOT.TRANA.AND.TRANB )THEN + IF( CTRANB )THEN + DO 90 K = 1, KK + DO 80 I = 1, M + CT( I ) = CT( I ) + A( I, K )*CONJG( B( J, K ) ) + G( I ) = G( I ) + ABS1( A( I, K ) )* + $ ABS1( B( J, K ) ) + 80 CONTINUE + 90 CONTINUE + ELSE + DO 110 K = 1, KK + DO 100 I = 1, M + CT( I ) = CT( I ) + A( I, K )*B( J, K ) + G( I ) = G( I ) + ABS1( A( I, K ) )* + $ ABS1( B( J, K ) ) + 100 CONTINUE + 110 CONTINUE + END IF + ELSE IF( TRANA.AND.TRANB )THEN + IF( CTRANA )THEN + IF( CTRANB )THEN + DO 130 K = 1, KK + DO 120 I = 1, M + CT( I ) = CT( I ) + CONJG( A( K, I ) )* + $ CONJG( B( J, K ) ) + G( I ) = G( I ) + ABS1( A( K, I ) )* + $ ABS1( B( J, K ) ) + 120 CONTINUE + 130 CONTINUE + ELSE + DO 150 K = 1, KK + DO 140 I = 1, M + CT( I ) = CT( I ) + CONJG( A( K, I ) )*B( J, K ) + G( I ) = G( I ) + ABS1( A( K, I ) )* + $ ABS1( B( J, K ) ) + 140 CONTINUE + 150 CONTINUE + END IF + ELSE + IF( CTRANB )THEN + DO 170 K = 1, KK + DO 160 I = 1, M + CT( I ) = CT( I ) + A( K, 
I )*CONJG( B( J, K ) ) + G( I ) = G( I ) + ABS1( A( K, I ) )* + $ ABS1( B( J, K ) ) + 160 CONTINUE + 170 CONTINUE + ELSE + DO 190 K = 1, KK + DO 180 I = 1, M + CT( I ) = CT( I ) + A( K, I )*B( J, K ) + G( I ) = G( I ) + ABS1( A( K, I ) )* + $ ABS1( B( J, K ) ) + 180 CONTINUE + 190 CONTINUE + END IF + END IF + END IF + DO 200 I = 1, M + CT( I ) = ALPHA*CT( I ) + BETA*C( I, J ) + G( I ) = ABS1( ALPHA )*G( I ) + + $ ABS1( BETA )*ABS1( C( I, J ) ) + 200 CONTINUE +* +* Compute the error ratio for this result. +* + ERR = ZERO + DO 210 I = 1, M + ERRI = ABS1( CT( I ) - CC( I, J ) )/EPS + IF( G( I ).NE.RZERO ) + $ ERRI = ERRI/G( I ) + ERR = MAX( ERR, ERRI ) + IF( ERR*SQRT( EPS ).GE.RONE ) + $ GO TO 230 + 210 CONTINUE +* + 220 CONTINUE +* +* If the loop completes, all results are at least half accurate. + GO TO 250 +* +* Report fatal error. +* + 230 FATAL = .TRUE. + WRITE( NOUT, FMT = 9999 ) + DO 240 I = 1, M + IF( MV )THEN + WRITE( NOUT, FMT = 9998 )I, CT( I ), CC( I, J ) + ELSE + WRITE( NOUT, FMT = 9998 )I, CC( I, J ), CT( I ) + END IF + 240 CONTINUE + IF( N.GT.1 ) + $ WRITE( NOUT, FMT = 9997 )J +* + 250 CONTINUE + RETURN +* + 9999 FORMAT(' ******* FATAL ERROR - COMPUTED RESULT IS LESS THAN HAL', + $ 'F ACCURATE *******', /' EXPECTED RE', + $ 'SULT COMPUTED RESULT' ) + 9998 FORMAT( 1X, I7, 2( ' (', G15.6, ',', G15.6, ')' ) ) + 9997 FORMAT( ' THESE ARE THE RESULTS FOR COLUMN ', I3 ) +* +* End of CMMCH. +* + END + LOGICAL FUNCTION LCE( RI, RJ, LR ) +* +* Tests if two arrays are identical. +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Scalar Arguments .. + INTEGER LR +* .. Array Arguments .. + COMPLEX RI( * ), RJ( * ) +* .. Local Scalars .. + INTEGER I +* .. Executable Statements .. 
+ DO 10 I = 1, LR + IF( RI( I ).NE.RJ( I ) ) + $ GO TO 20 + 10 CONTINUE + LCE = .TRUE. + GO TO 30 + 20 CONTINUE + LCE = .FALSE. + 30 RETURN +* +* End of LCE. +* + END + LOGICAL FUNCTION LCERES( TYPE, UPLO, M, N, AA, AS, LDA ) +* +* Tests if selected elements in two arrays are equal. +* +* TYPE is 'ge' or 'he' or 'sy'. +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Scalar Arguments .. + INTEGER LDA, M, N + CHARACTER*1 UPLO + CHARACTER*2 TYPE +* .. Array Arguments .. + COMPLEX AA( LDA, * ), AS( LDA, * ) +* .. Local Scalars .. + INTEGER I, IBEG, IEND, J + LOGICAL UPPER +* .. Executable Statements .. + UPPER = UPLO.EQ.'U' + IF( TYPE.EQ.'ge' )THEN + DO 20 J = 1, N + DO 10 I = M + 1, LDA + IF( AA( I, J ).NE.AS( I, J ) ) + $ GO TO 70 + 10 CONTINUE + 20 CONTINUE + ELSE IF( TYPE.EQ.'he'.OR.TYPE.EQ.'sy' )THEN + DO 50 J = 1, N + IF( UPPER )THEN + IBEG = 1 + IEND = J + ELSE + IBEG = J + IEND = N + END IF + DO 30 I = 1, IBEG - 1 + IF( AA( I, J ).NE.AS( I, J ) ) + $ GO TO 70 + 30 CONTINUE + DO 40 I = IEND + 1, LDA + IF( AA( I, J ).NE.AS( I, J ) ) + $ GO TO 70 + 40 CONTINUE + 50 CONTINUE + END IF +* + 60 CONTINUE + LCERES = .TRUE. + GO TO 80 + 70 CONTINUE + LCERES = .FALSE. + 80 RETURN +* +* End of LCERES. +* + END + COMPLEX FUNCTION CBEG( RESET ) +* +* Generates complex numbers as pairs of random numbers uniformly +* distributed between -0.5 and 0.5. +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Scalar Arguments .. + LOGICAL RESET +* .. Local Scalars .. + INTEGER I, IC, J, MI, MJ +* .. Save statement .. 
+ SAVE I, IC, J, MI, MJ +* .. Intrinsic Functions .. + INTRINSIC CMPLX +* .. Executable Statements .. + IF( RESET )THEN +* Initialize local variables. + MI = 891 + MJ = 457 + I = 7 + J = 7 + IC = 0 + RESET = .FALSE. + END IF +* +* The sequence of values of I or J is bounded between 1 and 999. +* If initial I or J = 1,2,3,6,7 or 9, the period will be 50. +* If initial I or J = 4 or 8, the period will be 25. +* If initial I or J = 5, the period will be 10. +* IC is used to break up the period by skipping 1 value of I or J +* in 6. +* + IC = IC + 1 + 10 I = I*MI + J = J*MJ + I = I - 1000*( I/1000 ) + J = J - 1000*( J/1000 ) + IF( IC.GE.5 )THEN + IC = 0 + GO TO 10 + END IF + CBEG = CMPLX( ( I - 500 )/1001.0, ( J - 500 )/1001.0 ) + RETURN +* +* End of CBEG. +* + END + REAL FUNCTION SDIFF( X, Y ) +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Scalar Arguments .. + REAL X, Y +* .. Executable Statements .. + SDIFF = X - Y + RETURN +* +* End of SDIFF. 
+* + END diff --git a/ctest/c_z3chke.c b/ctest/c_z3chke.c index df2513514..4be4457b4 100644 --- a/ctest/c_z3chke.c +++ b/ctest/c_z3chke.c @@ -45,8 +45,242 @@ void F77_z3chke(char * rout) { F77_xerbla(cblas_rout,&cblas_info); } - if (strncmp( sf,"cblas_zgemm" ,11)==0) { - cblas_rout = "cblas_zgemm" ; + + + + + if (strncmp( sf,"cblas_zgemm3m" ,13)==0) { + cblas_rout = "cblas_zgemm3" ; + + cblas_info = 1; + cblas_zgemm3m( INVALID, CblasNoTrans, CblasNoTrans, 0, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 1; + cblas_zgemm3m( INVALID, CblasNoTrans, CblasTrans, 0, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 1; + cblas_zgemm3m( INVALID, CblasTrans, CblasNoTrans, 0, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 1; + cblas_zgemm3m( INVALID, CblasTrans, CblasTrans, 0, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_zgemm3m( CblasColMajor, INVALID, CblasNoTrans, 0, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_zgemm3m( CblasColMajor, INVALID, CblasTrans, 0, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_zgemm3m( CblasColMajor, CblasNoTrans, INVALID, 0, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_zgemm3m( CblasColMajor, CblasTrans, INVALID, 0, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_zgemm3m( CblasColMajor, CblasNoTrans, CblasNoTrans, INVALID, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_zgemm3m( CblasColMajor, CblasNoTrans, CblasTrans, INVALID, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_zgemm3m( CblasColMajor, CblasTrans, CblasNoTrans, INVALID, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; 
RowMajorStrg = FALSE; + cblas_zgemm3m( CblasColMajor, CblasTrans, CblasTrans, INVALID, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_zgemm3m( CblasColMajor, CblasNoTrans, CblasNoTrans, 0, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_zgemm3m( CblasColMajor, CblasNoTrans, CblasTrans, 0, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_zgemm3m( CblasColMajor, CblasTrans, CblasNoTrans, 0, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_zgemm3m( CblasColMajor, CblasTrans, CblasTrans, 0, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_zgemm3m( CblasColMajor, CblasNoTrans, CblasNoTrans, 0, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_zgemm3m( CblasColMajor, CblasNoTrans, CblasTrans, 0, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_zgemm3m( CblasColMajor, CblasTrans, CblasNoTrans, 0, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_zgemm3m( CblasColMajor, CblasTrans, CblasTrans, 0, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 9; RowMajorStrg = FALSE; + cblas_zgemm3m( CblasColMajor, CblasNoTrans, CblasNoTrans, 2, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 2 ); + chkxer(); + cblas_info = 9; RowMajorStrg = FALSE; + cblas_zgemm3m( CblasColMajor, CblasNoTrans, CblasTrans, 2, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 2 ); + chkxer(); + cblas_info = 9; RowMajorStrg = FALSE; + cblas_zgemm3m( CblasColMajor, CblasTrans, CblasNoTrans, 0, 0, 2, + ALPHA, A, 1, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 9; RowMajorStrg = FALSE; + cblas_zgemm3m( CblasColMajor, CblasTrans, CblasTrans, 0, 0, 2, + ALPHA, 
A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = FALSE; + cblas_zgemm3m( CblasColMajor, CblasNoTrans, CblasNoTrans, 0, 0, 2, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = FALSE; + cblas_zgemm3m( CblasColMajor, CblasTrans, CblasNoTrans, 0, 0, 2, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = FALSE; + cblas_zgemm3m( CblasColMajor, CblasNoTrans, CblasTrans, 0, 2, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = FALSE; + cblas_zgemm3m( CblasColMajor, CblasTrans, CblasTrans, 0, 2, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 14; RowMajorStrg = FALSE; + cblas_zgemm3m( CblasColMajor, CblasNoTrans, CblasNoTrans, 2, 0, 0, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 14; RowMajorStrg = FALSE; + cblas_zgemm3m( CblasColMajor, CblasNoTrans, CblasTrans, 2, 0, 0, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 14; RowMajorStrg = FALSE; + cblas_zgemm3m( CblasColMajor, CblasTrans, CblasNoTrans, 2, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 14; RowMajorStrg = FALSE; + cblas_zgemm3m( CblasColMajor, CblasTrans, CblasTrans, 2, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_zgemm3m( CblasRowMajor, CblasNoTrans, CblasNoTrans, INVALID, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_zgemm3m( CblasRowMajor, CblasNoTrans, CblasTrans, INVALID, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_zgemm3m( CblasRowMajor, CblasTrans, CblasNoTrans, INVALID, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_zgemm3m( CblasRowMajor, CblasTrans, CblasTrans, INVALID, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = TRUE; + cblas_zgemm3m( CblasRowMajor, CblasNoTrans, 
CblasNoTrans, 0, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = TRUE; + cblas_zgemm3m( CblasRowMajor, CblasNoTrans, CblasTrans, 0, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = TRUE; + cblas_zgemm3m( CblasRowMajor, CblasTrans, CblasNoTrans, 0, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = TRUE; + cblas_zgemm3m( CblasRowMajor, CblasTrans, CblasTrans, 0, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_zgemm3m( CblasRowMajor, CblasNoTrans, CblasNoTrans, 0, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_zgemm3m( CblasRowMajor, CblasNoTrans, CblasTrans, 0, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_zgemm3m( CblasRowMajor, CblasTrans, CblasNoTrans, 0, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_zgemm3m( CblasRowMajor, CblasTrans, CblasTrans, 0, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 9; RowMajorStrg = TRUE; + cblas_zgemm3m( CblasRowMajor, CblasNoTrans, CblasNoTrans, 0, 0, 2, + ALPHA, A, 1, B, 1, BETA, C, 2 ); + chkxer(); + cblas_info = 9; RowMajorStrg = TRUE; + cblas_zgemm3m( CblasRowMajor, CblasNoTrans, CblasTrans, 0, 0, 2, + ALPHA, A, 1, B, 2, BETA, C, 2 ); + chkxer(); + cblas_info = 9; RowMajorStrg = TRUE; + cblas_zgemm3m( CblasRowMajor, CblasTrans, CblasNoTrans, 2, 0, 0, + ALPHA, A, 1, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 9; RowMajorStrg = TRUE; + cblas_zgemm3m( CblasRowMajor, CblasTrans, CblasTrans, 2, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = TRUE; + cblas_zgemm3m( CblasRowMajor, CblasNoTrans, CblasNoTrans, 0, 2, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = TRUE; + 
cblas_zgemm3m( CblasRowMajor, CblasTrans, CblasNoTrans, 0, 2, 0, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = TRUE; + cblas_zgemm3m( CblasRowMajor, CblasNoTrans, CblasTrans, 0, 0, 2, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = TRUE; + cblas_zgemm3m( CblasRowMajor, CblasTrans, CblasTrans, 0, 0, 2, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 14; RowMajorStrg = TRUE; + cblas_zgemm3m( CblasRowMajor, CblasNoTrans, CblasNoTrans, 0, 2, 0, + ALPHA, A, 1, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 14; RowMajorStrg = TRUE; + cblas_zgemm3m( CblasRowMajor, CblasNoTrans, CblasTrans, 0, 2, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 14; RowMajorStrg = TRUE; + cblas_zgemm3m( CblasRowMajor, CblasTrans, CblasNoTrans, 0, 2, 0, + ALPHA, A, 1, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 14; RowMajorStrg = TRUE; + cblas_zgemm3m( CblasRowMajor, CblasTrans, CblasTrans, 0, 2, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + + + + } else if (strncmp( sf,"cblas_zgemm" ,11)==0) { + cblas_rout = "cblas_zgemm" ; cblas_info = 1; cblas_zgemm( INVALID, CblasNoTrans, CblasNoTrans, 0, 0, 0, diff --git a/ctest/c_zblas3.c b/ctest/c_zblas3.c index ad744110b..46ff467d0 100644 --- a/ctest/c_zblas3.c +++ b/ctest/c_zblas3.c @@ -562,3 +562,82 @@ void F77_ztrsm(int *order, char *rtlf, char *uplow, char *transp, char *diagn, cblas_ztrsm(UNDEFINED, side, uplo, trans, diag, *m, *n, alpha, a, *lda, b, *ldb); } + + +void F77_zgemm3m(int *order, char *transpa, char *transpb, int *m, int *n, + int *k, CBLAS_TEST_ZOMPLEX *alpha, CBLAS_TEST_ZOMPLEX *a, int *lda, + CBLAS_TEST_ZOMPLEX *b, int *ldb, CBLAS_TEST_ZOMPLEX *beta, + CBLAS_TEST_ZOMPLEX *c, int *ldc ) { + + CBLAS_TEST_ZOMPLEX *A, *B, *C; + int i,j,LDA, LDB, LDC; + enum CBLAS_TRANSPOSE transa, transb; + + get_transpose_type(transpa, &transa); + get_transpose_type(transpb, &transb); + + if (*order == TEST_ROW_MJR) { + if (transa == 
CblasNoTrans) { + LDA = *k+1; + A=(CBLAS_TEST_ZOMPLEX*)malloc((*m)*LDA*sizeof(CBLAS_TEST_ZOMPLEX)); + for( i=0; i<*m; i++ ) + for( j=0; j<*k; j++ ) { + A[i*LDA+j].real=a[j*(*lda)+i].real; + A[i*LDA+j].imag=a[j*(*lda)+i].imag; + } + } + else { + LDA = *m+1; + A=(CBLAS_TEST_ZOMPLEX* )malloc(LDA*(*k)*sizeof(CBLAS_TEST_ZOMPLEX)); + for( i=0; i<*k; i++ ) + for( j=0; j<*m; j++ ) { + A[i*LDA+j].real=a[j*(*lda)+i].real; + A[i*LDA+j].imag=a[j*(*lda)+i].imag; + } + } + + if (transb == CblasNoTrans) { + LDB = *n+1; + B=(CBLAS_TEST_ZOMPLEX* )malloc((*k)*LDB*sizeof(CBLAS_TEST_ZOMPLEX) ); + for( i=0; i<*k; i++ ) + for( j=0; j<*n; j++ ) { + B[i*LDB+j].real=b[j*(*ldb)+i].real; + B[i*LDB+j].imag=b[j*(*ldb)+i].imag; + } + } + else { + LDB = *k+1; + B=(CBLAS_TEST_ZOMPLEX* )malloc(LDB*(*n)*sizeof(CBLAS_TEST_ZOMPLEX)); + for( i=0; i<*n; i++ ) + for( j=0; j<*k; j++ ) { + B[i*LDB+j].real=b[j*(*ldb)+i].real; + B[i*LDB+j].imag=b[j*(*ldb)+i].imag; + } + } + + LDC = *n+1; + C=(CBLAS_TEST_ZOMPLEX* )malloc((*m)*LDC*sizeof(CBLAS_TEST_ZOMPLEX)); + for( j=0; j<*n; j++ ) + for( i=0; i<*m; i++ ) { + C[i*LDC+j].real=c[j*(*ldc)+i].real; + C[i*LDC+j].imag=c[j*(*ldc)+i].imag; + } + cblas_zgemm3m( CblasRowMajor, transa, transb, *m, *n, *k, alpha, A, LDA, + B, LDB, beta, C, LDC ); + for( j=0; j<*n; j++ ) + for( i=0; i<*m; i++ ) { + c[j*(*ldc)+i].real=C[i*LDC+j].real; + c[j*(*ldc)+i].imag=C[i*LDC+j].imag; + } + free(A); + free(B); + free(C); + } + else if (*order == TEST_COL_MJR) + cblas_zgemm3m( CblasColMajor, transa, transb, *m, *n, *k, alpha, a, *lda, + b, *ldb, beta, c, *ldc ); + else + cblas_zgemm3m( UNDEFINED, transa, transb, *m, *n, *k, alpha, a, *lda, + b, *ldb, beta, c, *ldc ); +} + diff --git a/ctest/c_zblat3_3m.f b/ctest/c_zblat3_3m.f new file mode 100644 index 000000000..7390d8712 --- /dev/null +++ b/ctest/c_zblat3_3m.f @@ -0,0 +1,2791 @@ + PROGRAM ZBLAT3 +* +* Test program for the COMPLEX*16 Level 3 Blas. +* +* The program must be driven by a short data file. 
The first 13 records
+* of the file are read using list-directed input, the last 9 records
+* are read using the format ( A13,L2 ). An annotated example of a data
+* file can be obtained by deleting the first 3 characters from the
+* following 22 lines:
+* 'ZBLAT3.SNAP' NAME OF SNAPSHOT OUTPUT FILE
+* -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0)
+* F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD.
+* F LOGICAL FLAG, T TO STOP ON FAILURES.
+* T LOGICAL FLAG, T TO TEST ERROR EXITS.
+* 2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH
+* 16.0 THRESHOLD VALUE OF TEST RATIO
+* 6 NUMBER OF VALUES OF N
+* 0 1 2 3 5 9 VALUES OF N
+* 3 NUMBER OF VALUES OF ALPHA
+* (0.0,0.0) (1.0,0.0) (0.7,-0.9) VALUES OF ALPHA
+* 3 NUMBER OF VALUES OF BETA
+* (0.0,0.0) (1.0,0.0) (1.3,-1.1) VALUES OF BETA
+* ZGEMM T PUT F FOR NO TEST. SAME COLUMNS.
+* ZHEMM T PUT F FOR NO TEST. SAME COLUMNS.
+* ZSYMM T PUT F FOR NO TEST. SAME COLUMNS.
+* ZTRMM T PUT F FOR NO TEST. SAME COLUMNS.
+* ZTRSM T PUT F FOR NO TEST. SAME COLUMNS.
+* ZHERK T PUT F FOR NO TEST. SAME COLUMNS.
+* ZSYRK T PUT F FOR NO TEST. SAME COLUMNS.
+* ZHER2K T PUT F FOR NO TEST. SAME COLUMNS.
+* ZSYR2K T PUT F FOR NO TEST. SAME COLUMNS.
+*
+* See:
+*
+* Dongarra J. J., Du Croz J. J., Duff I. S. and Hammarling S.
+* A Set of Level 3 Basic Linear Algebra Subprograms.
+*
+* Technical Memorandum No.88 (Revision 1), Mathematics and
+* Computer Science Division, Argonne National Laboratory, 9700
+* South Cass Avenue, Argonne, Illinois 60439, US.
+*
+* -- Written on 8-February-1989.
+* Jack Dongarra, Argonne National Laboratory.
+* Iain Duff, AERE Harwell.
+* Jeremy Du Croz, Numerical Algorithms Group Ltd.
+* Sven Hammarling, Numerical Algorithms Group Ltd.
+*
+* .. Parameters ..
+ INTEGER NIN, NOUT + PARAMETER ( NIN = 5, NOUT = 6 ) + INTEGER NSUBS + PARAMETER ( NSUBS = 9 ) + COMPLEX*16 ZERO, ONE + PARAMETER ( ZERO = ( 0.0D0, 0.0D0 ), + $ ONE = ( 1.0D0, 0.0D0 ) ) + DOUBLE PRECISION RZERO, RHALF, RONE + PARAMETER ( RZERO = 0.0D0, RHALF = 0.5D0, RONE = 1.0D0 ) + INTEGER NMAX + PARAMETER ( NMAX = 65 ) + INTEGER NIDMAX, NALMAX, NBEMAX + PARAMETER ( NIDMAX = 9, NALMAX = 7, NBEMAX = 7 ) +* .. Local Scalars .. + DOUBLE PRECISION EPS, ERR, THRESH + INTEGER I, ISNUM, J, N, NALF, NBET, NIDIM, NTRA, + $ LAYOUT + LOGICAL FATAL, LTESTT, REWI, SAME, SFATAL, TRACE, + $ TSTERR, CORDER, RORDER + CHARACTER*1 TRANSA, TRANSB + CHARACTER*13 SNAMET + CHARACTER*32 SNAPS +* .. Local Arrays .. + COMPLEX*16 AA( NMAX*NMAX ), AB( NMAX, 2*NMAX ), + $ ALF( NALMAX ), AS( NMAX*NMAX ), + $ BB( NMAX*NMAX ), BET( NBEMAX ), + $ BS( NMAX*NMAX ), C( NMAX, NMAX ), + $ CC( NMAX*NMAX ), CS( NMAX*NMAX ), CT( NMAX ), + $ W( 2*NMAX ) + DOUBLE PRECISION G( NMAX ) + INTEGER IDIM( NIDMAX ) + LOGICAL LTEST( NSUBS ) + CHARACTER*13 SNAMES( NSUBS ) +* .. External Functions .. + DOUBLE PRECISION DDIFF + LOGICAL LZE + EXTERNAL DDIFF, LZE +* .. External Subroutines .. + EXTERNAL ZCHK1, ZCHK2, ZCHK3, ZCHK4, ZCHK5,ZMMCH +* .. Intrinsic Functions .. + INTRINSIC MAX, MIN +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL LERR, OK + CHARACTER*13 SRNAMT +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK, LERR + COMMON /SRNAMC/SRNAMT +* .. Data statements .. + DATA SNAMES/'cblas_zgemm3m ', 'cblas_zhemm ', + $ 'cblas_zsymm ', 'cblas_ztrmm ', 'cblas_ztrsm ', + $ 'cblas_zherk ', 'cblas_zsyrk ', 'cblas_zher2k', + $ 'cblas_zsyr2k'/ +* .. Executable Statements .. +* + NOUTC = NOUT +* +* Read name and unit number for snapshot output file and open file. +* + READ( NIN, FMT = * )SNAPS + READ( NIN, FMT = * )NTRA + TRACE = NTRA.GE.0 + IF( TRACE )THEN + OPEN( NTRA, FILE = SNAPS, STATUS = 'NEW' ) + END IF +* Read the flag that directs rewinding of the snapshot file. 
+ READ( NIN, FMT = * )REWI + REWI = REWI.AND.TRACE +* Read the flag that directs stopping on any failure. + READ( NIN, FMT = * )SFATAL +* Read the flag that indicates whether error exits are to be tested. + READ( NIN, FMT = * )TSTERR +* Read the flag that indicates whether row-major data layout to be tested. + READ( NIN, FMT = * )LAYOUT +* Read the threshold value of the test ratio + READ( NIN, FMT = * )THRESH +* +* Read and check the parameter values for the tests. +* +* Values of N + READ( NIN, FMT = * )NIDIM + IF( NIDIM.LT.1.OR.NIDIM.GT.NIDMAX )THEN + WRITE( NOUT, FMT = 9997 )'N', NIDMAX + GO TO 220 + END IF + READ( NIN, FMT = * )( IDIM( I ), I = 1, NIDIM ) + DO 10 I = 1, NIDIM + IF( IDIM( I ).LT.0.OR.IDIM( I ).GT.NMAX )THEN + WRITE( NOUT, FMT = 9996 )NMAX + GO TO 220 + END IF + 10 CONTINUE +* Values of ALPHA + READ( NIN, FMT = * )NALF + IF( NALF.LT.1.OR.NALF.GT.NALMAX )THEN + WRITE( NOUT, FMT = 9997 )'ALPHA', NALMAX + GO TO 220 + END IF + READ( NIN, FMT = * )( ALF( I ), I = 1, NALF ) +* Values of BETA + READ( NIN, FMT = * )NBET + IF( NBET.LT.1.OR.NBET.GT.NBEMAX )THEN + WRITE( NOUT, FMT = 9997 )'BETA', NBEMAX + GO TO 220 + END IF + READ( NIN, FMT = * )( BET( I ), I = 1, NBET ) +* +* Report values of parameters. +* + WRITE( NOUT, FMT = 9995 ) + WRITE( NOUT, FMT = 9994 )( IDIM( I ), I = 1, NIDIM ) + WRITE( NOUT, FMT = 9993 )( ALF( I ), I = 1, NALF ) + WRITE( NOUT, FMT = 9992 )( BET( I ), I = 1, NBET ) + IF( .NOT.TSTERR )THEN + WRITE( NOUT, FMT = * ) + WRITE( NOUT, FMT = 9984 ) + END IF + WRITE( NOUT, FMT = * ) + WRITE( NOUT, FMT = 9999 )THRESH + WRITE( NOUT, FMT = * ) + + RORDER = .FALSE. + CORDER = .FALSE. + IF (LAYOUT.EQ.2) THEN + RORDER = .TRUE. + CORDER = .TRUE. + WRITE( *, FMT = 10002 ) + ELSE IF (LAYOUT.EQ.1) THEN + RORDER = .TRUE. + WRITE( *, FMT = 10001 ) + ELSE IF (LAYOUT.EQ.0) THEN + CORDER = .TRUE. + WRITE( *, FMT = 10000 ) + END IF + WRITE( *, FMT = * ) + +* +* Read names of subroutines and flags which indicate +* whether they are to be tested. 
+* + DO 20 I = 1, NSUBS + LTEST( I ) = .FALSE. + 20 CONTINUE + 30 READ( NIN, FMT = 9988, END = 60 )SNAMET, LTESTT + DO 40 I = 1, NSUBS + IF( SNAMET.EQ.SNAMES( I ) ) + $ GO TO 50 + 40 CONTINUE + WRITE( NOUT, FMT = 9990 )SNAMET + STOP + 50 LTEST( I ) = LTESTT + GO TO 30 +* + 60 CONTINUE + CLOSE ( NIN ) +* +* Compute EPS (the machine precision). +* + EPS = RONE + 70 CONTINUE + IF( DDIFF( RONE + EPS, RONE ).EQ.RZERO ) + $ GO TO 80 + EPS = RHALF*EPS + GO TO 70 + 80 CONTINUE + EPS = EPS + EPS + WRITE( NOUT, FMT = 9998 )EPS +* +* Check the reliability of ZMMCH using exact data. +* + N = MIN( 32, NMAX ) + DO 100 J = 1, N + DO 90 I = 1, N + AB( I, J ) = MAX( I - J + 1, 0 ) + 90 CONTINUE + AB( J, NMAX + 1 ) = J + AB( 1, NMAX + J ) = J + C( J, 1 ) = ZERO + 100 CONTINUE + DO 110 J = 1, N + CC( J ) = J*( ( J + 1 )*J )/2 - ( ( J + 1 )*J*( J - 1 ) )/3 + 110 CONTINUE +* CC holds the exact result. On exit from ZMMCH CT holds +* the result computed by ZMMCH. + TRANSA = 'N' + TRANSB = 'N' + CALL ZMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX, + $ AB( 1, NMAX + 1 ), NMAX, ZERO, C, NMAX, CT, G, CC, + $ NMAX, EPS, ERR, FATAL, NOUT, .TRUE. ) + SAME = LZE( CC, CT, N ) + IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN + WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR + STOP + END IF + TRANSB = 'C' + CALL ZMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX, + $ AB( 1, NMAX + 1 ), NMAX, ZERO, C, NMAX, CT, G, CC, + $ NMAX, EPS, ERR, FATAL, NOUT, .TRUE. ) + SAME = LZE( CC, CT, N ) + IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN + WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR + STOP + END IF + DO 120 J = 1, N + AB( J, NMAX + 1 ) = N - J + 1 + AB( 1, NMAX + J ) = N - J + 1 + 120 CONTINUE + DO 130 J = 1, N + CC( N - J + 1 ) = J*( ( J + 1 )*J )/2 - + $ ( ( J + 1 )*J*( J - 1 ) )/3 + 130 CONTINUE + TRANSA = 'C' + TRANSB = 'N' + CALL ZMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX, + $ AB( 1, NMAX + 1 ), NMAX, ZERO, C, NMAX, CT, G, CC, + $ NMAX, EPS, ERR, FATAL, NOUT, .TRUE. 
) + SAME = LZE( CC, CT, N ) + IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN + WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR + STOP + END IF + TRANSB = 'C' + CALL ZMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX, + $ AB( 1, NMAX + 1 ), NMAX, ZERO, C, NMAX, CT, G, CC, + $ NMAX, EPS, ERR, FATAL, NOUT, .TRUE. ) + SAME = LZE( CC, CT, N ) + IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN + WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR + STOP + END IF +* +* Test each subroutine in turn. +* + DO 200 ISNUM = 1, NSUBS + WRITE( NOUT, FMT = * ) + IF( .NOT.LTEST( ISNUM ) )THEN +* Subprogram is not to be tested. + WRITE( NOUT, FMT = 9987 )SNAMES( ISNUM ) + ELSE + SRNAMT = SNAMES( ISNUM ) +* Test error exits. + IF( TSTERR )THEN + CALL CZ3CHKE( SNAMES( ISNUM ) ) + WRITE( NOUT, FMT = * ) + END IF +* Test computations. + INFOT = 0 + OK = .TRUE. + FATAL = .FALSE. + GO TO ( 140, 150, 150, 160, 160, 170, 170, + $ 180, 180 )ISNUM +* Test ZGEMM, 01. + 140 IF (CORDER) THEN + CALL ZCHK1(SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, + $ NMAX, AB, AA, AS, AB( 1, NMAX + 1 ), BB, BS, C, + $ CC, CS, CT, G, 0 ) + END IF + IF (RORDER) THEN + CALL ZCHK1(SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, + $ NMAX, AB, AA, AS, AB( 1, NMAX + 1 ), BB, BS, C, + $ CC, CS, CT, G, 1 ) + END IF + GO TO 190 +* Test ZHEMM, 02, ZSYMM, 03. + 150 IF (CORDER) THEN + CALL ZCHK2(SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, + $ NMAX, AB, AA, AS, AB( 1, NMAX + 1 ), BB, BS, C, + $ CC, CS, CT, G, 0 ) + END IF + IF (RORDER) THEN + CALL ZCHK2(SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, + $ NMAX, AB, AA, AS, AB( 1, NMAX + 1 ), BB, BS, C, + $ CC, CS, CT, G, 1 ) + END IF + GO TO 190 +* Test ZTRMM, 04, ZTRSM, 05. 
+ 160 IF (CORDER) THEN + CALL ZCHK3(SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NMAX, AB, + $ AA, AS, AB( 1, NMAX + 1 ), BB, BS, CT, G, C, + $ 0 ) + END IF + IF (RORDER) THEN + CALL ZCHK3(SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NMAX, AB, + $ AA, AS, AB( 1, NMAX + 1 ), BB, BS, CT, G, C, + $ 1 ) + END IF + GO TO 190 +* Test ZHERK, 06, ZSYRK, 07. + 170 IF (CORDER) THEN + CALL ZCHK4(SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, + $ NMAX, AB, AA, AS, AB( 1, NMAX + 1 ), BB, BS, C, + $ CC, CS, CT, G, 0 ) + END IF + IF (RORDER) THEN + CALL ZCHK4(SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, + $ NMAX, AB, AA, AS, AB( 1, NMAX + 1 ), BB, BS, C, + $ CC, CS, CT, G, 1 ) + END IF + GO TO 190 +* Test ZHER2K, 08, ZSYR2K, 09. + 180 IF (CORDER) THEN + CALL ZCHK5(SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, + $ NMAX, AB, AA, AS, BB, BS, C, CC, CS, CT, G, W, + $ 0 ) + END IF + IF (RORDER) THEN + CALL ZCHK5(SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, + $ NMAX, AB, AA, AS, BB, BS, C, CC, CS, CT, G, W, + $ 1 ) + END IF + GO TO 190 +* + 190 IF( FATAL.AND.SFATAL ) + $ GO TO 210 + END IF + 200 CONTINUE + WRITE( NOUT, FMT = 9986 ) + GO TO 230 +* + 210 CONTINUE + WRITE( NOUT, FMT = 9985 ) + GO TO 230 +* + 220 CONTINUE + WRITE( NOUT, FMT = 9991 ) +* + 230 CONTINUE + IF( TRACE ) + $ CLOSE ( NTRA ) + CLOSE ( NOUT ) + STOP +* +10002 FORMAT( ' COLUMN-MAJOR AND ROW-MAJOR DATA LAYOUTS ARE TESTED' ) +10001 FORMAT(' ROW-MAJOR DATA LAYOUT IS TESTED' ) +10000 FORMAT(' COLUMN-MAJOR DATA LAYOUT IS TESTED' ) + 9999 FORMAT(' ROUTINES PASS COMPUTATIONAL TESTS IF TEST RATIO IS LES', + $ 'S THAN', F8.2 ) + 9998 FORMAT(' RELATIVE MACHINE PRECISION IS TAKEN TO BE', 1P, E9.1 ) + 9997 FORMAT(' NUMBER 
OF VALUES OF ', A, ' IS LESS THAN 1 OR GREATER ', + $ 'THAN ', I2 ) + 9996 FORMAT( ' VALUE OF N IS LESS THAN 0 OR GREATER THAN ', I2 ) + 9995 FORMAT('TESTS OF THE COMPLEX*16 LEVEL 3 BLAS', //' THE F', + $ 'OLLOWING PARAMETER VALUES WILL BE USED:' ) + 9994 FORMAT( ' FOR N ', 9I6 ) + 9993 FORMAT( ' FOR ALPHA ', + $ 7( '(', F4.1, ',', F4.1, ') ', : ) ) + 9992 FORMAT( ' FOR BETA ', + $ 7( '(', F4.1, ',', F4.1, ') ', : ) ) + 9991 FORMAT( ' AMEND DATA FILE OR INCREASE ARRAY SIZES IN PROGRAM', + $ /' ******* TESTS ABANDONED *******' ) + 9990 FORMAT(' SUBPROGRAM NAME ', A13,' NOT RECOGNIZED', /' ******* T', + $ 'ESTS ABANDONED *******' ) + 9989 FORMAT(' ERROR IN ZMMCH - IN-LINE DOT PRODUCTS ARE BEING EVALU', + $ 'ATED WRONGLY.', /' ZMMCH WAS CALLED WITH TRANSA = ', A1, + $ 'AND TRANSB = ', A1, /' AND RETURNED SAME = ', L1, ' AND ', + $ ' ERR = ', F12.3, '.', /' THIS MAY BE DUE TO FAULTS IN THE ', + $ 'ARITHMETIC OR THE COMPILER.', /' ******* TESTS ABANDONED ', + $ '*******' ) + 9988 FORMAT( A13,L2 ) + 9987 FORMAT( 1X, A13,' WAS NOT TESTED' ) + 9986 FORMAT( /' END OF TESTS' ) + 9985 FORMAT( /' ******* FATAL ERROR - TESTS ABANDONED *******' ) + 9984 FORMAT( ' ERROR-EXITS WILL NOT BE TESTED' ) +* +* End of ZBLAT3. +* + END + SUBROUTINE ZCHK1( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, + $ FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, NMAX, + $ A, AA, AS, B, BB, BS, C, CC, CS, CT, G, + $ IORDER ) +* +* Tests ZGEMM. +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Parameters .. + COMPLEX*16 ZERO + PARAMETER ( ZERO = ( 0.0, 0.0 ) ) + DOUBLE PRECISION RZERO + PARAMETER ( RZERO = 0.0 ) +* .. Scalar Arguments .. + DOUBLE PRECISION EPS, THRESH + INTEGER NALF, NBET, NIDIM, NMAX, NOUT, NTRA, IORDER + LOGICAL FATAL, REWI, TRACE + CHARACTER*13 SNAME +* .. 
Array Arguments .. + COMPLEX*16 A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), + $ AS( NMAX*NMAX ), B( NMAX, NMAX ), + $ BB( NMAX*NMAX ), BET( NBET ), BS( NMAX*NMAX ), + $ C( NMAX, NMAX ), CC( NMAX*NMAX ), + $ CS( NMAX*NMAX ), CT( NMAX ) + DOUBLE PRECISION G( NMAX ) + INTEGER IDIM( NIDIM ) +* .. Local Scalars .. + COMPLEX*16 ALPHA, ALS, BETA, BLS + DOUBLE PRECISION ERR, ERRMAX + INTEGER I, IA, IB, ICA, ICB, IK, IM, IN, K, KS, LAA, + $ LBB, LCC, LDA, LDAS, LDB, LDBS, LDC, LDCS, M, + $ MA, MB, MS, N, NA, NARGS, NB, NC, NS + LOGICAL NULL, RESET, SAME, TRANA, TRANB + CHARACTER*1 TRANAS, TRANBS, TRANSA, TRANSB + CHARACTER*3 ICH +* .. Local Arrays .. + LOGICAL ISAME( 13 ) +* .. External Functions .. + LOGICAL LZE, LZERES + EXTERNAL LZE, LZERES +* .. External Subroutines .. + EXTERNAL CZGEMM3M, ZMAKE, ZMMCH +* .. Intrinsic Functions .. + INTRINSIC MAX +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL LERR, OK +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK, LERR +* .. Data statements .. + DATA ICH/'NTC'/ +* .. Executable Statements .. +* + NARGS = 13 + NC = 0 + RESET = .TRUE. + ERRMAX = RZERO +* + DO 110 IM = 1, NIDIM + M = IDIM( IM ) +* + DO 100 IN = 1, NIDIM + N = IDIM( IN ) +* Set LDC to 1 more than minimum value if room. + LDC = M + IF( LDC.LT.NMAX ) + $ LDC = LDC + 1 +* Skip tests if not enough room. + IF( LDC.GT.NMAX ) + $ GO TO 100 + LCC = LDC*N + NULL = N.LE.0.OR.M.LE.0 +* + DO 90 IK = 1, NIDIM + K = IDIM( IK ) +* + DO 80 ICA = 1, 3 + TRANSA = ICH( ICA: ICA ) + TRANA = TRANSA.EQ.'T'.OR.TRANSA.EQ.'C' +* + IF( TRANA )THEN + MA = K + NA = M + ELSE + MA = M + NA = K + END IF +* Set LDA to 1 more than minimum value if room. + LDA = MA + IF( LDA.LT.NMAX ) + $ LDA = LDA + 1 +* Skip tests if not enough room. + IF( LDA.GT.NMAX ) + $ GO TO 80 + LAA = LDA*NA +* +* Generate the matrix A. 
+* + CALL ZMAKE( 'ge', ' ', ' ', MA, NA, A, NMAX, AA, LDA, + $ RESET, ZERO ) +* + DO 70 ICB = 1, 3 + TRANSB = ICH( ICB: ICB ) + TRANB = TRANSB.EQ.'T'.OR.TRANSB.EQ.'C' +* + IF( TRANB )THEN + MB = N + NB = K + ELSE + MB = K + NB = N + END IF +* Set LDB to 1 more than minimum value if room. + LDB = MB + IF( LDB.LT.NMAX ) + $ LDB = LDB + 1 +* Skip tests if not enough room. + IF( LDB.GT.NMAX ) + $ GO TO 70 + LBB = LDB*NB +* +* Generate the matrix B. +* + CALL ZMAKE( 'ge', ' ', ' ', MB, NB, B, NMAX, BB, + $ LDB, RESET, ZERO ) +* + DO 60 IA = 1, NALF + ALPHA = ALF( IA ) +* + DO 50 IB = 1, NBET + BETA = BET( IB ) +* +* Generate the matrix C. +* + CALL ZMAKE( 'ge', ' ', ' ', M, N, C, NMAX, + $ CC, LDC, RESET, ZERO ) +* + NC = NC + 1 +* +* Save every datum before calling the +* subroutine. +* + TRANAS = TRANSA + TRANBS = TRANSB + MS = M + NS = N + KS = K + ALS = ALPHA + DO 10 I = 1, LAA + AS( I ) = AA( I ) + 10 CONTINUE + LDAS = LDA + DO 20 I = 1, LBB + BS( I ) = BB( I ) + 20 CONTINUE + LDBS = LDB + BLS = BETA + DO 30 I = 1, LCC + CS( I ) = CC( I ) + 30 CONTINUE + LDCS = LDC +* +* Call the subroutine. +* + IF( TRACE ) + $ CALL ZPRCN1(NTRA, NC, SNAME, IORDER, + $ TRANSA, TRANSB, M, N, K, ALPHA, LDA, + $ LDB, BETA, LDC) + IF( REWI ) + $ REWIND NTRA + CALL CZGEMM3M( IORDER, TRANSA, TRANSB, M, N, + $ K, ALPHA, AA, LDA, BB, LDB, + $ BETA, CC, LDC ) +* +* Check if error-exit was taken incorrectly. +* + IF( .NOT.OK )THEN + WRITE( NOUT, FMT = 9994 ) + FATAL = .TRUE. + GO TO 120 + END IF +* +* See what data changed inside subroutines. 
+* + ISAME( 1 ) = TRANSA.EQ.TRANAS + ISAME( 2 ) = TRANSB.EQ.TRANBS + ISAME( 3 ) = MS.EQ.M + ISAME( 4 ) = NS.EQ.N + ISAME( 5 ) = KS.EQ.K + ISAME( 6 ) = ALS.EQ.ALPHA + ISAME( 7 ) = LZE( AS, AA, LAA ) + ISAME( 8 ) = LDAS.EQ.LDA + ISAME( 9 ) = LZE( BS, BB, LBB ) + ISAME( 10 ) = LDBS.EQ.LDB + ISAME( 11 ) = BLS.EQ.BETA + IF( NULL )THEN + ISAME( 12 ) = LZE( CS, CC, LCC ) + ELSE + ISAME( 12 ) = LZERES( 'ge', ' ', M, N, CS, + $ CC, LDC ) + END IF + ISAME( 13 ) = LDCS.EQ.LDC +* +* If data was incorrectly changed, report +* and return. +* + SAME = .TRUE. + DO 40 I = 1, NARGS + SAME = SAME.AND.ISAME( I ) + IF( .NOT.ISAME( I ) ) + $ WRITE( NOUT, FMT = 9998 )I + 40 CONTINUE + IF( .NOT.SAME )THEN + FATAL = .TRUE. + GO TO 120 + END IF +* + IF( .NOT.NULL )THEN +* +* Check the result. +* + CALL ZMMCH( TRANSA, TRANSB, M, N, K, + $ ALPHA, A, NMAX, B, NMAX, BETA, + $ C, NMAX, CT, G, CC, LDC, EPS, + $ ERR, FATAL, NOUT, .TRUE. ) + ERRMAX = MAX( ERRMAX, ERR ) +* If got really bad answer, report and +* return. + IF( FATAL ) + $ GO TO 120 + END IF +* + 50 CONTINUE +* + 60 CONTINUE +* + 70 CONTINUE +* + 80 CONTINUE +* + 90 CONTINUE +* + 100 CONTINUE +* + 110 CONTINUE +* +* Report result. 
+* + IF( ERRMAX.LT.THRESH )THEN + IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10000 )SNAME, NC + IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10001 )SNAME, NC + ELSE + IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10002 )SNAME, NC, ERRMAX + IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10003 )SNAME, NC, ERRMAX + END IF + GO TO 130 +* + 120 CONTINUE + WRITE( NOUT, FMT = 9996 )SNAME + CALL ZPRCN1(NOUT, NC, SNAME, IORDER, TRANSA, TRANSB, + $ M, N, K, ALPHA, LDA, LDB, BETA, LDC) +* + 130 CONTINUE + RETURN +* +10003 FORMAT( ' ', A13,' COMPLETED THE ROW-MAJOR COMPUTATIONAL ', + $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', + $ 'RATIO ', F8.2, ' - SUSPECT *******' ) +10002 FORMAT( ' ', A13,' COMPLETED THE COLUMN-MAJOR COMPUTATIONAL ', + $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', + $ 'RATIO ', F8.2, ' - SUSPECT *******' ) +10001 FORMAT( ' ', A13,' PASSED THE ROW-MAJOR COMPUTATIONAL TESTS', + $ ' (', I6, ' CALL', 'S)' ) +10000 FORMAT( ' ', A13,' PASSED THE COLUMN-MAJOR COMPUTATIONAL TESTS', + $ ' (', I6, ' CALL', 'S)' ) + 9998 FORMAT(' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', + $ 'ANGED INCORRECTLY *******' ) + 9996 FORMAT( ' ******* ', A13,' FAILED ON CALL NUMBER:' ) + 9995 FORMAT( 1X, I6, ': ', A13,'(''', A1, ''',''', A1, ''',', + $ 3( I3, ',' ), '(', F4.1, ',', F4.1, '), A,', I3, ', B,', I3, + $ ',(', F4.1, ',', F4.1, '), C,', I3, ').' ) + 9994 FORMAT(' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', + $ '******' ) +* +* End of ZCHK1. 
+* + END +* + SUBROUTINE ZPRCN1(NOUT, NC, SNAME, IORDER, TRANSA, TRANSB, M, N, + $ K, ALPHA, LDA, LDB, BETA, LDC) + INTEGER NOUT, NC, IORDER, M, N, K, LDA, LDB, LDC + DOUBLE COMPLEX ALPHA, BETA + CHARACTER*1 TRANSA, TRANSB + CHARACTER*13 SNAME + CHARACTER*14 CRC, CTA,CTB + + IF (TRANSA.EQ.'N')THEN + CTA = ' CblasNoTrans' + ELSE IF (TRANSA.EQ.'T')THEN + CTA = ' CblasTrans' + ELSE + CTA = 'CblasConjTrans' + END IF + IF (TRANSB.EQ.'N')THEN + CTB = ' CblasNoTrans' + ELSE IF (TRANSB.EQ.'T')THEN + CTB = ' CblasTrans' + ELSE + CTB = 'CblasConjTrans' + END IF + IF (IORDER.EQ.1)THEN + CRC = ' CblasRowMajor' + ELSE + CRC = ' CblasColMajor' + END IF + WRITE(NOUT, FMT = 9995)NC,SNAME,CRC, CTA,CTB + WRITE(NOUT, FMT = 9994)M, N, K, ALPHA, LDA, LDB, BETA, LDC + + 9995 FORMAT( 1X, I6, ': ', A13,'(', A15, ',', A14, ',', A14, ',') + 9994 FORMAT( 10X, 3( I3, ',' ) ,' (', F4.1,',',F4.1,') , A,', + $ I3, ', B,', I3, ', (', F4.1,',',F4.1,') , C,', I3, ').' ) + END +* + SUBROUTINE ZCHK2( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, + $ FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, NMAX, + $ A, AA, AS, B, BB, BS, C, CC, CS, CT, G, + $ IORDER ) +* +* Tests ZHEMM and ZSYMM. +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Parameters .. + COMPLEX*16 ZERO + PARAMETER ( ZERO = ( 0.0D0, 0.0D0 ) ) + DOUBLE PRECISION RZERO + PARAMETER ( RZERO = 0.0D0 ) +* .. Scalar Arguments .. + DOUBLE PRECISION EPS, THRESH + INTEGER NALF, NBET, NIDIM, NMAX, NOUT, NTRA, IORDER + LOGICAL FATAL, REWI, TRACE + CHARACTER*13 SNAME +* .. Array Arguments .. 
+ COMPLEX*16 A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), + $ AS( NMAX*NMAX ), B( NMAX, NMAX ), + $ BB( NMAX*NMAX ), BET( NBET ), BS( NMAX*NMAX ), + $ C( NMAX, NMAX ), CC( NMAX*NMAX ), + $ CS( NMAX*NMAX ), CT( NMAX ) + DOUBLE PRECISION G( NMAX ) + INTEGER IDIM( NIDIM ) +* .. Local Scalars .. + COMPLEX*16 ALPHA, ALS, BETA, BLS + DOUBLE PRECISION ERR, ERRMAX + INTEGER I, IA, IB, ICS, ICU, IM, IN, LAA, LBB, LCC, + $ LDA, LDAS, LDB, LDBS, LDC, LDCS, M, MS, N, NA, + $ NARGS, NC, NS + LOGICAL CONJ, LEFT, NULL, RESET, SAME + CHARACTER*1 SIDE, SIDES, UPLO, UPLOS + CHARACTER*2 ICHS, ICHU +* .. Local Arrays .. + LOGICAL ISAME( 13 ) +* .. External Functions .. + LOGICAL LZE, LZERES + EXTERNAL LZE, LZERES +* .. External Subroutines .. + EXTERNAL CZHEMM, ZMAKE, ZMMCH, CZSYMM +* .. Intrinsic Functions .. + INTRINSIC MAX +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL LERR, OK +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK, LERR +* .. Data statements .. + DATA ICHS/'LR'/, ICHU/'UL'/ +* .. Executable Statements .. + CONJ = SNAME( 8: 9 ).EQ.'he' +* + NARGS = 12 + NC = 0 + RESET = .TRUE. + ERRMAX = RZERO +* + DO 100 IM = 1, NIDIM + M = IDIM( IM ) +* + DO 90 IN = 1, NIDIM + N = IDIM( IN ) +* Set LDC to 1 more than minimum value if room. + LDC = M + IF( LDC.LT.NMAX ) + $ LDC = LDC + 1 +* Skip tests if not enough room. + IF( LDC.GT.NMAX ) + $ GO TO 90 + LCC = LDC*N + NULL = N.LE.0.OR.M.LE.0 +* Set LDB to 1 more than minimum value if room. + LDB = M + IF( LDB.LT.NMAX ) + $ LDB = LDB + 1 +* Skip tests if not enough room. + IF( LDB.GT.NMAX ) + $ GO TO 90 + LBB = LDB*N +* +* Generate the matrix B. +* + CALL ZMAKE( 'ge', ' ', ' ', M, N, B, NMAX, BB, LDB, RESET, + $ ZERO ) +* + DO 80 ICS = 1, 2 + SIDE = ICHS( ICS: ICS ) + LEFT = SIDE.EQ.'L' +* + IF( LEFT )THEN + NA = M + ELSE + NA = N + END IF +* Set LDA to 1 more than minimum value if room. + LDA = NA + IF( LDA.LT.NMAX ) + $ LDA = LDA + 1 +* Skip tests if not enough room. 
+ IF( LDA.GT.NMAX ) + $ GO TO 80 + LAA = LDA*NA +* + DO 70 ICU = 1, 2 + UPLO = ICHU( ICU: ICU ) +* +* Generate the hermitian or symmetric matrix A. +* + CALL ZMAKE(SNAME( 8: 9 ), UPLO, ' ', NA, NA, A, NMAX, + $ AA, LDA, RESET, ZERO ) +* + DO 60 IA = 1, NALF + ALPHA = ALF( IA ) +* + DO 50 IB = 1, NBET + BETA = BET( IB ) +* +* Generate the matrix C. +* + CALL ZMAKE( 'ge', ' ', ' ', M, N, C, NMAX, CC, + $ LDC, RESET, ZERO ) +* + NC = NC + 1 +* +* Save every datum before calling the +* subroutine. +* + SIDES = SIDE + UPLOS = UPLO + MS = M + NS = N + ALS = ALPHA + DO 10 I = 1, LAA + AS( I ) = AA( I ) + 10 CONTINUE + LDAS = LDA + DO 20 I = 1, LBB + BS( I ) = BB( I ) + 20 CONTINUE + LDBS = LDB + BLS = BETA + DO 30 I = 1, LCC + CS( I ) = CC( I ) + 30 CONTINUE + LDCS = LDC +* +* Call the subroutine. +* + IF( TRACE ) + $ CALL ZPRCN2(NTRA, NC, SNAME, IORDER, + $ SIDE, UPLO, M, N, ALPHA, LDA, LDB, + $ BETA, LDC) + IF( REWI ) + $ REWIND NTRA + IF( CONJ )THEN + CALL CZHEMM( IORDER, SIDE, UPLO, M, N, + $ ALPHA, AA, LDA, BB, LDB, BETA, + $ CC, LDC ) + ELSE + CALL CZSYMM( IORDER, SIDE, UPLO, M, N, + $ ALPHA, AA, LDA, BB, LDB, BETA, + $ CC, LDC ) + END IF +* +* Check if error-exit was taken incorrectly. +* + IF( .NOT.OK )THEN + WRITE( NOUT, FMT = 9994 ) + FATAL = .TRUE. + GO TO 110 + END IF +* +* See what data changed inside subroutines. +* + ISAME( 1 ) = SIDES.EQ.SIDE + ISAME( 2 ) = UPLOS.EQ.UPLO + ISAME( 3 ) = MS.EQ.M + ISAME( 4 ) = NS.EQ.N + ISAME( 5 ) = ALS.EQ.ALPHA + ISAME( 6 ) = LZE( AS, AA, LAA ) + ISAME( 7 ) = LDAS.EQ.LDA + ISAME( 8 ) = LZE( BS, BB, LBB ) + ISAME( 9 ) = LDBS.EQ.LDB + ISAME( 10 ) = BLS.EQ.BETA + IF( NULL )THEN + ISAME( 11 ) = LZE( CS, CC, LCC ) + ELSE + ISAME( 11 ) = LZERES( 'ge', ' ', M, N, CS, + $ CC, LDC ) + END IF + ISAME( 12 ) = LDCS.EQ.LDC +* +* If data was incorrectly changed, report and +* return. +* + SAME = .TRUE. 
+ DO 40 I = 1, NARGS + SAME = SAME.AND.ISAME( I ) + IF( .NOT.ISAME( I ) ) + $ WRITE( NOUT, FMT = 9998 )I + 40 CONTINUE + IF( .NOT.SAME )THEN + FATAL = .TRUE. + GO TO 110 + END IF +* + IF( .NOT.NULL )THEN +* +* Check the result. +* + IF( LEFT )THEN + CALL ZMMCH( 'N', 'N', M, N, M, ALPHA, A, + $ NMAX, B, NMAX, BETA, C, NMAX, + $ CT, G, CC, LDC, EPS, ERR, + $ FATAL, NOUT, .TRUE. ) + ELSE + CALL ZMMCH( 'N', 'N', M, N, N, ALPHA, B, + $ NMAX, A, NMAX, BETA, C, NMAX, + $ CT, G, CC, LDC, EPS, ERR, + $ FATAL, NOUT, .TRUE. ) + END IF + ERRMAX = MAX( ERRMAX, ERR ) +* If got really bad answer, report and +* return. + IF( FATAL ) + $ GO TO 110 + END IF +* + 50 CONTINUE +* + 60 CONTINUE +* + 70 CONTINUE +* + 80 CONTINUE +* + 90 CONTINUE +* + 100 CONTINUE +* +* Report result. +* + IF( ERRMAX.LT.THRESH )THEN + IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10000 )SNAME, NC + IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10001 )SNAME, NC + ELSE + IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10002 )SNAME, NC, ERRMAX + IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10003 )SNAME, NC, ERRMAX + END IF + GO TO 120 +* + 110 CONTINUE + WRITE( NOUT, FMT = 9996 )SNAME + CALL ZPRCN2(NOUT, NC, SNAME, IORDER, SIDE, UPLO, M, N, ALPHA, LDA, + $ LDB, BETA, LDC) +* + 120 CONTINUE + RETURN +* +10003 FORMAT( ' ', A13,' COMPLETED THE ROW-MAJOR COMPUTATIONAL ', + $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', + $ 'RATIO ', F8.2, ' - SUSPECT *******' ) +10002 FORMAT( ' ', A13,' COMPLETED THE COLUMN-MAJOR COMPUTATIONAL ', + $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', + $ 'RATIO ', F8.2, ' - SUSPECT *******' ) +10001 FORMAT( ' ', A13,' PASSED THE ROW-MAJOR COMPUTATIONAL TESTS', + $ ' (', I6, ' CALL', 'S)' ) +10000 FORMAT( ' ', A13,' PASSED THE COLUMN-MAJOR COMPUTATIONAL TESTS', + $ ' (', I6, ' CALL', 'S)' ) + 9998 FORMAT(' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', + $ 'ANGED INCORRECTLY *******' ) + 9996 FORMAT( ' ******* ', A13,' FAILED ON CALL NUMBER:' ) + 9995 FORMAT(1X, I6, ': ', 
A13,'(', 2( '''', A1, ''',' ), 2( I3, ',' ), + $ '(', F4.1, ',', F4.1, '), A,', I3, ', B,', I3, ',(', F4.1, + $ ',', F4.1, '), C,', I3, ') .' ) + 9994 FORMAT(' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', + $ '******' ) +* +* End of ZCHK2. +* + END +* + SUBROUTINE ZPRCN2(NOUT, NC, SNAME, IORDER, SIDE, UPLO, M, N, + $ ALPHA, LDA, LDB, BETA, LDC) + INTEGER NOUT, NC, IORDER, M, N, LDA, LDB, LDC + DOUBLE COMPLEX ALPHA, BETA + CHARACTER*1 SIDE, UPLO + CHARACTER*13 SNAME + CHARACTER*14 CRC, CS,CU + + IF (SIDE.EQ.'L')THEN + CS = ' CblasLeft' + ELSE + CS = ' CblasRight' + END IF + IF (UPLO.EQ.'U')THEN + CU = ' CblasUpper' + ELSE + CU = ' CblasLower' + END IF + IF (IORDER.EQ.1)THEN + CRC = ' CblasRowMajor' + ELSE + CRC = ' CblasColMajor' + END IF + WRITE(NOUT, FMT = 9995)NC,SNAME,CRC, CS,CU + WRITE(NOUT, FMT = 9994)M, N, ALPHA, LDA, LDB, BETA, LDC + + 9995 FORMAT( 1X, I6, ': ', A13,'(', A15, ',', A14, ',', A14, ',') + 9994 FORMAT( 10X, 2( I3, ',' ),' (',F4.1,',',F4.1, '), A,', I3, + $ ', B,', I3, ', (',F4.1,',',F4.1, '), ', 'C,', I3, ').' ) + END +* + SUBROUTINE ZCHK3( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, + $ FATAL, NIDIM, IDIM, NALF, ALF, NMAX, A, AA, AS, + $ B, BB, BS, CT, G, C, IORDER ) +* +* Tests ZTRMM and ZTRSM. +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Parameters .. + COMPLEX*16 ZERO, ONE + PARAMETER ( ZERO = ( 0.0D0, 0.0D0 ), ONE = ( 1.0D0, 0.0D0 ) ) + DOUBLE PRECISION RZERO + PARAMETER ( RZERO = 0.0D0 ) +* .. Scalar Arguments .. + DOUBLE PRECISION EPS, THRESH + INTEGER NALF, NIDIM, NMAX, NOUT, NTRA, IORDER + LOGICAL FATAL, REWI, TRACE + CHARACTER*13 SNAME +* .. Array Arguments .. 
+ COMPLEX*16 A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), + $ AS( NMAX*NMAX ), B( NMAX, NMAX ), + $ BB( NMAX*NMAX ), BS( NMAX*NMAX ), + $ C( NMAX, NMAX ), CT( NMAX ) + DOUBLE PRECISION G( NMAX ) + INTEGER IDIM( NIDIM ) +* .. Local Scalars .. + COMPLEX*16 ALPHA, ALS + DOUBLE PRECISION ERR, ERRMAX + INTEGER I, IA, ICD, ICS, ICT, ICU, IM, IN, J, LAA, LBB, + $ LDA, LDAS, LDB, LDBS, M, MS, N, NA, NARGS, NC, + $ NS + LOGICAL LEFT, NULL, RESET, SAME + CHARACTER*1 DIAG, DIAGS, SIDE, SIDES, TRANAS, TRANSA, UPLO, + $ UPLOS + CHARACTER*2 ICHD, ICHS, ICHU + CHARACTER*3 ICHT +* .. Local Arrays .. + LOGICAL ISAME( 13 ) +* .. External Functions .. + LOGICAL LZE, LZERES + EXTERNAL LZE, LZERES +* .. External Subroutines .. + EXTERNAL ZMAKE, ZMMCH, CZTRMM, CZTRSM +* .. Intrinsic Functions .. + INTRINSIC MAX +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL LERR, OK +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK, LERR +* .. Data statements .. + DATA ICHU/'UL'/, ICHT/'NTC'/, ICHD/'UN'/, ICHS/'LR'/ +* .. Executable Statements .. +* + NARGS = 11 + NC = 0 + RESET = .TRUE. + ERRMAX = RZERO +* Set up zero matrix for ZMMCH. + DO 20 J = 1, NMAX + DO 10 I = 1, NMAX + C( I, J ) = ZERO + 10 CONTINUE + 20 CONTINUE +* + DO 140 IM = 1, NIDIM + M = IDIM( IM ) +* + DO 130 IN = 1, NIDIM + N = IDIM( IN ) +* Set LDB to 1 more than minimum value if room. + LDB = M + IF( LDB.LT.NMAX ) + $ LDB = LDB + 1 +* Skip tests if not enough room. + IF( LDB.GT.NMAX ) + $ GO TO 130 + LBB = LDB*N + NULL = M.LE.0.OR.N.LE.0 +* + DO 120 ICS = 1, 2 + SIDE = ICHS( ICS: ICS ) + LEFT = SIDE.EQ.'L' + IF( LEFT )THEN + NA = M + ELSE + NA = N + END IF +* Set LDA to 1 more than minimum value if room. + LDA = NA + IF( LDA.LT.NMAX ) + $ LDA = LDA + 1 +* Skip tests if not enough room. 
+ IF( LDA.GT.NMAX ) + $ GO TO 130 + LAA = LDA*NA +* + DO 110 ICU = 1, 2 + UPLO = ICHU( ICU: ICU ) +* + DO 100 ICT = 1, 3 + TRANSA = ICHT( ICT: ICT ) +* + DO 90 ICD = 1, 2 + DIAG = ICHD( ICD: ICD ) +* + DO 80 IA = 1, NALF + ALPHA = ALF( IA ) +* +* Generate the matrix A. +* + CALL ZMAKE( 'tr', UPLO, DIAG, NA, NA, A, + $ NMAX, AA, LDA, RESET, ZERO ) +* +* Generate the matrix B. +* + CALL ZMAKE( 'ge', ' ', ' ', M, N, B, NMAX, + $ BB, LDB, RESET, ZERO ) +* + NC = NC + 1 +* +* Save every datum before calling the +* subroutine. +* + SIDES = SIDE + UPLOS = UPLO + TRANAS = TRANSA + DIAGS = DIAG + MS = M + NS = N + ALS = ALPHA + DO 30 I = 1, LAA + AS( I ) = AA( I ) + 30 CONTINUE + LDAS = LDA + DO 40 I = 1, LBB + BS( I ) = BB( I ) + 40 CONTINUE + LDBS = LDB +* +* Call the subroutine. +* + IF( SNAME( 10: 11 ).EQ.'mm' )THEN + IF( TRACE ) + $ CALL ZPRCN3( NTRA, NC, SNAME, IORDER, + $ SIDE, UPLO, TRANSA, DIAG, M, N, ALPHA, + $ LDA, LDB) + IF( REWI ) + $ REWIND NTRA + CALL CZTRMM(IORDER, SIDE, UPLO, TRANSA, + $ DIAG, M, N, ALPHA, AA, LDA, + $ BB, LDB ) + ELSE IF( SNAME( 10: 11 ).EQ.'sm' )THEN + IF( TRACE ) + $ CALL ZPRCN3( NTRA, NC, SNAME, IORDER, + $ SIDE, UPLO, TRANSA, DIAG, M, N, ALPHA, + $ LDA, LDB) + IF( REWI ) + $ REWIND NTRA + CALL CZTRSM(IORDER, SIDE, UPLO, TRANSA, + $ DIAG, M, N, ALPHA, AA, LDA, + $ BB, LDB ) + END IF +* +* Check if error-exit was taken incorrectly. +* + IF( .NOT.OK )THEN + WRITE( NOUT, FMT = 9994 ) + FATAL = .TRUE. + GO TO 150 + END IF +* +* See what data changed inside subroutines. +* + ISAME( 1 ) = SIDES.EQ.SIDE + ISAME( 2 ) = UPLOS.EQ.UPLO + ISAME( 3 ) = TRANAS.EQ.TRANSA + ISAME( 4 ) = DIAGS.EQ.DIAG + ISAME( 5 ) = MS.EQ.M + ISAME( 6 ) = NS.EQ.N + ISAME( 7 ) = ALS.EQ.ALPHA + ISAME( 8 ) = LZE( AS, AA, LAA ) + ISAME( 9 ) = LDAS.EQ.LDA + IF( NULL )THEN + ISAME( 10 ) = LZE( BS, BB, LBB ) + ELSE + ISAME( 10 ) = LZERES( 'ge', ' ', M, N, BS, + $ BB, LDB ) + END IF + ISAME( 11 ) = LDBS.EQ.LDB +* +* If data was incorrectly changed, report and +* return. 
+* + SAME = .TRUE. + DO 50 I = 1, NARGS + SAME = SAME.AND.ISAME( I ) + IF( .NOT.ISAME( I ) ) + $ WRITE( NOUT, FMT = 9998 )I + 50 CONTINUE + IF( .NOT.SAME )THEN + FATAL = .TRUE. + GO TO 150 + END IF +* + IF( .NOT.NULL )THEN + IF( SNAME( 10: 11 ).EQ.'mm' )THEN +* +* Check the result. +* + IF( LEFT )THEN + CALL ZMMCH( TRANSA, 'N', M, N, M, + $ ALPHA, A, NMAX, B, NMAX, + $ ZERO, C, NMAX, CT, G, + $ BB, LDB, EPS, ERR, + $ FATAL, NOUT, .TRUE. ) + ELSE + CALL ZMMCH( 'N', TRANSA, M, N, N, + $ ALPHA, B, NMAX, A, NMAX, + $ ZERO, C, NMAX, CT, G, + $ BB, LDB, EPS, ERR, + $ FATAL, NOUT, .TRUE. ) + END IF + ELSE IF( SNAME( 10: 11 ).EQ.'sm' )THEN +* +* Compute approximation to original +* matrix. +* + DO 70 J = 1, N + DO 60 I = 1, M + C( I, J ) = BB( I + ( J - 1 )* + $ LDB ) + BB( I + ( J - 1 )*LDB ) = ALPHA* + $ B( I, J ) + 60 CONTINUE + 70 CONTINUE +* + IF( LEFT )THEN + CALL ZMMCH( TRANSA, 'N', M, N, M, + $ ONE, A, NMAX, C, NMAX, + $ ZERO, B, NMAX, CT, G, + $ BB, LDB, EPS, ERR, + $ FATAL, NOUT, .FALSE. ) + ELSE + CALL ZMMCH( 'N', TRANSA, M, N, N, + $ ONE, C, NMAX, A, NMAX, + $ ZERO, B, NMAX, CT, G, + $ BB, LDB, EPS, ERR, + $ FATAL, NOUT, .FALSE. ) + END IF + END IF + ERRMAX = MAX( ERRMAX, ERR ) +* If got really bad answer, report and +* return. + IF( FATAL ) + $ GO TO 150 + END IF +* + 80 CONTINUE +* + 90 CONTINUE +* + 100 CONTINUE +* + 110 CONTINUE +* + 120 CONTINUE +* + 130 CONTINUE +* + 140 CONTINUE +* +* Report result. 
+* + IF( ERRMAX.LT.THRESH )THEN + IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10000 )SNAME, NC + IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10001 )SNAME, NC + ELSE + IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10002 )SNAME, NC, ERRMAX + IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10003 )SNAME, NC, ERRMAX + END IF + GO TO 160 +* + 150 CONTINUE + WRITE( NOUT, FMT = 9996 )SNAME + CALL ZPRCN3( NTRA, NC, SNAME, IORDER, SIDE, UPLO, TRANSA, DIAG, + $ M, N, ALPHA, LDA, LDB) +* + 160 CONTINUE + RETURN +* +10003 FORMAT( ' ', A13,' COMPLETED THE ROW-MAJOR COMPUTATIONAL ', + $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', + $ 'RATIO ', F8.2, ' - SUSPECT *******' ) +10002 FORMAT( ' ', A13,' COMPLETED THE COLUMN-MAJOR COMPUTATIONAL ', + $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', + $ 'RATIO ', F8.2, ' - SUSPECT *******' ) +10001 FORMAT( ' ', A13,' PASSED THE ROW-MAJOR COMPUTATIONAL TESTS', + $ ' (', I6, ' CALL', 'S)' ) +10000 FORMAT( ' ', A13,' PASSED THE COLUMN-MAJOR COMPUTATIONAL TESTS', + $ ' (', I6, ' CALL', 'S)' ) + 9998 FORMAT(' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', + $ 'ANGED INCORRECTLY *******' ) + 9996 FORMAT(' ******* ', A13,' FAILED ON CALL NUMBER:' ) + 9995 FORMAT(1X, I6, ': ', A13,'(', 4( '''', A1, ''',' ), 2( I3, ',' ), + $ '(', F4.1, ',', F4.1, '), A,', I3, ', B,', I3, ') ', + $ ' .' ) + 9994 FORMAT(' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', + $ '******' ) +* +* End of ZCHK3. 
+* + END +* + SUBROUTINE ZPRCN3(NOUT, NC, SNAME, IORDER, SIDE, UPLO, TRANSA, + $ DIAG, M, N, ALPHA, LDA, LDB) + INTEGER NOUT, NC, IORDER, M, N, LDA, LDB + DOUBLE COMPLEX ALPHA + CHARACTER*1 SIDE, UPLO, TRANSA, DIAG + CHARACTER*13 SNAME + CHARACTER*14 CRC, CS, CU, CA, CD + + IF (SIDE.EQ.'L')THEN + CS = ' CblasLeft' + ELSE + CS = ' CblasRight' + END IF + IF (UPLO.EQ.'U')THEN + CU = ' CblasUpper' + ELSE + CU = ' CblasLower' + END IF + IF (TRANSA.EQ.'N')THEN + CA = ' CblasNoTrans' + ELSE IF (TRANSA.EQ.'T')THEN + CA = ' CblasTrans' + ELSE + CA = 'CblasConjTrans' + END IF + IF (DIAG.EQ.'N')THEN + CD = ' CblasNonUnit' + ELSE + CD = ' CblasUnit' + END IF + IF (IORDER.EQ.1)THEN + CRC = ' CblasRowMajor' + ELSE + CRC = ' CblasColMajor' + END IF + WRITE(NOUT, FMT = 9995)NC,SNAME,CRC, CS,CU + WRITE(NOUT, FMT = 9994)CA, CD, M, N, ALPHA, LDA, LDB + + 9995 FORMAT( 1X, I6, ': ', A13,'(', A15, ',', A14, ',', A14, ',') + 9994 FORMAT( 10X, 2( A15, ',') , 2( I3, ',' ), ' (', F4.1, ',', + $ F4.1, '), A,', I3, ', B,', I3, ').' ) + END +* + SUBROUTINE ZCHK4( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, + $ FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, NMAX, + $ A, AA, AS, B, BB, BS, C, CC, CS, CT, G, + $ IORDER ) +* +* Tests ZHERK and ZSYRK. +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Parameters .. + COMPLEX*16 ZERO + PARAMETER ( ZERO = ( 0.0D0, 0.0D0 ) ) + DOUBLE PRECISION RONE, RZERO + PARAMETER ( RONE = 1.0D0, RZERO = 0.0D0 ) +* .. Scalar Arguments .. + DOUBLE PRECISION EPS, THRESH + INTEGER NALF, NBET, NIDIM, NMAX, NOUT, NTRA, IORDER + LOGICAL FATAL, REWI, TRACE + CHARACTER*13 SNAME +* .. Array Arguments .. 
+ COMPLEX*16 A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), + $ AS( NMAX*NMAX ), B( NMAX, NMAX ), + $ BB( NMAX*NMAX ), BET( NBET ), BS( NMAX*NMAX ), + $ C( NMAX, NMAX ), CC( NMAX*NMAX ), + $ CS( NMAX*NMAX ), CT( NMAX ) + DOUBLE PRECISION G( NMAX ) + INTEGER IDIM( NIDIM ) +* .. Local Scalars .. + COMPLEX*16 ALPHA, ALS, BETA, BETS + DOUBLE PRECISION ERR, ERRMAX, RALPHA, RALS, RBETA, RBETS + INTEGER I, IA, IB, ICT, ICU, IK, IN, J, JC, JJ, K, KS, + $ LAA, LCC, LDA, LDAS, LDC, LDCS, LJ, MA, N, NA, + $ NARGS, NC, NS + LOGICAL CONJ, NULL, RESET, SAME, TRAN, UPPER + CHARACTER*1 TRANS, TRANSS, TRANST, UPLO, UPLOS + CHARACTER*2 ICHT, ICHU +* .. Local Arrays .. + LOGICAL ISAME( 13 ) +* .. External Functions .. + LOGICAL LZE, LZERES + EXTERNAL LZE, LZERES +* .. External Subroutines .. + EXTERNAL CZHERK, ZMAKE, ZMMCH, CZSYRK +* .. Intrinsic Functions .. + INTRINSIC DCMPLX, MAX, DBLE +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL LERR, OK +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK, LERR +* .. Data statements .. + DATA ICHT/'NC'/, ICHU/'UL'/ +* .. Executable Statements .. + CONJ = SNAME( 8: 9 ).EQ.'he' +* + NARGS = 10 + NC = 0 + RESET = .TRUE. + ERRMAX = RZERO +* + DO 100 IN = 1, NIDIM + N = IDIM( IN ) +* Set LDC to 1 more than minimum value if room. + LDC = N + IF( LDC.LT.NMAX ) + $ LDC = LDC + 1 +* Skip tests if not enough room. + IF( LDC.GT.NMAX ) + $ GO TO 100 + LCC = LDC*N +* + DO 90 IK = 1, NIDIM + K = IDIM( IK ) +* + DO 80 ICT = 1, 2 + TRANS = ICHT( ICT: ICT ) + TRAN = TRANS.EQ.'C' + IF( TRAN.AND..NOT.CONJ ) + $ TRANS = 'T' + IF( TRAN )THEN + MA = K + NA = N + ELSE + MA = N + NA = K + END IF +* Set LDA to 1 more than minimum value if room. + LDA = MA + IF( LDA.LT.NMAX ) + $ LDA = LDA + 1 +* Skip tests if not enough room. + IF( LDA.GT.NMAX ) + $ GO TO 80 + LAA = LDA*NA +* +* Generate the matrix A. 
+* + CALL ZMAKE( 'ge', ' ', ' ', MA, NA, A, NMAX, AA, LDA, + $ RESET, ZERO ) +* + DO 70 ICU = 1, 2 + UPLO = ICHU( ICU: ICU ) + UPPER = UPLO.EQ.'U' +* + DO 60 IA = 1, NALF + ALPHA = ALF( IA ) + IF( CONJ )THEN + RALPHA = DBLE( ALPHA ) + ALPHA = DCMPLX( RALPHA, RZERO ) + END IF +* + DO 50 IB = 1, NBET + BETA = BET( IB ) + IF( CONJ )THEN + RBETA = DBLE( BETA ) + BETA = DCMPLX( RBETA, RZERO ) + END IF + NULL = N.LE.0 + IF( CONJ ) + $ NULL = NULL.OR.( ( K.LE.0.OR.RALPHA.EQ. + $ RZERO ).AND.RBETA.EQ.RONE ) +* +* Generate the matrix C. +* + CALL ZMAKE( SNAME( 8: 9 ), UPLO, ' ', N, N, C, + $ NMAX, CC, LDC, RESET, ZERO ) +* + NC = NC + 1 +* +* Save every datum before calling the subroutine. +* + UPLOS = UPLO + TRANSS = TRANS + NS = N + KS = K + IF( CONJ )THEN + RALS = RALPHA + ELSE + ALS = ALPHA + END IF + DO 10 I = 1, LAA + AS( I ) = AA( I ) + 10 CONTINUE + LDAS = LDA + IF( CONJ )THEN + RBETS = RBETA + ELSE + BETS = BETA + END IF + DO 20 I = 1, LCC + CS( I ) = CC( I ) + 20 CONTINUE + LDCS = LDC +* +* Call the subroutine. +* + IF( CONJ )THEN + IF( TRACE ) + $ CALL ZPRCN6( NTRA, NC, SNAME, IORDER, + $ UPLO, TRANS, N, K, RALPHA, LDA, RBETA, + $ LDC) + IF( REWI ) + $ REWIND NTRA + CALL CZHERK( IORDER, UPLO, TRANS, N, K, + $ RALPHA, AA, LDA, RBETA, CC, + $ LDC ) + ELSE + IF( TRACE ) + $ CALL ZPRCN4( NTRA, NC, SNAME, IORDER, + $ UPLO, TRANS, N, K, ALPHA, LDA, BETA, LDC) + IF( REWI ) + $ REWIND NTRA + CALL CZSYRK( IORDER, UPLO, TRANS, N, K, + $ ALPHA, AA, LDA, BETA, CC, LDC ) + END IF +* +* Check if error-exit was taken incorrectly. +* + IF( .NOT.OK )THEN + WRITE( NOUT, FMT = 9992 ) + FATAL = .TRUE. + GO TO 120 + END IF +* +* See what data changed inside subroutines. 
+* + ISAME( 1 ) = UPLOS.EQ.UPLO + ISAME( 2 ) = TRANSS.EQ.TRANS + ISAME( 3 ) = NS.EQ.N + ISAME( 4 ) = KS.EQ.K + IF( CONJ )THEN + ISAME( 5 ) = RALS.EQ.RALPHA + ELSE + ISAME( 5 ) = ALS.EQ.ALPHA + END IF + ISAME( 6 ) = LZE( AS, AA, LAA ) + ISAME( 7 ) = LDAS.EQ.LDA + IF( CONJ )THEN + ISAME( 8 ) = RBETS.EQ.RBETA + ELSE + ISAME( 8 ) = BETS.EQ.BETA + END IF + IF( NULL )THEN + ISAME( 9 ) = LZE( CS, CC, LCC ) + ELSE + ISAME( 9 ) = LZERES( SNAME( 8: 9 ), UPLO, N, + $ N, CS, CC, LDC ) + END IF + ISAME( 10 ) = LDCS.EQ.LDC +* +* If data was incorrectly changed, report and +* return. +* + SAME = .TRUE. + DO 30 I = 1, NARGS + SAME = SAME.AND.ISAME( I ) + IF( .NOT.ISAME( I ) ) + $ WRITE( NOUT, FMT = 9998 )I + 30 CONTINUE + IF( .NOT.SAME )THEN + FATAL = .TRUE. + GO TO 120 + END IF +* + IF( .NOT.NULL )THEN +* +* Check the result column by column. +* + IF( CONJ )THEN + TRANST = 'C' + ELSE + TRANST = 'T' + END IF + JC = 1 + DO 40 J = 1, N + IF( UPPER )THEN + JJ = 1 + LJ = J + ELSE + JJ = J + LJ = N - J + 1 + END IF + IF( TRAN )THEN + CALL ZMMCH( TRANST, 'N', LJ, 1, K, + $ ALPHA, A( 1, JJ ), NMAX, + $ A( 1, J ), NMAX, BETA, + $ C( JJ, J ), NMAX, CT, G, + $ CC( JC ), LDC, EPS, ERR, + $ FATAL, NOUT, .TRUE. ) + ELSE + CALL ZMMCH( 'N', TRANST, LJ, 1, K, + $ ALPHA, A( JJ, 1 ), NMAX, + $ A( J, 1 ), NMAX, BETA, + $ C( JJ, J ), NMAX, CT, G, + $ CC( JC ), LDC, EPS, ERR, + $ FATAL, NOUT, .TRUE. ) + END IF + IF( UPPER )THEN + JC = JC + LDC + ELSE + JC = JC + LDC + 1 + END IF + ERRMAX = MAX( ERRMAX, ERR ) +* If got really bad answer, report and +* return. + IF( FATAL ) + $ GO TO 110 + 40 CONTINUE + END IF +* + 50 CONTINUE +* + 60 CONTINUE +* + 70 CONTINUE +* + 80 CONTINUE +* + 90 CONTINUE +* + 100 CONTINUE +* +* Report result. 
+*
+      IF( ERRMAX.LT.THRESH )THEN
+         IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10000 )SNAME, NC
+         IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10001 )SNAME, NC
+      ELSE
+         IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10002 )SNAME, NC, ERRMAX
+         IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10003 )SNAME, NC, ERRMAX
+      END IF
+      GO TO 130
+*
+  110 CONTINUE
+      IF( N.GT.1 )
+     $   WRITE( NOUT, FMT = 9995 )J
+*
+  120 CONTINUE
+      WRITE( NOUT, FMT = 9996 )SNAME
+      IF( CONJ )THEN
+         CALL ZPRCN6( NOUT, NC, SNAME, IORDER, UPLO, TRANS, N, K,
+     $      RALPHA, LDA, rBETA, LDC)
+      ELSE
+         CALL ZPRCN4( NOUT, NC, SNAME, IORDER, UPLO, TRANS, N, K,
+     $      ALPHA, LDA, BETA, LDC)
+      END IF
+*
+  130 CONTINUE
+      RETURN
+*
+10003 FORMAT( ' ', A13,' COMPLETED THE ROW-MAJOR    COMPUTATIONAL ',
+     $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ',
+     $ 'RATIO ', F8.2, ' - SUSPECT *******' )
+10002 FORMAT( ' ', A13,' COMPLETED THE COLUMN-MAJOR COMPUTATIONAL ',
+     $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ',
+     $ 'RATIO ', F8.2, ' - SUSPECT *******' )
+10001 FORMAT( ' ', A13,' PASSED THE ROW-MAJOR    COMPUTATIONAL TESTS',
+     $ ' (', I6, ' CALL', 'S)' )
+10000 FORMAT( ' ', A13,' PASSED THE COLUMN-MAJOR COMPUTATIONAL TESTS',
+     $ ' (', I6, ' CALL', 'S)' )
+ 9998 FORMAT(' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH',
+     $ 'ANGED INCORRECTLY *******' )
+ 9996 FORMAT( ' ******* ', A13,' FAILED ON CALL NUMBER:' )
+ 9995 FORMAT( ' THESE ARE THE RESULTS FOR COLUMN ', I3 )
+ 9994 FORMAT(1X, I6, ': ', A13,'(', 2( '''', A1, ''',' ), 2( I3, ',' ),
+     $ F4.1, ', A,', I3, ',', F4.1, ', C,', I3, ') ',
+     $ '          .' )
+ 9993 FORMAT(1X, I6, ': ', A13,'(', 2( '''', A1, ''',' ), 2( I3, ',' ),
+     $ '(', F4.1, ',', F4.1, ') , A,', I3, ',(', F4.1, ',', F4.1,
+     $ '), C,', I3, ')    .' )
+ 9992 FORMAT(' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *',
+     $ '******' )
+*
+* End of ZCHK4.
+* + END +* + SUBROUTINE ZPRCN4(NOUT, NC, SNAME, IORDER, UPLO, TRANSA, + $ N, K, ALPHA, LDA, BETA, LDC) + INTEGER NOUT, NC, IORDER, N, K, LDA, LDC + DOUBLE COMPLEX ALPHA, BETA + CHARACTER*1 UPLO, TRANSA + CHARACTER*13 SNAME + CHARACTER*14 CRC, CU, CA + + IF (UPLO.EQ.'U')THEN + CU = ' CblasUpper' + ELSE + CU = ' CblasLower' + END IF + IF (TRANSA.EQ.'N')THEN + CA = ' CblasNoTrans' + ELSE IF (TRANSA.EQ.'T')THEN + CA = ' CblasTrans' + ELSE + CA = 'CblasConjTrans' + END IF + IF (IORDER.EQ.1)THEN + CRC = ' CblasRowMajor' + ELSE + CRC = ' CblasColMajor' + END IF + WRITE(NOUT, FMT = 9995)NC, SNAME, CRC, CU, CA + WRITE(NOUT, FMT = 9994)N, K, ALPHA, LDA, BETA, LDC + + 9995 FORMAT( 1X, I6, ': ', A13,'(', 3( A15, ',') ) + 9994 FORMAT( 10X, 2( I3, ',' ), ' (', F4.1, ',', F4.1 ,'), A,', + $ I3, ', (', F4.1,',', F4.1, '), C,', I3, ').' ) + END +* +* + SUBROUTINE ZPRCN6(NOUT, NC, SNAME, IORDER, UPLO, TRANSA, + $ N, K, ALPHA, LDA, BETA, LDC) + INTEGER NOUT, NC, IORDER, N, K, LDA, LDC + DOUBLE PRECISION ALPHA, BETA + CHARACTER*1 UPLO, TRANSA + CHARACTER*13 SNAME + CHARACTER*14 CRC, CU, CA + + IF (UPLO.EQ.'U')THEN + CU = ' CblasUpper' + ELSE + CU = ' CblasLower' + END IF + IF (TRANSA.EQ.'N')THEN + CA = ' CblasNoTrans' + ELSE IF (TRANSA.EQ.'T')THEN + CA = ' CblasTrans' + ELSE + CA = 'CblasConjTrans' + END IF + IF (IORDER.EQ.1)THEN + CRC = ' CblasRowMajor' + ELSE + CRC = ' CblasColMajor' + END IF + WRITE(NOUT, FMT = 9995)NC, SNAME, CRC, CU, CA + WRITE(NOUT, FMT = 9994)N, K, ALPHA, LDA, BETA, LDC + + 9995 FORMAT( 1X, I6, ': ', A13,'(', 3( A15, ',') ) + 9994 FORMAT( 10X, 2( I3, ',' ), + $ F4.1, ', A,', I3, ',', F4.1, ', C,', I3, ').' ) + END +* + SUBROUTINE ZCHK5( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, + $ FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, NMAX, + $ AB, AA, AS, BB, BS, C, CC, CS, CT, G, W, + $ IORDER ) +* +* Tests ZHER2K and ZSYR2K. +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. 
+* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Parameters .. + COMPLEX*16 ZERO, ONE + PARAMETER ( ZERO = ( 0.0D0, 0.0D0 ), ONE = ( 1.0D0, 0.0D0 ) ) + DOUBLE PRECISION RONE, RZERO + PARAMETER ( RONE = 1.0D0, RZERO = 0.0D0 ) +* .. Scalar Arguments .. + DOUBLE PRECISION EPS, THRESH + INTEGER NALF, NBET, NIDIM, NMAX, NOUT, NTRA, IORDER + LOGICAL FATAL, REWI, TRACE + CHARACTER*13 SNAME +* .. Array Arguments .. + COMPLEX*16 AA( NMAX*NMAX ), AB( 2*NMAX*NMAX ), + $ ALF( NALF ), AS( NMAX*NMAX ), BB( NMAX*NMAX ), + $ BET( NBET ), BS( NMAX*NMAX ), C( NMAX, NMAX ), + $ CC( NMAX*NMAX ), CS( NMAX*NMAX ), CT( NMAX ), + $ W( 2*NMAX ) + DOUBLE PRECISION G( NMAX ) + INTEGER IDIM( NIDIM ) +* .. Local Scalars .. + COMPLEX*16 ALPHA, ALS, BETA, BETS + DOUBLE PRECISION ERR, ERRMAX, RBETA, RBETS + INTEGER I, IA, IB, ICT, ICU, IK, IN, J, JC, JJ, JJAB, + $ K, KS, LAA, LBB, LCC, LDA, LDAS, LDB, LDBS, + $ LDC, LDCS, LJ, MA, N, NA, NARGS, NC, NS + LOGICAL CONJ, NULL, RESET, SAME, TRAN, UPPER + CHARACTER*1 TRANS, TRANSS, TRANST, UPLO, UPLOS + CHARACTER*2 ICHT, ICHU +* .. Local Arrays .. + LOGICAL ISAME( 13 ) +* .. External Functions .. + LOGICAL LZE, LZERES + EXTERNAL LZE, LZERES +* .. External Subroutines .. + EXTERNAL CZHER2K, ZMAKE, ZMMCH, CZSYR2K +* .. Intrinsic Functions .. + INTRINSIC DCMPLX, DCONJG, MAX, DBLE +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL LERR, OK +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK, LERR +* .. Data statements .. + DATA ICHT/'NC'/, ICHU/'UL'/ +* .. Executable Statements .. + CONJ = SNAME( 8: 9 ).EQ.'he' +* + NARGS = 12 + NC = 0 + RESET = .TRUE. + ERRMAX = RZERO +* + DO 130 IN = 1, NIDIM + N = IDIM( IN ) +* Set LDC to 1 more than minimum value if room. + LDC = N + IF( LDC.LT.NMAX ) + $ LDC = LDC + 1 +* Skip tests if not enough room. 
+ IF( LDC.GT.NMAX ) + $ GO TO 130 + LCC = LDC*N +* + DO 120 IK = 1, NIDIM + K = IDIM( IK ) +* + DO 110 ICT = 1, 2 + TRANS = ICHT( ICT: ICT ) + TRAN = TRANS.EQ.'C' + IF( TRAN.AND..NOT.CONJ ) + $ TRANS = 'T' + IF( TRAN )THEN + MA = K + NA = N + ELSE + MA = N + NA = K + END IF +* Set LDA to 1 more than minimum value if room. + LDA = MA + IF( LDA.LT.NMAX ) + $ LDA = LDA + 1 +* Skip tests if not enough room. + IF( LDA.GT.NMAX ) + $ GO TO 110 + LAA = LDA*NA +* +* Generate the matrix A. +* + IF( TRAN )THEN + CALL ZMAKE( 'ge', ' ', ' ', MA, NA, AB, 2*NMAX, AA, + $ LDA, RESET, ZERO ) + ELSE + CALL ZMAKE( 'ge', ' ', ' ', MA, NA, AB, NMAX, AA, LDA, + $ RESET, ZERO ) + END IF +* +* Generate the matrix B. +* + LDB = LDA + LBB = LAA + IF( TRAN )THEN + CALL ZMAKE( 'ge', ' ', ' ', MA, NA, AB( K + 1 ), + $ 2*NMAX, BB, LDB, RESET, ZERO ) + ELSE + CALL ZMAKE( 'ge', ' ', ' ', MA, NA, AB( K*NMAX + 1 ), + $ NMAX, BB, LDB, RESET, ZERO ) + END IF +* + DO 100 ICU = 1, 2 + UPLO = ICHU( ICU: ICU ) + UPPER = UPLO.EQ.'U' +* + DO 90 IA = 1, NALF + ALPHA = ALF( IA ) +* + DO 80 IB = 1, NBET + BETA = BET( IB ) + IF( CONJ )THEN + RBETA = DBLE( BETA ) + BETA = DCMPLX( RBETA, RZERO ) + END IF + NULL = N.LE.0 + IF( CONJ ) + $ NULL = NULL.OR.( ( K.LE.0.OR.ALPHA.EQ. + $ ZERO ).AND.RBETA.EQ.RONE ) +* +* Generate the matrix C. +* + CALL ZMAKE( SNAME( 8: 9 ), UPLO, ' ', N, N, C, + $ NMAX, CC, LDC, RESET, ZERO ) +* + NC = NC + 1 +* +* Save every datum before calling the subroutine. +* + UPLOS = UPLO + TRANSS = TRANS + NS = N + KS = K + ALS = ALPHA + DO 10 I = 1, LAA + AS( I ) = AA( I ) + 10 CONTINUE + LDAS = LDA + DO 20 I = 1, LBB + BS( I ) = BB( I ) + 20 CONTINUE + LDBS = LDB + IF( CONJ )THEN + RBETS = RBETA + ELSE + BETS = BETA + END IF + DO 30 I = 1, LCC + CS( I ) = CC( I ) + 30 CONTINUE + LDCS = LDC +* +* Call the subroutine. 
+* + IF( CONJ )THEN + IF( TRACE ) + $ CALL ZPRCN7( NTRA, NC, SNAME, IORDER, + $ UPLO, TRANS, N, K, ALPHA, LDA, LDB, + $ RBETA, LDC) + IF( REWI ) + $ REWIND NTRA + CALL CZHER2K( IORDER, UPLO, TRANS, N, K, + $ ALPHA, AA, LDA, BB, LDB, RBETA, + $ CC, LDC ) + ELSE + IF( TRACE ) + $ CALL ZPRCN5( NTRA, NC, SNAME, IORDER, + $ UPLO, TRANS, N, K, ALPHA, LDA, LDB, + $ BETA, LDC) + IF( REWI ) + $ REWIND NTRA + CALL CZSYR2K( IORDER, UPLO, TRANS, N, K, + $ ALPHA, AA, LDA, BB, LDB, BETA, + $ CC, LDC ) + END IF +* +* Check if error-exit was taken incorrectly. +* + IF( .NOT.OK )THEN + WRITE( NOUT, FMT = 9992 ) + FATAL = .TRUE. + GO TO 150 + END IF +* +* See what data changed inside subroutines. +* + ISAME( 1 ) = UPLOS.EQ.UPLO + ISAME( 2 ) = TRANSS.EQ.TRANS + ISAME( 3 ) = NS.EQ.N + ISAME( 4 ) = KS.EQ.K + ISAME( 5 ) = ALS.EQ.ALPHA + ISAME( 6 ) = LZE( AS, AA, LAA ) + ISAME( 7 ) = LDAS.EQ.LDA + ISAME( 8 ) = LZE( BS, BB, LBB ) + ISAME( 9 ) = LDBS.EQ.LDB + IF( CONJ )THEN + ISAME( 10 ) = RBETS.EQ.RBETA + ELSE + ISAME( 10 ) = BETS.EQ.BETA + END IF + IF( NULL )THEN + ISAME( 11 ) = LZE( CS, CC, LCC ) + ELSE + ISAME( 11 ) = LZERES( 'he', UPLO, N, N, CS, + $ CC, LDC ) + END IF + ISAME( 12 ) = LDCS.EQ.LDC +* +* If data was incorrectly changed, report and +* return. +* + SAME = .TRUE. + DO 40 I = 1, NARGS + SAME = SAME.AND.ISAME( I ) + IF( .NOT.ISAME( I ) ) + $ WRITE( NOUT, FMT = 9998 )I + 40 CONTINUE + IF( .NOT.SAME )THEN + FATAL = .TRUE. + GO TO 150 + END IF +* + IF( .NOT.NULL )THEN +* +* Check the result column by column. 
+* + IF( CONJ )THEN + TRANST = 'C' + ELSE + TRANST = 'T' + END IF + JJAB = 1 + JC = 1 + DO 70 J = 1, N + IF( UPPER )THEN + JJ = 1 + LJ = J + ELSE + JJ = J + LJ = N - J + 1 + END IF + IF( TRAN )THEN + DO 50 I = 1, K + W( I ) = ALPHA*AB( ( J - 1 )*2* + $ NMAX + K + I ) + IF( CONJ )THEN + W( K + I ) = DCONJG( ALPHA )* + $ AB( ( J - 1 )*2* + $ NMAX + I ) + ELSE + W( K + I ) = ALPHA* + $ AB( ( J - 1 )*2* + $ NMAX + I ) + END IF + 50 CONTINUE + CALL ZMMCH( TRANST, 'N', LJ, 1, 2*K, + $ ONE, AB( JJAB ), 2*NMAX, W, + $ 2*NMAX, BETA, C( JJ, J ), + $ NMAX, CT, G, CC( JC ), LDC, + $ EPS, ERR, FATAL, NOUT, + $ .TRUE. ) + ELSE + DO 60 I = 1, K + IF( CONJ )THEN + W( I ) = ALPHA*DCONJG( AB( ( K + + $ I - 1 )*NMAX + J ) ) + W( K + I ) = DCONJG( ALPHA* + $ AB( ( I - 1 )*NMAX + + $ J ) ) + ELSE + W( I ) = ALPHA*AB( ( K + I - 1 )* + $ NMAX + J ) + W( K + I ) = ALPHA* + $ AB( ( I - 1 )*NMAX + + $ J ) + END IF + 60 CONTINUE + CALL ZMMCH( 'N', 'N', LJ, 1, 2*K, ONE, + $ AB( JJ ), NMAX, W, 2*NMAX, + $ BETA, C( JJ, J ), NMAX, CT, + $ G, CC( JC ), LDC, EPS, ERR, + $ FATAL, NOUT, .TRUE. ) + END IF + IF( UPPER )THEN + JC = JC + LDC + ELSE + JC = JC + LDC + 1 + IF( TRAN ) + $ JJAB = JJAB + 2*NMAX + END IF + ERRMAX = MAX( ERRMAX, ERR ) +* If got really bad answer, report and +* return. + IF( FATAL ) + $ GO TO 140 + 70 CONTINUE + END IF +* + 80 CONTINUE +* + 90 CONTINUE +* + 100 CONTINUE +* + 110 CONTINUE +* + 120 CONTINUE +* + 130 CONTINUE +* +* Report result. 
+* + IF( ERRMAX.LT.THRESH )THEN + IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10000 )SNAME, NC + IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10001 )SNAME, NC + ELSE + IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10002 )SNAME, NC, ERRMAX + IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10003 )SNAME, NC, ERRMAX + END IF + GO TO 160 +* + 140 CONTINUE + IF( N.GT.1 ) + $ WRITE( NOUT, FMT = 9995 )J +* + 150 CONTINUE + WRITE( NOUT, FMT = 9996 )SNAME + IF( CONJ )THEN + CALL ZPRCN7( NOUT, NC, SNAME, IORDER, UPLO, TRANS, N, K, + $ ALPHA, LDA, LDB, RBETA, LDC) + ELSE + CALL ZPRCN5( NOUT, NC, SNAME, IORDER, UPLO, TRANS, N, K, + $ ALPHA, LDA, LDB, BETA, LDC) + END IF +* + 160 CONTINUE + RETURN +* +10003 FORMAT( ' ', A13,' COMPLETED THE ROW-MAJOR COMPUTATIONAL ', + $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', + $ 'RATIO ', F8.2, ' - SUSPECT *******' ) +10002 FORMAT( ' ', A13,' COMPLETED THE COLUMN-MAJOR COMPUTATIONAL ', + $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', + $ 'RATIO ', F8.2, ' - SUSPECT *******' ) +10001 FORMAT( ' ', A13,' PASSED THE ROW-MAJOR COMPUTATIONAL TESTS', + $ ' (', I6, ' CALL', 'S)' ) +10000 FORMAT( ' ', A13,' PASSED THE COLUMN-MAJOR COMPUTATIONAL TESTS', + $ ' (', I6, ' CALL', 'S)' ) + 9998 FORMAT(' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', + $ 'ANGED INCORRECTLY *******' ) + 9996 FORMAT( ' ******* ', A13,' FAILED ON CALL NUMBER:' ) + 9995 FORMAT( ' THESE ARE THE RESULTS FOR COLUMN ', I3 ) + 9994 FORMAT(1X, I6, ': ', A13,'(', 2( '''', A1, ''',' ), 2( I3, ',' ), + $ '(', F4.1, ',', F4.1, '), A,', I3, ', B,', I3, ',', F4.1, + $ ', C,', I3, ') .' ) + 9993 FORMAT(1X, I6, ': ', A13,'(', 2( '''', A1, ''',' ), 2( I3, ',' ), + $ '(', F4.1, ',', F4.1, '), A,', I3, ', B,', I3, ',(', F4.1, + $ ',', F4.1, '), C,', I3, ') .' ) + 9992 FORMAT(' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', + $ '******' ) +* +* End of ZCHK5. 
+* + END +* + SUBROUTINE ZPRCN5(NOUT, NC, SNAME, IORDER, UPLO, TRANSA, + $ N, K, ALPHA, LDA, LDB, BETA, LDC) + INTEGER NOUT, NC, IORDER, N, K, LDA, LDB, LDC + DOUBLE COMPLEX ALPHA, BETA + CHARACTER*1 UPLO, TRANSA + CHARACTER*13 SNAME + CHARACTER*14 CRC, CU, CA + + IF (UPLO.EQ.'U')THEN + CU = ' CblasUpper' + ELSE + CU = ' CblasLower' + END IF + IF (TRANSA.EQ.'N')THEN + CA = ' CblasNoTrans' + ELSE IF (TRANSA.EQ.'T')THEN + CA = ' CblasTrans' + ELSE + CA = 'CblasConjTrans' + END IF + IF (IORDER.EQ.1)THEN + CRC = ' CblasRowMajor' + ELSE + CRC = ' CblasColMajor' + END IF + WRITE(NOUT, FMT = 9995)NC, SNAME, CRC, CU, CA + WRITE(NOUT, FMT = 9994)N, K, ALPHA, LDA, LDB, BETA, LDC + + 9995 FORMAT( 1X, I6, ': ', A13,'(', 3( A15, ',') ) + 9994 FORMAT( 10X, 2( I3, ',' ), ' (', F4.1, ',', F4.1, '), A,', + $ I3, ', B', I3, ', (', F4.1, ',', F4.1, '), C,', I3, ').' ) + END +* +* + SUBROUTINE ZPRCN7(NOUT, NC, SNAME, IORDER, UPLO, TRANSA, + $ N, K, ALPHA, LDA, LDB, BETA, LDC) + INTEGER NOUT, NC, IORDER, N, K, LDA, LDB, LDC + DOUBLE COMPLEX ALPHA + DOUBLE PRECISION BETA + CHARACTER*1 UPLO, TRANSA + CHARACTER*13 SNAME + CHARACTER*14 CRC, CU, CA + + IF (UPLO.EQ.'U')THEN + CU = ' CblasUpper' + ELSE + CU = ' CblasLower' + END IF + IF (TRANSA.EQ.'N')THEN + CA = ' CblasNoTrans' + ELSE IF (TRANSA.EQ.'T')THEN + CA = ' CblasTrans' + ELSE + CA = 'CblasConjTrans' + END IF + IF (IORDER.EQ.1)THEN + CRC = ' CblasRowMajor' + ELSE + CRC = ' CblasColMajor' + END IF + WRITE(NOUT, FMT = 9995)NC, SNAME, CRC, CU, CA + WRITE(NOUT, FMT = 9994)N, K, ALPHA, LDA, LDB, BETA, LDC + + 9995 FORMAT( 1X, I6, ': ', A13,'(', 3( A15, ',') ) + 9994 FORMAT( 10X, 2( I3, ',' ), ' (', F4.1, ',', F4.1, '), A,', + $ I3, ', B', I3, ',', F4.1, ', C,', I3, ').' ) + END +* + SUBROUTINE ZMAKE( TYPE, UPLO, DIAG, M, N, A, NMAX, AA, LDA, RESET, + $ TRANSL ) +* +* Generates values for an M by N matrix A. +* Stores the values in the array AA in the data structure required +* by the routine, with unwanted elements set to rogue value. 
+* +* TYPE is 'ge', 'he', 'sy' or 'tr'. +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Parameters .. + COMPLEX*16 ZERO, ONE + PARAMETER ( ZERO = ( 0.0D0, 0.0D0 ), + $ ONE = ( 1.0D0, 0.0D0 ) ) + COMPLEX*16 ROGUE + PARAMETER ( ROGUE = ( -1.0D10, 1.0D10 ) ) + DOUBLE PRECISION RZERO + PARAMETER ( RZERO = 0.0D0 ) + DOUBLE PRECISION RROGUE + PARAMETER ( RROGUE = -1.0D10 ) +* .. Scalar Arguments .. + COMPLEX*16 TRANSL + INTEGER LDA, M, N, NMAX + LOGICAL RESET + CHARACTER*1 DIAG, UPLO + CHARACTER*2 TYPE +* .. Array Arguments .. + COMPLEX*16 A( NMAX, * ), AA( * ) +* .. Local Scalars .. + INTEGER I, IBEG, IEND, J, JJ + LOGICAL GEN, HER, LOWER, SYM, TRI, UNIT, UPPER +* .. External Functions .. + COMPLEX*16 ZBEG + EXTERNAL ZBEG +* .. Intrinsic Functions .. + INTRINSIC DCMPLX, DCONJG, DBLE +* .. Executable Statements .. + GEN = TYPE.EQ.'ge' + HER = TYPE.EQ.'he' + SYM = TYPE.EQ.'sy' + TRI = TYPE.EQ.'tr' + UPPER = ( HER.OR.SYM.OR.TRI ).AND.UPLO.EQ.'U' + LOWER = ( HER.OR.SYM.OR.TRI ).AND.UPLO.EQ.'L' + UNIT = TRI.AND.DIAG.EQ.'U' +* +* Generate data in array A. +* + DO 20 J = 1, N + DO 10 I = 1, M + IF( GEN.OR.( UPPER.AND.I.LE.J ).OR.( LOWER.AND.I.GE.J ) ) + $ THEN + A( I, J ) = ZBEG( RESET ) + TRANSL + IF( I.NE.J )THEN +* Set some elements to zero + IF( N.GT.3.AND.J.EQ.N/2 ) + $ A( I, J ) = ZERO + IF( HER )THEN + A( J, I ) = DCONJG( A( I, J ) ) + ELSE IF( SYM )THEN + A( J, I ) = A( I, J ) + ELSE IF( TRI )THEN + A( J, I ) = ZERO + END IF + END IF + END IF + 10 CONTINUE + IF( HER ) + $ A( J, J ) = DCMPLX( DBLE( A( J, J ) ), RZERO ) + IF( TRI ) + $ A( J, J ) = A( J, J ) + ONE + IF( UNIT ) + $ A( J, J ) = ONE + 20 CONTINUE +* +* Store elements in array AS in data structure required by routine. 
+* + IF( TYPE.EQ.'ge' )THEN + DO 50 J = 1, N + DO 30 I = 1, M + AA( I + ( J - 1 )*LDA ) = A( I, J ) + 30 CONTINUE + DO 40 I = M + 1, LDA + AA( I + ( J - 1 )*LDA ) = ROGUE + 40 CONTINUE + 50 CONTINUE + ELSE IF( TYPE.EQ.'he'.OR.TYPE.EQ.'sy'.OR.TYPE.EQ.'tr' )THEN + DO 90 J = 1, N + IF( UPPER )THEN + IBEG = 1 + IF( UNIT )THEN + IEND = J - 1 + ELSE + IEND = J + END IF + ELSE + IF( UNIT )THEN + IBEG = J + 1 + ELSE + IBEG = J + END IF + IEND = N + END IF + DO 60 I = 1, IBEG - 1 + AA( I + ( J - 1 )*LDA ) = ROGUE + 60 CONTINUE + DO 70 I = IBEG, IEND + AA( I + ( J - 1 )*LDA ) = A( I, J ) + 70 CONTINUE + DO 80 I = IEND + 1, LDA + AA( I + ( J - 1 )*LDA ) = ROGUE + 80 CONTINUE + IF( HER )THEN + JJ = J + ( J - 1 )*LDA + AA( JJ ) = DCMPLX( DBLE( AA( JJ ) ), RROGUE ) + END IF + 90 CONTINUE + END IF + RETURN +* +* End of ZMAKE. +* + END + SUBROUTINE ZMMCH( TRANSA, TRANSB, M, N, KK, ALPHA, A, LDA, B, LDB, + $ BETA, C, LDC, CT, G, CC, LDCC, EPS, ERR, FATAL, + $ NOUT, MV ) +* +* Checks the results of the computational tests. +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Parameters .. + COMPLEX*16 ZERO + PARAMETER ( ZERO = ( 0.0D0, 0.0D0 ) ) + DOUBLE PRECISION RZERO, RONE + PARAMETER ( RZERO = 0.0D0, RONE = 1.0D0 ) +* .. Scalar Arguments .. + COMPLEX*16 ALPHA, BETA + DOUBLE PRECISION EPS, ERR + INTEGER KK, LDA, LDB, LDC, LDCC, M, N, NOUT + LOGICAL FATAL, MV + CHARACTER*1 TRANSA, TRANSB +* .. Array Arguments .. + COMPLEX*16 A( LDA, * ), B( LDB, * ), C( LDC, * ), + $ CC( LDCC, * ), CT( * ) + DOUBLE PRECISION G( * ) +* .. Local Scalars .. + COMPLEX*16 CL + DOUBLE PRECISION ERRI + INTEGER I, J, K + LOGICAL CTRANA, CTRANB, TRANA, TRANB +* .. Intrinsic Functions .. + INTRINSIC ABS, DIMAG, DCONJG, MAX, DBLE, SQRT +* .. Statement Functions .. 
+ DOUBLE PRECISION ABS1 +* .. Statement Function definitions .. + ABS1( CL ) = ABS( DBLE( CL ) ) + ABS( DIMAG( CL ) ) +* .. Executable Statements .. + TRANA = TRANSA.EQ.'T'.OR.TRANSA.EQ.'C' + TRANB = TRANSB.EQ.'T'.OR.TRANSB.EQ.'C' + CTRANA = TRANSA.EQ.'C' + CTRANB = TRANSB.EQ.'C' +* +* Compute expected result, one column at a time, in CT using data +* in A, B and C. +* Compute gauges in G. +* + DO 220 J = 1, N +* + DO 10 I = 1, M + CT( I ) = ZERO + G( I ) = RZERO + 10 CONTINUE + IF( .NOT.TRANA.AND..NOT.TRANB )THEN + DO 30 K = 1, KK + DO 20 I = 1, M + CT( I ) = CT( I ) + A( I, K )*B( K, J ) + G( I ) = G( I ) + ABS1( A( I, K ) )*ABS1( B( K, J ) ) + 20 CONTINUE + 30 CONTINUE + ELSE IF( TRANA.AND..NOT.TRANB )THEN + IF( CTRANA )THEN + DO 50 K = 1, KK + DO 40 I = 1, M + CT( I ) = CT( I ) + DCONJG( A( K, I ) )*B( K, J ) + G( I ) = G( I ) + ABS1( A( K, I ) )* + $ ABS1( B( K, J ) ) + 40 CONTINUE + 50 CONTINUE + ELSE + DO 70 K = 1, KK + DO 60 I = 1, M + CT( I ) = CT( I ) + A( K, I )*B( K, J ) + G( I ) = G( I ) + ABS1( A( K, I ) )* + $ ABS1( B( K, J ) ) + 60 CONTINUE + 70 CONTINUE + END IF + ELSE IF( .NOT.TRANA.AND.TRANB )THEN + IF( CTRANB )THEN + DO 90 K = 1, KK + DO 80 I = 1, M + CT( I ) = CT( I ) + A( I, K )*DCONJG( B( J, K ) ) + G( I ) = G( I ) + ABS1( A( I, K ) )* + $ ABS1( B( J, K ) ) + 80 CONTINUE + 90 CONTINUE + ELSE + DO 110 K = 1, KK + DO 100 I = 1, M + CT( I ) = CT( I ) + A( I, K )*B( J, K ) + G( I ) = G( I ) + ABS1( A( I, K ) )* + $ ABS1( B( J, K ) ) + 100 CONTINUE + 110 CONTINUE + END IF + ELSE IF( TRANA.AND.TRANB )THEN + IF( CTRANA )THEN + IF( CTRANB )THEN + DO 130 K = 1, KK + DO 120 I = 1, M + CT( I ) = CT( I ) + DCONJG( A( K, I ) )* + $ DCONJG( B( J, K ) ) + G( I ) = G( I ) + ABS1( A( K, I ) )* + $ ABS1( B( J, K ) ) + 120 CONTINUE + 130 CONTINUE + ELSE + DO 150 K = 1, KK + DO 140 I = 1, M + CT( I ) = CT( I ) + DCONJG( A( K, I ) )* + $ B( J, K ) + G( I ) = G( I ) + ABS1( A( K, I ) )* + $ ABS1( B( J, K ) ) + 140 CONTINUE + 150 CONTINUE + END IF + ELSE + IF( 
CTRANB )THEN + DO 170 K = 1, KK + DO 160 I = 1, M + CT( I ) = CT( I ) + A( K, I )* + $ DCONJG( B( J, K ) ) + G( I ) = G( I ) + ABS1( A( K, I ) )* + $ ABS1( B( J, K ) ) + 160 CONTINUE + 170 CONTINUE + ELSE + DO 190 K = 1, KK + DO 180 I = 1, M + CT( I ) = CT( I ) + A( K, I )*B( J, K ) + G( I ) = G( I ) + ABS1( A( K, I ) )* + $ ABS1( B( J, K ) ) + 180 CONTINUE + 190 CONTINUE + END IF + END IF + END IF + DO 200 I = 1, M + CT( I ) = ALPHA*CT( I ) + BETA*C( I, J ) + G( I ) = ABS1( ALPHA )*G( I ) + + $ ABS1( BETA )*ABS1( C( I, J ) ) + 200 CONTINUE +* +* Compute the error ratio for this result. +* + ERR = ZERO + DO 210 I = 1, M + ERRI = ABS1( CT( I ) - CC( I, J ) )/EPS + IF( G( I ).NE.RZERO ) + $ ERRI = ERRI/G( I ) + ERR = MAX( ERR, ERRI ) + IF( ERR*SQRT( EPS ).GE.RONE ) + $ GO TO 230 + 210 CONTINUE +* + 220 CONTINUE +* +* If the loop completes, all results are at least half accurate. + GO TO 250 +* +* Report fatal error. +* + 230 FATAL = .TRUE. + WRITE( NOUT, FMT = 9999 ) + DO 240 I = 1, M + IF( MV )THEN + WRITE( NOUT, FMT = 9998 )I, CT( I ), CC( I, J ) + ELSE + WRITE( NOUT, FMT = 9998 )I, CC( I, J ), CT( I ) + END IF + 240 CONTINUE + IF( N.GT.1 ) + $ WRITE( NOUT, FMT = 9997 )J +* + 250 CONTINUE + RETURN +* + 9999 FORMAT( ' ******* FATAL ERROR - COMPUTED RESULT IS LESS THAN HAL', + $ 'F ACCURATE *******', /' EXPECTED RE', + $ 'SULT COMPUTED RESULT' ) + 9998 FORMAT( 1X, I7, 2( ' (', G15.6, ',', G15.6, ')' ) ) + 9997 FORMAT( ' THESE ARE THE RESULTS FOR COLUMN ', I3 ) +* +* End of ZMMCH. +* + END + LOGICAL FUNCTION LZE( RI, RJ, LR ) +* +* Tests if two arrays are identical. +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Scalar Arguments .. + INTEGER LR +* .. Array Arguments .. + COMPLEX*16 RI( * ), RJ( * ) +* .. Local Scalars .. 
+ INTEGER I +* .. Executable Statements .. + DO 10 I = 1, LR + IF( RI( I ).NE.RJ( I ) ) + $ GO TO 20 + 10 CONTINUE + LZE = .TRUE. + GO TO 30 + 20 CONTINUE + LZE = .FALSE. + 30 RETURN +* +* End of LZE. +* + END + LOGICAL FUNCTION LZERES( TYPE, UPLO, M, N, AA, AS, LDA ) +* +* Tests if selected elements in two arrays are equal. +* +* TYPE is 'ge' or 'he' or 'sy'. +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Scalar Arguments .. + INTEGER LDA, M, N + CHARACTER*1 UPLO + CHARACTER*2 TYPE +* .. Array Arguments .. + COMPLEX*16 AA( LDA, * ), AS( LDA, * ) +* .. Local Scalars .. + INTEGER I, IBEG, IEND, J + LOGICAL UPPER +* .. Executable Statements .. + UPPER = UPLO.EQ.'U' + IF( TYPE.EQ.'ge' )THEN + DO 20 J = 1, N + DO 10 I = M + 1, LDA + IF( AA( I, J ).NE.AS( I, J ) ) + $ GO TO 70 + 10 CONTINUE + 20 CONTINUE + ELSE IF( TYPE.EQ.'he'.OR.TYPE.EQ.'sy' )THEN + DO 50 J = 1, N + IF( UPPER )THEN + IBEG = 1 + IEND = J + ELSE + IBEG = J + IEND = N + END IF + DO 30 I = 1, IBEG - 1 + IF( AA( I, J ).NE.AS( I, J ) ) + $ GO TO 70 + 30 CONTINUE + DO 40 I = IEND + 1, LDA + IF( AA( I, J ).NE.AS( I, J ) ) + $ GO TO 70 + 40 CONTINUE + 50 CONTINUE + END IF +* + 60 CONTINUE + LZERES = .TRUE. + GO TO 80 + 70 CONTINUE + LZERES = .FALSE. + 80 RETURN +* +* End of LZERES. +* + END + COMPLEX*16 FUNCTION ZBEG( RESET ) +* +* Generates complex numbers as pairs of random numbers uniformly +* distributed between -0.5 and 0.5. +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Scalar Arguments .. + LOGICAL RESET +* .. Local Scalars .. 
+ INTEGER I, IC, J, MI, MJ +* .. Save statement .. + SAVE I, IC, J, MI, MJ +* .. Intrinsic Functions .. + INTRINSIC DCMPLX +* .. Executable Statements .. + IF( RESET )THEN +* Initialize local variables. + MI = 891 + MJ = 457 + I = 7 + J = 7 + IC = 0 + RESET = .FALSE. + END IF +* +* The sequence of values of I or J is bounded between 1 and 999. +* If initial I or J = 1,2,3,6,7 or 9, the period will be 50. +* If initial I or J = 4 or 8, the period will be 25. +* If initial I or J = 5, the period will be 10. +* IC is used to break up the period by skipping 1 value of I or J +* in 6. +* + IC = IC + 1 + 10 I = I*MI + J = J*MJ + I = I - 1000*( I/1000 ) + J = J - 1000*( J/1000 ) + IF( IC.GE.5 )THEN + IC = 0 + GO TO 10 + END IF + ZBEG = DCMPLX( ( I - 500 )/1001.0D0, ( J - 500 )/1001.0D0 ) + RETURN +* +* End of ZBEG. +* + END + DOUBLE PRECISION FUNCTION DDIFF( X, Y ) +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Scalar Arguments .. + DOUBLE PRECISION X, Y +* .. Executable Statements .. + DDIFF = X - Y + RETURN +* +* End of DDIFF. 
+* + END + diff --git a/ctest/cblas_test.h b/ctest/cblas_test.h index 53cb99f9e..3eeb46ac2 100644 --- a/ctest/cblas_test.h +++ b/ctest/cblas_test.h @@ -173,12 +173,14 @@ typedef struct { double real; double imag; } CBLAS_TEST_ZOMPLEX; #define F77_dtrmm cdtrmm_ #define F77_dtrsm cdtrsm_ #define F77_cgemm ccgemm_ + #define F77_cgemm3m ccgemm3m_ #define F77_csymm ccsymm_ #define F77_csyrk ccsyrk_ #define F77_csyr2k ccsyr2k_ #define F77_ctrmm cctrmm_ #define F77_ctrsm cctrsm_ #define F77_zgemm czgemm_ + #define F77_zgemm3m czgemm3m_ #define F77_zsymm czsymm_ #define F77_zsyrk czsyrk_ #define F77_zsyr2k czsyr2k_ @@ -333,12 +335,14 @@ typedef struct { double real; double imag; } CBLAS_TEST_ZOMPLEX; #define F77_dtrmm CDTRMM #define F77_dtrsm CDTRSM #define F77_cgemm CCGEMM + #define F77_cgemm3m CCGEMM3M #define F77_csymm CCSYMM #define F77_csyrk CCSYRK #define F77_csyr2k CCSYR2K #define F77_ctrmm CCTRMM #define F77_ctrsm CCTRSM #define F77_zgemm CZGEMM + #define F77_zgemm3m CZGEMM3M #define F77_zsymm CZSYMM #define F77_zsyrk CZSYRK #define F77_zsyr2k CZSYR2K @@ -493,12 +497,14 @@ typedef struct { double real; double imag; } CBLAS_TEST_ZOMPLEX; #define F77_dtrmm cdtrmm #define F77_dtrsm cdtrsm #define F77_cgemm ccgemm + #define F77_cgemm3m ccgemm3m #define F77_csymm ccsymm #define F77_csyrk ccsyrk #define F77_csyr2k ccsyr2k #define F77_ctrmm cctrmm #define F77_ctrsm cctrsm #define F77_zgemm czgemm + #define F77_zgemm3m czgemm3m #define F77_zsymm czsymm #define F77_zsyrk czsyrk #define F77_zsyr2k czsyr2k diff --git a/ctest/cin3_3m b/ctest/cin3_3m new file mode 100644 index 000000000..34014143e --- /dev/null +++ b/ctest/cin3_3m @@ -0,0 +1,22 @@ +'CBLAT3.SNAP' NAME OF SNAPSHOT OUTPUT FILE +-1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) +F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. +F LOGICAL FLAG, T TO STOP ON FAILURES. +T LOGICAL FLAG, T TO TEST ERROR EXITS. 
+2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH +16.0 THRESHOLD VALUE OF TEST RATIO +7 NUMBER OF VALUES OF N +0 1 2 3 5 9 35 VALUES OF N +3 NUMBER OF VALUES OF ALPHA +(0.0,0.0) (1.0,0.0) (0.7,-0.9) VALUES OF ALPHA +3 NUMBER OF VALUES OF BETA +(0.0,0.0) (1.0,0.0) (1.3,-1.1) VALUES OF BETA +cblas_cgemm3m T PUT F FOR NO TEST. SAME COLUMNS. +cblas_chemm F PUT F FOR NO TEST. SAME COLUMNS. +cblas_csymm F PUT F FOR NO TEST. SAME COLUMNS. +cblas_ctrmm F PUT F FOR NO TEST. SAME COLUMNS. +cblas_ctrsm F PUT F FOR NO TEST. SAME COLUMNS. +cblas_cherk F PUT F FOR NO TEST. SAME COLUMNS. +cblas_csyrk F PUT F FOR NO TEST. SAME COLUMNS. +cblas_cher2k F PUT F FOR NO TEST. SAME COLUMNS. +cblas_csyr2k F PUT F FOR NO TEST. SAME COLUMNS. diff --git a/ctest/zin3_3m b/ctest/zin3_3m new file mode 100644 index 000000000..33bf08353 --- /dev/null +++ b/ctest/zin3_3m @@ -0,0 +1,22 @@ +'ZBLAT3.SNAP' NAME OF SNAPSHOT OUTPUT FILE +-1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) +F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. +F LOGICAL FLAG, T TO STOP ON FAILURES. +T LOGICAL FLAG, T TO TEST ERROR EXITS. +2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH +16.0 THRESHOLD VALUE OF TEST RATIO +7 NUMBER OF VALUES OF N +0 1 2 3 5 9 35 VALUES OF N +3 NUMBER OF VALUES OF ALPHA +(0.0,0.0) (1.0,0.0) (0.7,-0.9) VALUES OF ALPHA +3 NUMBER OF VALUES OF BETA +(0.0,0.0) (1.0,0.0) (1.3,-1.1) VALUES OF BETA +cblas_zgemm3m T PUT F FOR NO TEST. SAME COLUMNS. +cblas_zhemm F PUT F FOR NO TEST. SAME COLUMNS. +cblas_zsymm F PUT F FOR NO TEST. SAME COLUMNS. +cblas_ztrmm F PUT F FOR NO TEST. SAME COLUMNS. +cblas_ztrsm F PUT F FOR NO TEST. SAME COLUMNS. +cblas_zherk F PUT F FOR NO TEST. SAME COLUMNS. +cblas_zsyrk F PUT F FOR NO TEST. SAME COLUMNS. +cblas_zher2k F PUT F FOR NO TEST. SAME COLUMNS. +cblas_zsyr2k F PUT F FOR NO TEST. SAME COLUMNS. 
diff --git a/exports/gensymbol b/exports/gensymbol index 2a50a8df2..bcea83667 100644 --- a/exports/gensymbol +++ b/exports/gensymbol @@ -54,7 +54,7 @@ cblas_ztrsv, cblas_cdotc_sub, cblas_cdotu_sub, cblas_zdotc_sub, cblas_zdotu_sub, cblas_saxpby,cblas_daxpby,cblas_caxpby,cblas_zaxpby, cblas_somatcopy, cblas_domatcopy, cblas_comatcopy, cblas_zomatcopy, - cblas_simatcopy, cblas_dimatcopy, cblas_cimatcopy, cblas_zimatcopy + cblas_simatcopy, cblas_dimatcopy, cblas_cimatcopy, cblas_zimatcopy, ); @exblasobjs = ( diff --git a/test/zblat3_3m.dat b/test/zblat3_3m.dat index f48cc19db..9c8412114 100644 --- a/test/zblat3_3m.dat +++ b/test/zblat3_3m.dat @@ -13,8 +13,8 @@ F LOGICAL FLAG, T TO TEST ERROR EXITS. 3 NUMBER OF VALUES OF BETA (0.0,0.0) (1.0,0.0) (1.3,-1.1) VALUES OF BETA ZGEMM3M T PUT F FOR NO TEST. SAME COLUMNS. -ZHEMM F PUT F FOR NO TEST. SAME COLUMNS. -ZSYMM F PUT F FOR NO TEST. SAME COLUMNS. +ZHEMM T PUT F FOR NO TEST. SAME COLUMNS. +ZSYMM T PUT F FOR NO TEST. SAME COLUMNS. ZTRMM F PUT F FOR NO TEST. SAME COLUMNS. ZTRSM F PUT F FOR NO TEST. SAME COLUMNS. ZHERK F PUT F FOR NO TEST. SAME COLUMNS. diff --git a/test/zblat3_3m.f b/test/zblat3_3m.f index 8ec0396c5..bac23aa54 100644 --- a/test/zblat3_3m.f +++ b/test/zblat3_3m.f @@ -4,7 +4,7 @@ * * The program must be driven by a short data file. The first 14 records * of the file are read using list-directed input, the last 9 records -* are read using the format ( A6, L2 ). An annotated example of a data +* are read using the format ( A8, L2 ). 
An annotated example of a data * file can be obtained by deleting the first 3 characters from the * following 23 lines: * 'ZBLAT3.SUMM' NAME OF SUMMARY OUTPUT FILE @@ -348,7 +348,7 @@ $ 7( '(', F4.1, ',', F4.1, ') ', : ) ) 9991 FORMAT( ' AMEND DATA FILE OR INCREASE ARRAY SIZES IN PROGRAM', $ /' ******* TESTS ABANDONED *******' ) - 9990 FORMAT( ' SUBPROGRAM NAME ', A6, ' NOT RECOGNIZED', /' ******* T', + 9990 FORMAT( ' SUBPROGRAM NAME ', A8, ' NOT RECOGNIZED', /' ******* T', $ 'ESTS ABANDONED *******' ) 9989 FORMAT( ' ERROR IN ZMMCH - IN-LINE DOT PRODUCTS ARE BEING EVALU', $ 'ATED WRONGLY.', /' ZMMCH WAS CALLED WITH TRANSA = ', A1, @@ -912,15 +912,15 @@ 120 CONTINUE RETURN * - 9999 FORMAT( ' ', A6, ' PASSED THE COMPUTATIONAL TESTS (', I6, ' CALL', + 9999 FORMAT( ' ', A8, ' PASSED THE COMPUTATIONAL TESTS (', I6, ' CALL', $ 'S)' ) 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', $ 'ANGED INCORRECTLY *******' ) - 9997 FORMAT( ' ', A6, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C', + 9997 FORMAT( ' ', A8, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C', $ 'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2, $ ' - SUSPECT *******' ) - 9996 FORMAT( ' ******* ', A6, ' FAILED ON CALL NUMBER:' ) - 9995 FORMAT( 1X, I6, ': ', A6, '(', 2( '''', A1, ''',' ), 2( I3, ',' ), + 9996 FORMAT( ' ******* ', A8, ' FAILED ON CALL NUMBER:' ) + 9995 FORMAT( 1X, I6, ': ', A8, '(', 2( '''', A1, ''',' ), 2( I3, ',' ), $ '(', F4.1, ',', F4.1, '), A,', I3, ', B,', I3, ',(', F4.1, $ ',', F4.1, '), C,', I3, ') .' 
) 9994 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', @@ -1223,15 +1223,15 @@ 160 CONTINUE RETURN * - 9999 FORMAT( ' ', A6, ' PASSED THE COMPUTATIONAL TESTS (', I6, ' CALL', + 9999 FORMAT( ' ', A8, ' PASSED THE COMPUTATIONAL TESTS (', I6, ' CALL', $ 'S)' ) 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', $ 'ANGED INCORRECTLY *******' ) - 9997 FORMAT( ' ', A6, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C', + 9997 FORMAT( ' ', A8, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C', $ 'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2, $ ' - SUSPECT *******' ) - 9996 FORMAT( ' ******* ', A6, ' FAILED ON CALL NUMBER:' ) - 9995 FORMAT( 1X, I6, ': ', A6, '(', 4( '''', A1, ''',' ), 2( I3, ',' ), + 9996 FORMAT( ' ******* ', A8, ' FAILED ON CALL NUMBER:' ) + 9995 FORMAT( 1X, I6, ': ', A8, '(', 4( '''', A1, ''',' ), 2( I3, ',' ), $ '(', F4.1, ',', F4.1, '), A,', I3, ', B,', I3, ') ', $ ' .' ) 9994 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', @@ -1553,19 +1553,19 @@ 130 CONTINUE RETURN * - 9999 FORMAT( ' ', A6, ' PASSED THE COMPUTATIONAL TESTS (', I6, ' CALL', + 9999 FORMAT( ' ', A8, ' PASSED THE COMPUTATIONAL TESTS (', I6, ' CALL', $ 'S)' ) 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', $ 'ANGED INCORRECTLY *******' ) - 9997 FORMAT( ' ', A6, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C', + 9997 FORMAT( ' ', A8, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C', $ 'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2, $ ' - SUSPECT *******' ) - 9996 FORMAT( ' ******* ', A6, ' FAILED ON CALL NUMBER:' ) + 9996 FORMAT( ' ******* ', A8, ' FAILED ON CALL NUMBER:' ) 9995 FORMAT( ' THESE ARE THE RESULTS FOR COLUMN ', I3 ) - 9994 FORMAT( 1X, I6, ': ', A6, '(', 2( '''', A1, ''',' ), 2( I3, ',' ), + 9994 FORMAT( 1X, I6, ': ', A8, '(', 2( '''', A1, ''',' ), 2( I3, ',' ), $ F4.1, ', A,', I3, ',', F4.1, ', C,', I3, ') ', $ ' .' 
) - 9993 FORMAT( 1X, I6, ': ', A6, '(', 2( '''', A1, ''',' ), 2( I3, ',' ), + 9993 FORMAT( 1X, I6, ': ', A8, '(', 2( '''', A1, ''',' ), 2( I3, ',' ), $ '(', F4.1, ',', F4.1, ') , A,', I3, ',(', F4.1, ',', F4.1, $ '), C,', I3, ') .' ) 9992 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', @@ -1927,19 +1927,19 @@ 160 CONTINUE RETURN * - 9999 FORMAT( ' ', A6, ' PASSED THE COMPUTATIONAL TESTS (', I6, ' CALL', + 9999 FORMAT( ' ', A8, ' PASSED THE COMPUTATIONAL TESTS (', I6, ' CALL', $ 'S)' ) 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', $ 'ANGED INCORRECTLY *******' ) - 9997 FORMAT( ' ', A6, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C', + 9997 FORMAT( ' ', A8, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C', $ 'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2, $ ' - SUSPECT *******' ) - 9996 FORMAT( ' ******* ', A6, ' FAILED ON CALL NUMBER:' ) + 9996 FORMAT( ' ******* ', A8, ' FAILED ON CALL NUMBER:' ) 9995 FORMAT( ' THESE ARE THE RESULTS FOR COLUMN ', I3 ) - 9994 FORMAT( 1X, I6, ': ', A6, '(', 2( '''', A1, ''',' ), 2( I3, ',' ), + 9994 FORMAT( 1X, I6, ': ', A8, '(', 2( '''', A1, ''',' ), 2( I3, ',' ), $ '(', F4.1, ',', F4.1, '), A,', I3, ', B,', I3, ',', F4.1, $ ', C,', I3, ') .' ) - 9993 FORMAT( 1X, I6, ': ', A6, '(', 2( '''', A1, ''',' ), 2( I3, ',' ), + 9993 FORMAT( 1X, I6, ': ', A8, '(', 2( '''', A1, ''',' ), 2( I3, ',' ), $ '(', F4.1, ',', F4.1, '), A,', I3, ', B,', I3, ',(', F4.1, $ ',', F4.1, '), C,', I3, ') .' ) 9992 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', @@ -2867,8 +2867,8 @@ END IF RETURN * - 9999 FORMAT( ' ', A6, ' PASSED THE TESTS OF ERROR-EXITS' ) - 9998 FORMAT( ' ******* ', A6, ' FAILED THE TESTS OF ERROR-EXITS *****', + 9999 FORMAT( ' ', A8, ' PASSED THE TESTS OF ERROR-EXITS' ) + 9998 FORMAT( ' ******* ', A8, ' FAILED THE TESTS OF ERROR-EXITS *****', $ '**' ) * * End of ZCHKE. 
@@ -3385,7 +3385,7 @@ RETURN * 9999 FORMAT( ' ***** ILLEGAL VALUE OF PARAMETER NUMBER ', I2, ' NOT D', - $ 'ETECTED BY ', A6, ' *****' ) + $ 'ETECTED BY ', A8, ' *****' ) * * End of CHKXER. * @@ -3437,8 +3437,8 @@ * 9999 FORMAT( ' ******* XERBLA WAS CALLED WITH INFO = ', I6, ' INSTEAD', $ ' OF ', I2, ' *******' ) - 9998 FORMAT( ' ******* XERBLA WAS CALLED WITH SRNAME = ', A6, ' INSTE', - $ 'AD OF ', A6, ' *******' ) + 9998 FORMAT( ' ******* XERBLA WAS CALLED WITH SRNAME = ', A8, ' INSTE', + $ 'AD OF ', A8, ' *******' ) 9997 FORMAT( ' ******* XERBLA WAS CALLED WITH INFO = ', I6, $ ' *******' ) * From 9d7057366d0be006551c0574cdd7a4875cd6422c Mon Sep 17 00:00:00 2001 From: wernsaar Date: Sun, 21 Sep 2014 11:41:43 +0200 Subject: [PATCH 115/119] bugfix for GEMM3M functions --- param.h | 17 ++++++++--------- test/zblat3_3m.dat | 4 ++-- 2 files changed, 10 insertions(+), 11 deletions(-) diff --git a/param.h b/param.h index 4adb0a1de..3e20f5882 100644 --- a/param.h +++ b/param.h @@ -1168,10 +1168,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define ZGEMM_DEFAULT_Q 192 #define XGEMM_DEFAULT_Q 128 -#define CGEMM3M_DEFAULT_UNROLL_N 4 -#define CGEMM3M_DEFAULT_UNROLL_M 8 -#define ZGEMM3M_DEFAULT_UNROLL_N 2 -#define ZGEMM3M_DEFAULT_UNROLL_M 8 +#define CGEMM3M_DEFAULT_UNROLL_N 8 +#define CGEMM3M_DEFAULT_UNROLL_M 4 +#define ZGEMM3M_DEFAULT_UNROLL_N 8 +#define ZGEMM3M_DEFAULT_UNROLL_M 2 #define CGEMM3M_DEFAULT_P 448 #define ZGEMM3M_DEFAULT_P 224 @@ -1287,11 +1287,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define XGEMM_DEFAULT_R xgemm_r #define XGEMM_DEFAULT_Q 128 -#define CGEMM3M_DEFAULT_UNROLL_N 4 -#define CGEMM3M_DEFAULT_UNROLL_M 8 -#define ZGEMM3M_DEFAULT_UNROLL_N 2 -#define ZGEMM3M_DEFAULT_UNROLL_M 8 - +#define CGEMM3M_DEFAULT_UNROLL_N 8 +#define CGEMM3M_DEFAULT_UNROLL_M 4 +#define ZGEMM3M_DEFAULT_UNROLL_N 8 +#define ZGEMM3M_DEFAULT_UNROLL_M 2 #define CGEMM3M_DEFAULT_P 448 #define ZGEMM3M_DEFAULT_P 224 diff --git a/test/zblat3_3m.dat b/test/zblat3_3m.dat index 9c8412114..f48cc19db 100644 --- a/test/zblat3_3m.dat +++ b/test/zblat3_3m.dat @@ -13,8 +13,8 @@ F LOGICAL FLAG, T TO TEST ERROR EXITS. 3 NUMBER OF VALUES OF BETA (0.0,0.0) (1.0,0.0) (1.3,-1.1) VALUES OF BETA ZGEMM3M T PUT F FOR NO TEST. SAME COLUMNS. -ZHEMM T PUT F FOR NO TEST. SAME COLUMNS. -ZSYMM T PUT F FOR NO TEST. SAME COLUMNS. +ZHEMM F PUT F FOR NO TEST. SAME COLUMNS. +ZSYMM F PUT F FOR NO TEST. SAME COLUMNS. ZTRMM F PUT F FOR NO TEST. SAME COLUMNS. ZTRSM F PUT F FOR NO TEST. SAME COLUMNS. ZHERK F PUT F FOR NO TEST. SAME COLUMNS. 
From dab4edd0693a6f4b40b49e97b32bdb455307dac5 Mon Sep 17 00:00:00 2001 From: wernsaar Date: Sun, 21 Sep 2014 12:00:41 +0200 Subject: [PATCH 116/119] added benchmark for gemm3m functions --- benchmark/Makefile | 26 ++++++ benchmark/gemm3m.c | 212 +++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 238 insertions(+) create mode 100644 benchmark/gemm3m.c diff --git a/benchmark/Makefile b/benchmark/Makefile index 07bf5a792..cf219cef1 100644 --- a/benchmark/Makefile +++ b/benchmark/Makefile @@ -112,6 +112,11 @@ mkl :: slinpack.mkl dlinpack.mkl clinpack.mkl zlinpack.mkl \ spotrf.mkl dpotrf.mkl cpotrf.mkl zpotrf.mkl \ ssymm.mkl dsymm.mkl csymm.mkl zsymm.mkl + +goto_3m :: cgemm3m.goto zgemm3m.goto + +mkl_3m :: cgemm3m.mkl zgemm3m.mkl + all :: goto mkl atlas acml ##################################### Slinpack #################################################### @@ -1043,6 +1048,22 @@ zaxpy.mkl : zaxpy.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +##################################### Cgemm3m #################################################### + +cgemm3m.goto : cgemm3m.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm + +cgemm3m.mkl : cgemm3m.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Zgemm3m #################################################### + +zgemm3m.goto : zgemm3m.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm + +zgemm3m.mkl : zgemm3m.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + ################################################################################################### @@ -1250,6 +1271,11 @@ caxpy.$(SUFFIX) : axpy.c zaxpy.$(SUFFIX) : axpy.c $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ +cgemm3m.$(SUFFIX) : gemm3m.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +zgemm3m.$(SUFFIX) : gemm3m.c + $(CC) 
$(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ clean :: diff --git a/benchmark/gemm3m.c b/benchmark/gemm3m.c new file mode 100644 index 000000000..048d74be6 --- /dev/null +++ b/benchmark/gemm3m.c @@ -0,0 +1,212 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include +#include +#ifdef __CYGWIN32__ +#include +#endif +#include "common.h" + + +#undef GEMM + +#ifndef COMPLEX + +#ifdef DOUBLE +#define GEMM BLASFUNC(dgemm) +#else +#define GEMM BLASFUNC(sgemm) +#endif + +#else + +#ifdef DOUBLE +#define GEMM BLASFUNC(zgemm3m) +#else +#define GEMM BLASFUNC(cgemm3m) +#endif + +#endif + +#if defined(__WIN32__) || defined(__WIN64__) + +#ifndef DELTA_EPOCH_IN_MICROSECS +#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL +#endif + +int gettimeofday(struct timeval *tv, void *tz){ + + FILETIME ft; + unsigned __int64 tmpres = 0; + static int tzflag; + + if (NULL != tv) + { + GetSystemTimeAsFileTime(&ft); + + tmpres |= ft.dwHighDateTime; + tmpres <<= 32; + tmpres |= ft.dwLowDateTime; + + /*converting file time to unix epoch*/ + tmpres /= 10; /*convert into microseconds*/ + tmpres -= DELTA_EPOCH_IN_MICROSECS; + tv->tv_sec = (long)(tmpres / 1000000UL); + tv->tv_usec = (long)(tmpres % 1000000UL); + } + + return 0; +} + +#endif + +#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 + +static void *huge_malloc(BLASLONG size){ + int shmid; + void *address; + +#ifndef SHM_HUGETLB +#define SHM_HUGETLB 04000 +#endif + + if ((shmid =shmget(IPC_PRIVATE, + (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), + SHM_HUGETLB | IPC_CREAT |0600)) < 0) { + printf( "Memory allocation failed(shmget).\n"); + exit(1); + } + + address = shmat(shmid, NULL, SHM_RND); + + if ((BLASLONG)address == -1){ + printf( "Memory allocation failed(shmat).\n"); + exit(1); + } + + shmctl(shmid, IPC_RMID, 0); + + return address; +} + +#define malloc huge_malloc + +#endif + +int MAIN__(int argc, char *argv[]){ + + FLOAT *a, *b, *c; + FLOAT alpha[] = {1.0, 1.0}; + FLOAT beta [] = {1.0, 1.0}; + char trans='N'; + blasint m, i, j; + int loops = 1; + int l; + char *p; + + int from = 1; + int to = 200; + int step = 1; + + struct timeval start, stop; + double time1,timeg; + + 
argc--;argv++; + + if (argc > 0) { from = atol(*argv); argc--; argv++;} + if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} + if (argc > 0) { step = atol(*argv); argc--; argv++;} + + if ((p = getenv("OPENBLAS_TRANS"))) trans=*p; + + fprintf(stderr, "From : %3d To : %3d Step=%d : Trans=%c\n", from, to, step, trans); + + if (( a = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + + if (( b = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + + if (( c = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + + p = getenv("OPENBLAS_LOOPS"); + if ( p != NULL ) + loops = atoi(p); + + +#ifdef linux + srandom(getpid()); +#endif + + fprintf(stderr, " SIZE Flops\n"); + + for(m = from; m <= to; m += step) + { + + timeg=0; + + fprintf(stderr, " %6d : ", (int)m); + + for (l=0; l Date: Sun, 21 Sep 2014 13:39:15 +0200 Subject: [PATCH 117/119] updated cblas.h and cblas_noconst.h --- cblas.h | 2 ++ cblas_noconst.h | 4 ++++ 2 files changed, 6 insertions(+) diff --git a/cblas.h b/cblas.h index d772fde3f..d1c029afa 100644 --- a/cblas.h +++ b/cblas.h @@ -243,6 +243,8 @@ void cblas_dgemm(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLA OPENBLAS_CONST double alpha, OPENBLAS_CONST double *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST double *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST double beta, double *C, OPENBLAS_CONST blasint ldc); void cblas_cgemm(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransB, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K, OPENBLAS_CONST float *alpha, OPENBLAS_CONST float *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST float *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST float *beta, float *C, OPENBLAS_CONST blasint ldc); +void 
cblas_cgemm3m(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransB, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K, + OPENBLAS_CONST float *alpha, OPENBLAS_CONST float *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST float *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST float *beta, float *C, OPENBLAS_CONST blasint ldc); void cblas_zgemm(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransB, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K, OPENBLAS_CONST double *alpha, OPENBLAS_CONST double *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST double *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST double *beta, double *C, OPENBLAS_CONST blasint ldc); void cblas_zgemm3m(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransB, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K, diff --git a/cblas_noconst.h b/cblas_noconst.h index 884d9f63f..bc6382513 100644 --- a/cblas_noconst.h +++ b/cblas_noconst.h @@ -231,8 +231,12 @@ void cblas_dgemm(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA, enum CBLAS double alpha, double *A, blasint lda, double *B, blasint ldb, double beta, double *C, blasint ldc); void cblas_cgemm(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, blasint M, blasint N, blasint K, float *alpha, float *A, blasint lda, float *B, blasint ldb, float *beta, float *C, blasint ldc); +void cblas_cgemm3m(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, blasint M, blasint N, blasint K, + float *alpha, float *A, blasint lda, float *B, blasint ldb, float *beta, float *C, blasint ldc); void cblas_zgemm(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, blasint M, blasint N, blasint K, 
double *alpha, double *A, blasint lda, double *B, blasint ldb, double *beta, double *C, blasint ldc); +void cblas_zgemm3m(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, blasint M, blasint N, blasint K, + double *alpha, double *A, blasint lda, double *B, blasint ldb, double *beta, double *C, blasint ldc); void cblas_ssymm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, blasint M, blasint N, float alpha, float *A, blasint lda, float *B, blasint ldb, float beta, float *C, blasint ldc); From f1b9a4a1ca79d2abb924a6effa9aaed67eacb427 Mon Sep 17 00:00:00 2001 From: wernsaar Date: Tue, 23 Sep 2014 11:34:29 +0200 Subject: [PATCH 118/119] Ref #454: fixed bug in common_param.h --- common_param.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/common_param.h b/common_param.h index e9f35c033..49c1bf73b 100644 --- a/common_param.h +++ b/common_param.h @@ -1002,7 +1002,7 @@ extern gotoblas_t *gotoblas; #define XGEMM_UNROLL_N XGEMM_DEFAULT_UNROLL_N #define XGEMM_UNROLL_MN MAX((XGEMM_UNROLL_M), (XGEMM_UNROLL_N)) -#ifdef CGEMM_DEFAULT_UNROLL_N +#ifdef CGEMM3M_DEFAULT_UNROLL_N #define CGEMM3M_P CGEMM3M_DEFAULT_P #define CGEMM3M_Q CGEMM3M_DEFAULT_Q @@ -1023,7 +1023,7 @@ extern gotoblas_t *gotoblas; #endif -#ifdef ZGEMM_DEFAULT_UNROLL_N +#ifdef ZGEMM3M_DEFAULT_UNROLL_N #define ZGEMM3M_P ZGEMM3M_DEFAULT_P #define ZGEMM3M_Q ZGEMM3M_DEFAULT_Q From ac5a7e1c1bb75d1accfb83c394e1535b5ff170d2 Mon Sep 17 00:00:00 2001 From: Zhang Xianyi Date: Mon, 13 Oct 2014 17:10:12 +0800 Subject: [PATCH 119/119] Update dot to 0.2.12 version. 
--- Changelog.txt | 15 +++++++++++++++ Makefile.rule | 2 +- 2 files changed, 16 insertions(+), 1 deletion(-) diff --git a/Changelog.txt b/Changelog.txt index 4fd849048..e9fe824ca 100644 --- a/Changelog.txt +++ b/Changelog.txt @@ -1,4 +1,19 @@ OpenBLAS ChangeLog +==================================================================== +Version 0.2.12 +13-Oct-2014 +common: + * Added CBLAS interface for ?omatcopy and ?imatcopy. + * Enable ?gemm3m functions. + * Added benchmark for ?gemm3m. + * Optimized multithreading lower limits. + * Disabled SYMM3M and HEMM3M functions + because of segment violations. + +x86/x86-64: + * Improved axpy and symv performance on AMD Bulldozer. + * Improved gemv performance on modern Intel and AMD CPUs. + ==================================================================== Version 0.2.11 18-Aug-2014 diff --git a/Makefile.rule b/Makefile.rule index 7430320b7..7f0356fff 100644 --- a/Makefile.rule +++ b/Makefile.rule @@ -3,7 +3,7 @@ # # This library's version -VERSION = 0.2.11 +VERSION = 0.2.12 # If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a # and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library