From afe44b0241864889918a5f3390950574fad84657 Mon Sep 17 00:00:00 2001 From: wernsaar Date: Mon, 28 Oct 2013 14:23:48 +0100 Subject: [PATCH] tests and code cleanup of gemm_kernels for HASWELL --- kernel/x86_64/cgemm_kernel_8x2_haswell.S | 93 ++++++++++------------ kernel/x86_64/sgemm_kernel_16x4_haswell.S | 93 +++++++++++----------- kernel/x86_64/zgemm_kernel_4x2_haswell.S | 96 +++++++++++------------ param.h | 8 +- 4 files changed, 135 insertions(+), 155 deletions(-) diff --git a/kernel/x86_64/cgemm_kernel_8x2_haswell.S b/kernel/x86_64/cgemm_kernel_8x2_haswell.S index bac773969..9729e6d70 100644 --- a/kernel/x86_64/cgemm_kernel_8x2_haswell.S +++ b/kernel/x86_64/cgemm_kernel_8x2_haswell.S @@ -1,61 +1,51 @@ -/*********************************************************************/ -/* Copyright 2009, 2010 The University of Texas at Austin. */ -/* All rights reserved. */ -/* */ -/* Redistribution and use in source and binary forms, with or */ -/* without modification, are permitted provided that the following */ -/* conditions are met: */ -/* */ -/* 1. Redistributions of source code must retain the above */ -/* copyright notice, this list of conditions and the following */ -/* disclaimer. */ -/* */ -/* 2. Redistributions in binary form must reproduce the above */ -/* copyright notice, this list of conditions and the following */ -/* disclaimer in the documentation and/or other materials */ -/* provided with the distribution. */ -/* */ -/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ -/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ -/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ -/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ -/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ -/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ -/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ -/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ -/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ -/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ -/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ -/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ -/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ -/* POSSIBILITY OF SUCH DAMAGE. */ -/* */ -/* The views and conclusions contained in the software and */ -/* documentation are those of the authors and should not be */ -/* interpreted as representing official policies, either expressed */ -/* or implied, of The University of Texas at Austin. */ -/*********************************************************************/ +/********************************************************************************* +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************************/ /********************************************************************* -* 2013/10/19 Saar -* BLASTEST : +* 2013/10/28 Saar +* BLASTEST : OK * CTEST : OK * TEST : OK * -* 2013/08/16 Saar +* 2013/10/28 Saar * Parameter: * CGEMM_DEFAULT_UNROLL_N 2 * CGEMM_DEFAULT_UNROLL_M 8 -* CGEMM_DEFAULT_P 224 -* CGEMM_DEFAULT_Q 224 +* CGEMM_DEFAULT_P 384 +* CGEMM_DEFAULT_Q 192 +* A_PR1 512 +* B_PR1 512 * -* BLASTEST: OK +* Performance at 6912x6912x6912: +* 1 thread: 84 GFLOPS (SANDYBRIDGE: 60) (MKL: 86) +* 2 threads: 153 GFLOPS (SANDYBRIDGE: 114) (MKL: 155) +* 3 threads: 224 GFLOPS (SANDYBRIDGE: 162) (MKL: 222) +* 4 threads: 278 GFLOPS (SANDYBRIDGE: 223) (MKL: 279) * -* Performance: -* 1 thread: 2.04 times faster than sandybridge -* 4 threads: 1.96 times faster than sandybridge -* -* Compile for FMA3: OK * *********************************************************************/ @@ -235,8 +225,8 @@ #endif -#define A_PR1 384 -#define B_PR1 192 +#define A_PR1 512 +#define B_PR1 512 /***************************************************************************************************************************/ @@ -338,6 +328,9 @@ vmovups %ymm10 , (CO1, LDC) vmovups %ymm14 , 8 * SIZE(CO1, LDC) + prefetcht0 64(CO1) + prefetcht0 64(CO1, LDC) + .endm /***************************************************************************************************************************/ diff --git a/kernel/x86_64/sgemm_kernel_16x4_haswell.S b/kernel/x86_64/sgemm_kernel_16x4_haswell.S index c6489277d..78adbafbb 100644 --- a/kernel/x86_64/sgemm_kernel_16x4_haswell.S +++ b/kernel/x86_64/sgemm_kernel_16x4_haswell.S @@ -1,61 +1,51 @@ -/*********************************************************************/ -/* Copyright 2009, 2010 The University of Texas at Austin. */ -/* All rights reserved. */ -/* */ -/* Redistribution and use in source and binary forms, with or */ -/* without modification, are permitted provided that the following */ -/* conditions are met: */ -/* */ -/* 1. Redistributions of source code must retain the above */ -/* copyright notice, this list of conditions and the following */ -/* disclaimer. */ -/* */ -/* 2. Redistributions in binary form must reproduce the above */ -/* copyright notice, this list of conditions and the following */ -/* disclaimer in the documentation and/or other materials */ -/* provided with the distribution. */ -/* */ -/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ -/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ -/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ -/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ -/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ -/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ -/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ -/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ -/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ -/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ -/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ -/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ -/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ -/* POSSIBILITY OF SUCH DAMAGE. */ -/* */ -/* The views and conclusions contained in the software and */ -/* documentation are those of the authors and should not be */ -/* interpreted as representing official policies, either expressed */ -/* or implied, of The University of Texas at Austin. */ -/*********************************************************************/ +/********************************************************************************* +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************************/ /********************************************************************* -* 2013/10/19 Saar -* BLASTEST : +* 2013/10/28 Saar +* BLASTEST : OK * CTEST : OK * TEST : OK * -* 2013/08/15 Saar +* 2013/10/28 Saar * Parameter: * SGEMM_DEFAULT_UNROLL_N 4 * SGEMM_DEFAULT_UNROLL_M 16 * SGEMM_DEFAULT_P 768 -* SGEMM_DEFAULT_Q 168 +* SGEMM_DEFAULT_Q 384 +* A_PR1 512 +* B_PR1 512 * -* BLASTEST: OK -* -* Performance: -* 1 thread: 2.22 times faster than sandybridge -* 4 threads: 2.26 times faster than sandybridge * -* Compile for FMA3: OK +* Performance at 9216x9216x9216: +* 1 thread: 86 GFLOPS (SANDYBRIDGE: 59) (MKL: 83) +* 2 threads: 157 GFLOPS (SANDYBRIDGE: 116) (MKL: 155) +* 3 threads: 235 GFLOPS (SANDYBRIDGE: 165) (MKL: 230) +* 4 threads: 288 GFLOPS (SANDYBRIDGE: 223) (MKL: 267) * *********************************************************************/ @@ -162,8 +152,8 @@ #endif -#define A_PR1 384 -#define B_PR1 192 +#define A_PR1 512 +#define B_PR1 512 /******************************************************************************************* * 4 lines of N @@ -230,6 +220,11 @@ vmovups %ymm10, (CO2, LDC) vmovups %ymm11, 8 * SIZE(CO2, LDC) + prefetcht0 64(CO1) + prefetcht0 64(CO1, LDC) + prefetcht0 64(CO2) + prefetcht0 64(CO2, LDC) + .endm diff --git a/kernel/x86_64/zgemm_kernel_4x2_haswell.S b/kernel/x86_64/zgemm_kernel_4x2_haswell.S index f4b8142ce..949f90bea 100644 --- a/kernel/x86_64/zgemm_kernel_4x2_haswell.S +++ b/kernel/x86_64/zgemm_kernel_4x2_haswell.S @@ -1,62 +1,53 @@ -/*********************************************************************/ -/* Copyright 2009, 2010 The University of Texas at Austin. */ -/* All rights reserved. */ -/* */ -/* Redistribution and use in source and binary forms, with or */ -/* without modification, are permitted provided that the following */ -/* conditions are met: */ -/* */ -/* 1. Redistributions of source code must retain the above */ -/* copyright notice, this list of conditions and the following */ -/* disclaimer. */ -/* */ -/* 2. Redistributions in binary form must reproduce the above */ -/* copyright notice, this list of conditions and the following */ -/* disclaimer in the documentation and/or other materials */ -/* provided with the distribution. */ -/* */ -/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ -/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ -/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ -/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ -/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ -/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ -/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ -/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ -/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ -/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ -/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ -/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ -/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ -/* POSSIBILITY OF SUCH DAMAGE. */ -/* */ -/* The views and conclusions contained in the software and */ -/* documentation are those of the authors and should not be */ -/* interpreted as representing official policies, either expressed */ -/* or implied, of The University of Texas at Austin. */ -/*********************************************************************/ +/********************************************************************************* +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************************/ -/********************************************************************* -* 2013/10/19 Saar -* BLASTEST : +/******************************************************************************** +* 2013/10/28 Saar +* BLASTEST : OK * CTEST : OK * TEST : OK * -* 2013/08/16 Saar +* 2013/10/28 Saar * Parameter: * ZGEMM_DEFAULT_UNROLL_N 2 * ZGEMM_DEFAULT_UNROLL_M 4 -* ZGEMM_DEFAULT_P 112 -* ZGEMM_DEFAULT_Q 224 +* ZGEMM_DEFAULT_P 256 +* ZGEMM_DEFAULT_Q 128 +* A_PR1 512 +* B_PR1 512 * * -* Performance: -* 1 thread: 1.80 times faster than sandybridge -* 4 threads: 1.74 times faster than sandybridge +* Performance at 4608x4608x4608: +* 1 thread: 43 GFLOPS (SANDYBRIDGE: 29) (MKL: 53) +* 2 threads: 85 GFLOPS (SANDYBRIDGE: 59) (MKL: 100) +* 3 threads: 122 GFLOPS (SANDYBRIDGE: 86) (MKL: 138) +* 4 threads: 156 GFLOPS (SANDYBRIDGE: 108) (MKL: 172) * -* Compile for FMA3: OK -* -*********************************************************************/ +********************************************************************************/ #define ASSEMBLER @@ -232,8 +223,8 @@ #endif -#define A_PR1 384 -#define B_PR1 192 +#define A_PR1 512 +#define B_PR1 512 /***************************************************************************************************/ .macro KERNEL4x2_SUB @@ -335,7 +326,8 @@ vmovups %ymm10 , (CO1, LDC) vmovups %ymm14 , 4 * SIZE(CO1, LDC) - + prefetcht0 64(CO1) + prefetcht0 64(CO1, LDC) .endm diff --git a/param.h b/param.h index b6c1f0301..dd613fbf1 100644 --- a/param.h +++ b/param.h @@ -1223,12 +1223,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define SGEMM_DEFAULT_P 768 #define DGEMM_DEFAULT_P 512 #define CGEMM_DEFAULT_P 384 -#define ZGEMM_DEFAULT_P 192 +#define ZGEMM_DEFAULT_P 256 -#define SGEMM_DEFAULT_Q 168 +#define SGEMM_DEFAULT_Q 384 #define DGEMM_DEFAULT_Q 256 -#define CGEMM_DEFAULT_Q 168 -#define ZGEMM_DEFAULT_Q 168 +#define CGEMM_DEFAULT_Q 192 +#define ZGEMM_DEFAULT_Q 128 #define SGEMM_DEFAULT_R sgemm_r //#define DGEMM_DEFAULT_R dgemm_r