From 5118a7f4d1f701a23f76c7c4cbcdead7d5566b86 Mon Sep 17 00:00:00 2001 From: wernsaar Date: Thu, 31 Oct 2013 11:53:26 +0100 Subject: [PATCH] small optimizations on dgemm_kernel for Piledriver --- kernel/x86_64/KERNEL.PILEDRIVER | 5 +- kernel/x86_64/dgemm_kernel_8x2_piledriver.S | 142 +++++--------------- param.h | 2 +- 3 files changed, 41 insertions(+), 108 deletions(-) diff --git a/kernel/x86_64/KERNEL.PILEDRIVER b/kernel/x86_64/KERNEL.PILEDRIVER index 6c262c774..abed953c3 100644 --- a/kernel/x86_64/KERNEL.PILEDRIVER +++ b/kernel/x86_64/KERNEL.PILEDRIVER @@ -54,9 +54,10 @@ STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c -DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +DTRSMKERNEL_LT = dtrsm_kernel_LT_8x2_bulldozer.S +DTRSMKERNEL_RN = dtrsm_kernel_RN_8x2_bulldozer.S DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c diff --git a/kernel/x86_64/dgemm_kernel_8x2_piledriver.S b/kernel/x86_64/dgemm_kernel_8x2_piledriver.S index c719e96fc..cc0ebef8a 100644 --- a/kernel/x86_64/dgemm_kernel_8x2_piledriver.S +++ b/kernel/x86_64/dgemm_kernel_8x2_piledriver.S @@ -28,37 +28,38 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. /********************************************************************* * -* 2013/10/18 Saar +* 2013/10/31 Saar * BLASTEST : OK * CTEST : OK * TEST : OK * * -* 2013/10/18 Saar +* 2013/10/31 Saar * * Parameter: * UNROLL_M 8 * UNROLL_N 2 -* DGEMM_P 384 +* DGEMM_P 768 * DGEMM_Q 168 * DGEMM_R 12288 -* A_PR1 256 +* A_PR1 512 +* B_PR1 256 * * Performance at m x n on AMD 8320 (ACML-Version: 5.3.1): * -* 6144x6912 84.1 GFLOPS with 8 threads on 4 modules (ACML: 81.4 GFLOPS) -* 6144x6912 81.2 GFLOPS with 4 threads on 4 modules (ACML: 81.3 GFLOPS) -* 6144x6912 40.9 GFLOPS with 2 threads on 2 modules (ACML: 41.8 GFLOPS) -* 6144x6912 20.5 GFLOPS with 1 threads on 1 modules (ACML: 21.0 GFLOPS) +* 4608x4608 83.9 GFLOPS with 8 threads on 4 modules (ACML: 78.4 GFLOPS) +* 4608x4608 80.9 GFLOPS with 4 threads on 4 modules (ACML: 78.4 GFLOPS) +* 4608x4608 41.3 GFLOPS with 2 threads on 2 modules (ACML: 40.9 GFLOPS) +* 4608x4608 20.7 GFLOPS with 1 threads on 1 modules (ACML: 20.8 GFLOPS) * * Performance at m x n on AMD 6380 (ACML-Version: 5.3.1): * -* 12288x13824 244.5 GFLOPS with 32 threads on 16 modules (ACML: 120.3 GFLOPS) !strange thermal behavior -* 12288x13824 233.9 GFLOPS with 16 threads on 16 modules (ACML: 129.5 GFLOPS) !strange thermal behavior -* 12288x13824 138.1 GFLOPS with 8 threads on 8 modules (ACML: 106.5 GFLOPS) -* 6144x6912 73.6 GFLOPS with 4 threads on 4 modules (ACML: 59.4 GFLOPS) -* 6144x6912 36.8 GFLOPS with 2 threads on 2 modules (ACML: 34.9 GFLOPS) -* 6144x6912 18.7 GFLOPS with 1 threads on 1 modules (ACML: 18.7 GFLOPS) +* 13824x13824 234.5 GFLOPS with 32 threads on 16 modules (ACML: 88.5 GFLOPS) !strange thermal behavior +* 13824x13824 241.9 GFLOPS with 16 threads on 16 modules (ACML: 191.5 GFLOPS) !strange thermal behavior +* 9216x9216 137.6 GFLOPS with 8 threads on 8 modules (ACML: 106.5 GFLOPS) +* 4608x4608 75.7 GFLOPS with 4 threads on 4 modules (ACML: 56.3 GFLOPS) +* 4608x4608 38.6 GFLOPS with 2 threads on 2 modules (ACML: 34.1 GFLOPS) +* 4608x4608 19.6 GFLOPS with 1 threads on 1 modules (ACML: 18.3 GFLOPS) * *********************************************************************/ @@ -168,9 +169,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -#define A_PR1 256 +#define A_PR1 512 #define B_PR1 256 -#define C_PR1 256 +#define C_PR1 64 .macro INIT8x3 vxorpd %xmm4 , %xmm4 , %xmm4 @@ -454,9 +455,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro SAVE8x3 vmovddup ALPHA, %xmm0 - prefetcht0 C_PR1(CO1) - prefetcht0 C_PR1(CO1,LDC) - prefetcht0 C_PR1(CO1,LDC,2) vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 vfmaddpd (CO1, LDC),%xmm0, %xmm5,%xmm5 @@ -489,6 +487,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmovups %xmm12, 4 * SIZE(CO1, LDC, 2) vmovups %xmm15, 6 * SIZE(CO1, LDC, 2) + prefetcht0 C_PR1(CO1) + prefetcht0 C_PR1(CO1,LDC) + prefetcht0 C_PR1(CO1,LDC,2) + addq $8 * SIZE, CO1 # coffset += 8 .endm @@ -1284,6 +1286,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. cmpq $3, %rax jl .L6_13 + prefetcht0 B_PR1(BO) + prefetcht0 B_PR1+64(BO) + prefetcht0 B_PR1+128(BO) KERNEL8x3_INIT KERNEL8x3_M2 KERNEL8x3_M3 @@ -1299,22 +1304,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .L6_12: + prefetcht0 B_PR1-24(BO) + prefetcht0 B_PR1+40(BO) KERNEL8x3_M1 KERNEL8x3_M2 KERNEL8x3_M3 KERNEL8x3_M4 KERNEL8x3_M5 + prefetcht0 B_PR1+104(BO) KERNEL8x3_M6 KERNEL8x3_M7 KERNEL8x3_M8 dec %rax - //je .L6_12_E - jne .L6_12 .L6_12_E: + prefetcht0 B_PR1(BO) + prefetcht0 B_PR1+64(BO) KERNEL8x3_M1 KERNEL8x3_M2 KERNEL8x3_M3 @@ -1432,31 +1440,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .L6_22: - //prefetcht0 B_PR1(BO,BI,8) KERNEL4x3_1(xxx) KERNEL4x3_2(xxx) KERNEL4x3_3(xxx) - //prefetcht0 B_PR1+64(BO,BI,8) KERNEL4x3_4(xxx) KERNEL4x3_1(xxx) KERNEL4x3_2(xxx) - //prefetcht0 B_PR1+32(BO,BI,8) KERNEL4x3_3(xxx) KERNEL4x3_4(xxx) je .L6_26 - //prefetcht0 B_PR1(BO,BI,8) KERNEL4x3_1(xxx) KERNEL4x3_2(xxx) KERNEL4x3_3(xxx) - //prefetcht0 B_PR1+64(BO,BI,8) KERNEL4x3_4(xxx) KERNEL4x3_1(xxx) KERNEL4x3_2(xxx) - //prefetcht0 B_PR1+32(BO,BI,8) KERNEL4x3_3(xxx) KERNEL4x3_4(xxx) @@ -1548,31 +1550,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .L6_32: - //prefetcht0 B_PR1(BO,BI,8) KERNEL2x3_1(xxx) KERNEL2x3_2(xxx) KERNEL2x3_3(xxx) - //prefetcht0 B_PR1+64(BO,BI,8) KERNEL2x3_4(xxx) KERNEL2x3_1(xxx) KERNEL2x3_2(xxx) - //prefetcht0 B_PR1+32(BO,BI,8) KERNEL2x3_3(xxx) KERNEL2x3_4(xxx) je .L6_36 - //prefetcht0 B_PR1(BO,BI,8) KERNEL2x3_1(xxx) KERNEL2x3_2(xxx) KERNEL2x3_3(xxx) - //prefetcht0 B_PR1+64(BO,BI,8) KERNEL2x3_4(xxx) KERNEL2x3_1(xxx) KERNEL2x3_2(xxx) - //prefetcht0 B_PR1+32(BO,BI,8) KERNEL2x3_3(xxx) KERNEL2x3_4(xxx) @@ -1651,31 +1647,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .L6_42: - //prefetcht0 B_PR1(BO,BI,8) KERNEL1x3_1(xxx) KERNEL1x3_2(xxx) KERNEL1x3_3(xxx) - //prefetcht0 B_PR1+64(BO,BI,8) KERNEL1x3_4(xxx) KERNEL1x3_1(xxx) KERNEL1x3_2(xxx) - //prefetcht0 B_PR1+32(BO,BI,8) KERNEL1x3_3(xxx) KERNEL1x3_4(xxx) je .L6_46 - //prefetcht0 B_PR1(BO,BI,8) KERNEL1x3_1(xxx) KERNEL1x3_2(xxx) KERNEL1x3_3(xxx) - //prefetcht0 B_PR1+64(BO,BI,8) KERNEL1x3_4(xxx) KERNEL1x3_1(xxx) KERNEL1x3_2(xxx) - //prefetcht0 B_PR1+32(BO,BI,8) KERNEL1x3_3(xxx) KERNEL1x3_4(xxx) @@ -1753,6 +1743,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. cmpq $3, %rax jl .L7_13 + prefetcht0 B_PR1(BO) + prefetcht0 B_PR1+64(BO) + prefetcht0 B_PR1+128(BO) KERNEL8x3_INIT KERNEL8x3_M2 KERNEL8x3_M3 @@ -1768,22 +1761,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .L7_12: + prefetcht0 B_PR1-24(BO) + prefetcht0 B_PR1+40(BO) KERNEL8x3_M1 KERNEL8x3_M2 KERNEL8x3_M3 KERNEL8x3_M4 + prefetcht0 B_PR1+104(BO) KERNEL8x3_M5 KERNEL8x3_M6 KERNEL8x3_M7 KERNEL8x3_M8 dec %rax - //je .L7_12_E - jne .L7_12 .L7_12_E: + prefetcht0 B_PR1(BO) + prefetcht0 B_PR1+64(BO) KERNEL8x3_M1 KERNEL8x3_M2 KERNEL8x3_M3 @@ -1904,31 +1900,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .L7_22: - //prefetcht0 B_PR1(BO,BI,8) KERNEL4x3_1(xxx) KERNEL4x3_2(xxx) KERNEL4x3_3(xxx) - //prefetcht0 B_PR1+64(BO,BI,8) KERNEL4x3_4(xxx) KERNEL4x3_1(xxx) KERNEL4x3_2(xxx) - //prefetcht0 B_PR1+32(BO,BI,8) KERNEL4x3_3(xxx) KERNEL4x3_4(xxx) je .L7_26 - //prefetcht0 B_PR1(BO,BI,8) KERNEL4x3_1(xxx) KERNEL4x3_2(xxx) KERNEL4x3_3(xxx) - //prefetcht0 B_PR1+64(BO,BI,8) KERNEL4x3_4(xxx) KERNEL4x3_1(xxx) KERNEL4x3_2(xxx) - //prefetcht0 B_PR1+32(BO,BI,8) KERNEL4x3_3(xxx) KERNEL4x3_4(xxx) @@ -2019,31 +2009,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .L7_32: - //prefetcht0 B_PR1(BO,BI,8) KERNEL2x3_1(xxx) KERNEL2x3_2(xxx) KERNEL2x3_3(xxx) - //prefetcht0 B_PR1+64(BO,BI,8) KERNEL2x3_4(xxx) KERNEL2x3_1(xxx) KERNEL2x3_2(xxx) - //prefetcht0 B_PR1+32(BO,BI,8) KERNEL2x3_3(xxx) KERNEL2x3_4(xxx) je .L7_36 - //prefetcht0 B_PR1(BO,BI,8) KERNEL2x3_1(xxx) KERNEL2x3_2(xxx) KERNEL2x3_3(xxx) - //prefetcht0 B_PR1+64(BO,BI,8) KERNEL2x3_4(xxx) KERNEL2x3_1(xxx) KERNEL2x3_2(xxx) - //prefetcht0 B_PR1+32(BO,BI,8) KERNEL2x3_3(xxx) KERNEL2x3_4(xxx) @@ -2127,31 +2111,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .L7_42: - //prefetcht0 B_PR1(BO,BI,8) KERNEL1x3_1(xxx) KERNEL1x3_2(xxx) KERNEL1x3_3(xxx) - //prefetcht0 B_PR1+64(BO,BI,8) KERNEL1x3_4(xxx) KERNEL1x3_1(xxx) KERNEL1x3_2(xxx) - //prefetcht0 B_PR1+32(BO,BI,8) KERNEL1x3_3(xxx) KERNEL1x3_4(xxx) je .L7_46 - //prefetcht0 B_PR1(BO,BI,8) KERNEL1x3_1(xxx) KERNEL1x3_2(xxx) KERNEL1x3_3(xxx) - //prefetcht0 B_PR1+64(BO,BI,8) KERNEL1x3_4(xxx) KERNEL1x3_1(xxx) KERNEL1x3_2(xxx) - //prefetcht0 B_PR1+32(BO,BI,8) KERNEL1x3_3(xxx) KERNEL1x3_4(xxx) @@ -2277,13 +2255,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .L2_12: - //prefetcht0 B_PR1(BO,BI,8) KERNEL8x2_1(xxx) KERNEL8x2_2(xxx) KERNEL8x2_3(xxx) KERNEL8x2_4(xxx) - //prefetcht0 B_PR1(BO,BI,8) KERNEL8x2_1(xxx) KERNEL8x2_2(xxx) KERNEL8x2_3(xxx) @@ -2291,13 +2267,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. je .L2_16 - //prefetcht0 B_PR1(BO,BI,8) KERNEL8x2_1(xxx) KERNEL8x2_2(xxx) KERNEL8x2_3(xxx) KERNEL8x2_4(xxx) - //prefetcht0 B_PR1(BO,BI,8) KERNEL8x2_1(xxx) KERNEL8x2_2(xxx) KERNEL8x2_3(xxx) @@ -2399,13 +2373,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .L2_22: - //prefetcht0 B_PR1(BO,BI,8) KERNEL4x2_1(xxx) KERNEL4x2_2(xxx) KERNEL4x2_3(xxx) KERNEL4x2_4(xxx) - //prefetcht0 B_PR1(BO,BI,8) KERNEL4x2_1(xxx) KERNEL4x2_2(xxx) KERNEL4x2_3(xxx) @@ -2413,13 +2385,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. je .L2_26 - //prefetcht0 B_PR1(BO,BI,8) KERNEL4x2_1(xxx) KERNEL4x2_2(xxx) KERNEL4x2_3(xxx) KERNEL4x2_4(xxx) - //prefetcht0 B_PR1(BO,BI,8) KERNEL4x2_1(xxx) KERNEL4x2_2(xxx) KERNEL4x2_3(xxx) @@ -2503,13 +2473,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .L2_32: - //prefetcht0 B_PR1(BO,BI,8) KERNEL2x2_1(xxx) KERNEL2x2_2(xxx) KERNEL2x2_3(xxx) KERNEL2x2_4(xxx) - //prefetcht0 B_PR1(BO,BI,8) KERNEL2x2_1(xxx) KERNEL2x2_2(xxx) KERNEL2x2_3(xxx) @@ -2517,13 +2485,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. je .L2_36 - //prefetcht0 B_PR1(BO,BI,8) KERNEL2x2_1(xxx) KERNEL2x2_2(xxx) KERNEL2x2_3(xxx) KERNEL2x2_4(xxx) - //prefetcht0 B_PR1(BO,BI,8) KERNEL2x2_1(xxx) KERNEL2x2_2(xxx) KERNEL2x2_3(xxx) @@ -2600,13 +2566,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .L2_42: - //prefetcht0 B_PR1(BO,BI,8) KERNEL1x2_1(xxx) KERNEL1x2_2(xxx) KERNEL1x2_3(xxx) KERNEL1x2_4(xxx) - //prefetcht0 B_PR1(BO,BI,8) KERNEL1x2_1(xxx) KERNEL1x2_2(xxx) KERNEL1x2_3(xxx) @@ -2614,13 +2578,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. je .L2_46 - //prefetcht0 B_PR1(BO,BI,8) KERNEL1x2_1(xxx) KERNEL1x2_2(xxx) KERNEL1x2_3(xxx) KERNEL1x2_4(xxx) - //prefetcht0 B_PR1(BO,BI,8) KERNEL1x2_1(xxx) KERNEL1x2_2(xxx) KERNEL1x2_3(xxx) @@ -2743,7 +2705,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .L1_12: - //prefetcht0 B_PR1(BO,BI,8) KERNEL8x1_1(xxx) KERNEL8x1_2(xxx) KERNEL8x1_3(xxx) @@ -2756,7 +2717,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. je .L1_16 - //prefetcht0 B_PR1(BO,BI,8) KERNEL8x1_1(xxx) KERNEL8x1_2(xxx) KERNEL8x1_3(xxx) @@ -2851,7 +2811,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .L1_22: - //prefetcht0 B_PR1(BO,BI,8) KERNEL4x1_1(xxx) KERNEL4x1_2(xxx) KERNEL4x1_3(xxx) @@ -2864,7 +2823,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. je .L1_26 - //prefetcht0 B_PR1(BO,BI,8) KERNEL4x1_1(xxx) KERNEL4x1_2(xxx) KERNEL4x1_3(xxx) @@ -2946,7 +2904,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .L1_32: - //prefetcht0 B_PR1(BO,BI,8) KERNEL2x1_1(xxx) KERNEL2x1_2(xxx) KERNEL2x1_3(xxx) @@ -3036,7 +2993,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .L1_42: - //prefetcht0 B_PR1(BO,BI,8) KERNEL1x1_1(xxx) KERNEL1x1_2(xxx) KERNEL1x1_3(xxx) @@ -3049,7 +3005,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. je .L1_46 - //prefetcht0 B_PR1(BO,BI,8) KERNEL1x1_1(xxx) KERNEL1x1_2(xxx) KERNEL1x1_3(xxx) @@ -3317,13 +3272,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .L2_12: - //prefetcht0 B_PR1(BO,BI,8) KERNEL8x2_1(xxx) KERNEL8x2_2(xxx) KERNEL8x2_3(xxx) KERNEL8x2_4(xxx) - //prefetcht0 B_PR1(BO,BI,8) KERNEL8x2_1(xxx) KERNEL8x2_2(xxx) KERNEL8x2_3(xxx) @@ -3331,13 +3284,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. je .L2_16 - //prefetcht0 B_PR1(BO,BI,8) KERNEL8x2_1(xxx) KERNEL8x2_2(xxx) KERNEL8x2_3(xxx) KERNEL8x2_4(xxx) - //prefetcht0 B_PR1(BO,BI,8) KERNEL8x2_1(xxx) KERNEL8x2_2(xxx) KERNEL8x2_3(xxx) @@ -3502,13 +3453,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .L2_22: - //prefetcht0 B_PR1(BO,BI,8) KERNEL4x2_1(xxx) KERNEL4x2_2(xxx) KERNEL4x2_3(xxx) KERNEL4x2_4(xxx) - //prefetcht0 B_PR1(BO,BI,8) KERNEL4x2_1(xxx) KERNEL4x2_2(xxx) KERNEL4x2_3(xxx) @@ -3516,13 +3465,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. je .L2_26 - //prefetcht0 B_PR1(BO,BI,8) KERNEL4x2_1(xxx) KERNEL4x2_2(xxx) KERNEL4x2_3(xxx) KERNEL4x2_4(xxx) - //prefetcht0 B_PR1(BO,BI,8) KERNEL4x2_1(xxx) KERNEL4x2_2(xxx) KERNEL4x2_3(xxx) @@ -3667,13 +3614,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .L2_32: - //prefetcht0 B_PR1(BO,BI,8) KERNEL2x2_1(xxx) KERNEL2x2_2(xxx) KERNEL2x2_3(xxx) KERNEL2x2_4(xxx) - //prefetcht0 B_PR1(BO,BI,8) KERNEL2x2_1(xxx) KERNEL2x2_2(xxx) KERNEL2x2_3(xxx) @@ -3681,13 +3626,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. je .L2_36 - //prefetcht0 B_PR1(BO,BI,8) KERNEL2x2_1(xxx) KERNEL2x2_2(xxx) KERNEL2x2_3(xxx) KERNEL2x2_4(xxx) - //prefetcht0 B_PR1(BO,BI,8) KERNEL2x2_1(xxx) KERNEL2x2_2(xxx) KERNEL2x2_3(xxx) @@ -3818,13 +3761,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .L2_42: - //prefetcht0 B_PR1(BO,BI,8) KERNEL1x2_1(xxx) KERNEL1x2_2(xxx) KERNEL1x2_3(xxx) KERNEL1x2_4(xxx) - //prefetcht0 B_PR1(BO,BI,8) KERNEL1x2_1(xxx) KERNEL1x2_2(xxx) KERNEL1x2_3(xxx) @@ -3832,13 +3773,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. je .L2_46 - //prefetcht0 B_PR1(BO,BI,8) KERNEL1x2_1(xxx) KERNEL1x2_2(xxx) KERNEL1x2_3(xxx) KERNEL1x2_4(xxx) - //prefetcht0 B_PR1(BO,BI,8) KERNEL1x2_1(xxx) KERNEL1x2_2(xxx) KERNEL1x2_3(xxx) @@ -4023,7 +3962,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .L1_12: - //prefetcht0 B_PR1(BO,BI,8) KERNEL8x1_1(xxx) KERNEL8x1_2(xxx) KERNEL8x1_3(xxx) @@ -4036,7 +3974,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. je .L1_16 - //prefetcht0 B_PR1(BO,BI,8) KERNEL8x1_1(xxx) KERNEL8x1_2(xxx) KERNEL8x1_3(xxx) @@ -4186,7 +4123,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .L1_22: - //prefetcht0 B_PR1(BO,BI,8) KERNEL4x1_1(xxx) KERNEL4x1_2(xxx) KERNEL4x1_3(xxx) @@ -4199,7 +4135,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. je .L1_26 - //prefetcht0 B_PR1(BO,BI,8) KERNEL4x1_1(xxx) KERNEL4x1_2(xxx) KERNEL4x1_3(xxx) @@ -4335,7 +4270,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .L1_32: - //prefetcht0 B_PR1(BO,BI,8) KERNEL2x1_1(xxx) KERNEL2x1_2(xxx) KERNEL2x1_3(xxx) @@ -4476,7 +4410,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .L1_42: - //prefetcht0 B_PR1(BO,BI,8) KERNEL1x1_1(xxx) KERNEL1x1_2(xxx) KERNEL1x1_3(xxx) @@ -4489,7 +4422,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. je .L1_46 - //prefetcht0 B_PR1(BO,BI,8) KERNEL1x1_1(xxx) KERNEL1x1_2(xxx) KERNEL1x1_3(xxx) diff --git a/param.h b/param.h index a6990f7c3..c4d15323a 100644 --- a/param.h +++ b/param.h @@ -347,7 +347,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if defined(ARCH_X86_64) #define SGEMM_DEFAULT_P 768 -#define DGEMM_DEFAULT_P 384 +#define DGEMM_DEFAULT_P 768 #define ZGEMM_DEFAULT_P 384 #define CGEMM_DEFAULT_P 768 #else