diff --git a/kernel/power/gemm_kernel_power6.S b/kernel/power/gemm_kernel_power6.S index b274f7655..47771faf8 100644 --- a/kernel/power/gemm_kernel_power6.S +++ b/kernel/power/gemm_kernel_power6.S @@ -864,15 +864,15 @@ LL(22): LFD f22, 10 * SIZE(BO) LFD f23, 11 * SIZE(BO) - FMADD f2, f18, f24, f2 - FMADD f3, f19, f24, f3 - FMADD f6, f18, f25, f6 - FMADD f7, f19, f25, f7 + FMADD f0, f18, f24, f0 + FMADD f1, f19, f24, f1 + FMADD f4, f18, f25, f4 + FMADD f5, f19, f25, f5 - FMADD f10, f18, f26, f10 - FMADD f11, f19, f26, f11 - FMADD f14, f18, f27, f14 - FMADD f15, f19, f27, f15 + FMADD f8, f18, f26, f8 + FMADD f9, f19, f26, f9 + FMADD f12, f18, f27, f12 + FMADD f13, f19, f27, f13 LFD f16, 4 * SIZE(AO) LFD f17, 5 * SIZE(AO) @@ -899,15 +899,15 @@ LL(22): LFD f22, 18 * SIZE(BO) LFD f23, 19 * SIZE(BO) - FMADD f2, f18, f24, f2 - FMADD f3, f19, f24, f3 - FMADD f6, f18, f25, f6 - FMADD f7, f19, f25, f7 + FMADD f0, f18, f24, f0 + FMADD f1, f19, f24, f1 + FMADD f4, f18, f25, f4 + FMADD f5, f19, f25, f5 - FMADD f10, f18, f26, f10 - FMADD f11, f19, f26, f11 - FMADD f14, f18, f27, f14 - FMADD f15, f19, f27, f15 + FMADD f8, f18, f26, f8 + FMADD f9, f19, f26, f9 + FMADD f12, f18, f27, f12 + FMADD f13, f19, f27, f13 LFD f16, 8 * SIZE(AO) LFD f17, 9 * SIZE(AO) @@ -923,14 +923,6 @@ LL(22): addi BO, BO, 16 * SIZE bdnz LL(22) - fadd f0, f2, f0 - fadd f1, f3, f1 - fadd f4, f6, f4 - fadd f5, f7, f5 - fadd f8, f10, f8 - fadd f9, f11, f9 - fadd f12, f14, f12 - fadd f13, f15, f13 .align 4 LL(25): @@ -1161,10 +1153,10 @@ LL(32): LFD f22, 10 * SIZE(BO) LFD f23, 11 * SIZE(BO) - FMADD f1, f17, f24, f1 - FMADD f5, f17, f25, f5 - FMADD f9, f17, f26, f9 - FMADD f13, f17, f27, f13 + FMADD f0, f17, f24, f0 + FMADD f4, f17, f25, f4 + FMADD f8, f17, f26, f8 + FMADD f12, f17, f27, f12 LFD f24, 12 * SIZE(BO) LFD f25, 13 * SIZE(BO) @@ -1181,10 +1173,10 @@ LL(32): LFD f22, 18 * SIZE(BO) LFD f23, 19 * SIZE(BO) - FMADD f1, f19, f24, f1 - FMADD f5, f19, f25, f5 - FMADD f9, f19, f26, f9 - FMADD f13, f19, f27, f13 + FMADD f0, f19, f24, f0 + FMADD f4, f19, f25, f4 + FMADD f8, f19, f26, f8 + FMADD f12, f19, f27, f12 LFD f16, 4 * SIZE(AO) LFD f17, 5 * SIZE(AO) @@ -1200,10 +1192,6 @@ LL(32): addi BO, BO, 16 * SIZE bdnz LL(32) - fadd f0, f1, f0 - fadd f4, f5, f4 - fadd f8, f9, f8 - fadd f12, f13, f12 .align 4 LL(35): @@ -1691,10 +1679,10 @@ LL(52): FMADD f2, f16, f21, f2 FMADD f3, f17, f21, f3 - FMADD f4, f18, f22, f4 - FMADD f5, f19, f22, f5 - FMADD f6, f18, f23, f6 - FMADD f7, f19, f23, f7 + FMADD f0, f18, f22, f0 + FMADD f1, f19, f22, f1 + FMADD f2, f18, f23, f2 + FMADD f3, f19, f23, f3 LFD f16, 4 * SIZE(AO) LFD f17, 5 * SIZE(AO) @@ -1711,10 +1699,10 @@ LL(52): FMADD f2, f16, f25, f2 FMADD f3, f17, f25, f3 - FMADD f4, f18, f26, f4 - FMADD f5, f19, f26, f5 - FMADD f6, f18, f27, f6 - FMADD f7, f19, f27, f7 + FMADD f0, f18, f26, f0 + FMADD f1, f19, f26, f1 + FMADD f2, f18, f27, f2 + FMADD f3, f19, f27, f3 LFD f16, 8 * SIZE(AO) LFD f17, 9 * SIZE(AO) @@ -1775,21 +1763,11 @@ LL(58): LFD f18, 0 * SIZE(CO2) LFD f19, 1 * SIZE(CO2) - FADD f0, f4, f0 - FADD f1, f5, f1 - FADD f2, f6, f2 - FADD f3, f7, f3 - FMADD f0, f0, f30, f16 FMADD f1, f1, f30, f17 FMADD f2, f2, f30, f18 FMADD f3, f3, f30, f19 #else - FADD f0, f4, f0 - FADD f1, f5, f1 - FADD f2, f6, f2 - FADD f3, f7, f3 - FMUL f0, f0, f30 FMUL f1, f1, f30 FMUL f2, f2, f30 @@ -1916,8 +1894,8 @@ LL(60): LL(62): FMADD f0, f16, f20, f0 FMADD f1, f16, f21, f1 - FMADD f2, f17, f22, f2 - FMADD f3, f17, f23, f3 + FMADD f0, f17, f22, f0 + FMADD f1, f17, f23, f1 LFD f20, 8 * SIZE(BO) LFD f21, 9 * SIZE(BO) @@ -1926,8 +1904,8 @@ LL(62): FMADD f0, f18, f24, f0 FMADD f1, f18, f25, f1 - FMADD f2, f19, f26, f2 - FMADD f3, f19, f27, f3 + FMADD f0, f19, f26, f0 + FMADD f1, f19, f27, f1 LFD f16, 4 * SIZE(AO) LFD f17, 5 * SIZE(AO) @@ -1986,15 +1964,9 @@ LL(68): LFD f16, 0 * SIZE(CO1) LFD f18, 0 * SIZE(CO2) - FADD f0, f2, f0 - FADD f1, f3, f1 - FMADD f0, f0, f30, f16 FMADD f1, f1, f30, f18 #else - FADD f0, f2, f0 - FADD f1, f3, f1 - FMUL f0, f0, f30 FMUL f1, f1, f30 #endif @@ -2007,7 +1979,6 @@ LL(68): fmr f4, f0 fmr f5, f0 - #ifdef TRMMKERNEL #if ( defined(LEFT) && defined(TRANSA)) || \ (!defined(LEFT) && !defined(TRANSA)) @@ -2332,8 +2303,8 @@ LL(80): LL(82): FMADD f0, f16, f20, f0 FMADD f1, f17, f20, f1 - FMADD f2, f18, f21, f2 - FMADD f3, f19, f21, f3 + FMADD f0, f18, f21, f0 + FMADD f1, f19, f21, f1 LFD f16, 4 * SIZE(AO) LFD f17, 5 * SIZE(AO) @@ -2342,8 +2313,8 @@ LL(82): FMADD f0, f16, f22, f0 FMADD f1, f17, f22, f1 - FMADD f2, f18, f23, f2 - FMADD f3, f19, f23, f3 + FMADD f0, f18, f23, f0 + FMADD f1, f19, f23, f1 LFD f16, 8 * SIZE(AO) LFD f17, 9 * SIZE(AO) @@ -2401,15 +2372,9 @@ LL(88): LFD f16, 0 * SIZE(CO1) LFD f17, 1 * SIZE(CO1) - FADD f0, f2, f0 - FADD f1, f3, f1 - FMADD f0, f0, f30, f16 FMADD f1, f1, f30, f17 #else - FADD f0, f2, f0 - FADD f1, f3, f1 - FMUL f0, f0, f30 FMUL f1, f1, f30 #endif @@ -2418,9 +2383,6 @@ LL(88): STFD f1, 1 * SIZE(CO1) lfs f0, FZERO - fmr f1, f0 - fmr f2, f0 - fmr f3, f0 addi CO1, CO1, 2 * SIZE @@ -2512,9 +2474,9 @@ LL(90): LL(92): FMADD f0, f16, f20, f0 - FMADD f1, f17, f21, f1 - FMADD f2, f18, f22, f2 - FMADD f3, f19, f23, f3 + FMADD f0, f17, f21, f0 + FMADD f0, f18, f22, f0 + FMADD f0, f19, f23, f0 LFD f16, 4 * SIZE(AO) LFD f17, 5 * SIZE(AO) @@ -2527,9 +2489,9 @@ LL(92): LFD f23, 7 * SIZE(BO) FMADD f0, f16, f20, f0 - FMADD f1, f17, f21, f1 - FMADD f2, f18, f22, f2 - FMADD f3, f19, f23, f3 + FMADD f0, f17, f21, f0 + FMADD f0, f18, f22, f0 + FMADD f0, f19, f23, f0 LFD f16, 8 * SIZE(AO) LFD f17, 9 * SIZE(AO) @@ -2583,16 +2545,8 @@ LL(98): #ifndef TRMMKERNEL LFD f16, 0 * SIZE(CO1) - FADD f0, f1, f0 - FADD f2, f3, f2 - FADD f0, f2, f0 - FMADD f0, f0, f30, f16 #else - FADD f0, f1, f0 - FADD f2, f3, f2 - FADD f0, f2, f0 - FMUL f0, f0, f30 #endif diff --git a/kernel/power/gemv_t.S b/kernel/power/gemv_t.S index accdad702..0cabe89e0 100644 --- a/kernel/power/gemv_t.S +++ b/kernel/power/gemv_t.S @@ -409,14 +409,6 @@ LL(11): fmr y06, y01 fmr y07, y01 fmr y08, y01 - fmr y09, y01 - fmr y10, y01 - fmr y11, y01 - fmr y12, y01 - fmr y13, y01 - fmr y14, y01 - fmr y15, y01 - fmr y16, y01 DCBT(Y1, PREC) @@ -465,24 +457,24 @@ LL(12): FMADD y08, a8, b1, y08 LFD a8, 2 * SIZE(AO8) - FMADD y09, a1, b2, y09 + FMADD y01, a1, b2, y01 LFD a1, 3 * SIZE(AO1) - FMADD y10, a2, b2, y10 + FMADD y02, a2, b2, y02 LFD a2, 3 * SIZE(AO2) - FMADD y11, a3, b2, y11 + FMADD y03, a3, b2, y03 LFD a3, 3 * SIZE(AO3) - FMADD y12, a4, b2, y12 + FMADD y04, a4, b2, y04 LFD a4, 3 * SIZE(AO4) - FMADD y13, a5, b2, y13 + FMADD y05, a5, b2, y05 LFD a5, 3 * SIZE(AO5) - FMADD y14, a6, b2, y14 + FMADD y06, a6, b2, y06 LFD a6, 3 * SIZE(AO6) - FMADD y15, a7, b2, y15 + FMADD y07, a7, b2, y07 LFD a7, 3 * SIZE(AO7) - FMADD y16, a8, b2, y16 + FMADD y08, a8, b2, y08 LFD a8, 3 * SIZE(AO8) FMADD y01, a1, b3, y01 @@ -505,24 +497,24 @@ LL(12): FMADD y08, a8, b3, y08 LFD a8, 4 * SIZE(AO8) - FMADD y09, a1, b4, y09 + FMADD y01, a1, b4, y01 LFD a1, 5 * SIZE(AO1) - FMADD y10, a2, b4, y10 + FMADD y02, a2, b4, y02 LFD a2, 5 * SIZE(AO2) - FMADD y11, a3, b4, y11 + FMADD y03, a3, b4, y03 LFD a3, 5 * SIZE(AO3) - FMADD y12, a4, b4, y12 + FMADD y04, a4, b4, y04 LFD a4, 5 * SIZE(AO4) - FMADD y13, a5, b4, y13 + FMADD y05, a5, b4, y05 LFD a5, 5 * SIZE(AO5) - FMADD y14, a6, b4, y14 + FMADD y06, a6, b4, y06 LFD a6, 5 * SIZE(AO6) - FMADD y15, a7, b4, y15 + FMADD y07, a7, b4, y07 LFD a7, 5 * SIZE(AO7) - FMADD y16, a8, b4, y16 + FMADD y08, a8, b4, y08 LFD a8, 5 * SIZE(AO8) LFD b1, 9 * SIZE(BO) @@ -550,24 +542,24 @@ LL(12): FMADD y08, a8, b5, y08 LFD a8, 6 * SIZE(AO8) - FMADD y09, a1, b6, y09 + FMADD y01, a1, b6, y01 LFD a1, 7 * SIZE(AO1) - FMADD y10, a2, b6, y10 + FMADD y02, a2, b6, y02 LFD a2, 7 * SIZE(AO2) - FMADD y11, a3, b6, y11 + FMADD y03, a3, b6, y03 LFD a3, 7 * SIZE(AO3) - FMADD y12, a4, b6, y12 + FMADD y04, a4, b6, y04 LFD a4, 7 * SIZE(AO4) - FMADD y13, a5, b6, y13 + FMADD y05, a5, b6, y05 LFD a5, 7 * SIZE(AO5) - FMADD y14, a6, b6, y14 + FMADD y06, a6, b6, y06 LFD a6, 7 * SIZE(AO6) - FMADD y15, a7, b6, y15 + FMADD y07, a7, b6, y07 LFD a7, 7 * SIZE(AO7) - FMADD y16, a8, b6, y16 + FMADD y08, a8, b6, y08 LFD a8, 7 * SIZE(AO8) FMADD y01, a1, b7, y01 @@ -590,24 +582,24 @@ LL(12): FMADD y08, a8, b7, y08 LFD a8, 8 * SIZE(AO8) - FMADD y09, a1, b8, y09 + FMADD y01, a1, b8, y01 LFD a1, 9 * SIZE(AO1) - FMADD y10, a2, b8, y10 + FMADD y02, a2, b8, y02 LFD a2, 9 * SIZE(AO2) - FMADD y11, a3, b8, y11 + FMADD y03, a3, b8, y03 LFD a3, 9 * SIZE(AO3) - FMADD y12, a4, b8, y12 + FMADD y04, a4, b8, y04 LFD a4, 9 * SIZE(AO4) - FMADD y13, a5, b8, y13 + FMADD y05, a5, b8, y05 LFD a5, 9 * SIZE(AO5) - FMADD y14, a6, b8, y14 + FMADD y06, a6, b8, y06 LFD a6, 9 * SIZE(AO6) - FMADD y15, a7, b8, y15 + FMADD y07, a7, b8, y07 LFD a7, 9 * SIZE(AO7) - FMADD y16, a8, b8, y16 + FMADD y08, a8, b8, y08 LFD a8, 9 * SIZE(AO8) LFD b5, 13 * SIZE(BO) @@ -640,24 +632,24 @@ LL(12): FMADD y08, a8, b1, y08 LFD a8, 10 * SIZE(AO8) - FMADD y09, a1, b2, y09 + FMADD y01, a1, b2, y01 LFD a1, 11 * SIZE(AO1) - FMADD y10, a2, b2, y10 + FMADD y02, a2, b2, y02 LFD a2, 11 * SIZE(AO2) - FMADD y11, a3, b2, y11 + FMADD y03, a3, b2, y03 LFD a3, 11 * SIZE(AO3) - FMADD y12, a4, b2, y12 + FMADD y04, a4, b2, y04 LFD a4, 11 * SIZE(AO4) - FMADD y13, a5, b2, y13 + FMADD y05, a5, b2, y05 LFD a5, 11 * SIZE(AO5) - FMADD y14, a6, b2, y14 + FMADD y06, a6, b2, y06 LFD a6, 11 * SIZE(AO6) - FMADD y15, a7, b2, y15 + FMADD y07, a7, b2, y07 LFD a7, 11 * SIZE(AO7) - FMADD y16, a8, b2, y16 + FMADD y08, a8, b2, y08 LFD a8, 11 * SIZE(AO8) FMADD y01, a1, b3, y01 @@ -680,24 +672,24 @@ LL(12): FMADD y08, a8, b3, y08 LFD a8, 12 * SIZE(AO8) - FMADD y09, a1, b4, y09 + FMADD y01, a1, b4, y01 LFD a1, 13 * SIZE(AO1) - FMADD y10, a2, b4, y10 + FMADD y02, a2, b4, y02 LFD a2, 13 * SIZE(AO2) - FMADD y11, a3, b4, y11 + FMADD y03, a3, b4, y03 LFD a3, 13 * SIZE(AO3) - FMADD y12, a4, b4, y12 + FMADD y04, a4, b4, y04 LFD a4, 13 * SIZE(AO4) - FMADD y13, a5, b4, y13 + FMADD y05, a5, b4, y05 LFD a5, 13 * SIZE(AO5) - FMADD y14, a6, b4, y14 + FMADD y06, a6, b4, y06 LFD a6, 13 * SIZE(AO6) - FMADD y15, a7, b4, y15 + FMADD y07, a7, b4, y07 LFD a7, 13 * SIZE(AO7) - FMADD y16, a8, b4, y16 + FMADD y08, a8, b4, y08 LFD a8, 13 * SIZE(AO8) LFD b1, 17 * SIZE(BO) @@ -725,24 +717,24 @@ LL(12): FMADD y08, a8, b5, y08 LFD a8, 14 * SIZE(AO8) - FMADD y09, a1, b6, y09 + FMADD y01, a1, b6, y01 LFD a1, 15 * SIZE(AO1) - FMADD y10, a2, b6, y10 + FMADD y02, a2, b6, y02 LFD a2, 15 * SIZE(AO2) - FMADD y11, a3, b6, y11 + FMADD y03, a3, b6, y03 LFD a3, 15 * SIZE(AO3) - FMADD y12, a4, b6, y12 + FMADD y04, a4, b6, y04 LFD a4, 15 * SIZE(AO4) - FMADD y13, a5, b6, y13 + FMADD y05, a5, b6, y05 LFD a5, 15 * SIZE(AO5) - FMADD y14, a6, b6, y14 + FMADD y06, a6, b6, y06 LFD a6, 15 * SIZE(AO6) - FMADD y15, a7, b6, y15 + FMADD y07, a7, b6, y07 LFD a7, 15 * SIZE(AO7) - FMADD y16, a8, b6, y16 + FMADD y08, a8, b6, y08 LFD a8, 15 * SIZE(AO8) FMADD y01, a1, b7, y01 @@ -765,14 +757,14 @@ LL(12): FMADD y08, a8, b7, y08 LFD a8, 16 * SIZE(AO8) - FMADD y09, a1, b8, y09 + FMADD y01, a1, b8, y01 LFD a1, 17 * SIZE(AO1) - FMADD y10, a2, b8, y10 + FMADD y02, a2, b8, y02 LFD a2, 17 * SIZE(AO2) - FMADD y11, a3, b8, y11 + FMADD y03, a3, b8, y03 LFD a3, 17 * SIZE(AO3) - FMADD y12, a4, b8, y12 + FMADD y04, a4, b8, y04 LFD a4, 17 * SIZE(AO4) addi AO1, AO1, 16 * SIZE @@ -780,14 +772,14 @@ LL(12): addi AO3, AO3, 16 * SIZE addi AO4, AO4, 16 * SIZE - FMADD y13, a5, b8, y13 + FMADD y05, a5, b8, y05 LFD a5, 17 * SIZE(AO5) - FMADD y14, a6, b8, y14 + FMADD y06, a6, b8, y06 LFD a6, 17 * SIZE(AO6) - FMADD y15, a7, b8, y15 + FMADD y07, a7, b8, y07 LFD a7, 17 * SIZE(AO7) - FMADD y16, a8, b8, y16 + FMADD y08, a8, b8, y08 LFD a8, 17 * SIZE(AO8) LFD b5, 21 * SIZE(BO) @@ -830,24 +822,24 @@ LL(13): FMADD y08, a8, b1, y08 LFD a8, 2 * SIZE(AO8) - FMADD y09, a1, b2, y09 + FMADD y01, a1, b2, y01 LFD a1, 3 * SIZE(AO1) - FMADD y10, a2, b2, y10 + FMADD y02, a2, b2, y02 LFD a2, 3 * SIZE(AO2) - FMADD y11, a3, b2, y11 + FMADD y03, a3, b2, y03 LFD a3, 3 * SIZE(AO3) - FMADD y12, a4, b2, y12 + FMADD y04, a4, b2, y04 LFD a4, 3 * SIZE(AO4) - FMADD y13, a5, b2, y13 + FMADD y05, a5, b2, y05 LFD a5, 3 * SIZE(AO5) - FMADD y14, a6, b2, y14 + FMADD y06, a6, b2, y06 LFD a6, 3 * SIZE(AO6) - FMADD y15, a7, b2, y15 + FMADD y07, a7, b2, y07 LFD a7, 3 * SIZE(AO7) - FMADD y16, a8, b2, y16 + FMADD y08, a8, b2, y08 LFD a8, 3 * SIZE(AO8) FMADD y01, a1, b3, y01 @@ -870,24 +862,24 @@ LL(13): FMADD y08, a8, b3, y08 LFD a8, 4 * SIZE(AO8) - FMADD y09, a1, b4, y09 + FMADD y01, a1, b4, y01 LFD a1, 5 * SIZE(AO1) - FMADD y10, a2, b4, y10 + FMADD y02, a2, b4, y02 LFD a2, 5 * SIZE(AO2) - FMADD y11, a3, b4, y11 + FMADD y03, a3, b4, y03 LFD a3, 5 * SIZE(AO3) - FMADD y12, a4, b4, y12 + FMADD y04, a4, b4, y04 LFD a4, 5 * SIZE(AO4) - FMADD y13, a5, b4, y13 + FMADD y05, a5, b4, y05 LFD a5, 5 * SIZE(AO5) - FMADD y14, a6, b4, y14 + FMADD y06, a6, b4, y06 LFD a6, 5 * SIZE(AO6) - FMADD y15, a7, b4, y15 + FMADD y07, a7, b4, y07 LFD a7, 5 * SIZE(AO7) - FMADD y16, a8, b4, y16 + FMADD y08, a8, b4, y08 LFD a8, 5 * SIZE(AO8) LFD b1, 9 * SIZE(BO) @@ -915,24 +907,24 @@ LL(13): FMADD y08, a8, b5, y08 LFD a8, 6 * SIZE(AO8) - FMADD y09, a1, b6, y09 + FMADD y01, a1, b6, y01 LFD a1, 7 * SIZE(AO1) - FMADD y10, a2, b6, y10 + FMADD y02, a2, b6, y02 LFD a2, 7 * SIZE(AO2) - FMADD y11, a3, b6, y11 + FMADD y03, a3, b6, y03 LFD a3, 7 * SIZE(AO3) - FMADD y12, a4, b6, y12 + FMADD y04, a4, b6, y04 LFD a4, 7 * SIZE(AO4) - FMADD y13, a5, b6, y13 + FMADD y05, a5, b6, y05 LFD a5, 7 * SIZE(AO5) - FMADD y14, a6, b6, y14 + FMADD y06, a6, b6, y06 LFD a6, 7 * SIZE(AO6) - FMADD y15, a7, b6, y15 + FMADD y07, a7, b6, y07 LFD a7, 7 * SIZE(AO7) - FMADD y16, a8, b6, y16 + FMADD y08, a8, b6, y08 LFD a8, 7 * SIZE(AO8) FMADD y01, a1, b7, y01 @@ -955,24 +947,24 @@ LL(13): FMADD y08, a8, b7, y08 LFD a8, 8 * SIZE(AO8) - FMADD y09, a1, b8, y09 + FMADD y01, a1, b8, y01 LFD a1, 9 * SIZE(AO1) - FMADD y10, a2, b8, y10 + FMADD y02, a2, b8, y02 LFD a2, 9 * SIZE(AO2) - FMADD y11, a3, b8, y11 + FMADD y03, a3, b8, y03 LFD a3, 9 * SIZE(AO3) - FMADD y12, a4, b8, y12 + FMADD y04, a4, b8, y04 LFD a4, 9 * SIZE(AO4) - FMADD y13, a5, b8, y13 + FMADD y05, a5, b8, y05 LFD a5, 9 * SIZE(AO5) - FMADD y14, a6, b8, y14 + FMADD y06, a6, b8, y06 LFD a6, 9 * SIZE(AO6) - FMADD y15, a7, b8, y15 + FMADD y07, a7, b8, y07 LFD a7, 9 * SIZE(AO7) - FMADD y16, a8, b8, y16 + FMADD y08, a8, b8, y08 LFD a8, 9 * SIZE(AO8) LFD b5, 13 * SIZE(BO) @@ -1000,24 +992,24 @@ LL(13): FMADD y08, a8, b1, y08 LFD a8, 10 * SIZE(AO8) - FMADD y09, a1, b2, y09 + FMADD y01, a1, b2, y01 LFD a1, 11 * SIZE(AO1) - FMADD y10, a2, b2, y10 + FMADD y02, a2, b2, y02 LFD a2, 11 * SIZE(AO2) - FMADD y11, a3, b2, y11 + FMADD y03, a3, b2, y03 LFD a3, 11 * SIZE(AO3) - FMADD y12, a4, b2, y12 + FMADD y04, a4, b2, y04 LFD a4, 11 * SIZE(AO4) - FMADD y13, a5, b2, y13 + FMADD y05, a5, b2, y05 LFD a5, 11 * SIZE(AO5) - FMADD y14, a6, b2, y14 + FMADD y06, a6, b2, y06 LFD a6, 11 * SIZE(AO6) - FMADD y15, a7, b2, y15 + FMADD y07, a7, b2, y07 LFD a7, 11 * SIZE(AO7) - FMADD y16, a8, b2, y16 + FMADD y08, a8, b2, y08 LFD a8, 11 * SIZE(AO8) FMADD y01, a1, b3, y01 @@ -1040,24 +1032,24 @@ LL(13): FMADD y08, a8, b3, y08 LFD a8, 12 * SIZE(AO8) - FMADD y09, a1, b4, y09 + FMADD y01, a1, b4, y01 LFD a1, 13 * SIZE(AO1) - FMADD y10, a2, b4, y10 + FMADD y02, a2, b4, y02 LFD a2, 13 * SIZE(AO2) - FMADD y11, a3, b4, y11 + FMADD y03, a3, b4, y03 LFD a3, 13 * SIZE(AO3) - FMADD y12, a4, b4, y12 + FMADD y04, a4, b4, y04 LFD a4, 13 * SIZE(AO4) - FMADD y13, a5, b4, y13 + FMADD y05, a5, b4, y05 LFD a5, 13 * SIZE(AO5) - FMADD y14, a6, b4, y14 + FMADD y06, a6, b4, y06 LFD a6, 13 * SIZE(AO6) - FMADD y15, a7, b4, y15 + FMADD y07, a7, b4, y07 LFD a7, 13 * SIZE(AO7) - FMADD y16, a8, b4, y16 + FMADD y08, a8, b4, y08 LFD a8, 13 * SIZE(AO8) FMADD y01, a1, b5, y01 @@ -1080,24 +1072,24 @@ LL(13): FMADD y08, a8, b5, y08 LFD a8, 14 * SIZE(AO8) - FMADD y09, a1, b6, y09 + FMADD y01, a1, b6, y01 LFD a1, 15 * SIZE(AO1) - FMADD y10, a2, b6, y10 + FMADD y02, a2, b6, y02 LFD a2, 15 * SIZE(AO2) - FMADD y11, a3, b6, y11 + FMADD y03, a3, b6, y03 LFD a3, 15 * SIZE(AO3) - FMADD y12, a4, b6, y12 + FMADD y04, a4, b6, y04 LFD a4, 15 * SIZE(AO4) - FMADD y13, a5, b6, y13 + FMADD y05, a5, b6, y05 LFD a5, 15 * SIZE(AO5) - FMADD y14, a6, b6, y14 + FMADD y06, a6, b6, y06 LFD a6, 15 * SIZE(AO6) - FMADD y15, a7, b6, y15 + FMADD y07, a7, b6, y07 LFD a7, 15 * SIZE(AO7) - FMADD y16, a8, b6, y16 + FMADD y08, a8, b6, y08 LFD a8, 15 * SIZE(AO8) FMADD y01, a1, b7, y01 @@ -1120,20 +1112,20 @@ LL(13): FMADD y08, a8, b7, y08 LFD a8, 16 * SIZE(AO8) - FMADD y09, a1, b8, y09 - FMADD y10, a2, b8, y10 - FMADD y11, a3, b8, y11 - FMADD y12, a4, b8, y12 + FMADD y01, a1, b8, y01 + FMADD y02, a2, b8, y02 + FMADD y03, a3, b8, y03 + FMADD y04, a4, b8, y04 addi AO1, AO1, 16 * SIZE addi AO2, AO2, 16 * SIZE addi AO3, AO3, 16 * SIZE addi AO4, AO4, 16 * SIZE - FMADD y13, a5, b8, y13 - FMADD y14, a6, b8, y14 - FMADD y15, a7, b8, y15 - FMADD y16, a8, b8, y16 + FMADD y05, a5, b8, y05 + FMADD y06, a6, b8, y06 + FMADD y07, a7, b8, y07 + FMADD y08, a8, b8, y08 addi AO5, AO5, 16 * SIZE addi AO6, AO6, 16 * SIZE @@ -1180,21 +1172,21 @@ LL(14): FMADD y08, a8, b1, y08 LFD a8, 2 * SIZE(AO8) - FMADD y09, a1, b2, y09 + FMADD y01, a1, b2, y01 LFD a1, 3 * SIZE(AO1) - FMADD y10, a2, b2, y10 + FMADD y02, a2, b2, y02 LFD a2, 3 * SIZE(AO2) - FMADD y11, a3, b2, y11 + FMADD y03, a3, b2, y03 LFD a3, 3 * SIZE(AO3) - FMADD y12, a4, b2, y12 + FMADD y04, a4, b2, y04 LFD a4, 3 * SIZE(AO4) - FMADD y13, a5, b2, y13 + FMADD y05, a5, b2, y05 LFD a5, 3 * SIZE(AO5) - FMADD y14, a6, b2, y14 + FMADD y06, a6, b2, y06 LFD a6, 3 * SIZE(AO6) - FMADD y15, a7, b2, y15 + FMADD y07, a7, b2, y07 LFD a7, 3 * SIZE(AO7) - FMADD y16, a8, b2, y16 + FMADD y08, a8, b2, y08 LFD a8, 3 * SIZE(AO8) LFD b5, 5 * SIZE(BO) @@ -1219,21 +1211,21 @@ LL(14): FMADD y08, a8, b3, y08 LFD a8, 4 * SIZE(AO8) - FMADD y09, a1, b4, y09 + FMADD y01, a1, b4, y01 LFD a1, 5 * SIZE(AO1) - FMADD y10, a2, b4, y10 + FMADD y02, a2, b4, y02 LFD a2, 5 * SIZE(AO2) - FMADD y11, a3, b4, y11 + FMADD y03, a3, b4, y03 LFD a3, 5 * SIZE(AO3) - FMADD y12, a4, b4, y12 + FMADD y04, a4, b4, y04 LFD a4, 5 * SIZE(AO4) - FMADD y13, a5, b4, y13 + FMADD y05, a5, b4, y05 LFD a5, 5 * SIZE(AO5) - FMADD y14, a6, b4, y14 + FMADD y06, a6, b4, y06 LFD a6, 5 * SIZE(AO6) - FMADD y15, a7, b4, y15 + FMADD y07, a7, b4, y07 LFD a7, 5 * SIZE(AO7) - FMADD y16, a8, b4, y16 + FMADD y08, a8, b4, y08 LFD a8, 5 * SIZE(AO8) FMADD y01, a1, b5, y01 @@ -1253,21 +1245,21 @@ LL(14): FMADD y08, a8, b5, y08 LFD a8, 6 * SIZE(AO8) - FMADD y09, a1, b6, y09 + FMADD y01, a1, b6, y01 LFD a1, 7 * SIZE(AO1) - FMADD y10, a2, b6, y10 + FMADD y02, a2, b6, y02 LFD a2, 7 * SIZE(AO2) - FMADD y11, a3, b6, y11 + FMADD y03, a3, b6, y03 LFD a3, 7 * SIZE(AO3) - FMADD y12, a4, b6, y12 + FMADD y04, a4, b6, y04 LFD a4, 7 * SIZE(AO4) - FMADD y13, a5, b6, y13 + FMADD y05, a5, b6, y05 LFD a5, 7 * SIZE(AO5) - FMADD y14, a6, b6, y14 + FMADD y06, a6, b6, y06 LFD a6, 7 * SIZE(AO6) - FMADD y15, a7, b6, y15 + FMADD y07, a7, b6, y07 LFD a7, 7 * SIZE(AO7) - FMADD y16, a8, b6, y16 + FMADD y08, a8, b6, y08 LFD a8, 7 * SIZE(AO8) FMADD y01, a1, b7, y01 @@ -1287,21 +1279,21 @@ LL(14): FMADD y08, a8, b7, y08 LFD a8, 8 * SIZE(AO8) - FMADD y09, a1, b8, y09 + FMADD y01, a1, b8, y01 addi AO1, AO1, 8 * SIZE - FMADD y10, a2, b8, y10 + FMADD y02, a2, b8, y02 addi AO2, AO2, 8 * SIZE - FMADD y11, a3, b8, y11 + FMADD y03, a3, b8, y03 addi AO3, AO3, 8 * SIZE - FMADD y12, a4, b8, y12 + FMADD y04, a4, b8, y04 addi AO4, AO4, 8 * SIZE - FMADD y13, a5, b8, y13 + FMADD y05, a5, b8, y05 addi AO5, AO5, 8 * SIZE - FMADD y14, a6, b8, y14 + FMADD y06, a6, b8, y06 addi AO6, AO6, 8 * SIZE - FMADD y15, a7, b8, y15 + FMADD y07, a7, b8, y07 addi AO7, AO7, 8 * SIZE - FMADD y16, a8, b8, y16 + FMADD y08, a8, b8, y08 addi AO8, AO8, 8 * SIZE addi BO, BO, 8 * SIZE .align 4 @@ -1341,21 +1333,21 @@ LL(15): FMADD y08, a8, b1, y08 LFD a8, 2 * SIZE(AO8) - FMADD y09, a1, b2, y09 + FMADD y01, a1, b2, y01 LFD a1, 3 * SIZE(AO1) - FMADD y10, a2, b2, y10 + FMADD y02, a2, b2, y02 LFD a2, 3 * SIZE(AO2) - FMADD y11, a3, b2, y11 + FMADD y03, a3, b2, y03 LFD a3, 3 * SIZE(AO3) - FMADD y12, a4, b2, y12 + FMADD y04, a4, b2, y04 LFD a4, 3 * SIZE(AO4) - FMADD y13, a5, b2, y13 + FMADD y05, a5, b2, y05 LFD a5, 3 * SIZE(AO5) - FMADD y14, a6, b2, y14 + FMADD y06, a6, b2, y06 LFD a6, 3 * SIZE(AO6) - FMADD y15, a7, b2, y15 + FMADD y07, a7, b2, y07 LFD a7, 3 * SIZE(AO7) - FMADD y16, a8, b2, y16 + FMADD y08, a8, b2, y08 LFD a8, 3 * SIZE(AO8) FMADD y01, a1, b3, y01 @@ -1376,21 +1368,21 @@ LL(15): FMADD y08, a8, b3, y08 LFD a8, 4 * SIZE(AO8) - FMADD y09, a1, b4, y09 + FMADD y01, a1, b4, y01 addi AO1, AO1, 4 * SIZE - FMADD y10, a2, b4, y10 + FMADD y02, a2, b4, y02 addi AO2, AO2, 4 * SIZE - FMADD y11, a3, b4, y11 + FMADD y03, a3, b4, y03 addi AO3, AO3, 4 * SIZE - FMADD y12, a4, b4, y12 + FMADD y04, a4, b4, y04 addi AO4, AO4, 4 * SIZE - FMADD y13, a5, b4, y13 + FMADD y05, a5, b4, y05 addi AO5, AO5, 4 * SIZE - FMADD y14, a6, b4, y14 + FMADD y06, a6, b4, y06 addi AO6, AO6, 4 * SIZE - FMADD y15, a7, b4, y15 + FMADD y07, a7, b4, y07 addi AO7, AO7, 4 * SIZE - FMADD y16, a8, b4, y16 + FMADD y08, a8, b4, y08 addi AO8, AO8, 4 * SIZE addi BO, BO, 4 * SIZE .align 4 @@ -1428,22 +1420,22 @@ LL(16): FMADD y08, a8, b1, y08 LFD a8, 2 * SIZE(AO8) - FMADD y09, a1, b2, y09 + FMADD y01, a1, b2, y01 addi AO1, AO1, 2 * SIZE addi AO2, AO2, 2 * SIZE - FMADD y10, a2, b2, y10 + FMADD y02, a2, b2, y02 addi AO3, AO3, 2 * SIZE addi AO4, AO4, 2 * SIZE - FMADD y11, a3, b2, y11 - FMADD y12, a4, b2, y12 + FMADD y03, a3, b2, y03 + FMADD y04, a4, b2, y04 addi AO5, AO5, 2 * SIZE addi AO6, AO6, 2 * SIZE - FMADD y13, a5, b2, y13 - FMADD y14, a6, b2, y14 + FMADD y05, a5, b2, y05 + FMADD y06, a6, b2, y06 addi AO7, AO7, 2 * SIZE addi AO8, AO8, 2 * SIZE - FMADD y15, a7, b2, y15 - FMADD y16, a8, b2, y16 + FMADD y07, a7, b2, y07 + FMADD y08, a8, b2, y08 addi BO, BO, 2 * SIZE .align 4 @@ -1486,15 +1478,6 @@ LL(18): LFD a7, 7 * SIZE(CO) LFD a8, 8 * SIZE(CO) - FADD y01, y09, y01 - FADD y02, y10, y02 - FADD y03, y11, y03 - FADD y04, y12, y04 - FADD y05, y13, y05 - FADD y06, y14, y06 - FADD y07, y15, y07 - FADD y08, y16, y08 - FMADD a1, alpha, y01, a1 FMADD a2, alpha, y02, a2 FMADD a3, alpha, y03, a3 @@ -1530,15 +1513,6 @@ LL(19): LFDUX a7, CO, INCY LFDUX a8, CO, INCY - FADD y01, y09, y01 - FADD y02, y10, y02 - FADD y03, y11, y03 - FADD y04, y12, y04 - FADD y05, y13, y05 - FADD y06, y14, y06 - FADD y07, y15, y07 - FADD y08, y16, y08 - FMADD a1, alpha, f0, a1 FMADD a2, alpha, f1, a2 FMADD a3, alpha, f2, a3 @@ -1580,10 +1554,6 @@ LL(20): fmr y02, y01 fmr y03, y01 fmr y04, y01 - fmr y09, y01 - fmr y10, y01 - fmr y11, y01 - fmr y12, y01 DCBT(Y1, PREC) @@ -1621,13 +1591,13 @@ LL(22): FMADD y04, a4, b1, y04 LFD a4, 3 * SIZE(AO4) - FMADD y09, a5, b2, y09 + FMADD y01, a5, b2, y01 LFD a5, 4 * SIZE(AO1) - FMADD y10, a6, b2, y10 + FMADD y02, a6, b2, y02 LFD a6, 4 * SIZE(AO2) - FMADD y11, a7, b2, y11 + FMADD y03, a7, b2, y03 LFD a7, 4 * SIZE(AO3) - FMADD y12, a8, b2, y12 + FMADD y04, a8, b2, y04 LFD a8, 4 * SIZE(AO4) FMADD y01, a1, b3, y01 @@ -1639,13 +1609,13 @@ LL(22): FMADD y04, a4, b3, y04 LFD a4, 5 * SIZE(AO4) - FMADD y09, a5, b4, y09 + FMADD y01, a5, b4, y01 LFD a5, 6 * SIZE(AO1) - FMADD y10, a6, b4, y10 + FMADD y02, a6, b4, y02 LFD a6, 6 * SIZE(AO2) - FMADD y11, a7, b4, y11 + FMADD y03, a7, b4, y03 LFD a7, 6 * SIZE(AO3) - FMADD y12, a8, b4, y12 + FMADD y04, a8, b4, y04 LFD a8, 6 * SIZE(AO4) LFD b1, 9 * SIZE(BO) @@ -1662,13 +1632,13 @@ LL(22): FMADD y04, a4, b5, y04 LFD a4, 7 * SIZE(AO4) - FMADD y09, a5, b6, y09 + FMADD y01, a5, b6, y01 LFD a5, 8 * SIZE(AO1) - FMADD y10, a6, b6, y10 + FMADD y02, a6, b6, y02 LFD a6, 8 * SIZE(AO2) - FMADD y11, a7, b6, y11 + FMADD y03, a7, b6, y03 LFD a7, 8 * SIZE(AO3) - FMADD y12, a8, b6, y12 + FMADD y04, a8, b6, y04 LFD a8, 8 * SIZE(AO4) FMADD y01, a1, b7, y01 @@ -1680,13 +1650,13 @@ LL(22): FMADD y04, a4, b7, y04 LFD a4, 9 * SIZE(AO4) - FMADD y09, a5, b8, y09 + FMADD y01, a5, b8, y01 LFD a5, 10 * SIZE(AO1) - FMADD y10, a6, b8, y10 + FMADD y02, a6, b8, y02 LFD a6, 10 * SIZE(AO2) - FMADD y11, a7, b8, y11 + FMADD y03, a7, b8, y03 LFD a7, 10 * SIZE(AO3) - FMADD y12, a8, b8, y12 + FMADD y04, a8, b8, y04 LFD a8, 10 * SIZE(AO4) LFD b5, 13 * SIZE(BO) @@ -1703,13 +1673,13 @@ LL(22): FMADD y04, a4, b1, y04 LFD a4, 11 * SIZE(AO4) - FMADD y09, a5, b2, y09 + FMADD y01, a5, b2, y01 LFD a5, 12 * SIZE(AO1) - FMADD y10, a6, b2, y10 + FMADD y02, a6, b2, y02 LFD a6, 12 * SIZE(AO2) - FMADD y11, a7, b2, y11 + FMADD y03, a7, b2, y03 LFD a7, 12 * SIZE(AO3) - FMADD y12, a8, b2, y12 + FMADD y04, a8, b2, y04 LFD a8, 12 * SIZE(AO4) FMADD y01, a1, b3, y01 @@ -1721,13 +1691,13 @@ LL(22): FMADD y04, a4, b3, y04 LFD a4, 13 * SIZE(AO4) - FMADD y09, a5, b4, y09 + FMADD y01, a5, b4, y01 LFD a5, 14 * SIZE(AO1) - FMADD y10, a6, b4, y10 + FMADD y02, a6, b4, y02 LFD a6, 14 * SIZE(AO2) - FMADD y11, a7, b4, y11 + FMADD y03, a7, b4, y03 LFD a7, 14 * SIZE(AO3) - FMADD y12, a8, b4, y12 + FMADD y04, a8, b4, y04 LFD a8, 14 * SIZE(AO4) LFD b1, 17 * SIZE(BO) @@ -1744,13 +1714,13 @@ LL(22): FMADD y04, a4, b5, y04 LFD a4, 15 * SIZE(AO4) - FMADD y09, a5, b6, y09 + FMADD y01, a5, b6, y01 LFD a5, 16 * SIZE(AO1) - FMADD y10, a6, b6, y10 + FMADD y02, a6, b6, y02 LFD a6, 16 * SIZE(AO2) - FMADD y11, a7, b6, y11 + FMADD y03, a7, b6, y03 LFD a7, 16 * SIZE(AO3) - FMADD y12, a8, b6, y12 + FMADD y04, a8, b6, y04 LFD a8, 16 * SIZE(AO4) FMADD y01, a1, b7, y01 @@ -1762,13 +1732,13 @@ LL(22): FMADD y04, a4, b7, y04 LFD a4, 17 * SIZE(AO4) - FMADD y09, a5, b8, y09 + FMADD y01, a5, b8, y01 LFD a5, 18 * SIZE(AO1) - FMADD y10, a6, b8, y10 + FMADD y02, a6, b8, y02 LFD a6, 18 * SIZE(AO2) - FMADD y11, a7, b8, y11 + FMADD y03, a7, b8, y03 LFD a7, 18 * SIZE(AO3) - FMADD y12, a8, b8, y12 + FMADD y04, a8, b8, y04 LFD a8, 18 * SIZE(AO4) LFD b5, 21 * SIZE(BO) @@ -1800,13 +1770,13 @@ LL(23): FMADD y04, a4, b1, y04 LFD a4, 3 * SIZE(AO4) - FMADD y09, a5, b2, y09 + FMADD y01, a5, b2, y01 LFD a5, 4 * SIZE(AO1) - FMADD y10, a6, b2, y10 + FMADD y02, a6, b2, y02 LFD a6, 4 * SIZE(AO2) - FMADD y11, a7, b2, y11 + FMADD y03, a7, b2, y03 LFD a7, 4 * SIZE(AO3) - FMADD y12, a8, b2, y12 + FMADD y04, a8, b2, y04 LFD a8, 4 * SIZE(AO4) FMADD y01, a1, b3, y01 @@ -1818,13 +1788,13 @@ LL(23): FMADD y04, a4, b3, y04 LFD a4, 5 * SIZE(AO4) - FMADD y09, a5, b4, y09 + FMADD y01, a5, b4, y01 LFD a5, 6 * SIZE(AO1) - FMADD y10, a6, b4, y10 + FMADD y02, a6, b4, y02 LFD a6, 6 * SIZE(AO2) - FMADD y11, a7, b4, y11 + FMADD y03, a7, b4, y03 LFD a7, 6 * SIZE(AO3) - FMADD y12, a8, b4, y12 + FMADD y04, a8, b4, y04 LFD a8, 6 * SIZE(AO4) LFD b1, 9 * SIZE(BO) @@ -1841,13 +1811,13 @@ LL(23): FMADD y04, a4, b5, y04 LFD a4, 7 * SIZE(AO4) - FMADD y09, a5, b6, y09 + FMADD y01, a5, b6, y01 LFD a5, 8 * SIZE(AO1) - FMADD y10, a6, b6, y10 + FMADD y02, a6, b6, y02 LFD a6, 8 * SIZE(AO2) - FMADD y11, a7, b6, y11 + FMADD y03, a7, b6, y03 LFD a7, 8 * SIZE(AO3) - FMADD y12, a8, b6, y12 + FMADD y04, a8, b6, y04 LFD a8, 8 * SIZE(AO4) FMADD y01, a1, b7, y01 @@ -1859,13 +1829,13 @@ LL(23): FMADD y04, a4, b7, y04 LFD a4, 9 * SIZE(AO4) - FMADD y09, a5, b8, y09 + FMADD y01, a5, b8, y01 LFD a5, 10 * SIZE(AO1) - FMADD y10, a6, b8, y10 + FMADD y02, a6, b8, y02 LFD a6, 10 * SIZE(AO2) - FMADD y11, a7, b8, y11 + FMADD y03, a7, b8, y03 LFD a7, 10 * SIZE(AO3) - FMADD y12, a8, b8, y12 + FMADD y04, a8, b8, y04 LFD a8, 10 * SIZE(AO4) LFD b5, 13 * SIZE(BO) @@ -1882,13 +1852,13 @@ LL(23): FMADD y04, a4, b1, y04 LFD a4, 11 * SIZE(AO4) - FMADD y09, a5, b2, y09 + FMADD y01, a5, b2, y01 LFD a5, 12 * SIZE(AO1) - FMADD y10, a6, b2, y10 + FMADD y02, a6, b2, y02 LFD a6, 12 * SIZE(AO2) - FMADD y11, a7, b2, y11 + FMADD y03, a7, b2, y03 LFD a7, 12 * SIZE(AO3) - FMADD y12, a8, b2, y12 + FMADD y04, a8, b2, y04 LFD a8, 12 * SIZE(AO4) FMADD y01, a1, b3, y01 @@ -1900,13 +1870,13 @@ LL(23): FMADD y04, a4, b3, y04 LFD a4, 13 * SIZE(AO4) - FMADD y09, a5, b4, y09 + FMADD y01, a5, b4, y01 LFD a5, 14 * SIZE(AO1) - FMADD y10, a6, b4, y10 + FMADD y02, a6, b4, y02 LFD a6, 14 * SIZE(AO2) - FMADD y11, a7, b4, y11 + FMADD y03, a7, b4, y03 LFD a7, 14 * SIZE(AO3) - FMADD y12, a8, b4, y12 + FMADD y04, a8, b4, y04 LFD a8, 14 * SIZE(AO4) FMADD y01, a1, b5, y01 @@ -1918,13 +1888,13 @@ LL(23): FMADD y04, a4, b5, y04 LFD a4, 15 * SIZE(AO4) - FMADD y09, a5, b6, y09 + FMADD y01, a5, b6, y01 LFD a5, 16 * SIZE(AO1) - FMADD y10, a6, b6, y10 + FMADD y02, a6, b6, y02 LFD a6, 16 * SIZE(AO2) - FMADD y11, a7, b6, y11 + FMADD y03, a7, b6, y03 LFD a7, 16 * SIZE(AO3) - FMADD y12, a8, b6, y12 + FMADD y04, a8, b6, y04 LFD a8, 16 * SIZE(AO4) FMADD y01, a1, b7, y01 @@ -1932,10 +1902,10 @@ LL(23): FMADD y03, a3, b7, y03 FMADD y04, a4, b7, y04 - FMADD y09, a5, b8, y09 - FMADD y10, a6, b8, y10 - FMADD y11, a7, b8, y11 - FMADD y12, a8, b8, y12 + FMADD y01, a5, b8, y01 + FMADD y02, a6, b8, y02 + FMADD y03, a7, b8, y03 + FMADD y04, a8, b8, y04 addi AO1, AO1, 16 * SIZE addi AO2, AO2, 16 * SIZE @@ -1975,13 +1945,13 @@ LL(24): FMADD y04, a4, b1, y04 LFD a4, 3 * SIZE(AO4) - FMADD y09, a5, b2, y09 + FMADD y01, a5, b2, y01 LFD a5, 4 * SIZE(AO1) - FMADD y10, a6, b2, y10 + FMADD y02, a6, b2, y02 LFD a6, 4 * SIZE(AO2) - FMADD y11, a7, b2, y11 + FMADD y03, a7, b2, y03 LFD a7, 4 * SIZE(AO3) - FMADD y12, a8, b2, y12 + FMADD y04, a8, b2, y04 LFD a8, 4 * SIZE(AO4) FMADD y01, a1, b3, y01 @@ -1993,13 +1963,13 @@ LL(24): FMADD y04, a4, b3, y04 LFD a4, 5 * SIZE(AO4) - FMADD y09, a5, b4, y09 + FMADD y01, a5, b4, y01 LFD a5, 6 * SIZE(AO1) - FMADD y10, a6, b4, y10 + FMADD y02, a6, b4, y02 LFD a6, 6 * SIZE(AO2) - FMADD y11, a7, b4, y11 + FMADD y03, a7, b4, y03 LFD a7, 6 * SIZE(AO3) - FMADD y12, a8, b4, y12 + FMADD y04, a8, b4, y04 LFD a8, 6 * SIZE(AO4) LFD b1, 5 * SIZE(BO) @@ -2016,13 +1986,13 @@ LL(24): FMADD y04, a4, b1, y04 LFD a4, 7 * SIZE(AO4) - FMADD y09, a5, b2, y09 + FMADD y01, a5, b2, y01 LFD a5, 8 * SIZE(AO1) - FMADD y10, a6, b2, y10 + FMADD y02, a6, b2, y02 LFD a6, 8 * SIZE(AO2) - FMADD y11, a7, b2, y11 + FMADD y03, a7, b2, y03 LFD a7, 8 * SIZE(AO3) - FMADD y12, a8, b2, y12 + FMADD y04, a8, b2, y04 LFD a8, 8 * SIZE(AO4) FMADD y01, a1, b3, y01 @@ -2030,13 +2000,13 @@ LL(24): FMADD y03, a3, b3, y03 FMADD y04, a4, b3, y04 - FMADD y09, a5, b4, y09 + FMADD y01, a5, b4, y01 addi AO1, AO1, 8 * SIZE - FMADD y10, a6, b4, y10 + FMADD y02, a6, b4, y02 addi AO2, AO2, 8 * SIZE - FMADD y11, a7, b4, y11 + FMADD y03, a7, b4, y03 addi AO3, AO3, 8 * SIZE - FMADD y12, a8, b4, y12 + FMADD y04, a8, b4, y04 addi AO4, AO4, 8 * SIZE addi BO, BO, 8 * SIZE @@ -2070,13 +2040,13 @@ LL(25): FMADD y04, a4, b1, y04 LFD a4, 3 * SIZE(AO4) - FMADD y09, a5, b2, y09 + FMADD y01, a5, b2, y01 LFD a5, 4 * SIZE(AO1) - FMADD y10, a6, b2, y10 + FMADD y02, a6, b2, y02 LFD a6, 4 * SIZE(AO2) - FMADD y11, a7, b2, y11 + FMADD y03, a7, b2, y03 LFD a7, 4 * SIZE(AO3) - FMADD y12, a8, b2, y12 + FMADD y04, a8, b2, y04 LFD a8, 4 * SIZE(AO4) FMADD y01, a1, b3, y01 @@ -2084,13 +2054,13 @@ LL(25): FMADD y03, a3, b3, y03 FMADD y04, a4, b3, y04 - FMADD y09, a5, b4, y09 + FMADD y01, a5, b4, y01 addi AO1, AO1, 4 * SIZE - FMADD y10, a6, b4, y10 + FMADD y02, a6, b4, y02 addi AO2, AO2, 4 * SIZE - FMADD y11, a7, b4, y11 + FMADD y03, a7, b4, y03 addi AO3, AO3, 4 * SIZE - FMADD y12, a8, b4, y12 + FMADD y04, a8, b4, y04 addi AO4, AO4, 4 * SIZE addi BO, BO, 4 * SIZE .align 4 @@ -2117,13 +2087,13 @@ LL(26): FMADD y03, a3, b1, y03 FMADD y04, a4, b1, y04 - FMADD y09, a5, b2, y09 + FMADD y01, a5, b2, y01 addi AO1, AO1, 2 * SIZE - FMADD y10, a6, b2, y10 + FMADD y02, a6, b2, y02 addi AO2, AO2, 2 * SIZE - FMADD y11, a7, b2, y11 + FMADD y03, a7, b2, y03 addi AO3, AO3, 2 * SIZE - FMADD y12, a8, b2, y12 + FMADD y04, a8, b2, y04 addi AO4, AO4, 2 * SIZE addi BO, BO, 2 * SIZE .align 4 @@ -2156,11 +2126,6 @@ LL(28): LFD a3, 3 * SIZE(CO) LFD a4, 4 * SIZE(CO) - FADD y01, y09, y01 - FADD y02, y10, y02 - FADD y03, y11, y03 - FADD y04, y12, y04 - FMADD a1, alpha, y01, a1 FMADD a2, alpha, y02, a2 FMADD a3, alpha, y03, a3 @@ -2181,11 +2146,6 @@ LL(29): LFDUX a3, CO, INCY LFDUX a4, CO, INCY - FADD y01, y09, y01 - FADD y02, y10, y02 - FADD y03, y11, y03 - FADD y04, y12, y04 - FMADD a1, alpha, f0, a1 FMADD a2, alpha, f1, a2 FMADD a3, alpha, f2, a3 @@ -2209,12 +2169,6 @@ LL(30): lfd y01, FZERO fmr y02, y01 - fmr y03, y01 - fmr y04, y01 - fmr y09, y01 - fmr y10, y01 - fmr y11, y01 - fmr y12, y01 DCBT(Y1, PREC) @@ -2247,18 +2201,18 @@ LL(32): LFD a1, 5 * SIZE(AO1) FMADD y02, a2, b1, y02 LFD a2, 5 * SIZE(AO2) - FMADD y03, a3, b2, y03 + FMADD y01, a3, b2, y01 LFD a3, 6 * SIZE(AO1) - FMADD y04, a4, b2, y04 + FMADD y02, a4, b2, y02 LFD a4, 6 * SIZE(AO2) - FMADD y09, a5, b3, y09 + FMADD y01, a5, b3, y01 LFD a5, 7 * SIZE(AO1) - FMADD y10, a6, b3, y10 + FMADD y02, a6, b3, y02 LFD a6, 7 * SIZE(AO2) - FMADD y11, a7, b4, y11 + FMADD y01, a7, b4, y01 LFD a7, 8 * SIZE(AO1) - FMADD y12, a8, b4, y12 + FMADD y02, a8, b4, y02 LFD a8, 8 * SIZE(AO2) LFD b1, 9 * SIZE(BO) @@ -2270,18 +2224,18 @@ LL(32): LFD a1, 9 * SIZE(AO1) FMADD y02, a2, b5, y02 LFD a2, 9 * SIZE(AO2) - FMADD y03, a3, b6, y03 + FMADD y01, a3, b6, y01 LFD a3, 10 * SIZE(AO1) - FMADD y04, a4, b6, y04 + FMADD y02, a4, b6, y02 LFD a4, 10 * SIZE(AO2) - FMADD y09, a5, b7, y09 + FMADD y01, a5, b7, y01 LFD a5, 11 * SIZE(AO1) - FMADD y10, a6, b7, y10 + FMADD y02, a6, b7, y02 LFD a6, 11 * SIZE(AO2) - FMADD y11, a7, b8, y11 + FMADD y01, a7, b8, y01 LFD a7, 12 * SIZE(AO1) - FMADD y12, a8, b8, y12 + FMADD y02, a8, b8, y02 LFD a8, 12 * SIZE(AO2) LFD b5, 13 * SIZE(BO) @@ -2293,18 +2247,18 @@ LL(32): LFD a1, 13 * SIZE(AO1) FMADD y02, a2, b1, y02 LFD a2, 13 * SIZE(AO2) - FMADD y03, a3, b2, y03 + FMADD y01, a3, b2, y01 LFD a3, 14 * SIZE(AO1) - FMADD y04, a4, b2, y04 + FMADD y02, a4, b2, y02 LFD a4, 14 * SIZE(AO2) - FMADD y09, a5, b3, y09 + FMADD y01, a5, b3, y01 LFD a5, 15 * SIZE(AO1) - FMADD y10, a6, b3, y10 + FMADD y02, a6, b3, y02 LFD a6, 15 * SIZE(AO2) - FMADD y11, a7, b4, y11 + FMADD y01, a7, b4, y01 LFD a7, 16 * SIZE(AO1) - FMADD y12, a8, b4, y12 + FMADD y02, a8, b4, y02 LFD a8, 16 * SIZE(AO2) LFD b1, 17 * SIZE(BO) @@ -2316,18 +2270,18 @@ LL(32): LFD a1, 17 * SIZE(AO1) FMADD y02, a2, b5, y02 LFD a2, 17 * SIZE(AO2) - FMADD y03, a3, b6, y03 + FMADD y01, a3, b6, y01 LFD a3, 18 * SIZE(AO1) - FMADD y04, a4, b6, y04 + FMADD y02, a4, b6, y02 LFD a4, 18 * SIZE(AO2) - FMADD y09, a5, b7, y09 + FMADD y01, a5, b7, y01 LFD a5, 19 * SIZE(AO1) - FMADD y10, a6, b7, y10 + FMADD y02, a6, b7, y02 LFD a6, 19 * SIZE(AO2) - FMADD y11, a7, b8, y11 + FMADD y01, a7, b8, y01 LFD a7, 20 * SIZE(AO1) - FMADD y12, a8, b8, y12 + FMADD y02, a8, b8, y02 LFD a8, 20 * SIZE(AO2) LFD b5, 21 * SIZE(BO) @@ -2349,18 +2303,18 @@ LL(33): LFD a1, 5 * SIZE(AO1) FMADD y02, a2, b1, y02 LFD a2, 5 * SIZE(AO2) - FMADD y03, a3, b2, y03 + FMADD y01, a3, b2, y01 LFD a3, 6 * SIZE(AO1) - FMADD y04, a4, b2, y04 + FMADD y02, a4, b2, y02 LFD a4, 6 * SIZE(AO2) - FMADD y09, a5, b3, y09 + FMADD y01, a5, b3, y01 LFD a5, 7 * SIZE(AO1) - FMADD y10, a6, b3, y10 + FMADD y02, a6, b3, y02 LFD a6, 7 * SIZE(AO2) - FMADD y11, a7, b4, y11 + FMADD y01, a7, b4, y01 LFD a7, 8 * SIZE(AO1) - FMADD y12, a8, b4, y12 + FMADD y02, a8, b4, y02 LFD a8, 8 * SIZE(AO2) LFD b1, 9 * SIZE(BO) @@ -2372,18 +2326,18 @@ LL(33): LFD a1, 9 * SIZE(AO1) FMADD y02, a2, b5, y02 LFD a2, 9 * SIZE(AO2) - FMADD y03, a3, b6, y03 + FMADD y01, a3, b6, y01 LFD a3, 10 * SIZE(AO1) - FMADD y04, a4, b6, y04 + FMADD y02, a4, b6, y02 LFD a4, 10 * SIZE(AO2) - FMADD y09, a5, b7, y09 + FMADD y01, a5, b7, y01 LFD a5, 11 * SIZE(AO1) - FMADD y10, a6, b7, y10 + FMADD y02, a6, b7, y02 LFD a6, 11 * SIZE(AO2) - FMADD y11, a7, b8, y11 + FMADD y01, a7, b8, y01 LFD a7, 12 * SIZE(AO1) - FMADD y12, a8, b8, y12 + FMADD y02, a8, b8, y02 LFD a8, 12 * SIZE(AO2) LFD b5, 13 * SIZE(BO) @@ -2395,29 +2349,29 @@ LL(33): LFD a1, 13 * SIZE(AO1) FMADD y02, a2, b1, y02 LFD a2, 13 * SIZE(AO2) - FMADD y03, a3, b2, y03 + FMADD y01, a3, b2, y01 LFD a3, 14 * SIZE(AO1) - FMADD y04, a4, b2, y04 + FMADD y02, a4, b2, y02 LFD a4, 14 * SIZE(AO2) - FMADD y09, a5, b3, y09 + FMADD y01, a5, b3, y01 LFD a5, 15 * SIZE(AO1) - FMADD y10, a6, b3, y10 + FMADD y02, a6, b3, y02 LFD a6, 15 * SIZE(AO2) - FMADD y11, a7, b4, y11 + FMADD y01, a7, b4, y01 LFD a7, 16 * SIZE(AO1) - FMADD y12, a8, b4, y12 + FMADD y02, a8, b4, y02 LFD a8, 16 * SIZE(AO2) FMADD y01, a1, b5, y01 FMADD y02, a2, b5, y02 - FMADD y03, a3, b6, y03 - FMADD y04, a4, b6, y04 + FMADD y01, a3, b6, y01 + FMADD y02, a4, b6, y02 - FMADD y09, a5, b7, y09 - FMADD y10, a6, b7, y10 - FMADD y11, a7, b8, y11 - FMADD y12, a8, b8, y12 + FMADD y01, a5, b7, y01 + FMADD y02, a6, b7, y02 + FMADD y01, a7, b8, y01 + FMADD y02, a8, b8, y02 addi AO1, AO1, 16 * SIZE addi AO2, AO2, 16 * SIZE @@ -2454,32 +2408,32 @@ LL(34): LFD a1, 5 * SIZE(AO1) FMADD y02, a2, b1, y02 LFD a2, 5 * SIZE(AO2) - FMADD y09, a3, b2, y09 + FMADD y01, a3, b2, y01 LFD a3, 6 * SIZE(AO1) - FMADD y10, a4, b2, y10 + FMADD y02, a4, b2, y02 LFD a4, 6 * SIZE(AO2) FMADD y01, a5, b3, y01 LFD a5, 7 * SIZE(AO1) FMADD y02, a6, b3, y02 LFD a6, 7 * SIZE(AO2) - FMADD y09, a7, b4, y09 + FMADD y01, a7, b4, y01 LFD a7, 8 * SIZE(AO1) - FMADD y10, a8, b4, y10 + FMADD y02, a8, b4, y02 LFD a8, 8 * SIZE(AO2) FMADD y01, a1, b5, y01 FMADD y02, a2, b5, y02 - FMADD y09, a3, b6, y09 - FMADD y10, a4, b6, y10 + FMADD y01, a3, b6, y01 + FMADD y02, a4, b6, y02 FMADD y01, a5, b7, y01 addi AO1, AO1, 8 * SIZE FMADD y02, a6, b7, y02 addi AO2, AO2, 8 * SIZE - FMADD y09, a7, b8, y09 + FMADD y01, a7, b8, y01 addi BO, BO, 8 * SIZE - FMADD y10, a8, b8, y10 + FMADD y02, a8, b8, y02 nop .align 4 @@ -2504,17 +2458,17 @@ LL(35): FMADD y01, a1, b1, y01 FMADD y02, a2, b1, y02 - FMADD y09, a3, b2, y09 - FMADD y10, a4, b2, y10 + FMADD y01, a3, b2, y01 + FMADD y02, a4, b2, y02 FMADD y01, a5, b3, y01 addi AO1, AO1, 4 * SIZE FMADD y02, a6, b3, y02 addi AO2, AO2, 4 * SIZE - FMADD y09, a7, b4, y09 + FMADD y01, a7, b4, y01 addi BO, BO, 4 * SIZE - FMADD y10, a8, b4, y10 + FMADD y02, a8, b4, y02 .align 4 LL(36): @@ -2531,8 +2485,8 @@ LL(36): FMADD y01, a1, b1, y01 FMADD y02, a2, b1, y02 - FMADD y09, a3, b2, y09 - FMADD y10, a4, b2, y10 + FMADD y01, a3, b2, y01 + FMADD y02, a4, b2, y02 addi AO1, AO1, 2 * SIZE addi AO2, AO2, 2 * SIZE @@ -2560,14 +2514,6 @@ LL(38): LFD a1, 1 * SIZE(CO) LFD a2, 2 * SIZE(CO) - FADD y01, y03, y01 - FADD y02, y04, y02 - FADD y09, y11, y09 - FADD y10, y12, y10 - - FADD y01, y09, y01 - FADD y02, y10, y02 - FMADD a1, alpha, y01, a1 FMADD a2, alpha, y02, a2 @@ -2582,14 +2528,6 @@ LL(39): LFDUX a1, CO, INCY LFDUX a2, CO, INCY - FADD y01, y03, y01 - FADD y02, y04, y02 - FADD y09, y11, y09 - FADD y10, y12, y10 - - FADD y01, y09, y01 - FADD y02, y10, y02 - FMADD a1, alpha, f0, a1 FMADD a2, alpha, f1, a2 @@ -2606,13 +2544,6 @@ LL(40): mr BO, XP lfd y01, FZERO - fmr y02, y01 - fmr y03, y01 - fmr y04, y01 - fmr y09, y01 - fmr y10, y01 - fmr y11, y01 - fmr y12, y01 DCBT(Y1, PREC) @@ -2646,17 +2577,17 @@ LL(42): LFD a1, 9 * SIZE(AO1) LFD b1, 9 * SIZE(BO) - FMADD y02, a2, b2, y02 + FMADD y01, a2, b2, y01 nop LFD a2, 10 * SIZE(AO1) LFD b2, 10 * SIZE(BO) - FMADD y03, a3, b3, y03 + FMADD y01, a3, b3, y01 nop LFD a3, 11 * SIZE(AO1) LFD b3, 11 * SIZE(BO) - FMADD y04, a4, b4, y04 + FMADD y01, a4, b4, y01 nop LFD a4, 12 * SIZE(AO1) LFD b4, 12 * SIZE(BO) @@ -2666,17 +2597,17 @@ LL(42): LFD a5, 13 * SIZE(AO1) LFD b5, 13 * SIZE(BO) - FMADD y02, a6, b6, y02 + FMADD y01, a6, b6, y01 nop LFD a6, 14 * SIZE(AO1) LFD b6, 14 * SIZE(BO) - FMADD y03, a7, b7, y03 + FMADD y01, a7, b7, y01 nop LFD a7, 15 * SIZE(AO1) LFD b7, 15 * SIZE(BO) - FMADD y04, a8, b8, y04 + FMADD y01, a8, b8, y01 nop LFD a8, 16 * SIZE(AO1) LFD b8, 16 * SIZE(BO) @@ -2686,17 +2617,17 @@ LL(42): LFD a1, 17 * SIZE(AO1) LFD b1, 17 * SIZE(BO) - FMADD y02, a2, b2, y02 + FMADD y01, a2, b2, y01 nop LFD a2, 18 * SIZE(AO1) LFD b2, 18 * SIZE(BO) - FMADD y03, a3, b3, y03 + FMADD y01, a3, b3, y01 nop LFD a3, 19 * SIZE(AO1) LFD b3, 19 * SIZE(BO) - FMADD y04, a4, b4, y04 + FMADD y01, a4, b4, y01 nop LFD a4, 20 * SIZE(AO1) LFD b4, 20 * SIZE(BO) @@ -2706,17 +2637,17 @@ LL(42): LFD a5, 21 * SIZE(AO1) LFD b5, 21 * SIZE(BO) - FMADD y02, a6, b6, y02 + FMADD y01, a6, b6, y01 nop LFD a6, 22 * SIZE(AO1) LFD b6, 22 * SIZE(BO) - FMADD y03, a7, b7, y03 + FMADD y01, a7, b7, y01 nop LFD a7, 23 * SIZE(AO1) LFD b7, 23 * SIZE(BO) - FMADD y04, a8, b8, y04 + FMADD y01, a8, b8, y01 nop LFD a8, 24 * SIZE(AO1) LFD b8, 24 * SIZE(BO) @@ -2733,17 +2664,17 @@ LL(43): LFD a1, 9 * SIZE(AO1) LFD b1, 9 * SIZE(BO) - FMADD y02, a2, b2, y02 + FMADD y01, a2, b2, y01 nop LFD a2, 10 * SIZE(AO1) LFD b2, 10 * SIZE(BO) - FMADD y03, a3, b3, y03 + FMADD y01, a3, b3, y01 nop LFD a3, 11 * SIZE(AO1) LFD b3, 11 * SIZE(BO) - FMADD y04, a4, b4, y04 + FMADD y01, a4, b4, y01 nop LFD a4, 12 * SIZE(AO1) LFD b4, 12 * SIZE(BO) @@ -2753,34 +2684,34 @@ LL(43): LFD a5, 13 * SIZE(AO1) LFD b5, 13 * SIZE(BO) - FMADD y02, a6, b6, y02 + FMADD y01, a6, b6, y01 nop LFD a6, 14 * SIZE(AO1) LFD b6, 14 * SIZE(BO) - FMADD y03, a7, b7, y03 + FMADD y01, a7, b7, y01 nop LFD a7, 15 * SIZE(AO1) LFD b7, 15 * SIZE(BO) - FMADD y04, a8, b8, y04 + FMADD y01, a8, b8, y01 nop LFD a8, 16 * SIZE(AO1) LFD b8, 16 * SIZE(BO) FMADD y01, a1, b1, y01 - FMADD y02, a2, b2, y02 - FMADD y03, a3, b3, y03 - FMADD y04, a4, b4, y04 + FMADD y01, a2, b2, y01 + FMADD y01, a3, b3, y01 + FMADD y01, a4, b4, y01 FMADD y01, a5, b5, y01 addi AO1, AO1, 16 * SIZE - FMADD y02, a6, b6, y02 + FMADD y01, a6, b6, y01 addi BO, BO, 16 * SIZE - FMADD y03, a7, b7, y03 + FMADD y01, a7, b7, y01 nop - FMADD y04, a8, b8, y04 + FMADD y01, a8, b8, y01 nop .align 4 @@ -2811,17 +2742,17 @@ LL(44): LFD b8, 8 * SIZE(BO) FMADD y01, a1, b1, y01 - FMADD y02, a2, b2, y02 - FMADD y03, a3, b3, y03 - FMADD y04, a4, b4, y04 + FMADD y01, a2, b2, y01 + FMADD y01, a3, b3, y01 + FMADD y01, a4, b4, y01 FMADD y01, a5, b5, y01 addi AO1, AO1, 8 * SIZE - FMADD y02, a6, b6, y02 + FMADD y01, a6, b6, y01 addi BO, BO, 8 * SIZE - FMADD y03, a7, b7, y03 + FMADD y01, a7, b7, y01 nop - FMADD y04, a8, b8, y04 + FMADD y01, a8, b8, y01 nop .align 4 @@ -2841,12 +2772,12 @@ LL(45): FMADD y01, a1, b1, y01 addi AO1, AO1, 4 * SIZE - FMADD y02, a2, b2, y02 + FMADD y01, a2, b2, y01 addi AO2, AO2, 4 * SIZE - FMADD y03, a3, b3, y03 + FMADD y01, a3, b3, y01 addi BO, BO, 4 * SIZE - FMADD y04, a4, b4, y04 + FMADD y01, a4, b4, y01 nop .align 4 @@ -2861,7 +2792,7 @@ LL(46): FMADD y01, a1, b1, y01 addi AO1, AO1, 2 * SIZE - FMADD y02, a2, b2, y02 + FMADD y01, a2, b2, y01 addi BO, BO, 2 * SIZE .align 4 @@ -2882,10 +2813,6 @@ LL(48): LFD a1, 1 * SIZE(CO) - FADD y01, y02, y01 - FADD y03, y04, y03 - FADD y01, y03, y01 - FMADD a1, alpha, y01, a1 STFD a1, 1 * SIZE(CO) b LL(99) @@ -2893,9 +2820,7 @@ LL(48): LL(49): LFDUX a1, CO, INCY - FADD y01, y02, y01 - FADD y03, y04, y03 - FADD y01, y03, y01 + FMADD a1, alpha, f0, a1 STFDUX a1, BO, INCY .align 4 diff --git a/kernel/power/zgemm_kernel_power6.S b/kernel/power/zgemm_kernel_power6.S index 9b47b9fc1..c513285df 100644 --- a/kernel/power/zgemm_kernel_power6.S +++ b/kernel/power/zgemm_kernel_power6.S @@ -1159,9 +1159,9 @@ LL(20): LL(22): FMA1 f0, f16, f20, f0 - FMA4 f3, f17, f20, f3 FMA2 f1, f16, f21, f1 - FMA3 f2, f17, f21, f2 + FMA4 f1, f17, f20, f1 + FMA3 f0, f17, f21, f0 LFD f28, 4 * SIZE(AO) LFD f29, 5 * SIZE(AO) @@ -1169,9 +1169,9 @@ LL(22): LFD f31, 7 * SIZE(AO) FMA1 f4, f16, f22, f4 - FMA4 f7, f17, f22, f7 FMA2 f5, f16, f23, f5 - FMA3 f6, f17, f23, f6 + FMA4 f5, f17, f22, f5 + FMA3 f4, f17, f23, f4 LFD f20, 8 * SIZE(BO) LFD f21, 9 * SIZE(BO) @@ -1179,14 +1179,14 @@ LL(22): LFD f23, 11 * SIZE(BO) FMA1 f8, f16, f24, f8 - FMA4 f11, f17, f24, f11 FMA2 f9, f16, f25, f9 - FMA3 f10, f17, f25, f10 + FMA4 f9, f17, f24, f9 + FMA3 f8, f17, f25, f8 FMA1 f12, f16, f26, f12 - FMA4 f15, f17, f26, f15 FMA2 f13, f16, f27, f13 - FMA3 f14, f17, f27, f14 + FMA4 f13, f17, f26, f13 + FMA3 f12, f17, f27, f12 LFD f24, 12 * SIZE(BO) LFD f25, 13 * SIZE(BO) @@ -1194,14 +1194,14 @@ LL(22): LFD f27, 15 * SIZE(BO) FMA1 f0, f18, f20, f0 - FMA4 f3, f19, f20, f3 FMA2 f1, f18, f21, f1 - FMA3 f2, f19, f21, f2 + FMA4 f1, f19, f20, f1 + FMA3 f0, f19, f21, f0 FMA1 f4, f18, f22, f4 - FMA4 f7, f19, f22, f7 FMA2 f5, f18, f23, f5 - FMA3 f6, f19, f23, f6 + FMA4 f5, f19, f22, f5 + FMA3 f4, f19, f23, f4 LFD f20, 16 * SIZE(BO) LFD f21, 17 * SIZE(BO) @@ -1209,14 +1209,14 @@ LL(22): LFD f23, 19 * SIZE(BO) FMA1 f8, f18, f24, f8 - FMA4 f11, f19, f24, f11 FMA2 f9, f18, f25, f9 - FMA3 f10, f19, f25, f10 + FMA4 f9, f19, f24, f9 + FMA3 f8, f19, f25, f8 FMA1 f12, f18, f26, f12 - FMA4 f15, f19, f26, f15 FMA2 f13, f18, f27, f13 - FMA3 f14, f19, f27, f14 + FMA4 f13, f19, f26, f13 + FMA3 f12, f19, f27, f12 LFD f24, 20 * SIZE(BO) LFD f25, 21 * SIZE(BO) @@ -1224,9 +1224,9 @@ LL(22): LFD f27, 23 * SIZE(BO) FMA1 f0, f28, f20, f0 - FMA4 f3, f29, f20, f3 FMA2 f1, f28, f21, f1 - FMA3 f2, f29, f21, f2 + FMA4 f1, f29, f20, f1 + FMA3 f0, f29, f21, f0 LFD f16, 8 * SIZE(AO) LFD f17, 9 * SIZE(AO) @@ -1234,9 +1234,9 @@ LL(22): LFD f19, 11 * SIZE(AO) FMA1 f4, f28, f22, f4 - FMA4 f7, f29, f22, f7 FMA2 f5, f28, f23, f5 - FMA3 f6, f29, f23, f6 + FMA4 f5, f29, f22, f5 + FMA3 f4, f29, f23, f4 LFD f20, 24 * SIZE(BO) LFD f21, 25 * SIZE(BO) @@ -1244,14 +1244,14 @@ LL(22): LFD f23, 27 * SIZE(BO) FMA1 f8, f28, f24, f8 - FMA4 f11, f29, f24, f11 FMA2 f9, f28, f25, f9 - FMA3 f10, f29, f25, f10 + FMA4 f9, f29, f24, f9 + FMA3 f8, f29, f25, f8 FMA1 f12, f28, f26, f12 - FMA4 f15, f29, f26, f15 FMA2 f13, f28, f27, f13 - FMA3 f14, f29, f27, f14 + FMA4 f13, f29, f26, f13 + FMA3 f12, f29, f27, f12 LFD f24, 28 * SIZE(BO) LFD f25, 29 * SIZE(BO) @@ -1259,14 +1259,14 @@ LL(22): LFD f27, 31 * SIZE(BO) FMA1 f0, f30, f20, f0 - FMA4 f3, f31, f20, f3 FMA2 f1, f30, f21, f1 - FMA3 f2, f31, f21, f2 + FMA4 f1, f31, f20, f1 + FMA3 f0, f31, f21, f0 FMA1 f4, f30, f22, f4 - FMA4 f7, f31, f22, f7 FMA2 f5, f30, f23, f5 - FMA3 f6, f31, f23, f6 + FMA4 f5, f31, f22, f5 + FMA3 f4, f31, f23, f4 LFD f20, 32 * SIZE(BO) LFD f21, 33 * SIZE(BO) @@ -1274,14 +1274,14 @@ LL(22): LFD f23, 35 * SIZE(BO) FMA1 f8, f30, f24, f8 - FMA4 f11, f31, f24, f11 FMA2 f9, f30, f25, f9 - FMA3 f10, f31, f25, f10 + FMA4 f9, f31, f24, f9 + FMA3 f8, f31, f25, f8 FMA1 f12, f30, f26, f12 - FMA4 f15, f31, f26, f15 FMA2 f13, f30, f27, f13 - FMA3 f14, f31, f27, f14 + FMA4 f13, f31, f26, f13 + FMA3 f12, f31, f27, f12 LFD f24, 36 * SIZE(BO) LFD f25, 37 * SIZE(BO) @@ -1318,14 +1318,14 @@ LL(25): LL(26): FMA1 f0, f16, f20, f0 - FMA4 f3, f17, f20, f3 FMA2 f1, f16, f21, f1 - FMA3 f2, f17, f21, f2 + FMA4 f1, f17, f20, f1 + FMA3 f0, f17, f21, f0 FMA1 f4, f16, f22, f4 - FMA4 f7, f17, f22, f7 FMA2 f5, f16, f23, f5 - FMA3 f6, f17, f23, f6 + FMA4 f5, f17, f22, f5 + FMA3 f4, f17, f23, f4 LFD f20, 8 * SIZE(BO) LFD f21, 9 * SIZE(BO) @@ -1333,14 +1333,14 @@ LL(26): LFD f23, 11 * SIZE(BO) FMA1 f8, f16, f24, f8 - FMA4 f11, f17, f24, f11 FMA2 f9, f16, f25, f9 - FMA3 f10, f17, f25, f10 + FMA4 f9, f17, f24, f9 + FMA3 f8, f17, f25, f8 FMA1 f12, f16, f26, f12 - FMA4 f15, f17, f26, f15 FMA2 f13, f16, f27, f13 - FMA3 f14, f17, f27, f14 + FMA4 f13, f17, f26, f13 + FMA3 f12, f17, f27, f12 LFD f16, 2 * SIZE(AO) LFD f17, 3 * SIZE(AO) @@ -1363,47 +1363,42 @@ LL(28): LFD f18, 0 * SIZE(CO2) LFD f19, 1 * SIZE(CO2) - FADD f0, f0, f2 - FADD f1, f1, f3 - FADD f4, f4, f6 - FADD f5, f5, f7 - LFD f20, 0 * SIZE(CO3) LFD f21, 1 * SIZE(CO3) LFD f22, 0 * SIZE(CO4) LFD f23, 1 * SIZE(CO4) - FADD f8, f8, f10 - FADD f9, f9, f11 - FADD f12, f12, f14 - FADD f13, f13, f15 + fmr f2, f0 + fmr f3, f1 + fmr f6, f4 + fmr f7, f5 - FNMSUB f24, f31, f1, f16 - FMADD f25, f31, f0, f17 - FNMSUB f26, f31, f5, f18 - FMADD f27, f31, f4, f19 + FMADD f24, f30, f0, f16 + FMADD f25, f30, f1, f17 + FMADD f26, f30, f4, f18 + FMADD f27, f30, f5, f19 - FMADD f0, f30, f0, f24 - FMADD f1, f30, f1, f25 - FMADD f4, f30, f4, f26 - FMADD f5, f30, f5, f27 + FNMSUB f0, f31, f3, f24 + FMADD f1, f31, f2, f25 + FNMSUB f4, f31, f7, f26 + FMADD f5, f31, f6, f27 - FNMSUB f24, f31, f9, f20 - FMADD f25, f31, f8, f21 - FNMSUB f26, f31, f13, f22 - FMADD f27, f31, f12, f23 + fmr f10, f8 + fmr f11, f9 + fmr f14, f12 + fmr f15, f13 - FMADD f8, f30, f8, f24 - FMADD f9, f30, f9, f25 - FMADD f12, f30, f12, f26 - FMADD f13, f30, f13, f27 + FMADD f24, f30, f8, f20 + FMADD f25, f30, f9, f21 + FMADD f26, f30, f12, f22 + FMADD f27, f30, f13, f23 + + FNMSUB f8, f31, f11, f24 + FMADD f9, f31, f10, f25 + FNMSUB f12, f31, f15, f26 + FMADD f13, f31, f14, f27 #else - FADD f0, f0, f2 - FADD f1, f1, f3 - FADD f4, f4, f6 - FADD f5, f5, f7 - FMUL f16, f31, f1 FMUL f17, f31, f0 FMUL f18, f31, f5 @@ -1414,11 +1409,6 @@ LL(28): FMSUB f4, f30, f4, f18 FMADD f5, f30, f5, f19 - FADD f8, f8, f10 - FADD f9, f9, f11 - FADD f12, f12, f14 - FADD f13, f13, f15 - FMUL f20, f31, f9 FMUL f21, f31, f8 FMUL f22, f31, f13 @@ -1616,15 +1606,15 @@ LL(32): FMA2 f5, f16, f23, f5 FMA2 f7, f18, f23, f7 - FMA4 f9, f17, f20, f9 - FMA4 f11, f19, f20, f11 - FMA3 f8, f17, f21, f8 - FMA3 f10, f19, f21, f10 + FMA4 f1, f17, f20, f1 + FMA4 f3, f19, f20, f3 + FMA3 f0, f17, f21, f0 + FMA3 f2, f19, f21, f2 - FMA4 f13, f17, f22, f13 - FMA4 f15, f19, f22, f15 - FMA3 f12, f17, f23, f12 - FMA3 f14, f19, f23, f14 + FMA4 f5, f17, f22, f5 + FMA4 f7, f19, f22, f7 + FMA3 f4, f17, f23, f4 + FMA3 f6, f19, f23, f6 LFD f20, 8 * SIZE(BO) LFD f21, 9 * SIZE(BO) @@ -1646,15 +1636,15 @@ LL(32): FMA2 f5, f28, f27, f5 FMA2 f7, f30, f27, f7 - FMA4 f9, f29, f24, f9 - FMA4 f11, f31, f24, f11 - FMA3 f8, f29, f25, f8 - FMA3 f10, f31, f25, f10 + FMA4 f1, f29, f24, f1 + FMA4 f3, f31, f24, f3 + FMA3 f0, f29, f25, f0 + FMA3 f2, f31, f25, f2 - FMA4 f13, f29, f26, f13 - FMA4 f15, f31, f26, f15 - FMA3 f12, f29, f27, f12 - FMA3 f14, f31, f27, f14 + FMA4 f5, f29, f26, f5 + FMA4 f7, f31, f26, f7 + FMA3 f4, f29, f27, f4 + FMA3 f6, f31, f27, f6 LFD f24, 12 * SIZE(BO) LFD f25, 13 * SIZE(BO) @@ -1676,15 +1666,15 @@ LL(32): FMA2 f5, f16, f23, f5 FMA2 f7, f18, f23, f7 - FMA4 f9, f17, f20, f9 - FMA4 f11, f19, f20, f11 - FMA3 f8, f17, f21, f8 - FMA3 f10, f19, f21, f10 + FMA4 f1, f17, f20, f1 + FMA4 f3, f19, f20, f3 + FMA3 f0, f17, f21, f0 + FMA3 f2, f19, f21, f2 - FMA4 f13, f17, f22, f13 - FMA4 f15, f19, f22, f15 - FMA3 f12, f17, f23, f12 - FMA3 f14, f19, f23, f14 + FMA4 f5, f17, f22, f5 + FMA4 f7, f19, f22, f7 + FMA3 f4, f17, f23, f4 + FMA3 f6, f19, f23, f6 LFD f20, 16 * SIZE(BO) LFD f21, 17 * SIZE(BO) @@ -1706,15 +1696,15 @@ LL(32): FMA2 f5, f28, f27, f5 FMA2 f7, f30, f27, f7 - FMA4 f9, f29, f24, f9 - FMA4 f11, f31, f24, f11 - FMA3 f8, f29, f25, f8 - FMA3 f10, f31, f25, f10 + FMA4 f1, f29, f24, f1 + FMA4 f3, f31, f24, f3 + FMA3 f0, f29, f25, f0 + FMA3 f2, f31, f25, f2 - FMA4 f13, f29, f26, f13 - FMA4 f15, f31, f26, f15 - FMA3 f12, f29, f27, f12 - FMA3 f14, f31, f27, f14 + FMA4 f5, f29, f26, f5 + FMA4 f7, f31, f26, f7 + FMA3 f4, f29, f27, f4 + FMA3 f6, f31, f27, f6 LFD f24, 20 * SIZE(BO) LFD f25, 21 * SIZE(BO) @@ -1736,15 +1726,15 @@ LL(32): FMA2 f5, f16, f23, f5 FMA2 f7, f18, f23, f7 - FMA4 f9, f17, f20, f9 - FMA4 f11, f19, f20, f11 - FMA3 f8, f17, f21, f8 - FMA3 f10, f19, f21, f10 + FMA4 f1, f17, f20, f1 + FMA4 f3, f19, f20, f3 + FMA3 f0, f17, f21, f0 + FMA3 f2, f19, f21, f2 - FMA4 f13, f17, f22, f13 - FMA4 f15, f19, f22, f15 - FMA3 f12, f17, f23, f12 - FMA3 f14, f19, f23, f14 + FMA4 f5, f17, f22, f5 + FMA4 f7, f19, f22, f7 + FMA3 f4, f17, f23, f4 + FMA3 f6, f19, f23, f6 LFD f20, 24 * SIZE(BO) LFD f21, 25 * SIZE(BO) @@ -1766,15 +1756,15 @@ LL(32): FMA2 f5, f28, f27, f5 FMA2 f7, f30, f27, f7 - FMA4 f9, f29, f24, f9 - FMA4 f11, f31, f24, f11 - FMA3 f8, f29, f25, f8 - FMA3 f10, f31, f25, f10 + FMA4 f1, f29, f24, f1 + FMA4 f3, f31, f24, f3 + FMA3 f0, f29, f25, f0 + FMA3 f2, f31, f25, f2 - FMA4 f13, f29, f26, f13 - FMA4 f15, f31, f26, f15 - FMA3 f12, f29, f27, f12 - FMA3 f14, f31, f27, f14 + FMA4 f5, f29, f26, f5 + FMA4 f7, f31, f26, f7 + FMA3 f4, f29, f27, f4 + FMA3 f6, f31, f27, f6 LFD f24, 28 * SIZE(BO) LFD f25, 29 * SIZE(BO) @@ -1796,15 +1786,15 @@ LL(32): FMA2 f5, f16, f23, f5 FMA2 f7, f18, f23, f7 - FMA4 f9, f17, f20, f9 - FMA4 f11, f19, f20, f11 - FMA3 f8, f17, f21, f8 - FMA3 f10, f19, f21, f10 + FMA4 f1, f17, f20, f1 + FMA4 f3, f19, f20, f3 + FMA3 f0, f17, f21, f0 + FMA3 f2, f19, f21, f2 - FMA4 f13, f17, f22, f13 - FMA4 f15, f19, f22, f15 - FMA3 f12, f17, f23, f12 - FMA3 f14, f19, f23, f14 + FMA4 f5, f17, f22, f5 + FMA4 f7, f19, f22, f7 + FMA3 f4, f17, f23, f4 + FMA3 f6, f19, f23, f6 LFD f20, 32 * SIZE(BO) LFD f21, 33 * SIZE(BO) @@ -1826,15 +1816,15 @@ LL(32): FMA2 f5, f28, f27, f5 FMA2 f7, f30, f27, f7 - FMA4 f9, f29, f24, f9 - FMA4 f11, f31, f24, f11 - FMA3 f8, f29, f25, f8 - FMA3 f10, f31, f25, f10 + FMA4 f1, f29, f24, f1 + FMA4 f3, f31, f24, f3 + FMA3 f0, f29, f25, f0 + FMA3 f2, f31, f25, f2 - FMA4 f13, f29, f26, f13 - FMA4 f15, f31, f26, f15 - FMA3 f12, f29, f27, f12 - FMA3 f14, f31, f27, f14 + FMA4 f5, f29, f26, f5 + FMA4 f7, f31, f26, f7 + FMA3 f4, f29, f27, f4 + FMA3 f6, f31, f27, f6 LFD f24, 36 * SIZE(BO) LFD f25, 37 * SIZE(BO) @@ -1883,20 +1873,20 @@ LL(36): FMA2 f5, f16, f23, f5 FMA2 f7, f18, f23, f7 - FMA4 f9, f17, f20, f9 - FMA4 f11, f19, f20, f11 - FMA3 f8, f17, f21, f8 - FMA3 f10, f19, f21, f10 + FMA4 f1, f17, f20, f1 + FMA4 f3, f19, f20, f3 + FMA3 f0, f17, f21, f0 + FMA3 f2, f19, f21, f2 LFD f16, 4 * SIZE(AO) LFD f18, 6 * SIZE(AO) LFD f20, 4 * SIZE(BO) LFD f21, 5 * SIZE(BO) - FMA4 f13, f17, f22, f13 - FMA4 f15, f19, f22, f15 - FMA3 f12, f17, f23, f12 - FMA3 f14, f19, f23, f14 + FMA4 f5, f17, f22, f5 + FMA4 f7, f19, f22, f7 + FMA3 f4, f17, f23, f4 + FMA3 f6, f19, f23, f6 LFD f17, 5 * SIZE(AO) LFD f19, 7 * SIZE(AO) @@ -1916,52 +1906,42 @@ LL(38): LFD f18, 2 * SIZE(CO1) LFD f19, 3 * SIZE(CO1) - FADD f0, f0, f8 - FADD f1, f1, f9 - FADD f2, f2, f10 - FADD f3, f3, f11 - LFD f20, 0 * SIZE(CO2) LFD f21, 1 * SIZE(CO2) LFD f22, 2 * SIZE(CO2) LFD f23, 3 * SIZE(CO2) - FADD f4, f4, f12 - FADD f5, f5, f13 - FADD f6, f6, f14 - FADD f7, f7, f15 + fmr f8, f0 + fmr f9, f1 + fmr f10, f2 + fmr f11, f3 - FNMSUB f24, f31, f1, f16 - FMADD f25, f31, f0, f17 - FNMSUB f26, f31, f3, f18 - FMADD f27, f31, f2, f19 + FMADD f24, f30, f0, f16 + FMADD f25, f30, f1, f17 + FMADD f26, f30, f2, f18 + FMADD f27, f30, f3, f19 - FMADD f0, f30, f0, f24 - FMADD f1, f30, f1, f25 - FMADD f2, f30, f2, f26 - FMADD f3, f30, f3, f27 + FNMSUB f0, f31, f9, f24 + FMADD f1, f31, f8, f25 + FNMSUB f2, f31, f11, f26 + FMADD f3, f31, f10, f27 - FNMSUB f24, f31, f5, f20 - FMADD f25, f31, f4, f21 - FNMSUB f26, f31, f7, f22 - FMADD f27, f31, f6, f23 + fmr f12, f4 + fmr f13, f5 + fmr f14, f6 + fmr f15, f7 - FMADD f4, f30, f4, f24 - FMADD f5, f30, f5, f25 - FMADD f6, f30, f6, f26 - FMADD f7, f30, f7, f27 + FMADD f24, f30, f4, f20 + FMADD f25, f30, f5, f21 + FMADD f26, f30, f6, f22 + FMADD f27, f30, f7, f23 + + FNMSUB f4, f31, f13, f24 + FMADD f5, f31, f12, f25 + FNMSUB f6, f31, f15, f26 + FMADD f7, f31, f14, f27 #else - FADD f0, f0, f8 - FADD f1, f1, f9 - FADD f2, f2, f10 - FADD f3, f3, f11 - - FADD f4, f4, f12 - FADD f5, f5, f13 - FADD f6, f6, f14 - FADD f7, f7, f15 - FMUL f16, f31, f1 FMUL f17, f31, f0 FMUL f18, f31, f3 @@ -2101,14 +2081,14 @@ LL(40): LL(42): FMA1 f0, f16, f20, f0 - FMA4 f3, f17, f20, f3 FMA2 f1, f16, f21, f1 - FMA3 f2, f17, f21, f2 + FMA4 f1, f17, f20, f1 + FMA3 f0, f17, f21, f0 FMA1 f4, f16, f22, f4 - FMA4 f7, f17, f22, f7 FMA2 f5, f16, f23, f5 - FMA3 f6, f17, f23, f6 + FMA4 f5, f17, f22, f5 + FMA3 f4, f17, f23, f4 LFD f16, 2 * SIZE(AO) LFD f17, 3 * SIZE(AO) @@ -2119,14 +2099,14 @@ LL(42): LFD f23, 7 * SIZE(BO) FMA1 f0, f16, f20, f0 - FMA4 f3, f17, f20, f3 FMA2 f1, f16, f21, f1 - FMA3 f2, f17, f21, f2 + FMA4 f1, f17, f20, f1 + FMA3 f0, f17, f21, f0 FMA1 f4, f16, f22, f4 - FMA4 f7, f17, f22, f7 FMA2 f5, f16, f23, f5 - FMA3 f6, f17, f23, f6 + FMA4 f5, f17, f22, f5 + FMA3 f4, f17, f23, f4 LFD f16, 4 * SIZE(AO) LFD f17, 5 * SIZE(AO) @@ -2137,14 +2117,14 @@ LL(42): LFD f23, 11 * SIZE(BO) FMA1 f0, f16, f20, f0 - FMA4 f3, f17, f20, f3 FMA2 f1, f16, f21, f1 - FMA3 f2, f17, f21, f2 + FMA4 f1, f17, f20, f1 + FMA3 f0, f17, f21, f0 FMA1 f4, f16, f22, f4 - FMA4 f7, f17, f22, f7 FMA2 f5, f16, f23, f5 - FMA3 f6, f17, f23, f6 + FMA4 f5, f17, f22, f5 + FMA3 f4, f17, f23, f4 LFD f16, 6 * SIZE(AO) LFD f17, 7 * SIZE(AO) @@ -2155,14 +2135,14 @@ LL(42): LFD f23, 15 * SIZE(BO) FMA1 f0, f16, f20, f0 - FMA4 f3, f17, f20, f3 FMA2 f1, f16, f21, f1 - FMA3 f2, f17, f21, f2 + FMA4 f1, f17, f20, f1 + FMA3 f0, f17, f21, f0 FMA1 f4, f16, f22, f4 - FMA4 f7, f17, f22, f7 FMA2 f5, f16, f23, f5 - FMA3 f6, f17, f23, f6 + FMA4 f5, f17, f22, f5 + FMA3 f4, f17, f23, f4 LFD f16, 8 * SIZE(AO) LFD f17, 9 * SIZE(AO) @@ -2202,14 +2182,14 @@ LL(45): LL(46): FMA1 f0, f16, f20, f0 - FMA4 f3, f17, f20, f3 FMA2 f1, f16, f21, f1 - FMA3 f2, f17, f21, f2 + FMA4 f1, f17, f20, f1 + FMA3 f0, f17, f21, f0 FMA1 f4, f16, f22, f4 - FMA4 f7, f17, f22, f7 FMA2 f5, f16, f23, f5 - FMA3 f6, f17, f23, f6 + FMA4 f5, f17, f22, f5 + FMA3 f4, f17, f23, f4 LFD f16, 2 * SIZE(AO) LFD f17, 3 * SIZE(AO) @@ -2231,27 +2211,22 @@ LL(48): LFD f20, 0 * SIZE(CO2) LFD f21, 1 * SIZE(CO2) - FADD f0, f0, f2 - FADD f1, f1, f3 - FADD f4, f4, f6 - FADD f5, f5, f7 + fmr f2, f0 + fmr f3, f1 + fmr f6, f4 + fmr f7, f5 - FNMSUB f24, f31, f1, f16 - FMADD f25, f31, f0, f17 - FNMSUB f26, f31, f5, f20 - FMADD f27, f31, f4, f21 + FMADD f24, f30, f0, f16 + FMADD f25, f30, f1, f17 + FMADD f26, f30, f4, f20 + FMADD f27, f30, f5, f21 - FMADD f0, f30, f0, f24 - FMADD f1, f30, f1, f25 - FMADD f4, f30, f4, f26 - FMADD f5, f30, f5, f27 + FNMSUB f0, f31, f3, f24 + FMADD f1, f31, f2, f25 + FNMSUB f4, f31, f7, f26 + FMADD f5, f31, f6, f27 #else - FADD f0, f0, f2 - FADD f1, f1, f3 - FADD f4, f4, f6 - FADD f5, f5, f7 - FMUL f16, f31, f1 FMUL f17, f31, f0 FMUL f18, f31, f5 @@ -2401,10 +2376,10 @@ LL(52): FMA2 f1, f16, f21, f1 FMA2 f3, f18, f21, f3 - FMA4 f9, f17, f20, f9 - FMA4 f11, f19, f20, f11 - FMA3 f8, f17, f21, f8 - FMA3 f10, f19, f21, f10 + FMA4 f1, f17, f20, f1 + FMA4 f3, f19, f20, f3 + FMA3 f0, f17, f21, f0 + FMA3 f2, f19, f21, f2 LFD f16, 4 * SIZE(AO) LFD f17, 5 * SIZE(AO) @@ -2416,10 +2391,10 @@ LL(52): FMA2 f1, f16, f23, f1 FMA2 f3, f18, f23, f3 - FMA4 f9, f17, f22, f9 - FMA4 f11, f19, f22, f11 - FMA3 f8, f17, f23, f8 - FMA3 f10, f19, f23, f10 + FMA4 f1, f17, f22, f1 + FMA4 f3, f19, f22, f3 + FMA3 f0, f17, f23, f0 + FMA3 f2, f19, f23, f2 LFD f16, 8 * SIZE(AO) LFD f17, 9 * SIZE(AO) @@ -2436,10 +2411,10 @@ LL(52): FMA2 f1, f16, f21, f1 FMA2 f3, f18, f21, f3 - FMA4 f9, f17, f20, f9 - FMA4 f11, f19, f20, f11 - FMA3 f8, f17, f21, f8 - FMA3 f10, f19, f21, f10 + FMA4 f1, f17, f20, f1 + FMA4 f3, f19, f20, f3 + FMA3 f0, f17, f21, f0 + FMA3 f2, f19, f21, f2 LFD f16, 12 * SIZE(AO) LFD f17, 13 * SIZE(AO) @@ -2451,10 +2426,10 @@ LL(52): FMA2 f1, f16, f23, f1 FMA2 f3, f18, f23, f3 - FMA4 f9, f17, f22, f9 - FMA4 f11, f19, f22, f11 - FMA3 f8, f17, f23, f8 - FMA3 f10, f19, f23, f10 + FMA4 f1, f17, f22, f1 + FMA4 f3, f19, f22, f3 + FMA3 f0, f17, f23, f0 + FMA3 f2, f19, f23, f2 LFD f16, 16 * SIZE(AO) LFD f17, 17 * SIZE(AO) @@ -2471,10 +2446,10 @@ LL(52): FMA2 f1, f16, f21, f1 FMA2 f3, f18, f21, f3 - FMA4 f9, f17, f20, f9 - FMA4 f11, f19, f20, f11 - FMA3 f8, f17, f21, f8 - FMA3 f10, f19, f21, f10 + FMA4 f1, f17, f20, f1 + FMA4 f3, f19, f20, f3 + FMA3 f0, f17, f21, f0 + FMA3 f2, f19, f21, f2 LFD f16, 20 * SIZE(AO) LFD f17, 21 * SIZE(AO) @@ -2486,10 +2461,10 @@ LL(52): FMA2 f1, f16, f23, f1 FMA2 f3, f18, f23, f3 - FMA4 f9, f17, f22, f9 - FMA4 f11, f19, f22, f11 - FMA3 f8, f17, f23, f8 - FMA3 f10, f19, f23, f10 + FMA4 f1, f17, f22, f1 + FMA4 f3, f19, f22, f3 + FMA3 f0, f17, f23, f0 + FMA3 f2, f19, f23, f2 LFD f16, 24 * SIZE(AO) LFD f17, 25 * SIZE(AO) @@ -2506,10 +2481,10 @@ LL(52): FMA2 f1, f16, f21, f1 FMA2 f3, f18, f21, f3 - FMA4 f9, f17, f20, f9 - FMA4 f11, f19, f20, f11 - FMA3 f8, f17, f21, f8 - FMA3 f10, f19, f21, f10 + FMA4 f1, f17, f20, f1 + FMA4 f3, f19, f20, f3 + FMA3 f0, f17, f21, f0 + FMA3 f2, f19, f21, f2 LFD f16, 28 * SIZE(AO) LFD f17, 29 * SIZE(AO) @@ -2521,10 +2496,10 @@ LL(52): FMA2 f1, f16, f23, f1 FMA2 f3, f18, f23, f3 - FMA4 f9, f17, f22, f9 - FMA4 f11, f19, f22, f11 - FMA3 f8, f17, f23, f8 - FMA3 f10, f19, f23, f10 + FMA4 f1, f17, f22, f1 + FMA4 f3, f19, f22, f3 + FMA3 f0, f17, f23, f0 + FMA3 f2, f19, f23, f2 LFD f16, 32 * SIZE(AO) LFD f17, 33 * SIZE(AO) @@ -2573,10 +2548,10 @@ LL(56): LFD f16, 4 * SIZE(AO) LFD f18, 6 * SIZE(AO) - FMA4 f9, f17, f20, f9 - FMA4 f11, f19, f20, f11 - FMA3 f8, f17, f21, f8 - FMA3 f10, f19, f21, f10 + FMA4 f1, f17, f20, f1 + FMA4 f3, f19, f20, f3 + FMA3 f0, f17, f21, f0 + FMA3 f2, f19, f21, f2 LFD f17, 5 * SIZE(AO) LFD f19, 7 * SIZE(AO) @@ -2595,27 +2570,22 @@ LL(58): LFD f18, 2 * SIZE(CO1) LFD f19, 3 * SIZE(CO1) - FADD f0, f0, f8 - FADD f1, f1, f9 - FADD f2, f2, f10 - FADD f3, f3, f11 + fmr f8, f0 + fmr f9, f1 + fmr f10, f2 + fmr f11, f3 - FNMSUB f24, f31, f1, f16 - FMADD f25, f31, f0, f17 - FNMSUB f26, f31, f3, f18 - FMADD f27, f31, f2, f19 + FMADD f24, f30, f0, f16 + FMADD f25, f30, f1, f17 + FMADD f26, f30, f2, f18 + FMADD f27, f30, f3, f19 - FMADD f0, f30, f0, f24 - FMADD f1, f30, f1, f25 - FMADD f2, f30, f2, f26 - FMADD f3, f30, f3, f27 + FNMSUB f0, f31, f9, f24 + FMADD f1, f31, f8, f25 + FNMSUB f2, f31, f11, f26 + FMADD f3, f31, f10, f27 #else - FADD f0, f0, f8 - FADD f1, f1, f9 - FADD f2, f2, f10 - FADD f3, f3, f11 - FMUL f16, f31, f1 FMUL f17, f31, f0 FMUL f18, f31, f3 @@ -2735,9 +2705,9 @@ LL(60): LL(62): FMA1 f0, f16, f20, f0 - FMA4 f3, f17, f20, f3 FMA2 f1, f16, f21, f1 - FMA3 f2, f17, f21, f2 + FMA4 f1, f17, f20, f1 + FMA3 f0, f17, f21, f0 LFD f16, 4 * SIZE(AO) LFD f17, 5 * SIZE(AO) @@ -2745,9 +2715,9 @@ LL(62): LFD f21, 5 * SIZE(BO) FMA1 f0, f18, f22, f0 - FMA4 f3, f19, f22, f3 FMA2 f1, f18, f23, f1 - FMA3 f2, f19, f23, f2 + FMA4 f1, f19, f22, f1 + FMA3 f0, f19, f23, f0 LFD f18, 6 * SIZE(AO) LFD f19, 7 * SIZE(AO) @@ -2755,9 +2725,9 @@ LL(62): LFD f23, 7 * SIZE(BO) FMA1 f0, f16, f20, f0 - FMA4 f3, f17, f20, f3 FMA2 f1, f16, f21, f1 - FMA3 f2, f17, f21, f2 + FMA4 f1, f17, f20, f1 + FMA3 f0, f17, f21, f0 LFD f16, 8 * SIZE(AO) LFD f17, 9 * SIZE(AO) @@ -2765,9 +2735,9 @@ LL(62): LFD f21, 9 * SIZE(BO) FMA1 f0, f18, f22, f0 - FMA4 f3, f19, f22, f3 FMA2 f1, f18, f23, f1 - FMA3 f2, f19, f23, f2 + FMA4 f1, f19, f22, f1 + FMA3 f0, f19, f23, f0 LFD f18, 10 * SIZE(AO) LFD f19, 11 * SIZE(AO) @@ -2803,11 +2773,11 @@ LL(65): LL(66): FMA1 f0, f16, f20, f0 - FMA4 f3, f17, f20, f3 - LFD f20, 2 * SIZE(BO) FMA2 f1, f16, f21, f1 LFD f16, 2 * SIZE(AO) - FMA3 f2, f17, f21, f2 + FMA4 f1, f17, f20, f1 + LFD f20, 2 * SIZE(BO) + FMA3 f0, f17, f21, f0 LFD f17, 3 * SIZE(AO) LFD f21, 3 * SIZE(BO) @@ -2821,20 +2791,17 @@ LL(68): LFD f16, 0 * SIZE(CO1) LFD f17, 1 * SIZE(CO1) - FADD f0, f0, f2 - FADD f1, f1, f3 + fmr f2, f0 + fmr f3, f1 - FNMSUB f24, f31, f1, f16 - FMADD f25, f31, f0, f17 + FMADD f24, f30, f0, f16 + FMADD f25, f30, f1, f17 - FMADD f0, f30, f0, f24 - FMADD f1, f30, f1, f25 + FNMSUB f0, f31, f3, f24 + FMADD f1, f31, f2, f25 #else - FADD f0, f0, f2 - FADD f1, f1, f3 - FMUL f16, f31, f1 FMUL f17, f31, f0