592 lines
		
	
	
		
			35 KiB
		
	
	
	
		
			C
		
	
	
	
			
		
		
	
	
			592 lines
		
	
	
		
			35 KiB
		
	
	
	
		
			C
		
	
	
	
/*******************************************************************************
 | 
						|
Copyright (c) 2016, The OpenBLAS Project
 | 
						|
All rights reserved.
 | 
						|
Redistribution and use in source and binary forms, with or without
 | 
						|
modification, are permitted provided that the following conditions are
 | 
						|
met:
 | 
						|
1. Redistributions of source code must retain the above copyright
 | 
						|
notice, this list of conditions and the following disclaimer.
 | 
						|
2. Redistributions in binary form must reproduce the above copyright
 | 
						|
notice, this list of conditions and the following disclaimer in
 | 
						|
the documentation and/or other materials provided with the
 | 
						|
distribution.
 | 
						|
3. Neither the name of the OpenBLAS project nor the names of
 | 
						|
its contributors may be used to endorse or promote products
 | 
						|
derived from this software without specific prior written permission.
 | 
						|
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 | 
						|
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 | 
						|
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 | 
						|
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
 | 
						|
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 | 
						|
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 | 
						|
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 | 
						|
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 | 
						|
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
 | 
						|
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 | 
						|
*******************************************************************************/
 | 
						|
 | 
						|
#include "common.h"
 | 
						|
#include "macros_msa.h"
 | 
						|
 | 
						|
#undef OP0
 | 
						|
#undef OP1
 | 
						|
#undef OP2
 | 
						|
#undef OP3
 | 
						|
#undef OP4
 | 
						|
 | 
						|
#if !defined(CONJ)
 | 
						|
    #if !defined(XCONJ)
 | 
						|
        #define OP0  -=
 | 
						|
        #define OP1  +=
 | 
						|
        #define OP2  +=
 | 
						|
    #else
 | 
						|
        #define OP0  +=
 | 
						|
        #define OP1  +=
 | 
						|
        #define OP2  -=
 | 
						|
    #endif
 | 
						|
#else
 | 
						|
    #if !defined(XCONJ)
 | 
						|
        #define OP0  +=
 | 
						|
        #define OP1  -=
 | 
						|
        #define OP2  +=
 | 
						|
    #else
 | 
						|
        #define OP0  -=
 | 
						|
        #define OP1  -=
 | 
						|
        #define OP2  -=
 | 
						|
    #endif
 | 
						|
#endif
 | 
						|
 | 
						|
#define ZGEMV_T_8x1()                     \
 | 
						|
    LD_DP4(pa0, 2, t0, t1, t2, t3);       \
 | 
						|
    LD_DP4(pa0 + 8, 2, t4, t5, t6, t7);   \
 | 
						|
                                          \
 | 
						|
    PCKEVOD_D2_DP(t1, t0, src0r, src0i);  \
 | 
						|
    PCKEVOD_D2_DP(t3, t2, src1r, src1i);  \
 | 
						|
    PCKEVOD_D2_DP(t5, t4, src2r, src2i);  \
 | 
						|
    PCKEVOD_D2_DP(t7, t6, src3r, src3i);  \
 | 
						|
                                          \
 | 
						|
    tp0r += src0r * x0r;                  \
 | 
						|
    tp0i OP1 src0r * x0i;                 \
 | 
						|
    tp0r OP0 src0i * x0i;                 \
 | 
						|
    tp0i OP2 src0i * x0r;                 \
 | 
						|
                                          \
 | 
						|
    tp0r += src2r * x2r;                  \
 | 
						|
    tp0i OP1 src2r * x2i;                 \
 | 
						|
    tp0r OP0 src2i * x2i;                 \
 | 
						|
    tp0i OP2 src2i * x2r;                 \
 | 
						|
                                          \
 | 
						|
    tp0r += src1r * x1r;                  \
 | 
						|
    tp0i OP1 src1r * x1i;                 \
 | 
						|
    tp0r OP0 src1i * x1i;                 \
 | 
						|
    tp0i OP2 src1i * x1r;                 \
 | 
						|
                                          \
 | 
						|
    tp0r += src3r * x3r;                  \
 | 
						|
    tp0i OP1 src3r * x3i;                 \
 | 
						|
    tp0r OP0 src3i * x3i;                 \
 | 
						|
    tp0i OP2 src3i * x3r;                 \
 | 
						|
 | 
						|
#define ZGEMV_T_4x1()                     \
 | 
						|
    LD_DP4(pa0, 2, t0, t1, t2, t3);       \
 | 
						|
                                          \
 | 
						|
    PCKEVOD_D2_DP(t1, t0, src0r, src0i);  \
 | 
						|
    PCKEVOD_D2_DP(t3, t2, src1r, src1i);  \
 | 
						|
                                          \
 | 
						|
    tp0r += src0r * x0r;                  \
 | 
						|
    tp0r += src1r * x1r;                  \
 | 
						|
    tp0r OP0 src0i * x0i;                 \
 | 
						|
    tp0r OP0 src1i * x1i;                 \
 | 
						|
                                          \
 | 
						|
    tp0i OP1 src0r * x0i;                 \
 | 
						|
    tp0i OP1 src1r * x1i;                 \
 | 
						|
    tp0i OP2 src0i * x0r;                 \
 | 
						|
    tp0i OP2 src1i * x1r;                 \
 | 
						|
 | 
						|
#define ZGEMV_T_2x1()                     \
 | 
						|
    LD_DP2(pa0, 2, t0, t1);               \
 | 
						|
                                          \
 | 
						|
    PCKEVOD_D2_DP(t1, t0, src0r, src0i);  \
 | 
						|
                                          \
 | 
						|
    tp0r += src0r * x0r;                  \
 | 
						|
    tp0r OP0 src0i * x0i;                 \
 | 
						|
                                          \
 | 
						|
    tp0i OP1 src0r * x0i;                 \
 | 
						|
    tp0i OP2 src0i * x0r;                 \
 | 
						|
 | 
						|
#define ZGEMV_T_1x1()                       \
 | 
						|
    temp0r  += pa0[0] * x[0 * inc_x2];      \
 | 
						|
    temp0r OP0 pa0[1] * x[0 * inc_x2 + 1];  \
 | 
						|
                                            \
 | 
						|
    temp0i OP1 pa0[0] * x[0 * inc_x2 + 1];  \
 | 
						|
    temp0i OP2 pa0[1] * x[0 * inc_x2];      \
 | 
						|
 | 
						|
#define ZSCALE_STORE_Y1_GP()    \
 | 
						|
    res0r = y[0 * inc_y2];      \
 | 
						|
    res0i = y[0 * inc_y2 + 1];  \
 | 
						|
                                \
 | 
						|
    res0r  += alphar * temp0r;  \
 | 
						|
    res0r OP0 alphai * temp0i;  \
 | 
						|
                                \
 | 
						|
    res0i OP1 alphar * temp0i;  \
 | 
						|
    res0i OP2 alphai * temp0r;  \
 | 
						|
                                \
 | 
						|
    y[0 * inc_y2] = res0r;      \
 | 
						|
    y[0 * inc_y2 + 1] = res0i;  \
 | 
						|
 | 
						|
#define ZLOAD_X8_VECTOR()             \
 | 
						|
    LD_DP4(x, 2, x0, x1, x2, x3);     \
 | 
						|
    LD_DP4(x + 8, 2, x4, x5, x6, x7); \
 | 
						|
                                      \
 | 
						|
    PCKEVOD_D2_DP(x1, x0, x0r, x0i);  \
 | 
						|
    PCKEVOD_D2_DP(x3, x2, x1r, x1i);  \
 | 
						|
    PCKEVOD_D2_DP(x5, x4, x2r, x2i);  \
 | 
						|
    PCKEVOD_D2_DP(x7, x6, x3r, x3i);  \
 | 
						|
 | 
						|
#define ZLOAD_X4_VECTOR()             \
 | 
						|
    LD_DP4(x, 2, x0, x1, x2, x3);     \
 | 
						|
    PCKEVOD_D2_DP(x1, x0, x0r, x0i);  \
 | 
						|
    PCKEVOD_D2_DP(x3, x2, x1r, x1i);  \
 | 
						|
 | 
						|
#define ZLOAD_X2_VECTOR()             \
 | 
						|
    LD_DP2(x, 2, x0, x1);             \
 | 
						|
    PCKEVOD_D2_DP(x1, x0, x0r, x0i);  \
 | 
						|
 | 
						|
#define ZLOAD_X8_GP()                                                                      \
 | 
						|
    x0r = (v2f64) __msa_insert_d((v2i64) tp0r, 0, *((long long *) (x + 0 * inc_x2)));      \
 | 
						|
    x0r = (v2f64) __msa_insert_d((v2i64) x0r,  1, *((long long *) (x + 1 * inc_x2)));      \
 | 
						|
    x1r = (v2f64) __msa_insert_d((v2i64) tp0r, 0, *((long long *) (x + 2 * inc_x2)));      \
 | 
						|
    x1r = (v2f64) __msa_insert_d((v2i64) x1r,  1, *((long long *) (x + 3 * inc_x2)));      \
 | 
						|
    x2r = (v2f64) __msa_insert_d((v2i64) tp0r, 0, *((long long *) (x + 4 * inc_x2)));      \
 | 
						|
    x2r = (v2f64) __msa_insert_d((v2i64) x2r,  1, *((long long *) (x + 5 * inc_x2)));      \
 | 
						|
    x3r = (v2f64) __msa_insert_d((v2i64) tp0r, 0, *((long long *) (x + 6 * inc_x2)));      \
 | 
						|
    x3r = (v2f64) __msa_insert_d((v2i64) x3r,  1, *((long long *) (x + 7 * inc_x2)));      \
 | 
						|
    x0i = (v2f64) __msa_insert_d((v2i64) tp0r, 0, *((long long *) (x + 0 * inc_x2 + 1)));  \
 | 
						|
    x0i = (v2f64) __msa_insert_d((v2i64) x0i,  1, *((long long *) (x + 1 * inc_x2 + 1)));  \
 | 
						|
    x1i = (v2f64) __msa_insert_d((v2i64) tp0r, 0, *((long long *) (x + 2 * inc_x2 + 1)));  \
 | 
						|
    x1i = (v2f64) __msa_insert_d((v2i64) x1i,  1, *((long long *) (x + 3 * inc_x2 + 1)));  \
 | 
						|
    x2i = (v2f64) __msa_insert_d((v2i64) tp0r, 0, *((long long *) (x + 4 * inc_x2 + 1)));  \
 | 
						|
    x2i = (v2f64) __msa_insert_d((v2i64) x2i,  1, *((long long *) (x + 5 * inc_x2 + 1)));  \
 | 
						|
    x3i = (v2f64) __msa_insert_d((v2i64) tp0r, 0, *((long long *) (x + 6 * inc_x2 + 1)));  \
 | 
						|
    x3i = (v2f64) __msa_insert_d((v2i64) x3i,  1, *((long long *) (x + 7 * inc_x2 + 1)));  \
 | 
						|
 | 
						|
#define ZLOAD_X4_GP()                                                                      \
 | 
						|
    x0r = (v2f64) __msa_insert_d((v2i64) tp0r, 0, *((long long *) (x + 0 * inc_x2)));      \
 | 
						|
    x0r = (v2f64) __msa_insert_d((v2i64) x0r,  1, *((long long *) (x + 1 * inc_x2)));      \
 | 
						|
    x1r = (v2f64) __msa_insert_d((v2i64) tp0r, 0, *((long long *) (x + 2 * inc_x2)));      \
 | 
						|
    x1r = (v2f64) __msa_insert_d((v2i64) x1r,  1, *((long long *) (x + 3 * inc_x2)));      \
 | 
						|
    x0i = (v2f64) __msa_insert_d((v2i64) tp0r, 0, *((long long *) (x + 0 * inc_x2 + 1)));  \
 | 
						|
    x0i = (v2f64) __msa_insert_d((v2i64) x0i,  1, *((long long *) (x + 1 * inc_x2 + 1)));  \
 | 
						|
    x1i = (v2f64) __msa_insert_d((v2i64) tp0r, 0, *((long long *) (x + 2 * inc_x2 + 1)));  \
 | 
						|
    x1i = (v2f64) __msa_insert_d((v2i64) x1i,  1, *((long long *) (x + 3 * inc_x2 + 1)));  \
 | 
						|
 | 
						|
#define ZLOAD_X2_GP()                                                                      \
 | 
						|
    x0r = (v2f64) __msa_insert_d((v2i64) tp0r, 0, *((long long *) (x + 0 * inc_x2)));      \
 | 
						|
    x0r = (v2f64) __msa_insert_d((v2i64) x0r,  1, *((long long *) (x + 1 * inc_x2)));      \
 | 
						|
    x0i = (v2f64) __msa_insert_d((v2i64) tp0r, 0, *((long long *) (x + 0 * inc_x2 + 1)));  \
 | 
						|
    x0i = (v2f64) __msa_insert_d((v2i64) x0i,  1, *((long long *) (x + 1 * inc_x2 + 1)));  \
 | 
						|
 | 
						|
#define ZGEMV_T_MSA()                                                   \
 | 
						|
    for (j = n; j--;)                                                   \
 | 
						|
    {                                                                   \
 | 
						|
        tp0r = zero;                                                    \
 | 
						|
        tp0i = zero;                                                    \
 | 
						|
        tp1r = zero;                                                    \
 | 
						|
        tp1i = zero;                                                    \
 | 
						|
        tp2r = zero;                                                    \
 | 
						|
        tp2i = zero;                                                    \
 | 
						|
        tp3r = zero;                                                    \
 | 
						|
        tp3i = zero;                                                    \
 | 
						|
                                                                        \
 | 
						|
        pa0 = A;                                                        \
 | 
						|
        x = srcx_org;                                                   \
 | 
						|
                                                                        \
 | 
						|
        if (m >> 4)                                                     \
 | 
						|
        {                                                               \
 | 
						|
            x0 = LD_DP(x);                                              \
 | 
						|
            x1 = LD_DP(x + 1 * inc_x2);                                 \
 | 
						|
            t0 = LD_DP(pa0);                                            \
 | 
						|
            t1 = LD_DP(pa0 + 2);                                        \
 | 
						|
                                                                        \
 | 
						|
            x4 = LD_DP(x + 4 * inc_x2);                                 \
 | 
						|
            x5 = LD_DP(x + 5 * inc_x2);                                 \
 | 
						|
            t4 = LD_DP(pa0 + 8);                                        \
 | 
						|
            t5 = LD_DP(pa0 + 10);                                       \
 | 
						|
                                                                        \
 | 
						|
            for (i = (m >> 4) - 1; i--;)                                \
 | 
						|
            {                                                           \
 | 
						|
                pa0_pref = pa0 + pref_offset;                           \
 | 
						|
                                                                        \
 | 
						|
                PREFETCH(pa0_pref + 36);                                \
 | 
						|
                PREFETCH(pa0_pref + 44);                                \
 | 
						|
                PREFETCH(pa0_pref + 48);                                \
 | 
						|
                PREFETCH(pa0_pref + 52);                                \
 | 
						|
                PREFETCH(pa0_pref + 56);                                \
 | 
						|
                PREFETCH(pa0_pref + 60);                                \
 | 
						|
                PREFETCH(pa0_pref + 64);                                \
 | 
						|
                PREFETCH(pa0_pref + 72);                                \
 | 
						|
                                                                        \
 | 
						|
                x0r = (v2f64) __msa_pckev_d((v2i64) x1, (v2i64) x0);    \
 | 
						|
                x0i = (v2f64) __msa_pckod_d((v2i64) x1, (v2i64) x0);    \
 | 
						|
                src0r = (v2f64) __msa_pckev_d((v2i64) t1, (v2i64) t0);  \
 | 
						|
                src0i = (v2f64) __msa_pckod_d((v2i64) t1, (v2i64) t0);  \
 | 
						|
                                                                        \
 | 
						|
                tp0r += src0r * x0r;                                    \
 | 
						|
                x2 = LD_DP(x + 2 * inc_x2);                             \
 | 
						|
                x2r = (v2f64) __msa_pckev_d((v2i64) x5, (v2i64) x4);    \
 | 
						|
                                                                        \
 | 
						|
                tp0i OP1 src0r * x0i;                                   \
 | 
						|
                x3 = LD_DP(x + 3 * inc_x2);                             \
 | 
						|
                x2i = (v2f64) __msa_pckod_d((v2i64) x5, (v2i64) x4);    \
 | 
						|
                                                                        \
 | 
						|
                tp1r OP0 src0i * x0i;                                   \
 | 
						|
                t2 = LD_DP(pa0 + 4);                                    \
 | 
						|
                src2r = (v2f64) __msa_pckev_d((v2i64) t5, (v2i64) t4);  \
 | 
						|
                                                                        \
 | 
						|
                tp1i OP2 src0i * x0r;                                   \
 | 
						|
                t3 = LD_DP(pa0 + 6);                                    \
 | 
						|
                src2i = (v2f64) __msa_pckod_d((v2i64) t5, (v2i64) t4);  \
 | 
						|
                                                                        \
 | 
						|
                tp2r += src2r * x2r;                                    \
 | 
						|
                x6 = LD_DP(x + 6 * inc_x2);                             \
 | 
						|
                                                                        \
 | 
						|
                tp2i OP1 src2r * x2i;                                   \
 | 
						|
                x7 = LD_DP(x + 7 * inc_x2);                             \
 | 
						|
                                                                        \
 | 
						|
                tp3r OP0 src2i * x2i;                                   \
 | 
						|
                t6 = LD_DP(pa0 + 12);                                   \
 | 
						|
                                                                        \
 | 
						|
                tp3i OP2 src2i * x2r;                                   \
 | 
						|
                t7 = LD_DP(pa0 + 14);                                   \
 | 
						|
                                                                        \
 | 
						|
                x1r = (v2f64) __msa_pckev_d((v2i64) x3, (v2i64) x2);    \
 | 
						|
                x1i = (v2f64) __msa_pckod_d((v2i64) x3, (v2i64) x2);    \
 | 
						|
                src1r = (v2f64) __msa_pckev_d((v2i64) t3, (v2i64) t2);  \
 | 
						|
                src1i = (v2f64) __msa_pckod_d((v2i64) t3, (v2i64) t2);  \
 | 
						|
                                                                        \
 | 
						|
                tp0r += src1r * x1r;                                    \
 | 
						|
                x0 = LD_DP(x +  8 * inc_x2);                            \
 | 
						|
                x3r = (v2f64) __msa_pckev_d((v2i64) x7, (v2i64) x6);    \
 | 
						|
                                                                        \
 | 
						|
                tp0i OP1 src1r * x1i;                                   \
 | 
						|
                x1 = LD_DP(x +  9 * inc_x2);                            \
 | 
						|
                x3i = (v2f64) __msa_pckod_d((v2i64) x7, (v2i64) x6);    \
 | 
						|
                                                                        \
 | 
						|
                tp1r OP0 src1i * x1i;                                   \
 | 
						|
                t0 = LD_DP(pa0 + 16);                                   \
 | 
						|
                src3r = (v2f64) __msa_pckev_d((v2i64) t7, (v2i64) t6);  \
 | 
						|
                                                                        \
 | 
						|
                tp1i OP2 src1i * x1r;                                   \
 | 
						|
                t1 = LD_DP(pa0 + 18);                                   \
 | 
						|
                src3i = (v2f64) __msa_pckod_d((v2i64) t7, (v2i64) t6);  \
 | 
						|
                                                                        \
 | 
						|
                tp2r += src3r * x3r;                                    \
 | 
						|
                x4 = LD_DP(x + 12 * inc_x2);                            \
 | 
						|
                                                                        \
 | 
						|
                tp2i OP1 src3r * x3i;                                   \
 | 
						|
                x5 = LD_DP(x + 13 * inc_x2);                            \
 | 
						|
                                                                        \
 | 
						|
                tp3r OP0 src3i * x3i;                                   \
 | 
						|
                t4 = LD_DP(pa0 + 24);                                   \
 | 
						|
                                                                        \
 | 
						|
                tp3i OP2 src3i * x3r;                                   \
 | 
						|
                t5 = LD_DP(pa0 + 26);                                   \
 | 
						|
                                                                        \
 | 
						|
                x0r = (v2f64) __msa_pckev_d((v2i64) x1, (v2i64) x0);    \
 | 
						|
                x0i = (v2f64) __msa_pckod_d((v2i64) x1, (v2i64) x0);    \
 | 
						|
                src0r = (v2f64) __msa_pckev_d((v2i64) t1, (v2i64) t0);  \
 | 
						|
                src0i = (v2f64) __msa_pckod_d((v2i64) t1, (v2i64) t0);  \
 | 
						|
                                                                        \
 | 
						|
                tp0r += src0r * x0r;                                    \
 | 
						|
                x2 = LD_DP(x + 10 * inc_x2);                            \
 | 
						|
                x2r = (v2f64) __msa_pckev_d((v2i64) x5, (v2i64) x4);    \
 | 
						|
                                                                        \
 | 
						|
                tp0i OP1 src0r * x0i;                                   \
 | 
						|
                x3 = LD_DP(x + 11 * inc_x2);                            \
 | 
						|
                x2i = (v2f64) __msa_pckod_d((v2i64) x5, (v2i64) x4);    \
 | 
						|
                                                                        \
 | 
						|
                tp1r OP0 src0i * x0i;                                   \
 | 
						|
                t2 = LD_DP(pa0 + 20);                                   \
 | 
						|
                src2r = (v2f64) __msa_pckev_d((v2i64) t5, (v2i64) t4);  \
 | 
						|
                                                                        \
 | 
						|
                tp1i OP2 src0i * x0r;                                   \
 | 
						|
                t3 = LD_DP(pa0 + 22);                                   \
 | 
						|
                src2i = (v2f64) __msa_pckod_d((v2i64) t5, (v2i64) t4);  \
 | 
						|
                                                                        \
 | 
						|
                tp2r += src2r * x2r;                                    \
 | 
						|
                x6 = LD_DP(x + 14 * inc_x2);                            \
 | 
						|
                                                                        \
 | 
						|
                tp2i OP1 src2r * x2i;                                   \
 | 
						|
                x7 = LD_DP(x + 15 * inc_x2);                            \
 | 
						|
                                                                        \
 | 
						|
                tp3r OP0 src2i * x2i;                                   \
 | 
						|
                t6 = LD_DP(pa0 + 28);                                   \
 | 
						|
                                                                        \
 | 
						|
                tp3i OP2 src2i * x2r;                                   \
 | 
						|
                t7 = LD_DP(pa0 + 30);                                   \
 | 
						|
                                                                        \
 | 
						|
                x1r = (v2f64) __msa_pckev_d((v2i64) x3, (v2i64) x2);    \
 | 
						|
                x1i = (v2f64) __msa_pckod_d((v2i64) x3, (v2i64) x2);    \
 | 
						|
                src1r = (v2f64) __msa_pckev_d((v2i64) t3, (v2i64) t2);  \
 | 
						|
                src1i = (v2f64) __msa_pckod_d((v2i64) t3, (v2i64) t2);  \
 | 
						|
                                                                        \
 | 
						|
                tp0r += src1r * x1r;                                    \
 | 
						|
                x0 = LD_DP(x + inc_x2 * 16);                            \
 | 
						|
                x3r = (v2f64) __msa_pckev_d((v2i64) x7, (v2i64) x6);    \
 | 
						|
                                                                        \
 | 
						|
                tp0i OP1 src1r * x1i;                                   \
 | 
						|
                x1 = LD_DP(x + inc_x2 * 16 + 1 * inc_x2);               \
 | 
						|
                x3i = (v2f64) __msa_pckod_d((v2i64) x7, (v2i64) x6);    \
 | 
						|
                                                                        \
 | 
						|
                tp1r OP0 src1i * x1i;                                   \
 | 
						|
                t0 = LD_DP(pa0 + 2 * 16);                               \
 | 
						|
                src3r = (v2f64) __msa_pckev_d((v2i64) t7, (v2i64) t6);  \
 | 
						|
                                                                        \
 | 
						|
                tp1i OP2 src1i * x1r;                                   \
 | 
						|
                t1 = LD_DP(pa0 + 2 * 16 + 2);                           \
 | 
						|
                src3i = (v2f64) __msa_pckod_d((v2i64) t7, (v2i64) t6);  \
 | 
						|
                                                                        \
 | 
						|
                tp2r += src3r * x3r;                                    \
 | 
						|
                x4 = LD_DP(x + inc_x2 * 16 + 4 * inc_x2);               \
 | 
						|
                                                                        \
 | 
						|
                tp2i OP1 src3r * x3i;                                   \
 | 
						|
                x5 = LD_DP(x + inc_x2 * 16 + 5 * inc_x2);               \
 | 
						|
                                                                        \
 | 
						|
                tp3r OP0 src3i * x3i;                                   \
 | 
						|
                t4 = LD_DP(pa0 + 2 * 16 + 8);                           \
 | 
						|
                                                                        \
 | 
						|
                tp3i OP2 src3i * x3r;                                   \
 | 
						|
                t5 = LD_DP(pa0 + 2 * 16 + 10);                          \
 | 
						|
                                                                        \
 | 
						|
                pa0 += 2 * 16;                                          \
 | 
						|
                x += inc_x2 * 16;                                       \
 | 
						|
            }                                                           \
 | 
						|
                                                                        \
 | 
						|
            x0r = (v2f64) __msa_pckev_d((v2i64) x1, (v2i64) x0);        \
 | 
						|
            x0i = (v2f64) __msa_pckod_d((v2i64) x1, (v2i64) x0);        \
 | 
						|
            src0r = (v2f64) __msa_pckev_d((v2i64) t1, (v2i64) t0);      \
 | 
						|
            src0i = (v2f64) __msa_pckod_d((v2i64) t1, (v2i64) t0);      \
 | 
						|
                                                                        \
 | 
						|
            tp0r += src0r * x0r;                                        \
 | 
						|
            x2 = LD_DP(x + 2 * inc_x2);                                 \
 | 
						|
            x2r = (v2f64) __msa_pckev_d((v2i64) x5, (v2i64) x4);        \
 | 
						|
                                                                        \
 | 
						|
            tp0i OP1 src0r * x0i;                                       \
 | 
						|
            x3 = LD_DP(x + 3 * inc_x2);                                 \
 | 
						|
            x2i = (v2f64) __msa_pckod_d((v2i64) x5, (v2i64) x4);        \
 | 
						|
                                                                        \
 | 
						|
            tp1r OP0 src0i * x0i;                                       \
 | 
						|
            t2 = LD_DP(pa0 + 4);                                        \
 | 
						|
            src2r = (v2f64) __msa_pckev_d((v2i64) t5, (v2i64) t4);      \
 | 
						|
                                                                        \
 | 
						|
            tp1i OP2 src0i * x0r;                                       \
 | 
						|
            t3 = LD_DP(pa0 + 6);                                        \
 | 
						|
            src2i = (v2f64) __msa_pckod_d((v2i64) t5, (v2i64) t4);      \
 | 
						|
                                                                        \
 | 
						|
            tp2r += src2r * x2r;                                        \
 | 
						|
            x6 = LD_DP(x + 6 * inc_x2);                                 \
 | 
						|
                                                                        \
 | 
						|
            tp2i OP1 src2r * x2i;                                       \
 | 
						|
            x7 = LD_DP(x + 7 * inc_x2);                                 \
 | 
						|
                                                                        \
 | 
						|
            tp3r OP0 src2i * x2i;                                       \
 | 
						|
            t6 = LD_DP(pa0 + 12);                                       \
 | 
						|
                                                                        \
 | 
						|
            tp3i OP2 src2i * x2r;                                       \
 | 
						|
            t7 = LD_DP(pa0 + 14);                                       \
 | 
						|
                                                                        \
 | 
						|
            x1r = (v2f64) __msa_pckev_d((v2i64) x3, (v2i64) x2);        \
 | 
						|
            x1i = (v2f64) __msa_pckod_d((v2i64) x3, (v2i64) x2);        \
 | 
						|
            src1r = (v2f64) __msa_pckev_d((v2i64) t3, (v2i64) t2);      \
 | 
						|
            src1i = (v2f64) __msa_pckod_d((v2i64) t3, (v2i64) t2);      \
 | 
						|
                                                                        \
 | 
						|
            tp0r += src1r * x1r;                                        \
 | 
						|
            x0 = LD_DP(x +  8 * inc_x2);                                \
 | 
						|
            x3r = (v2f64) __msa_pckev_d((v2i64) x7, (v2i64) x6);        \
 | 
						|
                                                                        \
 | 
						|
            tp0i OP1 src1r * x1i;                                       \
 | 
						|
            x1 = LD_DP(x +  9 * inc_x2);                                \
 | 
						|
            x3i = (v2f64) __msa_pckod_d((v2i64) x7, (v2i64) x6);        \
 | 
						|
                                                                        \
 | 
						|
            tp1r OP0 src1i * x1i;                                       \
 | 
						|
            t0 = LD_DP(pa0 + 16);                                       \
 | 
						|
            src3r = (v2f64) __msa_pckev_d((v2i64) t7, (v2i64) t6);      \
 | 
						|
                                                                        \
 | 
						|
            tp1i OP2 src1i * x1r;                                       \
 | 
						|
            t1 = LD_DP(pa0 + 18);                                       \
 | 
						|
            src3i = (v2f64) __msa_pckod_d((v2i64) t7, (v2i64) t6);      \
 | 
						|
                                                                        \
 | 
						|
            tp2r += src3r * x3r;                                        \
 | 
						|
            x4 = LD_DP(x + 12 * inc_x2);                                \
 | 
						|
                                                                        \
 | 
						|
            tp2i OP1 src3r * x3i;                                       \
 | 
						|
            x5 = LD_DP(x + 13 * inc_x2);                                \
 | 
						|
                                                                        \
 | 
						|
            tp3r OP0 src3i * x3i;                                       \
 | 
						|
            t4 = LD_DP(pa0 + 24);                                       \
 | 
						|
                                                                        \
 | 
						|
            tp3i OP2 src3i * x3r;                                       \
 | 
						|
            t5 = LD_DP(pa0 + 26);                                       \
 | 
						|
                                                                        \
 | 
						|
            x0r = (v2f64) __msa_pckev_d((v2i64) x1, (v2i64) x0);        \
 | 
						|
            x0i = (v2f64) __msa_pckod_d((v2i64) x1, (v2i64) x0);        \
 | 
						|
            src0r = (v2f64) __msa_pckev_d((v2i64) t1, (v2i64) t0);      \
 | 
						|
            src0i = (v2f64) __msa_pckod_d((v2i64) t1, (v2i64) t0);      \
 | 
						|
                                                                        \
 | 
						|
            tp0r += src0r * x0r;                                        \
 | 
						|
            x2 = LD_DP(x + 10 * inc_x2);                                \
 | 
						|
            x2r = (v2f64) __msa_pckev_d((v2i64) x5, (v2i64) x4);        \
 | 
						|
                                                                        \
 | 
						|
            tp0i OP1 src0r * x0i;                                       \
 | 
						|
            x3 = LD_DP(x + 11 * inc_x2);                                \
 | 
						|
            x2i = (v2f64) __msa_pckod_d((v2i64) x5, (v2i64) x4);        \
 | 
						|
                                                                        \
 | 
						|
            tp1r OP0 src0i * x0i;                                       \
 | 
						|
            t2 = LD_DP(pa0 + 20);                                       \
 | 
						|
            src2r = (v2f64) __msa_pckev_d((v2i64) t5, (v2i64) t4);      \
 | 
						|
                                                                        \
 | 
						|
            tp1i OP2 src0i * x0r;                                       \
 | 
						|
            t3 = LD_DP(pa0 + 22);                                       \
 | 
						|
            src2i = (v2f64) __msa_pckod_d((v2i64) t5, (v2i64) t4);      \
 | 
						|
                                                                        \
 | 
						|
            tp2r += src2r * x2r;                                        \
 | 
						|
            x6 = LD_DP(x + 14 * inc_x2);                                \
 | 
						|
                                                                        \
 | 
						|
            tp2i OP1 src2r * x2i;                                       \
 | 
						|
            x7 = LD_DP(x + 15 * inc_x2);                                \
 | 
						|
                                                                        \
 | 
						|
            tp3r OP0 src2i * x2i;                                       \
 | 
						|
            t6 = LD_DP(pa0 + 28);                                       \
 | 
						|
                                                                        \
 | 
						|
            tp3i OP2 src2i * x2r;                                       \
 | 
						|
            t7 = LD_DP(pa0 + 30);                                       \
 | 
						|
                                                                        \
 | 
						|
            x1r = (v2f64) __msa_pckev_d((v2i64) x3, (v2i64) x2);        \
 | 
						|
            x1i = (v2f64) __msa_pckod_d((v2i64) x3, (v2i64) x2);        \
 | 
						|
            src1r = (v2f64) __msa_pckev_d((v2i64) t3, (v2i64) t2);      \
 | 
						|
            src1i = (v2f64) __msa_pckod_d((v2i64) t3, (v2i64) t2);      \
 | 
						|
                                                                        \
 | 
						|
            tp0r += src1r * x1r;                                        \
 | 
						|
            x3r = (v2f64) __msa_pckev_d((v2i64) x7, (v2i64) x6);        \
 | 
						|
                                                                        \
 | 
						|
            tp0i OP1 src1r * x1i;                                       \
 | 
						|
            x3i = (v2f64) __msa_pckod_d((v2i64) x7, (v2i64) x6);        \
 | 
						|
                                                                        \
 | 
						|
            tp1r OP0 src1i * x1i;                                       \
 | 
						|
            src3r = (v2f64) __msa_pckev_d((v2i64) t7, (v2i64) t6);      \
 | 
						|
                                                                        \
 | 
						|
            tp1i OP2 src1i * x1r;                                       \
 | 
						|
            src3i = (v2f64) __msa_pckod_d((v2i64) t7, (v2i64) t6);      \
 | 
						|
                                                                        \
 | 
						|
            tp2r += src3r * x3r;                                        \
 | 
						|
            tp2i OP1 src3r * x3i;                                       \
 | 
						|
            tp3r OP0 src3i * x3i;                                       \
 | 
						|
            tp3i OP2 src3i * x3r;                                       \
 | 
						|
                                                                        \
 | 
						|
            pa0 += 2 * 16;                                              \
 | 
						|
            x += inc_x2 * 16;                                           \
 | 
						|
                                                                        \
 | 
						|
            tp0r += tp1r + tp2r + tp3r;                                 \
 | 
						|
            tp0i += tp1i + tp2i + tp3i;                                 \
 | 
						|
        }                                                               \
 | 
						|
                                                                        \
 | 
						|
        if (m & 8)                                                      \
 | 
						|
        {                                                               \
 | 
						|
            ZLOAD_X8();                                                 \
 | 
						|
            ZGEMV_T_8x1();                                              \
 | 
						|
                                                                        \
 | 
						|
            pa0 += 2 * 8;                                               \
 | 
						|
            x += inc_x2 * 8;                                            \
 | 
						|
        }                                                               \
 | 
						|
                                                                        \
 | 
						|
        if (m & 4)                                                      \
 | 
						|
        {                                                               \
 | 
						|
            ZLOAD_X4();                                                 \
 | 
						|
            ZGEMV_T_4x1();                                              \
 | 
						|
                                                                        \
 | 
						|
            pa0 += 2 * 4;                                               \
 | 
						|
            x += inc_x2 * 4;                                            \
 | 
						|
        }                                                               \
 | 
						|
                                                                        \
 | 
						|
        if (m & 2)                                                      \
 | 
						|
        {                                                               \
 | 
						|
            ZLOAD_X2();                                                 \
 | 
						|
            ZGEMV_T_2x1();                                              \
 | 
						|
                                                                        \
 | 
						|
            pa0 += 2 * 2;                                               \
 | 
						|
            x += inc_x2 * 2;                                            \
 | 
						|
        }                                                               \
 | 
						|
                                                                        \
 | 
						|
        temp0r = tp0r[0] + tp0r[1];                                     \
 | 
						|
        temp0i = tp0i[0] + tp0i[1];                                     \
 | 
						|
                                                                        \
 | 
						|
        if (m & 1)                                                      \
 | 
						|
        {                                                               \
 | 
						|
            ZGEMV_T_1x1();                                              \
 | 
						|
                                                                        \
 | 
						|
            pa0 += 2;                                                   \
 | 
						|
            x += inc_x2;                                                \
 | 
						|
        }                                                               \
 | 
						|
                                                                        \
 | 
						|
        ZSCALE_STORE_Y1_GP();                                           \
 | 
						|
                                                                        \
 | 
						|
        A += lda2;                                                      \
 | 
						|
        y += inc_y2;                                                    \
 | 
						|
    }                                                                   \
 | 
						|
 | 
						|
int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alphar, FLOAT alphai,
 | 
						|
          FLOAT *A, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y,
 | 
						|
          BLASLONG inc_y, FLOAT *buffer)
 | 
						|
{
 | 
						|
    BLASLONG i, j, pref_offset;
 | 
						|
    BLASLONG inc_x2, inc_y2, lda2;
 | 
						|
    FLOAT *pa0, *pa0_pref;
 | 
						|
    FLOAT *srcx_org = x;
 | 
						|
    FLOAT temp0r, temp0i;
 | 
						|
    FLOAT res0r, res0i;
 | 
						|
    v2f64 zero = {0};
 | 
						|
    v2f64 x0, x1, x2, x3, x0r, x1r, x0i, x1i;
 | 
						|
    v2f64 x4, x5, x6, x7, x2r, x3r, x2i, x3i;
 | 
						|
    v2f64 t0, t1, t2, t3, t4, t5, t6, t7;
 | 
						|
    v2f64 src0r, src1r, src2r, src3r;
 | 
						|
    v2f64 src0i, src1i, src2i, src3i;
 | 
						|
    v2f64 tp0r, tp1r, tp2r, tp3r, tp0i, tp1i, tp2i, tp3i;
 | 
						|
 | 
						|
    lda2 = 2 * lda;
 | 
						|
 | 
						|
    inc_x2 = 2 * inc_x;
 | 
						|
    inc_y2 = 2 * inc_y;
 | 
						|
 | 
						|
    pref_offset = (uintptr_t)A & L1_DATA_LINESIZE;
 | 
						|
    pref_offset = L1_DATA_LINESIZE - pref_offset;
 | 
						|
    pref_offset = pref_offset / sizeof(FLOAT);
 | 
						|
 | 
						|
    if (2 == inc_x2)
 | 
						|
    {
 | 
						|
        #define ZLOAD_X8  ZLOAD_X8_VECTOR
 | 
						|
        #define ZLOAD_X4  ZLOAD_X4_VECTOR
 | 
						|
        #define ZLOAD_X2  ZLOAD_X2_VECTOR
 | 
						|
 | 
						|
        ZGEMV_T_MSA();
 | 
						|
 | 
						|
        #undef ZLOAD_X8
 | 
						|
        #undef ZLOAD_X4
 | 
						|
        #undef ZLOAD_X2
 | 
						|
    }
 | 
						|
    else
 | 
						|
    {
 | 
						|
        #define ZLOAD_X8  ZLOAD_X8_GP
 | 
						|
        #define ZLOAD_X4  ZLOAD_X4_GP
 | 
						|
        #define ZLOAD_X2  ZLOAD_X2_GP
 | 
						|
 | 
						|
        ZGEMV_T_MSA();
 | 
						|
 | 
						|
        #undef ZLOAD_X8
 | 
						|
        #undef ZLOAD_X4
 | 
						|
        #undef ZLOAD_X2
 | 
						|
    }
 | 
						|
    return(0);
 | 
						|
}
 | 
						|
 | 
						|
#undef OP0
 | 
						|
#undef OP1
 | 
						|
#undef OP2
 |