1874 lines
		
	
	
		
			30 KiB
		
	
	
	
		
			C
		
	
	
	
			
		
		
	
	
			1874 lines
		
	
	
		
			30 KiB
		
	
	
	
		
			C
		
	
	
	
/*********************************************************************/
 | 
						|
/* Copyright 2009, 2010 The University of Texas at Austin.           */
 | 
						|
/* All rights reserved.                                              */
 | 
						|
/*                                                                   */
 | 
						|
/* Redistribution and use in source and binary forms, with or        */
 | 
						|
/* without modification, are permitted provided that the following   */
 | 
						|
/* conditions are met:                                               */
 | 
						|
/*                                                                   */
 | 
						|
/*   1. Redistributions of source code must retain the above         */
 | 
						|
/*      copyright notice, this list of conditions and the following  */
 | 
						|
/*      disclaimer.                                                  */
 | 
						|
/*                                                                   */
 | 
						|
/*   2. Redistributions in binary form must reproduce the above      */
 | 
						|
/*      copyright notice, this list of conditions and the following  */
 | 
						|
/*      disclaimer in the documentation and/or other materials       */
 | 
						|
/*      provided with the distribution.                              */
 | 
						|
/*                                                                   */
 | 
						|
/*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */
 | 
						|
/*    AUSTIN  ``AS IS''  AND ANY  EXPRESS OR  IMPLIED WARRANTIES,    */
 | 
						|
/*    INCLUDING, BUT  NOT LIMITED  TO, THE IMPLIED  WARRANTIES OF    */
 | 
						|
/*    MERCHANTABILITY  AND FITNESS FOR  A PARTICULAR  PURPOSE ARE    */
 | 
						|
/*    DISCLAIMED.  IN  NO EVENT SHALL THE UNIVERSITY  OF TEXAS AT    */
 | 
						|
/*    AUSTIN OR CONTRIBUTORS BE  LIABLE FOR ANY DIRECT, INDIRECT,    */
 | 
						|
/*    INCIDENTAL,  SPECIAL, EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES    */
 | 
						|
/*    (INCLUDING, BUT  NOT LIMITED TO,  PROCUREMENT OF SUBSTITUTE    */
 | 
						|
/*    GOODS  OR  SERVICES; LOSS  OF  USE,  DATA,  OR PROFITS;  OR    */
 | 
						|
/*    BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF    */
 | 
						|
/*    LIABILITY, WHETHER  IN CONTRACT, STRICT  LIABILITY, OR TORT    */
 | 
						|
/*    (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY OUT    */
 | 
						|
/*    OF  THE  USE OF  THIS  SOFTWARE,  EVEN  IF ADVISED  OF  THE    */
 | 
						|
/*    POSSIBILITY OF SUCH DAMAGE.                                    */
 | 
						|
/*                                                                   */
 | 
						|
/* The views and conclusions contained in the software and           */
 | 
						|
/* documentation are those of the authors and should not be          */
 | 
						|
/* interpreted as representing official policies, either expressed   */
 | 
						|
/* or implied, of The University of Texas at Austin.                 */
 | 
						|
/*********************************************************************/
 | 
						|
 | 
						|
/* This implementation is completely wrong. I'll rewrite this */
 | 
						|
 | 
						|
#ifndef SYMCOPY_H
 | 
						|
#define SYMCOPY_H
 | 
						|
 | 
						|
#if !defined(XDOUBLE) || !defined(QUAD_PRECISION)
 | 
						|
 | 
						|
static __inline void SYMCOPY_L(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){
 | 
						|
  BLASLONG is, js;
 | 
						|
 | 
						|
  FLOAT *aa1, *aa2;
 | 
						|
  FLOAT *b1, *b2;
 | 
						|
  FLOAT *bb1, *bb2;
 | 
						|
  FLOAT *cc1, *cc2;
 | 
						|
  FLOAT a11, a12;
 | 
						|
  FLOAT a21, a22;
 | 
						|
 | 
						|
  b1 = b;
 | 
						|
  b2 = b;
 | 
						|
 | 
						|
  for (js = 0; js < m; js += 2){
 | 
						|
 | 
						|
    aa1 = a + 0 * lda;
 | 
						|
    aa2 = a + 1 * lda;
 | 
						|
    a  += 2 * lda + 2;
 | 
						|
 | 
						|
    bb1 = b1 + 0 * m;
 | 
						|
    bb2 = b1 + 1 * m;
 | 
						|
    b1 += 2 * m + 2;
 | 
						|
 | 
						|
    cc1 = b2 + 0 * m;
 | 
						|
    cc2 = b2 + 1 * m;
 | 
						|
    b2 += 2 * m + 2;
 | 
						|
 | 
						|
    if (m - js >= 2){
 | 
						|
 | 
						|
      a11 = *(aa1 + 0);
 | 
						|
      a21 = *(aa1 + 1);
 | 
						|
 | 
						|
      a22 = *(aa2 + 1);
 | 
						|
 | 
						|
      *(bb1 + 0) = a11;
 | 
						|
      *(bb1 + 1) = a21;
 | 
						|
      *(bb2 + 0) = a21;
 | 
						|
      *(bb2 + 1) = a22;
 | 
						|
      aa1 += 2;
 | 
						|
      aa2 += 2;
 | 
						|
      bb1 += 2;
 | 
						|
      bb2 += 2;
 | 
						|
 | 
						|
      cc1 += 2 * m;
 | 
						|
      cc2 += 2 * m;
 | 
						|
 | 
						|
      is = ((m - js - 2) >> 1);
 | 
						|
 | 
						|
      while (is > 0){
 | 
						|
	a11 = *(aa1 + 0);
 | 
						|
	a21 = *(aa1 + 1);
 | 
						|
	a12 = *(aa2 + 0);
 | 
						|
	a22 = *(aa2 + 1);
 | 
						|
 | 
						|
	aa1 += 2;
 | 
						|
	aa2 += 2;
 | 
						|
 | 
						|
	*(bb1 + 0) = a11;
 | 
						|
	*(bb1 + 1) = a21;
 | 
						|
	*(bb2 + 0) = a12;
 | 
						|
	*(bb2 + 1) = a22;
 | 
						|
 | 
						|
	*(cc1 + 0) = a11;
 | 
						|
	*(cc1 + 1) = a12;
 | 
						|
	*(cc2 + 0) = a21;
 | 
						|
	*(cc2 + 1) = a22;
 | 
						|
 | 
						|
	bb1 += 2;
 | 
						|
	bb2 += 2;
 | 
						|
 | 
						|
	cc1 += 2 * m;
 | 
						|
	cc2 += 2 * m;
 | 
						|
 | 
						|
	is --;
 | 
						|
      }
 | 
						|
 | 
						|
      is = ((m - js - 2) & 1);
 | 
						|
 | 
						|
      if (is == 1){
 | 
						|
	a11 = *(aa1 + 0);
 | 
						|
	a12 = *(aa2 + 0);
 | 
						|
 | 
						|
	*(bb1 + 0) = a11;
 | 
						|
	*(bb2 + 0) = a12;
 | 
						|
 | 
						|
	*(cc1 + 0) = a11;
 | 
						|
	*(cc1 + 1) = a12;
 | 
						|
      }
 | 
						|
    }
 | 
						|
 | 
						|
    if (m - js == 1){
 | 
						|
      a11 = *(aa1 + 0);
 | 
						|
      *(bb1 + 0) = a11;
 | 
						|
    }
 | 
						|
 | 
						|
  }
 | 
						|
}
 | 
						|
 | 
						|
static __inline void SYMCOPY_U(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){
 | 
						|
  BLASLONG is, js;
 | 
						|
 | 
						|
  FLOAT *aa1, *aa2;
 | 
						|
  FLOAT *b1, *b2;
 | 
						|
  FLOAT *bb1, *bb2;
 | 
						|
  FLOAT *cc1, *cc2;
 | 
						|
  FLOAT a11, a12;
 | 
						|
  FLOAT a21, a22;
 | 
						|
 | 
						|
  b1 = b;
 | 
						|
  b2 = b;
 | 
						|
 | 
						|
  for (js = 0; js < m; js += 2){
 | 
						|
 | 
						|
    aa1 = a + 0 * lda;
 | 
						|
    aa2 = a + 1 * lda;
 | 
						|
    a  += 2 * lda;
 | 
						|
 | 
						|
    bb1 = b1 + 0 * m;
 | 
						|
    bb2 = b1 + 1 * m;
 | 
						|
    b1 += 2 * m;
 | 
						|
 | 
						|
    cc1 = b2 + 0 * m;
 | 
						|
    cc2 = b2 + 1 * m;
 | 
						|
    b2 += 2;
 | 
						|
 | 
						|
    if (m - js >= 2){
 | 
						|
 | 
						|
      for (is = 0; is < js; is += 2){
 | 
						|
 | 
						|
	a11 = *(aa1 + 0);
 | 
						|
	a21 = *(aa1 + 1);
 | 
						|
	a12 = *(aa2 + 0);
 | 
						|
	a22 = *(aa2 + 1);
 | 
						|
 | 
						|
	aa1 += 2;
 | 
						|
	aa2 += 2;
 | 
						|
 | 
						|
	*(bb1 + 0) = a11;
 | 
						|
	*(bb1 + 1) = a21;
 | 
						|
	*(bb2 + 0) = a12;
 | 
						|
	*(bb2 + 1) = a22;
 | 
						|
 | 
						|
	*(cc1 + 0) = a11;
 | 
						|
	*(cc1 + 1) = a12;
 | 
						|
	*(cc2 + 0) = a21;
 | 
						|
	*(cc2 + 1) = a22;
 | 
						|
 | 
						|
	bb1 += 2;
 | 
						|
	bb2 += 2;
 | 
						|
 | 
						|
	cc1 += 2 * m;
 | 
						|
	cc2 += 2 * m;
 | 
						|
      }
 | 
						|
 | 
						|
      a11 = *(aa1 + 0);
 | 
						|
 | 
						|
      a12 = *(aa2 + 0);
 | 
						|
      a22 = *(aa2 + 1);
 | 
						|
 | 
						|
      *(bb1 + 0) = a11;
 | 
						|
      *(bb1 + 1) = a12;
 | 
						|
      *(bb2 + 0) = a12;
 | 
						|
      *(bb2 + 1) = a22;
 | 
						|
    }
 | 
						|
 | 
						|
    if (m - js == 1){
 | 
						|
      for (is = 0; is < js; is += 2){
 | 
						|
 | 
						|
	a11 = *(aa1 + 0);
 | 
						|
	a21 = *(aa1 + 1);
 | 
						|
	aa1 += 2;
 | 
						|
 | 
						|
	*(bb1 + 0) = a11;
 | 
						|
	*(bb1 + 1) = a21;
 | 
						|
	*(cc1 + 0) = a11;
 | 
						|
	*(cc2 + 0) = a21;
 | 
						|
	bb1 += 2;
 | 
						|
 | 
						|
	cc1 += 2 * m;
 | 
						|
	cc2 += 2 * m;
 | 
						|
      }
 | 
						|
 | 
						|
      a11 = *(aa1 + 0);
 | 
						|
      *(bb1 + 0) = a11;
 | 
						|
    }
 | 
						|
  }
 | 
						|
}
 | 
						|
 | 
						|
 | 
						|
static __inline void ZSYMCOPY_L(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){
 | 
						|
  BLASLONG is, js;
 | 
						|
 | 
						|
  FLOAT *aa1, *aa2;
 | 
						|
  FLOAT *b1, *b2;
 | 
						|
  FLOAT *bb1, *bb2;
 | 
						|
  FLOAT *cc1, *cc2;
 | 
						|
  FLOAT a11, a21, a31, a41;
 | 
						|
  FLOAT a12, a22, a32, a42;
 | 
						|
 | 
						|
  b1 = b;
 | 
						|
  b2 = b;
 | 
						|
 | 
						|
  lda *= 2;
 | 
						|
 | 
						|
  for (js = 0; js < m; js += 2){
 | 
						|
 | 
						|
    aa1 = a + 0 * lda;
 | 
						|
    aa2 = a + 1 * lda;
 | 
						|
    a  += 2 * lda + 4;
 | 
						|
 | 
						|
    bb1 = b1 + 0 * m;
 | 
						|
    bb2 = b1 + 2 * m;
 | 
						|
    b1 += 4 * m + 4;
 | 
						|
 | 
						|
    cc1 = b2 + 0 * m;
 | 
						|
    cc2 = b2 + 2 * m;
 | 
						|
    b2 += 4 * m + 4;
 | 
						|
 | 
						|
    if (m - js >= 2){
 | 
						|
 | 
						|
      a11 = *(aa1 + 0);
 | 
						|
      a21 = *(aa1 + 1);
 | 
						|
      a31 = *(aa1 + 2);
 | 
						|
      a41 = *(aa1 + 3);
 | 
						|
 | 
						|
      a12 = *(aa2 + 2);
 | 
						|
      a22 = *(aa2 + 3);
 | 
						|
 | 
						|
      *(bb1 + 0) = a11;
 | 
						|
      *(bb1 + 1) = a21;
 | 
						|
      *(bb1 + 2) = a31;
 | 
						|
      *(bb1 + 3) = a41;
 | 
						|
 | 
						|
      *(bb2 + 0) = a31;
 | 
						|
      *(bb2 + 1) = a41;
 | 
						|
      *(bb2 + 2) = a12;
 | 
						|
      *(bb2 + 3) = a22;
 | 
						|
 | 
						|
      aa1 += 4;
 | 
						|
      aa2 += 4;
 | 
						|
      bb1 += 4;
 | 
						|
      bb2 += 4;
 | 
						|
 | 
						|
      cc1 += 4 * m;
 | 
						|
      cc2 += 4 * m;
 | 
						|
 | 
						|
      is = ((m - js - 2) >> 1);
 | 
						|
 | 
						|
      while (is > 0){
 | 
						|
	a11 = *(aa1 + 0);
 | 
						|
	a21 = *(aa1 + 1);
 | 
						|
	a31 = *(aa1 + 2);
 | 
						|
	a41 = *(aa1 + 3);
 | 
						|
 | 
						|
	a12 = *(aa2 + 0);
 | 
						|
	a22 = *(aa2 + 1);
 | 
						|
	a32 = *(aa2 + 2);
 | 
						|
	a42 = *(aa2 + 3);
 | 
						|
 | 
						|
	aa1 += 4;
 | 
						|
	aa2 += 4;
 | 
						|
 | 
						|
	*(bb1 + 0) = a11;
 | 
						|
	*(bb1 + 1) = a21;
 | 
						|
	*(bb1 + 2) = a31;
 | 
						|
	*(bb1 + 3) = a41;
 | 
						|
 | 
						|
	*(bb2 + 0) = a12;
 | 
						|
	*(bb2 + 1) = a22;
 | 
						|
	*(bb2 + 2) = a32;
 | 
						|
	*(bb2 + 3) = a42;
 | 
						|
 | 
						|
	*(cc1 + 0) = a11;
 | 
						|
	*(cc1 + 1) = a21;
 | 
						|
	*(cc1 + 2) = a12;
 | 
						|
	*(cc1 + 3) = a22;
 | 
						|
 | 
						|
	*(cc2 + 0) = a31;
 | 
						|
	*(cc2 + 1) = a41;
 | 
						|
	*(cc2 + 2) = a32;
 | 
						|
	*(cc2 + 3) = a42;
 | 
						|
 | 
						|
	bb1 += 4;
 | 
						|
	bb2 += 4;
 | 
						|
 | 
						|
	cc1 += 4 * m;
 | 
						|
	cc2 += 4 * m;
 | 
						|
 | 
						|
	is --;
 | 
						|
      }
 | 
						|
 | 
						|
      if (m & 1){
 | 
						|
	a11 = *(aa1 + 0);
 | 
						|
	a21 = *(aa1 + 1);
 | 
						|
	a12 = *(aa2 + 0);
 | 
						|
	a22 = *(aa2 + 1);
 | 
						|
 | 
						|
	*(bb1 + 0) = a11;
 | 
						|
	*(bb1 + 1) = a21;
 | 
						|
	*(bb2 + 0) = a12;
 | 
						|
	*(bb2 + 1) = a22;
 | 
						|
 | 
						|
	*(cc1 + 0) = a11;
 | 
						|
	*(cc1 + 1) = a21;
 | 
						|
	*(cc1 + 2) = a12;
 | 
						|
	*(cc1 + 3) = a22;
 | 
						|
      }
 | 
						|
    }
 | 
						|
 | 
						|
    if (m - js == 1){
 | 
						|
      a11 = *(aa1 + 0);
 | 
						|
      a21 = *(aa1 + 1);
 | 
						|
      *(bb1 + 0) = a11;
 | 
						|
      *(bb1 + 1) = a21;
 | 
						|
    }
 | 
						|
 | 
						|
  }
 | 
						|
}
 | 
						|
 | 
						|
static __inline void ZSYMCOPY_U(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){
 | 
						|
  BLASLONG is, js;
 | 
						|
 | 
						|
  FLOAT *aa1, *aa2;
 | 
						|
  FLOAT *b1, *b2;
 | 
						|
  FLOAT *bb1, *bb2;
 | 
						|
  FLOAT *cc1, *cc2;
 | 
						|
  FLOAT a11, a21, a31, a41;
 | 
						|
  FLOAT a12, a22, a32, a42;
 | 
						|
 | 
						|
  b1 = b;
 | 
						|
  b2 = b;
 | 
						|
 | 
						|
  lda *= 2;
 | 
						|
 | 
						|
  for (js = 0; js < m; js += 2){
 | 
						|
 | 
						|
    aa1 = a + 0 * lda;
 | 
						|
    aa2 = a + 1 * lda;
 | 
						|
    a  += 2 * lda;
 | 
						|
 | 
						|
    bb1 = b1 + 0 * m;
 | 
						|
    bb2 = b1 + 2 * m;
 | 
						|
    b1 += 4 * m;
 | 
						|
 | 
						|
    cc1 = b2 + 0 * m;
 | 
						|
    cc2 = b2 + 2 * m;
 | 
						|
    b2 += 4;
 | 
						|
 | 
						|
    if (m - js >= 2){
 | 
						|
 | 
						|
      for (is = 0; is < js; is += 2){
 | 
						|
 | 
						|
	a11 = *(aa1 + 0);
 | 
						|
	a21 = *(aa1 + 1);
 | 
						|
	a31 = *(aa1 + 2);
 | 
						|
	a41 = *(aa1 + 3);
 | 
						|
 | 
						|
	a12 = *(aa2 + 0);
 | 
						|
	a22 = *(aa2 + 1);
 | 
						|
	a32 = *(aa2 + 2);
 | 
						|
	a42 = *(aa2 + 3);
 | 
						|
 | 
						|
	aa1 += 4;
 | 
						|
	aa2 += 4;
 | 
						|
 | 
						|
	*(bb1 + 0) = a11;
 | 
						|
	*(bb1 + 1) = a21;
 | 
						|
	*(bb1 + 2) = a31;
 | 
						|
	*(bb1 + 3) = a41;
 | 
						|
 | 
						|
	*(bb2 + 0) = a12;
 | 
						|
	*(bb2 + 1) = a22;
 | 
						|
	*(bb2 + 2) = a32;
 | 
						|
	*(bb2 + 3) = a42;
 | 
						|
 | 
						|
	*(cc1 + 0) = a11;
 | 
						|
	*(cc1 + 1) = a21;
 | 
						|
	*(cc1 + 2) = a12;
 | 
						|
	*(cc1 + 3) = a22;
 | 
						|
 | 
						|
	*(cc2 + 0) = a31;
 | 
						|
	*(cc2 + 1) = a41;
 | 
						|
	*(cc2 + 2) = a32;
 | 
						|
	*(cc2 + 3) = a42;
 | 
						|
 | 
						|
	bb1 += 4;
 | 
						|
	bb2 += 4;
 | 
						|
 | 
						|
	cc1 += 4 * m;
 | 
						|
	cc2 += 4 * m;
 | 
						|
      }
 | 
						|
 | 
						|
      a11 = *(aa1 + 0);
 | 
						|
      a21 = *(aa1 + 1);
 | 
						|
 | 
						|
      a12 = *(aa2 + 0);
 | 
						|
      a22 = *(aa2 + 1);
 | 
						|
      a32 = *(aa2 + 2);
 | 
						|
      a42 = *(aa2 + 3);
 | 
						|
 | 
						|
      *(bb1 + 0) = a11;
 | 
						|
      *(bb1 + 1) = a21;
 | 
						|
      *(bb1 + 2) = a12;
 | 
						|
      *(bb1 + 3) = a22;
 | 
						|
 | 
						|
      *(bb2 + 0) = a12;
 | 
						|
      *(bb2 + 1) = a22;
 | 
						|
      *(bb2 + 2) = a32;
 | 
						|
      *(bb2 + 3) = a42;
 | 
						|
    }
 | 
						|
 | 
						|
    if (m - js == 1){
 | 
						|
      for (is = 0; is < js; is += 2){
 | 
						|
 | 
						|
	a11 = *(aa1 + 0);
 | 
						|
	a21 = *(aa1 + 1);
 | 
						|
	a31 = *(aa1 + 2);
 | 
						|
	a41 = *(aa1 + 3);
 | 
						|
	aa1 += 4;
 | 
						|
 | 
						|
	*(bb1 + 0) = a11;
 | 
						|
	*(bb1 + 1) = a21;
 | 
						|
	*(bb1 + 2) = a31;
 | 
						|
	*(bb1 + 3) = a41;
 | 
						|
 | 
						|
	*(cc1 + 0) = a11;
 | 
						|
	*(cc1 + 1) = a21;
 | 
						|
	*(cc2 + 0) = a31;
 | 
						|
	*(cc2 + 1) = a41;
 | 
						|
	bb1 += 4;
 | 
						|
 | 
						|
	cc1 += 4 * m;
 | 
						|
	cc2 += 4 * m;
 | 
						|
      }
 | 
						|
 | 
						|
      a11 = *(aa1 + 0);
 | 
						|
      a21 = *(aa1 + 1);
 | 
						|
      *(bb1 + 0) = a11;
 | 
						|
      *(bb1 + 1) = a21;
 | 
						|
    }
 | 
						|
  }
 | 
						|
}
 | 
						|
 | 
						|
static __inline void ZHEMCOPY_L(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){
 | 
						|
  BLASLONG is, js;
 | 
						|
 | 
						|
  FLOAT *aa1, *aa2;
 | 
						|
  FLOAT *b1, *b2;
 | 
						|
  FLOAT *bb1, *bb2;
 | 
						|
  FLOAT *cc1, *cc2;
 | 
						|
  FLOAT a11, a21, a31, a41;
 | 
						|
  FLOAT a12, a22, a32, a42;
 | 
						|
 | 
						|
  b1 = b;
 | 
						|
  b2 = b;
 | 
						|
 | 
						|
  lda *= 2;
 | 
						|
 | 
						|
  for (js = 0; js < m; js += 2){
 | 
						|
 | 
						|
    aa1 = a + 0 * lda;
 | 
						|
    aa2 = a + 1 * lda;
 | 
						|
    a  += 2 * lda + 4;
 | 
						|
 | 
						|
    bb1 = b1 + 0 * m;
 | 
						|
    bb2 = b1 + 2 * m;
 | 
						|
    b1 += 4 * m + 4;
 | 
						|
 | 
						|
    cc1 = b2 + 0 * m;
 | 
						|
    cc2 = b2 + 2 * m;
 | 
						|
    b2 += 4 * m + 4;
 | 
						|
 | 
						|
    if (m - js >= 2){
 | 
						|
 | 
						|
      a11 = *(aa1 + 0);
 | 
						|
      a31 = *(aa1 + 2);
 | 
						|
      a41 = *(aa1 + 3);
 | 
						|
 | 
						|
      a12 = *(aa2 + 2);
 | 
						|
 | 
						|
      *(bb1 + 0) = a11;
 | 
						|
      *(bb1 + 1) = 0.;
 | 
						|
      *(bb1 + 2) = a31;
 | 
						|
      *(bb1 + 3) = a41;
 | 
						|
 | 
						|
      *(bb2 + 0) = a31;
 | 
						|
      *(bb2 + 1) = -a41;
 | 
						|
      *(bb2 + 2) = a12;
 | 
						|
      *(bb2 + 3) = 0.;
 | 
						|
 | 
						|
      aa1 += 4;
 | 
						|
      aa2 += 4;
 | 
						|
      bb1 += 4;
 | 
						|
      bb2 += 4;
 | 
						|
 | 
						|
      cc1 += 4 * m;
 | 
						|
      cc2 += 4 * m;
 | 
						|
 | 
						|
      is = ((m - js - 2) >> 1);
 | 
						|
 | 
						|
      while (is > 0){
 | 
						|
	a11 = *(aa1 + 0);
 | 
						|
	a21 = *(aa1 + 1);
 | 
						|
	a31 = *(aa1 + 2);
 | 
						|
	a41 = *(aa1 + 3);
 | 
						|
 | 
						|
	a12 = *(aa2 + 0);
 | 
						|
	a22 = *(aa2 + 1);
 | 
						|
	a32 = *(aa2 + 2);
 | 
						|
	a42 = *(aa2 + 3);
 | 
						|
 | 
						|
	aa1 += 4;
 | 
						|
	aa2 += 4;
 | 
						|
 | 
						|
	*(bb1 + 0) = a11;
 | 
						|
	*(bb1 + 1) = a21;
 | 
						|
	*(bb1 + 2) = a31;
 | 
						|
	*(bb1 + 3) = a41;
 | 
						|
 | 
						|
	*(bb2 + 0) = a12;
 | 
						|
	*(bb2 + 1) = a22;
 | 
						|
	*(bb2 + 2) = a32;
 | 
						|
	*(bb2 + 3) = a42;
 | 
						|
 | 
						|
	*(cc1 + 0) = a11;
 | 
						|
	*(cc1 + 1) = -a21;
 | 
						|
	*(cc1 + 2) = a12;
 | 
						|
	*(cc1 + 3) = -a22;
 | 
						|
 | 
						|
	*(cc2 + 0) = a31;
 | 
						|
	*(cc2 + 1) = -a41;
 | 
						|
	*(cc2 + 2) = a32;
 | 
						|
	*(cc2 + 3) = -a42;
 | 
						|
 | 
						|
	bb1 += 4;
 | 
						|
	bb2 += 4;
 | 
						|
 | 
						|
	cc1 += 4 * m;
 | 
						|
	cc2 += 4 * m;
 | 
						|
 | 
						|
	is --;
 | 
						|
      }
 | 
						|
 | 
						|
      if (m & 1){
 | 
						|
	a11 = *(aa1 + 0);
 | 
						|
	a21 = *(aa1 + 1);
 | 
						|
	a12 = *(aa2 + 0);
 | 
						|
	a22 = *(aa2 + 1);
 | 
						|
 | 
						|
	*(bb1 + 0) = a11;
 | 
						|
	*(bb1 + 1) = a21;
 | 
						|
	*(bb2 + 0) = a12;
 | 
						|
	*(bb2 + 1) = a22;
 | 
						|
 | 
						|
	*(cc1 + 0) = a11;
 | 
						|
	*(cc1 + 1) = -a21;
 | 
						|
	*(cc1 + 2) = a12;
 | 
						|
	*(cc1 + 3) = -a22;
 | 
						|
      }
 | 
						|
    }
 | 
						|
 | 
						|
    if (m - js == 1){
 | 
						|
      a11 = *(aa1 + 0);
 | 
						|
      *(bb1 + 0) = a11;
 | 
						|
      *(bb1 + 1) = 0.;
 | 
						|
    }
 | 
						|
 | 
						|
  }
 | 
						|
}
 | 
						|
 | 
						|
static __inline void ZHEMCOPY_U(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){
 | 
						|
  BLASLONG is, js;
 | 
						|
 | 
						|
  FLOAT *aa1, *aa2;
 | 
						|
  FLOAT *b1, *b2;
 | 
						|
  FLOAT *bb1, *bb2;
 | 
						|
  FLOAT *cc1, *cc2;
 | 
						|
  FLOAT a11, a21, a31, a41;
 | 
						|
  FLOAT a12, a22, a32, a42;
 | 
						|
 | 
						|
  b1 = b;
 | 
						|
  b2 = b;
 | 
						|
 | 
						|
  lda *= 2;
 | 
						|
 | 
						|
  for (js = 0; js < m; js += 2){
 | 
						|
 | 
						|
    aa1 = a + 0 * lda;
 | 
						|
    aa2 = a + 1 * lda;
 | 
						|
    a  += 2 * lda;
 | 
						|
 | 
						|
    bb1 = b1 + 0 * m;
 | 
						|
    bb2 = b1 + 2 * m;
 | 
						|
    b1 += 4 * m;
 | 
						|
 | 
						|
    cc1 = b2 + 0 * m;
 | 
						|
    cc2 = b2 + 2 * m;
 | 
						|
    b2 += 4;
 | 
						|
 | 
						|
    if (m - js >= 2){
 | 
						|
 | 
						|
      for (is = 0; is < js; is += 2){
 | 
						|
 | 
						|
	a11 = *(aa1 + 0);
 | 
						|
	a21 = *(aa1 + 1);
 | 
						|
	a31 = *(aa1 + 2);
 | 
						|
	a41 = *(aa1 + 3);
 | 
						|
 | 
						|
	a12 = *(aa2 + 0);
 | 
						|
	a22 = *(aa2 + 1);
 | 
						|
	a32 = *(aa2 + 2);
 | 
						|
	a42 = *(aa2 + 3);
 | 
						|
 | 
						|
	aa1 += 4;
 | 
						|
	aa2 += 4;
 | 
						|
 | 
						|
	*(bb1 + 0) = a11;
 | 
						|
	*(bb1 + 1) = a21;
 | 
						|
	*(bb1 + 2) = a31;
 | 
						|
	*(bb1 + 3) = a41;
 | 
						|
 | 
						|
	*(bb2 + 0) = a12;
 | 
						|
	*(bb2 + 1) = a22;
 | 
						|
	*(bb2 + 2) = a32;
 | 
						|
	*(bb2 + 3) = a42;
 | 
						|
 | 
						|
	*(cc1 + 0) = a11;
 | 
						|
	*(cc1 + 1) = -a21;
 | 
						|
	*(cc1 + 2) = a12;
 | 
						|
	*(cc1 + 3) = -a22;
 | 
						|
 | 
						|
	*(cc2 + 0) = a31;
 | 
						|
	*(cc2 + 1) = -a41;
 | 
						|
	*(cc2 + 2) = a32;
 | 
						|
	*(cc2 + 3) = -a42;
 | 
						|
 | 
						|
	bb1 += 4;
 | 
						|
	bb2 += 4;
 | 
						|
 | 
						|
	cc1 += 4 * m;
 | 
						|
	cc2 += 4 * m;
 | 
						|
      }
 | 
						|
 | 
						|
      a11 = *(aa1 + 0);
 | 
						|
 | 
						|
      a12 = *(aa2 + 0);
 | 
						|
      a22 = *(aa2 + 1);
 | 
						|
      a32 = *(aa2 + 2);
 | 
						|
 | 
						|
      *(bb1 + 0) = a11;
 | 
						|
      *(bb1 + 1) = 0.;
 | 
						|
      *(bb1 + 2) = a12;
 | 
						|
      *(bb1 + 3) = -a22;
 | 
						|
 | 
						|
      *(bb2 + 0) = a12;
 | 
						|
      *(bb2 + 1) = a22;
 | 
						|
      *(bb2 + 2) = a32;
 | 
						|
      *(bb2 + 3) = 0.;
 | 
						|
    }
 | 
						|
 | 
						|
    if (m - js == 1){
 | 
						|
      for (is = 0; is < js; is += 2){
 | 
						|
 | 
						|
	a11 = *(aa1 + 0);
 | 
						|
	a21 = *(aa1 + 1);
 | 
						|
	a31 = *(aa1 + 2);
 | 
						|
	a41 = *(aa1 + 3);
 | 
						|
	aa1 += 4;
 | 
						|
 | 
						|
	*(bb1 + 0) = a11;
 | 
						|
	*(bb1 + 1) = a21;
 | 
						|
	*(bb1 + 2) = a31;
 | 
						|
	*(bb1 + 3) = a41;
 | 
						|
 | 
						|
	*(cc1 + 0) = a11;
 | 
						|
	*(cc1 + 1) = -a21;
 | 
						|
	*(cc2 + 0) = a31;
 | 
						|
	*(cc2 + 1) = -a41;
 | 
						|
	bb1 += 4;
 | 
						|
 | 
						|
	cc1 += 4 * m;
 | 
						|
	cc2 += 4 * m;
 | 
						|
      }
 | 
						|
 | 
						|
      a11 = *(aa1 + 0);
 | 
						|
      *(bb1 + 0) = a11;
 | 
						|
      *(bb1 + 1) = 0.;
 | 
						|
    }
 | 
						|
  }
 | 
						|
}
 | 
						|
 | 
						|
 | 
						|
static __inline void ZHEMCOPY_M(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){
 | 
						|
  BLASLONG is, js;
 | 
						|
 | 
						|
  FLOAT *aa1, *aa2;
 | 
						|
  FLOAT *b1, *b2;
 | 
						|
  FLOAT *bb1, *bb2;
 | 
						|
  FLOAT *cc1, *cc2;
 | 
						|
  FLOAT a11, a21, a31, a41;
 | 
						|
  FLOAT a12, a22, a32, a42;
 | 
						|
 | 
						|
  b1 = b;
 | 
						|
  b2 = b;
 | 
						|
 | 
						|
  lda *= 2;
 | 
						|
 | 
						|
  for (js = 0; js < m; js += 2){
 | 
						|
 | 
						|
    aa1 = a + 0 * lda;
 | 
						|
    aa2 = a + 1 * lda;
 | 
						|
    a  += 2 * lda + 4;
 | 
						|
 | 
						|
    bb1 = b1 + 0 * m;
 | 
						|
    bb2 = b1 + 2 * m;
 | 
						|
    b1 += 4 * m + 4;
 | 
						|
 | 
						|
    cc1 = b2 + 0 * m;
 | 
						|
    cc2 = b2 + 2 * m;
 | 
						|
    b2 += 4 * m + 4;
 | 
						|
 | 
						|
    if (m - js >= 2){
 | 
						|
 | 
						|
      a11 = *(aa1 + 0);
 | 
						|
      a31 = *(aa1 + 2);
 | 
						|
      a41 = *(aa1 + 3);
 | 
						|
 | 
						|
      a12 = *(aa2 + 2);
 | 
						|
 | 
						|
      *(bb1 + 0) = a11;
 | 
						|
      *(bb1 + 1) = 0.;
 | 
						|
      *(bb1 + 2) = a31;
 | 
						|
      *(bb1 + 3) = -a41;
 | 
						|
 | 
						|
      *(bb2 + 0) = a31;
 | 
						|
      *(bb2 + 1) = a41;
 | 
						|
      *(bb2 + 2) = a12;
 | 
						|
      *(bb2 + 3) = 0.;
 | 
						|
 | 
						|
      aa1 += 4;
 | 
						|
      aa2 += 4;
 | 
						|
      bb1 += 4;
 | 
						|
      bb2 += 4;
 | 
						|
 | 
						|
      cc1 += 4 * m;
 | 
						|
      cc2 += 4 * m;
 | 
						|
 | 
						|
      is = ((m - js - 2) >> 1);
 | 
						|
 | 
						|
      while (is > 0){
 | 
						|
	a11 = *(aa1 + 0);
 | 
						|
	a21 = *(aa1 + 1);
 | 
						|
	a31 = *(aa1 + 2);
 | 
						|
	a41 = *(aa1 + 3);
 | 
						|
 | 
						|
	a12 = *(aa2 + 0);
 | 
						|
	a22 = *(aa2 + 1);
 | 
						|
	a32 = *(aa2 + 2);
 | 
						|
	a42 = *(aa2 + 3);
 | 
						|
 | 
						|
	aa1 += 4;
 | 
						|
	aa2 += 4;
 | 
						|
 | 
						|
	*(bb1 + 0) = a11;
 | 
						|
	*(bb1 + 1) = -a21;
 | 
						|
	*(bb1 + 2) = a31;
 | 
						|
	*(bb1 + 3) = -a41;
 | 
						|
 | 
						|
	*(bb2 + 0) = a12;
 | 
						|
	*(bb2 + 1) = -a22;
 | 
						|
	*(bb2 + 2) = a32;
 | 
						|
	*(bb2 + 3) = -a42;
 | 
						|
 | 
						|
	*(cc1 + 0) = a11;
 | 
						|
	*(cc1 + 1) = a21;
 | 
						|
	*(cc1 + 2) = a12;
 | 
						|
	*(cc1 + 3) = a22;
 | 
						|
 | 
						|
	*(cc2 + 0) = a31;
 | 
						|
	*(cc2 + 1) = a41;
 | 
						|
	*(cc2 + 2) = a32;
 | 
						|
	*(cc2 + 3) = a42;
 | 
						|
 | 
						|
	bb1 += 4;
 | 
						|
	bb2 += 4;
 | 
						|
 | 
						|
	cc1 += 4 * m;
 | 
						|
	cc2 += 4 * m;
 | 
						|
 | 
						|
	is --;
 | 
						|
      }
 | 
						|
 | 
						|
      if (m & 1){
 | 
						|
	a11 = *(aa1 + 0);
 | 
						|
	a21 = *(aa1 + 1);
 | 
						|
	a12 = *(aa2 + 0);
 | 
						|
	a22 = *(aa2 + 1);
 | 
						|
 | 
						|
	*(bb1 + 0) = a11;
 | 
						|
	*(bb1 + 1) = -a21;
 | 
						|
	*(bb2 + 0) = a12;
 | 
						|
	*(bb2 + 1) = -a22;
 | 
						|
 | 
						|
	*(cc1 + 0) = a11;
 | 
						|
	*(cc1 + 1) = a21;
 | 
						|
	*(cc1 + 2) = a12;
 | 
						|
	*(cc1 + 3) = a22;
 | 
						|
      }
 | 
						|
    }
 | 
						|
 | 
						|
    if (m - js == 1){
 | 
						|
      a11 = *(aa1 + 0);
 | 
						|
      *(bb1 + 0) = a11;
 | 
						|
      *(bb1 + 1) = 0.;
 | 
						|
    }
 | 
						|
 | 
						|
  }
 | 
						|
}
 | 
						|
 | 
						|
static __inline void ZHEMCOPY_V(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){
 | 
						|
  BLASLONG is, js;
 | 
						|
 | 
						|
  FLOAT *aa1, *aa2;
 | 
						|
  FLOAT *b1, *b2;
 | 
						|
  FLOAT *bb1, *bb2;
 | 
						|
  FLOAT *cc1, *cc2;
 | 
						|
  FLOAT a11, a21, a31, a41;
 | 
						|
  FLOAT a12, a22, a32, a42;
 | 
						|
 | 
						|
  b1 = b;
 | 
						|
  b2 = b;
 | 
						|
 | 
						|
  lda *= 2;
 | 
						|
 | 
						|
  for (js = 0; js < m; js += 2){
 | 
						|
 | 
						|
    aa1 = a + 0 * lda;
 | 
						|
    aa2 = a + 1 * lda;
 | 
						|
    a  += 2 * lda;
 | 
						|
 | 
						|
    bb1 = b1 + 0 * m;
 | 
						|
    bb2 = b1 + 2 * m;
 | 
						|
    b1 += 4 * m;
 | 
						|
 | 
						|
    cc1 = b2 + 0 * m;
 | 
						|
    cc2 = b2 + 2 * m;
 | 
						|
    b2 += 4;
 | 
						|
 | 
						|
    if (m - js >= 2){
 | 
						|
 | 
						|
      for (is = 0; is < js; is += 2){
 | 
						|
 | 
						|
	a11 = *(aa1 + 0);
 | 
						|
	a21 = *(aa1 + 1);
 | 
						|
	a31 = *(aa1 + 2);
 | 
						|
	a41 = *(aa1 + 3);
 | 
						|
 | 
						|
	a12 = *(aa2 + 0);
 | 
						|
	a22 = *(aa2 + 1);
 | 
						|
	a32 = *(aa2 + 2);
 | 
						|
	a42 = *(aa2 + 3);
 | 
						|
 | 
						|
	aa1 += 4;
 | 
						|
	aa2 += 4;
 | 
						|
 | 
						|
	*(bb1 + 0) = a11;
 | 
						|
	*(bb1 + 1) = -a21;
 | 
						|
	*(bb1 + 2) = a31;
 | 
						|
	*(bb1 + 3) = -a41;
 | 
						|
 | 
						|
	*(bb2 + 0) = a12;
 | 
						|
	*(bb2 + 1) = -a22;
 | 
						|
	*(bb2 + 2) = a32;
 | 
						|
	*(bb2 + 3) = -a42;
 | 
						|
 | 
						|
	*(cc1 + 0) = a11;
 | 
						|
	*(cc1 + 1) = a21;
 | 
						|
	*(cc1 + 2) = a12;
 | 
						|
	*(cc1 + 3) = a22;
 | 
						|
 | 
						|
	*(cc2 + 0) = a31;
 | 
						|
	*(cc2 + 1) = a41;
 | 
						|
	*(cc2 + 2) = a32;
 | 
						|
	*(cc2 + 3) = a42;
 | 
						|
 | 
						|
	bb1 += 4;
 | 
						|
	bb2 += 4;
 | 
						|
 | 
						|
	cc1 += 4 * m;
 | 
						|
	cc2 += 4 * m;
 | 
						|
      }
 | 
						|
 | 
						|
      a11 = *(aa1 + 0);
 | 
						|
 | 
						|
      a12 = *(aa2 + 0);
 | 
						|
      a22 = *(aa2 + 1);
 | 
						|
      a32 = *(aa2 + 2);
 | 
						|
 | 
						|
      *(bb1 + 0) = a11;
 | 
						|
      *(bb1 + 1) = 0.;
 | 
						|
      *(bb1 + 2) = a12;
 | 
						|
      *(bb1 + 3) = a22;
 | 
						|
 | 
						|
      *(bb2 + 0) = a12;
 | 
						|
      *(bb2 + 1) = -a22;
 | 
						|
      *(bb2 + 2) = a32;
 | 
						|
      *(bb2 + 3) = 0.;
 | 
						|
    }
 | 
						|
 | 
						|
    if (m - js == 1){
 | 
						|
      for (is = 0; is < js; is += 2){
 | 
						|
 | 
						|
	a11 = *(aa1 + 0);
 | 
						|
	a21 = *(aa1 + 1);
 | 
						|
	a31 = *(aa1 + 2);
 | 
						|
	a41 = *(aa1 + 3);
 | 
						|
	aa1 += 4;
 | 
						|
 | 
						|
	*(bb1 + 0) = a11;
 | 
						|
	*(bb1 + 1) = -a21;
 | 
						|
	*(bb1 + 2) = a31;
 | 
						|
	*(bb1 + 3) = -a41;
 | 
						|
 | 
						|
	*(cc1 + 0) = a11;
 | 
						|
	*(cc1 + 1) = a21;
 | 
						|
	*(cc2 + 0) = a31;
 | 
						|
	*(cc2 + 1) = a41;
 | 
						|
	bb1 += 4;
 | 
						|
 | 
						|
	cc1 += 4 * m;
 | 
						|
	cc2 += 4 * m;
 | 
						|
      }
 | 
						|
 | 
						|
      a11 = *(aa1 + 0);
 | 
						|
      *(bb1 + 0) = a11;
 | 
						|
      *(bb1 + 1) = 0.;
 | 
						|
    }
 | 
						|
  }
 | 
						|
}
 | 
						|
 | 
						|
 | 
						|
static __inline void TRMCOPY_NL(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){
 | 
						|
  BLASLONG is, js;
 | 
						|
 | 
						|
  FLOAT *aa1, *aa2;
 | 
						|
  FLOAT *b1, *b2;
 | 
						|
  FLOAT *bb1, *bb2;
 | 
						|
  FLOAT *cc1, *cc2;
 | 
						|
  FLOAT a11, a12;
 | 
						|
  FLOAT a21, a22;
 | 
						|
 | 
						|
  b1 = b;
 | 
						|
  b2 = b;
 | 
						|
 | 
						|
  for (js = 0; js < m; js += 2){
 | 
						|
 | 
						|
    aa1 = a + 0 * lda;
 | 
						|
    aa2 = a + 1 * lda;
 | 
						|
    a  += 2 * lda + 2;
 | 
						|
 | 
						|
    bb1 = b1 + 0 * m;
 | 
						|
    bb2 = b1 + 1 * m;
 | 
						|
    b1 += 2 * m + 2;
 | 
						|
 | 
						|
    cc1 = b2 + 0 * m;
 | 
						|
    cc2 = b2 + 1 * m;
 | 
						|
    b2 += 2 * m + 2;
 | 
						|
 | 
						|
    if (m - js >= 2){
 | 
						|
 | 
						|
      a11 = *(aa1 + 0);
 | 
						|
      a21 = *(aa1 + 1);
 | 
						|
 | 
						|
      a22 = *(aa2 + 1);
 | 
						|
 | 
						|
      *(bb1 + 0) = a11;
 | 
						|
      *(bb1 + 1) = a21;
 | 
						|
      *(bb2 + 0) = a21;
 | 
						|
      *(bb2 + 1) = a22;
 | 
						|
      aa1 += 2;
 | 
						|
      aa2 += 2;
 | 
						|
      bb1 += 2;
 | 
						|
      bb2 += 2;
 | 
						|
 | 
						|
      cc1 += 2 * m;
 | 
						|
      cc2 += 2 * m;
 | 
						|
 | 
						|
      is = ((m - js - 2) >> 1);
 | 
						|
 | 
						|
      while (is > 0){
 | 
						|
	a11 = *(aa1 + 0);
 | 
						|
	a21 = *(aa1 + 1);
 | 
						|
	a12 = *(aa2 + 0);
 | 
						|
	a22 = *(aa2 + 1);
 | 
						|
 | 
						|
	aa1 += 2;
 | 
						|
	aa2 += 2;
 | 
						|
 | 
						|
	*(bb1 + 0) = a11;
 | 
						|
	*(bb1 + 1) = a21;
 | 
						|
	*(bb2 + 0) = a12;
 | 
						|
	*(bb2 + 1) = a22;
 | 
						|
 | 
						|
	*(cc1 + 0) = a11;
 | 
						|
	*(cc1 + 1) = a12;
 | 
						|
	*(cc2 + 0) = a21;
 | 
						|
	*(cc2 + 1) = a22;
 | 
						|
 | 
						|
	bb1 += 2;
 | 
						|
	bb2 += 2;
 | 
						|
 | 
						|
	cc1 += 2 * m;
 | 
						|
	cc2 += 2 * m;
 | 
						|
 | 
						|
	is --;
 | 
						|
      }
 | 
						|
 | 
						|
      is = ((m - js - 2) & 1);
 | 
						|
 | 
						|
      if (is == 1){
 | 
						|
	a11 = *(aa1 + 0);
 | 
						|
	a12 = *(aa2 + 0);
 | 
						|
 | 
						|
	*(bb1 + 0) = a11;
 | 
						|
	*(bb2 + 0) = a12;
 | 
						|
 | 
						|
	*(cc1 + 0) = a11;
 | 
						|
	*(cc1 + 1) = a12;
 | 
						|
      }
 | 
						|
    }
 | 
						|
 | 
						|
    if (m - js == 1){
 | 
						|
      a11 = *(aa1 + 0);
 | 
						|
      *(bb1 + 0) = a11;
 | 
						|
    }
 | 
						|
 | 
						|
  }
 | 
						|
}
 | 
						|
 | 
						|
static __inline void TRMCOPY_TL(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){
 | 
						|
  BLASLONG is, js;
 | 
						|
 | 
						|
  FLOAT *aa1, *aa2;
 | 
						|
  FLOAT *b1, *b2;
 | 
						|
  FLOAT *bb1, *bb2;
 | 
						|
  FLOAT *cc1, *cc2;
 | 
						|
  FLOAT a11, a12;
 | 
						|
  FLOAT a21, a22;
 | 
						|
 | 
						|
  b1 = b;
 | 
						|
  b2 = b;
 | 
						|
 | 
						|
  for (js = 0; js < m; js += 2){
 | 
						|
 | 
						|
    aa1 = a + 0 * lda;
 | 
						|
    aa2 = a + 1 * lda;
 | 
						|
    a  += 2 * lda + 2;
 | 
						|
 | 
						|
    bb1 = b1 + 0 * m;
 | 
						|
    bb2 = b1 + 1 * m;
 | 
						|
    b1 += 2 * m + 2;
 | 
						|
 | 
						|
    cc1 = b2 + 0 * m;
 | 
						|
    cc2 = b2 + 1 * m;
 | 
						|
    b2 += 2 * m + 2;
 | 
						|
 | 
						|
    if (m - js >= 2){
 | 
						|
 | 
						|
      a11 = *(aa1 + 0);
 | 
						|
      a21 = *(aa1 + 1);
 | 
						|
 | 
						|
      a22 = *(aa2 + 1);
 | 
						|
 | 
						|
      *(bb1 + 0) = a11;
 | 
						|
      *(bb1 + 1) = a21;
 | 
						|
      *(bb2 + 0) = a21;
 | 
						|
      *(bb2 + 1) = a22;
 | 
						|
      aa1 += 2;
 | 
						|
      aa2 += 2;
 | 
						|
      bb1 += 2;
 | 
						|
      bb2 += 2;
 | 
						|
 | 
						|
      cc1 += 2 * m;
 | 
						|
      cc2 += 2 * m;
 | 
						|
 | 
						|
      is = ((m - js - 2) >> 1);
 | 
						|
 | 
						|
      while (is > 0){
 | 
						|
	a11 = *(aa1 + 0);
 | 
						|
	a21 = *(aa1 + 1);
 | 
						|
	a12 = *(aa2 + 0);
 | 
						|
	a22 = *(aa2 + 1);
 | 
						|
 | 
						|
	aa1 += 2;
 | 
						|
	aa2 += 2;
 | 
						|
 | 
						|
	*(bb1 + 0) = a11;
 | 
						|
	*(bb1 + 1) = a21;
 | 
						|
	*(bb2 + 0) = a12;
 | 
						|
	*(bb2 + 1) = a22;
 | 
						|
 | 
						|
	*(cc1 + 0) = a11;
 | 
						|
	*(cc1 + 1) = a12;
 | 
						|
	*(cc2 + 0) = a21;
 | 
						|
	*(cc2 + 1) = a22;
 | 
						|
 | 
						|
	bb1 += 2;
 | 
						|
	bb2 += 2;
 | 
						|
 | 
						|
	cc1 += 2 * m;
 | 
						|
	cc2 += 2 * m;
 | 
						|
 | 
						|
	is --;
 | 
						|
      }
 | 
						|
 | 
						|
      is = ((m - js - 2) & 1);
 | 
						|
 | 
						|
      if (is == 1){
 | 
						|
	a11 = *(aa1 + 0);
 | 
						|
	a12 = *(aa2 + 0);
 | 
						|
 | 
						|
	*(bb1 + 0) = a11;
 | 
						|
	*(bb2 + 0) = a12;
 | 
						|
 | 
						|
	*(cc1 + 0) = a11;
 | 
						|
	*(cc1 + 1) = a12;
 | 
						|
      }
 | 
						|
    }
 | 
						|
 | 
						|
    if (m - js == 1){
 | 
						|
      a11 = *(aa1 + 0);
 | 
						|
      *(bb1 + 0) = a11;
 | 
						|
    }
 | 
						|
 | 
						|
  }
 | 
						|
}
 | 
						|
 | 
						|
static __inline void TRMCOPY_NU(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){
 | 
						|
  BLASLONG is, js;
 | 
						|
 | 
						|
  FLOAT *aa1, *aa2;
 | 
						|
  FLOAT *b1, *b2;
 | 
						|
  FLOAT *bb1, *bb2;
 | 
						|
  FLOAT *cc1, *cc2;
 | 
						|
  FLOAT a11, a12;
 | 
						|
  FLOAT a21, a22;
 | 
						|
 | 
						|
  b1 = b;
 | 
						|
  b2 = b;
 | 
						|
 | 
						|
  for (js = 0; js < m; js += 2){
 | 
						|
 | 
						|
    aa1 = a + 0 * lda;
 | 
						|
    aa2 = a + 1 * lda;
 | 
						|
    a  += 2 * lda;
 | 
						|
 | 
						|
    bb1 = b1 + 0 * m;
 | 
						|
    bb2 = b1 + 1 * m;
 | 
						|
    b1 += 2 * m;
 | 
						|
 | 
						|
    cc1 = b2 + 0 * m;
 | 
						|
    cc2 = b2 + 1 * m;
 | 
						|
    b2 += 2;
 | 
						|
 | 
						|
    if (m - js >= 2){
 | 
						|
 | 
						|
      for (is = 0; is < js; is += 2){
 | 
						|
 | 
						|
	a11 = *(aa1 + 0);
 | 
						|
	a21 = *(aa1 + 1);
 | 
						|
	a12 = *(aa2 + 0);
 | 
						|
	a22 = *(aa2 + 1);
 | 
						|
 | 
						|
	aa1 += 2;
 | 
						|
	aa2 += 2;
 | 
						|
 | 
						|
	*(bb1 + 0) = a11;
 | 
						|
	*(bb1 + 1) = a21;
 | 
						|
	*(bb2 + 0) = a12;
 | 
						|
	*(bb2 + 1) = a22;
 | 
						|
 | 
						|
	*(cc1 + 0) = a11;
 | 
						|
	*(cc1 + 1) = a12;
 | 
						|
	*(cc2 + 0) = a21;
 | 
						|
	*(cc2 + 1) = a22;
 | 
						|
 | 
						|
	bb1 += 2;
 | 
						|
	bb2 += 2;
 | 
						|
 | 
						|
	cc1 += 2 * m;
 | 
						|
	cc2 += 2 * m;
 | 
						|
      }
 | 
						|
 | 
						|
      a11 = *(aa1 + 0);
 | 
						|
 | 
						|
      a12 = *(aa2 + 0);
 | 
						|
      a22 = *(aa2 + 1);
 | 
						|
 | 
						|
      *(bb1 + 0) = a11;
 | 
						|
      *(bb1 + 1) = a12;
 | 
						|
      *(bb2 + 0) = a12;
 | 
						|
      *(bb2 + 1) = a22;
 | 
						|
    }
 | 
						|
 | 
						|
    if (m - js == 1){
 | 
						|
      for (is = 0; is < js; is += 2){
 | 
						|
 | 
						|
	a11 = *(aa1 + 0);
 | 
						|
	a21 = *(aa1 + 1);
 | 
						|
	aa1 += 2;
 | 
						|
 | 
						|
	*(bb1 + 0) = a11;
 | 
						|
	*(bb1 + 1) = a21;
 | 
						|
	*(cc1 + 0) = a11;
 | 
						|
	*(cc2 + 0) = a21;
 | 
						|
	bb1 += 2;
 | 
						|
 | 
						|
	cc1 += 2 * m;
 | 
						|
	cc2 += 2 * m;
 | 
						|
      }
 | 
						|
 | 
						|
      a11 = *(aa1 + 0);
 | 
						|
      *(bb1 + 0) = a11;
 | 
						|
    }
 | 
						|
  }
 | 
						|
}
 | 
						|
 | 
						|
static __inline void TRMCOPY_TU(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){
 | 
						|
  BLASLONG is, js;
 | 
						|
 | 
						|
  FLOAT *aa1, *aa2;
 | 
						|
  FLOAT *b1, *b2;
 | 
						|
  FLOAT *bb1, *bb2;
 | 
						|
  FLOAT *cc1, *cc2;
 | 
						|
  FLOAT a11, a12;
 | 
						|
  FLOAT a21, a22;
 | 
						|
 | 
						|
  b1 = b;
 | 
						|
  b2 = b;
 | 
						|
 | 
						|
  for (js = 0; js < m; js += 2){
 | 
						|
 | 
						|
    aa1 = a + 0 * lda;
 | 
						|
    aa2 = a + 1 * lda;
 | 
						|
    a  += 2 * lda;
 | 
						|
 | 
						|
    bb1 = b1 + 0 * m;
 | 
						|
    bb2 = b1 + 1 * m;
 | 
						|
    b1 += 2 * m;
 | 
						|
 | 
						|
    cc1 = b2 + 0 * m;
 | 
						|
    cc2 = b2 + 1 * m;
 | 
						|
    b2 += 2;
 | 
						|
 | 
						|
    if (m - js >= 2){
 | 
						|
 | 
						|
      for (is = 0; is < js; is += 2){
 | 
						|
 | 
						|
	a11 = *(aa1 + 0);
 | 
						|
	a21 = *(aa1 + 1);
 | 
						|
	a12 = *(aa2 + 0);
 | 
						|
	a22 = *(aa2 + 1);
 | 
						|
 | 
						|
	aa1 += 2;
 | 
						|
	aa2 += 2;
 | 
						|
 | 
						|
	*(bb1 + 0) = a11;
 | 
						|
	*(bb1 + 1) = a21;
 | 
						|
	*(bb2 + 0) = a12;
 | 
						|
	*(bb2 + 1) = a22;
 | 
						|
 | 
						|
	*(cc1 + 0) = a11;
 | 
						|
	*(cc1 + 1) = a12;
 | 
						|
	*(cc2 + 0) = a21;
 | 
						|
	*(cc2 + 1) = a22;
 | 
						|
 | 
						|
	bb1 += 2;
 | 
						|
	bb2 += 2;
 | 
						|
 | 
						|
	cc1 += 2 * m;
 | 
						|
	cc2 += 2 * m;
 | 
						|
      }
 | 
						|
 | 
						|
      a11 = *(aa1 + 0);
 | 
						|
 | 
						|
      a12 = *(aa2 + 0);
 | 
						|
      a22 = *(aa2 + 1);
 | 
						|
 | 
						|
      *(bb1 + 0) = a11;
 | 
						|
      *(bb1 + 1) = a12;
 | 
						|
      *(bb2 + 0) = a12;
 | 
						|
      *(bb2 + 1) = a22;
 | 
						|
    }
 | 
						|
 | 
						|
    if (m - js == 1){
 | 
						|
      for (is = 0; is < js; is += 2){
 | 
						|
 | 
						|
	a11 = *(aa1 + 0);
 | 
						|
	a21 = *(aa1 + 1);
 | 
						|
	aa1 += 2;
 | 
						|
 | 
						|
	*(bb1 + 0) = a11;
 | 
						|
	*(bb1 + 1) = a21;
 | 
						|
	*(cc1 + 0) = a11;
 | 
						|
	*(cc2 + 0) = a21;
 | 
						|
	bb1 += 2;
 | 
						|
 | 
						|
	cc1 += 2 * m;
 | 
						|
	cc2 += 2 * m;
 | 
						|
      }
 | 
						|
 | 
						|
      a11 = *(aa1 + 0);
 | 
						|
      *(bb1 + 0) = a11;
 | 
						|
    }
 | 
						|
  }
 | 
						|
}
 | 
						|
 | 
						|
static __inline void ZTRMCOPY_NL(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){
 | 
						|
  BLASLONG is, js;
 | 
						|
 | 
						|
  FLOAT *aa1, *aa2;
 | 
						|
  FLOAT *b1, *b2;
 | 
						|
  FLOAT *bb1, *bb2;
 | 
						|
  FLOAT *cc1, *cc2;
 | 
						|
  FLOAT a11, a21, a31, a41;
 | 
						|
  FLOAT a12, a22, a32, a42;
 | 
						|
 | 
						|
  b1 = b;
 | 
						|
  b2 = b;
 | 
						|
 | 
						|
  lda *= 2;
 | 
						|
 | 
						|
  for (js = 0; js < m; js += 2){
 | 
						|
 | 
						|
    aa1 = a + 0 * lda;
 | 
						|
    aa2 = a + 1 * lda;
 | 
						|
    a  += 2 * lda + 4;
 | 
						|
 | 
						|
    bb1 = b1 + 0 * m;
 | 
						|
    bb2 = b1 + 2 * m;
 | 
						|
    b1 += 4 * m + 4;
 | 
						|
 | 
						|
    cc1 = b2 + 0 * m;
 | 
						|
    cc2 = b2 + 2 * m;
 | 
						|
    b2 += 4 * m + 4;
 | 
						|
 | 
						|
    if (m - js >= 2){
 | 
						|
 | 
						|
      a11 = *(aa1 + 0);
 | 
						|
      a21 = *(aa1 + 1);
 | 
						|
      a31 = *(aa1 + 2);
 | 
						|
      a41 = *(aa1 + 3);
 | 
						|
 | 
						|
      a12 = *(aa2 + 2);
 | 
						|
      a22 = *(aa2 + 3);
 | 
						|
 | 
						|
      *(bb1 + 0) = a11;
 | 
						|
      *(bb1 + 1) = a21;
 | 
						|
      *(bb1 + 2) = a31;
 | 
						|
      *(bb1 + 3) = a41;
 | 
						|
 | 
						|
      *(bb2 + 0) = a31;
 | 
						|
      *(bb2 + 1) = a41;
 | 
						|
      *(bb2 + 2) = a12;
 | 
						|
      *(bb2 + 3) = a22;
 | 
						|
 | 
						|
      aa1 += 4;
 | 
						|
      aa2 += 4;
 | 
						|
      bb1 += 4;
 | 
						|
      bb2 += 4;
 | 
						|
 | 
						|
      cc1 += 4 * m;
 | 
						|
      cc2 += 4 * m;
 | 
						|
 | 
						|
      is = ((m - js - 2) >> 1);
 | 
						|
 | 
						|
      while (is > 0){
 | 
						|
	a11 = *(aa1 + 0);
 | 
						|
	a21 = *(aa1 + 1);
 | 
						|
	a31 = *(aa1 + 2);
 | 
						|
	a41 = *(aa1 + 3);
 | 
						|
 | 
						|
	a12 = *(aa2 + 0);
 | 
						|
	a22 = *(aa2 + 1);
 | 
						|
	a32 = *(aa2 + 2);
 | 
						|
	a42 = *(aa2 + 3);
 | 
						|
 | 
						|
	aa1 += 4;
 | 
						|
	aa2 += 4;
 | 
						|
 | 
						|
	*(bb1 + 0) = a11;
 | 
						|
	*(bb1 + 1) = a21;
 | 
						|
	*(bb1 + 2) = a31;
 | 
						|
	*(bb1 + 3) = a41;
 | 
						|
 | 
						|
	*(bb2 + 0) = a12;
 | 
						|
	*(bb2 + 1) = a22;
 | 
						|
	*(bb2 + 2) = a32;
 | 
						|
	*(bb2 + 3) = a42;
 | 
						|
 | 
						|
	*(cc1 + 0) = a11;
 | 
						|
	*(cc1 + 1) = a21;
 | 
						|
	*(cc1 + 2) = a12;
 | 
						|
	*(cc1 + 3) = a22;
 | 
						|
 | 
						|
	*(cc2 + 0) = a31;
 | 
						|
	*(cc2 + 1) = a41;
 | 
						|
	*(cc2 + 2) = a32;
 | 
						|
	*(cc2 + 3) = a42;
 | 
						|
 | 
						|
	bb1 += 4;
 | 
						|
	bb2 += 4;
 | 
						|
 | 
						|
	cc1 += 4 * m;
 | 
						|
	cc2 += 4 * m;
 | 
						|
 | 
						|
	is --;
 | 
						|
      }
 | 
						|
 | 
						|
      if (m & 1){
 | 
						|
	a11 = *(aa1 + 0);
 | 
						|
	a21 = *(aa1 + 1);
 | 
						|
	a12 = *(aa2 + 0);
 | 
						|
	a22 = *(aa2 + 1);
 | 
						|
 | 
						|
	*(bb1 + 0) = a11;
 | 
						|
	*(bb1 + 1) = a21;
 | 
						|
	*(bb2 + 0) = a12;
 | 
						|
	*(bb2 + 1) = a22;
 | 
						|
 | 
						|
	*(cc1 + 0) = a11;
 | 
						|
	*(cc1 + 1) = a21;
 | 
						|
	*(cc1 + 2) = a12;
 | 
						|
	*(cc1 + 3) = a22;
 | 
						|
      }
 | 
						|
    }
 | 
						|
 | 
						|
    if (m - js == 1){
 | 
						|
      a11 = *(aa1 + 0);
 | 
						|
      a21 = *(aa1 + 1);
 | 
						|
      *(bb1 + 0) = a11;
 | 
						|
      *(bb1 + 1) = a21;
 | 
						|
    }
 | 
						|
 | 
						|
  }
 | 
						|
}
 | 
						|
 | 
						|
static __inline void ZTRMCOPY_TL(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){
 | 
						|
  BLASLONG is, js;
 | 
						|
 | 
						|
  FLOAT *aa1, *aa2;
 | 
						|
  FLOAT *b1, *b2;
 | 
						|
  FLOAT *bb1, *bb2;
 | 
						|
  FLOAT *cc1, *cc2;
 | 
						|
  FLOAT a11, a21, a31, a41;
 | 
						|
  FLOAT a12, a22, a32, a42;
 | 
						|
 | 
						|
  b1 = b;
 | 
						|
  b2 = b;
 | 
						|
 | 
						|
  lda *= 2;
 | 
						|
 | 
						|
  for (js = 0; js < m; js += 2){
 | 
						|
 | 
						|
    aa1 = a + 0 * lda;
 | 
						|
    aa2 = a + 1 * lda;
 | 
						|
    a  += 2 * lda + 4;
 | 
						|
 | 
						|
    bb1 = b1 + 0 * m;
 | 
						|
    bb2 = b1 + 2 * m;
 | 
						|
    b1 += 4 * m + 4;
 | 
						|
 | 
						|
    cc1 = b2 + 0 * m;
 | 
						|
    cc2 = b2 + 2 * m;
 | 
						|
    b2 += 4 * m + 4;
 | 
						|
 | 
						|
    if (m - js >= 2){
 | 
						|
 | 
						|
      a11 = *(aa1 + 0);
 | 
						|
      a21 = *(aa1 + 1);
 | 
						|
      a31 = *(aa1 + 2);
 | 
						|
      a41 = *(aa1 + 3);
 | 
						|
 | 
						|
      a12 = *(aa2 + 2);
 | 
						|
      a22 = *(aa2 + 3);
 | 
						|
 | 
						|
      *(bb1 + 0) = a11;
 | 
						|
      *(bb1 + 1) = a21;
 | 
						|
      *(bb1 + 2) = a31;
 | 
						|
      *(bb1 + 3) = a41;
 | 
						|
 | 
						|
      *(bb2 + 0) = a31;
 | 
						|
      *(bb2 + 1) = a41;
 | 
						|
      *(bb2 + 2) = a12;
 | 
						|
      *(bb2 + 3) = a22;
 | 
						|
 | 
						|
      aa1 += 4;
 | 
						|
      aa2 += 4;
 | 
						|
      bb1 += 4;
 | 
						|
      bb2 += 4;
 | 
						|
 | 
						|
      cc1 += 4 * m;
 | 
						|
      cc2 += 4 * m;
 | 
						|
 | 
						|
      is = ((m - js - 2) >> 1);
 | 
						|
 | 
						|
      while (is > 0){
 | 
						|
	a11 = *(aa1 + 0);
 | 
						|
	a21 = *(aa1 + 1);
 | 
						|
	a31 = *(aa1 + 2);
 | 
						|
	a41 = *(aa1 + 3);
 | 
						|
 | 
						|
	a12 = *(aa2 + 0);
 | 
						|
	a22 = *(aa2 + 1);
 | 
						|
	a32 = *(aa2 + 2);
 | 
						|
	a42 = *(aa2 + 3);
 | 
						|
 | 
						|
	aa1 += 4;
 | 
						|
	aa2 += 4;
 | 
						|
 | 
						|
	*(bb1 + 0) = a11;
 | 
						|
	*(bb1 + 1) = a21;
 | 
						|
	*(bb1 + 2) = a31;
 | 
						|
	*(bb1 + 3) = a41;
 | 
						|
 | 
						|
	*(bb2 + 0) = a12;
 | 
						|
	*(bb2 + 1) = a22;
 | 
						|
	*(bb2 + 2) = a32;
 | 
						|
	*(bb2 + 3) = a42;
 | 
						|
 | 
						|
	*(cc1 + 0) = a11;
 | 
						|
	*(cc1 + 1) = a21;
 | 
						|
	*(cc1 + 2) = a12;
 | 
						|
	*(cc1 + 3) = a22;
 | 
						|
 | 
						|
	*(cc2 + 0) = a31;
 | 
						|
	*(cc2 + 1) = a41;
 | 
						|
	*(cc2 + 2) = a32;
 | 
						|
	*(cc2 + 3) = a42;
 | 
						|
 | 
						|
	bb1 += 4;
 | 
						|
	bb2 += 4;
 | 
						|
 | 
						|
	cc1 += 4 * m;
 | 
						|
	cc2 += 4 * m;
 | 
						|
 | 
						|
	is --;
 | 
						|
      }
 | 
						|
 | 
						|
      if (m & 1){
 | 
						|
	a11 = *(aa1 + 0);
 | 
						|
	a21 = *(aa1 + 1);
 | 
						|
	a12 = *(aa2 + 0);
 | 
						|
	a22 = *(aa2 + 1);
 | 
						|
 | 
						|
	*(bb1 + 0) = a11;
 | 
						|
	*(bb1 + 1) = a21;
 | 
						|
	*(bb2 + 0) = a12;
 | 
						|
	*(bb2 + 1) = a22;
 | 
						|
 | 
						|
	*(cc1 + 0) = a11;
 | 
						|
	*(cc1 + 1) = a21;
 | 
						|
	*(cc1 + 2) = a12;
 | 
						|
	*(cc1 + 3) = a22;
 | 
						|
      }
 | 
						|
    }
 | 
						|
 | 
						|
    if (m - js == 1){
 | 
						|
      a11 = *(aa1 + 0);
 | 
						|
      a21 = *(aa1 + 1);
 | 
						|
      *(bb1 + 0) = a11;
 | 
						|
      *(bb1 + 1) = a21;
 | 
						|
    }
 | 
						|
 | 
						|
  }
 | 
						|
}
 | 
						|
 | 
						|
static __inline void ZTRMCOPY_NU(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){
 | 
						|
  BLASLONG is, js;
 | 
						|
 | 
						|
  FLOAT *aa1, *aa2;
 | 
						|
  FLOAT *b1, *b2;
 | 
						|
  FLOAT *bb1, *bb2;
 | 
						|
  FLOAT *cc1, *cc2;
 | 
						|
  FLOAT a11, a21, a31, a41;
 | 
						|
  FLOAT a12, a22, a32, a42;
 | 
						|
 | 
						|
  b1 = b;
 | 
						|
  b2 = b;
 | 
						|
 | 
						|
  lda *= 2;
 | 
						|
 | 
						|
  for (js = 0; js < m; js += 2){
 | 
						|
 | 
						|
    aa1 = a + 0 * lda;
 | 
						|
    aa2 = a + 1 * lda;
 | 
						|
    a  += 2 * lda;
 | 
						|
 | 
						|
    bb1 = b1 + 0 * m;
 | 
						|
    bb2 = b1 + 2 * m;
 | 
						|
    b1 += 4 * m;
 | 
						|
 | 
						|
    cc1 = b2 + 0 * m;
 | 
						|
    cc2 = b2 + 2 * m;
 | 
						|
    b2 += 4;
 | 
						|
 | 
						|
    if (m - js >= 2){
 | 
						|
 | 
						|
      for (is = 0; is < js; is += 2){
 | 
						|
 | 
						|
	a11 = *(aa1 + 0);
 | 
						|
	a21 = *(aa1 + 1);
 | 
						|
	a31 = *(aa1 + 2);
 | 
						|
	a41 = *(aa1 + 3);
 | 
						|
 | 
						|
	a12 = *(aa2 + 0);
 | 
						|
	a22 = *(aa2 + 1);
 | 
						|
	a32 = *(aa2 + 2);
 | 
						|
	a42 = *(aa2 + 3);
 | 
						|
 | 
						|
	aa1 += 4;
 | 
						|
	aa2 += 4;
 | 
						|
 | 
						|
	*(bb1 + 0) = a11;
 | 
						|
	*(bb1 + 1) = a21;
 | 
						|
	*(bb1 + 2) = a31;
 | 
						|
	*(bb1 + 3) = a41;
 | 
						|
 | 
						|
	*(bb2 + 0) = a12;
 | 
						|
	*(bb2 + 1) = a22;
 | 
						|
	*(bb2 + 2) = a32;
 | 
						|
	*(bb2 + 3) = a42;
 | 
						|
 | 
						|
	*(cc1 + 0) = a11;
 | 
						|
	*(cc1 + 1) = a21;
 | 
						|
	*(cc1 + 2) = a12;
 | 
						|
	*(cc1 + 3) = a22;
 | 
						|
 | 
						|
	*(cc2 + 0) = a31;
 | 
						|
	*(cc2 + 1) = a41;
 | 
						|
	*(cc2 + 2) = a32;
 | 
						|
	*(cc2 + 3) = a42;
 | 
						|
 | 
						|
	bb1 += 4;
 | 
						|
	bb2 += 4;
 | 
						|
 | 
						|
	cc1 += 4 * m;
 | 
						|
	cc2 += 4 * m;
 | 
						|
      }
 | 
						|
 | 
						|
      a11 = *(aa1 + 0);
 | 
						|
      a21 = *(aa1 + 1);
 | 
						|
 | 
						|
      a12 = *(aa2 + 0);
 | 
						|
      a22 = *(aa2 + 1);
 | 
						|
      a32 = *(aa2 + 2);
 | 
						|
      a42 = *(aa2 + 3);
 | 
						|
 | 
						|
      *(bb1 + 0) = a11;
 | 
						|
      *(bb1 + 1) = a21;
 | 
						|
      *(bb1 + 2) = a12;
 | 
						|
      *(bb1 + 3) = a22;
 | 
						|
 | 
						|
      *(bb2 + 0) = a12;
 | 
						|
      *(bb2 + 1) = a22;
 | 
						|
      *(bb2 + 2) = a32;
 | 
						|
      *(bb2 + 3) = a42;
 | 
						|
    }
 | 
						|
 | 
						|
    if (m - js == 1){
 | 
						|
      for (is = 0; is < js; is += 2){
 | 
						|
 | 
						|
	a11 = *(aa1 + 0);
 | 
						|
	a21 = *(aa1 + 1);
 | 
						|
	a31 = *(aa1 + 2);
 | 
						|
	a41 = *(aa1 + 3);
 | 
						|
	aa1 += 4;
 | 
						|
 | 
						|
	*(bb1 + 0) = a11;
 | 
						|
	*(bb1 + 1) = a21;
 | 
						|
	*(bb1 + 2) = a31;
 | 
						|
	*(bb1 + 3) = a41;
 | 
						|
 | 
						|
	*(cc1 + 0) = a11;
 | 
						|
	*(cc1 + 1) = a21;
 | 
						|
	*(cc2 + 0) = a31;
 | 
						|
	*(cc2 + 1) = a41;
 | 
						|
	bb1 += 4;
 | 
						|
 | 
						|
	cc1 += 4 * m;
 | 
						|
	cc2 += 4 * m;
 | 
						|
      }
 | 
						|
 | 
						|
      a11 = *(aa1 + 0);
 | 
						|
      a21 = *(aa1 + 1);
 | 
						|
      *(bb1 + 0) = a11;
 | 
						|
      *(bb1 + 1) = a21;
 | 
						|
    }
 | 
						|
  }
 | 
						|
}
 | 
						|
 | 
						|
static __inline void ZTRMCOPY_TU(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){
 | 
						|
  BLASLONG is, js;
 | 
						|
 | 
						|
  FLOAT *aa1, *aa2;
 | 
						|
  FLOAT *b1, *b2;
 | 
						|
  FLOAT *bb1, *bb2;
 | 
						|
  FLOAT *cc1, *cc2;
 | 
						|
  FLOAT a11, a21, a31, a41;
 | 
						|
  FLOAT a12, a22, a32, a42;
 | 
						|
 | 
						|
  b1 = b;
 | 
						|
  b2 = b;
 | 
						|
 | 
						|
  lda *= 2;
 | 
						|
 | 
						|
  for (js = 0; js < m; js += 2){
 | 
						|
 | 
						|
    aa1 = a + 0 * lda;
 | 
						|
    aa2 = a + 1 * lda;
 | 
						|
    a  += 2 * lda;
 | 
						|
 | 
						|
    bb1 = b1 + 0 * m;
 | 
						|
    bb2 = b1 + 2 * m;
 | 
						|
    b1 += 4 * m;
 | 
						|
 | 
						|
    cc1 = b2 + 0 * m;
 | 
						|
    cc2 = b2 + 2 * m;
 | 
						|
    b2 += 4;
 | 
						|
 | 
						|
    if (m - js >= 2){
 | 
						|
 | 
						|
      for (is = 0; is < js; is += 2){
 | 
						|
 | 
						|
	a11 = *(aa1 + 0);
 | 
						|
	a21 = *(aa1 + 1);
 | 
						|
	a31 = *(aa1 + 2);
 | 
						|
	a41 = *(aa1 + 3);
 | 
						|
 | 
						|
	a12 = *(aa2 + 0);
 | 
						|
	a22 = *(aa2 + 1);
 | 
						|
	a32 = *(aa2 + 2);
 | 
						|
	a42 = *(aa2 + 3);
 | 
						|
 | 
						|
	aa1 += 4;
 | 
						|
	aa2 += 4;
 | 
						|
 | 
						|
	*(bb1 + 0) = a11;
 | 
						|
	*(bb1 + 1) = a21;
 | 
						|
	*(bb1 + 2) = a31;
 | 
						|
	*(bb1 + 3) = a41;
 | 
						|
 | 
						|
	*(bb2 + 0) = a12;
 | 
						|
	*(bb2 + 1) = a22;
 | 
						|
	*(bb2 + 2) = a32;
 | 
						|
	*(bb2 + 3) = a42;
 | 
						|
 | 
						|
	*(cc1 + 0) = a11;
 | 
						|
	*(cc1 + 1) = a21;
 | 
						|
	*(cc1 + 2) = a12;
 | 
						|
	*(cc1 + 3) = a22;
 | 
						|
 | 
						|
	*(cc2 + 0) = a31;
 | 
						|
	*(cc2 + 1) = a41;
 | 
						|
	*(cc2 + 2) = a32;
 | 
						|
	*(cc2 + 3) = a42;
 | 
						|
 | 
						|
	bb1 += 4;
 | 
						|
	bb2 += 4;
 | 
						|
 | 
						|
	cc1 += 4 * m;
 | 
						|
	cc2 += 4 * m;
 | 
						|
      }
 | 
						|
 | 
						|
      a11 = *(aa1 + 0);
 | 
						|
      a21 = *(aa1 + 1);
 | 
						|
 | 
						|
      a12 = *(aa2 + 0);
 | 
						|
      a22 = *(aa2 + 1);
 | 
						|
      a32 = *(aa2 + 2);
 | 
						|
      a42 = *(aa2 + 3);
 | 
						|
 | 
						|
      *(bb1 + 0) = a11;
 | 
						|
      *(bb1 + 1) = a21;
 | 
						|
      *(bb1 + 2) = a12;
 | 
						|
      *(bb1 + 3) = a22;
 | 
						|
 | 
						|
      *(bb2 + 0) = a12;
 | 
						|
      *(bb2 + 1) = a22;
 | 
						|
      *(bb2 + 2) = a32;
 | 
						|
      *(bb2 + 3) = a42;
 | 
						|
    }
 | 
						|
 | 
						|
    if (m - js == 1){
 | 
						|
      for (is = 0; is < js; is += 2){
 | 
						|
 | 
						|
	a11 = *(aa1 + 0);
 | 
						|
	a21 = *(aa1 + 1);
 | 
						|
	a31 = *(aa1 + 2);
 | 
						|
	a41 = *(aa1 + 3);
 | 
						|
	aa1 += 4;
 | 
						|
 | 
						|
	*(bb1 + 0) = a11;
 | 
						|
	*(bb1 + 1) = a21;
 | 
						|
	*(bb1 + 2) = a31;
 | 
						|
	*(bb1 + 3) = a41;
 | 
						|
 | 
						|
	*(cc1 + 0) = a11;
 | 
						|
	*(cc1 + 1) = a21;
 | 
						|
	*(cc2 + 0) = a31;
 | 
						|
	*(cc2 + 1) = a41;
 | 
						|
	bb1 += 4;
 | 
						|
 | 
						|
	cc1 += 4 * m;
 | 
						|
	cc2 += 4 * m;
 | 
						|
      }
 | 
						|
 | 
						|
      a11 = *(aa1 + 0);
 | 
						|
      a21 = *(aa1 + 1);
 | 
						|
      *(bb1 + 0) = a11;
 | 
						|
      *(bb1 + 1) = a21;
 | 
						|
    }
 | 
						|
  }
 | 
						|
}
 | 
						|
 | 
						|
#endif
 | 
						|
#endif
 | 
						|
 |