1874 lines
		
	
	
		
			30 KiB
		
	
	
	
		
			C
		
	
	
	
			
		
		
	
	
			1874 lines
		
	
	
		
			30 KiB
		
	
	
	
		
			C
		
	
	
	
| /*********************************************************************/
 | |
| /* Copyright 2009, 2010 The University of Texas at Austin.           */
 | |
| /* All rights reserved.                                              */
 | |
| /*                                                                   */
 | |
| /* Redistribution and use in source and binary forms, with or        */
 | |
| /* without modification, are permitted provided that the following   */
 | |
| /* conditions are met:                                               */
 | |
| /*                                                                   */
 | |
| /*   1. Redistributions of source code must retain the above         */
 | |
| /*      copyright notice, this list of conditions and the following  */
 | |
| /*      disclaimer.                                                  */
 | |
| /*                                                                   */
 | |
| /*   2. Redistributions in binary form must reproduce the above      */
 | |
| /*      copyright notice, this list of conditions and the following  */
 | |
| /*      disclaimer in the documentation and/or other materials       */
 | |
| /*      provided with the distribution.                              */
 | |
| /*                                                                   */
 | |
| /*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */
 | |
| /*    AUSTIN  ``AS IS''  AND ANY  EXPRESS OR  IMPLIED WARRANTIES,    */
 | |
| /*    INCLUDING, BUT  NOT LIMITED  TO, THE IMPLIED  WARRANTIES OF    */
 | |
| /*    MERCHANTABILITY  AND FITNESS FOR  A PARTICULAR  PURPOSE ARE    */
 | |
| /*    DISCLAIMED.  IN  NO EVENT SHALL THE UNIVERSITY  OF TEXAS AT    */
 | |
| /*    AUSTIN OR CONTRIBUTORS BE  LIABLE FOR ANY DIRECT, INDIRECT,    */
 | |
| /*    INCIDENTAL,  SPECIAL, EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES    */
 | |
| /*    (INCLUDING, BUT  NOT LIMITED TO,  PROCUREMENT OF SUBSTITUTE    */
 | |
| /*    GOODS  OR  SERVICES; LOSS  OF  USE,  DATA,  OR PROFITS;  OR    */
 | |
| /*    BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF    */
 | |
| /*    LIABILITY, WHETHER  IN CONTRACT, STRICT  LIABILITY, OR TORT    */
 | |
| /*    (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY OUT    */
 | |
| /*    OF  THE  USE OF  THIS  SOFTWARE,  EVEN  IF ADVISED  OF  THE    */
 | |
| /*    POSSIBILITY OF SUCH DAMAGE.                                    */
 | |
| /*                                                                   */
 | |
| /* The views and conclusions contained in the software and           */
 | |
| /* documentation are those of the authors and should not be          */
 | |
| /* interpreted as representing official policies, either expressed   */
 | |
| /* or implied, of The University of Texas at Austin.                 */
 | |
| /*********************************************************************/
 | |
| 
 | |
| /* This implementation is completely wrong. I'll rewrite this */
 | |
| 
 | |
| #ifndef SYMCOPY_H
 | |
| #define SYMCOPY_H
 | |
| 
 | |
| #if !defined(XDOUBLE) || !defined(QUAD_PRECISION)
 | |
| 
 | |
/*
 * SYMCOPY_L - expand the lower triangle of a real symmetric matrix into a
 * full dense square.
 *
 *   m   : order of the matrix; nothing is written when m <= 0
 *   a   : source, column major, element (r,c) at a[r + c*lda];
 *         only elements with r >= c are read
 *   lda : column stride of a, in elements
 *   b   : destination, column major with column stride m; every one of the
 *         m*m elements is written so that
 *         b(r,c) = b(c,r) = a(max(r,c), min(r,c))
 *
 * Columns are handled two at a time in 2x2 tiles; each tile read from below
 * the diagonal is stored once in place and once transposed.  All addresses
 * are derived from indices, so no pointer is ever formed beyond the columns
 * that actually exist.
 */
static __inline void SYMCOPY_L(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){
  BLASLONG i, j;
  FLOAT a11, a21, a12, a22;

  for (j = 0; j + 1 < m; j += 2){
    const FLOAT *s0 = a + j * lda + j;   /* a(j, j)   */
    const FLOAT *s1 = s0 + lda;          /* a(j, j+1) */
    FLOAT *d0 = b + j * m + j;           /* b(j, j)   */
    FLOAT *d1 = d0 + m;                  /* b(j, j+1) */
    FLOAT *t;

    /* 2x2 diagonal tile: a(j, j+1) is not read, a(j+1, j) is used twice */
    a11 = s0[0];
    a21 = s0[1];
    a22 = s1[1];

    d0[0] = a11;
    d0[1] = a21;
    d1[0] = a21;
    d1[1] = a22;

    /* full 2x2 tiles below the diagonal; i is the row offset from j */
    for (i = 2; i + 1 < m - j; i += 2){
      a11 = s0[i];
      a21 = s0[i + 1];
      a12 = s1[i];
      a22 = s1[i + 1];

      d0[i]     = a11;
      d0[i + 1] = a21;
      d1[i]     = a12;
      d1[i + 1] = a22;

      t = d0 + i * m;                    /* b(j, j+i): transposed copy */
      t[0]     = a11;
      t[1]     = a12;
      t[m]     = a21;
      t[m + 1] = a22;
    }

    /* one leftover row when m is odd */
    if (i < m - j){
      a11 = s0[i];
      a12 = s1[i];

      d0[i] = a11;
      d1[i] = a12;

      t = d0 + i * m;
      t[0] = a11;
      t[1] = a12;
    }
  }

  /* m odd: the last column consists of its diagonal element only */
  if (j < m){
    b[j * m + j] = a[j * lda + j];
  }
}
 | |
| 
 | |
/*
 * SYMCOPY_U - expand the upper triangle of a real symmetric matrix into a
 * full dense square.
 *
 *   m   : order of the matrix; nothing is written when m <= 0
 *   a   : source, column major, element (r,c) at a[r + c*lda];
 *         only elements with r <= c are read
 *   lda : column stride of a, in elements
 *   b   : destination, column major with column stride m; every one of the
 *         m*m elements is written so that
 *         b(r,c) = b(c,r) = a(min(r,c), max(r,c))
 *
 * Columns are handled two at a time in 2x2 tiles; each tile read from above
 * the diagonal is stored once in place and once transposed.  Addresses are
 * derived from indices so no pointer is formed outside existing columns.
 */
static __inline void SYMCOPY_U(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){
  BLASLONG i, j;
  FLOAT a11, a21, a12, a22;

  for (j = 0; j + 1 < m; j += 2){
    const FLOAT *s0 = a + j * lda;       /* a(0, j)   */
    const FLOAT *s1 = s0 + lda;          /* a(0, j+1) */
    FLOAT *d0 = b + j * m;               /* b(0, j)   */
    FLOAT *d1 = d0 + m;                  /* b(0, j+1) */
    FLOAT *t;

    /* full 2x2 tiles above the diagonal; i is the row index */
    for (i = 0; i < j; i += 2){
      a11 = s0[i];
      a21 = s0[i + 1];
      a12 = s1[i];
      a22 = s1[i + 1];

      d0[i]     = a11;
      d0[i + 1] = a21;
      d1[i]     = a12;
      d1[i + 1] = a22;

      t = b + i * m + j;                 /* b(j, i): transposed copy */
      t[0]     = a11;
      t[1]     = a12;
      t[m]     = a21;
      t[m + 1] = a22;
    }

    /* 2x2 diagonal tile: a(j+1, j) is not read, a(j, j+1) is used twice */
    a11 = s0[j];
    a12 = s1[j];
    a22 = s1[j + 1];

    d0[j]     = a11;
    d0[j + 1] = a12;
    d1[j]     = a12;
    d1[j + 1] = a22;
  }

  /* m odd: the last column is a single column of height m */
  if (j < m){
    const FLOAT *s0 = a + j * lda;       /* a(0, j) */
    FLOAT *d0 = b + j * m;               /* b(0, j) */

    for (i = 0; i < j; i += 2){
      a11 = s0[i];
      a21 = s0[i + 1];

      d0[i]     = a11;
      d0[i + 1] = a21;

      b[i * m + j]       = a11;          /* b(j, i)   */
      b[(i + 1) * m + j] = a21;          /* b(j, i+1) */
    }

    d0[j] = s0[j];
  }
}
 | |
| 
 | |
| 
 | |
/*
 * ZSYMCOPY_L - expand the lower triangle of a complex symmetric matrix into
 * a full dense square (plain transpose, no conjugation).
 *
 * Complex values are stored as interleaved (re, im) pairs of FLOAT.
 *
 *   m   : order of the matrix; nothing is written when m <= 0
 *   a   : source, column major, complex element (r,c) at a[2*(r + c*lda)];
 *         only elements with r >= c are read
 *   lda : column stride of a, in complex elements
 *   b   : destination, column major with column stride m complex elements;
 *         all m*m complex elements are written so that
 *         b(r,c) = b(c,r) = a(max(r,c), min(r,c))
 *
 * Columns are handled two at a time in 2x2 complex tiles.  Addresses are
 * derived from indices so no pointer is formed outside existing columns.
 */
static __inline void ZSYMCOPY_L(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){
  BLASLONG i, j;
  const BLASLONG ldb = 2 * m;            /* column stride of b, in FLOATs */
  FLOAT x0r, x0i, x1r, x1i;              /* column j,   rows r and r+1    */
  FLOAT y0r, y0i, y1r, y1i;              /* column j+1, rows r and r+1    */

  lda *= 2;                              /* column stride of a, in FLOATs */

  for (j = 0; j + 1 < m; j += 2){
    const FLOAT *s0 = a + j * lda + 2 * j;   /* a(j, j)   */
    const FLOAT *s1 = s0 + lda;              /* a(j, j+1) */
    FLOAT *d0 = b + j * ldb + 2 * j;         /* b(j, j)   */
    FLOAT *d1 = d0 + ldb;                    /* b(j, j+1) */

    /* 2x2 diagonal tile: a(j, j+1) is not read */
    x0r = s0[0];  x0i = s0[1];
    x1r = s0[2];  x1i = s0[3];
    y1r = s1[2];  y1i = s1[3];

    d0[0] = x0r;  d0[1] = x0i;
    d0[2] = x1r;  d0[3] = x1i;
    d1[0] = x1r;  d1[1] = x1i;
    d1[2] = y1r;  d1[3] = y1i;

    /* full 2x2 tiles below the diagonal; i is the row offset from j */
    for (i = 2; i + 1 < m - j; i += 2){
      const FLOAT *p0 = s0 + 2 * i;
      const FLOAT *p1 = s1 + 2 * i;
      FLOAT *q0 = d0 + 2 * i;
      FLOAT *q1 = d1 + 2 * i;
      FLOAT *t0 = d0 + i * ldb;              /* b(j, j+i)   */
      FLOAT *t1 = t0 + ldb;                  /* b(j, j+i+1) */

      x0r = p0[0];  x0i = p0[1];
      x1r = p0[2];  x1i = p0[3];
      y0r = p1[0];  y0i = p1[1];
      y1r = p1[2];  y1i = p1[3];

      q0[0] = x0r;  q0[1] = x0i;
      q0[2] = x1r;  q0[3] = x1i;
      q1[0] = y0r;  q1[1] = y0i;
      q1[2] = y1r;  q1[3] = y1i;

      t0[0] = x0r;  t0[1] = x0i;
      t0[2] = y0r;  t0[3] = y0i;
      t1[0] = x1r;  t1[1] = x1i;
      t1[2] = y1r;  t1[3] = y1i;
    }

    /* one leftover row when m is odd */
    if (i < m - j){
      FLOAT *t0 = d0 + i * ldb;              /* b(j, j+i) */

      x0r = s0[2 * i];  x0i = s0[2 * i + 1];
      y0r = s1[2 * i];  y0i = s1[2 * i + 1];

      d0[2 * i] = x0r;  d0[2 * i + 1] = x0i;
      d1[2 * i] = y0r;  d1[2 * i + 1] = y0i;

      t0[0] = x0r;  t0[1] = x0i;
      t0[2] = y0r;  t0[3] = y0i;
    }
  }

  /* m odd: the last column consists of its diagonal element only */
  if (j < m){
    const FLOAT *s0 = a + j * lda + 2 * j;
    FLOAT *d0 = b + j * ldb + 2 * j;

    x0r = s0[0];  x0i = s0[1];
    d0[0] = x0r;  d0[1] = x0i;
  }
}
 | |
| 
 | |
/*
 * ZSYMCOPY_U - expand the upper triangle of a complex symmetric matrix into
 * a full dense square (plain transpose, no conjugation).
 *
 * Complex values are stored as interleaved (re, im) pairs of FLOAT.
 *
 *   m   : order of the matrix; nothing is written when m <= 0
 *   a   : source, column major, complex element (r,c) at a[2*(r + c*lda)];
 *         only elements with r <= c are read
 *   lda : column stride of a, in complex elements
 *   b   : destination, column major with column stride m complex elements;
 *         all m*m complex elements are written so that
 *         b(r,c) = b(c,r) = a(min(r,c), max(r,c))
 *
 * Columns are handled two at a time in 2x2 complex tiles.  Addresses are
 * derived from indices so no pointer is formed outside existing columns.
 */
static __inline void ZSYMCOPY_U(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){
  BLASLONG i, j;
  const BLASLONG ldb = 2 * m;            /* column stride of b, in FLOATs */
  FLOAT x0r, x0i, x1r, x1i;              /* column j,   rows r and r+1    */
  FLOAT y0r, y0i, y1r, y1i;              /* column j+1, rows r and r+1    */

  lda *= 2;                              /* column stride of a, in FLOATs */

  for (j = 0; j + 1 < m; j += 2){
    const FLOAT *s0 = a + j * lda;           /* a(0, j)   */
    const FLOAT *s1 = s0 + lda;              /* a(0, j+1) */
    FLOAT *d0 = b + j * ldb;                 /* b(0, j)   */
    FLOAT *d1 = d0 + ldb;                    /* b(0, j+1) */

    /* full 2x2 tiles above the diagonal; i is the row index */
    for (i = 0; i < j; i += 2){
      const FLOAT *p0 = s0 + 2 * i;
      const FLOAT *p1 = s1 + 2 * i;
      FLOAT *q0 = d0 + 2 * i;
      FLOAT *q1 = d1 + 2 * i;
      FLOAT *t0 = b + i * ldb + 2 * j;       /* b(j, i)   */
      FLOAT *t1 = t0 + ldb;                  /* b(j, i+1) */

      x0r = p0[0];  x0i = p0[1];
      x1r = p0[2];  x1i = p0[3];
      y0r = p1[0];  y0i = p1[1];
      y1r = p1[2];  y1i = p1[3];

      q0[0] = x0r;  q0[1] = x0i;
      q0[2] = x1r;  q0[3] = x1i;
      q1[0] = y0r;  q1[1] = y0i;
      q1[2] = y1r;  q1[3] = y1i;

      t0[0] = x0r;  t0[1] = x0i;
      t0[2] = y0r;  t0[3] = y0i;
      t1[0] = x1r;  t1[1] = x1i;
      t1[2] = y1r;  t1[3] = y1i;
    }

    /* 2x2 diagonal tile: a(j+1, j) is not read */
    x0r = s0[2 * j];      x0i = s0[2 * j + 1];
    y0r = s1[2 * j];      y0i = s1[2 * j + 1];
    y1r = s1[2 * j + 2];  y1i = s1[2 * j + 3];

    d0[2 * j]     = x0r;  d0[2 * j + 1] = x0i;
    d0[2 * j + 2] = y0r;  d0[2 * j + 3] = y0i;
    d1[2 * j]     = y0r;  d1[2 * j + 1] = y0i;
    d1[2 * j + 2] = y1r;  d1[2 * j + 3] = y1i;
  }

  /* m odd: the last column is a single column of height m */
  if (j < m){
    const FLOAT *s0 = a + j * lda;           /* a(0, j) */
    FLOAT *d0 = b + j * ldb;                 /* b(0, j) */

    for (i = 0; i < j; i += 2){
      FLOAT *t0 = b + i * ldb + 2 * j;       /* b(j, i)   */
      FLOAT *t1 = t0 + ldb;                  /* b(j, i+1) */

      x0r = s0[2 * i];      x0i = s0[2 * i + 1];
      x1r = s0[2 * i + 2];  x1i = s0[2 * i + 3];

      d0[2 * i]     = x0r;  d0[2 * i + 1] = x0i;
      d0[2 * i + 2] = x1r;  d0[2 * i + 3] = x1i;

      t0[0] = x0r;  t0[1] = x0i;
      t1[0] = x1r;  t1[1] = x1i;
    }

    x0r = s0[2 * j];  x0i = s0[2 * j + 1];
    d0[2 * j] = x0r;  d0[2 * j + 1] = x0i;
  }
}
 | |
| 
 | |
/*
 * ZHEMCOPY_L - expand the lower triangle of a complex Hermitian matrix into
 * a full dense square.
 *
 * Complex values are stored as interleaved (re, im) pairs of FLOAT.
 *
 *   m   : order of the matrix; nothing is written when m <= 0
 *   a   : source, column major, complex element (r,c) at a[2*(r + c*lda)];
 *         only elements with r >= c are read, and the imaginary part of the
 *         diagonal is never read
 *   lda : column stride of a, in complex elements
 *   b   : destination, column major with column stride m complex elements;
 *         all m*m complex elements are written:
 *           r >  c : b(r,c) = a(r,c)
 *           r == c : b(r,r) = (re a(r,r), 0)
 *           r <  c : b(r,c) = conj(a(c,r))
 *
 * Columns are handled two at a time in 2x2 complex tiles.  Addresses are
 * derived from indices so no pointer is formed outside existing columns.
 */
static __inline void ZHEMCOPY_L(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){
  BLASLONG i, j;
  const BLASLONG ldb = 2 * m;            /* column stride of b, in FLOATs */
  FLOAT x0r, x0i, x1r, x1i;              /* column j,   rows r and r+1    */
  FLOAT y0r, y0i, y1r, y1i;              /* column j+1, rows r and r+1    */

  lda *= 2;                              /* column stride of a, in FLOATs */

  for (j = 0; j + 1 < m; j += 2){
    const FLOAT *s0 = a + j * lda + 2 * j;   /* a(j, j)   */
    const FLOAT *s1 = s0 + lda;              /* a(j, j+1) */
    FLOAT *d0 = b + j * ldb + 2 * j;         /* b(j, j)   */
    FLOAT *d1 = d0 + ldb;                    /* b(j, j+1) */

    /* 2x2 diagonal tile: diagonal imaginary parts are forced to zero */
    x0r = s0[0];
    x1r = s0[2];  x1i = s0[3];
    y1r = s1[2];

    d0[0] = x0r;  d0[1] = 0.;
    d0[2] = x1r;  d0[3] = x1i;
    d1[0] = x1r;  d1[1] = -x1i;
    d1[2] = y1r;  d1[3] = 0.;

    /* full 2x2 tiles below the diagonal; i is the row offset from j */
    for (i = 2; i + 1 < m - j; i += 2){
      const FLOAT *p0 = s0 + 2 * i;
      const FLOAT *p1 = s1 + 2 * i;
      FLOAT *q0 = d0 + 2 * i;
      FLOAT *q1 = d1 + 2 * i;
      FLOAT *t0 = d0 + i * ldb;              /* b(j, j+i)   */
      FLOAT *t1 = t0 + ldb;                  /* b(j, j+i+1) */

      x0r = p0[0];  x0i = p0[1];
      x1r = p0[2];  x1i = p0[3];
      y0r = p1[0];  y0i = p1[1];
      y1r = p1[2];  y1i = p1[3];

      /* in-place copy keeps the stored values */
      q0[0] = x0r;  q0[1] = x0i;
      q0[2] = x1r;  q0[3] = x1i;
      q1[0] = y0r;  q1[1] = y0i;
      q1[2] = y1r;  q1[3] = y1i;

      /* mirrored copy is conjugated */
      t0[0] = x0r;  t0[1] = -x0i;
      t0[2] = y0r;  t0[3] = -y0i;
      t1[0] = x1r;  t1[1] = -x1i;
      t1[2] = y1r;  t1[3] = -y1i;
    }

    /* one leftover row when m is odd */
    if (i < m - j){
      FLOAT *t0 = d0 + i * ldb;              /* b(j, j+i) */

      x0r = s0[2 * i];  x0i = s0[2 * i + 1];
      y0r = s1[2 * i];  y0i = s1[2 * i + 1];

      d0[2 * i] = x0r;  d0[2 * i + 1] = x0i;
      d1[2 * i] = y0r;  d1[2 * i + 1] = y0i;

      t0[0] = x0r;  t0[1] = -x0i;
      t0[2] = y0r;  t0[3] = -y0i;
    }
  }

  /* m odd: the last column consists of its diagonal element only */
  if (j < m){
    b[j * ldb + 2 * j]     = a[j * lda + 2 * j];
    b[j * ldb + 2 * j + 1] = 0.;
  }
}
 | |
| 
 | |
/*
 * ZHEMCOPY_U - expand the upper triangle of a complex Hermitian matrix into
 * a full dense square.
 *
 * Complex values are stored as interleaved (re, im) pairs of FLOAT.
 *
 *   m   : order of the matrix; nothing is written when m <= 0
 *   a   : source, column major, complex element (r,c) at a[2*(r + c*lda)];
 *         only elements with r <= c are read, and the imaginary part of the
 *         diagonal is never read
 *   lda : column stride of a, in complex elements
 *   b   : destination, column major with column stride m complex elements;
 *         all m*m complex elements are written:
 *           r <  c : b(r,c) = a(r,c)
 *           r == c : b(r,r) = (re a(r,r), 0)
 *           r >  c : b(r,c) = conj(a(c,r))
 *
 * Columns are handled two at a time in 2x2 complex tiles.  Addresses are
 * derived from indices so no pointer is formed outside existing columns.
 */
static __inline void ZHEMCOPY_U(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){
  BLASLONG i, j;
  const BLASLONG ldb = 2 * m;            /* column stride of b, in FLOATs */
  FLOAT x0r, x0i, x1r, x1i;              /* column j,   rows r and r+1    */
  FLOAT y0r, y0i, y1r, y1i;              /* column j+1, rows r and r+1    */

  lda *= 2;                              /* column stride of a, in FLOATs */

  for (j = 0; j + 1 < m; j += 2){
    const FLOAT *s0 = a + j * lda;           /* a(0, j)   */
    const FLOAT *s1 = s0 + lda;              /* a(0, j+1) */
    FLOAT *d0 = b + j * ldb;                 /* b(0, j)   */
    FLOAT *d1 = d0 + ldb;                    /* b(0, j+1) */

    /* full 2x2 tiles above the diagonal; i is the row index */
    for (i = 0; i < j; i += 2){
      const FLOAT *p0 = s0 + 2 * i;
      const FLOAT *p1 = s1 + 2 * i;
      FLOAT *q0 = d0 + 2 * i;
      FLOAT *q1 = d1 + 2 * i;
      FLOAT *t0 = b + i * ldb + 2 * j;       /* b(j, i)   */
      FLOAT *t1 = t0 + ldb;                  /* b(j, i+1) */

      x0r = p0[0];  x0i = p0[1];
      x1r = p0[2];  x1i = p0[3];
      y0r = p1[0];  y0i = p1[1];
      y1r = p1[2];  y1i = p1[3];

      /* in-place copy keeps the stored values */
      q0[0] = x0r;  q0[1] = x0i;
      q0[2] = x1r;  q0[3] = x1i;
      q1[0] = y0r;  q1[1] = y0i;
      q1[2] = y1r;  q1[3] = y1i;

      /* mirrored copy is conjugated */
      t0[0] = x0r;  t0[1] = -x0i;
      t0[2] = y0r;  t0[3] = -y0i;
      t1[0] = x1r;  t1[1] = -x1i;
      t1[2] = y1r;  t1[3] = -y1i;
    }

    /* 2x2 diagonal tile: diagonal imaginary parts are forced to zero */
    x0r = s0[2 * j];
    y0r = s1[2 * j];      y0i = s1[2 * j + 1];
    y1r = s1[2 * j + 2];

    d0[2 * j]     = x0r;  d0[2 * j + 1] = 0.;
    d0[2 * j + 2] = y0r;  d0[2 * j + 3] = -y0i;
    d1[2 * j]     = y0r;  d1[2 * j + 1] = y0i;
    d1[2 * j + 2] = y1r;  d1[2 * j + 3] = 0.;
  }

  /* m odd: the last column is a single column of height m */
  if (j < m){
    const FLOAT *s0 = a + j * lda;           /* a(0, j) */
    FLOAT *d0 = b + j * ldb;                 /* b(0, j) */

    for (i = 0; i < j; i += 2){
      FLOAT *t0 = b + i * ldb + 2 * j;       /* b(j, i)   */
      FLOAT *t1 = t0 + ldb;                  /* b(j, i+1) */

      x0r = s0[2 * i];      x0i = s0[2 * i + 1];
      x1r = s0[2 * i + 2];  x1i = s0[2 * i + 3];

      d0[2 * i]     = x0r;  d0[2 * i + 1] = x0i;
      d0[2 * i + 2] = x1r;  d0[2 * i + 3] = x1i;

      t0[0] = x0r;  t0[1] = -x0i;
      t1[0] = x1r;  t1[1] = -x1i;
    }

    d0[2 * j]     = s0[2 * j];
    d0[2 * j + 1] = 0.;
  }
}
 | |
| 
 | |
| 
 | |
/*
 * ZHEMCOPY_M - expand the lower triangle of a complex Hermitian matrix into
 * a full dense square holding the conjugate of the matrix (equivalently, its
 * transpose).
 *
 * Complex values are stored as interleaved (re, im) pairs of FLOAT.
 *
 *   m   : order of the matrix; nothing is written when m <= 0
 *   a   : source, column major, complex element (r,c) at a[2*(r + c*lda)];
 *         only elements with r >= c are read, and the imaginary part of the
 *         diagonal is never read
 *   lda : column stride of a, in complex elements
 *   b   : destination, column major with column stride m complex elements;
 *         all m*m complex elements are written:
 *           r >  c : b(r,c) = conj(a(r,c))
 *           r == c : b(r,r) = (re a(r,r), 0)
 *           r <  c : b(r,c) = a(c,r)
 *
 * Columns are handled two at a time in 2x2 complex tiles.  Addresses are
 * derived from indices so no pointer is formed outside existing columns.
 */
static __inline void ZHEMCOPY_M(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){
  BLASLONG i, j;
  const BLASLONG ldb = 2 * m;            /* column stride of b, in FLOATs */
  FLOAT x0r, x0i, x1r, x1i;              /* column j,   rows r and r+1    */
  FLOAT y0r, y0i, y1r, y1i;              /* column j+1, rows r and r+1    */

  lda *= 2;                              /* column stride of a, in FLOATs */

  for (j = 0; j + 1 < m; j += 2){
    const FLOAT *s0 = a + j * lda + 2 * j;   /* a(j, j)   */
    const FLOAT *s1 = s0 + lda;              /* a(j, j+1) */
    FLOAT *d0 = b + j * ldb + 2 * j;         /* b(j, j)   */
    FLOAT *d1 = d0 + ldb;                    /* b(j, j+1) */

    /* 2x2 diagonal tile: diagonal imaginary parts are forced to zero */
    x0r = s0[0];
    x1r = s0[2];  x1i = s0[3];
    y1r = s1[2];

    d0[0] = x0r;  d0[1] = 0.;
    d0[2] = x1r;  d0[3] = -x1i;
    d1[0] = x1r;  d1[1] = x1i;
    d1[2] = y1r;  d1[3] = 0.;

    /* full 2x2 tiles below the diagonal; i is the row offset from j */
    for (i = 2; i + 1 < m - j; i += 2){
      const FLOAT *p0 = s0 + 2 * i;
      const FLOAT *p1 = s1 + 2 * i;
      FLOAT *q0 = d0 + 2 * i;
      FLOAT *q1 = d1 + 2 * i;
      FLOAT *t0 = d0 + i * ldb;              /* b(j, j+i)   */
      FLOAT *t1 = t0 + ldb;                  /* b(j, j+i+1) */

      x0r = p0[0];  x0i = p0[1];
      x1r = p0[2];  x1i = p0[3];
      y0r = p1[0];  y0i = p1[1];
      y1r = p1[2];  y1i = p1[3];

      /* in-place copy is conjugated */
      q0[0] = x0r;  q0[1] = -x0i;
      q0[2] = x1r;  q0[3] = -x1i;
      q1[0] = y0r;  q1[1] = -y0i;
      q1[2] = y1r;  q1[3] = -y1i;

      /* mirrored copy keeps the stored values */
      t0[0] = x0r;  t0[1] = x0i;
      t0[2] = y0r;  t0[3] = y0i;
      t1[0] = x1r;  t1[1] = x1i;
      t1[2] = y1r;  t1[3] = y1i;
    }

    /* one leftover row when m is odd */
    if (i < m - j){
      FLOAT *t0 = d0 + i * ldb;              /* b(j, j+i) */

      x0r = s0[2 * i];  x0i = s0[2 * i + 1];
      y0r = s1[2 * i];  y0i = s1[2 * i + 1];

      d0[2 * i] = x0r;  d0[2 * i + 1] = -x0i;
      d1[2 * i] = y0r;  d1[2 * i + 1] = -y0i;

      t0[0] = x0r;  t0[1] = x0i;
      t0[2] = y0r;  t0[3] = y0i;
    }
  }

  /* m odd: the last column consists of its diagonal element only */
  if (j < m){
    b[j * ldb + 2 * j]     = a[j * lda + 2 * j];
    b[j * ldb + 2 * j + 1] = 0.;
  }
}
 | |
| 
 | |
/*
 * ZHEMCOPY_V - expand the upper triangle of a complex Hermitian matrix into
 * a full dense square holding the conjugate of the matrix (equivalently, its
 * transpose).
 *
 * Complex values are stored as interleaved (re, im) pairs of FLOAT.
 *
 *   m   : order of the matrix; nothing is written when m <= 0
 *   a   : source, column major, complex element (r,c) at a[2*(r + c*lda)];
 *         only elements with r <= c are read, and the imaginary part of the
 *         diagonal is never read
 *   lda : column stride of a, in complex elements
 *   b   : destination, column major with column stride m complex elements;
 *         all m*m complex elements are written:
 *           r <  c : b(r,c) = conj(a(r,c))
 *           r == c : b(r,r) = (re a(r,r), 0)
 *           r >  c : b(r,c) = a(c,r)
 *
 * Columns are handled two at a time in 2x2 complex tiles.  Addresses are
 * derived from indices so no pointer is formed outside existing columns.
 */
static __inline void ZHEMCOPY_V(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){
  BLASLONG i, j;
  const BLASLONG ldb = 2 * m;            /* column stride of b, in FLOATs */
  FLOAT x0r, x0i, x1r, x1i;              /* column j,   rows r and r+1    */
  FLOAT y0r, y0i, y1r, y1i;              /* column j+1, rows r and r+1    */

  lda *= 2;                              /* column stride of a, in FLOATs */

  for (j = 0; j + 1 < m; j += 2){
    const FLOAT *s0 = a + j * lda;           /* a(0, j)   */
    const FLOAT *s1 = s0 + lda;              /* a(0, j+1) */
    FLOAT *d0 = b + j * ldb;                 /* b(0, j)   */
    FLOAT *d1 = d0 + ldb;                    /* b(0, j+1) */

    /* full 2x2 tiles above the diagonal; i is the row index */
    for (i = 0; i < j; i += 2){
      const FLOAT *p0 = s0 + 2 * i;
      const FLOAT *p1 = s1 + 2 * i;
      FLOAT *q0 = d0 + 2 * i;
      FLOAT *q1 = d1 + 2 * i;
      FLOAT *t0 = b + i * ldb + 2 * j;       /* b(j, i)   */
      FLOAT *t1 = t0 + ldb;                  /* b(j, i+1) */

      x0r = p0[0];  x0i = p0[1];
      x1r = p0[2];  x1i = p0[3];
      y0r = p1[0];  y0i = p1[1];
      y1r = p1[2];  y1i = p1[3];

      /* in-place copy is conjugated */
      q0[0] = x0r;  q0[1] = -x0i;
      q0[2] = x1r;  q0[3] = -x1i;
      q1[0] = y0r;  q1[1] = -y0i;
      q1[2] = y1r;  q1[3] = -y1i;

      /* mirrored copy keeps the stored values */
      t0[0] = x0r;  t0[1] = x0i;
      t0[2] = y0r;  t0[3] = y0i;
      t1[0] = x1r;  t1[1] = x1i;
      t1[2] = y1r;  t1[3] = y1i;
    }

    /* 2x2 diagonal tile: diagonal imaginary parts are forced to zero */
    x0r = s0[2 * j];
    y0r = s1[2 * j];      y0i = s1[2 * j + 1];
    y1r = s1[2 * j + 2];

    d0[2 * j]     = x0r;  d0[2 * j + 1] = 0.;
    d0[2 * j + 2] = y0r;  d0[2 * j + 3] = y0i;
    d1[2 * j]     = y0r;  d1[2 * j + 1] = -y0i;
    d1[2 * j + 2] = y1r;  d1[2 * j + 3] = 0.;
  }

  /* m odd: the last column is a single column of height m */
  if (j < m){
    const FLOAT *s0 = a + j * lda;           /* a(0, j) */
    FLOAT *d0 = b + j * ldb;                 /* b(0, j) */

    for (i = 0; i < j; i += 2){
      FLOAT *t0 = b + i * ldb + 2 * j;       /* b(j, i)   */
      FLOAT *t1 = t0 + ldb;                  /* b(j, i+1) */

      x0r = s0[2 * i];      x0i = s0[2 * i + 1];
      x1r = s0[2 * i + 2];  x1i = s0[2 * i + 3];

      d0[2 * i]     = x0r;  d0[2 * i + 1] = -x0i;
      d0[2 * i + 2] = x1r;  d0[2 * i + 3] = -x1i;

      t0[0] = x0r;  t0[1] = x0i;
      t1[0] = x1r;  t1[1] = x1i;
    }

    d0[2 * j]     = s0[2 * j];
    d0[2 * j + 1] = 0.;
  }
}
 | |
| 
 | |
| 
 | |
/*
 * TRMCOPY_NL - copy the lower triangle of a real m x m matrix into a dense
 * square, mirroring it across the diagonal.
 *
 *   m   : order of the matrix; nothing is written when m <= 0
 *   a   : source, column major, element (r,c) at a[r + c*lda];
 *         only elements with r >= c are read
 *   lda : column stride of a, in elements
 *   b   : destination, column major with column stride m; every one of the
 *         m*m elements is written so that
 *         b(r,c) = b(c,r) = a(max(r,c), min(r,c))
 *
 * NOTE(review): despite the TRM prefix, the strict upper part of b is filled
 * with the mirrored lower values rather than left alone or zeroed, i.e. the
 * result is a symmetric fill.  That behaviour is preserved here as-is;
 * confirm against the callers whether it is intended.
 *
 * Columns are handled two at a time in 2x2 tiles.  Addresses are derived
 * from indices so no pointer is formed outside existing columns.
 */
static __inline void TRMCOPY_NL(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){
  BLASLONG i, j;
  FLOAT a11, a21, a12, a22;

  for (j = 0; j + 1 < m; j += 2){
    const FLOAT *s0 = a + j * lda + j;   /* a(j, j)   */
    const FLOAT *s1 = s0 + lda;          /* a(j, j+1) */
    FLOAT *d0 = b + j * m + j;           /* b(j, j)   */
    FLOAT *d1 = d0 + m;                  /* b(j, j+1) */
    FLOAT *t;

    /* 2x2 diagonal tile: a(j, j+1) is not read, a(j+1, j) is used twice */
    a11 = s0[0];
    a21 = s0[1];
    a22 = s1[1];

    d0[0] = a11;
    d0[1] = a21;
    d1[0] = a21;
    d1[1] = a22;

    /* full 2x2 tiles below the diagonal; i is the row offset from j */
    for (i = 2; i + 1 < m - j; i += 2){
      a11 = s0[i];
      a21 = s0[i + 1];
      a12 = s1[i];
      a22 = s1[i + 1];

      d0[i]     = a11;
      d0[i + 1] = a21;
      d1[i]     = a12;
      d1[i + 1] = a22;

      t = d0 + i * m;                    /* b(j, j+i): transposed copy */
      t[0]     = a11;
      t[1]     = a12;
      t[m]     = a21;
      t[m + 1] = a22;
    }

    /* one leftover row when m is odd */
    if (i < m - j){
      a11 = s0[i];
      a12 = s1[i];

      d0[i] = a11;
      d1[i] = a12;

      t = d0 + i * m;
      t[0] = a11;
      t[1] = a12;
    }
  }

  /* m odd: the last column consists of its diagonal element only */
  if (j < m){
    b[j * m + j] = a[j * lda + j];
  }
}
 | |
| 
 | |
/*
 * TRMCOPY_TL - copy the lower triangle of a real m x m matrix into a dense
 * square, mirroring it across the diagonal.
 *
 *   m   : order of the matrix; nothing is written when m <= 0
 *   a   : source, column major, element (r,c) at a[r + c*lda];
 *         only elements with r >= c are read
 *   lda : column stride of a, in elements
 *   b   : destination, column major with column stride m; every one of the
 *         m*m elements is written so that
 *         b(r,c) = b(c,r) = a(max(r,c), min(r,c))
 *
 * NOTE(review): the output is a symmetric fill, so it is its own transpose
 * and nothing in the body distinguishes a "T" variant from an "N" one.
 * That behaviour is preserved here as-is; confirm against the callers
 * whether it is intended.
 *
 * Columns are handled two at a time in 2x2 tiles.  Addresses are derived
 * from indices so no pointer is formed outside existing columns.
 */
static __inline void TRMCOPY_TL(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){
  BLASLONG i, j;
  FLOAT a11, a21, a12, a22;

  for (j = 0; j + 1 < m; j += 2){
    const FLOAT *s0 = a + j * lda + j;   /* a(j, j)   */
    const FLOAT *s1 = s0 + lda;          /* a(j, j+1) */
    FLOAT *d0 = b + j * m + j;           /* b(j, j)   */
    FLOAT *d1 = d0 + m;                  /* b(j, j+1) */
    FLOAT *t;

    /* 2x2 diagonal tile: a(j, j+1) is not read, a(j+1, j) is used twice */
    a11 = s0[0];
    a21 = s0[1];
    a22 = s1[1];

    d0[0] = a11;
    d0[1] = a21;
    d1[0] = a21;
    d1[1] = a22;

    /* full 2x2 tiles below the diagonal; i is the row offset from j */
    for (i = 2; i + 1 < m - j; i += 2){
      a11 = s0[i];
      a21 = s0[i + 1];
      a12 = s1[i];
      a22 = s1[i + 1];

      d0[i]     = a11;
      d0[i + 1] = a21;
      d1[i]     = a12;
      d1[i + 1] = a22;

      t = d0 + i * m;                    /* b(j, j+i): transposed copy */
      t[0]     = a11;
      t[1]     = a12;
      t[m]     = a21;
      t[m + 1] = a22;
    }

    /* one leftover row when m is odd */
    if (i < m - j){
      a11 = s0[i];
      a12 = s1[i];

      d0[i] = a11;
      d1[i] = a12;

      t = d0 + i * m;
      t[0] = a11;
      t[1] = a12;
    }
  }

  /* m odd: the last column consists of its diagonal element only */
  if (j < m){
    b[j * m + j] = a[j * lda + j];
  }
}
 | |
| 
 | |
/*
 * TRMCOPY_NU - copy the upper triangle of a real m x m matrix into a dense
 * square, mirroring it across the diagonal.
 *
 *   m   : order of the matrix; nothing is written when m <= 0
 *   a   : source, column major, element (r,c) at a[r + c*lda];
 *         only elements with r <= c are read
 *   lda : column stride of a, in elements
 *   b   : destination, column major with column stride m; every one of the
 *         m*m elements is written so that
 *         b(r,c) = b(c,r) = a(min(r,c), max(r,c))
 *
 * NOTE(review): despite the TRM prefix, the strict lower part of b is filled
 * with the mirrored upper values rather than left alone or zeroed, i.e. the
 * result is a symmetric fill.  That behaviour is preserved here as-is;
 * confirm against the callers whether it is intended.
 *
 * Columns are handled two at a time in 2x2 tiles.  Addresses are derived
 * from indices so no pointer is formed outside existing columns.
 */
static __inline void TRMCOPY_NU(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){
  BLASLONG i, j;
  FLOAT a11, a21, a12, a22;

  for (j = 0; j + 1 < m; j += 2){
    const FLOAT *s0 = a + j * lda;       /* a(0, j)   */
    const FLOAT *s1 = s0 + lda;          /* a(0, j+1) */
    FLOAT *d0 = b + j * m;               /* b(0, j)   */
    FLOAT *d1 = d0 + m;                  /* b(0, j+1) */
    FLOAT *t;

    /* full 2x2 tiles above the diagonal; i is the row index */
    for (i = 0; i < j; i += 2){
      a11 = s0[i];
      a21 = s0[i + 1];
      a12 = s1[i];
      a22 = s1[i + 1];

      d0[i]     = a11;
      d0[i + 1] = a21;
      d1[i]     = a12;
      d1[i + 1] = a22;

      t = b + i * m + j;                 /* b(j, i): transposed copy */
      t[0]     = a11;
      t[1]     = a12;
      t[m]     = a21;
      t[m + 1] = a22;
    }

    /* 2x2 diagonal tile: a(j+1, j) is not read, a(j, j+1) is used twice */
    a11 = s0[j];
    a12 = s1[j];
    a22 = s1[j + 1];

    d0[j]     = a11;
    d0[j + 1] = a12;
    d1[j]     = a12;
    d1[j + 1] = a22;
  }

  /* m odd: the last column is a single column of height m */
  if (j < m){
    const FLOAT *s0 = a + j * lda;       /* a(0, j) */
    FLOAT *d0 = b + j * m;               /* b(0, j) */

    for (i = 0; i < j; i += 2){
      a11 = s0[i];
      a21 = s0[i + 1];

      d0[i]     = a11;
      d0[i + 1] = a21;

      b[i * m + j]       = a11;          /* b(j, i)   */
      b[(i + 1) * m + j] = a21;          /* b(j, i+1) */
    }

    d0[j] = s0[j];
  }
}
 | |
| 
 | |
/*
 * TRMCOPY_TU - copy the upper triangle of a real m x m matrix into a dense
 * square, mirroring it across the diagonal.
 *
 *   m   : order of the matrix; nothing is written when m <= 0
 *   a   : source, column major, element (r,c) at a[r + c*lda];
 *         only elements with r <= c are read
 *   lda : column stride of a, in elements
 *   b   : destination, column major with column stride m; every one of the
 *         m*m elements is written so that
 *         b(r,c) = b(c,r) = a(min(r,c), max(r,c))
 *
 * NOTE(review): the output is a symmetric fill, so it is its own transpose
 * and nothing in the body distinguishes a "T" variant from an "N" one.
 * That behaviour is preserved here as-is; confirm against the callers
 * whether it is intended.
 *
 * Columns are handled two at a time in 2x2 tiles.  Addresses are derived
 * from indices so no pointer is formed outside existing columns.
 */
static __inline void TRMCOPY_TU(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){
  BLASLONG i, j;
  FLOAT a11, a21, a12, a22;

  for (j = 0; j + 1 < m; j += 2){
    const FLOAT *s0 = a + j * lda;       /* a(0, j)   */
    const FLOAT *s1 = s0 + lda;          /* a(0, j+1) */
    FLOAT *d0 = b + j * m;               /* b(0, j)   */
    FLOAT *d1 = d0 + m;                  /* b(0, j+1) */
    FLOAT *t;

    /* full 2x2 tiles above the diagonal; i is the row index */
    for (i = 0; i < j; i += 2){
      a11 = s0[i];
      a21 = s0[i + 1];
      a12 = s1[i];
      a22 = s1[i + 1];

      d0[i]     = a11;
      d0[i + 1] = a21;
      d1[i]     = a12;
      d1[i + 1] = a22;

      t = b + i * m + j;                 /* b(j, i): transposed copy */
      t[0]     = a11;
      t[1]     = a12;
      t[m]     = a21;
      t[m + 1] = a22;
    }

    /* 2x2 diagonal tile: a(j+1, j) is not read, a(j, j+1) is used twice */
    a11 = s0[j];
    a12 = s1[j];
    a22 = s1[j + 1];

    d0[j]     = a11;
    d0[j + 1] = a12;
    d1[j]     = a12;
    d1[j + 1] = a22;
  }

  /* m odd: the last column is a single column of height m */
  if (j < m){
    const FLOAT *s0 = a + j * lda;       /* a(0, j) */
    FLOAT *d0 = b + j * m;               /* b(0, j) */

    for (i = 0; i < j; i += 2){
      a11 = s0[i];
      a21 = s0[i + 1];

      d0[i]     = a11;
      d0[i + 1] = a21;

      b[i * m + j]       = a11;          /* b(j, i)   */
      b[(i + 1) * m + j] = a21;          /* b(j, i+1) */
    }

    d0[j] = s0[j];
  }
}
 | |
| 
 | |
| static __inline void ZTRMCOPY_NL(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){
 | |
|   BLASLONG is, js;
 | |
| 
 | |
|   FLOAT *aa1, *aa2;
 | |
|   FLOAT *b1, *b2;
 | |
|   FLOAT *bb1, *bb2;
 | |
|   FLOAT *cc1, *cc2;
 | |
|   FLOAT a11, a21, a31, a41;
 | |
|   FLOAT a12, a22, a32, a42;
 | |
| 
 | |
|   b1 = b;
 | |
|   b2 = b;
 | |
| 
 | |
|   lda *= 2;
 | |
| 
 | |
|   for (js = 0; js < m; js += 2){
 | |
| 
 | |
|     aa1 = a + 0 * lda;
 | |
|     aa2 = a + 1 * lda;
 | |
|     a  += 2 * lda + 4;
 | |
| 
 | |
|     bb1 = b1 + 0 * m;
 | |
|     bb2 = b1 + 2 * m;
 | |
|     b1 += 4 * m + 4;
 | |
| 
 | |
|     cc1 = b2 + 0 * m;
 | |
|     cc2 = b2 + 2 * m;
 | |
|     b2 += 4 * m + 4;
 | |
| 
 | |
|     if (m - js >= 2){
 | |
| 
 | |
|       a11 = *(aa1 + 0);
 | |
|       a21 = *(aa1 + 1);
 | |
|       a31 = *(aa1 + 2);
 | |
|       a41 = *(aa1 + 3);
 | |
| 
 | |
|       a12 = *(aa2 + 2);
 | |
|       a22 = *(aa2 + 3);
 | |
| 
 | |
|       *(bb1 + 0) = a11;
 | |
|       *(bb1 + 1) = a21;
 | |
|       *(bb1 + 2) = a31;
 | |
|       *(bb1 + 3) = a41;
 | |
| 
 | |
|       *(bb2 + 0) = a31;
 | |
|       *(bb2 + 1) = a41;
 | |
|       *(bb2 + 2) = a12;
 | |
|       *(bb2 + 3) = a22;
 | |
| 
 | |
|       aa1 += 4;
 | |
|       aa2 += 4;
 | |
|       bb1 += 4;
 | |
|       bb2 += 4;
 | |
| 
 | |
|       cc1 += 4 * m;
 | |
|       cc2 += 4 * m;
 | |
| 
 | |
|       is = ((m - js - 2) >> 1);
 | |
| 
 | |
|       while (is > 0){
 | |
| 	a11 = *(aa1 + 0);
 | |
| 	a21 = *(aa1 + 1);
 | |
| 	a31 = *(aa1 + 2);
 | |
| 	a41 = *(aa1 + 3);
 | |
| 
 | |
| 	a12 = *(aa2 + 0);
 | |
| 	a22 = *(aa2 + 1);
 | |
| 	a32 = *(aa2 + 2);
 | |
| 	a42 = *(aa2 + 3);
 | |
| 
 | |
| 	aa1 += 4;
 | |
| 	aa2 += 4;
 | |
| 
 | |
| 	*(bb1 + 0) = a11;
 | |
| 	*(bb1 + 1) = a21;
 | |
| 	*(bb1 + 2) = a31;
 | |
| 	*(bb1 + 3) = a41;
 | |
| 
 | |
| 	*(bb2 + 0) = a12;
 | |
| 	*(bb2 + 1) = a22;
 | |
| 	*(bb2 + 2) = a32;
 | |
| 	*(bb2 + 3) = a42;
 | |
| 
 | |
| 	*(cc1 + 0) = a11;
 | |
| 	*(cc1 + 1) = a21;
 | |
| 	*(cc1 + 2) = a12;
 | |
| 	*(cc1 + 3) = a22;
 | |
| 
 | |
| 	*(cc2 + 0) = a31;
 | |
| 	*(cc2 + 1) = a41;
 | |
| 	*(cc2 + 2) = a32;
 | |
| 	*(cc2 + 3) = a42;
 | |
| 
 | |
| 	bb1 += 4;
 | |
| 	bb2 += 4;
 | |
| 
 | |
| 	cc1 += 4 * m;
 | |
| 	cc2 += 4 * m;
 | |
| 
 | |
| 	is --;
 | |
|       }
 | |
| 
 | |
|       if (m & 1){
 | |
| 	a11 = *(aa1 + 0);
 | |
| 	a21 = *(aa1 + 1);
 | |
| 	a12 = *(aa2 + 0);
 | |
| 	a22 = *(aa2 + 1);
 | |
| 
 | |
| 	*(bb1 + 0) = a11;
 | |
| 	*(bb1 + 1) = a21;
 | |
| 	*(bb2 + 0) = a12;
 | |
| 	*(bb2 + 1) = a22;
 | |
| 
 | |
| 	*(cc1 + 0) = a11;
 | |
| 	*(cc1 + 1) = a21;
 | |
| 	*(cc1 + 2) = a12;
 | |
| 	*(cc1 + 3) = a22;
 | |
|       }
 | |
|     }
 | |
| 
 | |
|     if (m - js == 1){
 | |
|       a11 = *(aa1 + 0);
 | |
|       a21 = *(aa1 + 1);
 | |
|       *(bb1 + 0) = a11;
 | |
|       *(bb1 + 1) = a21;
 | |
|     }
 | |
| 
 | |
|   }
 | |
| }
 | |
| 
 | |
| static __inline void ZTRMCOPY_TL(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){
 | |
|   BLASLONG is, js;
 | |
| 
 | |
|   FLOAT *aa1, *aa2;
 | |
|   FLOAT *b1, *b2;
 | |
|   FLOAT *bb1, *bb2;
 | |
|   FLOAT *cc1, *cc2;
 | |
|   FLOAT a11, a21, a31, a41;
 | |
|   FLOAT a12, a22, a32, a42;
 | |
| 
 | |
|   b1 = b;
 | |
|   b2 = b;
 | |
| 
 | |
|   lda *= 2;
 | |
| 
 | |
|   for (js = 0; js < m; js += 2){
 | |
| 
 | |
|     aa1 = a + 0 * lda;
 | |
|     aa2 = a + 1 * lda;
 | |
|     a  += 2 * lda + 4;
 | |
| 
 | |
|     bb1 = b1 + 0 * m;
 | |
|     bb2 = b1 + 2 * m;
 | |
|     b1 += 4 * m + 4;
 | |
| 
 | |
|     cc1 = b2 + 0 * m;
 | |
|     cc2 = b2 + 2 * m;
 | |
|     b2 += 4 * m + 4;
 | |
| 
 | |
|     if (m - js >= 2){
 | |
| 
 | |
|       a11 = *(aa1 + 0);
 | |
|       a21 = *(aa1 + 1);
 | |
|       a31 = *(aa1 + 2);
 | |
|       a41 = *(aa1 + 3);
 | |
| 
 | |
|       a12 = *(aa2 + 2);
 | |
|       a22 = *(aa2 + 3);
 | |
| 
 | |
|       *(bb1 + 0) = a11;
 | |
|       *(bb1 + 1) = a21;
 | |
|       *(bb1 + 2) = a31;
 | |
|       *(bb1 + 3) = a41;
 | |
| 
 | |
|       *(bb2 + 0) = a31;
 | |
|       *(bb2 + 1) = a41;
 | |
|       *(bb2 + 2) = a12;
 | |
|       *(bb2 + 3) = a22;
 | |
| 
 | |
|       aa1 += 4;
 | |
|       aa2 += 4;
 | |
|       bb1 += 4;
 | |
|       bb2 += 4;
 | |
| 
 | |
|       cc1 += 4 * m;
 | |
|       cc2 += 4 * m;
 | |
| 
 | |
|       is = ((m - js - 2) >> 1);
 | |
| 
 | |
|       while (is > 0){
 | |
| 	a11 = *(aa1 + 0);
 | |
| 	a21 = *(aa1 + 1);
 | |
| 	a31 = *(aa1 + 2);
 | |
| 	a41 = *(aa1 + 3);
 | |
| 
 | |
| 	a12 = *(aa2 + 0);
 | |
| 	a22 = *(aa2 + 1);
 | |
| 	a32 = *(aa2 + 2);
 | |
| 	a42 = *(aa2 + 3);
 | |
| 
 | |
| 	aa1 += 4;
 | |
| 	aa2 += 4;
 | |
| 
 | |
| 	*(bb1 + 0) = a11;
 | |
| 	*(bb1 + 1) = a21;
 | |
| 	*(bb1 + 2) = a31;
 | |
| 	*(bb1 + 3) = a41;
 | |
| 
 | |
| 	*(bb2 + 0) = a12;
 | |
| 	*(bb2 + 1) = a22;
 | |
| 	*(bb2 + 2) = a32;
 | |
| 	*(bb2 + 3) = a42;
 | |
| 
 | |
| 	*(cc1 + 0) = a11;
 | |
| 	*(cc1 + 1) = a21;
 | |
| 	*(cc1 + 2) = a12;
 | |
| 	*(cc1 + 3) = a22;
 | |
| 
 | |
| 	*(cc2 + 0) = a31;
 | |
| 	*(cc2 + 1) = a41;
 | |
| 	*(cc2 + 2) = a32;
 | |
| 	*(cc2 + 3) = a42;
 | |
| 
 | |
| 	bb1 += 4;
 | |
| 	bb2 += 4;
 | |
| 
 | |
| 	cc1 += 4 * m;
 | |
| 	cc2 += 4 * m;
 | |
| 
 | |
| 	is --;
 | |
|       }
 | |
| 
 | |
|       if (m & 1){
 | |
| 	a11 = *(aa1 + 0);
 | |
| 	a21 = *(aa1 + 1);
 | |
| 	a12 = *(aa2 + 0);
 | |
| 	a22 = *(aa2 + 1);
 | |
| 
 | |
| 	*(bb1 + 0) = a11;
 | |
| 	*(bb1 + 1) = a21;
 | |
| 	*(bb2 + 0) = a12;
 | |
| 	*(bb2 + 1) = a22;
 | |
| 
 | |
| 	*(cc1 + 0) = a11;
 | |
| 	*(cc1 + 1) = a21;
 | |
| 	*(cc1 + 2) = a12;
 | |
| 	*(cc1 + 3) = a22;
 | |
|       }
 | |
|     }
 | |
| 
 | |
|     if (m - js == 1){
 | |
|       a11 = *(aa1 + 0);
 | |
|       a21 = *(aa1 + 1);
 | |
|       *(bb1 + 0) = a11;
 | |
|       *(bb1 + 1) = a21;
 | |
|     }
 | |
| 
 | |
|   }
 | |
| }
 | |
| 
 | |
| static __inline void ZTRMCOPY_NU(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){
 | |
|   BLASLONG is, js;
 | |
| 
 | |
|   FLOAT *aa1, *aa2;
 | |
|   FLOAT *b1, *b2;
 | |
|   FLOAT *bb1, *bb2;
 | |
|   FLOAT *cc1, *cc2;
 | |
|   FLOAT a11, a21, a31, a41;
 | |
|   FLOAT a12, a22, a32, a42;
 | |
| 
 | |
|   b1 = b;
 | |
|   b2 = b;
 | |
| 
 | |
|   lda *= 2;
 | |
| 
 | |
|   for (js = 0; js < m; js += 2){
 | |
| 
 | |
|     aa1 = a + 0 * lda;
 | |
|     aa2 = a + 1 * lda;
 | |
|     a  += 2 * lda;
 | |
| 
 | |
|     bb1 = b1 + 0 * m;
 | |
|     bb2 = b1 + 2 * m;
 | |
|     b1 += 4 * m;
 | |
| 
 | |
|     cc1 = b2 + 0 * m;
 | |
|     cc2 = b2 + 2 * m;
 | |
|     b2 += 4;
 | |
| 
 | |
|     if (m - js >= 2){
 | |
| 
 | |
|       for (is = 0; is < js; is += 2){
 | |
| 
 | |
| 	a11 = *(aa1 + 0);
 | |
| 	a21 = *(aa1 + 1);
 | |
| 	a31 = *(aa1 + 2);
 | |
| 	a41 = *(aa1 + 3);
 | |
| 
 | |
| 	a12 = *(aa2 + 0);
 | |
| 	a22 = *(aa2 + 1);
 | |
| 	a32 = *(aa2 + 2);
 | |
| 	a42 = *(aa2 + 3);
 | |
| 
 | |
| 	aa1 += 4;
 | |
| 	aa2 += 4;
 | |
| 
 | |
| 	*(bb1 + 0) = a11;
 | |
| 	*(bb1 + 1) = a21;
 | |
| 	*(bb1 + 2) = a31;
 | |
| 	*(bb1 + 3) = a41;
 | |
| 
 | |
| 	*(bb2 + 0) = a12;
 | |
| 	*(bb2 + 1) = a22;
 | |
| 	*(bb2 + 2) = a32;
 | |
| 	*(bb2 + 3) = a42;
 | |
| 
 | |
| 	*(cc1 + 0) = a11;
 | |
| 	*(cc1 + 1) = a21;
 | |
| 	*(cc1 + 2) = a12;
 | |
| 	*(cc1 + 3) = a22;
 | |
| 
 | |
| 	*(cc2 + 0) = a31;
 | |
| 	*(cc2 + 1) = a41;
 | |
| 	*(cc2 + 2) = a32;
 | |
| 	*(cc2 + 3) = a42;
 | |
| 
 | |
| 	bb1 += 4;
 | |
| 	bb2 += 4;
 | |
| 
 | |
| 	cc1 += 4 * m;
 | |
| 	cc2 += 4 * m;
 | |
|       }
 | |
| 
 | |
|       a11 = *(aa1 + 0);
 | |
|       a21 = *(aa1 + 1);
 | |
| 
 | |
|       a12 = *(aa2 + 0);
 | |
|       a22 = *(aa2 + 1);
 | |
|       a32 = *(aa2 + 2);
 | |
|       a42 = *(aa2 + 3);
 | |
| 
 | |
|       *(bb1 + 0) = a11;
 | |
|       *(bb1 + 1) = a21;
 | |
|       *(bb1 + 2) = a12;
 | |
|       *(bb1 + 3) = a22;
 | |
| 
 | |
|       *(bb2 + 0) = a12;
 | |
|       *(bb2 + 1) = a22;
 | |
|       *(bb2 + 2) = a32;
 | |
|       *(bb2 + 3) = a42;
 | |
|     }
 | |
| 
 | |
|     if (m - js == 1){
 | |
|       for (is = 0; is < js; is += 2){
 | |
| 
 | |
| 	a11 = *(aa1 + 0);
 | |
| 	a21 = *(aa1 + 1);
 | |
| 	a31 = *(aa1 + 2);
 | |
| 	a41 = *(aa1 + 3);
 | |
| 	aa1 += 4;
 | |
| 
 | |
| 	*(bb1 + 0) = a11;
 | |
| 	*(bb1 + 1) = a21;
 | |
| 	*(bb1 + 2) = a31;
 | |
| 	*(bb1 + 3) = a41;
 | |
| 
 | |
| 	*(cc1 + 0) = a11;
 | |
| 	*(cc1 + 1) = a21;
 | |
| 	*(cc2 + 0) = a31;
 | |
| 	*(cc2 + 1) = a41;
 | |
| 	bb1 += 4;
 | |
| 
 | |
| 	cc1 += 4 * m;
 | |
| 	cc2 += 4 * m;
 | |
|       }
 | |
| 
 | |
|       a11 = *(aa1 + 0);
 | |
|       a21 = *(aa1 + 1);
 | |
|       *(bb1 + 0) = a11;
 | |
|       *(bb1 + 1) = a21;
 | |
|     }
 | |
|   }
 | |
| }
 | |
| 
 | |
| static __inline void ZTRMCOPY_TU(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){
 | |
|   BLASLONG is, js;
 | |
| 
 | |
|   FLOAT *aa1, *aa2;
 | |
|   FLOAT *b1, *b2;
 | |
|   FLOAT *bb1, *bb2;
 | |
|   FLOAT *cc1, *cc2;
 | |
|   FLOAT a11, a21, a31, a41;
 | |
|   FLOAT a12, a22, a32, a42;
 | |
| 
 | |
|   b1 = b;
 | |
|   b2 = b;
 | |
| 
 | |
|   lda *= 2;
 | |
| 
 | |
|   for (js = 0; js < m; js += 2){
 | |
| 
 | |
|     aa1 = a + 0 * lda;
 | |
|     aa2 = a + 1 * lda;
 | |
|     a  += 2 * lda;
 | |
| 
 | |
|     bb1 = b1 + 0 * m;
 | |
|     bb2 = b1 + 2 * m;
 | |
|     b1 += 4 * m;
 | |
| 
 | |
|     cc1 = b2 + 0 * m;
 | |
|     cc2 = b2 + 2 * m;
 | |
|     b2 += 4;
 | |
| 
 | |
|     if (m - js >= 2){
 | |
| 
 | |
|       for (is = 0; is < js; is += 2){
 | |
| 
 | |
| 	a11 = *(aa1 + 0);
 | |
| 	a21 = *(aa1 + 1);
 | |
| 	a31 = *(aa1 + 2);
 | |
| 	a41 = *(aa1 + 3);
 | |
| 
 | |
| 	a12 = *(aa2 + 0);
 | |
| 	a22 = *(aa2 + 1);
 | |
| 	a32 = *(aa2 + 2);
 | |
| 	a42 = *(aa2 + 3);
 | |
| 
 | |
| 	aa1 += 4;
 | |
| 	aa2 += 4;
 | |
| 
 | |
| 	*(bb1 + 0) = a11;
 | |
| 	*(bb1 + 1) = a21;
 | |
| 	*(bb1 + 2) = a31;
 | |
| 	*(bb1 + 3) = a41;
 | |
| 
 | |
| 	*(bb2 + 0) = a12;
 | |
| 	*(bb2 + 1) = a22;
 | |
| 	*(bb2 + 2) = a32;
 | |
| 	*(bb2 + 3) = a42;
 | |
| 
 | |
| 	*(cc1 + 0) = a11;
 | |
| 	*(cc1 + 1) = a21;
 | |
| 	*(cc1 + 2) = a12;
 | |
| 	*(cc1 + 3) = a22;
 | |
| 
 | |
| 	*(cc2 + 0) = a31;
 | |
| 	*(cc2 + 1) = a41;
 | |
| 	*(cc2 + 2) = a32;
 | |
| 	*(cc2 + 3) = a42;
 | |
| 
 | |
| 	bb1 += 4;
 | |
| 	bb2 += 4;
 | |
| 
 | |
| 	cc1 += 4 * m;
 | |
| 	cc2 += 4 * m;
 | |
|       }
 | |
| 
 | |
|       a11 = *(aa1 + 0);
 | |
|       a21 = *(aa1 + 1);
 | |
| 
 | |
|       a12 = *(aa2 + 0);
 | |
|       a22 = *(aa2 + 1);
 | |
|       a32 = *(aa2 + 2);
 | |
|       a42 = *(aa2 + 3);
 | |
| 
 | |
|       *(bb1 + 0) = a11;
 | |
|       *(bb1 + 1) = a21;
 | |
|       *(bb1 + 2) = a12;
 | |
|       *(bb1 + 3) = a22;
 | |
| 
 | |
|       *(bb2 + 0) = a12;
 | |
|       *(bb2 + 1) = a22;
 | |
|       *(bb2 + 2) = a32;
 | |
|       *(bb2 + 3) = a42;
 | |
|     }
 | |
| 
 | |
|     if (m - js == 1){
 | |
|       for (is = 0; is < js; is += 2){
 | |
| 
 | |
| 	a11 = *(aa1 + 0);
 | |
| 	a21 = *(aa1 + 1);
 | |
| 	a31 = *(aa1 + 2);
 | |
| 	a41 = *(aa1 + 3);
 | |
| 	aa1 += 4;
 | |
| 
 | |
| 	*(bb1 + 0) = a11;
 | |
| 	*(bb1 + 1) = a21;
 | |
| 	*(bb1 + 2) = a31;
 | |
| 	*(bb1 + 3) = a41;
 | |
| 
 | |
| 	*(cc1 + 0) = a11;
 | |
| 	*(cc1 + 1) = a21;
 | |
| 	*(cc2 + 0) = a31;
 | |
| 	*(cc2 + 1) = a41;
 | |
| 	bb1 += 4;
 | |
| 
 | |
| 	cc1 += 4 * m;
 | |
| 	cc2 += 4 * m;
 | |
|       }
 | |
| 
 | |
|       a11 = *(aa1 + 0);
 | |
|       a21 = *(aa1 + 1);
 | |
|       *(bb1 + 0) = a11;
 | |
|       *(bb1 + 1) = a21;
 | |
|     }
 | |
|   }
 | |
| }
 | |
| 
 | |
| #endif
 | |
| #endif
 | |
| 
 |