786 lines
		
	
	
		
			16 KiB
		
	
	
	
		
			C
		
	
	
	
			
		
		
	
	
			786 lines
		
	
	
		
			16 KiB
		
	
	
	
		
			C
		
	
	
	
/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin.           */
/* All rights reserved.                                              */
/*                                                                   */
/* Redistribution and use in source and binary forms, with or        */
/* without modification, are permitted provided that the following   */
/* conditions are met:                                               */
/*                                                                   */
/*   1. Redistributions of source code must retain the above         */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer.                                                  */
/*                                                                   */
/*   2. Redistributions in binary form must reproduce the above      */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer in the documentation and/or other materials       */
/*      provided with the distribution.                              */
/*                                                                   */
/*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */
/*    AUSTIN  ``AS IS''  AND ANY  EXPRESS OR  IMPLIED WARRANTIES,    */
/*    INCLUDING, BUT  NOT LIMITED  TO, THE IMPLIED  WARRANTIES OF    */
/*    MERCHANTABILITY  AND FITNESS FOR  A PARTICULAR  PURPOSE ARE    */
/*    DISCLAIMED.  IN  NO EVENT SHALL THE UNIVERSITY  OF TEXAS AT    */
/*    AUSTIN OR CONTRIBUTORS BE  LIABLE FOR ANY DIRECT, INDIRECT,    */
/*    INCIDENTAL,  SPECIAL, EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES    */
/*    (INCLUDING, BUT  NOT LIMITED TO,  PROCUREMENT OF SUBSTITUTE    */
/*    GOODS  OR  SERVICES; LOSS  OF  USE,  DATA,  OR PROFITS;  OR    */
/*    BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF    */
/*    LIABILITY, WHETHER  IN CONTRACT, STRICT  LIABILITY, OR TORT    */
/*    (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY OUT    */
/*    OF  THE  USE OF  THIS  SOFTWARE,  EVEN  IF ADVISED  OF  THE    */
/*    POSSIBILITY OF SUCH DAMAGE.                                    */
/*                                                                   */
/* The views and conclusions contained in the software and           */
/* documentation are those of the authors and should not be          */
/* interpreted as representing official policies, either expressed   */
/* or implied, of The University of Texas at Austin.                 */
/*********************************************************************/
| 
 | |
| #include <stdio.h>
 | |
| #include "common.h"
 | |
| 
 | |
| int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){
 | |
| 
 | |
| 	BLASLONG i, js;
 | |
| 	BLASLONG X, mm;
 | |
| 
 | |
| 	FLOAT data01, data02, data03, data04, data05, data06;
 | |
| 	FLOAT data07, data08, data09, data10, data11, data12;
 | |
| 	FLOAT data13, data14, data15, data16, data17, data18;
 | |
| 	FLOAT data19, data20, data21, data22, data23, data24;
 | |
| 	FLOAT data25, data26, data27, data28, data29, data30;
 | |
| 	FLOAT data31, data32, data33, data34, data35, data36;
 | |
| 
 | |
| 	FLOAT *ao1, *ao2, *ao3, *ao4, *ao5, *ao6;
 | |
| 
 | |
| 	//js = (n >> 2);
 | |
| 	js = n/6;
 | |
| 	if (js > 0){
 | |
| 		do {
 | |
| 			X = posX;
 | |
| 
 | |
| 			if (posX <= posY) {
 | |
| 				ao1 = a + posX + (posY + 0) * lda;
 | |
| 				ao2 = a + posX + (posY + 1) * lda;
 | |
| 				ao3 = a + posX + (posY + 2) * lda;
 | |
| 				ao4 = a + posX + (posY + 3) * lda;
 | |
| 				ao5 = a + posX + (posY + 4) * lda;
 | |
| 				ao6 = a + posX + (posY + 5) * lda;
 | |
| 			} else {
 | |
| 				ao1 = a + posY + (posX + 0) * lda;
 | |
| 				ao2 = a + posY + (posX + 1) * lda;
 | |
| 				ao3 = a + posY + (posX + 2) * lda;
 | |
| 				ao4 = a + posY + (posX + 3) * lda;
 | |
| 				ao5 = a + posY + (posX + 4) * lda;
 | |
| 				ao6 = a + posY + (posX + 5) * lda;
 | |
| 			}
 | |
| 
 | |
| 			i = m/6;
 | |
| 			if (i > 0) {
 | |
| 				do {
 | |
| 					if (X < posY) {
 | |
| 						data01 = *(ao1 + 0);
 | |
| 						data02 = *(ao1 + 1);
 | |
| 						data03 = *(ao1 + 2);
 | |
| 						data04 = *(ao1 + 3);
 | |
| 						data05 = *(ao1 + 4);
 | |
| 						data06 = *(ao1 + 5);
 | |
| 
 | |
| 						data07 = *(ao2 + 0);
 | |
| 						data08 = *(ao2 + 1);
 | |
| 						data09 = *(ao2 + 2);
 | |
| 						data10 = *(ao2 + 3);
 | |
| 						data11 = *(ao2 + 4);
 | |
| 						data12 = *(ao2 + 5);
 | |
| 
 | |
| 						data13 = *(ao3 + 0);
 | |
| 						data14 = *(ao3 + 1);
 | |
| 						data15 = *(ao3 + 2);
 | |
| 						data16 = *(ao3 + 3);
 | |
| 						data17 = *(ao3 + 4);
 | |
| 						data18 = *(ao3 + 5);
 | |
| 
 | |
| 						data19 = *(ao4 + 0);
 | |
| 						data20 = *(ao4 + 1);
 | |
| 						data21 = *(ao4 + 2);
 | |
| 						data22 = *(ao4 + 3);
 | |
| 						data23 = *(ao4 + 4);
 | |
| 						data24 = *(ao4 + 5);
 | |
| 
 | |
| 						data25 = *(ao5 + 0);
 | |
| 						data26 = *(ao5 + 1);
 | |
| 						data27 = *(ao5 + 2);
 | |
| 						data28 = *(ao5 + 3);
 | |
| 						data29 = *(ao5 + 4);
 | |
| 						data30 = *(ao5 + 5);
 | |
| 
 | |
| 						data31 = *(ao6 + 0);
 | |
| 						data32 = *(ao6 + 1);
 | |
| 						data33 = *(ao6 + 2);
 | |
| 						data34 = *(ao6 + 3);
 | |
| 						data35 = *(ao6 + 4);
 | |
| 						data36 = *(ao6 + 5);
 | |
| 
 | |
| 						b[ 0] = data01;
 | |
| 						b[ 1] = data07;
 | |
| 						b[ 2] = data13;
 | |
| 						b[ 3] = data19;
 | |
| 						b[ 4] = data25;
 | |
| 						b[ 5] = data31;
 | |
| 
 | |
| 						b[ 6] = data02;
 | |
| 						b[ 7] = data08;
 | |
| 						b[ 8] = data14;
 | |
| 						b[ 9] = data20;
 | |
| 						b[10] = data26;
 | |
| 						b[11] = data32;
 | |
| 
 | |
| 						b[12] = data03;
 | |
| 						b[13] = data09;
 | |
| 						b[14] = data15;
 | |
| 						b[15] = data21;
 | |
| 						b[16] = data27;
 | |
| 						b[17] = data33;
 | |
| 
 | |
| 						b[18] = data04;
 | |
| 						b[19] = data10;
 | |
| 						b[20] = data16;
 | |
| 						b[21] = data22;
 | |
| 						b[22] = data28;
 | |
| 						b[23] = data34;
 | |
| 
 | |
| 						b[24] = data05;
 | |
| 						b[25] = data11;
 | |
| 						b[26] = data17;
 | |
| 						b[27] = data23;
 | |
| 						b[28] = data29;
 | |
| 						b[29] = data35;
 | |
| 
 | |
| 						b[30] = data06;
 | |
| 						b[31] = data12;
 | |
| 						b[32] = data18;
 | |
| 						b[33] = data24;
 | |
| 						b[34] = data30;
 | |
| 						b[35] = data36;
 | |
| 
 | |
| 						ao1 += 6;
 | |
| 						ao2 += 6;
 | |
| 						ao3 += 6;
 | |
| 						ao4 += 6;
 | |
| 						ao5 += 6;
 | |
| 						ao6 += 6;
 | |
| 						b += 36;
 | |
| 					} else
 | |
| 						if (X > posY) {
 | |
| 							b[ 0] = ZERO;
 | |
| 							b[ 1] = ZERO;
 | |
| 							b[ 2] = ZERO;
 | |
| 							b[ 3] = ZERO;
 | |
| 							b[ 4] = ZERO;
 | |
| 							b[ 5] = ZERO;
 | |
| 							b[ 6] = ZERO;
 | |
| 							b[ 7] = ZERO;
 | |
| 							b[ 8] = ZERO;
 | |
| 							b[ 9] = ZERO;
 | |
| 							b[10] = ZERO;
 | |
| 							b[11] = ZERO;
 | |
| 							b[12] = ZERO;
 | |
| 							b[13] = ZERO;
 | |
| 							b[14] = ZERO;
 | |
| 							b[15] = ZERO;
 | |
| 							b[16] = ZERO;
 | |
| 							b[17] = ZERO;
 | |
| 							b[18] = ZERO;
 | |
| 							b[19] = ZERO;
 | |
| 							b[20] = ZERO;
 | |
| 							b[21] = ZERO;
 | |
| 							b[22] = ZERO;
 | |
| 							b[23] = ZERO;
 | |
| 							b[24] = ZERO;
 | |
| 							b[25] = ZERO;
 | |
| 							b[26] = ZERO;
 | |
| 							b[27] = ZERO;
 | |
| 							b[28] = ZERO;
 | |
| 							b[29] = ZERO;
 | |
| 							b[30] = ZERO;
 | |
| 							b[31] = ZERO;
 | |
| 							b[32] = ZERO;
 | |
| 							b[33] = ZERO;
 | |
| 							b[34] = ZERO;
 | |
| 							b[35] = ZERO;
 | |
| 
 | |
| 							ao1 += 6 * lda;
 | |
| 							ao2 += 6 * lda;
 | |
| 							ao3 += 6 * lda;
 | |
| 							ao4 += 6 * lda;
 | |
| 							ao5 += 6 * lda;
 | |
| 							ao6 += 6 * lda;
 | |
| 
 | |
| 							b   += 36;
 | |
| 						} else {
 | |
| 							data01 = *(ao1 + 0);
 | |
| 							data07 = *(ao2 + 0);
 | |
| 							data13 = *(ao3 + 0);
 | |
| 							data19 = *(ao4 + 0);
 | |
| 							data25 = *(ao5 + 0);
 | |
| 							data31 = *(ao6 + 0);
 | |
| 
 | |
| 							data08 = *(ao2 + 1);
 | |
| 							data14 = *(ao3 + 1);
 | |
| 							data20 = *(ao4 + 1);
 | |
| 							data26 = *(ao5 + 1);
 | |
| 							data32 = *(ao6 + 1);
 | |
| 
 | |
| 							data15 = *(ao3 + 2);
 | |
| 							data21 = *(ao4 + 2);
 | |
| 							data27 = *(ao5 + 2);
 | |
| 							data33 = *(ao6 + 2);
 | |
| 
 | |
| 							data22 = *(ao4 + 3);
 | |
| 							data28 = *(ao5 + 3);
 | |
| 							data34 = *(ao6 + 3);
 | |
| 
 | |
| 							data29 = *(ao5 + 4);
 | |
| 							data35 = *(ao6 + 4);
 | |
| 
 | |
| 							data36 = *(ao6 + 5);
 | |
| 
 | |
| #ifdef UNIT
 | |
| 							b[ 0] = ONE;
 | |
| 							b[ 1] = data07;
 | |
| 							b[ 2] = data13;
 | |
| 							b[ 3] = data19;
 | |
| 							b[ 4] = data25;
 | |
| 							b[ 5] = data31;
 | |
| 
 | |
| 							b[ 6] = ZERO;
 | |
| 							b[ 7] = ONE;
 | |
| 							b[ 8] = data14;
 | |
| 							b[ 9] = data20;
 | |
| 							b[10] = data26;
 | |
| 							b[11] = data32;
 | |
| 
 | |
| 							b[12] = ZERO;
 | |
| 							b[13] = ZERO;
 | |
| 							b[14] = ONE;
 | |
| 							b[15] = data21;
 | |
| 							b[16] = data27;
 | |
| 							b[17] = data33;
 | |
| 
 | |
| 							b[18] = ZERO;
 | |
| 							b[19] = ZERO;
 | |
| 							b[20] = ZERO;
 | |
| 							b[21] = ONE;
 | |
| 							b[22] = data28;
 | |
| 							b[23] = data34;
 | |
| 
 | |
| 							b[24] = ZERO;
 | |
| 							b[25] = ZERO;
 | |
| 							b[26] = ZERO;
 | |
| 							b[27] = ZERO;
 | |
| 							b[28] = ONE;
 | |
| 							b[29] = data35;
 | |
| 
 | |
| 							b[30] = ZERO;
 | |
| 							b[31] = ZERO;
 | |
| 							b[32] = ZERO;
 | |
| 							b[33] = ZERO;
 | |
| 							b[34] = ZERO;
 | |
| 							b[35] = ONE;
 | |
| #else
 | |
| 							b[ 0] = data01;
 | |
| 							b[ 1] = data07;
 | |
| 							b[ 2] = data13;
 | |
| 							b[ 3] = data19;
 | |
| 							b[ 4] = data25;
 | |
| 							b[ 5] = data31;
 | |
| 
 | |
| 							b[ 6] = ZERO;
 | |
| 							b[ 7] = data08;
 | |
| 							b[ 8] = data14;
 | |
| 							b[ 9] = data20;
 | |
| 							b[10] = data26;
 | |
| 							b[11] = data32;
 | |
| 
 | |
| 							b[12] = ZERO;
 | |
| 							b[13] = ZERO;
 | |
| 							b[14] = data15;
 | |
| 							b[15] = data21;
 | |
| 							b[16] = data27;
 | |
| 							b[17] = data33;
 | |
| 
 | |
| 							b[18] = ZERO;
 | |
| 							b[19] = ZERO;
 | |
| 							b[20] = ZERO;
 | |
| 							b[21] = data22;
 | |
| 							b[22] = data28;
 | |
| 							b[23] = data34;
 | |
| 
 | |
| 							b[24] = ZERO;
 | |
| 							b[25] = ZERO;
 | |
| 							b[26] = ZERO;
 | |
| 							b[27] = ZERO;
 | |
| 							b[28] = data29;
 | |
| 							b[29] = data35;
 | |
| 
 | |
| 							b[30] = ZERO;
 | |
| 							b[31] = ZERO;
 | |
| 							b[32] = ZERO;
 | |
| 							b[33] = ZERO;
 | |
| 							b[34] = ZERO;
 | |
| 							b[35] = data36;
 | |
| #endif
 | |
| 
 | |
| 							ao1 += 6;
 | |
| 							ao2 += 6;
 | |
| 							ao3 += 6;
 | |
| 							ao4 += 6;
 | |
| 							ao5 += 6;
 | |
| 							ao6 += 7;
 | |
| 
 | |
| 							b += 36;
 | |
| 						}
 | |
| 					X += 6;
 | |
| 					i --;
 | |
| 				} while (i > 0);
 | |
| 			}
 | |
| 			mm = m - m/6;
 | |
| 			if (mm & 4) {
 | |
| 				if (X < posY) {
 | |
| 					data01 = *(ao1 + 0);
 | |
| 					data02 = *(ao1 + 1);
 | |
| 					data03 = *(ao1 + 2);
 | |
| 					data04 = *(ao1 + 3);
 | |
| 
 | |
| 					data05 = *(ao2 + 0);
 | |
| 					data06 = *(ao2 + 1);
 | |
| 					data07 = *(ao2 + 2);
 | |
| 					data08 = *(ao2 + 3);
 | |
| 
 | |
| 					data09 = *(ao3 + 0);
 | |
| 					data10 = *(ao3 + 1);
 | |
| 					data11 = *(ao3 + 2);
 | |
| 					data12 = *(ao3 + 3);
 | |
| 
 | |
| 					data13 = *(ao4 + 0);
 | |
| 					data14 = *(ao4 + 1);
 | |
| 					data15 = *(ao4 + 2);
 | |
| 					data16 = *(ao4 + 3);
 | |
| 
 | |
| 					b[ 0] = data01;
 | |
| 					b[ 1] = data05;
 | |
| 					b[ 2] = data09;
 | |
| 					b[ 3] = data13;
 | |
| 					b[ 4] = data02;
 | |
| 					b[ 5] = data06;
 | |
| 					b[ 6] = data10;
 | |
| 					b[ 7] = data14;
 | |
| 
 | |
| 					b[ 8] = data03;
 | |
| 					b[ 9] = data07;
 | |
| 					b[10] = data11;
 | |
| 					b[11] = data15;
 | |
| 					b[12] = data04;
 | |
| 					b[13] = data08;
 | |
| 					b[14] = data12;
 | |
| 					b[15] = data16;
 | |
| 
 | |
| 					ao1 += 4;
 | |
| 					ao2 += 4;
 | |
| 					ao3 += 4;
 | |
| 					ao4 += 4;
 | |
| 					b += 16;
 | |
| 				} else
 | |
| 					if (X > posY) {
 | |
| 						b[ 0] = ZERO;
 | |
| 						b[ 1] = ZERO;
 | |
| 						b[ 2] = ZERO;
 | |
| 						b[ 3] = ZERO;
 | |
| 						b[ 4] = ZERO;
 | |
| 						b[ 5] = ZERO;
 | |
| 						b[ 6] = ZERO;
 | |
| 						b[ 7] = ZERO;
 | |
| 						b[ 8] = ZERO;
 | |
| 						b[ 9] = ZERO;
 | |
| 						b[10] = ZERO;
 | |
| 						b[11] = ZERO;
 | |
| 						b[12] = ZERO;
 | |
| 						b[13] = ZERO;
 | |
| 						b[14] = ZERO;
 | |
| 						b[15] = ZERO;
 | |
| 						b[16] = ZERO;
 | |
| 						b[17] = ZERO;
 | |
| 						b[18] = ZERO;
 | |
| 						b[19] = ZERO;
 | |
| 						b[20] = ZERO;
 | |
| 						b[21] = ZERO;
 | |
| 						b[22] = ZERO;
 | |
| 						b[23] = ZERO;
 | |
| 
 | |
| 						ao1 += 4 * lda;
 | |
| 						ao2 += 4 * lda;
 | |
| 						ao3 += 4 * lda;
 | |
| 						ao4 += 4 * lda;
 | |
| 
 | |
| 						b   += 16;
 | |
| 					} else {
 | |
| #ifdef UNIT
 | |
| 						data05 = *(ao2 + 0);
 | |
| 
 | |
| 						data09 = *(ao3 + 0);
 | |
| 						data10 = *(ao3 + 1);
 | |
| 
 | |
| 						data13 = *(ao4 + 0);
 | |
| 						data14 = *(ao4 + 1);
 | |
| 						data15 = *(ao4 + 2);
 | |
| 
 | |
| 						b[ 0] = ONE;
 | |
| 						b[ 1] = data05;
 | |
| 						b[ 2] = data09;
 | |
| 						b[ 3] = data13;
 | |
| 
 | |
| 						b[ 4] = ZERO;
 | |
| 						b[ 5] = ONE;
 | |
| 						b[ 6] = data10;
 | |
| 						b[ 7] = data14;
 | |
| 
 | |
| 						b[ 8] = ZERO;
 | |
| 						b[ 9] = ZERO;
 | |
| 						b[10] = ONE;
 | |
| 						b[11] = data15;
 | |
| 
 | |
| 						b[12] = ZERO;
 | |
| 						b[13] = ZERO;
 | |
| 						b[14] = ZERO;
 | |
| 						b[15] = ONE;
 | |
| #else
 | |
| 						data01 = *(ao1 + 0);
 | |
| 
 | |
| 						data05 = *(ao2 + 0);
 | |
| 						data06 = *(ao2 + 1);
 | |
| 
 | |
| 						data09 = *(ao3 + 0);
 | |
| 						data10 = *(ao3 + 1);
 | |
| 						data11 = *(ao3 + 2);
 | |
| 
 | |
| 						data13 = *(ao4 + 0);
 | |
| 						data14 = *(ao4 + 1);
 | |
| 						data15 = *(ao4 + 2);
 | |
| 						data16 = *(ao4 + 3);
 | |
| 
 | |
| 						b[ 0] = data01;
 | |
| 						b[ 1] = data05;
 | |
| 						b[ 2] = data09;
 | |
| 						b[ 3] = data13;
 | |
| 
 | |
| 						b[ 4] = ZERO;
 | |
| 						b[ 5] = data06;
 | |
| 						b[ 6] = data10;
 | |
| 						b[ 7] = data14;
 | |
| 
 | |
| 						b[ 8] = ZERO;
 | |
| 						b[ 9] = ZERO;
 | |
| 						b[10] = data11;
 | |
| 						b[11] = data15;
 | |
| 
 | |
| 						b[12] = ZERO;
 | |
| 						b[13] = ZERO;
 | |
| 						b[14] = ZERO;
 | |
| 						b[15] = data16;
 | |
| #endif
 | |
| 						ao1 += 4;
 | |
| 						ao2 += 4;
 | |
| 						ao3 += 4;
 | |
| 						ao4 += 4;
 | |
| 
 | |
| 						b += 16;
 | |
| 					}
 | |
| 				X += 4;
 | |
| 			}
 | |
| 
 | |
| 			if (mm & 3) {
 | |
| 				if (X < posY) {
 | |
| 					if (mm & 2) {
 | |
| 						data01 = *(ao1 + 0);
 | |
| 						data02 = *(ao1 + 1);
 | |
| 						data03 = *(ao2 + 0);
 | |
| 						data04 = *(ao2 + 1);
 | |
| 						data05 = *(ao3 + 0);
 | |
| 						data06 = *(ao3 + 1);
 | |
| 						data07 = *(ao4 + 0);
 | |
| 						data08 = *(ao4 + 1);
 | |
| 
 | |
| 						b[ 0] = data01;
 | |
| 						b[ 1] = data03;
 | |
| 						b[ 2] = data05;
 | |
| 						b[ 3] = data07;
 | |
| 						b[ 4] = data02;
 | |
| 						b[ 5] = data04;
 | |
| 						b[ 6] = data06;
 | |
| 						b[ 7] = data08;
 | |
| 
 | |
| 						ao1 += 2;
 | |
| 						ao2 += 2;
 | |
| 						ao3 += 2;
 | |
| 						ao4 += 2;
 | |
| 						b += 8;
 | |
| 					}
 | |
| 
 | |
| 					if (mm & 1) {
 | |
| 						data01 = *(ao1 + 0);
 | |
| 						data03 = *(ao2 + 0);
 | |
| 						data05 = *(ao3 + 0);
 | |
| 						data07 = *(ao4 + 0);
 | |
| 
 | |
| 						b[ 0] = data01;
 | |
| 						b[ 1] = data03;
 | |
| 						b[ 2] = data05;
 | |
| 						b[ 3] = data07;
 | |
| 
 | |
| 						ao1 += 1;
 | |
| 						ao2 += 1;
 | |
| 						ao3 += 1;
 | |
| 						ao4 += 1;
 | |
| 						b += 4;
 | |
| 					}
 | |
| 
 | |
| 				} else
 | |
| 					if (X > posY) {
 | |
| 						if (m & 2) {
 | |
| 							ao1 += 2 * lda;
 | |
| 							ao2 += 2 * lda;
 | |
| 							b   += 8;
 | |
| 						}
 | |
| 
 | |
| 						if (m & 1) {
 | |
| 							ao1 += lda;
 | |
| 							b += 4;
 | |
| 						}
 | |
| 
 | |
| 					} else {
 | |
| #ifdef UNIT
 | |
| 						data05 = *(ao2 + 0);
 | |
| 						data09 = *(ao3 + 0);
 | |
| 						data13 = *(ao4 + 0);
 | |
| 
 | |
| 						if (i >= 2) {
 | |
| 							data10 = *(ao3 + 1);
 | |
| 							data14 = *(ao4 + 1);
 | |
| 						}
 | |
| 
 | |
| 						if (i >= 3) {
 | |
| 							data15 = *(ao4 + 2);
 | |
| 						}
 | |
| 
 | |
| 						b[ 0] = ONE;
 | |
| 						b[ 1] = data05;
 | |
| 						b[ 2] = data09;
 | |
| 						b[ 3] = data13;
 | |
| 						b += 4;
 | |
| 
 | |
| 						if(i >= 2) {
 | |
| 							b[ 0] = ZERO;
 | |
| 							b[ 1] = ONE;
 | |
| 							b[ 2] = data10;
 | |
| 							b[ 3] = data14;
 | |
| 							b += 4;
 | |
| 						}
 | |
| 
 | |
| 						if (i >= 3) {
 | |
| 							b[ 0] = ZERO;
 | |
| 							b[ 1] = ZERO;
 | |
| 							b[ 2] = ONE;
 | |
| 							b[ 3] = data15;
 | |
| 							b += 4;
 | |
| 						}
 | |
| #else
 | |
| 						data01 = *(ao1 + 0);
 | |
| 						data05 = *(ao2 + 0);
 | |
| 						data09 = *(ao3 + 0);
 | |
| 						data13 = *(ao4 + 0);
 | |
| 
 | |
| 						if (i >= 2) {
 | |
| 							data06 = *(ao2 + 1);
 | |
| 							data10 = *(ao3 + 1);
 | |
| 							data14 = *(ao4 + 1);
 | |
| 						}
 | |
| 
 | |
| 						if (i >= 3) {
 | |
| 							data11 = *(ao3 + 2);
 | |
| 							data15 = *(ao4 + 2);
 | |
| 						}
 | |
| 
 | |
| 						b[ 0] = data01;
 | |
| 						b[ 1] = data05;
 | |
| 						b[ 2] = data09;
 | |
| 						b[ 3] = data13;
 | |
| 						b += 4;
 | |
| 
 | |
| 						if(i >= 2) {
 | |
| 							b[ 0] = ZERO;
 | |
| 							b[ 1] = data06;
 | |
| 							b[ 2] = data10;
 | |
| 							b[ 3] = data14;
 | |
| 							b += 4;
 | |
| 						}
 | |
| 
 | |
| 						if (i >= 3) {
 | |
| 							b[ 0] = ZERO;
 | |
| 							b[ 1] = ZERO;
 | |
| 							b[ 2] = data11;
 | |
| 							b[ 3] = data15;
 | |
| 							b += 4;
 | |
| 						}
 | |
| #endif
 | |
| 					}
 | |
| 			}
 | |
| 
 | |
| 			posY += 4;
 | |
| 			js --;
 | |
| 		} while (js > 0);
 | |
| 	} /* End of main loop */
 | |
| 
 | |
| 	if (n & 2){
 | |
| 		X = posX;
 | |
| 
 | |
| 		if (posX <= posY) {
 | |
| 			ao1 = a + posX + (posY + 0) * lda;
 | |
| 			ao2 = a + posX + (posY + 1) * lda;
 | |
| 		} else {
 | |
| 			ao1 = a + posY + (posX + 0) * lda;
 | |
| 			ao2 = a + posY + (posX + 1) * lda;
 | |
| 		}
 | |
| 
 | |
| 		i = (m >> 1);
 | |
| 		if (i > 0) {
 | |
| 			do {
 | |
| 				if (X < posY) {
 | |
| 					data01 = *(ao1 + 0);
 | |
| 					data02 = *(ao1 + 1);
 | |
| 					data05 = *(ao2 + 0);
 | |
| 					data06 = *(ao2 + 1);
 | |
| 
 | |
| 					b[ 0] = data01;
 | |
| 					b[ 1] = data05;
 | |
| 					b[ 2] = data02;
 | |
| 					b[ 3] = data06;
 | |
| 
 | |
| 					ao1 += 2;
 | |
| 					ao2 += 2;
 | |
| 					b += 4;
 | |
| 
 | |
| 				} else
 | |
| 					if (X > posY) {
 | |
| 						ao1 += 2 * lda;
 | |
| 						ao2 += 2 * lda;
 | |
| 						b += 4;
 | |
| 
 | |
| 					} else {
 | |
| #ifdef UNIT
 | |
| 						data05 = *(ao2 + 0);
 | |
| 
 | |
| 						b[ 0] = ONE;
 | |
| 						b[ 1] = data05;
 | |
| 						b[ 2] = ZERO;
 | |
| 						b[ 3] = ONE;
 | |
| #else
 | |
| 						data01 = *(ao1 + 0);
 | |
| 						data05 = *(ao2 + 0);
 | |
| 						data06 = *(ao2 + 1);
 | |
| 
 | |
| 						b[ 0] = data01;
 | |
| 						b[ 1] = data05;
 | |
| 						b[ 2] = ZERO;
 | |
| 						b[ 3] = data06;
 | |
| #endif
 | |
| 
 | |
| 						ao1 += 2 * lda;
 | |
| 						ao2 += 2 * lda;
 | |
| 
 | |
| 						b += 4;
 | |
| 					}
 | |
| 
 | |
| 				X += 2;
 | |
| 				i --;
 | |
| 			} while (i > 0);
 | |
| 		}
 | |
| 
 | |
| 		i = (m & 1);
 | |
| 		if (i) {
 | |
| 
 | |
| 			if (X < posY) {
 | |
| 				data01 = *(ao1 + 0);
 | |
| 				data05 = *(ao2 + 0);
 | |
| 
 | |
| 				b[ 0] = data01;
 | |
| 				b[ 1] = data05;
 | |
| 				ao1 += 1;
 | |
| 				ao2 += 1;
 | |
| 				b += 2;
 | |
| 			} else
 | |
| 				if (X > posY) {
 | |
| 					ao1 += lda;
 | |
| 					ao2 += lda;
 | |
| 					b += 2;
 | |
| 				} else {
 | |
| #ifdef UNIT
 | |
| 					data05 = *(ao2 + 0);
 | |
| 					b[ 0] = ONE;
 | |
| 					b[ 1] = data05;
 | |
| #else
 | |
| 					data01 = *(ao1 + 0);
 | |
| 					data05 = *(ao2 + 0);
 | |
| 
 | |
| 					b[ 0] = data01;
 | |
| 					b[ 1] = data05;
 | |
| #endif
 | |
| 					ao1 += lda;
 | |
| 					ao2 += lda;
 | |
| 					b += 2;
 | |
| 				}
 | |
| 		}
 | |
| 
 | |
| 		posY += 2;
 | |
| 	}
 | |
| 
 | |
| 	if (n & 1){
 | |
| 		X = posX;
 | |
| 
 | |
| 		if (posX <= posY) {
 | |
| 			ao1 = a + posX + (posY + 0) * lda;
 | |
| 		} else {
 | |
| 			ao1 = a + posY + (posX + 0) * lda;
 | |
| 		}
 | |
| 
 | |
| 		i = m;
 | |
| 		if (m > 0) {
 | |
| 			do {
 | |
| 				if (X < posY) {
 | |
| 					data01 = *(ao1 + 0);
 | |
| 					b[ 0] = data01;
 | |
| 					ao1 += 1;
 | |
| 					b += 1;
 | |
| 				} else
 | |
| 					if (X > posY)  {
 | |
| 						ao1 += lda;
 | |
| 						b += 1;
 | |
| 					} else {
 | |
| #ifdef UNIT
 | |
| 						b[ 0] = ONE;
 | |
| #else
 | |
| 						data01 = *(ao1 + 0);
 | |
| 						b[ 0] = data01;
 | |
| #endif
 | |
| 						ao1 += lda;
 | |
| 						b += 1;
 | |
| 					}
 | |
| 
 | |
| 				X += 1;
 | |
| 				i --;
 | |
| 			} while (i > 0);
 | |
| 		}
 | |
| 	}
 | |
| 
 | |
| 	return 0;
 | |
| }
 |