Merge branch 'loongson3a' into release-0.1.0

This commit is contained in:
Xianyi Zhang
2012-03-23 01:26:27 +08:00
24 changed files with 15298 additions and 32 deletions

View File

@@ -123,15 +123,37 @@ ifndef DTRSMKERNEL_RT
DTRSMKERNEL_RT = trsm_kernel_RT.S
endif
# Default complex single-precision TRSM kernel sources.
# Every variable has its own guard, so a value that was already chosen
# elsewhere is kept. The LN guard must not assign LT/RN/RT as a side effect,
# otherwise an explicitly selected LT/RN/RT kernel would be overwritten
# whenever LN happens to be unset.
ifndef CTRSMKERNEL_LN
CTRSMKERNEL_LN = ztrsm_kernel_LT.S
endif
ifndef CTRSMKERNEL_LT
CTRSMKERNEL_LT = ztrsm_kernel_LT.S
endif
ifndef CTRSMKERNEL_RN
CTRSMKERNEL_RN = ztrsm_kernel_LT.S
endif
ifndef CTRSMKERNEL_RT
CTRSMKERNEL_RT = ztrsm_kernel_RT.S
endif
# Default complex double-precision TRSM kernel sources, guarded the same way.
ifndef ZTRSMKERNEL_LN
ZTRSMKERNEL_LN = ztrsm_kernel_LT.S
endif
ifndef ZTRSMKERNEL_LT
ZTRSMKERNEL_LT = ztrsm_kernel_LT.S
endif
ifndef ZTRSMKERNEL_RN
ZTRSMKERNEL_RN = ztrsm_kernel_LT.S
endif
ifndef ZTRSMKERNEL_RT
ZTRSMKERNEL_RT = ztrsm_kernel_RT.S
endif
# GEMM3M kernel sources are assigned unconditionally.
CGEMM3MKERNEL = zgemm3m_kernel.S
ZGEMM3MKERNEL = zgemm3m_kernel.S

View File

@@ -1,18 +1,48 @@
# AXPY kernels.
SAXPYKERNEL=axpy_loongson3a.S
DAXPYKERNEL=daxpy_loongson3a_simd.S
# GEMV kernels: the single/double variables name the same C source, and so do
# the complex single/double variables.
SGEMVNKERNEL = gemv_n_loongson3a.c
SGEMVTKERNEL = gemv_t_loongson3a.c
DGEMVNKERNEL = gemv_n_loongson3a.c
DGEMVTKERNEL = gemv_t_loongson3a.c
CGEMVNKERNEL = zgemv_n_loongson3a.c
CGEMVTKERNEL = zgemv_t_loongson3a.c
ZGEMVNKERNEL = zgemv_n_loongson3a.c
ZGEMVTKERNEL = zgemv_t_loongson3a.c
# Single-precision GEMM kernel with its copy routines and object names.
# Each variable is assigned exactly once; earlier assignments that were
# overridden further down have been dropped because the last one wins.
SGEMMKERNEL = sgemm_kernel_8x4_ps.S
SGEMMINCOPY = ../generic/gemm_ncopy_8.c
SGEMMITCOPY = ../generic/gemm_tcopy_8.c
SGEMMONCOPY = ../generic/gemm_ncopy_4.c
SGEMMOTCOPY = ../generic/gemm_tcopy_4.c
SGEMMINCOPYOBJ = sgemm_incopy.o
SGEMMITCOPYOBJ = sgemm_itcopy.o
SGEMMONCOPYOBJ = sgemm_oncopy.o
SGEMMOTCOPYOBJ = sgemm_otcopy.o
# Double-precision GEMM.
DGEMMKERNEL = dgemm_kernel_loongson3a_4x4.S
DGEMMONCOPY = ../generic/gemm_ncopy_4.c
DGEMMOTCOPY = ../generic/gemm_tcopy_4.c
DGEMMONCOPYOBJ = dgemm_oncopy.o
DGEMMOTCOPYOBJ = dgemm_otcopy.o
# Complex single-precision GEMM.
CGEMMKERNEL = cgemm_kernel_loongson3a_4x2_ps.S
CGEMMINCOPY = ../generic/zgemm_ncopy_4.c
CGEMMITCOPY = ../generic/zgemm_tcopy_4.c
CGEMMONCOPY = ../generic/zgemm_ncopy_2.c
CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
CGEMMINCOPYOBJ = cgemm_incopy.o
CGEMMITCOPYOBJ = cgemm_itcopy.o
CGEMMONCOPYOBJ = cgemm_oncopy.o
CGEMMOTCOPYOBJ = cgemm_otcopy.o
# Complex double-precision GEMM.
ZGEMMKERNEL = zgemm_kernel_loongson3a_2x2.S
ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c
ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
ZGEMMONCOPYOBJ = zgemm_oncopy.o
ZGEMMOTCOPYOBJ = zgemm_otcopy.o
# Single-precision TRSM uses the generic C implementations.
STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
@@ -22,3 +52,17 @@ DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
# Complex single-precision TRSM: all four variants point at the generic C sources.
CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
# Complex double-precision TRSM: same generic C sources as above.
# NOTE(review): presumably the generic sources handle the complex case internally — confirm.
ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,101 @@
#include "common.h"
// Auto-tuned code for the Loongson-3A platform.
// Alternative prefetch definitions, left disabled:
//#define prefetch(x) __builtin_prefetch(x)
//#define prefetch(x) do {_mm_prefetch((char *)(x), _MM_HINT_T0);} while(0)
// Active prefetch: emits an "ld" of the memory operand x with register $0 as destination.
#define prefetch(x) __asm__ __volatile__("ld $0, %0"::"m"(x))
// Branch hints: the condition is expected to be true (likely) or false (unlikely).
#define likely(x) __builtin_expect(!!(x), 1)
#define unlikely(x) __builtin_expect(!!(x), 0)
// Inner-loop bodies. They expand in place and rely on Y, A, X, LDA, ALPHA, INCY,
// i, j, k and (norm_* only) h being in scope where they are used; each advances i by one.
//   spec_*   : Y is indexed directly by i.
//   norm_*   : Y is indexed by h, and h is advanced by INCY.
//   *_alpha1 : the ALPHA factor is left out of the product.
#define spec_loop_alpha1 do {Y[i] += A[LDA * j + i] * X[k]; i++;} while(0)
#define spec_loop do {Y[i] += ALPHA * A[LDA * j + i] * X[k]; i++;} while(0)
#define norm_loop_alpha1 do {Y[h] += A[LDA * j + i] * X[k]; i++; h += INCY;} while(0)
#define norm_loop do {Y[h] += ALPHA * A[LDA * j + i] * X[k]; i++; h += INCY;} while(0)
/*
 * GEMV kernel, non-transposed form.
 *
 * For every column index c in [0, N) and every row index r in [0, M):
 *     Y[r * INCY] += ALPHA * A[LDA * c + r] * X[c * INCX]
 * Columns are processed one after another, rows in increasing order.
 *
 * Four loop nests exist: ALPHA == 1 skips the multiplication by ALPHA, and
 * INCY == 1 indexes Y directly by the row counter. The main row loop is
 * unrolled four times and issues prefetches 30 elements ahead; a tail loop
 * handles the remaining M % 4 rows.
 *
 * Returns 0 in all cases. When ALPHA is zero nothing is read or written.
 * UNUSED and BUFFER are not referenced.
 */
int CNAME(BLASLONG M, BLASLONG N, BLASLONG UNUSED, FLOAT ALPHA, FLOAT *A, BLASLONG LDA, FLOAT *X, BLASLONG INCX, FLOAT *Y, BLASLONG INCY, FLOAT *BUFFER)
{
    const BLASLONG ahead = 30;          /* prefetch distance, in elements */
    const BLASLONG bulk = M - M % 4;    /* rows handled by the unrolled loop */
    BLASLONG col, xpos;

    if (ALPHA == 0)
        return 0;

    if (ALPHA == 1) {
        if (INCY == 1) {
            for (col = 0, xpos = 0; likely(col < N); col++, xpos += INCX) {
                const BLASLONG base = LDA * col;
                BLASLONG row = 0;

                while (likely(row < bulk)) {
                    prefetch(A[base + row + ahead]);
                    prefetch(Y[row + ahead]);
                    Y[row]     += A[base + row]     * X[xpos];
                    Y[row + 1] += A[base + row + 1] * X[xpos];
                    Y[row + 2] += A[base + row + 2] * X[xpos];
                    Y[row + 3] += A[base + row + 3] * X[xpos];
                    row += 4;
                }
                for (; likely(row < M); row++)
                    Y[row] += A[base + row] * X[xpos];
            }
        } else {
            for (col = 0, xpos = 0; likely(col < N); col++, xpos += INCX) {
                const BLASLONG base = LDA * col;
                BLASLONG row = 0, ypos = 0;

                while (likely(row < bulk)) {
                    prefetch(A[base + row + ahead]);
                    prefetch(Y[ypos + ahead]);
                    Y[ypos] += A[base + row]     * X[xpos]; ypos += INCY;
                    Y[ypos] += A[base + row + 1] * X[xpos]; ypos += INCY;
                    Y[ypos] += A[base + row + 2] * X[xpos]; ypos += INCY;
                    Y[ypos] += A[base + row + 3] * X[xpos]; ypos += INCY;
                    row += 4;
                }
                for (; likely(row < M); row++, ypos += INCY)
                    Y[ypos] += A[base + row] * X[xpos];
            }
        }
    } else {
        if (INCY == 1) {
            for (col = 0, xpos = 0; likely(col < N); col++, xpos += INCX) {
                const BLASLONG base = LDA * col;
                BLASLONG row = 0;

                while (likely(row < bulk)) {
                    prefetch(A[base + row + ahead]);
                    prefetch(Y[row + ahead]);
                    Y[row]     += ALPHA * A[base + row]     * X[xpos];
                    Y[row + 1] += ALPHA * A[base + row + 1] * X[xpos];
                    Y[row + 2] += ALPHA * A[base + row + 2] * X[xpos];
                    Y[row + 3] += ALPHA * A[base + row + 3] * X[xpos];
                    row += 4;
                }
                for (; likely(row < M); row++)
                    Y[row] += ALPHA * A[base + row] * X[xpos];
            }
        } else {
            for (col = 0, xpos = 0; likely(col < N); col++, xpos += INCX) {
                const BLASLONG base = LDA * col;
                BLASLONG row = 0, ypos = 0;

                while (likely(row < bulk)) {
                    prefetch(A[base + row + ahead]);
                    prefetch(Y[ypos + ahead]);
                    Y[ypos] += ALPHA * A[base + row]     * X[xpos]; ypos += INCY;
                    Y[ypos] += ALPHA * A[base + row + 1] * X[xpos]; ypos += INCY;
                    Y[ypos] += ALPHA * A[base + row + 2] * X[xpos]; ypos += INCY;
                    Y[ypos] += ALPHA * A[base + row + 3] * X[xpos]; ypos += INCY;
                    row += 4;
                }
                for (; likely(row < M); row++, ypos += INCY)
                    Y[ypos] += ALPHA * A[base + row] * X[xpos];
            }
        }
    }
    return 0;
}

View File

@@ -0,0 +1,93 @@
#include "common.h"
// Auto-tuned code for the Loongson-3A platform.
// Alternative prefetch definitions, left disabled:
//#define prefetch(x) __builtin_prefetch(x)
//#define prefetch(x) do {_mm_prefetch((char *)(x), _MM_HINT_T0);} while(0)
// Active prefetch: emits an "ld" of the memory operand x with register $0 as destination.
#define prefetch(x) __asm__ __volatile__("ld $0, %0"::"m"(x))
// Branch hints: the condition is expected to be true (likely) or false (unlikely).
#define likely(x) __builtin_expect(!!(x), 1)
#define unlikely(x) __builtin_expect(!!(x), 0)
// Inner-loop bodies. They expand in place and rely on Y, A, X, LDA, ALPHA, INCX,
// i, j, k and (norm_* only) h being in scope where they are used; each accumulates
// one product into Y[k] and advances i by one.
//   spec_*   : X is indexed directly by i.
//   norm_*   : X is indexed by h, and h is advanced by INCX.
//   *_alpha1 : the ALPHA factor is left out of the product.
#define spec_loop_alpha1 do {Y[k] += A[LDA * j + i] * X[i]; i++;} while(0)
#define spec_loop do {Y[k] += ALPHA * A[LDA * j + i] * X[i]; i++;} while(0)
#define norm_loop_alpha1 do {Y[k] += A[LDA * j + i] * X[h]; i++; h += INCX;} while(0)
#define norm_loop do {Y[k] += ALPHA * A[LDA * j + i] * X[h]; i++; h += INCX;} while(0)
/*
 * GEMV kernel, transposed form.
 *
 * For every column index c in [0, N) and every row index r in [0, M):
 *     Y[c * INCY] += ALPHA * A[LDA * c + r] * X[r * INCX]
 * Each product is accumulated straight into Y, rows in increasing order.
 *
 * Four loop nests exist: ALPHA == 1 skips the multiplication by ALPHA, and
 * INCX == 1 indexes X directly by the row counter. The main row loop is
 * unrolled three times and issues prefetches 30 elements ahead; a tail loop
 * handles the remaining M % 3 rows.
 *
 * Returns 0 in all cases. When ALPHA is zero nothing is read or written.
 * UNUSED and BUFFER are not referenced.
 */
int CNAME(BLASLONG M, BLASLONG N, BLASLONG UNUSED, FLOAT ALPHA, FLOAT *A, BLASLONG LDA, FLOAT *X, BLASLONG INCX, FLOAT *Y, BLASLONG INCY, FLOAT *BUFFER) {
    const BLASLONG ahead = 30;          /* prefetch distance, in elements */
    const BLASLONG bulk = M - M % 3;    /* rows handled by the unrolled loop */
    BLASLONG col, ypos;

    if (ALPHA == 0)
        return 0;

    if (ALPHA == 1) {
        if (INCX == 1) {
            for (col = 0, ypos = 0; likely(col < N); col++, ypos += INCY) {
                const BLASLONG base = LDA * col;
                BLASLONG row = 0;

                while (likely(row < bulk)) {
                    prefetch(A[base + row + ahead]);
                    prefetch(X[row + ahead]);
                    Y[ypos] += A[base + row]     * X[row];
                    Y[ypos] += A[base + row + 1] * X[row + 1];
                    Y[ypos] += A[base + row + 2] * X[row + 2];
                    row += 3;
                }
                for (; likely(row < M); row++)
                    Y[ypos] += A[base + row] * X[row];
            }
        } else {
            for (col = 0, ypos = 0; likely(col < N); col++, ypos += INCY) {
                const BLASLONG base = LDA * col;
                BLASLONG row = 0, xpos = 0;

                while (likely(row < bulk)) {
                    prefetch(A[base + row + ahead]);
                    prefetch(X[xpos + ahead]);
                    Y[ypos] += A[base + row]     * X[xpos]; xpos += INCX;
                    Y[ypos] += A[base + row + 1] * X[xpos]; xpos += INCX;
                    Y[ypos] += A[base + row + 2] * X[xpos]; xpos += INCX;
                    row += 3;
                }
                for (; likely(row < M); row++, xpos += INCX)
                    Y[ypos] += A[base + row] * X[xpos];
            }
        }
    } else {
        if (INCX == 1) {
            for (col = 0, ypos = 0; likely(col < N); col++, ypos += INCY) {
                const BLASLONG base = LDA * col;
                BLASLONG row = 0;

                while (likely(row < bulk)) {
                    prefetch(A[base + row + ahead]);
                    prefetch(X[row + ahead]);
                    Y[ypos] += ALPHA * A[base + row]     * X[row];
                    Y[ypos] += ALPHA * A[base + row + 1] * X[row + 1];
                    Y[ypos] += ALPHA * A[base + row + 2] * X[row + 2];
                    row += 3;
                }
                for (; likely(row < M); row++)
                    Y[ypos] += ALPHA * A[base + row] * X[row];
            }
        } else {
            for (col = 0, ypos = 0; likely(col < N); col++, ypos += INCY) {
                const BLASLONG base = LDA * col;
                BLASLONG row = 0, xpos = 0;

                while (likely(row < bulk)) {
                    prefetch(A[base + row + ahead]);
                    prefetch(X[xpos + ahead]);
                    Y[ypos] += ALPHA * A[base + row]     * X[xpos]; xpos += INCX;
                    Y[ypos] += ALPHA * A[base + row + 1] * X[xpos]; xpos += INCX;
                    Y[ypos] += ALPHA * A[base + row + 2] * X[xpos]; xpos += INCX;
                    row += 3;
                }
                for (; likely(row < M); row++, xpos += INCX)
                    Y[ypos] += ALPHA * A[base + row] * X[xpos];
            }
        }
    }
    return 0;
}

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,139 @@
#include "common.h"
//typedef int BLASLONG;
//typedef double FLOAT;
// Prefetch: emits an "ld" of the memory operand x with register $0 as destination.
#define prefetch(x) __asm__ __volatile__("ld $0, %0"::"m"(x))
// Branch hints: the condition is expected to be true (likely) or false (unlikely).
#define likely(x) __builtin_expect(!!(x), 1)
#define unlikely(x) __builtin_expect(!!(x), 0)
// Select one of four variants of every loop body from the CONJ / XCONJ build flags.
// Judging by the signs in the bodies below, the variants compute:
//   _0 (neither flag): A * X
//   _1 (CONJ only)   : conj(A) * X
//   _2 (XCONJ only)  : A * conj(X)
//   _3 (both flags)  : conj(A) * conj(X)
#if !defined(CONJ) && !defined(XCONJ)
#define spec_loop_alpha1 spec_loop_alpha1_0
#define spec_loop spec_loop_0
#define norm_loop_alpha1 norm_loop_alpha1_0
#define norm_loop norm_loop_0
#endif
#if defined(CONJ) && !defined(XCONJ)
#define spec_loop_alpha1 spec_loop_alpha1_1
#define spec_loop spec_loop_1
#define norm_loop_alpha1 norm_loop_alpha1_1
#define norm_loop norm_loop_1
#endif
#if !defined(CONJ) && defined(XCONJ)
#define spec_loop_alpha1 spec_loop_alpha1_2
#define spec_loop spec_loop_2
#define norm_loop_alpha1 norm_loop_alpha1_2
#define norm_loop norm_loop_2
#endif
#if defined(CONJ) && defined(XCONJ)
#define spec_loop_alpha1 spec_loop_alpha1_3
#define spec_loop spec_loop_3
#define norm_loop_alpha1 norm_loop_alpha1_3
#define norm_loop norm_loop_3
#endif
// All bodies expand in place and rely on locals of the caller: Y, A, X, jj (offset of the
// current column in A), ii (offset of the current element inside the column), k (offset of
// the current X element) and, where used, iii, INCY, rALPHA, iALPHA, rTmp, iTmp.
// Element e is stored as the pair [e] (real part) and [e + 1] (imaginary part).
// spec_loop_alpha1_*: alpha omitted, Y indexed by ii; the product is added term by term.
#define spec_loop_alpha1_0 do {Y[ii] += A[jj + ii] * X[k]; Y[ii + 1] += A[jj + ii + 1] * X[k]; Y[ii + 1] += A[jj + ii] * X[k + 1]; Y[ii] -= A[jj + ii + 1] * X[k + 1]; ii += 2;} while(0)
#define spec_loop_alpha1_1 do {Y[ii] += A[jj + ii] * X[k]; Y[ii + 1] -= A[jj + ii + 1] * X[k]; Y[ii + 1] += A[jj + ii] * X[k + 1]; Y[ii] += A[jj + ii + 1] * X[k + 1]; ii += 2;} while(0)
#define spec_loop_alpha1_2 do {Y[ii] += A[jj + ii] * X[k]; Y[ii + 1] += A[jj + ii + 1] * X[k]; Y[ii + 1] -= A[jj + ii] * X[k + 1]; Y[ii] += A[jj + ii + 1] * X[k + 1]; ii += 2;} while(0)
#define spec_loop_alpha1_3 do {Y[ii] += A[jj + ii] * X[k]; Y[ii + 1] -= A[jj + ii + 1] * X[k]; Y[ii + 1] -= A[jj + ii] * X[k + 1]; Y[ii] -= A[jj + ii + 1] * X[k + 1]; ii += 2;} while(0)
// spec_loop_*: the product goes into (rTmp, iTmp) and is then multiplied by
// (rALPHA, iALPHA) before being added to Y, which is indexed by ii.
#define spec_loop_0 do {rTmp = A[jj + ii] * X[k] - A[jj + ii + 1] * X[k + 1]; iTmp = A[jj + ii] * X[k + 1] + A[jj + ii + 1] * X[k]; Y[ii] += rTmp * rALPHA - iTmp * iALPHA; Y[ii + 1] += rTmp * iALPHA + iTmp * rALPHA; ii += 2;} while(0)
#define spec_loop_1 do {rTmp = A[jj + ii] * X[k] + A[jj + ii + 1] * X[k + 1]; iTmp = A[jj + ii] * X[k + 1] - A[jj + ii + 1] * X[k]; Y[ii] += rTmp * rALPHA - iTmp * iALPHA; Y[ii + 1] += rTmp * iALPHA + iTmp * rALPHA; ii += 2;} while(0)
#define spec_loop_2 do {rTmp = A[jj + ii] * X[k] + A[jj + ii + 1] * X[k + 1]; iTmp = -A[jj + ii] * X[k + 1] + A[jj + ii + 1] * X[k]; Y[ii] += rTmp * rALPHA - iTmp * iALPHA; Y[ii + 1] += rTmp * iALPHA + iTmp * rALPHA; ii += 2;} while(0)
#define spec_loop_3 do {rTmp = A[jj + ii] * X[k] - A[jj + ii + 1] * X[k + 1]; iTmp = -A[jj + ii] * X[k + 1] - A[jj + ii + 1] * X[k]; Y[ii] += rTmp * rALPHA - iTmp * iALPHA; Y[ii + 1] += rTmp * iALPHA + iTmp * rALPHA; ii += 2;} while(0)
// norm_loop_alpha1_*: alpha omitted, Y indexed by iii, which advances by INCY * 2.
#define norm_loop_alpha1_0 do {Y[iii] += A[jj + ii] * X[k] - A[jj + ii + 1] * X[k + 1]; Y[iii + 1] += A[jj + ii] * X[k + 1] + A[jj + ii + 1] * X[k]; ii += 2; iii += INCY * 2;} while(0)
#define norm_loop_alpha1_1 do {Y[iii] += A[jj + ii] * X[k] + A[jj + ii + 1] * X[k + 1]; Y[iii + 1] += A[jj + ii] * X[k + 1] - A[jj + ii + 1] * X[k]; ii += 2; iii += INCY * 2;} while(0)
#define norm_loop_alpha1_2 do {Y[iii] += A[jj + ii] * X[k] + A[jj + ii + 1] * X[k + 1]; Y[iii + 1] += -A[jj + ii] * X[k + 1] + A[jj + ii + 1] * X[k]; ii += 2; iii += INCY * 2;} while(0)
#define norm_loop_alpha1_3 do {Y[iii] += A[jj + ii] * X[k] - A[jj + ii + 1] * X[k + 1]; Y[iii + 1] += -A[jj + ii] * X[k + 1] - A[jj + ii + 1] * X[k]; ii += 2; iii += INCY * 2;} while(0)
// norm_loop_*: general alpha, Y indexed by iii, which advances by INCY * 2.
#define norm_loop_0 do {rTmp = A[jj + ii] * X[k] - A[jj + ii + 1] * X[k + 1]; iTmp = A[jj + ii] * X[k + 1] + A[jj + ii + 1] * X[k]; Y[iii] += rTmp * rALPHA - iTmp * iALPHA; Y[iii + 1] += rTmp * iALPHA + iTmp * rALPHA; ii += 2; iii += INCY * 2;} while(0)
#define norm_loop_1 do {rTmp = A[jj + ii] * X[k] + A[jj + ii + 1] * X[k + 1]; iTmp = A[jj + ii] * X[k + 1] - A[jj + ii + 1] * X[k]; Y[iii] += rTmp * rALPHA - iTmp * iALPHA; Y[iii + 1] += rTmp * iALPHA + iTmp * rALPHA; ii += 2; iii += INCY * 2;} while(0)
#define norm_loop_2 do {rTmp = A[jj + ii] * X[k] + A[jj + ii + 1] * X[k + 1]; iTmp = -A[jj + ii] * X[k + 1] + A[jj + ii + 1] * X[k]; Y[iii] += rTmp * rALPHA - iTmp * iALPHA; Y[iii + 1] += rTmp * iALPHA + iTmp * rALPHA; ii += 2; iii += INCY * 2;} while(0)
#define norm_loop_3 do {rTmp = A[jj + ii] * X[k] - A[jj + ii + 1] * X[k + 1]; iTmp = -A[jj + ii] * X[k + 1] - A[jj + ii + 1] * X[k]; Y[iii] += rTmp * rALPHA - iTmp * iALPHA; Y[iii + 1] += rTmp * iALPHA + iTmp * rALPHA; ii += 2; iii += INCY * 2;} while(0)
/*
 * Complex GEMV kernel, non-transposed form.
 *
 * A, X and Y hold complex values as interleaved (real, imaginary) pairs.
 * For every column j in [0, N) and every row i in [0, M) the kernel adds
 *     alpha * a(i, j) * x(j)
 * to y(i), where alpha = rALPHA + i*iALPHA and the operands are conjugated
 * or not according to the CONJ / XCONJ variant of the loop-body macros
 * selected at the top of this file.
 *
 * Offsets used by the loop-body macros (all in units of FLOAT):
 *   jj  - start of column j in A, advanced by LDA * 2 per column
 *   k   - position of x(j) in X, advanced by INCX * 2 per column
 *   ii  - position of row i inside the column (and in Y when INCY == 1)
 *   iii - position of y(i) in Y when INCY != 1
 * ii and iii are advanced inside the macros, i counts complex rows.
 *
 * Returns 0 in all cases. UNUSED and BUFFER are not referenced.
 */
int CNAME(BLASLONG M, BLASLONG N, BLASLONG UNUSED, FLOAT rALPHA, FLOAT iALPHA, FLOAT *A, BLASLONG LDA, FLOAT *X, BLASLONG INCX, FLOAT *Y, BLASLONG INCY, FLOAT *BUFFER) {
    /* Quick return only when alpha is exactly zero, i.e. BOTH parts are zero.
     * The former test `!rALPHA && iALPHA` fired for a purely imaginary,
     * non-zero alpha and silently skipped the whole update. */
    if(rALPHA == 0 && iALPHA == 0)
        return 0;

    BLASLONG fahead = 60;                   /* prefetch distance, in FLOATs */
    BLASLONG spec_unroll = 2;               /* complex rows per unrolled iteration */
    BLASLONG tMQ = M - M % spec_unroll;     /* rows covered by the unrolled loop */
    BLASLONG j = 0, k = 0, jj = 0;

    if(rALPHA == 1 && iALPHA == 0) {
        /* alpha == 1: the multiplication by alpha is skipped. */
        if(INCY == 1) {
            for(; likely(j < N); j++, k += INCX * 2, jj += LDA * 2) {
                BLASLONG i = 0, ii = 0;
                for(; likely(i < tMQ); i += spec_unroll) {
                    prefetch(A[jj + ii + fahead]);
                    prefetch(Y[ii + fahead]);
                    /*loop_mark*/ spec_loop_alpha1;
                    /*loop_mark*/ spec_loop_alpha1;
                }
                /* tail: remaining M % spec_unroll rows */
                for(; likely(i < M); i++) {
                    spec_loop_alpha1;
                }
            }
        } else {
            for(; likely(j < N); j++, k += INCX * 2, jj += LDA * 2) {
                BLASLONG i = 0, ii = 0, iii = 0;
                for(; likely(i < tMQ); i += spec_unroll) {
                    prefetch(A[jj + ii + fahead]);
                    prefetch(Y[iii + fahead]);
                    /*loop_mark*/ norm_loop_alpha1;
                    /*loop_mark*/ norm_loop_alpha1;
                }
                for(; likely(i < M); i++) {
                    norm_loop_alpha1;
                }
            }
        }
    } else {
        /* General alpha: the macros stage each product in rTmp / iTmp. */
        FLOAT rTmp, iTmp;
        if(INCY == 1) {
            for(; likely(j < N); j++, k += INCX * 2, jj += LDA * 2) {
                BLASLONG i = 0, ii = 0;
                for(; likely(i < tMQ); i += spec_unroll) {
                    prefetch(A[jj + ii + fahead]);
                    prefetch(Y[ii + fahead]);
                    /*loop_mark*/ spec_loop;
                    /*loop_mark*/ spec_loop;
                }
                for(; likely(i < M); i++) {
                    spec_loop;
                }
            }
        } else {
            for(; likely(j < N); j++, k += INCX * 2, jj += LDA * 2) {
                BLASLONG i = 0, ii = 0, iii = 0;
                for(; likely(i < tMQ); i += spec_unroll) {
                    prefetch(A[jj + ii + fahead]);
                    prefetch(Y[iii + fahead]);
                    /*loop_mark*/ norm_loop;
                    /*loop_mark*/ norm_loop;
                }
                for(; likely(i < M); i++) {
                    norm_loop;
                }
            }
        }
    }
    return 0;
}

View File

@@ -0,0 +1,125 @@
#include "common.h"
// Prefetch: emits an "ld" of the memory operand x with register $0 as destination.
#define prefetch(x) __asm__ __volatile__("ld $0, %0"::"m"(x))
// Branch hints: the condition is expected to be true (likely) or false (unlikely).
#define likely(x) __builtin_expect(!!(x), 1)
#define unlikely(x) __builtin_expect(!!(x), 0)
// Select one of four variants of every loop body from the CONJ / XCONJ build flags.
// Judging by the signs in the bodies below, the variants compute:
//   _0 (neither flag): A * X
//   _1 (CONJ only)   : conj(A) * X
//   _2 (XCONJ only)  : A * conj(X)
//   _3 (both flags)  : conj(A) * conj(X)
#if !defined(CONJ) && !defined(XCONJ)
#define spec_loop_alpha1 spec_loop_alpha1_0
#define spec_loop spec_loop_0
#define norm_loop_alpha1 norm_loop_alpha1_0
#define norm_loop norm_loop_0
#endif
#if defined(CONJ) && !defined(XCONJ)
#define spec_loop_alpha1 spec_loop_alpha1_1
#define spec_loop spec_loop_1
#define norm_loop_alpha1 norm_loop_alpha1_1
#define norm_loop norm_loop_1
#endif
#if !defined(CONJ) && defined(XCONJ)
#define spec_loop_alpha1 spec_loop_alpha1_2
#define spec_loop spec_loop_2
#define norm_loop_alpha1 norm_loop_alpha1_2
#define norm_loop norm_loop_2
#endif
#if defined(CONJ) && defined(XCONJ)
#define spec_loop_alpha1 spec_loop_alpha1_3
#define spec_loop spec_loop_3
#define norm_loop_alpha1 norm_loop_alpha1_3
#define norm_loop norm_loop_3
#endif
// All bodies expand in place and rely on locals of the caller: Y, A, X, jj (offset of the
// current column in A), ii (offset of the current element inside the column), k (offset of
// the Y element being accumulated) and, where used, iii, INCX, rALPHA, iALPHA, rTmp, iTmp.
// Element e is stored as the pair [e] (real part) and [e + 1] (imaginary part).
// spec_loop_alpha1_*: alpha omitted, X indexed by ii; the product is added term by term.
#define spec_loop_alpha1_0 do {Y[k] += A[jj + ii] * X[ii]; Y[k + 1] += A[jj + ii + 1] * X[ii]; Y[k + 1] += A[jj + ii] * X[ii + 1]; Y[k] -= A[jj + ii + 1] * X[ii + 1]; ii += 2;} while(0)
#define spec_loop_alpha1_1 do {Y[k] += A[jj + ii] * X[ii]; Y[k + 1] -= A[jj + ii + 1] * X[ii]; Y[k + 1] += A[jj + ii] * X[ii + 1]; Y[k] += A[jj + ii + 1] * X[ii + 1]; ii += 2;} while(0)
#define spec_loop_alpha1_2 do {Y[k] += A[jj + ii] * X[ii]; Y[k + 1] += A[jj + ii + 1] * X[ii]; Y[k + 1] -= A[jj + ii] * X[ii + 1]; Y[k] += A[jj + ii + 1] * X[ii + 1]; ii += 2;} while(0)
#define spec_loop_alpha1_3 do {Y[k] += A[jj + ii] * X[ii]; Y[k + 1] -= A[jj + ii + 1] * X[ii]; Y[k + 1] -= A[jj + ii] * X[ii + 1]; Y[k] -= A[jj + ii + 1] * X[ii + 1]; ii += 2;} while(0)
// spec_loop_*: the product goes into (rTmp, iTmp) and is then multiplied by
// (rALPHA, iALPHA) before being added to Y[k]; X is indexed by ii.
#define spec_loop_0 do {rTmp = A[jj + ii] * X[ii] - A[jj + ii + 1] * X[ii + 1]; iTmp = A[jj + ii] * X[ii + 1] + A[jj + ii + 1] * X[ii]; Y[k] += rTmp * rALPHA - iTmp * iALPHA; Y[k + 1] += rTmp * iALPHA + iTmp * rALPHA; ii += 2;} while(0)
#define spec_loop_1 do {rTmp = A[jj + ii] * X[ii] + A[jj + ii + 1] * X[ii + 1]; iTmp = A[jj + ii] * X[ii + 1] - A[jj + ii + 1] * X[ii]; Y[k] += rTmp * rALPHA - iTmp * iALPHA; Y[k + 1] += rTmp * iALPHA + iTmp * rALPHA; ii += 2;} while(0)
#define spec_loop_2 do {rTmp = A[jj + ii] * X[ii] + A[jj + ii + 1] * X[ii + 1]; iTmp = -A[jj + ii] * X[ii + 1] + A[jj + ii + 1] * X[ii]; Y[k] += rTmp * rALPHA - iTmp * iALPHA; Y[k + 1] += rTmp * iALPHA + iTmp * rALPHA; ii += 2;} while(0)
#define spec_loop_3 do {rTmp = A[jj + ii] * X[ii] - A[jj + ii + 1] * X[ii + 1]; iTmp = -A[jj + ii] * X[ii + 1] - A[jj + ii + 1] * X[ii]; Y[k] += rTmp * rALPHA - iTmp * iALPHA; Y[k + 1] += rTmp * iALPHA + iTmp * rALPHA; ii += 2;} while(0)
// norm_loop_alpha1_*: alpha omitted, X indexed by iii, which advances by INCX * 2.
#define norm_loop_alpha1_0 do {Y[k] += A[jj + ii] * X[iii] - A[jj + ii + 1] * X[iii + 1]; Y[k + 1] += A[jj + ii] * X[iii + 1] + A[jj + ii + 1] * X[iii]; ii += 2; iii += INCX * 2;} while(0)
#define norm_loop_alpha1_1 do {Y[k] += A[jj + ii] * X[iii] + A[jj + ii + 1] * X[iii + 1]; Y[k + 1] += A[jj + ii] * X[iii + 1] - A[jj + ii + 1] * X[iii]; ii += 2; iii += INCX * 2;} while(0)
#define norm_loop_alpha1_2 do {Y[k] += A[jj + ii] * X[iii] + A[jj + ii + 1] * X[iii + 1]; Y[k + 1] += -A[jj + ii] * X[iii + 1] + A[jj + ii + 1] * X[iii]; ii += 2; iii += INCX * 2;} while(0)
#define norm_loop_alpha1_3 do {Y[k] += A[jj + ii] * X[iii] - A[jj + ii + 1] * X[iii + 1]; Y[k + 1] += -A[jj + ii] * X[iii + 1] - A[jj + ii + 1] * X[iii]; ii += 2; iii += INCX * 2;} while(0)
// norm_loop_*: general alpha, X indexed by iii, which advances by INCX * 2.
#define norm_loop_0 do {rTmp = A[jj + ii] * X[iii] - A[jj + ii + 1] * X[iii + 1]; iTmp = A[jj + ii] * X[iii + 1] + A[jj + ii + 1] * X[iii]; Y[k] += rTmp * rALPHA - iTmp * iALPHA; Y[k + 1] += rTmp * iALPHA + iTmp * rALPHA; ii += 2; iii += INCX * 2;} while(0)
#define norm_loop_1 do {rTmp = A[jj + ii] * X[iii] + A[jj + ii + 1] * X[iii + 1]; iTmp = A[jj + ii] * X[iii + 1] - A[jj + ii + 1] * X[iii]; Y[k] += rTmp * rALPHA - iTmp * iALPHA; Y[k + 1] += rTmp * iALPHA + iTmp * rALPHA; ii += 2; iii += INCX * 2;} while(0)
#define norm_loop_2 do {rTmp = A[jj + ii] * X[iii] + A[jj + ii + 1] * X[iii + 1]; iTmp = -A[jj + ii] * X[iii + 1] + A[jj + ii + 1] * X[iii]; Y[k] += rTmp * rALPHA - iTmp * iALPHA; Y[k + 1] += rTmp * iALPHA + iTmp * rALPHA; ii += 2; iii += INCX * 2;} while(0)
#define norm_loop_3 do {rTmp = A[jj + ii] * X[iii] - A[jj + ii + 1] * X[iii + 1]; iTmp = -A[jj + ii] * X[iii + 1] - A[jj + ii + 1] * X[iii]; Y[k] += rTmp * rALPHA - iTmp * iALPHA; Y[k + 1] += rTmp * iALPHA + iTmp * rALPHA; ii += 2; iii += INCX * 2;} while(0)
/*
 * Complex GEMV kernel, transposed form.
 *
 * A, X and Y hold complex values as interleaved (real, imaginary) pairs.
 * For every column j in [0, N) the kernel adds, for each row i in [0, M),
 *     alpha * a(i, j) * x(i)
 * to y(j), where alpha = rALPHA + i*iALPHA and the operands are conjugated
 * or not according to the CONJ / XCONJ variant of the loop-body macros
 * selected at the top of this file. Every product is accumulated straight
 * into Y.
 *
 * Offsets used by the loop-body macros (all in units of FLOAT):
 *   jj  - start of column j in A, advanced by LDA * 2 per column
 *   k   - position of y(j) in Y, advanced by INCY * 2 per column
 *   ii  - position of row i inside the column (and in X when INCX == 1)
 *   iii - position of x(i) in X when INCX != 1
 * ii and iii are advanced inside the macros, i counts complex rows.
 *
 * Returns 0 in all cases. UNUSED and BUFFER are not referenced.
 */
int CNAME(BLASLONG M, BLASLONG N, BLASLONG UNUSED, FLOAT rALPHA, FLOAT iALPHA, FLOAT *A, BLASLONG LDA, FLOAT *X, BLASLONG INCX, FLOAT *Y, BLASLONG INCY, FLOAT *BUFFER) {
    /* Quick return only when alpha is exactly zero, i.e. BOTH parts are zero.
     * The former test `!rALPHA && iALPHA` fired for a purely imaginary,
     * non-zero alpha and silently skipped the whole update. */
    if(rALPHA == 0 && iALPHA == 0)
        return 0;

    BLASLONG fahead = 30;                   /* prefetch distance, in FLOATs */
    BLASLONG spec_unroll = 2;               /* complex rows per unrolled iteration */
    BLASLONG tMQ = M - M % spec_unroll;     /* rows covered by the unrolled loop */
    BLASLONG j = 0, k = 0, jj = 0;

    if(rALPHA == 1 && iALPHA == 0) {
        /* alpha == 1: the multiplication by alpha is skipped. */
        if(INCX == 1) {
            for(; likely(j < N); j++, k += INCY * 2, jj += LDA * 2) {
                BLASLONG i = 0, ii = 0;
                for(; likely(i < tMQ); i += spec_unroll) {
                    prefetch(A[jj + ii + fahead]);
                    prefetch(X[ii + fahead]);
                    /*loop_mark*/ spec_loop_alpha1;
                    /*loop_mark*/ spec_loop_alpha1;
                }
                /* tail: remaining M % spec_unroll rows */
                for(; likely(i < M); i++) {
                    spec_loop_alpha1;
                }
            }
        } else {
            for(; likely(j < N); j++, k += INCY * 2, jj += LDA * 2) {
                BLASLONG i = 0, ii = 0, iii = 0;
                for(; likely(i < tMQ); i += spec_unroll) {
                    prefetch(A[jj + ii + fahead]);
                    prefetch(X[iii + fahead]);
                    /*loop_mark*/ norm_loop_alpha1;
                    /*loop_mark*/ norm_loop_alpha1;
                }
                for(; likely(i < M); i++) {
                    norm_loop_alpha1;
                }
            }
        }
    } else {
        /* General alpha: the macros stage each product in rTmp / iTmp. */
        FLOAT rTmp, iTmp;
        if(INCX == 1) {
            for(; likely(j < N); j++, k += INCY * 2, jj += LDA * 2) {
                BLASLONG i = 0, ii = 0;
                for(; likely(i < tMQ); i += spec_unroll) {
                    prefetch(A[jj + ii + fahead]);
                    prefetch(X[ii + fahead]);
                    /*loop_mark*/ spec_loop;
                    /*loop_mark*/ spec_loop;
                }
                for(; likely(i < M); i++) {
                    spec_loop;
                }
            }
        } else {
            for(; likely(j < N); j++, k += INCY * 2, jj += LDA * 2) {
                BLASLONG i = 0, ii = 0, iii = 0;
                for(; likely(i < tMQ); i += spec_unroll) {
                    prefetch(A[jj + ii + fahead]);
                    prefetch(X[iii + fahead]);
                    /*loop_mark*/ norm_loop;
                    /*loop_mark*/ norm_loop;
                }
                for(; likely(i < M); i++) {
                    norm_loop;
                }
            }
        }
    }
    return 0;
}