From 6bd7c54af5ecc2004b8a6df0157fe72d55530927 Mon Sep 17 00:00:00 2001 From: Mark Seminatore Date: Mon, 11 Dec 2023 15:13:04 -0800 Subject: [PATCH 01/21] introduce MT_TRACE to clean up SMP_DEBUG code --- driver/others/blas_server_win32.c | 60 ++++++++++++------------------- 1 file changed, 23 insertions(+), 37 deletions(-) diff --git a/driver/others/blas_server_win32.c b/driver/others/blas_server_win32.c index 40ff85abc..5820a55f4 100644 --- a/driver/others/blas_server_win32.c +++ b/driver/others/blas_server_win32.c @@ -48,6 +48,12 @@ #endif #endif +#ifdef SMP_DEBUG +# define MT_TRACE(...) fprintf(stderr, __VA_ARGS__) +#else +# define MT_TRACE(...) +#endif + /* This is a thread implementation for Win32 lazy implementation */ /* Thread server common information */ @@ -213,29 +219,24 @@ static DWORD WINAPI blas_thread_server(void *arg){ /* Each server needs each buffer */ buffer = blas_memory_alloc(2); -#ifdef SMP_DEBUG - fprintf(STDERR, "Server[%2ld] Thread is started!\n", cpu); -#endif + MT_TRACE("Server[%2ld] Thread is started!\n", cpu); while (1){ /* Waiting for Queue */ -#ifdef SMP_DEBUG - fprintf(STDERR, "Server[%2ld] Waiting for Queue.\n", cpu); -#endif + MT_TRACE("Server[%2ld] Waiting for Queue.\n", cpu); + // event raised when work is added to the queue WaitForSingleObject(kickoff_event, INFINITE); if (cpu > thread_target - 2) { - //printf("thread [%d] exiting.\n", cpu); + //MT_TRACE("thread [%d] exiting.\n", cpu); break; // excess thread, so worker thread exits } -#ifdef SMP_DEBUG - fprintf(STDERR, "Server[%2ld] Got it.\n", cpu); -#endif + MT_TRACE("Server[%2ld] Got it.\n", cpu); #if 1 EnterCriticalSection(&queue_lock); @@ -270,10 +271,8 @@ static DWORD WINAPI blas_thread_server(void *arg){ __asm__ __volatile__ ("fldcw %0" : : "m" (queue -> x87_mode)); #endif -#ifdef SMP_DEBUG - fprintf(STDERR, "Server[%2ld] Started. Mode = 0x%03x M = %3ld N=%3ld K=%3ld\n", + MT_TRACE("Server[%2ld] Started. 
Mode = 0x%03x M = %3ld N=%3ld K=%3ld\n", cpu, queue->mode, queue-> args ->m, queue->args->n, queue->args->k); -#endif // fprintf(stderr, "queue start[%ld]!!!\n", cpu); @@ -342,19 +341,14 @@ static DWORD WINAPI blas_thread_server(void *arg){ continue; //if queue == NULL } -#ifdef SMP_DEBUG - fprintf(STDERR, "Server[%2ld] Finished!\n", cpu); -#endif + MT_TRACE("Server[%2ld] Finished!\n", cpu); queue->finished = 1; - } /* Shutdown procedure */ -#ifdef SMP_DEBUG - fprintf(STDERR, "Server[%2ld] Shutdown!\n", cpu); -#endif + MT_TRACE("Server[%2ld] Shutdown!\n", cpu); blas_memory_free(buffer); @@ -369,10 +363,7 @@ int blas_thread_init(void){ LOCK_COMMAND(&server_lock); -#ifdef SMP_DEBUG - fprintf(STDERR, "Initializing Thread(Num. threads = %d)\n", - blas_cpu_number); -#endif + MT_TRACE("Initializing Thread(Num. threads = %d)\n", blas_cpu_number); if (!blas_server_avail){ // create the kickoff Event @@ -383,7 +374,7 @@ int blas_thread_init(void){ InitializeCriticalSection(&queue_lock); for(i = 0; i < blas_cpu_number - 1; i++){ - //printf("thread_init: creating thread [%d]\n", i); + //MT_TRACE("thread_init: creating thread [%d]\n", i); blas_threads[i] = CreateThread(NULL, 0, blas_thread_server, (void *)i, @@ -458,14 +449,10 @@ int exec_blas_async(BLASLONG pos, blas_queue_t *queue){ int exec_blas_async_wait(BLASLONG num, blas_queue_t *queue){ -#ifdef SMP_DEBUG - fprintf(STDERR, "Synchronization Waiting.\n"); -#endif + MT_TRACE("Synchronization Waiting.\n"); while (num){ -#ifdef SMP_DEBUG - fprintf(STDERR, "Waiting Queue ..\n"); -#endif + MT_TRACE("Waiting Queue ..\n"); while (!queue->finished) YIELDING; @@ -473,9 +460,8 @@ int exec_blas_async_wait(BLASLONG num, blas_queue_t *queue){ num--; } -#ifdef SMP_DEBUG - fprintf(STDERR, "Completely Done.\n\n"); -#endif + MT_TRACE("Completely Done.\n\n"); + // if work was added to the queue after this batch we can't sleep the worker threads // by resetting the event EnterCriticalSection(&queue_lock); @@ -577,11 +563,11 @@ void 
goto_set_num_threads(int num_threads) SetEvent(kickoff_event); for (i = num_threads - 1; i < blas_num_threads - 1; i++) { - //printf("set_num_threads: waiting on thread [%d] to quit.\n", i); + //MT_TRACE("set_num_threads: waiting on thread [%d] to quit.\n", i); WaitForSingleObject(blas_threads[i], INFINITE); - //printf("set_num_threads: thread [%d] has quit.\n", i); + //MT_TRACE("set_num_threads: thread [%d] has quit.\n", i); CloseHandle(blas_threads[i]); } @@ -610,7 +596,7 @@ void goto_set_num_threads(int num_threads) } for(i = (blas_num_threads > 0) ? blas_num_threads - 1 : 0; i < num_threads - 1; i++){ - //printf("set_num_threads: creating thread [%d]\n", i); + //MT_TRACE("set_num_threads: creating thread [%d]\n", i); blas_threads[i] = CreateThread(NULL, 0, blas_thread_server, (void *)i, From 0d7fe5ea610d46afaed9f5164f6a11729e2429de Mon Sep 17 00:00:00 2001 From: Mark Seminatore Date: Mon, 29 Jan 2024 22:33:47 -0800 Subject: [PATCH 02/21] clean up whitespace --- driver/others/blas_server_win32.c | 192 +++++++++++++----------------- 1 file changed, 85 insertions(+), 107 deletions(-) diff --git a/driver/others/blas_server_win32.c b/driver/others/blas_server_win32.c index 5820a55f4..68dde584b 100644 --- a/driver/others/blas_server_win32.c +++ b/driver/others/blas_server_win32.c @@ -72,19 +72,9 @@ static HANDLE blas_threads [MAX_CPU_NUMBER]; static DWORD blas_threads_id[MAX_CPU_NUMBER]; static volatile int thread_target; // target num of live threads, volatile for cross-thread reads -#if defined (__GNUC__) && (__GNUC__ < 6) - #define WIN_CAS(dest, exch, comp) __sync_val_compare_and_swap(dest, comp, exch) -#else - #if defined(_WIN64) - #define WIN_CAS(dest, exch, comp) InterlockedCompareExchange64(dest, exch, comp) - #else - #define WIN_CAS(dest, exch, comp) InterlockedCompareExchange(dest, exch, comp) - #endif -#endif - static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){ - if (!(mode & BLAS_COMPLEX)){ + if (!(mode & BLAS_COMPLEX)) { #ifdef 
EXPRECISION if ((mode & BLAS_PREC) == BLAS_XDOUBLE){ /* REAL / Extended Double */ @@ -99,7 +89,7 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){ args -> c, args -> ldc, sb); } else #endif - if ((mode & BLAS_PREC) == BLAS_DOUBLE){ + if ((mode & BLAS_PREC) == BLAS_DOUBLE) { /* REAL / Double */ void (*afunc)(BLASLONG, BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, @@ -110,7 +100,7 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){ args -> a, args -> lda, args -> b, args -> ldb, args -> c, args -> ldc, sb); - } else if ((mode & BLAS_PREC) == BLAS_SINGLE){ + } else if ((mode & BLAS_PREC) == BLAS_SINGLE) { /* REAL / Single */ void (*afunc)(BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, @@ -122,7 +112,7 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){ args -> b, args -> ldb, args -> c, args -> ldc, sb); #ifdef BUILD_BFLOAT16 - } else if ((mode & BLAS_PREC) == BLAS_BFLOAT16){ + } else if ((mode & BLAS_PREC) == BLAS_BFLOAT16) { /* REAL / BFLOAT16 */ void (*afunc)(BLASLONG, BLASLONG, BLASLONG, bfloat16, bfloat16 *, BLASLONG, bfloat16 *, BLASLONG, @@ -133,7 +123,7 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){ args -> a, args -> lda, args -> b, args -> ldb, args -> c, args -> ldc, sb); - } else if ((mode & BLAS_PREC) == BLAS_STOBF16){ + } else if ((mode & BLAS_PREC) == BLAS_STOBF16) { /* REAL / BLAS_STOBF16 */ void (*afunc)(BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, bfloat16 *, BLASLONG, @@ -144,7 +134,7 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){ args -> a, args -> lda, args -> b, args -> ldb, args -> c, args -> ldc, sb); - } else if ((mode & BLAS_PREC) == BLAS_DTOBF16){ + } else if ((mode & BLAS_PREC) == BLAS_DTOBF16) { /* REAL / BLAS_DTOBF16 */ void (*afunc)(BLASLONG, BLASLONG, BLASLONG, double, double *, BLASLONG, bfloat16 *, BLASLONG, @@ -161,7 +151,7 @@ static 
void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){ } } else { #ifdef EXPRECISION - if ((mode & BLAS_PREC) == BLAS_XDOUBLE){ + if ((mode & BLAS_PREC) == BLAS_XDOUBLE) { /* COMPLEX / Extended Double */ void (*afunc)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, @@ -175,7 +165,7 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){ args -> c, args -> ldc, sb); } else #endif - if ((mode & BLAS_PREC) == BLAS_DOUBLE){ + if ((mode & BLAS_PREC) == BLAS_DOUBLE) { /* COMPLEX / Double */ void (*afunc)(BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, @@ -205,10 +195,9 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){ } } -/* This is a main routine of threads. Each thread waits until job is */ -/* queued. */ - -static DWORD WINAPI blas_thread_server(void *arg){ +// This is a main routine of threads. Each thread waits until job is +// queued. +static DWORD WINAPI blas_thread_server(void *arg) { /* Thread identifier */ BLASLONG cpu = (BLASLONG)arg; @@ -221,24 +210,22 @@ static DWORD WINAPI blas_thread_server(void *arg){ MT_TRACE("Server[%2ld] Thread is started!\n", cpu); - while (1){ + while (1) { /* Waiting for Queue */ MT_TRACE("Server[%2ld] Waiting for Queue.\n", cpu); - // event raised when work is added to the queue - WaitForSingleObject(kickoff_event, INFINITE); + // event raised when work is added to the queue + WaitForSingleObject(kickoff_event, INFINITE); - if (cpu > thread_target - 2) - { - //MT_TRACE("thread [%d] exiting.\n", cpu); - break; // excess thread, so worker thread exits - } + if (cpu > thread_target - 2) { + //MT_TRACE("thread [%d] exiting.\n", cpu); + break; // excess thread, so worker thread exits + } MT_TRACE("Server[%2ld] Got it.\n", cpu); -#if 1 EnterCriticalSection(&queue_lock); queue = work_queue; @@ -246,51 +233,39 @@ static DWORD WINAPI blas_thread_server(void *arg){ work_queue = work_queue->next; 
LeaveCriticalSection(&queue_lock); -#else - volatile blas_queue_t* queue_next; - INT_PTR prev_value; - do { - queue = (volatile blas_queue_t*)work_queue; - if (!queue) - break; - - queue_next = (volatile blas_queue_t*)queue->next; - prev_value = WIN_CAS((INT_PTR*)&work_queue, (INT_PTR)queue_next, (INT_PTR)queue); - } while (prev_value != queue); -#endif - - if (queue) { + if (queue) { int (*routine)(blas_arg_t *, void *, void *, void *, void *, BLASLONG) = queue -> routine; sa = queue -> sa; sb = queue -> sb; -#ifdef CONSISTENT_FPCSR - __asm__ __volatile__ ("ldmxcsr %0" : : "m" (queue -> sse_mode)); - __asm__ __volatile__ ("fldcw %0" : : "m" (queue -> x87_mode)); -#endif + #ifdef CONSISTENT_FPCSR + __asm__ __volatile__ ("ldmxcsr %0" : : "m" (queue -> sse_mode)); + __asm__ __volatile__ ("fldcw %0" : : "m" (queue -> x87_mode)); + #endif MT_TRACE("Server[%2ld] Started. Mode = 0x%03x M = %3ld N=%3ld K=%3ld\n", cpu, queue->mode, queue-> args ->m, queue->args->n, queue->args->k); // fprintf(stderr, "queue start[%ld]!!!\n", cpu); -#ifdef MONITOR - main_status[cpu] = MAIN_RUNNING1; -#endif + #ifdef MONITOR + main_status[cpu] = MAIN_RUNNING1; + #endif - if (sa == NULL) sa = (void *)((BLASLONG)buffer + GEMM_OFFSET_A); + if (sa == NULL) + sa = (void *)((BLASLONG)buffer + GEMM_OFFSET_A); if (sb == NULL) { - if (!(queue -> mode & BLAS_COMPLEX)){ + if (!(queue -> mode & BLAS_COMPLEX)) { #ifdef EXPRECISION - if ((queue -> mode & BLAS_PREC) == BLAS_XDOUBLE){ + if ((queue -> mode & BLAS_PREC) == BLAS_XDOUBLE) { sb = (void *)(((BLASLONG)sa + ((XGEMM_P * XGEMM_Q * sizeof(xdouble) + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); } else #endif - if ((queue -> mode & BLAS_PREC) == BLAS_DOUBLE){ + if ((queue -> mode & BLAS_PREC) == BLAS_DOUBLE) { #ifdef BUILD_DOUBLE sb = (void *)(((BLASLONG)sa + ((DGEMM_P * DGEMM_Q * sizeof(double) + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); @@ -324,26 +299,25 @@ static DWORD WINAPI blas_thread_server(void *arg){ /* Other types in future */ } } - 
queue->sb=sb; + queue->sb=sb; } -#ifdef MONITOR - main_status[cpu] = MAIN_RUNNING2; -#endif + #ifdef MONITOR + main_status[cpu] = MAIN_RUNNING2; + #endif if (!(queue -> mode & BLAS_LEGACY)) { - - (routine)(queue -> args, queue -> range_m, queue -> range_n, sa, sb, queue -> position); + (routine)(queue -> args, queue -> range_m, queue -> range_n, sa, sb, queue -> position); } else { - legacy_exec(routine, queue -> mode, queue -> args, sb); + legacy_exec(routine, queue -> mode, queue -> args, sb); } - }else{ - continue; //if queue == NULL - } + } else { + continue; //if queue == NULL + } MT_TRACE("Server[%2ld] Finished!\n", cpu); - queue->finished = 1; + queue->finished = 1; } /* Shutdown procedure */ @@ -353,10 +327,12 @@ static DWORD WINAPI blas_thread_server(void *arg){ blas_memory_free(buffer); return 0; - } +} -/* Initializing routine */ -int blas_thread_init(void){ +// +// Initializing routine +// +int blas_thread_init(void) { BLASLONG i; if (blas_server_avail || (blas_cpu_number <= 1)) return 0; @@ -365,16 +341,16 @@ int blas_thread_init(void){ MT_TRACE("Initializing Thread(Num. threads = %d)\n", blas_cpu_number); - if (!blas_server_avail){ - // create the kickoff Event - kickoff_event = CreateEvent(NULL, TRUE, FALSE, NULL); + if (!blas_server_avail) { + // create the kickoff Event + kickoff_event = CreateEvent(NULL, TRUE, FALSE, NULL); - thread_target = blas_cpu_number; + thread_target = blas_cpu_number; InitializeCriticalSection(&queue_lock); - for(i = 0; i < blas_cpu_number - 1; i++){ - //MT_TRACE("thread_init: creating thread [%d]\n", i); + for(i = 0; i < blas_cpu_number - 1; i++) { + //MT_TRACE("thread_init: creating thread [%d]\n", i); blas_threads[i] = CreateThread(NULL, 0, blas_thread_server, (void *)i, @@ -391,13 +367,10 @@ int blas_thread_init(void){ /* User can call one of two routines. - exec_blas_async ... immediately returns after jobs are queued. - exec_blas ... returns after jobs are finished. 
*/ - -int exec_blas_async(BLASLONG pos, blas_queue_t *queue){ +int exec_blas_async(BLASLONG pos, blas_queue_t *queue) { #if defined(SMP_SERVER) // Handle lazy re-init of the thread-pool after a POSIX fork @@ -417,7 +390,7 @@ int exec_blas_async(BLASLONG pos, blas_queue_t *queue){ __asm__ __volatile__ ("stmxcsr %0" : "=m" (current -> sse_mode)); #endif - current->finished = 0; + current->finished = 0; current = current -> next; pos ++; } @@ -426,18 +399,18 @@ int exec_blas_async(BLASLONG pos, blas_queue_t *queue){ if (!work_queue) { - work_queue = queue; + work_queue = queue; } else { blas_queue_t *next_item = work_queue; - // find the end of the work queue - while (next_item) - next_item = next_item->next; + // find the end of the work queue + while (next_item) + next_item = next_item->next; - // add new work to the end - next_item = queue; + // add new work to the end + next_item = queue; } LeaveCriticalSection(&queue_lock); @@ -447,20 +420,24 @@ int exec_blas_async(BLASLONG pos, blas_queue_t *queue){ return 0; } -int exec_blas_async_wait(BLASLONG num, blas_queue_t *queue){ +// +// Join. 
Wait for all queued tasks to complete +// +int exec_blas_async_wait(BLASLONG num, blas_queue_t *queue) { - MT_TRACE("Synchronization Waiting.\n"); + MT_TRACE("Synchronization Waiting.\n"); - while (num){ - MT_TRACE("Waiting Queue ..\n"); - while (!queue->finished) - YIELDING; + while (num) { + MT_TRACE("Waiting Queue ..\n"); - queue = queue->next; - num--; - } + while (!queue->finished) + YIELDING; - MT_TRACE("Completely Done.\n\n"); + queue = queue->next; + num--; + } + + MT_TRACE("Completely Done.\n\n"); // if work was added to the queue after this batch we can't sleep the worker threads // by resetting the event @@ -474,8 +451,10 @@ int exec_blas_async_wait(BLASLONG num, blas_queue_t *queue){ return 0; } -/* Execute Threads */ -int exec_blas(BLASLONG num, blas_queue_t *queue){ +// +// Execute Threads +// +int exec_blas(BLASLONG num, blas_queue_t *queue) { #if defined(SMP_SERVER) && defined(OS_CYGWIN_NT) // Handle lazy re-init of the thread-pool after a POSIX fork @@ -507,9 +486,8 @@ int exec_blas(BLASLONG num, blas_queue_t *queue){ return 0; } -/* Shutdown procedure, but user don't have to call this routine. The */ -/* kernel automatically kill threads. */ - +// Shutdown procedure, but user don't have to call this routine. The +// kernel automatically kill threads. 
int BLASFUNC(blas_thread_shutdown)(void){ int i; @@ -518,9 +496,9 @@ int BLASFUNC(blas_thread_shutdown)(void){ LOCK_COMMAND(&server_lock); - if (blas_server_avail){ + if (blas_server_avail) { - for(i = 0; i < blas_num_threads - 1; i++){ + for(i = 0; i < blas_num_threads - 1; i++) { // Could also just use WaitForMultipleObjects DWORD wait_thread_value = WaitForSingleObject(blas_threads[i], 50); @@ -555,7 +533,7 @@ void goto_set_num_threads(int num_threads) if (num_threads > MAX_CPU_NUMBER) num_threads = MAX_CPU_NUMBER; - if (blas_server_avail && num_threads < blas_num_threads) { + if (blas_server_avail && num_threads < blas_num_threads) { LOCK_COMMAND(&server_lock); thread_target = num_threads; @@ -586,7 +564,7 @@ void goto_set_num_threads(int num_threads) thread_target = num_threads; //increased_threads = 1; - if (!blas_server_avail){ + if (!blas_server_avail) { // create the kickoff Event kickoff_event = CreateEvent(NULL, TRUE, FALSE, NULL); @@ -595,7 +573,7 @@ void goto_set_num_threads(int num_threads) blas_server_avail = 1; } - for(i = (blas_num_threads > 0) ? blas_num_threads - 1 : 0; i < num_threads - 1; i++){ + for(i = (blas_num_threads > 0) ? 
blas_num_threads - 1 : 0; i < num_threads - 1; i++) { //MT_TRACE("set_num_threads: creating thread [%d]\n", i); blas_threads[i] = CreateThread(NULL, 0, From 42cb567f0f9f6c8ef27558e5b61251b0805aae6d Mon Sep 17 00:00:00 2001 From: Mark Seminatore Date: Wed, 31 Jan 2024 13:24:28 -0800 Subject: [PATCH 03/21] more cleanup --- driver/others/blas_server_win32.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/driver/others/blas_server_win32.c b/driver/others/blas_server_win32.c index 68dde584b..ee6d08f8c 100644 --- a/driver/others/blas_server_win32.c +++ b/driver/others/blas_server_win32.c @@ -72,7 +72,10 @@ static HANDLE blas_threads [MAX_CPU_NUMBER]; static DWORD blas_threads_id[MAX_CPU_NUMBER]; static volatile int thread_target; // target num of live threads, volatile for cross-thread reads -static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){ +// +// +// +static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb) { if (!(mode & BLAS_COMPLEX)) { #ifdef EXPRECISION @@ -195,8 +198,9 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){ } } -// This is a main routine of threads. Each thread waits until job is -// queued. +// +// This is a main routine of threads. Each thread waits until job is queued. +// static DWORD WINAPI blas_thread_server(void *arg) { /* Thread identifier */ @@ -488,7 +492,7 @@ int exec_blas(BLASLONG num, blas_queue_t *queue) { // Shutdown procedure, but user don't have to call this routine. The // kernel automatically kill threads. 
-int BLASFUNC(blas_thread_shutdown)(void){ +int BLASFUNC(blas_thread_shutdown)(void) { int i; @@ -563,7 +567,7 @@ void goto_set_num_threads(int num_threads) thread_target = num_threads; - //increased_threads = 1; + //increased_threads = 1; if (!blas_server_avail) { // create the kickoff Event kickoff_event = CreateEvent(NULL, TRUE, FALSE, NULL); From 98c56a7314dbc0032152b7658c73c203124963f9 Mon Sep 17 00:00:00 2001 From: Mark Seminatore Date: Thu, 8 Feb 2024 13:50:15 -0800 Subject: [PATCH 04/21] more cleanup --- driver/others/blas_server_win32.c | 35 ++++++++++++++++++++----------- 1 file changed, 23 insertions(+), 12 deletions(-) diff --git a/driver/others/blas_server_win32.c b/driver/others/blas_server_win32.c index ee6d08f8c..89ce9e656 100644 --- a/driver/others/blas_server_win32.c +++ b/driver/others/blas_server_win32.c @@ -73,7 +73,7 @@ static DWORD blas_threads_id[MAX_CPU_NUMBER]; static volatile int thread_target; // target num of live threads, volatile for cross-thread reads // -// +// Legacy code path // static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb) { @@ -369,11 +369,11 @@ int blas_thread_init(void) { return 0; } -/* - User can call one of two routines. - exec_blas_async ... immediately returns after jobs are queued. - exec_blas ... returns after jobs are finished. -*/ +// +// User can call one of two routines. +// exec_blas_async ... immediately returns after jobs are queued. +// exec_blas ... returns after jobs are finished. 
+// int exec_blas_async(BLASLONG pos, blas_queue_t *queue) { #if defined(SMP_SERVER) @@ -471,27 +471,32 @@ int exec_blas(BLASLONG num, blas_queue_t *queue) { if ((num <= 0) || (queue == NULL)) return 0; - if ((num > 1) && queue -> next) exec_blas_async(1, queue -> next); + if ((num > 1) && queue -> next) + exec_blas_async(1, queue -> next); routine = queue -> routine; if (queue -> mode & BLAS_LEGACY) { legacy_exec(routine, queue -> mode, queue -> args, queue -> sb); - } else + } else { if (queue -> mode & BLAS_PTHREAD) { void (*pthreadcompat)(void *) = queue -> routine; (pthreadcompat)(queue -> args); } else (routine)(queue -> args, queue -> range_m, queue -> range_n, - queue -> sa, queue -> sb, 0); + queue -> sa, queue -> sb, 0); + } - if ((num > 1) && queue -> next) exec_blas_async_wait(num - 1, queue -> next); + if ((num > 1) && queue -> next) + exec_blas_async_wait(num - 1, queue -> next); return 0; } +// // Shutdown procedure, but user don't have to call this routine. The // kernel automatically kill threads. +// int BLASFUNC(blas_thread_shutdown)(void) { int i; @@ -502,7 +507,7 @@ int BLASFUNC(blas_thread_shutdown)(void) { if (blas_server_avail) { - for(i = 0; i < blas_num_threads - 1; i++) { + for (i = 0; i < blas_num_threads - 1; i++) { // Could also just use WaitForMultipleObjects DWORD wait_thread_value = WaitForSingleObject(blas_threads[i], 50); @@ -524,6 +529,9 @@ int BLASFUNC(blas_thread_shutdown)(void) { return 0; } +// +// Legacy function to set number of threads +// void goto_set_num_threads(int num_threads) { long i; @@ -577,7 +585,7 @@ void goto_set_num_threads(int num_threads) blas_server_avail = 1; } - for(i = (blas_num_threads > 0) ? blas_num_threads - 1 : 0; i < num_threads - 1; i++) { + for (i = (blas_num_threads > 0) ? 
blas_num_threads - 1 : 0; i < num_threads - 1; i++) { //MT_TRACE("set_num_threads: creating thread [%d]\n", i); blas_threads[i] = CreateThread(NULL, 0, @@ -593,6 +601,9 @@ void goto_set_num_threads(int num_threads) blas_cpu_number = num_threads; } +// +// Openblas function to set thread count +// void openblas_set_num_threads(int num) { goto_set_num_threads(num); From 10548a0460d0b6abd160e69cd7ca727d41681584 Mon Sep 17 00:00:00 2001 From: Mark Seminatore Date: Mon, 12 Feb 2024 10:22:12 -0800 Subject: [PATCH 05/21] update contributors --- CONTRIBUTORS.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index 493747052..8f7abc5f8 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -218,4 +218,6 @@ In chronological order: * [2022-08] Fix building from sources for QNX * Mark Seminatore - * [2023-11-09] Improve Windows threading performance scaling \ No newline at end of file + * [2023-11-09] Improve Windows threading performance scaling + * [2024-02-09] Introduce MT_TRACE facility and improve code consistency + \ No newline at end of file From ba17758c02134acb327a4b71202be6be15e36dbd Mon Sep 17 00:00:00 2001 From: Sergei Lewis Date: Fri, 16 Feb 2024 15:58:02 +0000 Subject: [PATCH 06/21] fix axpy implementations where y has a stride of 0 --- kernel/riscv64/axpy_rvv.c | 26 ++++++++++++++++++++++++-- kernel/riscv64/axpy_vector.c | 24 +++++++++++++++++++++++- 2 files changed, 47 insertions(+), 3 deletions(-) diff --git a/kernel/riscv64/axpy_rvv.c b/kernel/riscv64/axpy_rvv.c index 8bc2f30de..2d5293f76 100644 --- a/kernel/riscv64/axpy_rvv.c +++ b/kernel/riscv64/axpy_rvv.c @@ -30,19 +30,29 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#if !defined(DOUBLE) #define VSETVL(n) __riscv_vsetvl_e32m8(n) #define FLOAT_V_T vfloat32m8_t +#define FLOAT_V_M1_T vfloat32m1_t #define VLEV_FLOAT __riscv_vle32_v_f32m8 #define VLSEV_FLOAT __riscv_vlse32_v_f32m8 #define VSEV_FLOAT __riscv_vse32_v_f32m8 +#define VSEV_FLOAT_M1 __riscv_vse32_v_f32m1 #define VSSEV_FLOAT __riscv_vsse32_v_f32m8 #define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m8 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m8 +#define VFREDSUMVS_FLOAT __riscv_vfredusum_vs_f32m8_f32m1 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 #else #define VSETVL(n) __riscv_vsetvl_e64m8(n) #define FLOAT_V_T vfloat64m8_t +#define FLOAT_V_M1_T vfloat64m1_t #define VLEV_FLOAT __riscv_vle64_v_f64m8 #define VLSEV_FLOAT __riscv_vlse64_v_f64m8 #define VSEV_FLOAT __riscv_vse64_v_f64m8 +#define VSEV_FLOAT_M1 __riscv_vse64_v_f64m1 #define VSSEV_FLOAT __riscv_vsse64_v_f64m8 #define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m8 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m8 +#define VFREDSUMVS_FLOAT __riscv_vfredusum_vs_f64m8_f64m1 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 #endif int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) @@ -76,7 +86,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS VSEV_FLOAT(y, vy, vl); } - } else if (1 == inc_x) { + } else if (1 == inc_x && 0 != inc_y) { BLASLONG stride_y = inc_y * sizeof(FLOAT); @@ -89,8 +99,20 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS VSSEV_FLOAT(y, stride_y, vy, vl); } - } else { + } else if( 0 == inc_y ) { + BLASLONG stride_x = inc_x * sizeof(FLOAT); + size_t in_vl = VSETVL(n); + vy = VFMVVF_FLOAT( y[0], in_vl ); + for (size_t vl; n > 0; n -= vl, x += vl*inc_x) { + vl = VSETVL(n); + vx = VLSEV_FLOAT(x, stride_x, vl); + vy = VFMACCVF_FLOAT(vy, da, vx, vl); + } + FLOAT_V_M1_T vres = VFMVVF_FLOAT_M1( 0.0f, 1 ); + vres = VFREDSUMVS_FLOAT( vy, vres, in_vl ); + 
VSEV_FLOAT_M1(y, vres, 1); + } else { BLASLONG stride_x = inc_x * sizeof(FLOAT); BLASLONG stride_y = inc_y * sizeof(FLOAT); diff --git a/kernel/riscv64/axpy_vector.c b/kernel/riscv64/axpy_vector.c index 6dffe5f09..c77a18afa 100644 --- a/kernel/riscv64/axpy_vector.c +++ b/kernel/riscv64/axpy_vector.c @@ -51,11 +51,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VSETVL JOIN(RISCV_RVV(vsetvl), _e, ELEN, LMUL, _) #define FLOAT_V_T JOIN(vfloat, ELEN, LMUL, _t, _) +#define FLOAT_V_M1_T JOIN(vfloat, ELEN, m1, _t, _) #define VLEV_FLOAT JOIN(RISCV_RVV(vle), ELEN, _v_f, ELEN, LMUL) #define VLSEV_FLOAT JOIN(RISCV_RVV(vlse), ELEN, _v_f, ELEN, LMUL) #define VSEV_FLOAT JOIN(RISCV_RVV(vse), ELEN, _v_f, ELEN, LMUL) #define VSSEV_FLOAT JOIN(RISCV_RVV(vsse), ELEN, _v_f, ELEN, LMUL) #define VFMACCVF_FLOAT JOIN(RISCV_RVV(vfmacc), _vf_f, ELEN, LMUL, _) +#define VFMVVF_FLOAT JOIN(RISCV_RVV(vfmv), _v_f_f, ELEN, LMUL, _) +#define VFMVVF_FLOAT_M1 JOIN(RISCV_RVV(vfmv), _v_f_f, ELEN, m1, _) + +#ifdef RISCV_0p10_INTRINSICS +#define VFREDSUMVS_FLOAT(va, vb, gvl) JOIN(RISCV_RVV(vfredusum_vs_f), ELEN, LMUL, _f, JOIN2( ELEN, m1))(v_res, va, vb, gvl) +#else +#define VFREDSUMVS_FLOAT JOIN(RISCV_RVV(vfredusum_vs_f), ELEN, LMUL, _f, JOIN2( ELEN, m1)) +#endif int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) { @@ -123,7 +132,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS VSEV_FLOAT(&y[j], vy0, gvl); j += gvl; } - }else if(inc_x == 1){ + } else if (1 == inc_x && 0 != inc_y) { stride_y = inc_y * sizeof(FLOAT); gvl = VSETVL(n); if(gvl <= n/2){ @@ -151,6 +160,19 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS VSSEV_FLOAT(&y[j*inc_y], stride_y, vy0, gvl); j += gvl; } + } else if( 0 == inc_y ) { + BLASLONG stride_x = inc_x * sizeof(FLOAT); + size_t in_vl = VSETVL(n); + vy0 = VFMVVF_FLOAT( y[0], 
in_vl ); + + for (size_t vl; n > 0; n -= vl, x += vl*inc_x) { + vl = VSETVL(n); + vx0 = VLSEV_FLOAT(x, stride_x, vl); + vy0 = VFMACCVF_FLOAT(vy0, da, vx0, vl); + } + FLOAT_V_M1_T v_res = VFMVVF_FLOAT_M1( 0.0f, 1 ); + v_res = VFREDSUMVS_FLOAT( vy0, v_res, in_vl ); + y[0] = EXTRACT_FLOAT(v_res); }else{ stride_x = inc_x * sizeof(FLOAT); stride_y = inc_y * sizeof(FLOAT); From 461ecabb2249fd598b325a91d2b4dfccbc90a824 Mon Sep 17 00:00:00 2001 From: Sergei Lewis Date: Fri, 16 Feb 2024 11:33:28 +0000 Subject: [PATCH 07/21] add RISCV64_ZVL128B and RISCV64_ZVL256B targets to CI flows and to README.md --- .github/workflows/riscv64_vector.yml | 253 +++++++++++++++++++++++++++ README.md | 10 ++ common_riscv64.h | 2 +- 3 files changed, 264 insertions(+), 1 deletion(-) create mode 100644 .github/workflows/riscv64_vector.yml diff --git a/.github/workflows/riscv64_vector.yml b/.github/workflows/riscv64_vector.yml new file mode 100644 index 000000000..dd6fe9ca8 --- /dev/null +++ b/.github/workflows/riscv64_vector.yml @@ -0,0 +1,253 @@ +name: riscv64 zvl256b qemu test + +on: [push, pull_request] + +concurrency: + group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} + cancel-in-progress: true + +permissions: + contents: read # to fetch code (actions/checkout) + +jobs: + TEST: + if: "github.repository == 'OpenMathLib/OpenBLAS'" + runs-on: ubuntu-latest + env: + triple: riscv64-unknown-linux-gnu + riscv_gnu_toolchain: https://github.com/riscv-collab/riscv-gnu-toolchain + riscv_gnu_toolchain_version: 13.2.0 + riscv_gnu_toolchain_nightly_download_path: /releases/download/2024.02.02/riscv64-glibc-ubuntu-22.04-llvm-nightly-2024.02.02-nightly.tar.gz + strategy: + fail-fast: false + matrix: + include: + - target: RISCV64_ZVL128B + opts: TARGET=RISCV64_ZVL128B BINARY=64 ARCH=riscv64 + qemu_cpu: rv64,g=true,c=true,v=true,vext_spec=v1.0,vlen=128,elen=64 + - target: RISCV64_ZVL256B + opts: TARGET=RISCV64_ZVL256B BINARY=64 ARCH=riscv64 + qemu_cpu: 
rv64,g=true,c=true,v=true,vext_spec=v1.0,vlen=256,elen=64 + + steps: + - name: Checkout repository + uses: actions/checkout@v3 + + - name: install build deps + run: | + sudo apt-get update + sudo apt-get install autoconf automake autotools-dev ninja-build make \ + libgomp1-riscv64-cross ccache + wget ${riscv_gnu_toolchain}/${riscv_gnu_toolchain_nightly_download_path} + tar -xvf $(basename ${riscv_gnu_toolchain_nightly_download_path}) -C /opt + + - name: Compilation cache + uses: actions/cache@v3 + with: + path: ~/.ccache + key: ccache-${{ runner.os }}-${{ matrix.target }}-${{ github.ref }}-${{ github.sha }} + restore-keys: | + ccache-${{ runner.os }}-${{ matrix.target }}-${{ github.ref }} + ccache-${{ runner.os }}-${{ matrix.target }} + + - name: Configure ccache + run: | + test -d ~/.ccache || mkdir -p ~/.ccache + echo "max_size = 300M" > ~/.ccache/ccache.conf + echo "compression = true" >> ~/.ccache/ccache.conf + ccache -s + + - name: build OpenBLAS libs + run: | + export PATH="/opt/riscv/bin:$PATH" + make TARGET=${{ matrix.target }} CFLAGS="-DTARGET=${{ matrix.target }}" \ + CC='ccache clang --rtlib=compiler-rt -target ${triple} --sysroot /opt/riscv/sysroot --gcc-toolchain=/opt/riscv/lib/gcc/riscv64-unknown-linux-gnu/${riscv_gnu_toolchain_version}/' \ + AR='ccache ${triple}-ar' AS='ccache ${triple}-gcc' LD='ccache ${triple}-gcc' \ + RANLIB='ccache ${triple}-ranlib' \ + FC='ccache ${triple}-gfortran' ${{ matrix.opts }} \ + HOSTCC=gcc HOSTFC=gfortran -j$(nproc) + + - name: build OpenBLAS tests + run: | + export PATH="/opt/riscv/bin:$PATH" + make TARGET=${{ matrix.target }} CFLAGS="-DTARGET=${{ matrix.target }}" \ + CC='${triple}-gcc' \ + AR='ccache ${triple}-ar' AS='ccache ${triple}-gcc' LD='ccache ${triple}-gcc' \ + RANLIB='ccache ${triple}-ranlib' \ + FC='ccache ${triple}-gfortran' ${{ matrix.opts }} \ + HOSTCC=gcc HOSTFC=gfortran -j$(nproc) tests + + - name: build lapack-netlib tests + working-directory: ./lapack-netlib/TESTING + run: | + export 
PATH="/opt/riscv/bin:$PATH" + make TARGET=${{ matrix.target }} CFLAGS="-DTARGET=${{ matrix.target }}" \ + CC='${triple}-gcc' \ + AR='ccache ${triple}-ar' AS='ccache ${triple}-gcc' LD='ccache ${triple}-gcc' \ + RANLIB='ccache ${triple}-ranlib' \ + FC='ccache ${triple}-gfortran' ${{ matrix.opts }} \ + HOSTCC=gcc HOSTFC=gfortran -j$(nproc) \ + LIN/xlintsts LIN/xlintstc LIN/xlintstd LIN/xlintstz LIN/xlintstrfs \ + LIN/xlintstrfc LIN/xlintstrfd LIN/xlintstrfz LIN/xlintstds \ + LIN/xlintstzc EIG/xeigtsts EIG/xeigtstc EIG/xeigtstd EIG/xeigtstz \ + + - name: OpenBLAS tests + shell: bash + run: | + export PATH="/opt/riscv/bin:$PATH" + export QEMU_CPU=${{ matrix.qemu_cpu }} + rm -rf ./test_out + mkdir -p ./test_out + run_test() { local DIR=$1; local CMD=$2; local DATA=$3; local OUTPUT="./test_out/$DIR.$CMD"; \ + echo "`pwd`/$DIR/$CMD $DIR/$DATA" >> $OUTPUT; \ + if [[ -z $DATA ]]; then qemu-riscv64 ./$DIR/$CMD |& tee $OUTPUT ; \ + else qemu-riscv64 ./$DIR/$CMD < ./$DIR/$DATA |& tee $OUTPUT ; fi ; \ + RV=$? ; if [[ $RV != 0 ]]; then echo "*** FAIL: nonzero exit code $RV" >> $OUTPUT ; fi \ + } + run_test test cblat1 & + run_test test cblat2 cblat2.dat & + run_test test cblat3 cblat3.dat & + run_test test dblat1 & + run_test test dblat2 dblat2.dat & + run_test test dblat3 dblat3.dat & + run_test test sblat1 & + run_test test sblat2 sblat2.dat & + run_test test sblat3 sblat3.dat & + run_test test zblat1 & + run_test test zblat2 zblat2.dat & + run_test test zblat3 zblat3.dat & + run_test ctest xccblat1 & + run_test ctest xccblat2 cin2 & + run_test ctest xccblat3 cin3 & + run_test ctest xdcblat1 & + run_test ctest xdcblat2 din2 & + run_test ctest xdcblat3 din3 & + run_test ctest xscblat1 & + run_test ctest xscblat2 sin2 & + run_test ctest xscblat3 sin3 & + run_test ctest xzcblat1 & + run_test ctest xzcblat2 zin2 & + run_test ctest xzcblat3 zin3 & + wait + while IFS= read -r -d $'\0' LOG; do cat $LOG ; FAILURES=1 ; done < <(grep -lZ FAIL ./test_out/*) + if [[ ! 
-z $FAILURES ]]; then echo "==========" ; echo "== FAIL ==" ; echo "==========" ; echo ; exit 1 ; fi + + - name: netlib tests + shell: bash + run: | + : # these take a very long time + echo "Skipping netlib tests in CI" + exit 0 + : # comment out exit above to enable the tests + : # probably we want to identify a subset to run in CI + export PATH="/opt/riscv/bin:$PATH" + export QEMU_CPU=${{ matrix.qemu_cpu }} + rm -rf ./test_out + mkdir -p ./test_out + run_test() { local OUTPUT="./test_out/$1"; local DATA="./lapack-netlib/TESTING/$2"; local CMD="./lapack-netlib/TESTING/$3"; local RV=0; \ + echo "$4" >> $OUTPUT; \ + echo "$CMD" >> $OUTPUT; \ + qemu-riscv64 $CMD < $DATA |& tee $OUTPUT || RV=$?; \ + if [[ $RV != 0 ]]; then echo "*** FAIL: nonzero exit code $RV" >> $OUTPUT ; fi; \ + if grep -q fail $OUTPUT ; then echo "*** FAIL: log contains 'fail'" >> $OUTPUT ; fi ; \ + if grep rror $OUTPUT | grep -v "passed" | grep -v "largest error" > /dev/null ; then echo "*** FAIL: log contains 'error'" >> $OUTPUT ; fi \ + } # run_test OUT IN CMD TITLE: run TESTING/CMD under qemu-riscv64 with stdin from TESTING/IN, log to ./test_out/OUT and append FAIL markers for a nonzero exit status, for 'fail' in the log, and for 'rror' lines mentioning neither "passed" nor "largest error"; NOTE(review): tee truncates $OUTPUT, so the two header lines echoed first are dropped - confirm lapack_testing.py tolerates them before switching to "tee -a" + run_test stest.out stest.in LIN/xlintsts "Testing REAL LAPACK linear equation routines" & + run_test ctest.out ctest.in LIN/xlintstc "Testing COMPLEX LAPACK linear equation routines" & + run_test dtest.out dtest.in LIN/xlintstd "Testing DOUBLE PRECISION LAPACK linear equation routines" & + run_test ztest.out ztest.in LIN/xlintstz "Testing COMPLEX16 LAPACK linear equation routines" & + run_test dstest.out dstest.in LIN/xlintstds "Testing SINGLE-DOUBLE PRECISION LAPACK prototype linear equation routines" & + run_test zctest.out zctest.in LIN/xlintstzc "Testing COMPLEX-COMPLEX16 LAPACK prototype linear equation routines" & + run_test stest_rfp.out stest_rfp.in LIN/xlintstrfs "Testing REAL LAPACK RFP prototype linear equation routines" & + run_test dtest_rfp.out dtest_rfp.in LIN/xlintstrfd "Testing DOUBLE PRECISION LAPACK RFP prototype linear equation routines" & + run_test ctest_rfp.out ctest_rfp.in LIN/xlintstrfc "Testing COMPLEX LAPACK RFP prototype linear equation routines" &
+ run_test ztest_rfp.out ztest_rfp.in LIN/xlintstrfz "Testing COMPLEX16 LAPACK RFP prototype linear equation routines" & + run_test snep.out nep.in EIG/xeigtsts "NEP - Testing Nonsymmetric Eigenvalue Problem routines" & + run_test ssep.out sep.in EIG/xeigtsts "SEP - Testing Symmetric Eigenvalue Problem routines" & + run_test sse2.out se2.in EIG/xeigtsts "SEP - Testing Symmetric Eigenvalue Problem routines" & + run_test ssvd.out svd.in EIG/xeigtsts "SVD - Testing Singular Value Decomposition routines" & + run_test sec.out sec.in EIG/xeigtsts "SEC - Testing REAL Eigen Condition Routines" & + run_test sed.out sed.in EIG/xeigtsts "SEV - Testing REAL Nonsymmetric Eigenvalue Driver" & + run_test sgg.out sgg.in EIG/xeigtsts "SGG - Testing REAL Nonsymmetric Generalized Eigenvalue Problem routines" & + run_test sgd.out sgd.in EIG/xeigtsts "SGD - Testing REAL Nonsymmetric Generalized Eigenvalue Problem driver routines" & + run_test ssb.out ssb.in EIG/xeigtsts "SSB - Testing REAL Symmetric Eigenvalue Problem routines" & + run_test ssg.out ssg.in EIG/xeigtsts "SSG - Testing REAL Symmetric Generalized Eigenvalue Problem routines" & + run_test sbal.out sbal.in EIG/xeigtsts "SGEBAL - Testing the balancing of a REAL general matrix" & + run_test sbak.out sbak.in EIG/xeigtsts "SGEBAK - Testing the back transformation of a REAL balanced matrix" & + run_test sgbal.out sgbal.in EIG/xeigtsts "SGGBAL - Testing the balancing of a pair of REAL general matrices" & + run_test sgbak.out sgbak.in EIG/xeigtsts "SGGBAK - Testing the back transformation of a pair of REAL balanced matrices" & + run_test sbb.out sbb.in EIG/xeigtsts "SBB - Testing banded Singular Value Decomposition routines" & + run_test sglm.out glm.in EIG/xeigtsts "GLM - Testing Generalized Linear Regression Model routines" & + run_test sgqr.out gqr.in EIG/xeigtsts "GQR - Testing Generalized QR and RQ factorization routines" & + run_test sgsv.out gsv.in EIG/xeigtsts "GSV - Testing Generalized Singular Value Decomposition 
routines" & + run_test scsd.out csd.in EIG/xeigtsts "CSD - Testing CS Decomposition routines" & + run_test slse.out lse.in EIG/xeigtsts "LSE - Testing Constrained Linear Least Squares routines" & + run_test cnep.out nep.in EIG/xeigtstc "NEP - Testing Nonsymmetric Eigenvalue Problem routines" & + run_test csep.out sep.in EIG/xeigtstc "SEP - Testing Symmetric Eigenvalue Problem routines" & + run_test cse2.out se2.in EIG/xeigtstc "SEP - Testing Symmetric Eigenvalue Problem routines" & + run_test csvd.out svd.in EIG/xeigtstc "SVD - Testing Singular Value Decomposition routines" & + run_test cec.out cec.in EIG/xeigtstc "CEC - Testing COMPLEX Eigen Condition Routines" & + run_test ced.out ced.in EIG/xeigtstc "CES - Testing COMPLEX Nonsymmetric Schur Form Driver" & + run_test cgg.out cgg.in EIG/xeigtstc "CGG - Testing COMPLEX Nonsymmetric Generalized Eigenvalue Problem routines" & + run_test cgd.out cgd.in EIG/xeigtstc "CGD - Testing COMPLEX Nonsymmetric Generalized Eigenvalue Problem driver routines" & + run_test csb.out csb.in EIG/xeigtstc "CHB - Testing Hermitian Eigenvalue Problem routines" & + run_test csg.out csg.in EIG/xeigtstc "CSG - Testing Symmetric Generalized Eigenvalue Problem routines" & + run_test cbal.out cbal.in EIG/xeigtstc "CGEBAL - Testing the balancing of a COMPLEX general matrix" & + run_test cbak.out cbak.in EIG/xeigtstc "CGEBAK - Testing the back transformation of a COMPLEX balanced matrix" & + run_test cgbal.out cgbal.in EIG/xeigtstc "CGGBAL - Testing the balancing of a pair of COMPLEX general matrices" & + run_test cgbak.out cgbak.in EIG/xeigtstc "CGGBAK - Testing the back transformation of a pair of COMPLEX balanced matrices" & + run_test cbb.out cbb.in EIG/xeigtstc "CBB - Testing banded Singular Value Decomposition routines" & + run_test cglm.out glm.in EIG/xeigtstc "GLM - Testing Generalized Linear Regression Model routines" & + run_test cgqr.out gqr.in EIG/xeigtstc "GQR - Testing Generalized QR and RQ factorization routines" & + run_test 
cgsv.out gsv.in EIG/xeigtstc "GSV - Testing Generalized Singular Value Decomposition routines" & + run_test ccsd.out csd.in EIG/xeigtstc "CSD - Testing CS Decomposition routines" & + run_test clse.out lse.in EIG/xeigtstc "LSE - Testing Constrained Linear Least Squares routines" & + run_test dnep.out nep.in EIG/xeigtstd "NEP - Testing Nonsymmetric Eigenvalue Problem routines" & + run_test dsep.out sep.in EIG/xeigtstd "SEP - Testing Symmetric Eigenvalue Problem routines" & + run_test dse2.out se2.in EIG/xeigtstd "SEP - Testing Symmetric Eigenvalue Problem routines" & + run_test dsvd.out svd.in EIG/xeigtstd "SVD - Testing Singular Value Decomposition routines" & + run_test dec.out dec.in EIG/xeigtstd "DEC - Testing DOUBLE PRECISION Eigen Condition Routines" & + run_test ded.out ded.in EIG/xeigtstd "DEV - Testing DOUBLE PRECISION Nonsymmetric Eigenvalue Driver" & + run_test dgg.out dgg.in EIG/xeigtstd "DGG - Testing DOUBLE PRECISION Nonsymmetric Generalized Eigenvalue Problem routines" & + run_test dgd.out dgd.in EIG/xeigtstd "DGD - Testing DOUBLE PRECISION Nonsymmetric Generalized Eigenvalue Problem driver routines" & + run_test dsb.out dsb.in EIG/xeigtstd "DSB - Testing DOUBLE PRECISION Symmetric Eigenvalue Problem routines" & + run_test dsg.out dsg.in EIG/xeigtstd "DSG - Testing DOUBLE PRECISION Symmetric Generalized Eigenvalue Problem routines" & + run_test dbal.out dbal.in EIG/xeigtstd "DGEBAL - Testing the balancing of a DOUBLE PRECISION general matrix" & + run_test dbak.out dbak.in EIG/xeigtstd "DGEBAK - Testing the back transformation of a DOUBLE PRECISION balanced matrix" & + run_test dgbal.out dgbal.in EIG/xeigtstd "DGGBAL - Testing the balancing of a pair of DOUBLE PRECISION general matrices" & + run_test dgbak.out dgbak.in EIG/xeigtstd "DGGBAK - Testing the back transformation of a pair of DOUBLE PRECISION balanced matrices" & + run_test dbb.out dbb.in EIG/xeigtstd "DBB - Testing banded Singular Value Decomposition routines" & + run_test dglm.out glm.in 
EIG/xeigtstd "GLM - Testing Generalized Linear Regression Model routines" & + run_test dgqr.out gqr.in EIG/xeigtstd "GQR - Testing Generalized QR and RQ factorization routines" & + run_test dgsv.out gsv.in EIG/xeigtstd "GSV - Testing Generalized Singular Value Decomposition routines" & + run_test dcsd.out csd.in EIG/xeigtstd "CSD - Testing CS Decomposition routines" & + run_test dlse.out lse.in EIG/xeigtstd "LSE - Testing Constrained Linear Least Squares routines" & + run_test znep.out nep.in EIG/xeigtstz "NEP - Testing Nonsymmetric Eigenvalue Problem routines" & + run_test zsep.out sep.in EIG/xeigtstz "SEP - Testing Symmetric Eigenvalue Problem routines" & + run_test zse2.out se2.in EIG/xeigtstz "SEP - Testing Symmetric Eigenvalue Problem routines" & + run_test zsvd.out svd.in EIG/xeigtstz "SVD - Testing Singular Value Decomposition routines" & + run_test zec.out zec.in EIG/xeigtstz "ZEC - Testing COMPLEX16 Eigen Condition Routines" & + run_test zed.out zed.in EIG/xeigtstz "ZES - Testing COMPLEX16 Nonsymmetric Schur Form Driver" & + run_test zgg.out zgg.in EIG/xeigtstz "ZGG - Testing COMPLEX16 Nonsymmetric Generalized Eigenvalue Problem routines" & + run_test zgd.out zgd.in EIG/xeigtstz "ZGD - Testing COMPLEX16 Nonsymmetric Generalized Eigenvalue Problem driver routines" & + run_test zsb.out zsb.in EIG/xeigtstz "ZHB - Testing Hermitian Eigenvalue Problem routines" & + run_test zsg.out zsg.in EIG/xeigtstz "ZSG - Testing Symmetric Generalized Eigenvalue Problem routines" & + run_test zbal.out zbal.in EIG/xeigtstz "ZGEBAL - Testing the balancing of a COMPLEX16 general matrix" & + run_test zbak.out zbak.in EIG/xeigtstz "ZGEBAK - Testing the back transformation of a COMPLEX16 balanced matrix" & + run_test zgbal.out zgbal.in EIG/xeigtstz "ZGGBAL - Testing the balancing of a pair of COMPLEX general matrices" & + run_test zgbak.out zgbak.in EIG/xeigtstz "ZGGBAK - Testing the back transformation of a pair of COMPLEX16 balanced matrices" & + run_test zbb.out zbb.in 
EIG/xeigtstz "ZBB - Testing banded Singular Value Decomposition routines" & + run_test zglm.out glm.in EIG/xeigtstz "GLM - Testing Generalized Linear Regression Model routines" & + run_test zgqr.out gqr.in EIG/xeigtstz "GQR - Testing Generalized QR and RQ factorization routines" & + run_test zgsv.out gsv.in EIG/xeigtstz "GSV - Testing Generalized Singular Value Decomposition routines" & + run_test zcsd.out csd.in EIG/xeigtstz "CSD - Testing CS Decomposition routines" & + run_test zlse.out lse.in EIG/xeigtstz "LSE - Testing Constrained Linear Least Squares routines" & + wait + while IFS= read -r -d $'\0' LOG; do cat $LOG ; FAILURES=1 ; done < <(grep -lZ FAIL ./test_out/*) + python ./lapack-netlib/lapack_testing.py -d ./test_out -e > netlib_summary + TOTALS="$(grep 'ALL PRECISIONS' netlib_summary)" + NUMERICAL_ERRORS=-1 + OTHER_ERRORS=-1 + . <(awk '/ALL PRECISIONS/{printf "NUMERICAL_ERRORS=%s\nOTHER_ERRORS=%s\n", $5, $7}' netlib_summary) # source the two counters awk prints from fields 5 and 7 of the "ALL PRECISIONS" line; they keep the -1 defaults if that line is absent + if (( NUMERICAL_ERRORS != 0 )) || (( OTHER_ERRORS != 0 )) ; then cat netlib_summary ; FAILURES=1 ; fi + if [[ ! -z $FAILURES ]]; then echo "==========" ; echo "== FAIL ==" ; echo "==========" ; echo ; exit 1 ; fi diff --git a/README.md b/README.md index 2f0a0da4c..43f390db0 100644 --- a/README.md +++ b/README.md @@ -203,6 +203,16 @@ Please read `GotoBLAS_01Readme.txt` for older CPU models already supported by th make HOSTCC=gcc TARGET=x280 NUM_THREADS=8 CC=riscv64-unknown-linux-gnu-clang FC=riscv64-unknown-linux-gnu-gfortran ``` +- **ZVL???B**: Level-3 BLAS and Level-1,2 including vectorised kernels targeting generic RISCV cores with vector support with registers of at least the corresponding width; ZVL128B and ZVL256B are available.
+e.g.: + ```sh +make TARGET=RISCV64_ZVL256B CFLAGS="-DTARGET=RISCV64_ZVL256B" \ + BINARY=64 ARCH=riscv64 CC='clang -target riscv64-unknown-linux-gnu' \ + AR=riscv64-unknown-linux-gnu-ar AS=riscv64-unknown-linux-gnu-gcc \ + LD=riscv64-unknown-linux-gnu-gcc FC=riscv64-unknown-linux-gnu-gfortran \ + HOSTCC=gcc HOSTFC=gfortran -j + ``` + ### Support for multiple targets in a single library OpenBLAS can be built for multiple targets with runtime detection of the target cpu by specifiying `DYNAMIC_ARCH=1` in Makefile.rule, on the gmake command line or as `-DDYNAMIC_ARCH=TRUE` in cmake. diff --git a/common_riscv64.h b/common_riscv64.h index ab3bfa25a..eccfc644f 100644 --- a/common_riscv64.h +++ b/common_riscv64.h @@ -91,7 +91,7 @@ static inline int blas_quickdivide(blasint x, blasint y){ #define BUFFER_SIZE ( 32 << 20) #define SEEK_ADDRESS -#if defined(C910V) || (defined(RISCV64_ZVL256B) && (defined(__clang__) || defined(RVV_COMPATIBLE_GCC))) || defined(RISCV64_ZVL128B) || defined(x280) +#if defined(C910V) || defined(RISCV64_ZVL256B) || defined(RISCV64_ZVL128B) || defined(x280) # include <riscv_vector.h> #endif From 4787a55c64a17d80020c370c3e439f362979c83e Mon Sep 17 00:00:00 2001 From: pengxu Date: Tue, 20 Feb 2024 20:41:45 +0800 Subject: [PATCH 08/21] Optimized cgemm kernel 16x4 LASX for LoongArch --- kernel/generic/zhemm_ltcopy_16.c | 1170 ++++++ kernel/generic/zhemm_utcopy_16.c | 1168 ++++++ kernel/generic/zneg_tcopy_16.c | 587 +++ kernel/generic/zsymm_lcopy_16.c | 333 ++ kernel/generic/zsymm_ucopy_16.c | 332 ++ kernel/generic/ztrmm_lncopy_16.c | 2310 ++++++++++++ kernel/generic/ztrmm_ltcopy_16.c | 2313 ++++++++++++ kernel/generic/ztrmm_uncopy_16.c | 2316 ++++++++++++ kernel/generic/ztrmm_utcopy_16.c | 2318 ++++++++++++ kernel/generic/ztrsm_lncopy_16.c | 308 ++ kernel/generic/ztrsm_ltcopy_16.c | 264 ++ kernel/generic/ztrsm_uncopy_16.c | 313 ++ kernel/generic/ztrsm_utcopy_16.c | 261 ++ kernel/loongarch64/KERNEL.LOONGSON3R5 | 10 +- kernel/loongarch64/cgemm_kernel_16x4_lasx.S | 3757
+++++++++++++++++++ kernel/loongarch64/cgemm_ncopy_16_lasx.S | 691 ++++ kernel/loongarch64/cgemm_ncopy_4_lasx.S | 325 ++ kernel/loongarch64/cgemm_tcopy_16_lasx.S | 741 ++++ kernel/loongarch64/cgemm_tcopy_4_lasx.S | 306 ++ param.h | 12 +- 20 files changed, 19828 insertions(+), 7 deletions(-) create mode 100644 kernel/generic/zhemm_ltcopy_16.c create mode 100644 kernel/generic/zhemm_utcopy_16.c create mode 100644 kernel/generic/zneg_tcopy_16.c create mode 100644 kernel/generic/zsymm_lcopy_16.c create mode 100644 kernel/generic/zsymm_ucopy_16.c create mode 100644 kernel/generic/ztrmm_lncopy_16.c create mode 100644 kernel/generic/ztrmm_ltcopy_16.c create mode 100644 kernel/generic/ztrmm_uncopy_16.c create mode 100644 kernel/generic/ztrmm_utcopy_16.c create mode 100644 kernel/generic/ztrsm_lncopy_16.c create mode 100644 kernel/generic/ztrsm_ltcopy_16.c create mode 100644 kernel/generic/ztrsm_uncopy_16.c create mode 100644 kernel/generic/ztrsm_utcopy_16.c create mode 100644 kernel/loongarch64/cgemm_kernel_16x4_lasx.S create mode 100644 kernel/loongarch64/cgemm_ncopy_16_lasx.S create mode 100644 kernel/loongarch64/cgemm_ncopy_4_lasx.S create mode 100644 kernel/loongarch64/cgemm_tcopy_16_lasx.S create mode 100644 kernel/loongarch64/cgemm_tcopy_4_lasx.S diff --git a/kernel/generic/zhemm_ltcopy_16.c b/kernel/generic/zhemm_ltcopy_16.c new file mode 100644 index 000000000..8797891ea --- /dev/null +++ b/kernel/generic/zhemm_ltcopy_16.c @@ -0,0 +1,1170 @@ +/******************************************************************************* +Copyright (c) 2024, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*******************************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, js, offset; + + FLOAT data01, data02, data03, data04, data05, data06, data07, data08; + FLOAT data09, data10, data11, data12, data13, data14, data15, data16; + FLOAT data17, data18, data19, data20, data21, data22, data23, data24; + FLOAT data25, data26, data27, data28, data29, data30, data31, data32; + + FLOAT *ao1, *ao2, *ao3, *ao4, *ao5, *ao6, *ao7, *ao8; + FLOAT *ao9, *ao10, *ao11, *ao12, *ao13, *ao14, *ao15, *ao16; + + lda *= 2; + + js = (n >> 4); + while (js > 0){ + + offset = posX - posY; + + if (offset > 0) ao1 = a + (posX + 0) * 2 + posY * lda; else ao1 = a + posY * 2 + (posX + 0) * lda; + if (offset > -1) ao2 = a + (posX + 1) * 2 + posY * lda; else ao2 = a + posY * 2 + (posX + 1) * lda; + if (offset > -2) ao3 = a + (posX + 2) * 2 + posY * lda; else ao3 = a + posY * 2 + (posX + 2) * lda; + if (offset > -3) ao4 = a + (posX + 3) * 2 + posY * lda; else ao4 = a + posY * 2 + (posX + 3) * lda; + if (offset > -4) ao5 = a + (posX + 4) * 2 + posY * lda; else ao5 = a + posY * 2 + (posX + 4) * lda; + if (offset > -5) ao6 = a + (posX + 5) * 2 + posY * lda; else ao6 = a + posY * 2 + (posX + 5) * lda; + if (offset > -6) ao7 = a + (posX + 6) * 2 + posY * lda; else ao7 = a + posY * 2 + (posX + 6) * lda; + if (offset > -7) ao8 = a + (posX + 7) * 2 + posY * lda; else ao8 = a + posY * 2 + (posX + 7) * lda; + if (offset > -8) ao9 = a + (posX + 8) * 2 + posY * lda; else ao9 = a + posY * 2 + (posX + 8) * lda; + if (offset > -9) ao10 = a + (posX + 9) * 2 + posY * lda; else ao10 = a + posY * 2 + (posX + 9) * lda; + if (offset > -10) ao11 = a + (posX + 10) * 2 + posY * lda; else ao11 = a + posY * 2 + (posX + 10) * lda; + if (offset > -11) ao12 = a + (posX + 11) * 2 + posY * lda; else ao12 = a + posY * 2 + (posX + 11) * lda; + if (offset > -12) ao13 = a 
+ (posX + 12) * 2 + posY * lda; else ao13 = a + posY * 2 + (posX + 12) * lda; + if (offset > -13) ao14 = a + (posX + 13) * 2 + posY * lda; else ao14 = a + posY * 2 + (posX + 13) * lda; + if (offset > -14) ao15 = a + (posX + 14) * 2 + posY * lda; else ao15 = a + posY * 2 + (posX + 14) * lda; + if (offset > -15) ao16 = a + (posX + 15) * 2 + posY * lda; else ao16 = a + posY * 2 + (posX + 15) * lda; + + i = m; + + while (i > 0) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao2 + 0); + data04 = *(ao2 + 1); + data05 = *(ao3 + 0); + data06 = *(ao3 + 1); + data07 = *(ao4 + 0); + data08 = *(ao4 + 1); + data09 = *(ao5 + 0); + data10 = *(ao5 + 1); + data11 = *(ao6 + 0); + data12 = *(ao6 + 1); + data13 = *(ao7 + 0); + data14 = *(ao7 + 1); + data15 = *(ao8 + 0); + data16 = *(ao8 + 1); + data17 = *(ao9 + 0); + data18 = *(ao9 + 1); + data19 = *(ao10 + 0); + data20 = *(ao10 + 1); + data21 = *(ao11 + 0); + data22 = *(ao11 + 1); + data23 = *(ao12 + 0); + data24 = *(ao12 + 1); + data25 = *(ao13 + 0); + data26 = *(ao13 + 1); + data27 = *(ao14 + 0); + data28 = *(ao14 + 1); + data29 = *(ao15 + 0); + data30 = *(ao15 + 1); + data31 = *(ao16 + 0); + data32 = *(ao16 + 1); + + if (offset > 0) ao1 += lda; else ao1 += 2; + if (offset > -1) ao2 += lda; else ao2 += 2; + if (offset > -2) ao3 += lda; else ao3 += 2; + if (offset > -3) ao4 += lda; else ao4 += 2; + if (offset > -4) ao5 += lda; else ao5 += 2; + if (offset > -5) ao6 += lda; else ao6 += 2; + if (offset > -6) ao7 += lda; else ao7 += 2; + if (offset > -7) ao8 += lda; else ao8 += 2; + if (offset > -8) ao9 += lda; else ao9 += 2; + if (offset > -9) ao10 += lda; else ao10 += 2; + if (offset > -10) ao11 += lda; else ao11 += 2; + if (offset > -11) ao12 += lda; else ao12 += 2; + if (offset > -12) ao13 += lda; else ao13 += 2; + if (offset > -13) ao14 += lda; else ao14 += 2; + if (offset > -14) ao15 += lda; else ao15 += 2; + if (offset > -15) ao16 += lda; else ao16 += 2; + + if (offset > 0) { + b[ 0] = data01; + b[ 1] = data02; + b[ 
2] = data03; + b[ 3] = data04; + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = data08; + b[ 8] = data09; + b[ 9] = data10; + b[10] = data11; + b[11] = data12; + b[12] = data13; + b[13] = data14; + b[14] = data15; + b[15] = data16; + b[16] = data17; + b[17] = data18; + b[18] = data19; + b[19] = data20; + b[20] = data21; + b[21] = data22; + b[22] = data23; + b[23] = data24; + b[24] = data25; + b[25] = data26; + b[26] = data27; + b[27] = data28; + b[28] = data29; + b[29] = data30; + b[30] = data31; + b[31] = data32; + } else + if (offset < -15) { + b[ 0] = data01; + b[ 1] = -data02; + b[ 2] = data03; + b[ 3] = -data04; + b[ 4] = data05; + b[ 5] = -data06; + b[ 6] = data07; + b[ 7] = -data08; + b[ 8] = data09; + b[ 9] = -data10; + b[10] = data11; + b[11] = -data12; + b[12] = data13; + b[13] = -data14; + b[14] = data15; + b[15] = -data16; + b[16] = data17; + b[17] = -data18; + b[18] = data19; + b[19] = -data20; + b[20] = data21; + b[21] = -data22; + b[22] = data23; + b[23] = -data24; + b[24] = data25; + b[25] = -data26; + b[26] = data27; + b[27] = -data28; + b[28] = data29; + b[29] = -data30; + b[30] = data31; + b[31] = -data32; + } else { + switch (offset) { + case 0 : + b[ 0] = data01; + b[ 1] = ZERO; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = data08; + b[ 8] = data09; + b[ 9] = data10; + b[10] = data11; + b[11] = data12; + b[12] = data13; + b[13] = data14; + b[14] = data15; + b[15] = data16; + b[16] = data17; + b[17] = data18; + b[18] = data19; + b[19] = data20; + b[20] = data21; + b[21] = data22; + b[22] = data23; + b[23] = data24; + b[24] = data25; + b[25] = data26; + b[26] = data27; + b[27] = data28; + b[28] = data29; + b[29] = data30; + b[30] = data31; + b[31] = data32; + break; + case -1 : + b[ 0] = data01; + b[ 1] = -data02; + b[ 2] = data03; + b[ 3] = ZERO; + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = data08; + b[ 8] = data09; + b[ 9] = data10; + b[10] = data11; + b[11] 
= data12; + b[12] = data13; + b[13] = data14; + b[14] = data15; + b[15] = data16; + b[16] = data17; + b[17] = data18; + b[18] = data19; + b[19] = data20; + b[20] = data21; + b[21] = data22; + b[22] = data23; + b[23] = data24; + b[24] = data25; + b[25] = data26; + b[26] = data27; + b[27] = data28; + b[28] = data29; + b[29] = data30; + b[30] = data31; + b[31] = data32; + break; + case -2 : + b[ 0] = data01; + b[ 1] = -data02; + b[ 2] = data03; + b[ 3] = -data04; + b[ 4] = data05; + b[ 5] = ZERO; + b[ 6] = data07; + b[ 7] = data08; + b[ 8] = data09; + b[ 9] = data10; + b[10] = data11; + b[11] = data12; + b[12] = data13; + b[13] = data14; + b[14] = data15; + b[15] = data16; + b[16] = data17; + b[17] = data18; + b[18] = data19; + b[19] = data20; + b[20] = data21; + b[21] = data22; + b[22] = data23; + b[23] = data24; + b[24] = data25; + b[25] = data26; + b[26] = data27; + b[27] = data28; + b[28] = data29; + b[29] = data30; + b[30] = data31; + b[31] = data32; + break; + case -3 : + b[ 0] = data01; + b[ 1] = -data02; + b[ 2] = data03; + b[ 3] = -data04; + b[ 4] = data05; + b[ 5] = -data06; + b[ 6] = data07; + b[ 7] = ZERO; + b[ 8] = data09; + b[ 9] = data10; + b[10] = data11; + b[11] = data12; + b[12] = data13; + b[13] = data14; + b[14] = data15; + b[15] = data16; + b[16] = data17; + b[17] = data18; + b[18] = data19; + b[19] = data20; + b[20] = data21; + b[21] = data22; + b[22] = data23; + b[23] = data24; + b[24] = data25; + b[25] = data26; + b[26] = data27; + b[27] = data28; + b[28] = data29; + b[29] = data30; + b[30] = data31; + b[31] = data32; + break; + case -4 : + b[ 0] = data01; + b[ 1] = -data02; + b[ 2] = data03; + b[ 3] = -data04; + b[ 4] = data05; + b[ 5] = -data06; + b[ 6] = data07; + b[ 7] = -data08; + b[ 8] = data09; + b[ 9] = ZERO; + b[10] = data11; + b[11] = data12; + b[12] = data13; + b[13] = data14; + b[14] = data15; + b[15] = data16; + b[16] = data17; + b[17] = data18; + b[18] = data19; + b[19] = data20; + b[20] = data21; + b[21] = data22; + b[22] = 
data23; + b[23] = data24; + b[24] = data25; + b[25] = data26; + b[26] = data27; + b[27] = data28; + b[28] = data29; + b[29] = data30; + b[30] = data31; + b[31] = data32; + break; + case -5 : + b[ 0] = data01; + b[ 1] = -data02; + b[ 2] = data03; + b[ 3] = -data04; + b[ 4] = data05; + b[ 5] = -data06; + b[ 6] = data07; + b[ 7] = -data08; + b[ 8] = data09; + b[ 9] = -data10; + b[10] = data11; + b[11] = ZERO; + b[12] = data13; + b[13] = data14; + b[14] = data15; + b[15] = data16; + b[16] = data17; + b[17] = data18; + b[18] = data19; + b[19] = data20; + b[20] = data21; + b[21] = data22; + b[22] = data23; + b[23] = data24; + b[24] = data25; + b[25] = data26; + b[26] = data27; + b[27] = data28; + b[28] = data29; + b[29] = data30; + b[30] = data31; + b[31] = data32; + break; + case -6 : + b[ 0] = data01; + b[ 1] = -data02; + b[ 2] = data03; + b[ 3] = -data04; + b[ 4] = data05; + b[ 5] = -data06; + b[ 6] = data07; + b[ 7] = -data08; + b[ 8] = data09; + b[ 9] = -data10; + b[10] = data11; + b[11] = -data12; + b[12] = data13; + b[13] = ZERO; + b[14] = data15; + b[15] = data16; + b[16] = data17; + b[17] = data18; + b[18] = data19; + b[19] = data20; + b[20] = data21; + b[21] = data22; + b[22] = data23; + b[23] = data24; + b[24] = data25; + b[25] = data26; + b[26] = data27; + b[27] = data28; + b[28] = data29; + b[29] = data30; + b[30] = data31; + b[31] = data32; + break; + case -7 : + b[ 0] = data01; + b[ 1] = -data02; + b[ 2] = data03; + b[ 3] = -data04; + b[ 4] = data05; + b[ 5] = -data06; + b[ 6] = data07; + b[ 7] = -data08; + b[ 8] = data09; + b[ 9] = -data10; + b[10] = data11; + b[11] = -data12; + b[12] = data13; + b[13] = -data14; + b[14] = data15; + b[15] = ZERO; + b[16] = data17; + b[17] = data18; + b[18] = data19; + b[19] = data20; + b[20] = data21; + b[21] = data22; + b[22] = data23; + b[23] = data24; + b[24] = data25; + b[25] = data26; + b[26] = data27; + b[27] = data28; + b[28] = data29; + b[29] = data30; + b[30] = data31; + b[31] = data32; + break; + case -8 : + b[ 
0] = data01; + b[ 1] = -data02; + b[ 2] = data03; + b[ 3] = -data04; + b[ 4] = data05; + b[ 5] = -data06; + b[ 6] = data07; + b[ 7] = -data08; + b[ 8] = data09; + b[ 9] = -data10; + b[10] = data11; + b[11] = -data12; + b[12] = data13; + b[13] = -data14; + b[14] = data15; + b[15] = -data16; + b[16] = data17; + b[17] = ZERO; + b[18] = data19; + b[19] = data20; + b[20] = data21; + b[21] = data22; + b[22] = data23; + b[23] = data24; + b[24] = data25; + b[25] = data26; + b[26] = data27; + b[27] = data28; + b[28] = data29; + b[29] = data30; + b[30] = data31; + b[31] = data32; + break; + case -9 : + b[ 0] = data01; + b[ 1] = -data02; + b[ 2] = data03; + b[ 3] = -data04; + b[ 4] = data05; + b[ 5] = -data06; + b[ 6] = data07; + b[ 7] = -data08; + b[ 8] = data09; + b[ 9] = -data10; + b[10] = data11; + b[11] = -data12; + b[12] = data13; + b[13] = -data14; + b[14] = data15; + b[15] = -data16; + b[16] = data17; + b[17] = -data18; + b[18] = data19; + b[19] = ZERO; + b[20] = data21; + b[21] = data22; + b[22] = data23; + b[23] = data24; + b[24] = data25; + b[25] = data26; + b[26] = data27; + b[27] = data28; + b[28] = data29; + b[29] = data30; + b[30] = data31; + b[31] = data32; + break; + case -10 : + b[ 0] = data01; + b[ 1] = -data02; + b[ 2] = data03; + b[ 3] = -data04; + b[ 4] = data05; + b[ 5] = -data06; + b[ 6] = data07; + b[ 7] = -data08; + b[ 8] = data09; + b[ 9] = -data10; + b[10] = data11; + b[11] = -data12; + b[12] = data13; + b[13] = -data14; + b[14] = data15; + b[15] = -data16; + b[16] = data17; + b[17] = -data18; + b[18] = data19; + b[19] = -data20; + b[20] = data21; + b[21] = ZERO; + b[22] = data23; + b[23] = data24; + b[24] = data25; + b[25] = data26; + b[26] = data27; + b[27] = data28; + b[28] = data29; + b[29] = data30; + b[30] = data31; + b[31] = data32; + break; + case -11 : + b[ 0] = data01; + b[ 1] = -data02; + b[ 2] = data03; + b[ 3] = -data04; + b[ 4] = data05; + b[ 5] = -data06; + b[ 6] = data07; + b[ 7] = -data08; + b[ 8] = data09; + b[ 9] = -data10; + 
b[10] = data11; + b[11] = -data12; + b[12] = data13; + b[13] = -data14; + b[14] = data15; + b[15] = -data16; + b[16] = data17; + b[17] = -data18; + b[18] = data19; + b[19] = -data20; + b[20] = data21; + b[21] = -data22; + b[22] = data23; + b[23] = ZERO; + b[24] = data25; + b[25] = data26; + b[26] = data27; + b[27] = data28; + b[28] = data29; + b[29] = data30; + b[30] = data31; + b[31] = data32; + break; + case -12 : + b[ 0] = data01; + b[ 1] = -data02; + b[ 2] = data03; + b[ 3] = -data04; + b[ 4] = data05; + b[ 5] = -data06; + b[ 6] = data07; + b[ 7] = -data08; + b[ 8] = data09; + b[ 9] = -data10; + b[10] = data11; + b[11] = -data12; + b[12] = data13; + b[13] = -data14; + b[14] = data15; + b[15] = -data16; + b[16] = data17; + b[17] = -data18; + b[18] = data19; + b[19] = -data20; + b[20] = data21; + b[21] = -data22; + b[22] = data23; + b[23] = -data24; + b[24] = data25; + b[25] = ZERO; + b[26] = data27; + b[27] = data28; + b[28] = data29; + b[29] = data30; + b[30] = data31; + b[31] = data32; + break; + case -13 : + b[ 0] = data01; + b[ 1] = -data02; + b[ 2] = data03; + b[ 3] = -data04; + b[ 4] = data05; + b[ 5] = -data06; + b[ 6] = data07; + b[ 7] = -data08; + b[ 8] = data09; + b[ 9] = -data10; + b[10] = data11; + b[11] = -data12; + b[12] = data13; + b[13] = -data14; + b[14] = data15; + b[15] = -data16; + b[16] = data17; + b[17] = -data18; + b[18] = data19; + b[19] = -data20; + b[20] = data21; + b[21] = -data22; + b[22] = data23; + b[23] = -data24; + b[24] = data25; + b[25] = -data26; + b[26] = data27; + b[27] = ZERO; + b[28] = data29; + b[29] = data30; + b[30] = data31; + b[31] = data32; + break; + case -14 : + b[ 0] = data01; + b[ 1] = -data02; + b[ 2] = data03; + b[ 3] = -data04; + b[ 4] = data05; + b[ 5] = -data06; + b[ 6] = data07; + b[ 7] = -data08; + b[ 8] = data09; + b[ 9] = -data10; + b[10] = data11; + b[11] = -data12; + b[12] = data13; + b[13] = -data14; + b[14] = data15; + b[15] = -data16; + b[16] = data17; + b[17] = -data18; + b[18] = data19; + b[19] = 
-data20; + b[20] = data21; + b[21] = -data22; + b[22] = data23; + b[23] = -data24; + b[24] = data25; + b[25] = -data26; + b[26] = data27; + b[27] = -data28; + b[28] = data29; + b[29] = ZERO; + b[30] = data31; + b[31] = data32; + break; + case -15 : + b[ 0] = data01; + b[ 1] = -data02; + b[ 2] = data03; + b[ 3] = -data04; + b[ 4] = data05; + b[ 5] = -data06; + b[ 6] = data07; + b[ 7] = -data08; + b[ 8] = data09; + b[ 9] = -data10; + b[10] = data11; + b[11] = -data12; + b[12] = data13; + b[13] = -data14; + b[14] = data15; + b[15] = -data16; + b[16] = data17; + b[17] = -data18; + b[18] = data19; + b[19] = -data20; + b[20] = data21; + b[21] = -data22; + b[22] = data23; + b[23] = -data24; + b[24] = data25; + b[25] = -data26; + b[26] = data27; + b[27] = -data28; + b[28] = data29; + b[29] = -data30; + b[30] = data31; + b[31] = ZERO; + break; + } + } + + b += 32; + + offset --; + i --; + } + + posX += 16; + js --; + } + + if (n & 8) { + offset = posX - posY; + + if (offset > 0) ao1 = a + (posX + 0) * 2 + posY * lda; else ao1 = a + posY * 2 + (posX + 0) * lda; + if (offset > -1) ao2 = a + (posX + 1) * 2 + posY * lda; else ao2 = a + posY * 2 + (posX + 1) * lda; + if (offset > -2) ao3 = a + (posX + 2) * 2 + posY * lda; else ao3 = a + posY * 2 + (posX + 2) * lda; + if (offset > -3) ao4 = a + (posX + 3) * 2 + posY * lda; else ao4 = a + posY * 2 + (posX + 3) * lda; + if (offset > -4) ao5 = a + (posX + 4) * 2 + posY * lda; else ao5 = a + posY * 2 + (posX + 4) * lda; + if (offset > -5) ao6 = a + (posX + 5) * 2 + posY * lda; else ao6 = a + posY * 2 + (posX + 5) * lda; + if (offset > -6) ao7 = a + (posX + 6) * 2 + posY * lda; else ao7 = a + posY * 2 + (posX + 6) * lda; + if (offset > -7) ao8 = a + (posX + 7) * 2 + posY * lda; else ao8 = a + posY * 2 + (posX + 7) * lda; + + i = m; + + while (i > 0) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao2 + 0); + data04 = *(ao2 + 1); + data05 = *(ao3 + 0); + data06 = *(ao3 + 1); + data07 = *(ao4 + 0); + data08 = *(ao4 + 1); + 
data09 = *(ao5 + 0); + data10 = *(ao5 + 1); + data11 = *(ao6 + 0); + data12 = *(ao6 + 1); + data13 = *(ao7 + 0); + data14 = *(ao7 + 1); + data15 = *(ao8 + 0); + data16 = *(ao8 + 1); + + if (offset > 0) ao1 += lda; else ao1 += 2; + if (offset > -1) ao2 += lda; else ao2 += 2; + if (offset > -2) ao3 += lda; else ao3 += 2; + if (offset > -3) ao4 += lda; else ao4 += 2; + if (offset > -4) ao5 += lda; else ao5 += 2; + if (offset > -5) ao6 += lda; else ao6 += 2; + if (offset > -6) ao7 += lda; else ao7 += 2; + if (offset > -7) ao8 += lda; else ao8 += 2; + + if (offset > 0) { + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = data08; + b[ 8] = data09; + b[ 9] = data10; + b[10] = data11; + b[11] = data12; + b[12] = data13; + b[13] = data14; + b[14] = data15; + b[15] = data16; + } else + if (offset < -7) { + b[ 0] = data01; + b[ 1] = -data02; + b[ 2] = data03; + b[ 3] = -data04; + b[ 4] = data05; + b[ 5] = -data06; + b[ 6] = data07; + b[ 7] = -data08; + b[ 8] = data09; + b[ 9] = -data10; + b[10] = data11; + b[11] = -data12; + b[12] = data13; + b[13] = -data14; + b[14] = data15; + b[15] = -data16; + } else { + switch (offset) { + case 0 : + b[ 0] = data01; + b[ 1] = ZERO; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = data08; + b[ 8] = data09; + b[ 9] = data10; + b[10] = data11; + b[11] = data12; + b[12] = data13; + b[13] = data14; + b[14] = data15; + b[15] = data16; + break; + case -1 : + b[ 0] = data01; + b[ 1] = -data02; + b[ 2] = data03; + b[ 3] = ZERO; + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = data08; + b[ 8] = data09; + b[ 9] = data10; + b[10] = data11; + b[11] = data12; + b[12] = data13; + b[13] = data14; + b[14] = data15; + b[15] = data16; + break; + case -2 : + b[ 0] = data01; + b[ 1] = -data02; + b[ 2] = data03; + b[ 3] = -data04; + b[ 4] = data05; + b[ 5] = ZERO; + b[ 6] = data07; + b[ 7] = data08; + b[ 8] = 
data09; + b[ 9] = data10; + b[10] = data11; + b[11] = data12; + b[12] = data13; + b[13] = data14; + b[14] = data15; + b[15] = data16; + break; + case -3 : + b[ 0] = data01; + b[ 1] = -data02; + b[ 2] = data03; + b[ 3] = -data04; + b[ 4] = data05; + b[ 5] = -data06; + b[ 6] = data07; + b[ 7] = ZERO; + b[ 8] = data09; + b[ 9] = data10; + b[10] = data11; + b[11] = data12; + b[12] = data13; + b[13] = data14; + b[14] = data15; + b[15] = data16; + break; + case -4 : + b[ 0] = data01; + b[ 1] = -data02; + b[ 2] = data03; + b[ 3] = -data04; + b[ 4] = data05; + b[ 5] = -data06; + b[ 6] = data07; + b[ 7] = -data08; + b[ 8] = data09; + b[ 9] = ZERO; + b[10] = data11; + b[11] = data12; + b[12] = data13; + b[13] = data14; + b[14] = data15; + b[15] = data16; + break; + case -5 : + b[ 0] = data01; + b[ 1] = -data02; + b[ 2] = data03; + b[ 3] = -data04; + b[ 4] = data05; + b[ 5] = -data06; + b[ 6] = data07; + b[ 7] = -data08; + b[ 8] = data09; + b[ 9] = -data10; + b[10] = data11; + b[11] = ZERO; + b[12] = data13; + b[13] = data14; + b[14] = data15; + b[15] = data16; + break; + case -6 : + b[ 0] = data01; + b[ 1] = -data02; + b[ 2] = data03; + b[ 3] = -data04; + b[ 4] = data05; + b[ 5] = -data06; + b[ 6] = data07; + b[ 7] = -data08; + b[ 8] = data09; + b[ 9] = -data10; + b[10] = data11; + b[11] = -data12; + b[12] = data13; + b[13] = ZERO; + b[14] = data15; + b[15] = data16; + break; + case -7 : + b[ 0] = data01; + b[ 1] = -data02; + b[ 2] = data03; + b[ 3] = -data04; + b[ 4] = data05; + b[ 5] = -data06; + b[ 6] = data07; + b[ 7] = -data08; + b[ 8] = data09; + b[ 9] = -data10; + b[10] = data11; + b[11] = -data12; + b[12] = data13; + b[13] = -data14; + b[14] = data15; + b[15] = ZERO; + break; + } + } + + b += 16; + + offset --; + i --; + } + + posX += 8; + } + + if (n & 4) { + + offset = posX - posY; + + if (offset > 0) ao1 = a + (posX + 0) * 2 + posY * lda; else ao1 = a + posY * 2 + (posX + 0) * lda; + if (offset > -1) ao2 = a + (posX + 1) * 2 + posY * lda; else ao2 = a + posY * 2 + 
(posX + 1) * lda; + if (offset > -2) ao3 = a + (posX + 2) * 2 + posY * lda; else ao3 = a + posY * 2 + (posX + 2) * lda; + if (offset > -3) ao4 = a + (posX + 3) * 2 + posY * lda; else ao4 = a + posY * 2 + (posX + 3) * lda; + + i = m; + + while (i > 0) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao2 + 0); + data04 = *(ao2 + 1); + data05 = *(ao3 + 0); + data06 = *(ao3 + 1); + data07 = *(ao4 + 0); + data08 = *(ao4 + 1); + + if (offset > 0) ao1 += lda; else ao1 += 2; + if (offset > -1) ao2 += lda; else ao2 += 2; + if (offset > -2) ao3 += lda; else ao3 += 2; + if (offset > -3) ao4 += lda; else ao4 += 2; + + if (offset > 0) { + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = data08; + } else + if (offset < -3) { + b[ 0] = data01; + b[ 1] = -data02; + b[ 2] = data03; + b[ 3] = -data04; + b[ 4] = data05; + b[ 5] = -data06; + b[ 6] = data07; + b[ 7] = -data08; + } else { + switch (offset) { + case 0 : + b[ 0] = data01; + b[ 1] = ZERO; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = data08; + break; + case -1 : + b[ 0] = data01; + b[ 1] = -data02; + b[ 2] = data03; + b[ 3] = ZERO; + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = data08; + break; + case -2 : + b[ 0] = data01; + b[ 1] = -data02; + b[ 2] = data03; + b[ 3] = -data04; + b[ 4] = data05; + b[ 5] = ZERO; + b[ 6] = data07; + b[ 7] = data08; + break; + case -3 : + b[ 0] = data01; + b[ 1] = -data02; + b[ 2] = data03; + b[ 3] = -data04; + b[ 4] = data05; + b[ 5] = -data06; + b[ 6] = data07; + b[ 7] = ZERO; + break; + } + } + + b += 8; + + offset --; + i --; + } + + posX += 4; + } + + if (n & 2) { + + offset = posX - posY; + + if (offset > 0) ao1 = a + (posX + 0) * 2 + posY * lda; else ao1 = a + posY * 2 + (posX + 0) * lda; + if (offset > -1) ao2 = a + (posX + 1) * 2 + posY * lda; else ao2 = a + posY * 2 + (posX + 1) * lda; + + i = m; + + while (i > 0) { 
+ data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao2 + 0); + data04 = *(ao2 + 1); + + if (offset > 0) ao1 += lda; else ao1 += 2; + if (offset > -1) ao2 += lda; else ao2 += 2; + + if (offset > 0) { + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + } else + if (offset < -1) { + b[ 0] = data01; + b[ 1] = -data02; + b[ 2] = data03; + b[ 3] = -data04; + } else { + switch (offset) { + case 0 : + b[ 0] = data01; + b[ 1] = ZERO; + b[ 2] = data03; + b[ 3] = data04; + break; + case -1 : + b[ 0] = data01; + b[ 1] = -data02; + b[ 2] = data03; + b[ 3] = ZERO; + break; + } + } + + b += 4; + + offset --; + i --; + } + + posX += 2; + + } + + if (n & 1) { + + offset = posX - posY; + + if (offset > 0) ao1 = a + (posX + 0) * 2 + posY * lda; else ao1 = a + posY * 2 + (posX + 0) * lda; + + i = m; + + while (i > 0) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + + if (offset > 0) ao1 += lda; else ao1 += 2; + + if (offset > 0) { + b[ 0] = data01; + b[ 1] = data02; + } else + if (offset < 0) { + b[ 0] = data01; + b[ 1] = -data02; + } else { + b[ 0] = data01; + b[ 1] = ZERO; + } + + b += 2; + + offset --; + i --; + } + + } + + return 0; +} diff --git a/kernel/generic/zhemm_utcopy_16.c b/kernel/generic/zhemm_utcopy_16.c new file mode 100644 index 000000000..822483a83 --- /dev/null +++ b/kernel/generic/zhemm_utcopy_16.c @@ -0,0 +1,1168 @@ +/******************************************************************************* +Copyright (c) 2024, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*******************************************************************************/ + +#include /* NOTE(review): the header name is missing in this copy of the patch, presumably lost during extraction (possibly stdio.h); confirm against upstream */ +#include "common.h" +/* CNAME, from kernel/generic/zhemm_utcopy_16.c: for each group of 16 consecutive indices starting at posX (n >> 4 groups), then for one leftover group of 8, 4, 2 and 1 when that bit of n is set, writes m rows to b, each row holding one pair of FLOATs per index read from a. The second FLOAT of a pair is negated, replaced by ZERO, or copied unchanged depending on where the pair sits relative to offset = posX - posY. Always returns 0. Presumably the pack routine for a Hermitian matrix kept in its upper triangle, going by the file name; confirm against callers. */ +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, js, offset; + + FLOAT data01, data02, data03, data04, data05, data06, data07, data08; + FLOAT data09, data10, data11, data12, data13, data14, data15, data16; + FLOAT data17, data18, data19, data20, data21, data22, data23, data24; + FLOAT data25, data26, data27, data28, data29, data30, data31, data32; + + FLOAT *ao1, *ao2, *ao3, *ao4, *ao5, *ao6, *ao7, *ao8; + FLOAT *ao9, *ao10, *ao11, *ao12, *ao13, *ao14, *ao15, *ao16; + + lda *= 2; /* doubled so it can be added directly to FLOAT pointers; presumably lda arrives counted in pairs */ + + js = (n >> 4); /* number of full groups of 16 */ + while (js > 0){ + + offset = posX - posY; /* decremented once per emitted row below */ + + if (offset > 0) ao1 = a + posY * 2 + (posX + 0) * lda; else ao1 = a + (posX + 0) * 2 + posY * lda; /* each aoK takes one of two start addresses, chosen by comparing offset with -(K-1); the same comparison picks its stride inside the row loop */ + if (offset > -1) ao2 = a + posY * 2 + (posX + 1) * lda; else ao2 = a + (posX + 1) * 2 + posY * lda; + if (offset > -2) ao3 = a + posY * 2 + (posX + 2) * lda; else ao3 = a + (posX + 2) * 2 + posY * lda; + if (offset > -3) ao4 = a + posY * 2 + (posX + 3) * lda; else ao4 = a + (posX + 3) * 2 + posY * lda; + if (offset > -4) ao5 = a + posY * 2 + (posX + 4) * lda; else ao5 = a + (posX + 4) * 2 + posY * lda; + if (offset > -5) ao6 = a + posY * 2 + (posX + 5) * lda; else ao6 = a + (posX + 5) * 2 + posY * lda; + if (offset > -6) ao7 = a + posY * 2 + (posX + 6) * lda; else ao7 = a + (posX + 6) * 2 + posY * lda; + if (offset > -7) ao8 = a + posY * 2 + (posX + 7) * lda; else ao8 = a + (posX + 7) * 2 + posY * lda; + if (offset > -8) ao9 = a + posY * 2 + (posX + 8) * lda; else ao9 = a + (posX + 8) * 2 + posY * lda; + if (offset > -9) ao10 = a + posY * 2 + (posX + 9) * lda; else ao10 = a + (posX + 9) * 2 + posY * lda; + if (offset > -10) ao11 = a + posY * 2 + (posX + 10) * lda; else ao11 = a + (posX + 10) * 2 + posY * lda; + if (offset > -11) ao12 = a + posY * 2 + (posX + 11) * lda; else ao12 = a + (posX + 11) * 2 + posY * lda; + if (offset > -12) ao13 = a
+ posY * 2 + (posX + 12) * lda; else ao13 = a + (posX + 12) * 2 + posY * lda; + if (offset > -13) ao14 = a + posY * 2 + (posX + 13) * lda; else ao14 = a + (posX + 13) * 2 + posY * lda; + if (offset > -14) ao15 = a + posY * 2 + (posX + 14) * lda; else ao15 = a + (posX + 14) * 2 + posY * lda; + if (offset > -15) ao16 = a + posY * 2 + (posX + 15) * lda; else ao16 = a + (posX + 15) * 2 + posY * lda; + + i = m; + + while (i > 0) { /* each pass emits one row of 16 pairs (32 FLOATs) into b */ + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao2 + 0); + data04 = *(ao2 + 1); + data05 = *(ao3 + 0); + data06 = *(ao3 + 1); + data07 = *(ao4 + 0); + data08 = *(ao4 + 1); + data09 = *(ao5 + 0); + data10 = *(ao5 + 1); + data11 = *(ao6 + 0); + data12 = *(ao6 + 1); + data13 = *(ao7 + 0); + data14 = *(ao7 + 1); + data15 = *(ao8 + 0); + data16 = *(ao8 + 1); + data17 = *(ao9 + 0); + data18 = *(ao9 + 1); + data19 = *(ao10 + 0); + data20 = *(ao10 + 1); + data21 = *(ao11 + 0); + data22 = *(ao11 + 1); + data23 = *(ao12 + 0); + data24 = *(ao12 + 1); + data25 = *(ao13 + 0); + data26 = *(ao13 + 1); + data27 = *(ao14 + 0); + data28 = *(ao14 + 1); + data29 = *(ao15 + 0); + data30 = *(ao15 + 1); + data31 = *(ao16 + 0); + data32 = *(ao16 + 1); + + if (offset > 0) ao1 += 2; else ao1 += lda; /* pointer K steps by 2 while offset > -(K-1) and by lda once offset has dropped to -(K-1) or below */ + if (offset > -1) ao2 += 2; else ao2 += lda; + if (offset > -2) ao3 += 2; else ao3 += lda; + if (offset > -3) ao4 += 2; else ao4 += lda; + if (offset > -4) ao5 += 2; else ao5 += lda; + if (offset > -5) ao6 += 2; else ao6 += lda; + if (offset > -6) ao7 += 2; else ao7 += lda; + if (offset > -7) ao8 += 2; else ao8 += lda; + if (offset > -8) ao9 += 2; else ao9 += lda; + if (offset > -9) ao10 += 2; else ao10 += lda; + if (offset > -10) ao11 += 2; else ao11 += lda; + if (offset > -11) ao12 += 2; else ao12 += lda; + if (offset > -12) ao13 += 2; else ao13 += lda; + if (offset > -13) ao14 += 2; else ao14 += lda; + if (offset > -14) ao15 += 2; else ao15 += lda; + if (offset > -15) ao16 += 2; else ao16 += lda; + + if (offset > 0) { /* the second FLOAT of every pair is negated */ + b[ 0] = data01; + b[ 1] = -data02; +
b[ 2] = data03; + b[ 3] = -data04; + b[ 4] = data05; + b[ 5] = -data06; + b[ 6] = data07; + b[ 7] = -data08; + b[ 8] = data09; + b[ 9] = -data10; + b[10] = data11; + b[11] = -data12; + b[12] = data13; + b[13] = -data14; + b[14] = data15; + b[15] = -data16; + b[16] = data17; + b[17] = -data18; + b[18] = data19; + b[19] = -data20; + b[20] = data21; + b[21] = -data22; + b[22] = data23; + b[23] = -data24; + b[24] = data25; + b[25] = -data26; + b[26] = data27; + b[27] = -data28; + b[28] = data29; + b[29] = -data30; + b[30] = data31; + b[31] = -data32; + } else + if (offset < -15) { /* every pair is copied unchanged */ + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = data08; + b[ 8] = data09; + b[ 9] = data10; + b[10] = data11; + b[11] = data12; + b[12] = data13; + b[13] = data14; + b[14] = data15; + b[15] = data16; + b[16] = data17; + b[17] = data18; + b[18] = data19; + b[19] = data20; + b[20] = data21; + b[21] = data22; + b[22] = data23; + b[23] = data24; + b[24] = data25; + b[25] = data26; + b[26] = data27; + b[27] = data28; + b[28] = data29; + b[29] = data30; + b[30] = data31; + b[31] = data32; + } else { + switch (offset) { /* case -d: pair d gets ZERO as its second FLOAT, pairs before d are copied unchanged, pairs after d have the second FLOAT negated */ + case 0 : + b[ 0] = data01; + b[ 1] = ZERO; + b[ 2] = data03; + b[ 3] = -data04; + b[ 4] = data05; + b[ 5] = -data06; + b[ 6] = data07; + b[ 7] = -data08; + b[ 8] = data09; + b[ 9] = -data10; + b[10] = data11; + b[11] = -data12; + b[12] = data13; + b[13] = -data14; + b[14] = data15; + b[15] = -data16; + b[16] = data17; + b[17] = -data18; + b[18] = data19; + b[19] = -data20; + b[20] = data21; + b[21] = -data22; + b[22] = data23; + b[23] = -data24; + b[24] = data25; + b[25] = -data26; + b[26] = data27; + b[27] = -data28; + b[28] = data29; + b[29] = -data30; + b[30] = data31; + b[31] = -data32; + break; + case -1 : + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = ZERO; + b[ 4] = data05; + b[ 5] = -data06; + b[ 6] = data07; + b[ 7] = -data08; + b[ 8] = data09; + b[ 9] = -data10; +
b[10] = data11; + b[11] = -data12; + b[12] = data13; + b[13] = -data14; + b[14] = data15; + b[15] = -data16; + b[16] = data17; + b[17] = -data18; + b[18] = data19; + b[19] = -data20; + b[20] = data21; + b[21] = -data22; + b[22] = data23; + b[23] = -data24; + b[24] = data25; + b[25] = -data26; + b[26] = data27; + b[27] = -data28; + b[28] = data29; + b[29] = -data30; + b[30] = data31; + b[31] = -data32; + break; + case -2 : + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data05; + b[ 5] = ZERO; + b[ 6] = data07; + b[ 7] = -data08; + b[ 8] = data09; + b[ 9] = -data10; + b[10] = data11; + b[11] = -data12; + b[12] = data13; + b[13] = -data14; + b[14] = data15; + b[15] = -data16; + b[16] = data17; + b[17] = -data18; + b[18] = data19; + b[19] = -data20; + b[20] = data21; + b[21] = -data22; + b[22] = data23; + b[23] = -data24; + b[24] = data25; + b[25] = -data26; + b[26] = data27; + b[27] = -data28; + b[28] = data29; + b[29] = -data30; + b[30] = data31; + b[31] = -data32; + break; + case -3 : + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = ZERO; + b[ 8] = data09; + b[ 9] = -data10; + b[10] = data11; + b[11] = -data12; + b[12] = data13; + b[13] = -data14; + b[14] = data15; + b[15] = -data16; + b[16] = data17; + b[17] = -data18; + b[18] = data19; + b[19] = -data20; + b[20] = data21; + b[21] = -data22; + b[22] = data23; + b[23] = -data24; + b[24] = data25; + b[25] = -data26; + b[26] = data27; + b[27] = -data28; + b[28] = data29; + b[29] = -data30; + b[30] = data31; + b[31] = -data32; + break; + case -4 : + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = data08; + b[ 8] = data09; + b[ 9] = ZERO; + b[10] = data11; + b[11] = -data12; + b[12] = data13; + b[13] = -data14; + b[14] = data15; + b[15] = -data16; + b[16] = data17; + b[17] = -data18; + b[18] = data19; + b[19] =
-data20; + b[20] = data21; + b[21] = -data22; + b[22] = data23; + b[23] = -data24; + b[24] = data25; + b[25] = -data26; + b[26] = data27; + b[27] = -data28; + b[28] = data29; + b[29] = -data30; + b[30] = data31; + b[31] = -data32; + break; + case -5 : + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = data08; + b[ 8] = data09; + b[ 9] = data10; + b[10] = data11; + b[11] = ZERO; + b[12] = data13; + b[13] = -data14; + b[14] = data15; + b[15] = -data16; + b[16] = data17; + b[17] = -data18; + b[18] = data19; + b[19] = -data20; + b[20] = data21; + b[21] = -data22; + b[22] = data23; + b[23] = -data24; + b[24] = data25; + b[25] = -data26; + b[26] = data27; + b[27] = -data28; + b[28] = data29; + b[29] = -data30; + b[30] = data31; + b[31] = -data32; + break; + case -6 : + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = data08; + b[ 8] = data09; + b[ 9] = data10; + b[10] = data11; + b[11] = data12; + b[12] = data13; + b[13] = ZERO; + b[14] = data15; + b[15] = -data16; + b[16] = data17; + b[17] = -data18; + b[18] = data19; + b[19] = -data20; + b[20] = data21; + b[21] = -data22; + b[22] = data23; + b[23] = -data24; + b[24] = data25; + b[25] = -data26; + b[26] = data27; + b[27] = -data28; + b[28] = data29; + b[29] = -data30; + b[30] = data31; + b[31] = -data32; + break; + case -7 : + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = data08; + b[ 8] = data09; + b[ 9] = data10; + b[10] = data11; + b[11] = data12; + b[12] = data13; + b[13] = data14; + b[14] = data15; + b[15] = ZERO; + b[16] = data17; + b[17] = -data18; + b[18] = data19; + b[19] = -data20; + b[20] = data21; + b[21] = -data22; + b[22] = data23; + b[23] = -data24; + b[24] = data25; + b[25] = -data26; + b[26] = data27; + b[27] = -data28; + b[28] = data29; + b[29] =
-data30; + b[30] = data31; + b[31] = -data32; + break; + case -8 : + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = data08; + b[ 8] = data09; + b[ 9] = data10; + b[10] = data11; + b[11] = data12; + b[12] = data13; + b[13] = data14; + b[14] = data15; + b[15] = data16; + b[16] = data17; + b[17] = ZERO; + b[18] = data19; + b[19] = -data20; + b[20] = data21; + b[21] = -data22; + b[22] = data23; + b[23] = -data24; + b[24] = data25; + b[25] = -data26; + b[26] = data27; + b[27] = -data28; + b[28] = data29; + b[29] = -data30; + b[30] = data31; + b[31] = -data32; + break; + case -9 : + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = data08; + b[ 8] = data09; + b[ 9] = data10; + b[10] = data11; + b[11] = data12; + b[12] = data13; + b[13] = data14; + b[14] = data15; + b[15] = data16; + b[16] = data17; + b[17] = data18; + b[18] = data19; + b[19] = ZERO; + b[20] = data21; + b[21] = -data22; + b[22] = data23; + b[23] = -data24; + b[24] = data25; + b[25] = -data26; + b[26] = data27; + b[27] = -data28; + b[28] = data29; + b[29] = -data30; + b[30] = data31; + b[31] = -data32; + break; + case -10 : + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = data08; + b[ 8] = data09; + b[ 9] = data10; + b[10] = data11; + b[11] = data12; + b[12] = data13; + b[13] = data14; + b[14] = data15; + b[15] = data16; + b[16] = data17; + b[17] = data18; + b[18] = data19; + b[19] = data20; + b[20] = data21; + b[21] = ZERO; + b[22] = data23; + b[23] = -data24; + b[24] = data25; + b[25] = -data26; + b[26] = data27; + b[27] = -data28; + b[28] = data29; + b[29] = -data30; + b[30] = data31; + b[31] = -data32; + break; + case -11 : + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = data07; +
b[ 7] = data08; + b[ 8] = data09; + b[ 9] = data10; + b[10] = data11; + b[11] = data12; + b[12] = data13; + b[13] = data14; + b[14] = data15; + b[15] = data16; + b[16] = data17; + b[17] = data18; + b[18] = data19; + b[19] = data20; + b[20] = data21; + b[21] = data22; + b[22] = data23; + b[23] = ZERO; + b[24] = data25; + b[25] = -data26; + b[26] = data27; + b[27] = -data28; + b[28] = data29; + b[29] = -data30; + b[30] = data31; + b[31] = -data32; + break; + case -12 : + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = data08; + b[ 8] = data09; + b[ 9] = data10; + b[10] = data11; + b[11] = data12; + b[12] = data13; + b[13] = data14; + b[14] = data15; + b[15] = data16; + b[16] = data17; + b[17] = data18; + b[18] = data19; + b[19] = data20; + b[20] = data21; + b[21] = data22; + b[22] = data23; + b[23] = data24; + b[24] = data25; + b[25] = ZERO; + b[26] = data27; + b[27] = -data28; + b[28] = data29; + b[29] = -data30; + b[30] = data31; + b[31] = -data32; + break; + case -13 : + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = data08; + b[ 8] = data09; + b[ 9] = data10; + b[10] = data11; + b[11] = data12; + b[12] = data13; + b[13] = data14; + b[14] = data15; + b[15] = data16; + b[16] = data17; + b[17] = data18; + b[18] = data19; + b[19] = data20; + b[20] = data21; + b[21] = data22; + b[22] = data23; + b[23] = data24; + b[24] = data25; + b[25] = data26; + b[26] = data27; + b[27] = ZERO; + b[28] = data29; + b[29] = -data30; + b[30] = data31; + b[31] = -data32; + break; + case -14 : + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = data08; + b[ 8] = data09; + b[ 9] = data10; + b[10] = data11; + b[11] = data12; + b[12] = data13; + b[13] = data14; + b[14] = data15; + b[15] = data16; + b[16] = data17; + b[17] = data18; +
b[18] = data19; + b[19] = data20; + b[20] = data21; + b[21] = data22; + b[22] = data23; + b[23] = data24; + b[24] = data25; + b[25] = data26; + b[26] = data27; + b[27] = data28; + b[28] = data29; + b[29] = ZERO; + b[30] = data31; + b[31] = -data32; + break; + case -15 : + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = data08; + b[ 8] = data09; + b[ 9] = data10; + b[10] = data11; + b[11] = data12; + b[12] = data13; + b[13] = data14; + b[14] = data15; + b[15] = data16; + b[16] = data17; + b[17] = data18; + b[18] = data19; + b[19] = data20; + b[20] = data21; + b[21] = data22; + b[22] = data23; + b[23] = data24; + b[24] = data25; + b[25] = data26; + b[26] = data27; + b[27] = data28; + b[28] = data29; + b[29] = data30; + b[30] = data31; + b[31] = ZERO; + break; + } + } + + b += 32; /* one row of 16 pairs has been written */ + + offset --; + i --; + } + + posX += 16; /* move on to the next group of 16 indices */ + js --; + } + + if (n & 8) { /* leftover group of 8: same scheme with 8 source pointers */ + offset = posX - posY; + + if (offset > 0) ao1 = a + posY * 2 + (posX + 0) * lda; else ao1 = a + (posX + 0) * 2 + posY * lda; + if (offset > -1) ao2 = a + posY * 2 + (posX + 1) * lda; else ao2 = a + (posX + 1) * 2 + posY * lda; + if (offset > -2) ao3 = a + posY * 2 + (posX + 2) * lda; else ao3 = a + (posX + 2) * 2 + posY * lda; + if (offset > -3) ao4 = a + posY * 2 + (posX + 3) * lda; else ao4 = a + (posX + 3) * 2 + posY * lda; + if (offset > -4) ao5 = a + posY * 2 + (posX + 4) * lda; else ao5 = a + (posX + 4) * 2 + posY * lda; + if (offset > -5) ao6 = a + posY * 2 + (posX + 5) * lda; else ao6 = a + (posX + 5) * 2 + posY * lda; + if (offset > -6) ao7 = a + posY * 2 + (posX + 6) * lda; else ao7 = a + (posX + 6) * 2 + posY * lda; + if (offset > -7) ao8 = a + posY * 2 + (posX + 7) * lda; else ao8 = a + (posX + 7) * 2 + posY * lda; + + i = m; + + while (i > 0) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao2 + 0); + data04 = *(ao2 + 1); + data05 = *(ao3 + 0); + data06 = *(ao3 + 1); + data07 = *(ao4 + 0); + data08 = *(ao4 +
1); + data09 = *(ao5 + 0); + data10 = *(ao5 + 1); + data11 = *(ao6 + 0); + data12 = *(ao6 + 1); + data13 = *(ao7 + 0); + data14 = *(ao7 + 1); + data15 = *(ao8 + 0); + data16 = *(ao8 + 1); + + if (offset > 0) ao1 += 2; else ao1 += lda; + if (offset > -1) ao2 += 2; else ao2 += lda; + if (offset > -2) ao3 += 2; else ao3 += lda; + if (offset > -3) ao4 += 2; else ao4 += lda; + if (offset > -4) ao5 += 2; else ao5 += lda; + if (offset > -5) ao6 += 2; else ao6 += lda; + if (offset > -6) ao7 += 2; else ao7 += lda; + if (offset > -7) ao8 += 2; else ao8 += lda; + + if (offset > 0) { + b[ 0] = data01; + b[ 1] = -data02; + b[ 2] = data03; + b[ 3] = -data04; + b[ 4] = data05; + b[ 5] = -data06; + b[ 6] = data07; + b[ 7] = -data08; + b[ 8] = data09; + b[ 9] = -data10; + b[10] = data11; + b[11] = -data12; + b[12] = data13; + b[13] = -data14; + b[14] = data15; + b[15] = -data16; + } else + if (offset < -7) { + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = data08; + b[ 8] = data09; + b[ 9] = data10; + b[10] = data11; + b[11] = data12; + b[12] = data13; + b[13] = data14; + b[14] = data15; + b[15] = data16; + } else { + switch (offset) { + case 0 : + b[ 0] = data01; + b[ 1] = ZERO; + b[ 2] = data03; + b[ 3] = -data04; + b[ 4] = data05; + b[ 5] = -data06; + b[ 6] = data07; + b[ 7] = -data08; + b[ 8] = data09; + b[ 9] = -data10; + b[10] = data11; + b[11] = -data12; + b[12] = data13; + b[13] = -data14; + b[14] = data15; + b[15] = -data16; + break; + case -1 : + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = ZERO; + b[ 4] = data05; + b[ 5] = -data06; + b[ 6] = data07; + b[ 7] = -data08; + b[ 8] = data09; + b[ 9] = -data10; + b[10] = data11; + b[11] = -data12; + b[12] = data13; + b[13] = -data14; + b[14] = data15; + b[15] = -data16; + break; + case -2 : + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data05; + b[ 5] = ZERO; + b[ 6] = data07; + b[ 7] =
-data08; + b[ 8] = data09; + b[ 9] = -data10; + b[10] = data11; + b[11] = -data12; + b[12] = data13; + b[13] = -data14; + b[14] = data15; + b[15] = -data16; + break; + case -3 : + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = ZERO; + b[ 8] = data09; + b[ 9] = -data10; + b[10] = data11; + b[11] = -data12; + b[12] = data13; + b[13] = -data14; + b[14] = data15; + b[15] = -data16; + break; + case -4 : + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = data08; + b[ 8] = data09; + b[ 9] = ZERO; + b[10] = data11; + b[11] = -data12; + b[12] = data13; + b[13] = -data14; + b[14] = data15; + b[15] = -data16; + break; + case -5 : + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = data08; + b[ 8] = data09; + b[ 9] = data10; + b[10] = data11; + b[11] = ZERO; + b[12] = data13; + b[13] = -data14; + b[14] = data15; + b[15] = -data16; + break; + case -6 : + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = data08; + b[ 8] = data09; + b[ 9] = data10; + b[10] = data11; + b[11] = data12; + b[12] = data13; + b[13] = ZERO; + b[14] = data15; + b[15] = -data16; + break; + case -7 : + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = data08; + b[ 8] = data09; + b[ 9] = data10; + b[10] = data11; + b[11] = data12; + b[12] = data13; + b[13] = data14; + b[14] = data15; + b[15] = ZERO; + break; + } + } + + b += 16; + + offset --; + i --; + } + + posX += 8; + } + + if (n & 4) { /* leftover group of 4 */ + offset = posX - posY; + + if (offset > 0) ao1 = a + posY * 2 + (posX + 0) * lda; else ao1 = a + (posX + 0) * 2 + posY * lda; + if (offset > -1) ao2 = a + posY * 2 + (posX + 1) * lda; else ao2 = a +
(posX + 1) * 2 + posY * lda; + if (offset > -2) ao3 = a + posY * 2 + (posX + 2) * lda; else ao3 = a + (posX + 2) * 2 + posY * lda; + if (offset > -3) ao4 = a + posY * 2 + (posX + 3) * lda; else ao4 = a + (posX + 3) * 2 + posY * lda; + + i = m; + + while (i > 0) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao2 + 0); + data04 = *(ao2 + 1); + data05 = *(ao3 + 0); + data06 = *(ao3 + 1); + data07 = *(ao4 + 0); + data08 = *(ao4 + 1); + + if (offset > 0) ao1 += 2; else ao1 += lda; + if (offset > -1) ao2 += 2; else ao2 += lda; + if (offset > -2) ao3 += 2; else ao3 += lda; + if (offset > -3) ao4 += 2; else ao4 += lda; + + if (offset > 0) { + b[ 0] = data01; + b[ 1] = -data02; + b[ 2] = data03; + b[ 3] = -data04; + b[ 4] = data05; + b[ 5] = -data06; + b[ 6] = data07; + b[ 7] = -data08; + } else + if (offset < -3) { + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = data08; + } else { + switch (offset) { + case 0 : + b[ 0] = data01; + b[ 1] = ZERO; + b[ 2] = data03; + b[ 3] = -data04; + b[ 4] = data05; + b[ 5] = -data06; + b[ 6] = data07; + b[ 7] = -data08; + break; + case -1 : + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = ZERO; + b[ 4] = data05; + b[ 5] = -data06; + b[ 6] = data07; + b[ 7] = -data08; + break; + case -2 : + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data05; + b[ 5] = ZERO; + b[ 6] = data07; + b[ 7] = -data08; + break; + case -3 : + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = ZERO; + break; + } + } + + b += 8; + + offset --; + i --; + } + + posX += 4; + } + + if (n & 2) { /* leftover group of 2 */ + + offset = posX - posY; + + if (offset > 0) ao1 = a + posY * 2 + (posX + 0) * lda; else ao1 = a + (posX + 0) * 2 + posY * lda; + if (offset > -1) ao2 = a + posY * 2 + (posX + 1) * lda; else ao2 = a + (posX + 1) * 2 + posY * lda; + + i = m; + + while
(i > 0) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao2 + 0); + data04 = *(ao2 + 1); + + if (offset > 0) ao1 += 2; else ao1 += lda; + if (offset > -1) ao2 += 2; else ao2 += lda; + + if (offset > 0) { + b[ 0] = data01; + b[ 1] = -data02; + b[ 2] = data03; + b[ 3] = -data04; + } else + if (offset < -1) { + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + } else { + switch (offset) { + case 0 : + b[ 0] = data01; + b[ 1] = ZERO; + b[ 2] = data03; + b[ 3] = -data04; + break; + case -1 : + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = ZERO; + break; + } + } + + b += 4; + + offset --; + i --; + } + + posX += 2; + } + + if (n & 1) { /* last single index: negate when offset > 0, copy when offset < 0, ZERO when offset is 0 */ + + offset = posX - posY; + + if (offset > 0) ao1 = a + posY * 2 + (posX + 0) * lda; else ao1 = a + (posX + 0) * 2 + posY * lda; + + i = m; + + while (i > 0) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + + if (offset > 0) ao1 += 2; else ao1 += lda; + + if (offset > 0) { + b[ 0] = data01; + b[ 1] = -data02; + } else + if (offset < 0) { + b[ 0] = data01; + b[ 1] = data02; + } else { + b[ 0] = data01; + b[ 1] = ZERO; + } + + b += 2; + + offset --; + i --; + } + + } + + return 0; +} diff --git a/kernel/generic/zneg_tcopy_16.c b/kernel/generic/zneg_tcopy_16.c new file mode 100644 index 000000000..50f5a3d37 --- /dev/null +++ b/kernel/generic/zneg_tcopy_16.c @@ -0,0 +1,587 @@ +/******************************************************************************* +Copyright (c) 2024, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2.
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*******************************************************************************/
+
+#include <stdio.h>
+#include "common.h"
+
+/*
+ * Negate one panel and append it to the packed buffer.
+ *
+ *   rows   - number of source rows to copy
+ *   width  - FLOATs taken from each row
+ *   src    - first FLOAT of the panel in the source matrix
+ *   stride - distance between consecutive source rows, in FLOATs
+ *   dst    - where the packed panel is written
+ *
+ * Rows are stored back to back, so `rows * width` FLOATs are written.
+ * Returns the first position in `dst` that was not written.
+ */
+static FLOAT *neg_copy_panel(BLASLONG rows, BLASLONG width, const FLOAT *src, BLASLONG stride, FLOAT *dst){
+
+  BLASLONG r, k;
+
+  for (r = 0; r < rows; r++) {
+    /* Form the row address only for rows that exist. */
+    const FLOAT *row = src + r * stride;
+
+    for (k = 0; k < width; k++) dst[k] = -row[k];
+
+    dst += width;
+  }
+
+  return dst;
+}
+
+/*
+ * Pack the negated contents of `a` into `b`, panel by panel.
+ *
+ *   m   - number of rows to copy
+ *   n   - number of two-FLOAT elements per row (presumably complex
+ *         (re, im) pairs, going by the z-prefixed file name)
+ *   a   - source matrix; consecutive rows are `lda` elements apart
+ *   lda - row stride in elements (doubled below to a stride in FLOATs)
+ *   b   - destination buffer, 2 * m * n FLOATs are written
+ *
+ * The n elements of a row are split into panels of 16, then at most one
+ * panel each of 8, 4, 2 and 1 (selected by the low bits of n).  Every panel
+ * is written as m consecutive rows of 2 * width negated FLOATs.
+ * Always returns 0.
+ */
+int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
+
+  BLASLONG j, width;
+
+  lda *= 2;  /* element stride -> FLOAT stride */
+
+  /* Full panels of 16 elements, i.e. 32 FLOATs per row. */
+  for (j = (n >> 4); j > 0; j--) {
+    b  = neg_copy_panel(m, 32, a, lda, b);
+    a += 32;
+  }
+
+  /* Leftover elements: one panel per set bit of n, widest first. */
+  for (width = 8; width > 0; width >>= 1) {
+    if (n & width) {
+      b  = neg_copy_panel(m, 2 * width, a, lda, b);
+      a += 2 * width;
+    }
+  }
+
+  return 0;
+}
diff --git a/kernel/generic/zsymm_lcopy_16.c b/kernel/generic/zsymm_lcopy_16.c
new file mode 100644
index 000000000..b32374a5e
--- /dev/null
+++ b/kernel/generic/zsymm_lcopy_16.c
@@ -0,0 +1,333 @@
+/*******************************************************************************
+Copyright (c) 2024, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*******************************************************************************/
+
+#include <stdio.h>
+#include "common.h"
+
+/*
+ * Pack one panel of `width` (at most 16) output columns, m entries each.
+ *
+ * For output column k and step i the running value is
+ * offset = (posX + k) - (posY + i).  While it is positive the element is read
+ * at a + (posX + k) * 2 + (posY + i) * lda and the pointer then moves by lda;
+ * otherwise it is read at a + (posY + i) * 2 + (posX + k) * lda and the
+ * pointer moves by 2.  Either way the element read is the one whose
+ * stride-2 index is not smaller than its stride-lda index (presumably the
+ * stored lower triangle of a symmetric matrix - confirm against the caller).
+ *
+ * `lda` is already expressed in FLOATs.  Each step writes 2 * width FLOATs.
+ * Returns the first position in `b` that was not written.
+ */
+static FLOAT *symm_lcopy_panel(BLASLONG m, BLASLONG width, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){
+
+  FLOAT *ao[16];
+  BLASLONG i, k;
+  BLASLONG offset = posX - posY;
+
+  /* Starting address of every column, chosen by the side of the diagonal. */
+  for (k = 0; k < width; k++) {
+    if (offset > -k) ao[k] = a + (posX + k) * 2 + posY * lda;
+    else             ao[k] = a + posY * 2 + (posX + k) * lda;
+  }
+
+  for (i = m; i > 0; i--) {
+    for (k = 0; k < width; k++) {
+      b[2 * k + 0] = ao[k][0];
+      b[2 * k + 1] = ao[k][1];
+
+      /* Advance with the offset of the current step, before it is decremented. */
+      if (offset > -k) ao[k] += lda; else ao[k] += 2;
+    }
+
+    b += 2 * width;
+    offset --;
+  }
+
+  return b;
+}
+
+/*
+ * Pack an m x n block into `b`, taking every element from one triangle of `a`
+ * as described for symm_lcopy_panel().
+ *
+ *   m, n       - entries per output column / number of output columns
+ *   a, lda     - source matrix and its stride in two-FLOAT elements
+ *                (doubled below to a stride in FLOATs)
+ *   posX, posY - position of the block inside `a`; posX selects the first
+ *                output column, posY the first entry of each column
+ *   b          - destination buffer, 2 * m * n FLOATs are written
+ *
+ * Columns are handled in panels of 16, then at most one panel each of 8, 4,
+ * 2 and 1 (selected by the low bits of n).  Always returns 0.
+ */
+int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){
+
+  BLASLONG js, width;
+
+  lda *= 2;  /* element stride -> FLOAT stride */
+
+  /* Full panels of 16 columns. */
+  for (js = (n >> 4); js > 0; js--) {
+    b = symm_lcopy_panel(m, 16, a, lda, posX, posY, b);
+    posX += 16;
+  }
+
+  /* Leftover columns: one panel per set bit of n, widest first. */
+  for (width = 8; width > 0; width >>= 1) {
+    if (n & width) {
+      b = symm_lcopy_panel(m, width, a, lda, posX, posY, b);
+      posX += width;
+    }
+  }
+
+  return 0;
+}
diff --git a/kernel/generic/zsymm_ucopy_16.c b/kernel/generic/zsymm_ucopy_16.c
new file mode 100644
index 000000000..cb19bea47
--- /dev/null
+++ b/kernel/generic/zsymm_ucopy_16.c
@@ -0,0 +1,332 @@
+/*******************************************************************************
+Copyright (c) 2024, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED.
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, js, offset; + + FLOAT data01, data02, data03, data04, data05, data06, data07, data08; + FLOAT data09, data10, data11, data12, data13, data14, data15, data16; + FLOAT data17, data18, data19, data20, data21, data22, data23, data24; + FLOAT data25, data26, data27, data28, data29, data30, data31, data32; + + FLOAT *ao1, *ao2, *ao3, *ao4, *ao5, *ao6, *ao7, *ao8; + FLOAT *ao9, *ao10, *ao11, *ao12, *ao13, *ao14, *ao15, *ao16; + + lda *= 2; + + js = (n >> 4); + while (js > 0){ + + offset = posX - posY; + + if (offset > 0) ao1 = a + posY * 2 + (posX + 0) * lda; else ao1 = a + (posX + 0) * 2 + posY * lda; + if (offset > -1) ao2 = a + posY * 2 + (posX + 1) * lda; else ao2 = a + (posX + 1) * 2 + posY * lda; + if (offset > -2) ao3 = a + posY * 2 + (posX + 2) * lda; else ao3 = a + (posX + 2) * 2 + posY * lda; + if (offset > -3) ao4 = a + posY * 2 + (posX + 3) * lda; else ao4 = a + (posX + 3) * 2 + posY * lda; + if (offset > -4) ao5 = a + posY * 2 + (posX + 4) * lda; else ao5 = a + (posX + 4) * 2 + posY * lda; + if (offset > -5) ao6 = a + posY * 2 + (posX + 5) * lda; else ao6 = a + (posX + 5) * 2 + posY * lda; + if (offset > -6) ao7 = a + posY * 2 + (posX + 6) * lda; else ao7 = a + (posX + 6) * 2 + posY * lda; + if (offset > -7) ao8 = a + 
posY * 2 + (posX + 7) * lda; else ao8 = a + (posX + 7) * 2 + posY * lda; + if (offset > -8) ao9 = a + posY * 2 + (posX + 8) * lda; else ao9 = a + (posX + 8) * 2 + posY * lda; + if (offset > -9) ao10 = a + posY * 2 + (posX + 9) * lda; else ao10 = a + (posX + 9) * 2 + posY * lda; + if (offset > -10) ao11 = a + posY * 2 + (posX + 10) * lda; else ao11 = a + (posX + 10) * 2 + posY * lda; + if (offset > -11) ao12 = a + posY * 2 + (posX + 11) * lda; else ao12 = a + (posX + 11) * 2 + posY * lda; + if (offset > -12) ao13 = a + posY * 2 + (posX + 12) * lda; else ao13 = a + (posX + 12) * 2 + posY * lda; + if (offset > -13) ao14 = a + posY * 2 + (posX + 13) * lda; else ao14 = a + (posX + 13) * 2 + posY * lda; + if (offset > -14) ao15 = a + posY * 2 + (posX + 14) * lda; else ao15 = a + (posX + 14) * 2 + posY * lda; + if (offset > -15) ao16 = a + posY * 2 + (posX + 15) * lda; else ao16 = a + (posX + 15) * 2 + posY * lda; + + i = m; + + while (i > 0) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao2 + 0); + data04 = *(ao2 + 1); + data05 = *(ao3 + 0); + data06 = *(ao3 + 1); + data07 = *(ao4 + 0); + data08 = *(ao4 + 1); + data09 = *(ao5 + 0); + data10 = *(ao5 + 1); + data11 = *(ao6 + 0); + data12 = *(ao6 + 1); + data13 = *(ao7 + 0); + data14 = *(ao7 + 1); + data15 = *(ao8 + 0); + data16 = *(ao8 + 1); + data17 = *(ao9 + 0); + data18 = *(ao9 + 1); + data19 = *(ao10 + 0); + data20 = *(ao10 + 1); + data21 = *(ao11 + 0); + data22 = *(ao11 + 1); + data23 = *(ao12 + 0); + data24 = *(ao12 + 1); + data25 = *(ao13 + 0); + data26 = *(ao13 + 1); + data27 = *(ao14 + 0); + data28 = *(ao14 + 1); + data29 = *(ao15 + 0); + data30 = *(ao15 + 1); + data31 = *(ao16 + 0); + data32 = *(ao16 + 1); + + if (offset > 0) ao1 += 2; else ao1 += lda; + if (offset > -1) ao2 += 2; else ao2 += lda; + if (offset > -2) ao3 += 2; else ao3 += lda; + if (offset > -3) ao4 += 2; else ao4 += lda; + if (offset > -4) ao5 += 2; else ao5 += lda; + if (offset > -5) ao6 += 2; else ao6 += lda; + if (offset > -6) 
ao7 += 2; else ao7 += lda; + if (offset > -7) ao8 += 2; else ao8 += lda; + if (offset > -8) ao9 += 2; else ao9 += lda; + if (offset > -9) ao10 += 2; else ao10 += lda; + if (offset > -10) ao11 += 2; else ao11 += lda; + if (offset > -11) ao12 += 2; else ao12 += lda; + if (offset > -12) ao13 += 2; else ao13 += lda; + if (offset > -13) ao14 += 2; else ao14 += lda; + if (offset > -14) ao15 += 2; else ao15 += lda; + if (offset > -15) ao16 += 2; else ao16 += lda; + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = data08; + b[ 8] = data09; + b[ 9] = data10; + b[10] = data11; + b[11] = data12; + b[12] = data13; + b[13] = data14; + b[14] = data15; + b[15] = data16; + b[16] = data17; + b[17] = data18; + b[18] = data19; + b[19] = data20; + b[20] = data21; + b[21] = data22; + b[22] = data23; + b[23] = data24; + b[24] = data25; + b[25] = data26; + b[26] = data27; + b[27] = data28; + b[28] = data29; + b[29] = data30; + b[30] = data31; + b[31] = data32; + + b += 32; + + offset --; + i --; + } + + posX += 16; + js --; + } + + if (n & 8) { + offset = posX - posY; + + if (offset > 0) ao1 = a + posY * 2 + (posX + 0) * lda; else ao1 = a + (posX + 0) * 2 + posY * lda; + if (offset > -1) ao2 = a + posY * 2 + (posX + 1) * lda; else ao2 = a + (posX + 1) * 2 + posY * lda; + if (offset > -2) ao3 = a + posY * 2 + (posX + 2) * lda; else ao3 = a + (posX + 2) * 2 + posY * lda; + if (offset > -3) ao4 = a + posY * 2 + (posX + 3) * lda; else ao4 = a + (posX + 3) * 2 + posY * lda; + if (offset > -4) ao5 = a + posY * 2 + (posX + 4) * lda; else ao5 = a + (posX + 4) * 2 + posY * lda; + if (offset > -5) ao6 = a + posY * 2 + (posX + 5) * lda; else ao6 = a + (posX + 5) * 2 + posY * lda; + if (offset > -6) ao7 = a + posY * 2 + (posX + 6) * lda; else ao7 = a + (posX + 6) * 2 + posY * lda; + if (offset > -7) ao8 = a + posY * 2 + (posX + 7) * lda; else ao8 = a + (posX + 7) * 2 + posY * lda; + + i = m; + + while (i > 0) { + 
data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao2 + 0); + data04 = *(ao2 + 1); + data05 = *(ao3 + 0); + data06 = *(ao3 + 1); + data07 = *(ao4 + 0); + data08 = *(ao4 + 1); + data09 = *(ao5 + 0); + data10 = *(ao5 + 1); + data11 = *(ao6 + 0); + data12 = *(ao6 + 1); + data13 = *(ao7 + 0); + data14 = *(ao7 + 1); + data15 = *(ao8 + 0); + data16 = *(ao8 + 1); + + if (offset > 0) ao1 += 2; else ao1 += lda; + if (offset > -1) ao2 += 2; else ao2 += lda; + if (offset > -2) ao3 += 2; else ao3 += lda; + if (offset > -3) ao4 += 2; else ao4 += lda; + if (offset > -4) ao5 += 2; else ao5 += lda; + if (offset > -5) ao6 += 2; else ao6 += lda; + if (offset > -6) ao7 += 2; else ao7 += lda; + if (offset > -7) ao8 += 2; else ao8 += lda; + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = data08; + b[ 8] = data09; + b[ 9] = data10; + b[10] = data11; + b[11] = data12; + b[12] = data13; + b[13] = data14; + b[14] = data15; + b[15] = data16; + + b += 16; + + offset --; + i --; + } + + posX += 8; + } + + if (n & 4) { + + offset = posX - posY; + + if (offset > 0) ao1 = a + posY * 2 + (posX + 0) * lda; else ao1 = a + (posX + 0) * 2 + posY * lda; + if (offset > -1) ao2 = a + posY * 2 + (posX + 1) * lda; else ao2 = a + (posX + 1) * 2 + posY * lda; + if (offset > -2) ao3 = a + posY * 2 + (posX + 2) * lda; else ao3 = a + (posX + 2) * 2 + posY * lda; + if (offset > -3) ao4 = a + posY * 2 + (posX + 3) * lda; else ao4 = a + (posX + 3) * 2 + posY * lda; + + i = m; + + while (i > 0) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao2 + 0); + data04 = *(ao2 + 1); + data05 = *(ao3 + 0); + data06 = *(ao3 + 1); + data07 = *(ao4 + 0); + data08 = *(ao4 + 1); + + if (offset > 0) ao1 += 2; else ao1 += lda; + if (offset > -1) ao2 += 2; else ao2 += lda; + if (offset > -2) ao3 += 2; else ao3 += lda; + if (offset > -3) ao4 += 2; else ao4 += lda; + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] 
= data04; + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = data08; + + b += 8; + + offset --; + i --; + } + + posX += 4; + } + + if (n & 2) { + + offset = posX - posY; + + if (offset > 0) ao1 = a + posY * 2 + (posX + 0) * lda; else ao1 = a + (posX + 0) * 2 + posY * lda; + if (offset > -1) ao2 = a + posY * 2 + (posX + 1) * lda; else ao2 = a + (posX + 1) * 2 + posY * lda; + + i = m; + + while (i > 0) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao2 + 0); + data04 = *(ao2 + 1); + + if (offset > 0) ao1 += 2; else ao1 += lda; + if (offset > -1) ao2 += 2; else ao2 += lda; + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + + b += 4; + + offset --; + i --; + } + + posX += 2; + } + + if (n & 1) { + + offset = posX - posY; + + if (offset > 0) ao1 = a + posY * 2 + (posX + 0) * lda; else ao1 = a + (posX + 0) * 2 + posY * lda; + + i = m; + + while (i > 0) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + + if (offset > 0) ao1 += 2; else ao1 += lda; + + b[ 0] = data01; + b[ 1] = data02; + + b += 2; + + offset --; + i --; + } + + } + + return 0; +} diff --git a/kernel/generic/ztrmm_lncopy_16.c b/kernel/generic/ztrmm_lncopy_16.c new file mode 100644 index 000000000..d7fb23176 --- /dev/null +++ b/kernel/generic/ztrmm_lncopy_16.c @@ -0,0 +1,2310 @@ +/******************************************************************************* +Copyright (c) 2024, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. 
Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ + +#include <stdio.h> +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, js; + BLASLONG X, ii; + + FLOAT *a01, *a02, *a03, *a04, *a05, *a06, *a07, *a08; + FLOAT *a09, *a10, *a11, *a12, *a13, *a14, *a15, *a16; + + lda += lda; + + js = (n >> 4); + + if (js > 0){ + do { + X = posX; + + if (posX <= posY) { + a01 = a + posY * 2 + (posX + 0) * lda; + a02 = a + posY * 2 + (posX + 1) * lda; + a03 = a + posY * 2 + (posX + 2) * lda; + a04 = a + posY * 2 + (posX + 3) * lda; + a05 = a + posY * 2 + (posX + 4) * lda; + a06 = a + posY * 2 + (posX + 5) * lda; + a07 = a + posY * 2 + (posX + 6) * lda; + a08 = a + posY * 2 + (posX + 7) * lda; + a09 = a + posY * 2 + (posX + 8) * lda; + a10 = a + posY * 2 + (posX + 9) * lda; + a11 = a + posY * 2 + (posX + 10) * lda; + a12 = a + posY * 2 + (posX + 11) * lda; + a13 = a + posY * 2 + (posX + 12) * lda; + a14 = a + posY * 2 + (posX + 13) * lda; + 
a15 = a + posY * 2 + (posX + 14) * lda; + a16 = a + posY * 2 + (posX + 15) * lda; + } else { + a01 = a + posX * 2 + (posY + 0) * lda; + a02 = a + posX * 2 + (posY + 1) * lda; + a03 = a + posX * 2 + (posY + 2) * lda; + a04 = a + posX * 2 + (posY + 3) * lda; + a05 = a + posX * 2 + (posY + 4) * lda; + a06 = a + posX * 2 + (posY + 5) * lda; + a07 = a + posX * 2 + (posY + 6) * lda; + a08 = a + posX * 2 + (posY + 7) * lda; + a09 = a + posX * 2 + (posY + 8) * lda; + a10 = a + posX * 2 + (posY + 9) * lda; + a11 = a + posX * 2 + (posY + 10) * lda; + a12 = a + posX * 2 + (posY + 11) * lda; + a13 = a + posX * 2 + (posY + 12) * lda; + a14 = a + posX * 2 + (posY + 13) * lda; + a15 = a + posX * 2 + (posY + 14) * lda; + a16 = a + posX * 2 + (posY + 15) * lda; + } + + i = (m >> 4); + if (i > 0) { + do { + if (X > posY) { + for (ii = 0; ii < 16; ii++){ + + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); + b[ 2] = *(a02 + 0); + b[ 3] = *(a02 + 1); + b[ 4] = *(a03 + 0); + b[ 5] = *(a03 + 1); + b[ 6] = *(a04 + 0); + b[ 7] = *(a04 + 1); + + b[ 8] = *(a05 + 0); + b[ 9] = *(a05 + 1); + b[ 10] = *(a06 + 0); + b[ 11] = *(a06 + 1); + b[ 12] = *(a07 + 0); + b[ 13] = *(a07 + 1); + b[ 14] = *(a08 + 0); + b[ 15] = *(a08 + 1); + + b[ 16] = *(a09 + 0); + b[ 17] = *(a09 + 1); + b[ 18] = *(a10 + 0); + b[ 19] = *(a10 + 1); + b[ 20] = *(a11 + 0); + b[ 21] = *(a11 + 1); + b[ 22] = *(a12 + 0); + b[ 23] = *(a12 + 1); + + b[ 24] = *(a13 + 0); + b[ 25] = *(a13 + 1); + b[ 26] = *(a14 + 0); + b[ 27] = *(a14 + 1); + b[ 28] = *(a15 + 0); + b[ 29] = *(a15 + 1); + b[ 30] = *(a16 + 0); + b[ 31] = *(a16 + 1); + + a01 += 2; + a02 += 2; + a03 += 2; + a04 += 2; + a05 += 2; + a06 += 2; + a07 += 2; + a08 += 2; + a09 += 2; + a10 += 2; + a11 += 2; + a12 += 2; + a13 += 2; + a14 += 2; + a15 += 2; + a16 += 2; + b += 32; + } + } else + if (X < posY) { + a01 += 16 * lda; + a02 += 16 * lda; + a03 += 16 * lda; + a04 += 16 * lda; + a05 += 16 * lda; + a06 += 16 * lda; + a07 += 16 * lda; + a08 += 16 * lda; + a09 += 16 * lda; + a10 += 
16 * lda; + a11 += 16 * lda; + a12 += 16 * lda; + a13 += 16 * lda; + a14 += 16 * lda; + a15 += 16 * lda; + a16 += 16 * lda; + + b += 512; + + } else { +#ifdef UNIT + b[ 0] = ONE; + b[ 1] = ZERO; +#else + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); +#endif + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b[ 8] = ZERO; + b[ 9] = ZERO; + b[ 10] = ZERO; + b[ 11] = ZERO; + b[ 12] = ZERO; + b[ 13] = ZERO; + b[ 14] = ZERO; + b[ 15] = ZERO; + b[ 16] = ZERO; + b[ 17] = ZERO; + b[ 18] = ZERO; + b[ 19] = ZERO; + b[ 20] = ZERO; + b[ 21] = ZERO; + b[ 22] = ZERO; + b[ 23] = ZERO; + b[ 24] = ZERO; + b[ 25] = ZERO; + b[ 26] = ZERO; + b[ 27] = ZERO; + b[ 28] = ZERO; + b[ 29] = ZERO; + b[ 30] = ZERO; + b[ 31] = ZERO; + + b[ 32] = *(a01 + 2); + b[ 33] = *(a01 + 3); +#ifdef UNIT + b[ 34] = ONE; + b[ 35] = ZERO; +#else + b[ 34] = *(a02 + 2); + b[ 35] = *(a02 + 3); +#endif + b[ 36] = ZERO; + b[ 37] = ZERO; + b[ 38] = ZERO; + b[ 39] = ZERO; + b[ 40] = ZERO; + b[ 41] = ZERO; + b[ 42] = ZERO; + b[ 43] = ZERO; + b[ 44] = ZERO; + b[ 45] = ZERO; + b[ 46] = ZERO; + b[ 47] = ZERO; + b[ 48] = ZERO; + b[ 49] = ZERO; + b[ 50] = ZERO; + b[ 51] = ZERO; + b[ 52] = ZERO; + b[ 53] = ZERO; + b[ 54] = ZERO; + b[ 55] = ZERO; + b[ 56] = ZERO; + b[ 57] = ZERO; + b[ 58] = ZERO; + b[ 59] = ZERO; + b[ 60] = ZERO; + b[ 61] = ZERO; + b[ 62] = ZERO; + b[ 63] = ZERO; + + b[ 64] = *(a01 + 4); + b[ 65] = *(a01 + 5); + b[ 66] = *(a02 + 4); + b[ 67] = *(a02 + 5); +#ifdef UNIT + b[ 68] = ONE; + b[ 69] = ZERO; +#else + b[ 68] = *(a03 + 4); + b[ 69] = *(a03 + 5); +#endif + b[ 70] = ZERO; + b[ 71] = ZERO; + b[ 72] = ZERO; + b[ 73] = ZERO; + b[ 74] = ZERO; + b[ 75] = ZERO; + b[ 76] = ZERO; + b[ 77] = ZERO; + b[ 78] = ZERO; + b[ 79] = ZERO; + b[ 80] = ZERO; + b[ 81] = ZERO; + b[ 82] = ZERO; + b[ 83] = ZERO; + b[ 84] = ZERO; + b[ 85] = ZERO; + b[ 86] = ZERO; + b[ 87] = ZERO; + b[ 88] = ZERO; + b[ 89] = ZERO; + b[ 90] = ZERO; + b[ 91] = ZERO; + b[ 92] = ZERO; + b[ 93] = ZERO; + 
b[ 94] = ZERO; + b[ 95] = ZERO; + + b[ 96] = *(a01 + 6); + b[ 97] = *(a01 + 7); + b[ 98] = *(a02 + 6); + b[ 99] = *(a02 + 7); + b[100] = *(a03 + 6); + b[101] = *(a03 + 7); +#ifdef UNIT + b[102] = ONE; + b[103] = ZERO; +#else + b[102] = *(a04 + 6); + b[103] = *(a04 + 7); +#endif + b[104] = ZERO; + b[105] = ZERO; + b[106] = ZERO; + b[107] = ZERO; + b[108] = ZERO; + b[109] = ZERO; + b[110] = ZERO; + b[111] = ZERO; + b[112] = ZERO; + b[113] = ZERO; + b[114] = ZERO; + b[115] = ZERO; + b[116] = ZERO; + b[117] = ZERO; + b[118] = ZERO; + b[119] = ZERO; + b[120] = ZERO; + b[121] = ZERO; + b[122] = ZERO; + b[123] = ZERO; + b[124] = ZERO; + b[125] = ZERO; + b[126] = ZERO; + b[127] = ZERO; + + b[128] = *(a01 + 8); + b[129] = *(a01 + 9); + b[130] = *(a02 + 8); + b[131] = *(a02 + 9); + b[132] = *(a03 + 8); + b[133] = *(a03 + 9); + b[134] = *(a04 + 8); + b[135] = *(a04 + 9); +#ifdef UNIT + b[136] = ONE; + b[137] = ZERO; +#else + b[136] = *(a05 + 8); + b[137] = *(a05 + 9); +#endif + b[138] = ZERO; + b[139] = ZERO; + b[140] = ZERO; + b[141] = ZERO; + b[142] = ZERO; + b[143] = ZERO; + b[144] = ZERO; + b[145] = ZERO; + b[146] = ZERO; + b[147] = ZERO; + b[148] = ZERO; + b[149] = ZERO; + b[150] = ZERO; + b[151] = ZERO; + b[152] = ZERO; + b[153] = ZERO; + b[154] = ZERO; + b[155] = ZERO; + b[156] = ZERO; + b[157] = ZERO; + b[158] = ZERO; + b[159] = ZERO; + + b[160] = *(a01 + 10); + b[161] = *(a01 + 11); + b[162] = *(a02 + 10); + b[163] = *(a02 + 11); + b[164] = *(a03 + 10); + b[165] = *(a03 + 11); + b[166] = *(a04 + 10); + b[167] = *(a04 + 11); + b[168] = *(a05 + 10); + b[169] = *(a05 + 11); +#ifdef UNIT + b[170] = ONE; + b[171] = ZERO; +#else + b[170] = *(a06 + 10); + b[171] = *(a06 + 11); +#endif + b[172] = ZERO; + b[173] = ZERO; + b[174] = ZERO; + b[175] = ZERO; + b[176] = ZERO; + b[177] = ZERO; + b[178] = ZERO; + b[179] = ZERO; + b[180] = ZERO; + b[181] = ZERO; + b[182] = ZERO; + b[183] = ZERO; + b[184] = ZERO; + b[185] = ZERO; + b[186] = ZERO; + b[187] = ZERO; + b[188] = ZERO; + 
b[189] = ZERO; + b[190] = ZERO; + b[191] = ZERO; + + b[192] = *(a01 + 12); + b[193] = *(a01 + 13); + b[194] = *(a02 + 12); + b[195] = *(a02 + 13); + b[196] = *(a03 + 12); + b[197] = *(a03 + 13); + b[198] = *(a04 + 12); + b[199] = *(a04 + 13); + b[200] = *(a05 + 12); + b[201] = *(a05 + 13); + b[202] = *(a06 + 12); + b[203] = *(a06 + 13); +#ifdef UNIT + b[204] = ONE; + b[205] = ZERO; +#else + b[204] = *(a07 + 12); + b[205] = *(a07 + 13); +#endif + b[206] = ZERO; + b[207] = ZERO; + b[208] = ZERO; + b[209] = ZERO; + b[210] = ZERO; + b[211] = ZERO; + b[212] = ZERO; + b[213] = ZERO; + b[214] = ZERO; + b[215] = ZERO; + b[216] = ZERO; + b[217] = ZERO; + b[218] = ZERO; + b[219] = ZERO; + b[220] = ZERO; + b[221] = ZERO; + b[222] = ZERO; + b[223] = ZERO; + + b[224] = *(a01 + 14); + b[225] = *(a01 + 15); + b[226] = *(a02 + 14); + b[227] = *(a02 + 15); + b[228] = *(a03 + 14); + b[229] = *(a03 + 15); + b[230] = *(a04 + 14); + b[231] = *(a04 + 15); + b[232] = *(a05 + 14); + b[233] = *(a05 + 15); + b[234] = *(a06 + 14); + b[235] = *(a06 + 15); + b[236] = *(a07 + 14); + b[237] = *(a07 + 15); +#ifdef UNIT + b[238] = ONE; + b[239] = ZERO; +#else + b[238] = *(a08 + 14); + b[239] = *(a08 + 15); +#endif + b[240] = ZERO; + b[241] = ZERO; + b[242] = ZERO; + b[243] = ZERO; + b[244] = ZERO; + b[245] = ZERO; + b[246] = ZERO; + b[247] = ZERO; + b[248] = ZERO; + b[249] = ZERO; + b[250] = ZERO; + b[251] = ZERO; + b[252] = ZERO; + b[253] = ZERO; + b[254] = ZERO; + b[255] = ZERO; + + b[256] = *(a01 + 16); + b[257] = *(a01 + 17); + b[258] = *(a02 + 16); + b[259] = *(a02 + 17); + b[260] = *(a03 + 16); + b[261] = *(a03 + 17); + b[262] = *(a04 + 16); + b[263] = *(a04 + 17); + b[264] = *(a05 + 16); + b[265] = *(a05 + 17); + b[266] = *(a06 + 16); + b[267] = *(a06 + 17); + b[268] = *(a07 + 16); + b[269] = *(a07 + 17); + b[270] = *(a08 + 16); + b[271] = *(a08 + 17); +#ifdef UNIT + b[272] = ONE; + b[273] = ZERO; +#else + b[272] = *(a09 + 16); + b[273] = *(a09 + 17); +#endif + b[274] = ZERO; + b[275] = 
ZERO; + b[276] = ZERO; + b[277] = ZERO; + b[278] = ZERO; + b[279] = ZERO; + b[280] = ZERO; + b[281] = ZERO; + b[282] = ZERO; + b[283] = ZERO; + b[284] = ZERO; + b[285] = ZERO; + b[286] = ZERO; + b[287] = ZERO; + + b[288] = *(a01 + 18); + b[289] = *(a01 + 19); + b[290] = *(a02 + 18); + b[291] = *(a02 + 19); + b[292] = *(a03 + 18); + b[293] = *(a03 + 19); + b[294] = *(a04 + 18); + b[295] = *(a04 + 19); + b[296] = *(a05 + 18); + b[297] = *(a05 + 19); + b[298] = *(a06 + 18); + b[299] = *(a06 + 19); + b[300] = *(a07 + 18); + b[301] = *(a07 + 19); + b[302] = *(a08 + 18); + b[303] = *(a08 + 19); + b[304] = *(a09 + 18); + b[305] = *(a09 + 19); +#ifdef UNIT + b[306] = ONE; + b[307] = ZERO; +#else + b[306] = *(a10 + 18); + b[307] = *(a10 + 19); +#endif + b[308] = ZERO; + b[309] = ZERO; + b[310] = ZERO; + b[311] = ZERO; + b[312] = ZERO; + b[313] = ZERO; + b[314] = ZERO; + b[315] = ZERO; + b[316] = ZERO; + b[317] = ZERO; + b[318] = ZERO; + b[319] = ZERO; + + b[320] = *(a01 + 20); + b[321] = *(a01 + 21); + b[322] = *(a02 + 20); + b[323] = *(a02 + 21); + b[324] = *(a03 + 20); + b[325] = *(a03 + 21); + b[326] = *(a04 + 20); + b[327] = *(a04 + 21); + b[328] = *(a05 + 20); + b[329] = *(a05 + 21); + b[330] = *(a06 + 20); + b[331] = *(a06 + 21); + b[332] = *(a07 + 20); + b[333] = *(a07 + 21); + b[334] = *(a08 + 20); + b[335] = *(a08 + 21); + b[336] = *(a09 + 20); + b[337] = *(a09 + 21); + b[338] = *(a10 + 20); + b[339] = *(a10 + 21); +#ifdef UNIT + b[340] = ONE; + b[341] = ZERO; +#else + b[340] = *(a11 + 20); + b[341] = *(a11 + 21); +#endif + b[342] = ZERO; + b[343] = ZERO; + b[344] = ZERO; + b[345] = ZERO; + b[346] = ZERO; + b[347] = ZERO; + b[348] = ZERO; + b[349] = ZERO; + b[350] = ZERO; + b[351] = ZERO; + + b[352] = *(a01 + 22); + b[353] = *(a01 + 23); + b[354] = *(a02 + 22); + b[355] = *(a02 + 23); + b[356] = *(a03 + 22); + b[357] = *(a03 + 23); + b[358] = *(a04 + 22); + b[359] = *(a04 + 23); + b[360] = *(a05 + 22); + b[361] = *(a05 + 23); + b[362] = *(a06 + 22); + b[363] = 
*(a06 + 23); + b[364] = *(a07 + 22); + b[365] = *(a07 + 23); + b[366] = *(a08 + 22); + b[367] = *(a08 + 23); + b[368] = *(a09 + 22); + b[369] = *(a09 + 23); + b[370] = *(a10 + 22); + b[371] = *(a10 + 23); + b[372] = *(a11 + 22); + b[373] = *(a11 + 23); +#ifdef UNIT + b[374] = ONE; + b[375] = ZERO; +#else + b[374] = *(a12 + 22); + b[375] = *(a12 + 23); +#endif + b[376] = ZERO; + b[377] = ZERO; + b[378] = ZERO; + b[379] = ZERO; + b[380] = ZERO; + b[381] = ZERO; + b[382] = ZERO; + b[383] = ZERO; + + b[384] = *(a01 + 24); + b[385] = *(a01 + 25); + b[386] = *(a02 + 24); + b[387] = *(a02 + 25); + b[388] = *(a03 + 24); + b[389] = *(a03 + 25); + b[390] = *(a04 + 24); + b[391] = *(a04 + 25); + b[392] = *(a05 + 24); + b[393] = *(a05 + 25); + b[394] = *(a06 + 24); + b[395] = *(a06 + 25); + b[396] = *(a07 + 24); + b[397] = *(a07 + 25); + b[398] = *(a08 + 24); + b[399] = *(a08 + 25); + b[400] = *(a09 + 24); + b[401] = *(a09 + 25); + b[402] = *(a10 + 24); + b[403] = *(a10 + 25); + b[404] = *(a11 + 24); + b[405] = *(a11 + 25); + b[406] = *(a12 + 24); + b[407] = *(a12 + 25); +#ifdef UNIT + b[408] = ONE; + b[409] = ZERO; +#else + b[408] = *(a13 + 24); + b[409] = *(a13 + 25); +#endif + b[410] = ZERO; + b[411] = ZERO; + b[412] = ZERO; + b[413] = ZERO; + b[414] = ZERO; + b[415] = ZERO; + + b[416] = *(a01 + 26); + b[417] = *(a01 + 27); + b[418] = *(a02 + 26); + b[419] = *(a02 + 27); + b[420] = *(a03 + 26); + b[421] = *(a03 + 27); + b[422] = *(a04 + 26); + b[423] = *(a04 + 27); + b[424] = *(a05 + 26); + b[425] = *(a05 + 27); + b[426] = *(a06 + 26); + b[427] = *(a06 + 27); + b[428] = *(a07 + 26); + b[429] = *(a07 + 27); + b[430] = *(a08 + 26); + b[431] = *(a08 + 27); + b[432] = *(a09 + 26); + b[433] = *(a09 + 27); + b[434] = *(a10 + 26); + b[435] = *(a10 + 27); + b[436] = *(a11 + 26); + b[437] = *(a11 + 27); + b[438] = *(a12 + 26); + b[439] = *(a12 + 27); + b[440] = *(a13 + 26); + b[441] = *(a13 + 27); +#ifdef UNIT + b[442] = ONE; + b[443] = ZERO; +#else + b[442] = *(a14 + 26); + b[443] 
= *(a14 + 27); +#endif + b[444] = ZERO; + b[445] = ZERO; + b[446] = ZERO; + b[447] = ZERO; + + b[448] = *(a01 + 28); + b[449] = *(a01 + 29); + b[450] = *(a02 + 28); + b[451] = *(a02 + 29); + b[452] = *(a03 + 28); + b[453] = *(a03 + 29); + b[454] = *(a04 + 28); + b[455] = *(a04 + 29); + b[456] = *(a05 + 28); + b[457] = *(a05 + 29); + b[458] = *(a06 + 28); + b[459] = *(a06 + 29); + b[460] = *(a07 + 28); + b[461] = *(a07 + 29); + b[462] = *(a08 + 28); + b[463] = *(a08 + 29); + b[464] = *(a09 + 28); + b[465] = *(a09 + 29); + b[466] = *(a10 + 28); + b[467] = *(a10 + 29); + b[468] = *(a11 + 28); + b[469] = *(a11 + 29); + b[470] = *(a12 + 28); + b[471] = *(a12 + 29); + b[472] = *(a13 + 28); + b[473] = *(a13 + 29); + b[474] = *(a14 + 28); + b[475] = *(a14 + 29); +#ifdef UNIT + b[476] = ONE; + b[477] = ZERO; +#else + b[476] = *(a15 + 28); + b[477] = *(a15 + 29); +#endif + b[478] = ZERO; + b[479] = ZERO; + + b[480] = *(a01 + 30); + b[481] = *(a01 + 31); + b[482] = *(a02 + 30); + b[483] = *(a02 + 31); + b[484] = *(a03 + 30); + b[485] = *(a03 + 31); + b[486] = *(a04 + 30); + b[487] = *(a04 + 31); + b[488] = *(a05 + 30); + b[489] = *(a05 + 31); + b[490] = *(a06 + 30); + b[491] = *(a06 + 31); + b[492] = *(a07 + 30); + b[493] = *(a07 + 31); + b[494] = *(a08 + 30); + b[495] = *(a08 + 31); + b[496] = *(a09 + 30); + b[497] = *(a09 + 31); + b[498] = *(a10 + 30); + b[499] = *(a10 + 31); + b[500] = *(a11 + 30); + b[501] = *(a11 + 31); + b[502] = *(a12 + 30); + b[503] = *(a12 + 31); + b[504] = *(a13 + 30); + b[505] = *(a13 + 31); + b[506] = *(a14 + 30); + b[507] = *(a14 + 31); + b[508] = *(a15 + 30); + b[509] = *(a15 + 31); +#ifdef UNIT + b[510] = ONE; + b[511] = ZERO; +#else + b[510] = *(a16 + 30); + b[511] = *(a16 + 31); +#endif + + a01 += 32; + a02 += 32; + a03 += 32; + a04 += 32; + a05 += 32; + a06 += 32; + a07 += 32; + a08 += 32; + a09 += 32; + a10 += 32; + a11 += 32; + a12 += 32; + a13 += 32; + a14 += 32; + a15 += 32; + a16 += 32; + b += 512; + } + + X += 16; + i --; + } while (i 
> 0); + } + + i = (m & 15); + if (i) { + + if (X > posY) { + + for (ii = 0; ii < i; ii++){ + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); + b[ 2] = *(a02 + 0); + b[ 3] = *(a02 + 1); + b[ 4] = *(a03 + 0); + b[ 5] = *(a03 + 1); + b[ 6] = *(a04 + 0); + b[ 7] = *(a04 + 1); + b[ 8] = *(a05 + 0); + b[ 9] = *(a05 + 1); + b[ 10] = *(a06 + 0); + b[ 11] = *(a06 + 1); + b[ 12] = *(a07 + 0); + b[ 13] = *(a07 + 1); + b[ 14] = *(a08 + 0); + b[ 15] = *(a08 + 1); + + b[ 16] = *(a09 + 0); + b[ 17] = *(a09 + 1); + b[ 18] = *(a10 + 0); + b[ 19] = *(a10 + 1); + b[ 20] = *(a11 + 0); + b[ 21] = *(a11 + 1); + b[ 22] = *(a12 + 0); + b[ 23] = *(a12 + 1); + b[ 24] = *(a13 + 0); + b[ 25] = *(a13 + 1); + b[ 26] = *(a14 + 0); + b[ 27] = *(a14 + 1); + b[ 28] = *(a15 + 0); + b[ 29] = *(a15 + 1); + b[ 30] = *(a16 + 0); + b[ 31] = *(a16 + 1); + + a01 += 2; + a02 += 2; + a03 += 2; + a04 += 2; + a05 += 2; + a06 += 2; + a07 += 2; + a08 += 2; + a09 += 2; + a10 += 2; + a11 += 2; + a12 += 2; + a13 += 2; + a14 += 2; + a15 += 2; + a16 += 2; + b += 32; + } + } else + if (X < posY) { + /* a01 += i * lda; + a02 += i * lda; + a03 += i * lda; + a04 += i * lda; + a05 += i * lda; + a06 += i * lda; + a07 += i * lda; + a08 += i * lda; + a09 += i * lda; + a10 += i * lda; + a11 += i * lda; + a12 += i * lda; + a13 += i * lda; + a14 += i * lda; + a15 += i * lda; + a16 += i * lda; */ + b += 32 * i; + } else { +#ifdef UNIT + b[ 0] = ONE; + b[ 1] = ZERO; +#else + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); +#endif + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b[ 8] = ZERO; + b[ 9] = ZERO; + b[ 10] = ZERO; + b[ 11] = ZERO; + b[ 12] = ZERO; + b[ 13] = ZERO; + b[ 14] = ZERO; + b[ 15] = ZERO; + b[ 16] = ZERO; + b[ 17] = ZERO; + b[ 18] = ZERO; + b[ 19] = ZERO; + b[ 20] = ZERO; + b[ 21] = ZERO; + b[ 22] = ZERO; + b[ 23] = ZERO; + b[ 24] = ZERO; + b[ 25] = ZERO; + b[ 26] = ZERO; + b[ 27] = ZERO; + b[ 28] = ZERO; + b[ 29] = ZERO; + b[ 30] = ZERO; + b[ 31] = ZERO; + b += 32; + + if 
(i >= 2) { + b[ 0] = *(a01 + 2); + b[ 1] = *(a01 + 3); +#ifdef UNIT + b[ 2] = ONE; + b[ 3] = ZERO; +#else + b[ 2] = *(a02 + 2); + b[ 3] = *(a02 + 3); +#endif + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b[ 8] = ZERO; + b[ 9] = ZERO; + b[ 10] = ZERO; + b[ 11] = ZERO; + b[ 12] = ZERO; + b[ 13] = ZERO; + b[ 14] = ZERO; + b[ 15] = ZERO; + b[ 16] = ZERO; + b[ 17] = ZERO; + b[ 18] = ZERO; + b[ 19] = ZERO; + b[ 20] = ZERO; + b[ 21] = ZERO; + b[ 22] = ZERO; + b[ 23] = ZERO; + b[ 24] = ZERO; + b[ 25] = ZERO; + b[ 26] = ZERO; + b[ 27] = ZERO; + b[ 28] = ZERO; + b[ 29] = ZERO; + b[ 30] = ZERO; + b[ 31] = ZERO; + b += 32; + } + + if (i >= 3) { + b[ 0] = *(a01 + 4); + b[ 1] = *(a01 + 5); + b[ 2] = *(a02 + 4); + b[ 3] = *(a02 + 5); +#ifdef UNIT + b[ 4] = ONE; + b[ 5] = ZERO; +#else + b[ 4] = *(a03 + 4); + b[ 5] = *(a03 + 5); +#endif + b[ 6] = ZERO; + b[ 7] = ZERO; + b[ 8] = ZERO; + b[ 9] = ZERO; + b[ 10] = ZERO; + b[ 11] = ZERO; + b[ 12] = ZERO; + b[ 13] = ZERO; + b[ 14] = ZERO; + b[ 15] = ZERO; + b[ 16] = ZERO; + b[ 17] = ZERO; + b[ 18] = ZERO; + b[ 19] = ZERO; + b[ 20] = ZERO; + b[ 21] = ZERO; + b[ 22] = ZERO; + b[ 23] = ZERO; + b[ 24] = ZERO; + b[ 25] = ZERO; + b[ 26] = ZERO; + b[ 27] = ZERO; + b[ 28] = ZERO; + b[ 29] = ZERO; + b[ 30] = ZERO; + b[ 31] = ZERO; + b += 32; + } + + if (i >= 4) { + b[ 0] = *(a01 + 6); + b[ 1] = *(a01 + 7); + b[ 2] = *(a02 + 6); + b[ 3] = *(a02 + 7); + b[ 4] = *(a03 + 6); + b[ 5] = *(a03 + 7); +#ifdef UNIT + b[ 6] = ONE; + b[ 7] = ZERO; +#else + b[ 6] = *(a04 + 6); + b[ 7] = *(a04 + 7); +#endif + b[ 8] = ZERO; + b[ 9] = ZERO; + b[ 10] = ZERO; + b[ 11] = ZERO; + b[ 12] = ZERO; + b[ 13] = ZERO; + b[ 14] = ZERO; + b[ 15] = ZERO; + b[ 16] = ZERO; + b[ 17] = ZERO; + b[ 18] = ZERO; + b[ 19] = ZERO; + b[ 20] = ZERO; + b[ 21] = ZERO; + b[ 22] = ZERO; + b[ 23] = ZERO; + b[ 24] = ZERO; + b[ 25] = ZERO; + b[ 26] = ZERO; + b[ 27] = ZERO; + b[ 28] = ZERO; + b[ 29] = ZERO; + b[ 30] = ZERO; + b[ 31] = ZERO; + b += 32; + } + + if (i >= 5) { + 
b[ 0] = *(a01 + 8); + b[ 1] = *(a01 + 9); + b[ 2] = *(a02 + 8); + b[ 3] = *(a02 + 9); + b[ 4] = *(a03 + 8); + b[ 5] = *(a03 + 9); + b[ 6] = *(a04 + 8); + b[ 7] = *(a04 + 9); +#ifdef UNIT + b[ 8] = ONE; + b[ 9] = ZERO; +#else + b[ 8] = *(a05 + 8); + b[ 9] = *(a05 + 9); +#endif + b[ 10] = ZERO; + b[ 11] = ZERO; + b[ 12] = ZERO; + b[ 13] = ZERO; + b[ 14] = ZERO; + b[ 15] = ZERO; + b[ 16] = ZERO; + b[ 17] = ZERO; + b[ 18] = ZERO; + b[ 19] = ZERO; + b[ 20] = ZERO; + b[ 21] = ZERO; + b[ 22] = ZERO; + b[ 23] = ZERO; + b[ 24] = ZERO; + b[ 25] = ZERO; + b[ 26] = ZERO; + b[ 27] = ZERO; + b[ 28] = ZERO; + b[ 29] = ZERO; + b[ 30] = ZERO; + b[ 31] = ZERO; + b += 32; + } + + if (i >= 6) { + b[ 0] = *(a01 + 10); + b[ 1] = *(a01 + 11); + b[ 2] = *(a02 + 10); + b[ 3] = *(a02 + 11); + b[ 4] = *(a03 + 10); + b[ 5] = *(a03 + 11); + b[ 6] = *(a04 + 10); + b[ 7] = *(a04 + 11); + b[ 8] = *(a05 + 10); + b[ 9] = *(a05 + 11); +#ifdef UNIT + b[10] = ONE; + b[11] = ZERO; +#else + b[10] = *(a06 + 10); + b[11] = *(a06 + 11); +#endif + b[ 12] = ZERO; + b[ 13] = ZERO; + b[ 14] = ZERO; + b[ 15] = ZERO; + b[ 16] = ZERO; + b[ 17] = ZERO; + b[ 18] = ZERO; + b[ 19] = ZERO; + b[ 20] = ZERO; + b[ 21] = ZERO; + b[ 22] = ZERO; + b[ 23] = ZERO; + b[ 24] = ZERO; + b[ 25] = ZERO; + b[ 26] = ZERO; + b[ 27] = ZERO; + b[ 28] = ZERO; + b[ 29] = ZERO; + b[ 30] = ZERO; + b[ 31] = ZERO; + b += 32; + } + + if (i >= 7) { + b[ 0] = *(a01 + 12); + b[ 1] = *(a01 + 13); + b[ 2] = *(a02 + 12); + b[ 3] = *(a02 + 13); + b[ 4] = *(a03 + 12); + b[ 5] = *(a03 + 13); + b[ 6] = *(a04 + 12); + b[ 7] = *(a04 + 13); + b[ 8] = *(a05 + 12); + b[ 9] = *(a05 + 13); + b[10] = *(a06 + 12); + b[11] = *(a06 + 13); +#ifdef UNIT + b[12] = ONE; + b[13] = ZERO; +#else + b[12] = *(a07 + 12); + b[13] = *(a07 + 13); +#endif + b[ 14] = ZERO; + b[ 15] = ZERO; + b[ 16] = ZERO; + b[ 17] = ZERO; + b[ 18] = ZERO; + b[ 19] = ZERO; + b[ 20] = ZERO; + b[ 21] = ZERO; + b[ 22] = ZERO; + b[ 23] = ZERO; + b[ 24] = ZERO; + b[ 25] = ZERO; + b[ 26] = ZERO; + b[ 
27] = ZERO; + b[ 28] = ZERO; + b[ 29] = ZERO; + b[ 30] = ZERO; + b[ 31] = ZERO; + b += 32; + } + + if (i >= 8) { + b[ 0] = *(a01 + 14); + b[ 1] = *(a01 + 15); + b[ 2] = *(a02 + 14); + b[ 3] = *(a02 + 15); + b[ 4] = *(a03 + 14); + b[ 5] = *(a03 + 15); + b[ 6] = *(a04 + 14); + b[ 7] = *(a04 + 15); + b[ 8] = *(a05 + 14); + b[ 9] = *(a05 + 15); + b[ 10] = *(a06 + 14); + b[ 11] = *(a06 + 15); + b[ 12] = *(a07 + 14); + b[ 13] = *(a07 + 15); +#ifdef UNIT + b[ 14] = ONE; + b[ 15] = ZERO; +#else + b[ 14] = *(a08 + 14); + b[ 15] = *(a08 + 15); +#endif + b[ 16] = ZERO; + b[ 17] = ZERO; + b[ 18] = ZERO; + b[ 19] = ZERO; + b[ 20] = ZERO; + b[ 21] = ZERO; + b[ 22] = ZERO; + b[ 23] = ZERO; + b[ 24] = ZERO; + b[ 25] = ZERO; + b[ 26] = ZERO; + b[ 27] = ZERO; + b[ 28] = ZERO; + b[ 29] = ZERO; + b[ 30] = ZERO; + b[ 31] = ZERO; + b += 32; + } + + if (i >= 9) { + b[ 0] = *(a01 + 16); + b[ 1] = *(a01 + 17); + b[ 2] = *(a02 + 16); + b[ 3] = *(a02 + 17); + b[ 4] = *(a03 + 16); + b[ 5] = *(a03 + 17); + b[ 6] = *(a04 + 16); + b[ 7] = *(a04 + 17); + b[ 8] = *(a05 + 16); + b[ 9] = *(a05 + 17); + b[ 10] = *(a06 + 16); + b[ 11] = *(a06 + 17); + b[ 12] = *(a07 + 16); + b[ 13] = *(a07 + 17); + b[ 14] = *(a08 + 16); + b[ 15] = *(a08 + 17); +#ifdef UNIT + b[ 16] = ONE; + b[ 17] = ZERO; +#else + b[ 16] = *(a09 + 16); + b[ 17] = *(a09 + 17); +#endif + b[ 18] = ZERO; + b[ 19] = ZERO; + b[ 20] = ZERO; + b[ 21] = ZERO; + b[ 22] = ZERO; + b[ 23] = ZERO; + b[ 24] = ZERO; + b[ 25] = ZERO; + b[ 26] = ZERO; + b[ 27] = ZERO; + b[ 28] = ZERO; + b[ 29] = ZERO; + b[ 30] = ZERO; + b[ 31] = ZERO; + b += 32; + } + + if (i >= 10) { + b[ 0] = *(a01 + 18); + b[ 1] = *(a01 + 19); + b[ 2] = *(a02 + 18); + b[ 3] = *(a02 + 19); + b[ 4] = *(a03 + 18); + b[ 5] = *(a03 + 19); + b[ 6] = *(a04 + 18); + b[ 7] = *(a04 + 19); + b[ 8] = *(a05 + 18); + b[ 9] = *(a05 + 19); + b[ 10] = *(a06 + 18); + b[ 11] = *(a06 + 19); + b[ 12] = *(a07 + 18); + b[ 13] = *(a07 + 19); + b[ 14] = *(a08 + 18); + b[ 15] = *(a08 + 19); + b[ 16] = *(a09 
+ 18); + b[ 17] = *(a09 + 19); +#ifdef UNIT + b[ 18] = ONE; + b[ 19] = ZERO; +#else + b[ 18] = *(a10 + 18); + b[ 19] = *(a10 + 19); +#endif + b[ 20] = ZERO; + b[ 21] = ZERO; + b[ 22] = ZERO; + b[ 23] = ZERO; + b[ 24] = ZERO; + b[ 25] = ZERO; + b[ 26] = ZERO; + b[ 27] = ZERO; + b[ 28] = ZERO; + b[ 29] = ZERO; + b[ 30] = ZERO; + b[ 31] = ZERO; + b += 32; + } + + if (i >= 11) { + b[ 0] = *(a01 + 20); + b[ 1] = *(a01 + 21); + b[ 2] = *(a02 + 20); + b[ 3] = *(a02 + 21); + b[ 4] = *(a03 + 20); + b[ 5] = *(a03 + 21); + b[ 6] = *(a04 + 20); + b[ 7] = *(a04 + 21); + b[ 8] = *(a05 + 20); + b[ 9] = *(a05 + 21); + b[ 10] = *(a06 + 20); + b[ 11] = *(a06 + 21); + b[ 12] = *(a07 + 20); + b[ 13] = *(a07 + 21); + b[ 14] = *(a08 + 20); + b[ 15] = *(a08 + 21); + b[ 16] = *(a09 + 20); + b[ 17] = *(a09 + 21); + b[ 18] = *(a10 + 20); + b[ 19] = *(a10 + 21); +#ifdef UNIT + b[ 20] = ONE; + b[ 21] = ZERO; +#else + b[ 20] = *(a11 + 20); + b[ 21] = *(a11 + 21); +#endif + b[ 22] = ZERO; + b[ 23] = ZERO; + b[ 24] = ZERO; + b[ 25] = ZERO; + b[ 26] = ZERO; + b[ 27] = ZERO; + b[ 28] = ZERO; + b[ 29] = ZERO; + b[ 30] = ZERO; + b[ 31] = ZERO; + b += 32; + } + + if (i >= 12) { + b[ 0] = *(a01 + 22); + b[ 1] = *(a01 + 23); + b[ 2] = *(a02 + 22); + b[ 3] = *(a02 + 23); + b[ 4] = *(a03 + 22); + b[ 5] = *(a03 + 23); + b[ 6] = *(a04 + 22); + b[ 7] = *(a04 + 23); + b[ 8] = *(a05 + 22); + b[ 9] = *(a05 + 23); + b[ 10] = *(a06 + 22); + b[ 11] = *(a06 + 23); + b[ 12] = *(a07 + 22); + b[ 13] = *(a07 + 23); + b[ 14] = *(a08 + 22); + b[ 15] = *(a08 + 23); + b[ 16] = *(a09 + 22); + b[ 17] = *(a09 + 23); + b[ 18] = *(a10 + 22); + b[ 19] = *(a10 + 23); + b[ 20] = *(a11 + 22); + b[ 21] = *(a11 + 23); +#ifdef UNIT + b[ 22] = ONE; + b[ 23] = ZERO; +#else + b[ 22] = *(a12 + 22); + b[ 23] = *(a12 + 23); +#endif + b[ 24] = ZERO; + b[ 25] = ZERO; + b[ 26] = ZERO; + b[ 27] = ZERO; + b[ 28] = ZERO; + b[ 29] = ZERO; + b[ 30] = ZERO; + b[ 31] = ZERO; + b += 32; + } + + if (i >= 13) { + b[ 0] = *(a01 + 24); + b[ 1] = *(a01 + 
25); + b[ 2] = *(a02 + 24); + b[ 3] = *(a02 + 25); + b[ 4] = *(a03 + 24); + b[ 5] = *(a03 + 25); + b[ 6] = *(a04 + 24); + b[ 7] = *(a04 + 25); + b[ 8] = *(a05 + 24); + b[ 9] = *(a05 + 25); + b[ 10] = *(a06 + 24); + b[ 11] = *(a06 + 25); + b[ 12] = *(a07 + 24); + b[ 13] = *(a07 + 25); + b[ 14] = *(a08 + 24); + b[ 15] = *(a08 + 25); + b[ 16] = *(a09 + 24); + b[ 17] = *(a09 + 25); + b[ 18] = *(a10 + 24); + b[ 19] = *(a10 + 25); + b[ 20] = *(a11 + 24); + b[ 21] = *(a11 + 25); + b[ 22] = *(a12 + 24); + b[ 23] = *(a12 + 25); +#ifdef UNIT + b[ 24] = ONE; + b[ 25] = ZERO; +#else + b[ 24] = *(a13 + 24); + b[ 25] = *(a13 + 25); +#endif + b[ 26] = ZERO; + b[ 27] = ZERO; + b[ 28] = ZERO; + b[ 29] = ZERO; + b[ 30] = ZERO; + b[ 31] = ZERO; + b += 32; + } + + if (i >= 14) { + b[ 0] = *(a01 + 26); + b[ 1] = *(a01 + 27); + b[ 2] = *(a02 + 26); + b[ 3] = *(a02 + 27); + b[ 4] = *(a03 + 26); + b[ 5] = *(a03 + 27); + b[ 6] = *(a04 + 26); + b[ 7] = *(a04 + 27); + b[ 8] = *(a05 + 26); + b[ 9] = *(a05 + 27); + b[ 10] = *(a06 + 26); + b[ 11] = *(a06 + 27); + b[ 12] = *(a07 + 26); + b[ 13] = *(a07 + 27); + b[ 14] = *(a08 + 26); + b[ 15] = *(a08 + 27); + b[ 16] = *(a09 + 26); + b[ 17] = *(a09 + 27); + b[ 18] = *(a10 + 26); + b[ 19] = *(a10 + 27); + b[ 20] = *(a11 + 26); + b[ 21] = *(a11 + 27); + b[ 22] = *(a12 + 26); + b[ 23] = *(a12 + 27); + b[ 24] = *(a13 + 26); + b[ 25] = *(a13 + 27); +#ifdef UNIT + b[ 26] = ONE; + b[ 27] = ZERO; +#else + b[ 26] = *(a14 + 26); + b[ 27] = *(a14 + 27); +#endif + b[ 28] = ZERO; + b[ 29] = ZERO; + b[ 30] = ZERO; + b[ 31] = ZERO; + b += 32; + } + + if (i >= 15) { + b[ 0] = *(a01 + 28); + b[ 1] = *(a01 + 29); + b[ 2] = *(a02 + 28); + b[ 3] = *(a02 + 29); + b[ 4] = *(a03 + 28); + b[ 5] = *(a03 + 29); + b[ 6] = *(a04 + 28); + b[ 7] = *(a04 + 29); + b[ 8] = *(a05 + 28); + b[ 9] = *(a05 + 29); + b[ 10] = *(a06 + 28); + b[ 11] = *(a06 + 29); + b[ 12] = *(a07 + 28); + b[ 13] = *(a07 + 29); + b[ 14] = *(a08 + 28); + b[ 15] = *(a08 + 29); + b[ 16] = *(a09 + 28); + b[ 
17] = *(a09 + 29); + b[ 18] = *(a10 + 28); + b[ 19] = *(a10 + 29); + b[ 20] = *(a11 + 28); + b[ 21] = *(a11 + 29); + b[ 22] = *(a12 + 28); + b[ 23] = *(a12 + 29); + b[ 24] = *(a13 + 28); + b[ 25] = *(a13 + 29); + b[ 26] = *(a14 + 28); + b[ 27] = *(a14 + 29); +#ifdef UNIT + b[ 28] = ONE; + b[ 29] = ZERO; +#else + b[ 28] = *(a15 + 28); + b[ 29] = *(a15 + 29); +#endif + b[ 30] = ZERO; + b[ 31] = ZERO; + b += 32; + } + } + } + + posY += 16; + js --; + } while (js > 0); + } /* End of main loop */ + + + if (n & 8){ + X = posX; + + if (posX <= posY) { + a01 = a + posY * 2 + (posX + 0) * lda; + a02 = a + posY * 2 + (posX + 1) * lda; + a03 = a + posY * 2 + (posX + 2) * lda; + a04 = a + posY * 2 + (posX + 3) * lda; + a05 = a + posY * 2 + (posX + 4) * lda; + a06 = a + posY * 2 + (posX + 5) * lda; + a07 = a + posY * 2 + (posX + 6) * lda; + a08 = a + posY * 2 + (posX + 7) * lda; + } else { + a01 = a + posX * 2 + (posY + 0) * lda; + a02 = a + posX * 2 + (posY + 1) * lda; + a03 = a + posX * 2 + (posY + 2) * lda; + a04 = a + posX * 2 + (posY + 3) * lda; + a05 = a + posX * 2 + (posY + 4) * lda; + a06 = a + posX * 2 + (posY + 5) * lda; + a07 = a + posX * 2 + (posY + 6) * lda; + a08 = a + posX * 2 + (posY + 7) * lda; + } + + i = (m >> 3); + if (i > 0) { + do { + if (X > posY) { + for (ii = 0; ii < 8; ii++){ + + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); + b[ 2] = *(a02 + 0); + b[ 3] = *(a02 + 1); + b[ 4] = *(a03 + 0); + b[ 5] = *(a03 + 1); + b[ 6] = *(a04 + 0); + b[ 7] = *(a04 + 1); + + b[ 8] = *(a05 + 0); + b[ 9] = *(a05 + 1); + b[ 10] = *(a06 + 0); + b[ 11] = *(a06 + 1); + b[ 12] = *(a07 + 0); + b[ 13] = *(a07 + 1); + b[ 14] = *(a08 + 0); + b[ 15] = *(a08 + 1); + + a01 += 2; + a02 += 2; + a03 += 2; + a04 += 2; + a05 += 2; + a06 += 2; + a07 += 2; + a08 += 2; + b += 16; + } + } else + if (X < posY) { + a01 += 8 * lda; + a02 += 8 * lda; + a03 += 8 * lda; + a04 += 8 * lda; + a05 += 8 * lda; + a06 += 8 * lda; + a07 += 8 * lda; + a08 += 8 * lda; + + b += 128; + } else { +#ifdef UNIT + b[ 
0] = ONE; + b[ 1] = ZERO; +#else + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); +#endif + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b[ 8] = ZERO; + b[ 9] = ZERO; + b[ 10] = ZERO; + b[ 11] = ZERO; + b[ 12] = ZERO; + b[ 13] = ZERO; + b[ 14] = ZERO; + b[ 15] = ZERO; + + b[ 16] = *(a01 + 2); + b[ 17] = *(a01 + 3); +#ifdef UNIT + b[ 18] = ONE; + b[ 19] = ZERO; +#else + b[ 18] = *(a02 + 2); + b[ 19] = *(a02 + 3); +#endif + b[ 20] = ZERO; + b[ 21] = ZERO; + b[ 22] = ZERO; + b[ 23] = ZERO; + b[ 24] = ZERO; + b[ 25] = ZERO; + b[ 26] = ZERO; + b[ 27] = ZERO; + b[ 28] = ZERO; + b[ 29] = ZERO; + b[ 30] = ZERO; + b[ 31] = ZERO; + + b[ 32] = *(a01 + 4); + b[ 33] = *(a01 + 5); + b[ 34] = *(a02 + 4); + b[ 35] = *(a02 + 5); +#ifdef UNIT + b[ 36] = ONE; + b[ 37] = ZERO; +#else + b[ 36] = *(a03 + 4); + b[ 37] = *(a03 + 5); +#endif + b[ 38] = ZERO; + b[ 39] = ZERO; + b[ 40] = ZERO; + b[ 41] = ZERO; + b[ 42] = ZERO; + b[ 43] = ZERO; + b[ 44] = ZERO; + b[ 45] = ZERO; + b[ 46] = ZERO; + b[ 47] = ZERO; + + b[ 48] = *(a01 + 6); + b[ 49] = *(a01 + 7); + b[ 50] = *(a02 + 6); + b[ 51] = *(a02 + 7); + b[ 52] = *(a03 + 6); + b[ 53] = *(a03 + 7); +#ifdef UNIT + b[ 54] = ONE; + b[ 55] = ZERO; +#else + b[ 54] = *(a04 + 6); + b[ 55] = *(a04 + 7); +#endif + b[ 56] = ZERO; + b[ 57] = ZERO; + b[ 58] = ZERO; + b[ 59] = ZERO; + b[ 60] = ZERO; + b[ 61] = ZERO; + b[ 62] = ZERO; + b[ 63] = ZERO; + + b[ 64] = *(a01 + 8); + b[ 65] = *(a01 + 9); + b[ 66] = *(a02 + 8); + b[ 67] = *(a02 + 9); + b[ 68] = *(a03 + 8); + b[ 69] = *(a03 + 9); + b[ 70] = *(a04 + 8); + b[ 71] = *(a04 + 9); +#ifdef UNIT + b[ 72] = ONE; + b[ 73] = ZERO; +#else + b[ 72] = *(a05 + 8); + b[ 73] = *(a05 + 9); +#endif + b[ 74] = ZERO; + b[ 75] = ZERO; + b[ 76] = ZERO; + b[ 77] = ZERO; + b[ 78] = ZERO; + b[ 79] = ZERO; + + b[ 80] = *(a01 + 10); + b[ 81] = *(a01 + 11); + b[ 82] = *(a02 + 10); + b[ 83] = *(a02 + 11); + b[ 84] = *(a03 + 10); + b[ 85] = *(a03 + 11); + b[ 86] = *(a04 + 10); + b[ 
87] = *(a04 + 11); + b[ 88] = *(a05 + 10); + b[ 89] = *(a05 + 11); +#ifdef UNIT + b[ 90] = ONE; + b[ 91] = ZERO; +#else + b[ 90] = *(a06 + 10); + b[ 91] = *(a06 + 11); +#endif + b[ 92] = ZERO; + b[ 93] = ZERO; + b[ 94] = ZERO; + b[ 95] = ZERO; + + b[ 96] = *(a01 + 12); + b[ 97] = *(a01 + 13); + b[ 98] = *(a02 + 12); + b[ 99] = *(a02 + 13); + b[100] = *(a03 + 12); + b[101] = *(a03 + 13); + b[102] = *(a04 + 12); + b[103] = *(a04 + 13); + b[104] = *(a05 + 12); + b[105] = *(a05 + 13); + b[106] = *(a06 + 12); + b[107] = *(a06 + 13); +#ifdef UNIT + b[108] = ONE; + b[109] = ZERO; +#else + b[108] = *(a07 + 12); + b[109] = *(a07 + 13); +#endif + b[110] = ZERO; + b[111] = ZERO; + + b[112] = *(a01 + 14); + b[113] = *(a01 + 15); + b[114] = *(a02 + 14); + b[115] = *(a02 + 15); + b[116] = *(a03 + 14); + b[117] = *(a03 + 15); + b[118] = *(a04 + 14); + b[119] = *(a04 + 15); + b[120] = *(a05 + 14); + b[121] = *(a05 + 15); + b[122] = *(a06 + 14); + b[123] = *(a06 + 15); + b[124] = *(a07 + 14); + b[125] = *(a07 + 15); +#ifdef UNIT + b[126] = ONE; + b[127] = ZERO; +#else + b[126] = *(a08 + 14); + b[127] = *(a08 + 15); +#endif + + a01 += 16; + a02 += 16; + a03 += 16; + a04 += 16; + a05 += 16; + a06 += 16; + a07 += 16; + a08 += 16; + b += 128; + } + + X += 8; + i --; + } while (i > 0); + } + + i = (m & 7); + if (i) { + + if (X > posY) { + for (ii = 0; ii < i; ii++){ + + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); + b[ 2] = *(a02 + 0); + b[ 3] = *(a02 + 1); + b[ 4] = *(a03 + 0); + b[ 5] = *(a03 + 1); + b[ 6] = *(a04 + 0); + b[ 7] = *(a04 + 1); + + b[ 8] = *(a05 + 0); + b[ 9] = *(a05 + 1); + b[ 10] = *(a06 + 0); + b[ 11] = *(a06 + 1); + b[ 12] = *(a07 + 0); + b[ 13] = *(a07 + 1); + b[ 14] = *(a08 + 0); + b[ 15] = *(a08 + 1); + + a01 += 2; + a02 += 2; + a03 += 2; + a04 += 2; + a05 += 2; + a06 += 2; + a07 += 2; + a08 += 2; + b += 16; + } + } else + if (X < posY) { + /* a01 += i * lda; + a02 += i * lda; + a03 += i * lda; + a04 += i * lda; + a05 += i * lda; + a06 += i * lda; + a07 += i * lda; + 
a08 += i * lda; */ + b += 16 * i; + } else { +#ifdef UNIT + b[ 0] = ONE; + b[ 1] = ZERO; +#else + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); +#endif + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b[ 8] = ZERO; + b[ 9] = ZERO; + b[ 10] = ZERO; + b[ 11] = ZERO; + b[ 12] = ZERO; + b[ 13] = ZERO; + b[ 14] = ZERO; + b[ 15] = ZERO; + b += 16; + + if (i >= 2) { + b[ 0] = *(a01 + 2); + b[ 1] = *(a01 + 3); +#ifdef UNIT + b[ 2] = ONE; + b[ 3] = ZERO; +#else + b[ 2] = *(a02 + 2); + b[ 3] = *(a02 + 3); +#endif + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b[ 8] = ZERO; + b[ 9] = ZERO; + b[10] = ZERO; + b[11] = ZERO; + b[12] = ZERO; + b[13] = ZERO; + b[14] = ZERO; + b[15] = ZERO; + b += 16; + } + + if (i >= 3) { + b[ 0] = *(a01 + 4); + b[ 1] = *(a01 + 5); + b[ 2] = *(a02 + 4); + b[ 3] = *(a02 + 5); +#ifdef UNIT + b[ 4] = ONE; + b[ 5] = ZERO; +#else + b[ 4] = *(a03 + 4); + b[ 5] = *(a03 + 5); +#endif + b[ 6] = ZERO; + b[ 7] = ZERO; + b[ 8] = ZERO; + b[ 9] = ZERO; + b[10] = ZERO; + b[11] = ZERO; + b[12] = ZERO; + b[13] = ZERO; + b[14] = ZERO; + b[15] = ZERO; + b += 16; + } + + if (i >= 4) { + b[ 0] = *(a01 + 6); + b[ 1] = *(a01 + 7); + b[ 2] = *(a02 + 6); + b[ 3] = *(a02 + 7); + b[ 4] = *(a03 + 6); + b[ 5] = *(a03 + 7); +#ifdef UNIT + b[ 6] = ONE; + b[ 7] = ZERO; +#else + b[ 6] = *(a04 + 6); + b[ 7] = *(a04 + 7); +#endif + b[ 8] = ZERO; + b[ 9] = ZERO; + b[10] = ZERO; + b[11] = ZERO; + b[12] = ZERO; + b[13] = ZERO; + b[14] = ZERO; + b[15] = ZERO; + b += 16; + } + + if (i >= 5) { + b[ 0] = *(a01 + 8); + b[ 1] = *(a01 + 9); + b[ 2] = *(a02 + 8); + b[ 3] = *(a02 + 9); + b[ 4] = *(a03 + 8); + b[ 5] = *(a03 + 9); + b[ 6] = *(a04 + 8); + b[ 7] = *(a04 + 9); +#ifdef UNIT + b[ 8] = ONE; + b[ 9] = ZERO; +#else + b[ 8] = *(a05 + 8); + b[ 9] = *(a05 + 9); +#endif + b[10] = ZERO; + b[11] = ZERO; + b[12] = ZERO; + b[13] = ZERO; + b[14] = ZERO; + b[15] = ZERO; + b += 16; + } + + if (i >= 6) { + b[ 0] = *(a01 + 10); + b[ 1] = 
*(a01 + 11); + b[ 2] = *(a02 + 10); + b[ 3] = *(a02 + 11); + b[ 4] = *(a03 + 10); + b[ 5] = *(a03 + 11); + b[ 6] = *(a04 + 10); + b[ 7] = *(a04 + 11); + b[ 8] = *(a05 + 10); + b[ 9] = *(a05 + 11); +#ifdef UNIT + b[10] = ONE; + b[11] = ZERO; +#else + b[10] = *(a06 + 10); + b[11] = *(a06 + 11); +#endif + b[12] = ZERO; + b[13] = ZERO; + b[14] = ZERO; + b[15] = ZERO; + b += 16; + } + + if (i >= 7) { + b[ 0] = *(a01 + 12); + b[ 1] = *(a01 + 13); + b[ 2] = *(a02 + 12); + b[ 3] = *(a02 + 13); + b[ 4] = *(a03 + 12); + b[ 5] = *(a03 + 13); + b[ 6] = *(a04 + 12); + b[ 7] = *(a04 + 13); + b[ 8] = *(a05 + 12); + b[ 9] = *(a05 + 13); + b[10] = *(a06 + 12); + b[11] = *(a06 + 13); +#ifdef UNIT + b[12] = ONE; + b[13] = ZERO; +#else + b[12] = *(a07 + 12); + b[13] = *(a07 + 13); +#endif + b[14] = ZERO; + b[15] = ZERO; + b += 16; + } + } + } + + posY += 8; + } + + + if (n & 4){ + X = posX; + + if (posX <= posY) { + a01 = a + posY * 2 + (posX + 0) * lda; + a02 = a + posY * 2 + (posX + 1) * lda; + a03 = a + posY * 2 + (posX + 2) * lda; + a04 = a + posY * 2 + (posX + 3) * lda; + } else { + a01 = a + posX * 2 + (posY + 0) * lda; + a02 = a + posX * 2 + (posY + 1) * lda; + a03 = a + posX * 2 + (posY + 2) * lda; + a04 = a + posX * 2 + (posY + 3) * lda; + } + + i = (m >> 2); + if (i > 0) { + do { + if (X > posY) { + for (ii = 0; ii < 4; ii++){ + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); + b[ 2] = *(a02 + 0); + b[ 3] = *(a02 + 1); + b[ 4] = *(a03 + 0); + b[ 5] = *(a03 + 1); + b[ 6] = *(a04 + 0); + b[ 7] = *(a04 + 1); + + a01 += 2; + a02 += 2; + a03 += 2; + a04 += 2; + b += 8; + } + } else + if (X < posY) { + a01 += 4 * lda; + a02 += 4 * lda; + a03 += 4 * lda; + a04 += 4 * lda; + b += 32; + } else { +#ifdef UNIT + b[ 0] = ONE; + b[ 1] = ZERO; +#else + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); +#endif + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + + b[ 8] = *(a01 + 2); + b[ 9] = *(a01 + 3); +#ifdef UNIT + b[ 10] = ONE; + b[ 11] = ZERO; +#else + 
b[ 10] = *(a02 + 2); + b[ 11] = *(a02 + 3); +#endif + b[ 12] = ZERO; + b[ 13] = ZERO; + b[ 14] = ZERO; + b[ 15] = ZERO; + + b[ 16] = *(a01 + 4); + b[ 17] = *(a01 + 5); + b[ 18] = *(a02 + 4); + b[ 19] = *(a02 + 5); +#ifdef UNIT + b[ 20] = ONE; + b[ 21] = ZERO; +#else + b[ 20] = *(a03 + 4); + b[ 21] = *(a03 + 5); +#endif + b[ 22] = ZERO; + b[ 23] = ZERO; + + b[ 24] = *(a01 + 6); + b[ 25] = *(a01 + 7); + b[ 26] = *(a02 + 6); + b[ 27] = *(a02 + 7); + b[ 28] = *(a03 + 6); + b[ 29] = *(a03 + 7); +#ifdef UNIT + b[ 30] = ONE; + b[ 31] = ZERO; +#else + b[ 30] = *(a04 + 6); + b[ 31] = *(a04 + 7); +#endif + + a01 += 8; + a02 += 8; + a03 += 8; + a04 += 8; + b += 32; + } + + X += 4; + i --; + } while (i > 0); + } + + i = (m & 3); + if (i) { + + if (X > posY) { + + for (ii = 0; ii < i; ii++){ + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); + b[ 2] = *(a02 + 0); + b[ 3] = *(a02 + 1); + b[ 4] = *(a03 + 0); + b[ 5] = *(a03 + 1); + b[ 6] = *(a04 + 0); + b[ 7] = *(a04 + 1); + + a01 += 2; + a02 += 2; + a03 += 2; + a04 += 2; + b += 8; + } + } else + if (X < posY) { + /* a01 += i * lda; + a02 += i * lda; + a03 += i * lda; + a04 += i * lda; */ + b += 8 * i; + } else { +#ifdef UNIT + b[ 0] = ONE; + b[ 1] = ZERO; +#else + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); +#endif + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b += 8; + + if (i >= 2) { + b[ 0] = *(a01 + 2); + b[ 1] = *(a01 + 3); +#ifdef UNIT + b[ 2] = ONE; + b[ 3] = ZERO; +#else + b[ 2] = *(a02 + 2); + b[ 3] = *(a02 + 3); +#endif + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b += 8; + } + + if (i >= 3) { + b[ 0] = *(a01 + 4); + b[ 1] = *(a01 + 5); + b[ 2] = *(a02 + 4); + b[ 3] = *(a02 + 5); +#ifdef UNIT + b[ 4] = ONE; + b[ 5] = ZERO; +#else + b[ 4] = *(a03 + 4); + b[ 5] = *(a03 + 5); +#endif + b[ 6] = ZERO; + b[ 7] = ZERO; + b += 8; + } + } + } + + posY += 4; + } + + if (n & 2){ + X = posX; + + if (posX <= posY) { + a01 = a + posY * 2 + (posX + 0) * lda; + a02 = a + 
posY * 2 + (posX + 1) * lda; + } else { + a01 = a + posX * 2 + (posY + 0) * lda; + a02 = a + posX * 2 + (posY + 1) * lda; + } + + i = (m >> 1); + if (i > 0) { + do { + if (X > posY) { + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); + b[ 2] = *(a02 + 0); + b[ 3] = *(a02 + 1); + b[ 4] = *(a01 + 2); + b[ 5] = *(a01 + 3); + b[ 6] = *(a02 + 2); + b[ 7] = *(a02 + 3); + + a01 += 4; + a02 += 4; + b += 8; + } else + if (X < posY) { + a01 += 2 * lda; + a02 += 2 * lda; + b += 8; + } else { +#ifdef UNIT + b[ 0] = ONE; + b[ 1] = ZERO; +#else + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); +#endif + b[ 2] = ZERO; + b[ 3] = ZERO; + + b[ 4] = *(a01 + 2); + b[ 5] = *(a01 + 3); +#ifdef UNIT + b[ 6] = ONE; + b[ 7] = ZERO; +#else + b[ 6] = *(a02 + 2); + b[ 7] = *(a02 + 3); +#endif + a01 += 4; + a02 += 4; + b += 8; + } + + X += 2; + i --; + } while (i > 0); + } + + if (m & 1) { + + if (X > posY) { + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); + b[ 2] = *(a02 + 0); + b[ 3] = *(a02 + 1); + /* a01 += 2; + a02 += 2; */ + b += 4; + } else + if (X < posY) { + /* a01 += 2 * lda; + a02 += 2 * lda; */ + b += 4; + } else { +#ifdef UNIT + b[ 0] = ONE; + b[ 1] = ZERO; +#else + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); +#endif + b[ 2] = ZERO; + b[ 3] = ZERO; + b += 4; + } + } + posY += 2; + } + + if (n & 1){ + X = posX; + + if (posX <= posY) { + a01 = a + posY * 2 + (posX + 0) * lda; + } else { + a01 = a + posX * 2 + (posY + 0) * lda; + } + + i = m; + if (m > 0) { + do { + if (X > posY) { + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); + a01 += 2; + b += 2; + } else + if (X < posY) { + a01 += lda; + b += 2; + } else { +#ifdef UNIT + b[ 0] = ONE; + b[ 1] = ZERO; +#else + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); +#endif + a01 += 2; + b += 2; + } + + X += 1; + i --; + } while (i > 0); + } + } + + return 0; +} diff --git a/kernel/generic/ztrmm_ltcopy_16.c b/kernel/generic/ztrmm_ltcopy_16.c new file mode 100644 index 000000000..8d585e70b --- /dev/null +++ b/kernel/generic/ztrmm_ltcopy_16.c @@ -0,0 +1,2313 @@ 
+/******************************************************************************* +Copyright (c) 2024, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*******************************************************************************/ + +#include <stdio.h> +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, js; + BLASLONG X, ii; + + FLOAT *a01, *a02, *a03, *a04, *a05, *a06, *a07, *a08; + FLOAT *a09, *a10, *a11, *a12, *a13, *a14, *a15, *a16; + + lda += lda; + + js = (n >> 4); + + if (js > 0){ + do { + X = posX; + + if (posX <= posY) { + a01 = a + posY * 2 + (posX + 0) * lda; + a02 = a + posY * 2 + (posX + 1) * lda; + a03 = a + posY * 2 + (posX + 2) * lda; + a04 = a + posY * 2 + (posX + 3) * lda; + a05 = a + posY * 2 + (posX + 4) * lda; + a06 = a + posY * 2 + (posX + 5) * lda; + a07 = a + posY * 2 + (posX + 6) * lda; + a08 = a + posY * 2 + (posX + 7) * lda; + a09 = a + posY * 2 + (posX + 8) * lda; + a10 = a + posY * 2 + (posX + 9) * lda; + a11 = a + posY * 2 + (posX + 10) * lda; + a12 = a + posY * 2 + (posX + 11) * lda; + a13 = a + posY * 2 + (posX + 12) * lda; + a14 = a + posY * 2 + (posX + 13) * lda; + a15 = a + posY * 2 + (posX + 14) * lda; + a16 = a + posY * 2 + (posX + 15) * lda; + } else { + a01 = a + posX * 2 + (posY + 0) * lda; + a02 = a + posX * 2 + (posY + 1) * lda; + a03 = a + posX * 2 + (posY + 2) * lda; + a04 = a + posX * 2 + (posY + 3) * lda; + a05 = a + posX * 2 + (posY + 4) * lda; + a06 = a + posX * 2 + (posY + 5) * lda; + a07 = a + posX * 2 + (posY + 6) * lda; + a08 = a + posX * 2 + (posY + 7) * lda; + a09 = a + posX * 2 + (posY + 8) * lda; + a10 = a + posX * 2 + (posY + 9) * lda; + a11 = a + posX * 2 + (posY + 10) * lda; + a12 = a + posX * 2 + (posY + 11) * lda; + a13 = a + posX * 2 + (posY + 12) * lda; + a14 = a + posX * 2 + (posY + 13) * lda; + a15 = a + posX * 2 + (posY + 14) * lda; + a16 = a + posX * 2 + (posY + 15) * lda; + } + + i = (m >> 4); + if (i > 0) { + do { + if (X > posY) { + a01 += 32; + a02 += 32; + a03 += 32; + a04 += 32; + a05 += 32; + a06 += 32; + a07 += 32; + a08 += 32; + a09 += 32; + a10 += 32; + a11 += 
32; + a12 += 32; + a13 += 32; + a14 += 32; + a15 += 32; + a16 += 32; + b += 512; + } else + if (X < posY) { + for (ii = 0; ii < 16; ii++){ + + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); + b[ 2] = *(a01 + 2); + b[ 3] = *(a01 + 3); + b[ 4] = *(a01 + 4); + b[ 5] = *(a01 + 5); + b[ 6] = *(a01 + 6); + b[ 7] = *(a01 + 7); + + b[ 8] = *(a01 + 8); + b[ 9] = *(a01 + 9); + b[ 10] = *(a01 + 10); + b[ 11] = *(a01 + 11); + b[ 12] = *(a01 + 12); + b[ 13] = *(a01 + 13); + b[ 14] = *(a01 + 14); + b[ 15] = *(a01 + 15); + + b[ 16] = *(a01 + 16); + b[ 17] = *(a01 + 17); + b[ 18] = *(a01 + 18); + b[ 19] = *(a01 + 19); + b[ 20] = *(a01 + 20); + b[ 21] = *(a01 + 21); + b[ 22] = *(a01 + 22); + b[ 23] = *(a01 + 23); + + b[ 24] = *(a01 + 24); + b[ 25] = *(a01 + 25); + b[ 26] = *(a01 + 26); + b[ 27] = *(a01 + 27); + b[ 28] = *(a01 + 28); + b[ 29] = *(a01 + 29); + b[ 30] = *(a01 + 30); + b[ 31] = *(a01 + 31); + + a01 += lda; + b += 32; + } + a02 += 16 * lda; + a03 += 16 * lda; + a04 += 16 * lda; + a05 += 16 * lda; + a06 += 16 * lda; + a07 += 16 * lda; + a08 += 16 * lda; + a09 += 16 * lda; + a10 += 16 * lda; + a11 += 16 * lda; + a12 += 16 * lda; + a13 += 16 * lda; + a14 += 16 * lda; + a15 += 16 * lda; + a16 += 16 * lda; + + } else { +#ifdef UNIT + b[ 0] = ONE; + b[ 1] = ZERO; +#else + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); +#endif + b[ 2] = *(a01 + 2); + b[ 3] = *(a01 + 3); + b[ 4] = *(a01 + 4); + b[ 5] = *(a01 + 5); + b[ 6] = *(a01 + 6); + b[ 7] = *(a01 + 7); + b[ 8] = *(a01 + 8); + b[ 9] = *(a01 + 9); + b[ 10] = *(a01 + 10); + b[ 11] = *(a01 + 11); + b[ 12] = *(a01 + 12); + b[ 13] = *(a01 + 13); + b[ 14] = *(a01 + 14); + b[ 15] = *(a01 + 15); + b[ 16] = *(a01 + 16); + b[ 17] = *(a01 + 17); + b[ 18] = *(a01 + 18); + b[ 19] = *(a01 + 19); + b[ 20] = *(a01 + 20); + b[ 21] = *(a01 + 21); + b[ 22] = *(a01 + 22); + b[ 23] = *(a01 + 23); + b[ 24] = *(a01 + 24); + b[ 25] = *(a01 + 25); + b[ 26] = *(a01 + 26); + b[ 27] = *(a01 + 27); + b[ 28] = *(a01 + 28); + b[ 29] = *(a01 + 29); + b[ 30] = *(a01 
+ 30); + b[ 31] = *(a01 + 31); + + b[ 32] = ZERO; + b[ 33] = ZERO; +#ifdef UNIT + b[ 34] = ONE; + b[ 35] = ZERO; +#else + b[ 34] = *(a02 + 2); + b[ 35] = *(a02 + 3); +#endif + b[ 36] = *(a02 + 4); + b[ 37] = *(a02 + 5); + b[ 38] = *(a02 + 6); + b[ 39] = *(a02 + 7); + b[ 40] = *(a02 + 8); + b[ 41] = *(a02 + 9); + b[ 42] = *(a02 + 10); + b[ 43] = *(a02 + 11); + b[ 44] = *(a02 + 12); + b[ 45] = *(a02 + 13); + b[ 46] = *(a02 + 14); + b[ 47] = *(a02 + 15); + b[ 48] = *(a02 + 16); + b[ 49] = *(a02 + 17); + b[ 50] = *(a02 + 18); + b[ 51] = *(a02 + 19); + b[ 52] = *(a02 + 20); + b[ 53] = *(a02 + 21); + b[ 54] = *(a02 + 22); + b[ 55] = *(a02 + 23); + b[ 56] = *(a02 + 24); + b[ 57] = *(a02 + 25); + b[ 58] = *(a02 + 26); + b[ 59] = *(a02 + 27); + b[ 60] = *(a02 + 28); + b[ 61] = *(a02 + 29); + b[ 62] = *(a02 + 30); + b[ 63] = *(a02 + 31); + + b[ 64] = ZERO; + b[ 65] = ZERO; + b[ 66] = ZERO; + b[ 67] = ZERO; +#ifdef UNIT + b[ 68] = ONE; + b[ 69] = ZERO; +#else + b[ 68] = *(a03 + 4); + b[ 69] = *(a03 + 5); +#endif + b[ 70] = *(a03 + 6); + b[ 71] = *(a03 + 7); + b[ 72] = *(a03 + 8); + b[ 73] = *(a03 + 9); + b[ 74] = *(a03 + 10); + b[ 75] = *(a03 + 11); + b[ 76] = *(a03 + 12); + b[ 77] = *(a03 + 13); + b[ 78] = *(a03 + 14); + b[ 79] = *(a03 + 15); + b[ 80] = *(a03 + 16); + b[ 81] = *(a03 + 17); + b[ 82] = *(a03 + 18); + b[ 83] = *(a03 + 19); + b[ 84] = *(a03 + 20); + b[ 85] = *(a03 + 21); + b[ 86] = *(a03 + 22); + b[ 87] = *(a03 + 23); + b[ 88] = *(a03 + 24); + b[ 89] = *(a03 + 25); + b[ 90] = *(a03 + 26); + b[ 91] = *(a03 + 27); + b[ 92] = *(a03 + 28); + b[ 93] = *(a03 + 29); + b[ 94] = *(a03 + 30); + b[ 95] = *(a03 + 31); + + b[ 96] = ZERO; + b[ 97] = ZERO; + b[ 98] = ZERO; + b[ 99] = ZERO; + b[100] = ZERO; + b[101] = ZERO; +#ifdef UNIT + b[102] = ONE; + b[103] = ZERO; +#else + b[102] = *(a04 + 6); + b[103] = *(a04 + 7); +#endif + b[104] = *(a04 + 8); + b[105] = *(a04 + 9); + b[106] = *(a04 + 10); + b[107] = *(a04 + 11); + b[108] = *(a04 + 12); + b[109] = *(a04 + 13); + b[110] 
= *(a04 + 14); + b[111] = *(a04 + 15); + b[112] = *(a04 + 16); + b[113] = *(a04 + 17); + b[114] = *(a04 + 18); + b[115] = *(a04 + 19); + b[116] = *(a04 + 20); + b[117] = *(a04 + 21); + b[118] = *(a04 + 22); + b[119] = *(a04 + 23); + b[120] = *(a04 + 24); + b[121] = *(a04 + 25); + b[122] = *(a04 + 26); + b[123] = *(a04 + 27); + b[124] = *(a04 + 28); + b[125] = *(a04 + 29); + b[126] = *(a04 + 30); + b[127] = *(a04 + 31); + + b[128] = ZERO; + b[129] = ZERO; + b[130] = ZERO; + b[131] = ZERO; + b[132] = ZERO; + b[133] = ZERO; + b[134] = ZERO; + b[135] = ZERO; +#ifdef UNIT + b[136] = ONE; + b[137] = ZERO; +#else + b[136] = *(a05 + 8); + b[137] = *(a05 + 9); +#endif + b[138] = *(a05 + 10); + b[139] = *(a05 + 11); + b[140] = *(a05 + 12); + b[141] = *(a05 + 13); + b[142] = *(a05 + 14); + b[143] = *(a05 + 15); + b[144] = *(a05 + 16); + b[145] = *(a05 + 17); + b[146] = *(a05 + 18); + b[147] = *(a05 + 19); + b[148] = *(a05 + 20); + b[149] = *(a05 + 21); + b[150] = *(a05 + 22); + b[151] = *(a05 + 23); + b[152] = *(a05 + 24); + b[153] = *(a05 + 25); + b[154] = *(a05 + 26); + b[155] = *(a05 + 27); + b[156] = *(a05 + 28); + b[157] = *(a05 + 29); + b[158] = *(a05 + 30); + b[159] = *(a05 + 31); + + b[160] = ZERO; + b[161] = ZERO; + b[162] = ZERO; + b[163] = ZERO; + b[164] = ZERO; + b[165] = ZERO; + b[166] = ZERO; + b[167] = ZERO; + b[168] = ZERO; + b[169] = ZERO; +#ifdef UNIT + b[170] = ONE; + b[171] = ZERO; +#else + b[170] = *(a06 + 10); + b[171] = *(a06 + 11); +#endif + b[172] = *(a06 + 12); + b[173] = *(a06 + 13); + b[174] = *(a06 + 14); + b[175] = *(a06 + 15); + b[176] = *(a06 + 16); + b[177] = *(a06 + 17); + b[178] = *(a06 + 18); + b[179] = *(a06 + 19); + b[180] = *(a06 + 20); + b[181] = *(a06 + 21); + b[182] = *(a06 + 22); + b[183] = *(a06 + 23); + b[184] = *(a06 + 24); + b[185] = *(a06 + 25); + b[186] = *(a06 + 26); + b[187] = *(a06 + 27); + b[188] = *(a06 + 28); + b[189] = *(a06 + 29); + b[190] = *(a06 + 30); + b[191] = *(a06 + 31); + + b[192] = ZERO; + b[193] = ZERO; + 
b[194] = ZERO; + b[195] = ZERO; + b[196] = ZERO; + b[197] = ZERO; + b[198] = ZERO; + b[199] = ZERO; + b[200] = ZERO; + b[201] = ZERO; + b[202] = ZERO; + b[203] = ZERO; +#ifdef UNIT + b[204] = ONE; + b[205] = ZERO; +#else + b[204] = *(a07 + 12); + b[205] = *(a07 + 13); +#endif + b[206] = *(a07 + 14); + b[207] = *(a07 + 15); + b[208] = *(a07 + 16); + b[209] = *(a07 + 17); + b[210] = *(a07 + 18); + b[211] = *(a07 + 19); + b[212] = *(a07 + 20); + b[213] = *(a07 + 21); + b[214] = *(a07 + 22); + b[215] = *(a07 + 23); + b[216] = *(a07 + 24); + b[217] = *(a07 + 25); + b[218] = *(a07 + 26); + b[219] = *(a07 + 27); + b[220] = *(a07 + 28); + b[221] = *(a07 + 29); + b[222] = *(a07 + 30); + b[223] = *(a07 + 31); + + b[224] = ZERO; + b[225] = ZERO; + b[226] = ZERO; + b[227] = ZERO; + b[228] = ZERO; + b[229] = ZERO; + b[230] = ZERO; + b[231] = ZERO; + b[232] = ZERO; + b[233] = ZERO; + b[234] = ZERO; + b[235] = ZERO; + b[236] = ZERO; + b[237] = ZERO; +#ifdef UNIT + b[238] = ONE; + b[239] = ZERO; +#else + b[238] = *(a08 + 14); + b[239] = *(a08 + 15); +#endif + b[240] = *(a08 + 16); + b[241] = *(a08 + 17); + b[242] = *(a08 + 18); + b[243] = *(a08 + 19); + b[244] = *(a08 + 20); + b[245] = *(a08 + 21); + b[246] = *(a08 + 22); + b[247] = *(a08 + 23); + b[248] = *(a08 + 24); + b[249] = *(a08 + 25); + b[250] = *(a08 + 26); + b[251] = *(a08 + 27); + b[252] = *(a08 + 28); + b[253] = *(a08 + 29); + b[254] = *(a08 + 30); + b[255] = *(a08 + 31); + + b[256] = ZERO; + b[257] = ZERO; + b[258] = ZERO; + b[259] = ZERO; + b[260] = ZERO; + b[261] = ZERO; + b[262] = ZERO; + b[263] = ZERO; + b[264] = ZERO; + b[265] = ZERO; + b[266] = ZERO; + b[267] = ZERO; + b[268] = ZERO; + b[269] = ZERO; + b[270] = ZERO; + b[271] = ZERO; +#ifdef UNIT + b[272] = ONE; + b[273] = ZERO; +#else + b[272] = *(a09 + 16); + b[273] = *(a09 + 17); +#endif + b[274] = *(a09 + 18); + b[275] = *(a09 + 19); + b[276] = *(a09 + 20); + b[277] = *(a09 + 21); + b[278] = *(a09 + 22); + b[279] = *(a09 + 23); + b[280] = *(a09 + 24); + 
b[281] = *(a09 + 25); + b[282] = *(a09 + 26); + b[283] = *(a09 + 27); + b[284] = *(a09 + 28); + b[285] = *(a09 + 29); + b[286] = *(a09 + 30); + b[287] = *(a09 + 31); + + b[288] = ZERO; + b[289] = ZERO; + b[290] = ZERO; + b[291] = ZERO; + b[292] = ZERO; + b[293] = ZERO; + b[294] = ZERO; + b[295] = ZERO; + b[296] = ZERO; + b[297] = ZERO; + b[298] = ZERO; + b[299] = ZERO; + b[300] = ZERO; + b[301] = ZERO; + b[302] = ZERO; + b[303] = ZERO; + b[304] = ZERO; + b[305] = ZERO; +#ifdef UNIT + b[306] = ONE; + b[307] = ZERO; +#else + b[306] = *(a10 + 18); + b[307] = *(a10 + 19); +#endif + b[308] = *(a10 + 20); + b[309] = *(a10 + 21); + b[310] = *(a10 + 22); + b[311] = *(a10 + 23); + b[312] = *(a10 + 24); + b[313] = *(a10 + 25); + b[314] = *(a10 + 26); + b[315] = *(a10 + 27); + b[316] = *(a10 + 28); + b[317] = *(a10 + 29); + b[318] = *(a10 + 30); + b[319] = *(a10 + 31); + + b[320] = ZERO; + b[321] = ZERO; + b[322] = ZERO; + b[323] = ZERO; + b[324] = ZERO; + b[325] = ZERO; + b[326] = ZERO; + b[327] = ZERO; + b[328] = ZERO; + b[329] = ZERO; + b[330] = ZERO; + b[331] = ZERO; + b[332] = ZERO; + b[333] = ZERO; + b[334] = ZERO; + b[335] = ZERO; + b[336] = ZERO; + b[337] = ZERO; + b[338] = ZERO; + b[339] = ZERO; +#ifdef UNIT + b[340] = ONE; + b[341] = ZERO; +#else + b[340] = *(a11 + 20); + b[341] = *(a11 + 21); +#endif + b[342] = *(a11 + 22); + b[343] = *(a11 + 23); + b[344] = *(a11 + 24); + b[345] = *(a11 + 25); + b[346] = *(a11 + 26); + b[347] = *(a11 + 27); + b[348] = *(a11 + 28); + b[349] = *(a11 + 29); + b[350] = *(a11 + 30); + b[351] = *(a11 + 31); + + b[352] = ZERO; + b[353] = ZERO; + b[354] = ZERO; + b[355] = ZERO; + b[356] = ZERO; + b[357] = ZERO; + b[358] = ZERO; + b[359] = ZERO; + b[360] = ZERO; + b[361] = ZERO; + b[362] = ZERO; + b[363] = ZERO; + b[364] = ZERO; + b[365] = ZERO; + b[366] = ZERO; + b[367] = ZERO; + b[368] = ZERO; + b[369] = ZERO; + b[370] = ZERO; + b[371] = ZERO; + b[372] = ZERO; + b[373] = ZERO; +#ifdef UNIT + b[374] = ONE; + b[375] = ZERO; +#else + b[374] 
= *(a12 + 22); + b[375] = *(a12 + 23); +#endif + b[376] = *(a12 + 24); + b[377] = *(a12 + 25); + b[378] = *(a12 + 26); + b[379] = *(a12 + 27); + b[380] = *(a12 + 28); + b[381] = *(a12 + 29); + b[382] = *(a12 + 30); + b[383] = *(a12 + 31); + + b[384] = ZERO; + b[385] = ZERO; + b[386] = ZERO; + b[387] = ZERO; + b[388] = ZERO; + b[389] = ZERO; + b[390] = ZERO; + b[391] = ZERO; + b[392] = ZERO; + b[393] = ZERO; + b[394] = ZERO; + b[395] = ZERO; + b[396] = ZERO; + b[397] = ZERO; + b[398] = ZERO; + b[399] = ZERO; + b[400] = ZERO; + b[401] = ZERO; + b[402] = ZERO; + b[403] = ZERO; + b[404] = ZERO; + b[405] = ZERO; + b[406] = ZERO; + b[407] = ZERO; +#ifdef UNIT + b[408] = ONE; + b[409] = ZERO; +#else + b[408] = *(a13 + 24); + b[409] = *(a13 + 25); +#endif + b[410] = *(a13 + 26); + b[411] = *(a13 + 27); + b[412] = *(a13 + 28); + b[413] = *(a13 + 29); + b[414] = *(a13 + 30); + b[415] = *(a13 + 31); + + b[416] = ZERO; + b[417] = ZERO; + b[418] = ZERO; + b[419] = ZERO; + b[420] = ZERO; + b[421] = ZERO; + b[422] = ZERO; + b[423] = ZERO; + b[424] = ZERO; + b[425] = ZERO; + b[426] = ZERO; + b[427] = ZERO; + b[428] = ZERO; + b[429] = ZERO; + b[430] = ZERO; + b[431] = ZERO; + b[432] = ZERO; + b[433] = ZERO; + b[434] = ZERO; + b[435] = ZERO; + b[436] = ZERO; + b[437] = ZERO; + b[438] = ZERO; + b[439] = ZERO; + b[440] = ZERO; + b[441] = ZERO; +#ifdef UNIT + b[442] = ONE; + b[443] = ZERO; +#else + b[442] = *(a14 + 26); + b[443] = *(a14 + 27); +#endif + b[444] = *(a14 + 28); + b[445] = *(a14 + 29); + b[446] = *(a14 + 30); + b[447] = *(a14 + 31); + + b[448] = ZERO; + b[449] = ZERO; + b[450] = ZERO; + b[451] = ZERO; + b[452] = ZERO; + b[453] = ZERO; + b[454] = ZERO; + b[455] = ZERO; + b[456] = ZERO; + b[457] = ZERO; + b[458] = ZERO; + b[459] = ZERO; + b[460] = ZERO; + b[461] = ZERO; + b[462] = ZERO; + b[463] = ZERO; + b[464] = ZERO; + b[465] = ZERO; + b[466] = ZERO; + b[467] = ZERO; + b[468] = ZERO; + b[469] = ZERO; + b[470] = ZERO; + b[471] = ZERO; + b[472] = ZERO; + b[473] = ZERO; + 
b[474] = ZERO; + b[475] = ZERO; +#ifdef UNIT + b[476] = ONE; + b[477] = ZERO; +#else + b[476] = *(a15 + 28); + b[477] = *(a15 + 29); +#endif + b[478] = *(a15 + 30); + b[479] = *(a15 + 31); + + b[480] = ZERO; + b[481] = ZERO; + b[482] = ZERO; + b[483] = ZERO; + b[484] = ZERO; + b[485] = ZERO; + b[486] = ZERO; + b[487] = ZERO; + b[488] = ZERO; + b[489] = ZERO; + b[490] = ZERO; + b[491] = ZERO; + b[492] = ZERO; + b[493] = ZERO; + b[494] = ZERO; + b[495] = ZERO; + b[496] = ZERO; + b[497] = ZERO; + b[498] = ZERO; + b[499] = ZERO; + b[500] = ZERO; + b[501] = ZERO; + b[502] = ZERO; + b[503] = ZERO; + b[504] = ZERO; + b[505] = ZERO; + b[506] = ZERO; + b[507] = ZERO; + b[508] = ZERO; + b[509] = ZERO; +#ifdef UNIT + b[510] = ONE; + b[511] = ZERO; +#else + b[510] = *(a16 + 30); + b[511] = *(a16 + 31); +#endif + + a01 += 32; + a02 += 32; + a03 += 32; + a04 += 32; + a05 += 32; + a06 += 32; + a07 += 32; + a08 += 32; + a09 += 32; + a10 += 32; + a11 += 32; + a12 += 32; + a13 += 32; + a14 += 32; + a15 += 32; + a16 += 32; + b += 512; + } + + X += 16; + i --; + } while (i > 0); + } + + i = (m & 15); + if (i) { + + if (X > posY) { + /* a01 += i * lda; + a02 += i * lda; + a03 += i * lda; + a04 += i * lda; + a05 += i * lda; + a06 += i * lda; + a07 += i * lda; + a08 += i * lda; + a09 += i * lda; + a10 += i * lda; + a11 += i * lda; + a12 += i * lda; + a13 += i * lda; + a14 += i * lda; + a15 += i * lda; + a16 += i * lda; */ + b += 32 * i; + } else + if (X < posY) { + for (ii = 0; ii < i; ii++){ + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); + b[ 2] = *(a01 + 2); + b[ 3] = *(a01 + 3); + b[ 4] = *(a01 + 4); + b[ 5] = *(a01 + 5); + b[ 6] = *(a01 + 6); + b[ 7] = *(a01 + 7); + b[ 8] = *(a01 + 8); + b[ 9] = *(a01 + 9); + b[ 10] = *(a01 + 10); + b[ 11] = *(a01 + 11); + b[ 12] = *(a01 + 12); + b[ 13] = *(a01 + 13); + b[ 14] = *(a01 + 14); + b[ 15] = *(a01 + 15); + + b[ 16] = *(a01 + 16); + b[ 17] = *(a01 + 17); + b[ 18] = *(a01 + 18); + b[ 19] = *(a01 + 19); + b[ 20] = *(a01 + 20); + b[ 21] = *(a01 + 
21); + b[ 22] = *(a01 + 22); + b[ 23] = *(a01 + 23); + b[ 24] = *(a01 + 24); + b[ 25] = *(a01 + 25); + b[ 26] = *(a01 + 26); + b[ 27] = *(a01 + 27); + b[ 28] = *(a01 + 28); + b[ 29] = *(a01 + 29); + b[ 30] = *(a01 + 30); + b[ 31] = *(a01 + 31); + + a01 += lda; + a02 += lda; + a03 += lda; + a04 += lda; + a05 += lda; + a06 += lda; + a07 += lda; + a08 += lda; + a09 += lda; + a10 += lda; + a11 += lda; + a12 += lda; + a13 += lda; + a14 += lda; + a15 += lda; + a16 += lda; + b += 32; + } + } else { +#ifdef UNIT + b[ 0] = ONE; + b[ 1] = ZERO; +#else + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); +#endif + b[ 2] = *(a01 + 2); + b[ 3] = *(a01 + 3); + b[ 4] = *(a01 + 4); + b[ 5] = *(a01 + 5); + b[ 6] = *(a01 + 6); + b[ 7] = *(a01 + 7); + b[ 8] = *(a01 + 8); + b[ 9] = *(a01 + 9); + b[ 10] = *(a01 + 10); + b[ 11] = *(a01 + 11); + b[ 12] = *(a01 + 12); + b[ 13] = *(a01 + 13); + b[ 14] = *(a01 + 14); + b[ 15] = *(a01 + 15); + b[ 16] = *(a01 + 16); + b[ 17] = *(a01 + 17); + b[ 18] = *(a01 + 18); + b[ 19] = *(a01 + 19); + b[ 20] = *(a01 + 20); + b[ 21] = *(a01 + 21); + b[ 22] = *(a01 + 22); + b[ 23] = *(a01 + 23); + b[ 24] = *(a01 + 24); + b[ 25] = *(a01 + 25); + b[ 26] = *(a01 + 26); + b[ 27] = *(a01 + 27); + b[ 28] = *(a01 + 28); + b[ 29] = *(a01 + 29); + b[ 30] = *(a01 + 30); + b[ 31] = *(a01 + 31); + b += 32; + + if (i >= 2) { + b[ 0] = ZERO; + b[ 1] = ZERO; +#ifdef UNIT + b[ 2] = ONE; + b[ 3] = ZERO; +#else + b[ 2] = *(a02 + 2); + b[ 3] = *(a02 + 3); +#endif + b[ 4] = *(a02 + 4); + b[ 5] = *(a02 + 5); + b[ 6] = *(a02 + 6); + b[ 7] = *(a02 + 7); + b[ 8] = *(a02 + 8); + b[ 9] = *(a02 + 9); + b[ 10] = *(a02 + 10); + b[ 11] = *(a02 + 11); + b[ 12] = *(a02 + 12); + b[ 13] = *(a02 + 13); + b[ 14] = *(a02 + 14); + b[ 15] = *(a02 + 15); + b[ 16] = *(a02 + 16); + b[ 17] = *(a02 + 17); + b[ 18] = *(a02 + 18); + b[ 19] = *(a02 + 19); + b[ 20] = *(a02 + 20); + b[ 21] = *(a02 + 21); + b[ 22] = *(a02 + 22); + b[ 23] = *(a02 + 23); + b[ 24] = *(a02 + 24); + b[ 25] = *(a02 + 25); + b[ 26] = 
*(a02 + 26); + b[ 27] = *(a02 + 27); + b[ 28] = *(a02 + 28); + b[ 29] = *(a02 + 29); + b[ 30] = *(a02 + 30); + b[ 31] = *(a02 + 31); + b += 32; + } + + if (i >= 3) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; +#ifdef UNIT + b[ 4] = ONE; + b[ 5] = ZERO; +#else + b[ 4] = *(a03 + 4); + b[ 5] = *(a03 + 5); +#endif + b[ 6] = *(a03 + 6); + b[ 7] = *(a03 + 7); + b[ 8] = *(a03 + 8); + b[ 9] = *(a03 + 9); + b[ 10] = *(a03 + 10); + b[ 11] = *(a03 + 11); + b[ 12] = *(a03 + 12); + b[ 13] = *(a03 + 13); + b[ 14] = *(a03 + 14); + b[ 15] = *(a03 + 15); + b[ 16] = *(a03 + 16); + b[ 17] = *(a03 + 17); + b[ 18] = *(a03 + 18); + b[ 19] = *(a03 + 19); + b[ 20] = *(a03 + 20); + b[ 21] = *(a03 + 21); + b[ 22] = *(a03 + 22); + b[ 23] = *(a03 + 23); + b[ 24] = *(a03 + 24); + b[ 25] = *(a03 + 25); + b[ 26] = *(a03 + 26); + b[ 27] = *(a03 + 27); + b[ 28] = *(a03 + 28); + b[ 29] = *(a03 + 29); + b[ 30] = *(a03 + 30); + b[ 31] = *(a03 + 31); + b += 32; + } + + if (i >= 4) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; +#ifdef UNIT + b[ 6] = ONE; + b[ 7] = ZERO; +#else + b[ 6] = *(a04 + 6); + b[ 7] = *(a04 + 7); +#endif + b[ 8] = *(a04 + 8); + b[ 9] = *(a04 + 9); + b[ 10] = *(a04 + 10); + b[ 11] = *(a04 + 11); + b[ 12] = *(a04 + 12); + b[ 13] = *(a04 + 13); + b[ 14] = *(a04 + 14); + b[ 15] = *(a04 + 15); + b[ 16] = *(a04 + 16); + b[ 17] = *(a04 + 17); + b[ 18] = *(a04 + 18); + b[ 19] = *(a04 + 19); + b[ 20] = *(a04 + 20); + b[ 21] = *(a04 + 21); + b[ 22] = *(a04 + 22); + b[ 23] = *(a04 + 23); + b[ 24] = *(a04 + 24); + b[ 25] = *(a04 + 25); + b[ 26] = *(a04 + 26); + b[ 27] = *(a04 + 27); + b[ 28] = *(a04 + 28); + b[ 29] = *(a04 + 29); + b[ 30] = *(a04 + 30); + b[ 31] = *(a04 + 31); + b += 32; + } + + if (i >= 5) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; +#ifdef UNIT + b[ 8] = ONE; + b[ 9] = ZERO; +#else + b[ 8] = *(a05 + 8); + b[ 9] = 
*(a05 + 9); +#endif + b[ 10] = *(a05 + 10); + b[ 11] = *(a05 + 11); + b[ 12] = *(a05 + 12); + b[ 13] = *(a05 + 13); + b[ 14] = *(a05 + 14); + b[ 15] = *(a05 + 15); + b[ 16] = *(a05 + 16); + b[ 17] = *(a05 + 17); + b[ 18] = *(a05 + 18); + b[ 19] = *(a05 + 19); + b[ 20] = *(a05 + 20); + b[ 21] = *(a05 + 21); + b[ 22] = *(a05 + 22); + b[ 23] = *(a05 + 23); + b[ 24] = *(a05 + 24); + b[ 25] = *(a05 + 25); + b[ 26] = *(a05 + 26); + b[ 27] = *(a05 + 27); + b[ 28] = *(a05 + 28); + b[ 29] = *(a05 + 29); + b[ 30] = *(a05 + 30); + b[ 31] = *(a05 + 31); + b += 32; + } + + if (i >= 6) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b[ 8] = ZERO; + b[ 9] = ZERO; +#ifdef UNIT + b[10] = ONE; + b[11] = ZERO; +#else + b[10] = *(a06 + 10); + b[11] = *(a06 + 11); +#endif + b[ 12] = *(a06 + 12); + b[ 13] = *(a06 + 13); + b[ 14] = *(a06 + 14); + b[ 15] = *(a06 + 15); + b[ 16] = *(a06 + 16); + b[ 17] = *(a06 + 17); + b[ 18] = *(a06 + 18); + b[ 19] = *(a06 + 19); + b[ 20] = *(a06 + 20); + b[ 21] = *(a06 + 21); + b[ 22] = *(a06 + 22); + b[ 23] = *(a06 + 23); + b[ 24] = *(a06 + 24); + b[ 25] = *(a06 + 25); + b[ 26] = *(a06 + 26); + b[ 27] = *(a06 + 27); + b[ 28] = *(a06 + 28); + b[ 29] = *(a06 + 29); + b[ 30] = *(a06 + 30); + b[ 31] = *(a06 + 31); + b += 32; + } + + if (i >= 7) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b[ 8] = ZERO; + b[ 9] = ZERO; + b[10] = ZERO; + b[11] = ZERO; +#ifdef UNIT + b[12] = ONE; + b[13] = ZERO; +#else + b[12] = *(a07 + 12); + b[13] = *(a07 + 13); +#endif + b[ 14] = *(a07 + 14); + b[ 15] = *(a07 + 15); + b[ 16] = *(a07 + 16); + b[ 17] = *(a07 + 17); + b[ 18] = *(a07 + 18); + b[ 19] = *(a07 + 19); + b[ 20] = *(a07 + 20); + b[ 21] = *(a07 + 21); + b[ 22] = *(a07 + 22); + b[ 23] = *(a07 + 23); + b[ 24] = *(a07 + 24); + b[ 25] = *(a07 + 25); + b[ 26] = *(a07 + 26); + b[ 27] = *(a07 + 27); + b[ 
28] = *(a07 + 28); + b[ 29] = *(a07 + 29); + b[ 30] = *(a07 + 30); + b[ 31] = *(a07 + 31); + b += 32; + } + + if (i >= 8) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b[ 8] = ZERO; + b[ 9] = ZERO; + b[ 10] = ZERO; + b[ 11] = ZERO; + b[ 12] = ZERO; + b[ 13] = ZERO; +#ifdef UNIT + b[ 14] = ONE; + b[ 15] = ZERO; +#else + b[ 14] = *(a08 + 14); + b[ 15] = *(a08 + 15); +#endif + b[ 16] = *(a08 + 16); + b[ 17] = *(a08 + 17); + b[ 18] = *(a08 + 18); + b[ 19] = *(a08 + 19); + b[ 20] = *(a08 + 20); + b[ 21] = *(a08 + 21); + b[ 22] = *(a08 + 22); + b[ 23] = *(a08 + 23); + b[ 24] = *(a08 + 24); + b[ 25] = *(a08 + 25); + b[ 26] = *(a08 + 26); + b[ 27] = *(a08 + 27); + b[ 28] = *(a08 + 28); + b[ 29] = *(a08 + 29); + b[ 30] = *(a08 + 30); + b[ 31] = *(a08 + 31); + b += 32; + } + + if (i >= 9) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b[ 8] = ZERO; + b[ 9] = ZERO; + b[ 10] = ZERO; + b[ 11] = ZERO; + b[ 12] = ZERO; + b[ 13] = ZERO; + b[ 14] = ZERO; + b[ 15] = ZERO; +#ifdef UNIT + b[ 16] = ONE; + b[ 17] = ZERO; +#else + b[ 16] = *(a09 + 16); + b[ 17] = *(a09 + 17); +#endif + b[ 18] = *(a09 + 18); + b[ 19] = *(a09 + 19); + b[ 20] = *(a09 + 20); + b[ 21] = *(a09 + 21); + b[ 22] = *(a09 + 22); + b[ 23] = *(a09 + 23); + b[ 24] = *(a09 + 24); + b[ 25] = *(a09 + 25); + b[ 26] = *(a09 + 26); + b[ 27] = *(a09 + 27); + b[ 28] = *(a09 + 28); + b[ 29] = *(a09 + 29); + b[ 30] = *(a09 + 30); + b[ 31] = *(a09 + 31); + b += 32; + } + + if (i >= 10) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b[ 8] = ZERO; + b[ 9] = ZERO; + b[ 10] = ZERO; + b[ 11] = ZERO; + b[ 12] = ZERO; + b[ 13] = ZERO; + b[ 14] = ZERO; + b[ 15] = ZERO; + b[ 16] = ZERO; + b[ 17] = ZERO; +#ifdef UNIT + b[ 18] = ONE; + b[ 19] = ZERO; +#else + b[ 18] = *(a10 + 18); + b[ 
19] = *(a10 + 19); +#endif + b[ 20] = *(a10 + 20); + b[ 21] = *(a10 + 21); + b[ 22] = *(a10 + 22); + b[ 23] = *(a10 + 23); + b[ 24] = *(a10 + 24); + b[ 25] = *(a10 + 25); + b[ 26] = *(a10 + 26); + b[ 27] = *(a10 + 27); + b[ 28] = *(a10 + 28); + b[ 29] = *(a10 + 29); + b[ 30] = *(a10 + 30); + b[ 31] = *(a10 + 31); + b += 32; + } + + if (i >= 11) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b[ 8] = ZERO; + b[ 9] = ZERO; + b[ 10] = ZERO; + b[ 11] = ZERO; + b[ 12] = ZERO; + b[ 13] = ZERO; + b[ 14] = ZERO; + b[ 15] = ZERO; + b[ 16] = ZERO; + b[ 17] = ZERO; + b[ 18] = ZERO; + b[ 19] = ZERO; +#ifdef UNIT + b[ 20] = ONE; + b[ 21] = ZERO; +#else + b[ 20] = *(a11 + 20); + b[ 21] = *(a11 + 21); +#endif + b[ 22] = *(a11 + 22); + b[ 23] = *(a11 + 23); + b[ 24] = *(a11 + 24); + b[ 25] = *(a11 + 25); + b[ 26] = *(a11 + 26); + b[ 27] = *(a11 + 27); + b[ 28] = *(a11 + 28); + b[ 29] = *(a11 + 29); + b[ 30] = *(a11 + 30); + b[ 31] = *(a11 + 31); + b += 32; + } + + if (i >= 12) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b[ 8] = ZERO; + b[ 9] = ZERO; + b[ 10] = ZERO; + b[ 11] = ZERO; + b[ 12] = ZERO; + b[ 13] = ZERO; + b[ 14] = ZERO; + b[ 15] = ZERO; + b[ 16] = ZERO; + b[ 17] = ZERO; + b[ 18] = ZERO; + b[ 19] = ZERO; + b[ 20] = ZERO; + b[ 21] = ZERO; +#ifdef UNIT + b[ 22] = ONE; + b[ 23] = ZERO; +#else + b[ 22] = *(a12 + 22); + b[ 23] = *(a12 + 23); +#endif + b[ 24] = *(a12 + 24); + b[ 25] = *(a12 + 25); + b[ 26] = *(a12 + 26); + b[ 27] = *(a12 + 27); + b[ 28] = *(a12 + 28); + b[ 29] = *(a12 + 29); + b[ 30] = *(a12 + 30); + b[ 31] = *(a12 + 31); + b += 32; + } + + if (i >= 13) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b[ 8] = ZERO; + b[ 9] = ZERO; + b[ 10] = ZERO; + b[ 11] = ZERO; + b[ 12] = ZERO; + b[ 13] = ZERO; + b[ 14] = 
ZERO; + b[ 15] = ZERO; + b[ 16] = ZERO; + b[ 17] = ZERO; + b[ 18] = ZERO; + b[ 19] = ZERO; + b[ 20] = ZERO; + b[ 21] = ZERO; + b[ 22] = ZERO; + b[ 23] = ZERO; +#ifdef UNIT + b[ 24] = ONE; + b[ 25] = ZERO; +#else + b[ 24] = *(a13 + 24); + b[ 25] = *(a13 + 25); +#endif + b[ 26] = *(a13 + 26); + b[ 27] = *(a13 + 27); + b[ 28] = *(a13 + 28); + b[ 29] = *(a13 + 29); + b[ 30] = *(a13 + 30); + b[ 31] = *(a13 + 31); + b += 32; + } + + if (i >= 14) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b[ 8] = ZERO; + b[ 9] = ZERO; + b[ 10] = ZERO; + b[ 11] = ZERO; + b[ 12] = ZERO; + b[ 13] = ZERO; + b[ 14] = ZERO; + b[ 15] = ZERO; + b[ 16] = ZERO; + b[ 17] = ZERO; + b[ 18] = ZERO; + b[ 19] = ZERO; + b[ 20] = ZERO; + b[ 21] = ZERO; + b[ 22] = ZERO; + b[ 23] = ZERO; + b[ 24] = ZERO; + b[ 25] = ZERO; +#ifdef UNIT + b[ 26] = ONE; + b[ 27] = ZERO; +#else + b[ 26] = *(a14 + 26); + b[ 27] = *(a14 + 27); +#endif + b[ 28] = *(a14 + 28); + b[ 29] = *(a14 + 29); + b[ 30] = *(a14 + 30); + b[ 31] = *(a14 + 31); + b += 32; + } + + if (i >= 15) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b[ 8] = ZERO; + b[ 9] = ZERO; + b[ 10] = ZERO; + b[ 11] = ZERO; + b[ 12] = ZERO; + b[ 13] = ZERO; + b[ 14] = ZERO; + b[ 15] = ZERO; + b[ 16] = ZERO; + b[ 17] = ZERO; + b[ 18] = ZERO; + b[ 19] = ZERO; + b[ 20] = ZERO; + b[ 21] = ZERO; + b[ 22] = ZERO; + b[ 23] = ZERO; + b[ 24] = ZERO; + b[ 25] = ZERO; + b[ 26] = ZERO; + b[ 27] = ZERO; +#ifdef UNIT + b[ 28] = ONE; + b[ 29] = ZERO; +#else + b[ 28] = *(a15 + 28); + b[ 29] = *(a15 + 29); +#endif + b[ 30] = *(a15 + 30); + b[ 31] = *(a15 + 31); + b += 32; + } + } + } + + posY += 16; + js --; + } while (js > 0); + } /* End of main loop */ + + + if (n & 8){ + X = posX; + + if (posX <= posY) { + a01 = a + posY * 2 + (posX + 0) * lda; + a02 = a + posY * 2 + (posX + 1) * lda; + a03 = a + posY * 2 + (posX
+ 2) * lda; + a04 = a + posY * 2 + (posX + 3) * lda; + a05 = a + posY * 2 + (posX + 4) * lda; + a06 = a + posY * 2 + (posX + 5) * lda; + a07 = a + posY * 2 + (posX + 6) * lda; + a08 = a + posY * 2 + (posX + 7) * lda; + } else { + a01 = a + posX * 2 + (posY + 0) * lda; + a02 = a + posX * 2 + (posY + 1) * lda; + a03 = a + posX * 2 + (posY + 2) * lda; + a04 = a + posX * 2 + (posY + 3) * lda; + a05 = a + posX * 2 + (posY + 4) * lda; + a06 = a + posX * 2 + (posY + 5) * lda; + a07 = a + posX * 2 + (posY + 6) * lda; + a08 = a + posX * 2 + (posY + 7) * lda; + } + + i = (m >> 3); + if (i > 0) { + do { + if (X > posY) { + a01 += 16; + a02 += 16; + a03 += 16; + a04 += 16; + a05 += 16; + a06 += 16; + a07 += 16; + a08 += 16; + b += 128; + } else + if (X < posY) { + for (ii = 0; ii < 8; ii++){ + + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); + b[ 2] = *(a01 + 2); + b[ 3] = *(a01 + 3); + b[ 4] = *(a01 + 4); + b[ 5] = *(a01 + 5); + b[ 6] = *(a01 + 6); + b[ 7] = *(a01 + 7); + + b[ 8] = *(a01 + 8); + b[ 9] = *(a01 + 9); + b[ 10] = *(a01 + 10); + b[ 11] = *(a01 + 11); + b[ 12] = *(a01 + 12); + b[ 13] = *(a01 + 13); + b[ 14] = *(a01 + 14); + b[ 15] = *(a01 + 15); + + a01 += lda; + b += 16; + } + a02 += 8 * lda; + a03 += 8 * lda; + a04 += 8 * lda; + a05 += 8 * lda; + a06 += 8 * lda; + a07 += 8 * lda; + a08 += 8 * lda; + } else { +#ifdef UNIT + b[ 0] = ONE; + b[ 1] = ZERO; +#else + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); +#endif + b[ 2] = *(a01 + 2); + b[ 3] = *(a01 + 3); + b[ 4] = *(a01 + 4); + b[ 5] = *(a01 + 5); + b[ 6] = *(a01 + 6); + b[ 7] = *(a01 + 7); + b[ 8] = *(a01 + 8); + b[ 9] = *(a01 + 9); + b[ 10] = *(a01 + 10); + b[ 11] = *(a01 + 11); + b[ 12] = *(a01 + 12); + b[ 13] = *(a01 + 13); + b[ 14] = *(a01 + 14); + b[ 15] = *(a01 + 15); + + b[ 16] = ZERO; + b[ 17] = ZERO; +#ifdef UNIT + b[ 18] = ONE; + b[ 19] = ZERO; +#else + b[ 18] = *(a02 + 2); + b[ 19] = *(a02 + 3); +#endif + b[ 20] = *(a02 + 4); + b[ 21] = *(a02 + 5); + b[ 22] = *(a02 + 6); + b[ 23] = *(a02 + 7); + b[ 24] = 
*(a02 + 8); + b[ 25] = *(a02 + 9); + b[ 26] = *(a02 + 10); + b[ 27] = *(a02 + 11); + b[ 28] = *(a02 + 12); + b[ 29] = *(a02 + 13); + b[ 30] = *(a02 + 14); + b[ 31] = *(a02 + 15); + + b[ 32] = ZERO; + b[ 33] = ZERO; + b[ 34] = ZERO; + b[ 35] = ZERO; +#ifdef UNIT + b[ 36] = ONE; + b[ 37] = ZERO; +#else + b[ 36] = *(a03 + 4); + b[ 37] = *(a03 + 5); +#endif + b[ 38] = *(a03 + 6); + b[ 39] = *(a03 + 7); + b[ 40] = *(a03 + 8); + b[ 41] = *(a03 + 9); + b[ 42] = *(a03 + 10); + b[ 43] = *(a03 + 11); + b[ 44] = *(a03 + 12); + b[ 45] = *(a03 + 13); + b[ 46] = *(a03 + 14); + b[ 47] = *(a03 + 15); + + b[ 48] = ZERO; + b[ 49] = ZERO; + b[ 50] = ZERO; + b[ 51] = ZERO; + b[ 52] = ZERO; + b[ 53] = ZERO; +#ifdef UNIT + b[ 54] = ONE; + b[ 55] = ZERO; +#else + b[ 54] = *(a04 + 6); + b[ 55] = *(a04 + 7); +#endif + b[ 56] = *(a04 + 8); + b[ 57] = *(a04 + 9); + b[ 58] = *(a04 + 10); + b[ 59] = *(a04 + 11); + b[ 60] = *(a04 + 12); + b[ 61] = *(a04 + 13); + b[ 62] = *(a04 + 14); + b[ 63] = *(a04 + 15); + + b[ 64] = ZERO; + b[ 65] = ZERO; + b[ 66] = ZERO; + b[ 67] = ZERO; + b[ 68] = ZERO; + b[ 69] = ZERO; + b[ 70] = ZERO; + b[ 71] = ZERO; +#ifdef UNIT + b[ 72] = ONE; + b[ 73] = ZERO; +#else + b[ 72] = *(a05 + 8); + b[ 73] = *(a05 + 9); +#endif + b[ 74] = *(a05 + 10); + b[ 75] = *(a05 + 11); + b[ 76] = *(a05 + 12); + b[ 77] = *(a05 + 13); + b[ 78] = *(a05 + 14); + b[ 79] = *(a05 + 15); + + b[ 80] = ZERO; + b[ 81] = ZERO; + b[ 82] = ZERO; + b[ 83] = ZERO; + b[ 84] = ZERO; + b[ 85] = ZERO; + b[ 86] = ZERO; + b[ 87] = ZERO; + b[ 88] = ZERO; + b[ 89] = ZERO; +#ifdef UNIT + b[ 90] = ONE; + b[ 91] = ZERO; +#else + b[ 90] = *(a06 + 10); + b[ 91] = *(a06 + 11); +#endif + b[ 92] = *(a06 + 12); + b[ 93] = *(a06 + 13); + b[ 94] = *(a06 + 14); + b[ 95] = *(a06 + 15); + + b[ 96] = ZERO; + b[ 97] = ZERO; + b[ 98] = ZERO; + b[ 99] = ZERO; + b[100] = ZERO; + b[101] = ZERO; + b[102] = ZERO; + b[103] = ZERO; + b[104] = ZERO; + b[105] = ZERO; + b[106] = ZERO; + b[107] = ZERO; +#ifdef UNIT + b[108] = ONE; + 
b[109] = ZERO; +#else + b[108] = *(a07 + 12); + b[109] = *(a07 + 13); +#endif + b[110] = *(a07 + 14); + b[111] = *(a07 + 15); + + b[112] = ZERO; + b[113] = ZERO; + b[114] = ZERO; + b[115] = ZERO; + b[116] = ZERO; + b[117] = ZERO; + b[118] = ZERO; + b[119] = ZERO; + b[120] = ZERO; + b[121] = ZERO; + b[122] = ZERO; + b[123] = ZERO; + b[124] = ZERO; + b[125] = ZERO; +#ifdef UNIT + b[126] = ONE; + b[127] = ZERO; +#else + b[126] = *(a08 + 14); + b[127] = *(a08 + 15); +#endif + + a01 += 16; + a02 += 16; + a03 += 16; + a04 += 16; + a05 += 16; + a06 += 16; + a07 += 16; + a08 += 16; + b += 128; + } + + X += 8; + i --; + } while (i > 0); + } + + i = (m & 7); + if (i) { + + if (X > posY) { + /* a01 += 2 * i; + a02 += 2 * i; + a03 += 2 * i; + a04 += 2 * i; + a05 += 2 * i; + a06 += 2 * i; + a07 += 2 * i; + a08 += 2 * i; */ + b += 16 * i; + } else + if (X < posY) { + for (ii = 0; ii < i; ii++){ + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); + b[ 2] = *(a01 + 2); + b[ 3] = *(a01 + 3); + b[ 4] = *(a01 + 4); + b[ 5] = *(a01 + 5); + b[ 6] = *(a01 + 6); + b[ 7] = *(a01 + 7); + + b[ 8] = *(a01 + 8); + b[ 9] = *(a01 + 9); + b[ 10] = *(a01 + 10); + b[ 11] = *(a01 + 11); + b[ 12] = *(a01 + 12); + b[ 13] = *(a01 + 13); + b[ 14] = *(a01 + 14); + b[ 15] = *(a01 + 15); + + a01 += lda; + a02 += lda; + a03 += lda; + a04 += lda; + a05 += lda; + a06 += lda; + a07 += lda; + a08 += lda; + b += 16; + } + } else { +#ifdef UNIT + b[ 0] = ONE; + b[ 1] = ZERO; +#else + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); +#endif + b[ 2] = *(a01 + 2); + b[ 3] = *(a01 + 3); + b[ 4] = *(a01 + 4); + b[ 5] = *(a01 + 5); + b[ 6] = *(a01 + 6); + b[ 7] = *(a01 + 7); + + b[ 8] = *(a01 + 8); + b[ 9] = *(a01 + 9); + b[ 10] = *(a01 + 10); + b[ 11] = *(a01 + 11); + b[ 12] = *(a01 + 12); + b[ 13] = *(a01 + 13); + b[ 14] = *(a01 + 14); + b[ 15] = *(a01 + 15); + b += 16; + + if (i >= 2) { + b[ 0] = ZERO; + b[ 1] = ZERO; +#ifdef UNIT + b[ 2] = ONE; + b[ 3] = ZERO; +#else + b[ 2] = *(a02 + 2); + b[ 3] = *(a02 + 3); +#endif + b[ 4] = 
*(a02 + 4); + b[ 5] = *(a02 + 5); + b[ 6] = *(a02 + 6); + b[ 7] = *(a02 + 7); + + b[ 8] = *(a02 + 8); + b[ 9] = *(a02 + 9); + b[10] = *(a02 + 10); + b[11] = *(a02 + 11); + b[12] = *(a02 + 12); + b[13] = *(a02 + 13); + b[14] = *(a02 + 14); + b[15] = *(a02 + 15); + b += 16; + } + + if (i >= 3) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; +#ifdef UNIT + b[ 4] = ONE; + b[ 5] = ZERO; +#else + b[ 4] = *(a03 + 4); + b[ 5] = *(a03 + 5); +#endif + b[ 6] = *(a03 + 6); + b[ 7] = *(a03 + 7); + + b[ 8] = *(a03 + 8); + b[ 9] = *(a03 + 9); + b[10] = *(a03 + 10); + b[11] = *(a03 + 11); + b[12] = *(a03 + 12); + b[13] = *(a03 + 13); + b[14] = *(a03 + 14); + b[15] = *(a03 + 15); + b += 16; + } + + if (i >= 4) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; +#ifdef UNIT + b[ 6] = ONE; + b[ 7] = ZERO; +#else + b[ 6] = *(a04 + 6); + b[ 7] = *(a04 + 7); +#endif + + b[ 8] = *(a04 + 8); + b[ 9] = *(a04 + 9); + b[10] = *(a04 + 10); + b[11] = *(a04 + 11); + b[12] = *(a04 + 12); + b[13] = *(a04 + 13); + b[14] = *(a04 + 14); + b[15] = *(a04 + 15); + b += 16; + } + + if (i >= 5) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + +#ifdef UNIT + b[ 8] = ONE; + b[ 9] = ZERO; +#else + b[ 8] = *(a05 + 8); + b[ 9] = *(a05 + 9); +#endif + b[10] = *(a05 + 10); + b[11] = *(a05 + 11); + b[12] = *(a05 + 12); + b[13] = *(a05 + 13); + b[14] = *(a05 + 14); + b[15] = *(a05 + 15); + b += 16; + } + + if (i >= 6) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + + b[ 8] = ZERO; + b[ 9] = ZERO; +#ifdef UNIT + b[10] = ONE; + b[11] = ZERO; +#else + b[10] = *(a06 + 10); + b[11] = *(a06 + 11); +#endif + b[12] = *(a06 + 12); + b[13] = *(a06 + 13); + b[14] = *(a06 + 14); + b[15] = *(a06 + 15); + b += 16; + } + + if (i >= 7) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; 
+ b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + + b[ 8] = ZERO; + b[ 9] = ZERO; + b[10] = ZERO; + b[11] = ZERO; +#ifdef UNIT + b[12] = ONE; + b[13] = ZERO; +#else + b[12] = *(a07 + 12); + b[13] = *(a07 + 13); +#endif + b[14] = *(a07 + 14); + b[15] = *(a07 + 15); + b += 16; + } + } + } + + posY += 8; + } + + + if (n & 4){ + X = posX; + + if (posX <= posY) { + a01 = a + posY * 2 + (posX + 0) * lda; + a02 = a + posY * 2 + (posX + 1) * lda; + a03 = a + posY * 2 + (posX + 2) * lda; + a04 = a + posY * 2 + (posX + 3) * lda; + } else { + a01 = a + posX * 2 + (posY + 0) * lda; + a02 = a + posX * 2 + (posY + 1) * lda; + a03 = a + posX * 2 + (posY + 2) * lda; + a04 = a + posX * 2 + (posY + 3) * lda; + } + + i = (m >> 2); + if (i > 0) { + do { + if (X > posY) { + a01 += 8; + a02 += 8; + a03 += 8; + a04 += 8; + b += 32; + } else + if (X < posY) { + for (ii = 0; ii < 4; ii++){ + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); + b[ 2] = *(a01 + 2); + b[ 3] = *(a01 + 3); + b[ 4] = *(a01 + 4); + b[ 5] = *(a01 + 5); + b[ 6] = *(a01 + 6); + b[ 7] = *(a01 + 7); + + a01 += lda; + b += 8; + } + + a02 += 4 * lda; + a03 += 4 * lda; + a04 += 4 * lda; + } else { +#ifdef UNIT + b[ 0] = ONE; + b[ 1] = ZERO; +#else + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); +#endif + b[ 2] = *(a01 + 2); + b[ 3] = *(a01 + 3); + b[ 4] = *(a01 + 4); + b[ 5] = *(a01 + 5); + b[ 6] = *(a01 + 6); + b[ 7] = *(a01 + 7); + + b[ 8] = ZERO; + b[ 9] = ZERO; +#ifdef UNIT + b[ 10] = ONE; + b[ 11] = ZERO; +#else + b[ 10] = *(a02 + 2); + b[ 11] = *(a02 + 3); +#endif + b[ 12] = *(a02 + 4); + b[ 13] = *(a02 + 5); + b[ 14] = *(a02 + 6); + b[ 15] = *(a02 + 7); + + b[ 16] = ZERO; + b[ 17] = ZERO; + b[ 18] = ZERO; + b[ 19] = ZERO; +#ifdef UNIT + b[ 20] = ONE; + b[ 21] = ZERO; +#else + b[ 20] = *(a03 + 4); + b[ 21] = *(a03 + 5); +#endif + b[ 22] = *(a03 + 6); + b[ 23] = *(a03 + 7); + + b[ 24] = ZERO; + b[ 25] = ZERO; + b[ 26] = ZERO; + b[ 27] = ZERO; + b[ 28] = ZERO; + b[ 29] = ZERO; +#ifdef UNIT + b[ 30] = ONE; + b[ 31] 
= ZERO; +#else + b[ 30] = *(a04 + 6); + b[ 31] = *(a04 + 7); +#endif + + a01 += 8; + a02 += 8; + a03 += 8; + a04 += 8; + b += 32; + } + + X += 4; + i --; + } while (i > 0); + } + + i = (m & 3); + if (i > 0) { + if (X > posY) { + /* a01 += 2 * i; + a02 += 2 * i; + a03 += 2 * i; + a04 += 2 * i; */ + b += 8 * i; + } else + if (X < posY) { + for (ii = 0; ii < i; ii++){ + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); + b[ 2] = *(a01 + 2); + b[ 3] = *(a01 + 3); + b[ 4] = *(a01 + 4); + b[ 5] = *(a01 + 5); + b[ 6] = *(a01 + 6); + b[ 7] = *(a01 + 7); + + a01 += lda; + a02 += lda; + a03 += lda; + a04 += lda; + b += 8; + } + } else { +#ifdef UNIT + b[ 0] = ONE; + b[ 1] = ZERO; +#else + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); +#endif + b[ 2] = *(a01 + 2); + b[ 3] = *(a01 + 3); + b[ 4] = *(a01 + 4); + b[ 5] = *(a01 + 5); + b[ 6] = *(a01 + 6); + b[ 7] = *(a01 + 7); + b += 8; + + if (i >= 2) { + b[ 0] = ZERO; + b[ 1] = ZERO; +#ifdef UNIT + b[ 2] = ONE; + b[ 3] = ZERO; +#else + b[ 2] = *(a02 + 2); + b[ 3] = *(a02 + 3); +#endif + b[ 4] = *(a02 + 4); + b[ 5] = *(a02 + 5); + b[ 6] = *(a02 + 6); + b[ 7] = *(a02 + 7); + b += 8; + } + + if (i >= 3) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; +#ifdef UNIT + b[ 4] = ONE; + b[ 5] = ZERO; +#else + b[ 4] = *(a03 + 4); + b[ 5] = *(a03 + 5); +#endif + b[ 6] = *(a03 + 6); + b[ 7] = *(a03 + 7); + b += 8; + } + } + } + posY += 4; + } + + if (n & 2){ + X = posX; + + if (posX <= posY) { + a01 = a + posY * 2 + (posX + 0) * lda; + a02 = a + posY * 2 + (posX + 1) * lda; + } else { + a01 = a + posX * 2 + (posY + 0) * lda; + a02 = a + posX * 2 + (posY + 1) * lda; + } + + i = (m >> 1); + if (i > 0) { + do { + if (X > posY) { + a01 += 4; + a02 += 4; + b += 8; + } else + if (X < posY) { + b[0] = *(a01 + 0); + b[1] = *(a01 + 1); + b[2] = *(a01 + 2); + b[3] = *(a01 + 3); + b[4] = *(a02 + 0); + b[5] = *(a02 + 1); + b[6] = *(a02 + 2); + b[7] = *(a02 + 3); + a01 += 2 * lda; + a02 += 2 * lda; + b += 8; + } else { +#ifdef UNIT + b[0] = ONE; + 
b[1] = ZERO; +#else + b[0] = *(a01 + 0); + b[1] = *(a01 + 1); +#endif + b[2] = *(a01 + 2); + b[3] = *(a01 + 3); + + b[4] = ZERO; + b[5] = ZERO; +#ifdef UNIT + b[6] = ONE; + b[7] = ZERO; +#else + b[6] = *(a02 + 2); + b[7] = *(a02 + 3); +#endif + a01 += 4; + a02 += 4; + b += 8; + } + + X += 2; + i --; + } while (i > 0); + } + + i = (m & 1); + if (i > 0) { + if (X > posY) { + /* a01 += 2; + a02 += 2; */ + b += 4; + } else + if (X < posY) { + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); + b[ 2] = *(a01 + 2); + b[ 3] = *(a01 + 3); + + /* a01 += lda; + a02 += lda; */ + b += 4; + } else { +#ifdef UNIT + b[ 0] = ONE; + b[ 1] = ZERO; +#else + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); +#endif + b[ 2] = *(a01 + 2); + b[ 3] = *(a01 + 3); + b += 4; + } + } + posY += 2; + } + + if (n & 1){ + X = posX; + + if (posX <= posY) { + a01 = a + posY * 2 + (posX + 0) * lda; + } else { + a01 = a + posX * 2 + (posY + 0) * lda; + } + + i = m; + if (i > 0) { + do { + + if (X > posY) { + a01 += 2; + b += 2; + } else + if (X < posY) { + b[0] = *(a01 + 0); + b[1] = *(a01 + 1); + a01 += lda; + b += 2; + } else { +#ifdef UNIT + b[0] = ONE; + b[1] = ZERO; +#else + b[0] = *(a01 + 0); + b[1] = *(a01 + 1); +#endif + a01 += 2; + b += 2; + } + + X += 1; + i --; + } while (i > 0); + } + // posY += 1; + } + + return 0; +} diff --git a/kernel/generic/ztrmm_uncopy_16.c b/kernel/generic/ztrmm_uncopy_16.c new file mode 100644 index 000000000..40b85db38 --- /dev/null +++ b/kernel/generic/ztrmm_uncopy_16.c @@ -0,0 +1,2316 @@ +/******************************************************************************* +Copyright (c) 2024, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*******************************************************************************/ + +#include <stdio.h> +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, js; + BLASLONG X, ii; + + FLOAT *a01, *a02, *a03, *a04, *a05, *a06, *a07, *a08; + FLOAT *a09, *a10, *a11, *a12, *a13, *a14, *a15, *a16; + + lda += lda; + + js = (n >> 4); + + if (js > 0){ + do { + X = posX; + + if (posX <= posY) { + a01 = a + posX * 2 + (posY + 0) * lda; + a02 = a + posX * 2 + (posY + 1) * lda; + a03 = a + posX * 2 + (posY + 2) * lda; + a04 = a + posX * 2 + (posY + 3) * lda; + a05 = a + posX * 2 + (posY + 4) * lda; + a06 = a + posX * 2 + (posY + 5) * lda; + a07 = a + posX * 2 + (posY + 6) * lda; + a08 = a + posX * 2 + (posY + 7) * lda; + a09 = a + posX * 2 + (posY + 8) * lda; + a10 = a + posX * 2 + (posY + 9) * lda; + a11 = a + posX * 2 + (posY + 10) * lda; + a12 = a + posX * 2 + (posY + 11) * lda; + a13 = a + posX * 2 + (posY + 12) * lda; + a14 = a + posX * 2 + (posY + 13) * lda; + a15 = a + posX * 2 + (posY + 14) * lda; + a16 = a + posX * 2 + (posY + 15) * lda; + } else { + a01 = a + posY * 2 + (posX + 0) * lda; + a02 = a + posY * 2 + (posX + 1) * lda; + a03 = a + posY * 2 + (posX + 2) * lda; + a04 = a + posY * 2 + (posX + 3) * lda; + a05 = a + posY * 2 + (posX + 4) * lda; + a06 = a + posY * 2 + (posX + 5) * lda; + a07 = a + posY * 2 + (posX + 6) * lda; + a08 = a + posY * 2 + (posX + 7) * lda; + a09 = a + posY * 2 + (posX + 8) * lda; + a10 = a + posY * 2 + (posX + 9) * lda; + a11 = a + posY * 2 + (posX + 10) * lda; + a12 = a + posY * 2 + (posX + 11) * lda; + a13 = a + posY * 2 + (posX + 12) * lda; + a14 = a + posY * 2 + (posX + 13) * lda; + a15 = a + posY * 2 + (posX + 14) * lda; + a16 = a + posY * 2 + (posX + 15) * lda; + } + + i = (m >> 4); + if (i > 0) { + do { + if (X < posY) { + for (ii = 0; ii < 16; ii++){ + + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); + b[ 2] = *(a02 + 0); + b[ 3] = *(a02 + 1); + b[ 4] = *(a03 +
0); + b[ 5] = *(a03 + 1); + b[ 6] = *(a04 + 0); + b[ 7] = *(a04 + 1); + + b[ 8] = *(a05 + 0); + b[ 9] = *(a05 + 1); + b[ 10] = *(a06 + 0); + b[ 11] = *(a06 + 1); + b[ 12] = *(a07 + 0); + b[ 13] = *(a07 + 1); + b[ 14] = *(a08 + 0); + b[ 15] = *(a08 + 1); + + b[ 16] = *(a09 + 0); + b[ 17] = *(a09 + 1); + b[ 18] = *(a10 + 0); + b[ 19] = *(a10 + 1); + b[ 20] = *(a11 + 0); + b[ 21] = *(a11 + 1); + b[ 22] = *(a12 + 0); + b[ 23] = *(a12 + 1); + + b[ 24] = *(a13 + 0); + b[ 25] = *(a13 + 1); + b[ 26] = *(a14 + 0); + b[ 27] = *(a14 + 1); + b[ 28] = *(a15 + 0); + b[ 29] = *(a15 + 1); + b[ 30] = *(a16 + 0); + b[ 31] = *(a16 + 1); + + a01 += 2; + a02 += 2; + a03 += 2; + a04 += 2; + a05 += 2; + a06 += 2; + a07 += 2; + a08 += 2; + a09 += 2; + a10 += 2; + a11 += 2; + a12 += 2; + a13 += 2; + a14 += 2; + a15 += 2; + a16 += 2; + b += 32; + } + } else + if (X > posY) { + a01 += 16 * lda; + a02 += 16 * lda; + a03 += 16 * lda; + a04 += 16 * lda; + a05 += 16 * lda; + a06 += 16 * lda; + a07 += 16 * lda; + a08 += 16 * lda; + a09 += 16 * lda; + a10 += 16 * lda; + a11 += 16 * lda; + a12 += 16 * lda; + a13 += 16 * lda; + a14 += 16 * lda; + a15 += 16 * lda; + a16 += 16 * lda; + + b += 512; + + } else { +#ifdef UNIT + b[ 0] = ONE; + b[ 1] = ZERO; +#else + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); +#endif + b[ 2] = *(a02 + 0); + b[ 3] = *(a02 + 1); + b[ 4] = *(a03 + 0); + b[ 5] = *(a03 + 1); + b[ 6] = *(a04 + 0); + b[ 7] = *(a04 + 1); + b[ 8] = *(a05 + 0); + b[ 9] = *(a05 + 1); + b[ 10] = *(a06 + 0); + b[ 11] = *(a06 + 1); + b[ 12] = *(a07 + 0); + b[ 13] = *(a07 + 1); + b[ 14] = *(a08 + 0); + b[ 15] = *(a08 + 1); + b[ 16] = *(a09 + 0); + b[ 17] = *(a09 + 1); + b[ 18] = *(a10 + 0); + b[ 19] = *(a10 + 1); + b[ 20] = *(a11 + 0); + b[ 21] = *(a11 + 1); + b[ 22] = *(a12 + 0); + b[ 23] = *(a12 + 1); + b[ 24] = *(a13 + 0); + b[ 25] = *(a13 + 1); + b[ 26] = *(a14 + 0); + b[ 27] = *(a14 + 1); + b[ 28] = *(a15 + 0); + b[ 29] = *(a15 + 1); + b[ 30] = *(a16 + 0); + b[ 31] = *(a16 + 1); + + b[ 32] = ZERO; + 
b[ 33] = ZERO; +#ifdef UNIT + b[ 34] = ONE; + b[ 35] = ZERO; +#else + b[ 34] = *(a02 + 2); + b[ 35] = *(a02 + 3); +#endif + b[ 36] = *(a03 + 2); + b[ 37] = *(a03 + 3); + b[ 38] = *(a04 + 2); + b[ 39] = *(a04 + 3); + b[ 40] = *(a05 + 2); + b[ 41] = *(a05 + 3); + b[ 42] = *(a06 + 2); + b[ 43] = *(a06 + 3); + b[ 44] = *(a07 + 2); + b[ 45] = *(a07 + 3); + b[ 46] = *(a08 + 2); + b[ 47] = *(a08 + 3); + b[ 48] = *(a09 + 2); + b[ 49] = *(a09 + 3); + b[ 50] = *(a10 + 2); + b[ 51] = *(a10 + 3); + b[ 52] = *(a11 + 2); + b[ 53] = *(a11 + 3); + b[ 54] = *(a12 + 2); + b[ 55] = *(a12 + 3); + b[ 56] = *(a13 + 2); + b[ 57] = *(a13 + 3); + b[ 58] = *(a14 + 2); + b[ 59] = *(a14 + 3); + b[ 60] = *(a15 + 2); + b[ 61] = *(a15 + 3); + b[ 62] = *(a16 + 2); + b[ 63] = *(a16 + 3); + + b[ 64] = ZERO; + b[ 65] = ZERO; + b[ 66] = ZERO; + b[ 67] = ZERO; +#ifdef UNIT + b[ 68] = ONE; + b[ 69] = ZERO; +#else + b[ 68] = *(a03 + 4); + b[ 69] = *(a03 + 5); +#endif + b[ 70] = *(a04 + 4); + b[ 71] = *(a04 + 5); + b[ 72] = *(a05 + 4); + b[ 73] = *(a05 + 5); + b[ 74] = *(a06 + 4); + b[ 75] = *(a06 + 5); + b[ 76] = *(a07 + 4); + b[ 77] = *(a07 + 5); + b[ 78] = *(a08 + 4); + b[ 79] = *(a08 + 5); + b[ 80] = *(a09 + 4); + b[ 81] = *(a09 + 5); + b[ 82] = *(a10 + 4); + b[ 83] = *(a10 + 5); + b[ 84] = *(a11 + 4); + b[ 85] = *(a11 + 5); + b[ 86] = *(a12 + 4); + b[ 87] = *(a12 + 5); + b[ 88] = *(a13 + 4); + b[ 89] = *(a13 + 5); + b[ 90] = *(a14 + 4); + b[ 91] = *(a14 + 5); + b[ 92] = *(a15 + 4); + b[ 93] = *(a15 + 5); + b[ 94] = *(a16 + 4); + b[ 95] = *(a16 + 5); + + b[ 96] = ZERO; + b[ 97] = ZERO; + b[ 98] = ZERO; + b[ 99] = ZERO; + b[100] = ZERO; + b[101] = ZERO; +#ifdef UNIT + b[102] = ONE; + b[103] = ZERO; +#else + b[102] = *(a04 + 6); + b[103] = *(a04 + 7); +#endif + b[104] = *(a05 + 6); + b[105] = *(a05 + 7); + b[106] = *(a06 + 6); + b[107] = *(a06 + 7); + b[108] = *(a07 + 6); + b[109] = *(a07 + 7); + b[110] = *(a08 + 6); + b[111] = *(a08 + 7); + b[112] = *(a09 + 6); + b[113] = *(a09 + 7); + b[114] = *(a10 
+ 6); + b[115] = *(a10 + 7); + b[116] = *(a11 + 6); + b[117] = *(a11 + 7); + b[118] = *(a12 + 6); + b[119] = *(a12 + 7); + b[120] = *(a13 + 6); + b[121] = *(a13 + 7); + b[122] = *(a14 + 6); + b[123] = *(a14 + 7); + b[124] = *(a15 + 6); + b[125] = *(a15 + 7); + b[126] = *(a16 + 6); + b[127] = *(a16 + 7); + + b[128] = ZERO; + b[129] = ZERO; + b[130] = ZERO; + b[131] = ZERO; + b[132] = ZERO; + b[133] = ZERO; + b[134] = ZERO; + b[135] = ZERO; +#ifdef UNIT + b[136] = ONE; + b[137] = ZERO; +#else + b[136] = *(a05 + 8); + b[137] = *(a05 + 9); +#endif + b[138] = *(a06 + 8); + b[139] = *(a06 + 9); + b[140] = *(a07 + 8); + b[141] = *(a07 + 9); + b[142] = *(a08 + 8); + b[143] = *(a08 + 9); + b[144] = *(a09 + 8); + b[145] = *(a09 + 9); + b[146] = *(a10 + 8); + b[147] = *(a10 + 9); + b[148] = *(a11 + 8); + b[149] = *(a11 + 9); + b[150] = *(a12 + 8); + b[151] = *(a12 + 9); + b[152] = *(a13 + 8); + b[153] = *(a13 + 9); + b[154] = *(a14 + 8); + b[155] = *(a14 + 9); + b[156] = *(a15 + 8); + b[157] = *(a15 + 9); + b[158] = *(a16 + 8); + b[159] = *(a16 + 9); + + b[160] = ZERO; + b[161] = ZERO; + b[162] = ZERO; + b[163] = ZERO; + b[164] = ZERO; + b[165] = ZERO; + b[166] = ZERO; + b[167] = ZERO; + b[168] = ZERO; + b[169] = ZERO; +#ifdef UNIT + b[170] = ONE; + b[171] = ZERO; +#else + b[170] = *(a06 + 10); + b[171] = *(a06 + 11); +#endif + b[172] = *(a07 + 10); + b[173] = *(a07 + 11); + b[174] = *(a08 + 10); + b[175] = *(a08 + 11); + b[176] = *(a09 + 10); + b[177] = *(a09 + 11); + b[178] = *(a10 + 10); + b[179] = *(a10 + 11); + b[180] = *(a11 + 10); + b[181] = *(a11 + 11); + b[182] = *(a12 + 10); + b[183] = *(a12 + 11); + b[184] = *(a13 + 10); + b[185] = *(a13 + 11); + b[186] = *(a14 + 10); + b[187] = *(a14 + 11); + b[188] = *(a15 + 10); + b[189] = *(a15 + 11); + b[190] = *(a16 + 10); + b[191] = *(a16 + 11); + + b[192] = ZERO; + b[193] = ZERO; + b[194] = ZERO; + b[195] = ZERO; + b[196] = ZERO; + b[197] = ZERO; + b[198] = ZERO; + b[199] = ZERO; + b[200] = ZERO; + b[201] = ZERO; + b[202] = 
ZERO; + b[203] = ZERO; +#ifdef UNIT + b[204] = ONE; + b[205] = ZERO; +#else + b[204] = *(a07 + 12); + b[205] = *(a07 + 13); +#endif + b[206] = *(a08 + 12); + b[207] = *(a08 + 13); + b[208] = *(a09 + 12); + b[209] = *(a09 + 13); + b[210] = *(a10 + 12); + b[211] = *(a10 + 13); + b[212] = *(a11 + 12); + b[213] = *(a11 + 13); + b[214] = *(a12 + 12); + b[215] = *(a12 + 13); + b[216] = *(a13 + 12); + b[217] = *(a13 + 13); + b[218] = *(a14 + 12); + b[219] = *(a14 + 13); + b[220] = *(a15 + 12); + b[221] = *(a15 + 13); + b[222] = *(a16 + 12); + b[223] = *(a16 + 13); + + b[224] = ZERO; + b[225] = ZERO; + b[226] = ZERO; + b[227] = ZERO; + b[228] = ZERO; + b[229] = ZERO; + b[230] = ZERO; + b[231] = ZERO; + b[232] = ZERO; + b[233] = ZERO; + b[234] = ZERO; + b[235] = ZERO; + b[236] = ZERO; + b[237] = ZERO; +#ifdef UNIT + b[238] = ONE; + b[239] = ZERO; +#else + b[238] = *(a08 + 14); + b[239] = *(a08 + 15); +#endif + b[240] = *(a09 + 14); + b[241] = *(a09 + 15); + b[242] = *(a10 + 14); + b[243] = *(a10 + 15); + b[244] = *(a11 + 14); + b[245] = *(a11 + 15); + b[246] = *(a12 + 14); + b[247] = *(a12 + 15); + b[248] = *(a13 + 14); + b[249] = *(a13 + 15); + b[250] = *(a14 + 14); + b[251] = *(a14 + 15); + b[252] = *(a15 + 14); + b[253] = *(a15 + 15); + b[254] = *(a16 + 14); + b[255] = *(a16 + 15); + + b[256] = ZERO; + b[257] = ZERO; + b[258] = ZERO; + b[259] = ZERO; + b[260] = ZERO; + b[261] = ZERO; + b[262] = ZERO; + b[263] = ZERO; + b[264] = ZERO; + b[265] = ZERO; + b[266] = ZERO; + b[267] = ZERO; + b[268] = ZERO; + b[269] = ZERO; + b[270] = ZERO; + b[271] = ZERO; +#ifdef UNIT + b[272] = ONE; + b[273] = ZERO; +#else + b[272] = *(a09 + 16); + b[273] = *(a09 + 17); +#endif + b[274] = *(a10 + 16); + b[275] = *(a10 + 17); + b[276] = *(a11 + 16); + b[277] = *(a11 + 17); + b[278] = *(a12 + 16); + b[279] = *(a12 + 17); + b[280] = *(a13 + 16); + b[281] = *(a13 + 17); + b[282] = *(a14 + 16); + b[283] = *(a14 + 17); + b[284] = *(a15 + 16); + b[285] = *(a15 + 17); + b[286] = *(a16 + 16); + 
b[287] = *(a16 + 17); + + b[288] = ZERO; + b[289] = ZERO; + b[290] = ZERO; + b[291] = ZERO; + b[292] = ZERO; + b[293] = ZERO; + b[294] = ZERO; + b[295] = ZERO; + b[296] = ZERO; + b[297] = ZERO; + b[298] = ZERO; + b[299] = ZERO; + b[300] = ZERO; + b[301] = ZERO; + b[302] = ZERO; + b[303] = ZERO; + b[304] = ZERO; + b[305] = ZERO; +#ifdef UNIT + b[306] = ONE; + b[307] = ZERO; +#else + b[306] = *(a10 + 18); + b[307] = *(a10 + 19); +#endif + b[308] = *(a11 + 18); + b[309] = *(a11 + 19); + b[310] = *(a12 + 18); + b[311] = *(a12 + 19); + b[312] = *(a13 + 18); + b[313] = *(a13 + 19); + b[314] = *(a14 + 18); + b[315] = *(a14 + 19); + b[316] = *(a15 + 18); + b[317] = *(a15 + 19); + b[318] = *(a16 + 18); + b[319] = *(a16 + 19); + + b[320] = ZERO; + b[321] = ZERO; + b[322] = ZERO; + b[323] = ZERO; + b[324] = ZERO; + b[325] = ZERO; + b[326] = ZERO; + b[327] = ZERO; + b[328] = ZERO; + b[329] = ZERO; + b[330] = ZERO; + b[331] = ZERO; + b[332] = ZERO; + b[333] = ZERO; + b[334] = ZERO; + b[335] = ZERO; + b[336] = ZERO; + b[337] = ZERO; + b[338] = ZERO; + b[339] = ZERO; +#ifdef UNIT + b[340] = ONE; + b[341] = ZERO; +#else + b[340] = *(a11 + 20); + b[341] = *(a11 + 21); +#endif + b[342] = *(a12 + 20); + b[343] = *(a12 + 21); + b[344] = *(a13 + 20); + b[345] = *(a13 + 21); + b[346] = *(a14 + 20); + b[347] = *(a14 + 21); + b[348] = *(a15 + 20); + b[349] = *(a15 + 21); + b[350] = *(a16 + 20); + b[351] = *(a16 + 21); + + b[352] = ZERO; + b[353] = ZERO; + b[354] = ZERO; + b[355] = ZERO; + b[356] = ZERO; + b[357] = ZERO; + b[358] = ZERO; + b[359] = ZERO; + b[360] = ZERO; + b[361] = ZERO; + b[362] = ZERO; + b[363] = ZERO; + b[364] = ZERO; + b[365] = ZERO; + b[366] = ZERO; + b[367] = ZERO; + b[368] = ZERO; + b[369] = ZERO; + b[370] = ZERO; + b[371] = ZERO; + b[372] = ZERO; + b[373] = ZERO; +#ifdef UNIT + b[374] = ONE; + b[375] = ZERO; +#else + b[374] = *(a12 + 22); + b[375] = *(a12 + 23); +#endif + b[376] = *(a13 + 22); + b[377] = *(a13 + 23); + b[378] = *(a14 + 22); + b[379] = *(a14 + 23); 
+ b[380] = *(a15 + 22); + b[381] = *(a15 + 23); + b[382] = *(a16 + 22); + b[383] = *(a16 + 23); + + b[384] = ZERO; + b[385] = ZERO; + b[386] = ZERO; + b[387] = ZERO; + b[388] = ZERO; + b[389] = ZERO; + b[390] = ZERO; + b[391] = ZERO; + b[392] = ZERO; + b[393] = ZERO; + b[394] = ZERO; + b[395] = ZERO; + b[396] = ZERO; + b[397] = ZERO; + b[398] = ZERO; + b[399] = ZERO; + b[400] = ZERO; + b[401] = ZERO; + b[402] = ZERO; + b[403] = ZERO; + b[404] = ZERO; + b[405] = ZERO; + b[406] = ZERO; + b[407] = ZERO; +#ifdef UNIT + b[408] = ONE; + b[409] = ZERO; +#else + b[408] = *(a13 + 24); + b[409] = *(a13 + 25); +#endif + b[410] = *(a14 + 24); + b[411] = *(a14 + 25); + b[412] = *(a15 + 24); + b[413] = *(a15 + 25); + b[414] = *(a16 + 24); + b[415] = *(a16 + 25); + + b[416] = ZERO; + b[417] = ZERO; + b[418] = ZERO; + b[419] = ZERO; + b[420] = ZERO; + b[421] = ZERO; + b[422] = ZERO; + b[423] = ZERO; + b[424] = ZERO; + b[425] = ZERO; + b[426] = ZERO; + b[427] = ZERO; + b[428] = ZERO; + b[429] = ZERO; + b[430] = ZERO; + b[431] = ZERO; + b[432] = ZERO; + b[433] = ZERO; + b[434] = ZERO; + b[435] = ZERO; + b[436] = ZERO; + b[437] = ZERO; + b[438] = ZERO; + b[439] = ZERO; + b[440] = ZERO; + b[441] = ZERO; +#ifdef UNIT + b[442] = ONE; + b[443] = ZERO; +#else + b[442] = *(a14 + 26); + b[443] = *(a14 + 27); +#endif + b[444] = *(a15 + 26); + b[445] = *(a15 + 27); + b[446] = *(a16 + 26); + b[447] = *(a16 + 27); + + b[448] = ZERO; + b[449] = ZERO; + b[450] = ZERO; + b[451] = ZERO; + b[452] = ZERO; + b[453] = ZERO; + b[454] = ZERO; + b[455] = ZERO; + b[456] = ZERO; + b[457] = ZERO; + b[458] = ZERO; + b[459] = ZERO; + b[460] = ZERO; + b[461] = ZERO; + b[462] = ZERO; + b[463] = ZERO; + b[464] = ZERO; + b[465] = ZERO; + b[466] = ZERO; + b[467] = ZERO; + b[468] = ZERO; + b[469] = ZERO; + b[470] = ZERO; + b[471] = ZERO; + b[472] = ZERO; + b[473] = ZERO; + b[474] = ZERO; + b[475] = ZERO; +#ifdef UNIT + b[476] = ONE; + b[477] = ZERO; +#else + b[476] = *(a15 + 28); + b[477] = *(a15 + 29); +#endif + 
b[478] = *(a16 + 28); + b[479] = *(a16 + 29); + + b[480] = ZERO; + b[481] = ZERO; + b[482] = ZERO; + b[483] = ZERO; + b[484] = ZERO; + b[485] = ZERO; + b[486] = ZERO; + b[487] = ZERO; + b[488] = ZERO; + b[489] = ZERO; + b[490] = ZERO; + b[491] = ZERO; + b[492] = ZERO; + b[493] = ZERO; + b[494] = ZERO; + b[495] = ZERO; + b[496] = ZERO; + b[497] = ZERO; + b[498] = ZERO; + b[499] = ZERO; + b[500] = ZERO; + b[501] = ZERO; + b[502] = ZERO; + b[503] = ZERO; + b[504] = ZERO; + b[505] = ZERO; + b[506] = ZERO; + b[507] = ZERO; + b[508] = ZERO; + b[509] = ZERO; +#ifdef UNIT + b[510] = ONE; + b[511] = ZERO; +#else + b[510] = *(a16 + 30); + b[511] = *(a16 + 31); +#endif + + a01 += 16 * lda; + a02 += 16 * lda; + a03 += 16 * lda; + a04 += 16 * lda; + a05 += 16 * lda; + a06 += 16 * lda; + a07 += 16 * lda; + a08 += 16 * lda; + a09 += 16 * lda; + a10 += 16 * lda; + a11 += 16 * lda; + a12 += 16 * lda; + a13 += 16 * lda; + a14 += 16 * lda; + a15 += 16 * lda; + a16 += 16 * lda; + b += 512; + } + + X += 16; + i --; + } while (i > 0); + } + + i = (m & 15); + if (i) { + + if (X < posY) { + + for (ii = 0; ii < i; ii++){ + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); + b[ 2] = *(a02 + 0); + b[ 3] = *(a02 + 1); + b[ 4] = *(a03 + 0); + b[ 5] = *(a03 + 1); + b[ 6] = *(a04 + 0); + b[ 7] = *(a04 + 1); + b[ 8] = *(a05 + 0); + b[ 9] = *(a05 + 1); + b[ 10] = *(a06 + 0); + b[ 11] = *(a06 + 1); + b[ 12] = *(a07 + 0); + b[ 13] = *(a07 + 1); + b[ 14] = *(a08 + 0); + b[ 15] = *(a08 + 1); + + b[ 16] = *(a09 + 0); + b[ 17] = *(a09 + 1); + b[ 18] = *(a10 + 0); + b[ 19] = *(a10 + 1); + b[ 20] = *(a11 + 0); + b[ 21] = *(a11 + 1); + b[ 22] = *(a12 + 0); + b[ 23] = *(a12 + 1); + b[ 24] = *(a13 + 0); + b[ 25] = *(a13 + 1); + b[ 26] = *(a14 + 0); + b[ 27] = *(a14 + 1); + b[ 28] = *(a15 + 0); + b[ 29] = *(a15 + 1); + b[ 30] = *(a16 + 0); + b[ 31] = *(a16 + 1); + + a01 += 2; + a02 += 2; + a03 += 2; + a04 += 2; + a05 += 2; + a06 += 2; + a07 += 2; + a08 += 2; + a09 += 2; + a10 += 2; + a11 += 2; + a12 += 2; + a13 += 2; 
+ a14 += 2; + a15 += 2; + a16 += 2; + b += 32; + } + } else + if (X > posY) { + /* a01 += i * lda; + a02 += i * lda; + a03 += i * lda; + a04 += i * lda; + a05 += i * lda; + a06 += i * lda; + a07 += i * lda; + a08 += i * lda; + a09 += i * lda; + a10 += i * lda; + a11 += i * lda; + a12 += i * lda; + a13 += i * lda; + a14 += i * lda; + a15 += i * lda; + a16 += i * lda; */ + b += 32 * i; + } else { +#ifdef UNIT + b[ 0] = ONE; + b[ 1] = ZERO; +#else + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); +#endif + b[ 2] = *(a02 + 0); + b[ 3] = *(a02 + 1); + b[ 4] = *(a03 + 0); + b[ 5] = *(a03 + 1); + b[ 6] = *(a04 + 0); + b[ 7] = *(a04 + 1); + b[ 8] = *(a05 + 0); + b[ 9] = *(a05 + 1); + b[ 10] = *(a06 + 0); + b[ 11] = *(a06 + 1); + b[ 12] = *(a07 + 0); + b[ 13] = *(a07 + 1); + b[ 14] = *(a08 + 0); + b[ 15] = *(a08 + 1); + b[ 16] = *(a09 + 0); + b[ 17] = *(a09 + 1); + b[ 18] = *(a10 + 0); + b[ 19] = *(a10 + 1); + b[ 20] = *(a11 + 0); + b[ 21] = *(a11 + 1); + b[ 22] = *(a12 + 0); + b[ 23] = *(a12 + 1); + b[ 24] = *(a13 + 0); + b[ 25] = *(a13 + 1); + b[ 26] = *(a14 + 0); + b[ 27] = *(a14 + 1); + b[ 28] = *(a15 + 0); + b[ 29] = *(a15 + 1); + b[ 30] = *(a16 + 0); + b[ 31] = *(a16 + 1); + b += 32; + + if (i >= 2) { + b[ 0] = ZERO; + b[ 1] = ZERO; +#ifdef UNIT + b[ 2] = ONE; + b[ 3] = ZERO; +#else + b[ 2] = *(a02 + 2); + b[ 3] = *(a02 + 3); +#endif + b[ 4] = *(a03 + 2); + b[ 5] = *(a03 + 3); + b[ 6] = *(a04 + 2); + b[ 7] = *(a04 + 3); + b[ 8] = *(a05 + 2); + b[ 9] = *(a05 + 3); + b[ 10] = *(a06 + 2); + b[ 11] = *(a06 + 3); + b[ 12] = *(a07 + 2); + b[ 13] = *(a07 + 3); + b[ 14] = *(a08 + 2); + b[ 15] = *(a08 + 3); + b[ 16] = *(a09 + 2); + b[ 17] = *(a09 + 3); + b[ 18] = *(a10 + 2); + b[ 19] = *(a10 + 3); + b[ 20] = *(a11 + 2); + b[ 21] = *(a11 + 3); + b[ 22] = *(a12 + 2); + b[ 23] = *(a12 + 3); + b[ 24] = *(a13 + 2); + b[ 25] = *(a13 + 3); + b[ 26] = *(a14 + 2); + b[ 27] = *(a14 + 3); + b[ 28] = *(a15 + 2); + b[ 29] = *(a15 + 3); + b[ 30] = *(a16 + 2); + b[ 31] = *(a16 + 3); + b += 32; + } 
+ + if (i >= 3) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; +#ifdef UNIT + b[ 4] = ONE; + b[ 5] = ZERO; +#else + b[ 4] = *(a03 + 4); + b[ 5] = *(a03 + 5); +#endif + b[ 6] = *(a04 + 4); + b[ 7] = *(a04 + 5); + b[ 8] = *(a05 + 4); + b[ 9] = *(a05 + 5); + b[ 10] = *(a06 + 4); + b[ 11] = *(a06 + 5); + b[ 12] = *(a07 + 4); + b[ 13] = *(a07 + 5); + b[ 14] = *(a08 + 4); + b[ 15] = *(a08 + 5); + b[ 16] = *(a09 + 4); + b[ 17] = *(a09 + 5); + b[ 18] = *(a10 + 4); + b[ 19] = *(a10 + 5); + b[ 20] = *(a11 + 4); + b[ 21] = *(a11 + 5); + b[ 22] = *(a12 + 4); + b[ 23] = *(a12 + 5); + b[ 24] = *(a13 + 4); + b[ 25] = *(a13 + 5); + b[ 26] = *(a14 + 4); + b[ 27] = *(a14 + 5); + b[ 28] = *(a15 + 4); + b[ 29] = *(a15 + 5); + b[ 30] = *(a16 + 4); + b[ 31] = *(a16 + 5); + b += 32; + } + + if (i >= 4) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; +#ifdef UNIT + b[ 6] = ONE; + b[ 7] = ZERO; +#else + b[ 6] = *(a04 + 6); + b[ 7] = *(a04 + 7); +#endif + b[ 8] = *(a05 + 6); + b[ 9] = *(a05 + 7); + b[ 10] = *(a06 + 6); + b[ 11] = *(a06 + 7); + b[ 12] = *(a07 + 6); + b[ 13] = *(a07 + 7); + b[ 14] = *(a08 + 6); + b[ 15] = *(a08 + 7); + b[ 16] = *(a09 + 6); + b[ 17] = *(a09 + 7); + b[ 18] = *(a10 + 6); + b[ 19] = *(a10 + 7); + b[ 20] = *(a11 + 6); + b[ 21] = *(a11 + 7); + b[ 22] = *(a12 + 6); + b[ 23] = *(a12 + 7); + b[ 24] = *(a13 + 6); + b[ 25] = *(a13 + 7); + b[ 26] = *(a14 + 6); + b[ 27] = *(a14 + 7); + b[ 28] = *(a15 + 6); + b[ 29] = *(a15 + 7); + b[ 30] = *(a16 + 6); + b[ 31] = *(a16 + 7); + b += 32; + } + + if (i >= 5) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; +#ifdef UNIT + b[ 8] = ONE; + b[ 9] = ZERO; +#else + b[ 8] = *(a05 + 8); + b[ 9] = *(a05 + 9); +#endif + b[ 10] = *(a06 + 8); + b[ 11] = *(a06 + 9); + b[ 12] = *(a07 + 8); + b[ 13] = *(a07 + 9); + b[ 14] = *(a08 + 8); + b[ 15] = *(a08 + 9); + b[ 16] = *(a09 + 8); + b[ 17] = 
*(a09 + 9); + b[ 18] = *(a10 + 8); + b[ 19] = *(a10 + 9); + b[ 20] = *(a11 + 8); + b[ 21] = *(a11 + 9); + b[ 22] = *(a12 + 8); + b[ 23] = *(a12 + 9); + b[ 24] = *(a13 + 8); + b[ 25] = *(a13 + 9); + b[ 26] = *(a14 + 8); + b[ 27] = *(a14 + 9); + b[ 28] = *(a15 + 8); + b[ 29] = *(a15 + 9); + b[ 30] = *(a16 + 8); + b[ 31] = *(a16 + 9); + b += 32; + } + + if (i >= 6) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b[ 8] = ZERO; + b[ 9] = ZERO; +#ifdef UNIT + b[10] = ONE; + b[11] = ZERO; +#else + b[10] = *(a06 + 10); + b[11] = *(a06 + 11); +#endif + b[ 12] = *(a07 + 10); + b[ 13] = *(a07 + 11); + b[ 14] = *(a08 + 10); + b[ 15] = *(a08 + 11); + b[ 16] = *(a09 + 10); + b[ 17] = *(a09 + 11); + b[ 18] = *(a10 + 10); + b[ 19] = *(a10 + 11); + b[ 20] = *(a11 + 10); + b[ 21] = *(a11 + 11); + b[ 22] = *(a12 + 10); + b[ 23] = *(a12 + 11); + b[ 24] = *(a13 + 10); + b[ 25] = *(a13 + 11); + b[ 26] = *(a14 + 10); + b[ 27] = *(a14 + 11); + b[ 28] = *(a15 + 10); + b[ 29] = *(a15 + 11); + b[ 30] = *(a16 + 10); + b[ 31] = *(a16 + 11); + b += 32; + } + + if (i >= 7) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b[ 8] = ZERO; + b[ 9] = ZERO; + b[10] = ZERO; + b[11] = ZERO; +#ifdef UNIT + b[12] = ONE; + b[13] = ZERO; +#else + b[12] = *(a07 + 12); + b[13] = *(a07 + 13); +#endif + b[ 14] = *(a08 + 12); + b[ 15] = *(a08 + 13); + b[ 16] = *(a09 + 12); + b[ 17] = *(a09 + 13); + b[ 18] = *(a10 + 12); + b[ 19] = *(a10 + 13); + b[ 20] = *(a11 + 12); + b[ 21] = *(a11 + 13); + b[ 22] = *(a12 + 12); + b[ 23] = *(a12 + 13); + b[ 24] = *(a13 + 12); + b[ 25] = *(a13 + 13); + b[ 26] = *(a14 + 12); + b[ 27] = *(a14 + 13); + b[ 28] = *(a15 + 12); + b[ 29] = *(a15 + 13); + b[ 30] = *(a16 + 12); + b[ 31] = *(a16 + 13); + b += 32; + } + + if (i >= 8) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = 
ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b[ 8] = ZERO; + b[ 9] = ZERO; + b[10] = ZERO; + b[11] = ZERO; + b[12] = ZERO; + b[13] = ZERO; +#ifdef UNIT + b[ 14] = ONE; + b[ 15] = ZERO; +#else + b[ 14] = *(a08 + 14); + b[ 15] = *(a08 + 15); +#endif + b[ 16] = *(a09 + 14); + b[ 17] = *(a09 + 15); + b[ 18] = *(a10 + 14); + b[ 19] = *(a10 + 15); + b[ 20] = *(a11 + 14); + b[ 21] = *(a11 + 15); + b[ 22] = *(a12 + 14); + b[ 23] = *(a12 + 15); + b[ 24] = *(a13 + 14); + b[ 25] = *(a13 + 15); + b[ 26] = *(a14 + 14); + b[ 27] = *(a14 + 15); + b[ 28] = *(a15 + 14); + b[ 29] = *(a15 + 15); + b[ 30] = *(a16 + 14); + b[ 31] = *(a16 + 15); + b += 32; + } + + if (i >= 9) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b[ 8] = ZERO; + b[ 9] = ZERO; + b[10] = ZERO; + b[11] = ZERO; + b[12] = ZERO; + b[13] = ZERO; + b[ 14] = ZERO; + b[ 15] = ZERO; +#ifdef UNIT + b[ 16] = ONE; + b[ 17] = ZERO; +#else + b[ 16] = *(a09 + 16); + b[ 17] = *(a09 + 17); +#endif + b[ 18] = *(a10 + 16); + b[ 19] = *(a10 + 17); + b[ 20] = *(a11 + 16); + b[ 21] = *(a11 + 17); + b[ 22] = *(a12 + 16); + b[ 23] = *(a12 + 17); + b[ 24] = *(a13 + 16); + b[ 25] = *(a13 + 17); + b[ 26] = *(a14 + 16); + b[ 27] = *(a14 + 17); + b[ 28] = *(a15 + 16); + b[ 29] = *(a15 + 17); + b[ 30] = *(a16 + 16); + b[ 31] = *(a16 + 17); + b += 32; + } + + if (i >= 10) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b[ 8] = ZERO; + b[ 9] = ZERO; + b[10] = ZERO; + b[11] = ZERO; + b[12] = ZERO; + b[13] = ZERO; + b[ 14] = ZERO; + b[ 15] = ZERO; + b[ 16] = ZERO; + b[ 17] = ZERO; +#ifdef UNIT + b[ 18] = ONE; + b[ 19] = ZERO; +#else + b[ 18] = *(a10 + 18); + b[ 19] = *(a10 + 19); +#endif + b[ 20] = *(a11 + 18); + b[ 21] = *(a11 + 19); + b[ 22] = *(a12 + 18); + b[ 23] = *(a12 + 19); + b[ 24] = *(a13 + 18); + b[ 25] = *(a13 + 19); + b[ 26] = *(a14 + 18); + b[ 27] = *(a14 + 19); + b[ 28] 
= *(a15 + 18); + b[ 29] = *(a15 + 19); + b[ 30] = *(a16 + 18); + b[ 31] = *(a16 + 19); + b += 32; + } + + if (i >= 11) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b[ 8] = ZERO; + b[ 9] = ZERO; + b[10] = ZERO; + b[11] = ZERO; + b[12] = ZERO; + b[13] = ZERO; + b[ 14] = ZERO; + b[ 15] = ZERO; + b[ 16] = ZERO; + b[ 17] = ZERO; + b[ 18] = ZERO; + b[ 19] = ZERO; +#ifdef UNIT + b[ 20] = ONE; + b[ 21] = ZERO; +#else + b[ 20] = *(a11 + 20); + b[ 21] = *(a11 + 21); +#endif + b[ 22] = *(a12 + 20); + b[ 23] = *(a12 + 21); + b[ 24] = *(a13 + 20); + b[ 25] = *(a13 + 21); + b[ 26] = *(a14 + 20); + b[ 27] = *(a14 + 21); + b[ 28] = *(a15 + 20); + b[ 29] = *(a15 + 21); + b[ 30] = *(a16 + 20); + b[ 31] = *(a16 + 21); + b += 32; + } + + if (i >= 12) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b[ 8] = ZERO; + b[ 9] = ZERO; + b[10] = ZERO; + b[11] = ZERO; + b[12] = ZERO; + b[13] = ZERO; + b[ 14] = ZERO; + b[ 15] = ZERO; + b[ 16] = ZERO; + b[ 17] = ZERO; + b[ 18] = ZERO; + b[ 19] = ZERO; + b[ 20] = ZERO; + b[ 21] = ZERO; +#ifdef UNIT + b[ 22] = ONE; + b[ 23] = ZERO; +#else + b[ 22] = *(a12 + 22); + b[ 23] = *(a12 + 23); +#endif + b[ 24] = *(a13 + 22); + b[ 25] = *(a13 + 23); + b[ 26] = *(a14 + 22); + b[ 27] = *(a14 + 23); + b[ 28] = *(a15 + 22); + b[ 29] = *(a15 + 23); + b[ 30] = *(a16 + 22); + b[ 31] = *(a16 + 23); + b += 32; + } + + if (i >= 13) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b[ 8] = ZERO; + b[ 9] = ZERO; + b[10] = ZERO; + b[11] = ZERO; + b[12] = ZERO; + b[13] = ZERO; + b[ 14] = ZERO; + b[ 15] = ZERO; + b[ 16] = ZERO; + b[ 17] = ZERO; + b[ 18] = ZERO; + b[ 19] = ZERO; + b[ 20] = ZERO; + b[ 21] = ZERO; + b[ 22] = ZERO; + b[ 23] = ZERO; +#ifdef UNIT + b[ 24] = ONE; + b[ 25] = ZERO; +#else + b[ 24] = *(a13 + 24); + 
b[ 25] = *(a13 + 25); +#endif + b[ 26] = *(a14 + 24); + b[ 27] = *(a14 + 25); + b[ 28] = *(a15 + 24); + b[ 29] = *(a15 + 25); + b[ 30] = *(a16 + 24); + b[ 31] = *(a16 + 25); + b += 32; + } + + if (i >= 14) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b[ 8] = ZERO; + b[ 9] = ZERO; + b[10] = ZERO; + b[11] = ZERO; + b[12] = ZERO; + b[13] = ZERO; + b[ 14] = ZERO; + b[ 15] = ZERO; + b[ 16] = ZERO; + b[ 17] = ZERO; + b[ 18] = ZERO; + b[ 19] = ZERO; + b[ 20] = ZERO; + b[ 21] = ZERO; + b[ 22] = ZERO; + b[ 23] = ZERO; + b[ 24] = ZERO; + b[ 25] = ZERO; +#ifdef UNIT + b[ 26] = ONE; + b[ 27] = ZERO; +#else + b[ 26] = *(a14 + 26); + b[ 27] = *(a14 + 27); +#endif + b[ 28] = *(a15 + 26); + b[ 29] = *(a15 + 27); + b[ 30] = *(a16 + 26); + b[ 31] = *(a16 + 27); + b += 32; + } + + if (i >= 15) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b[ 8] = ZERO; + b[ 9] = ZERO; + b[10] = ZERO; + b[11] = ZERO; + b[12] = ZERO; + b[13] = ZERO; + b[ 14] = ZERO; + b[ 15] = ZERO; + b[ 16] = ZERO; + b[ 17] = ZERO; + b[ 18] = ZERO; + b[ 19] = ZERO; + b[ 20] = ZERO; + b[ 21] = ZERO; + b[ 22] = ZERO; + b[ 23] = ZERO; + b[ 24] = ZERO; + b[ 25] = ZERO; + b[ 26] = ZERO; + b[ 27] = ZERO; +#ifdef UNIT + b[ 28] = ONE; + b[ 29] = ZERO; +#else + b[ 28] = *(a15 + 28); + b[ 29] = *(a15 + 29); +#endif + b[ 30] = *(a16 + 28); + b[ 31] = *(a16 + 29); + b += 32; + } + } + } + + posY += 16; + js --; + } while (js > 0); + } /* End of main loop */ + + + if (n & 8){ + X = posX; + + if (posX <= posY) { + a01 = a + posX * 2 + (posY + 0) * lda; + a02 = a + posX * 2 + (posY + 1) * lda; + a03 = a + posX * 2 + (posY + 2) * lda; + a04 = a + posX * 2 + (posY + 3) * lda; + a05 = a + posX * 2 + (posY + 4) * lda; + a06 = a + posX * 2 + (posY + 5) * lda; + a07 = a + posX * 2 + (posY + 6) * lda; + a08 = a + posX * 2 + (posY + 7) * lda; + } else { + a01 = a + posY 
* 2 + (posX + 0) * lda; + a02 = a + posY * 2 + (posX + 1) * lda; + a03 = a + posY * 2 + (posX + 2) * lda; + a04 = a + posY * 2 + (posX + 3) * lda; + a05 = a + posY * 2 + (posX + 4) * lda; + a06 = a + posY * 2 + (posX + 5) * lda; + a07 = a + posY * 2 + (posX + 6) * lda; + a08 = a + posY * 2 + (posX + 7) * lda; + } + + i = (m >> 3); + if (i > 0) { + do { + if (X < posY) { + for (ii = 0; ii < 8; ii++){ + + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); + b[ 2] = *(a02 + 0); + b[ 3] = *(a02 + 1); + b[ 4] = *(a03 + 0); + b[ 5] = *(a03 + 1); + b[ 6] = *(a04 + 0); + b[ 7] = *(a04 + 1); + + b[ 8] = *(a05 + 0); + b[ 9] = *(a05 + 1); + b[ 10] = *(a06 + 0); + b[ 11] = *(a06 + 1); + b[ 12] = *(a07 + 0); + b[ 13] = *(a07 + 1); + b[ 14] = *(a08 + 0); + b[ 15] = *(a08 + 1); + + a01 += 2; + a02 += 2; + a03 += 2; + a04 += 2; + a05 += 2; + a06 += 2; + a07 += 2; + a08 += 2; + b += 16; + } + } else + if (X > posY) { + a01 += 8 * lda; + a02 += 8 * lda; + a03 += 8 * lda; + a04 += 8 * lda; + a05 += 8 * lda; + a06 += 8 * lda; + a07 += 8 * lda; + a08 += 8 * lda; + + b += 128; + } else { +#ifdef UNIT + b[ 0] = ONE; + b[ 1] = ZERO; +#else + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); +#endif + b[ 2] = *(a02 + 0); + b[ 3] = *(a02 + 1); + b[ 4] = *(a03 + 0); + b[ 5] = *(a03 + 1); + b[ 6] = *(a04 + 0); + b[ 7] = *(a04 + 1); + + b[ 8] = *(a05 + 0); + b[ 9] = *(a05 + 1); + b[ 10] = *(a06 + 0); + b[ 11] = *(a06 + 1); + b[ 12] = *(a07 + 0); + b[ 13] = *(a07 + 1); + b[ 14] = *(a08 + 0); + b[ 15] = *(a08 + 1); + + b[ 16] = ZERO; + b[ 17] = ZERO; +#ifdef UNIT + b[ 18] = ONE; + b[ 19] = ZERO; +#else + b[ 18] = *(a02 + 2); + b[ 19] = *(a02 + 3); +#endif + b[ 20] = *(a03 + 2); + b[ 21] = *(a03 + 3); + b[ 22] = *(a04 + 2); + b[ 23] = *(a04 + 3); + b[ 24] = *(a05 + 2); + b[ 25] = *(a05 + 3); + b[ 26] = *(a06 + 2); + b[ 27] = *(a06 + 3); + b[ 28] = *(a07 + 2); + b[ 29] = *(a07 + 3); + b[ 30] = *(a08 + 2); + b[ 31] = *(a08 + 3); + + b[ 32] = ZERO; + b[ 33] = ZERO; + b[ 34] = ZERO; + b[ 35] = ZERO; +#ifdef UNIT + b[ 
36] = ONE; + b[ 37] = ZERO; +#else + b[ 36] = *(a03 + 4); + b[ 37] = *(a03 + 5); +#endif + b[ 38] = *(a04 + 4); + b[ 39] = *(a04 + 5); + b[ 40] = *(a05 + 4); + b[ 41] = *(a05 + 5); + b[ 42] = *(a06 + 4); + b[ 43] = *(a06 + 5); + b[ 44] = *(a07 + 4); + b[ 45] = *(a07 + 5); + b[ 46] = *(a08 + 4); + b[ 47] = *(a08 + 5); + + b[ 48] = ZERO; + b[ 49] = ZERO; + b[ 50] = ZERO; + b[ 51] = ZERO; + b[ 52] = ZERO; + b[ 53] = ZERO; +#ifdef UNIT + b[ 54] = ONE; + b[ 55] = ZERO; +#else + b[ 54] = *(a04 + 6); + b[ 55] = *(a04 + 7); +#endif + b[ 56] = *(a05 + 6); + b[ 57] = *(a05 + 7); + b[ 58] = *(a06 + 6); + b[ 59] = *(a06 + 7); + b[ 60] = *(a07 + 6); + b[ 61] = *(a07 + 7); + b[ 62] = *(a08 + 6); + b[ 63] = *(a08 + 7); + + b[ 64] = ZERO; + b[ 65] = ZERO; + b[ 66] = ZERO; + b[ 67] = ZERO; + b[ 68] = ZERO; + b[ 69] = ZERO; + b[ 70] = ZERO; + b[ 71] = ZERO; +#ifdef UNIT + b[ 72] = ONE; + b[ 73] = ZERO; +#else + b[ 72] = *(a05 + 8); + b[ 73] = *(a05 + 9); +#endif + b[ 74] = *(a06 + 8); + b[ 75] = *(a06 + 9); + b[ 76] = *(a07 + 8); + b[ 77] = *(a07 + 9); + b[ 78] = *(a08 + 8); + b[ 79] = *(a08 + 9); + + b[ 80] = ZERO; + b[ 81] = ZERO; + b[ 82] = ZERO; + b[ 83] = ZERO; + b[ 84] = ZERO; + b[ 85] = ZERO; + b[ 86] = ZERO; + b[ 87] = ZERO; + b[ 88] = ZERO; + b[ 89] = ZERO; +#ifdef UNIT + b[ 90] = ONE; + b[ 91] = ZERO; +#else + b[ 90] = *(a06 + 10); + b[ 91] = *(a06 + 11); +#endif + b[ 92] = *(a07 + 10); + b[ 93] = *(a07 + 11); + b[ 94] = *(a08 + 10); + b[ 95] = *(a08 + 11); + + b[ 96] = ZERO; + b[ 97] = ZERO; + b[ 98] = ZERO; + b[ 99] = ZERO; + b[100] = ZERO; + b[101] = ZERO; + b[102] = ZERO; + b[103] = ZERO; + b[104] = ZERO; + b[105] = ZERO; + b[106] = ZERO; + b[107] = ZERO; +#ifdef UNIT + b[108] = ONE; + b[109] = ZERO; +#else + b[108] = *(a07 + 12); + b[109] = *(a07 + 13); +#endif + b[110] = *(a08 + 12); + b[111] = *(a08 + 13); + + b[112] = ZERO; + b[113] = ZERO; + b[114] = ZERO; + b[115] = ZERO; + b[116] = ZERO; + b[117] = ZERO; + b[118] = ZERO; + b[119] = ZERO; + b[120] = ZERO; + 
b[121] = ZERO; + b[122] = ZERO; + b[123] = ZERO; + b[124] = ZERO; + b[125] = ZERO; +#ifdef UNIT + b[126] = ONE; + b[127] = ZERO; +#else + b[126] = *(a08 + 14); + b[127] = *(a08 + 15); +#endif + + a01 += 8 * lda; + a02 += 8 * lda; + a03 += 8 * lda; + a04 += 8 * lda; + a05 += 8 * lda; + a06 += 8 * lda; + a07 += 8 * lda; + a08 += 8 * lda; + b += 128; + } + + X += 8; + i --; + } while (i > 0); + } + + i = (m & 7); + if (i) { + + if (X < posY) { + for (ii = 0; ii < i; ii++){ + + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); + b[ 2] = *(a02 + 0); + b[ 3] = *(a02 + 1); + b[ 4] = *(a03 + 0); + b[ 5] = *(a03 + 1); + b[ 6] = *(a04 + 0); + b[ 7] = *(a04 + 1); + + b[ 8] = *(a05 + 0); + b[ 9] = *(a05 + 1); + b[ 10] = *(a06 + 0); + b[ 11] = *(a06 + 1); + b[ 12] = *(a07 + 0); + b[ 13] = *(a07 + 1); + b[ 14] = *(a08 + 0); + b[ 15] = *(a08 + 1); + + a01 += 2; + a02 += 2; + a03 += 2; + a04 += 2; + a05 += 2; + a06 += 2; + a07 += 2; + a08 += 2; + b += 16; + } + } else + if (X > posY) { + /* a01 += i * lda; + a02 += i * lda; + a03 += i * lda; + a04 += i * lda; + a05 += i * lda; + a06 += i * lda; + a07 += i * lda; + a08 += i * lda; */ + b += 16 * i; + } else { +#ifdef UNIT + b[ 0] = ONE; + b[ 1] = ZERO; +#else + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); +#endif + b[ 2] = *(a02 + 0); + b[ 3] = *(a02 + 1); + b[ 4] = *(a03 + 0); + b[ 5] = *(a03 + 1); + b[ 6] = *(a04 + 0); + b[ 7] = *(a04 + 1); + b[ 8] = *(a05 + 0); + b[ 9] = *(a05 + 1); + b[10] = *(a06 + 0); + b[11] = *(a06 + 1); + b[12] = *(a07 + 0); + b[13] = *(a07 + 1); + b[14] = *(a08 + 0); + b[15] = *(a08 + 1); + b += 16; + + if(i >= 2) { + b[ 0] = ZERO; + b[ 1] = ZERO; +#ifdef UNIT + b[ 2] = ONE; + b[ 3] = ZERO; +#else + b[ 2] = *(a02 + 2); + b[ 3] = *(a02 + 3); +#endif + b[ 4] = *(a03 + 2); + b[ 5] = *(a03 + 3); + b[ 6] = *(a04 + 2); + b[ 7] = *(a04 + 3); + b[ 8] = *(a05 + 2); + b[ 9] = *(a05 + 3); + b[10] = *(a06 + 2); + b[11] = *(a06 + 3); + b[12] = *(a07 + 2); + b[13] = *(a07 + 3); + b[14] = *(a08 + 2); + b[15] = *(a08 + 3); + b += 
16; + } + + if (i >= 3) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; +#ifdef UNIT + b[ 4] = ONE; + b[ 5] = ZERO; +#else + b[ 4] = *(a03 + 4); + b[ 5] = *(a03 + 5); +#endif + b[ 6] = *(a04 + 4); + b[ 7] = *(a04 + 5); + b[ 8] = *(a05 + 4); + b[ 9] = *(a05 + 5); + b[10] = *(a06 + 4); + b[11] = *(a06 + 5); + b[12] = *(a07 + 4); + b[13] = *(a07 + 5); + b[14] = *(a08 + 4); + b[15] = *(a08 + 5); + b += 16; + } + + if (i >= 4) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; +#ifdef UNIT + b[ 6] = ONE; + b[ 7] = ZERO; +#else + b[ 6] = *(a04 + 6); + b[ 7] = *(a04 + 7); +#endif + b[ 8] = *(a05 + 6); + b[ 9] = *(a05 + 7); + b[10] = *(a06 + 6); + b[11] = *(a06 + 7); + b[12] = *(a07 + 6); + b[13] = *(a07 + 7); + b[14] = *(a08 + 6); + b[15] = *(a08 + 7); + b += 16; + } + + if (i >= 5) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; +#ifdef UNIT + b[ 8] = ONE; + b[ 9] = ZERO; +#else + b[ 8] = *(a05 + 8); + b[ 9] = *(a05 + 9); +#endif + b[10] = *(a06 + 8); + b[11] = *(a06 + 9); + b[12] = *(a07 + 8); + b[13] = *(a07 + 9); + b[14] = *(a08 + 8); + b[15] = *(a08 + 9); + b += 16; + } + + if (i >= 6) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b[ 8] = ZERO; + b[ 9] = ZERO; +#ifdef UNIT + b[10] = ONE; + b[11] = ZERO; +#else + b[10] = *(a06 + 10); + b[11] = *(a06 + 11); +#endif + b[12] = *(a07 + 10); + b[13] = *(a07 + 11); + b[14] = *(a08 + 10); + b[15] = *(a08 + 11); + b += 16; + } + + if (i >= 7) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b[ 8] = ZERO; + b[ 9] = ZERO; + b[10] = ZERO; + b[11] = ZERO; +#ifdef UNIT + b[12] = ONE; + b[13] = ZERO; +#else + b[12] = *(a07 + 12); + b[13] = *(a07 + 13); +#endif + b[14] = *(a08 + 12); + b[15] = *(a08 + 13); + b += 16; + } + 
} + } + + posY += 8; + } + + + if (n & 4){ + X = posX; + + if (posX <= posY) { + a01 = a + posX * 2 + (posY + 0) * lda; + a02 = a + posX * 2 + (posY + 1) * lda; + a03 = a + posX * 2 + (posY + 2) * lda; + a04 = a + posX * 2 + (posY + 3) * lda; + } else { + a01 = a + posY * 2 + (posX + 0) * lda; + a02 = a + posY * 2 + (posX + 1) * lda; + a03 = a + posY * 2 + (posX + 2) * lda; + a04 = a + posY * 2 + (posX + 3) * lda; + } + + i = (m >> 2); + if (i > 0) { + do { + if (X < posY) { + for (ii = 0; ii < 4; ii++){ + + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); + b[ 2] = *(a02 + 0); + b[ 3] = *(a02 + 1); + b[ 4] = *(a03 + 0); + b[ 5] = *(a03 + 1); + b[ 6] = *(a04 + 0); + b[ 7] = *(a04 + 1); + + a01 += 2; + a02 += 2; + a03 += 2; + a04 += 2; + b += 8; + } + } else + if (X > posY) { + a01 += 4 * lda; + a02 += 4 * lda; + a03 += 4 * lda; + a04 += 4 * lda; + b += 32; + } else { +#ifdef UNIT + b[ 0] = ONE; + b[ 1] = ZERO; +#else + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); +#endif + b[ 2] = *(a02 + 0); + b[ 3] = *(a02 + 1); + b[ 4] = *(a03 + 0); + b[ 5] = *(a03 + 1); + b[ 6] = *(a04 + 0); + b[ 7] = *(a04 + 1); + + b[ 8] = ZERO; + b[ 9] = ZERO; +#ifdef UNIT + b[ 10] = ONE; + b[ 11] = ZERO; +#else + b[ 10] = *(a02 + 2); + b[ 11] = *(a02 + 3); +#endif + b[ 12] = *(a03 + 2); + b[ 13] = *(a03 + 3); + b[ 14] = *(a04 + 2); + b[ 15] = *(a04 + 3); + + b[ 16] = ZERO; + b[ 17] = ZERO; + b[ 18] = ZERO; + b[ 19] = ZERO; +#ifdef UNIT + b[ 20] = ONE; + b[ 21] = ZERO; +#else + b[ 20] = *(a03 + 4); + b[ 21] = *(a03 + 5); +#endif + b[ 22] = *(a04 + 4); + b[ 23] = *(a04 + 5); + + b[ 24] = ZERO; + b[ 25] = ZERO; + b[ 26] = ZERO; + b[ 27] = ZERO; + b[ 28] = ZERO; + b[ 29] = ZERO; +#ifdef UNIT + b[ 30] = ONE; + b[ 31] = ZERO; +#else + b[ 30] = *(a04 + 6); + b[ 31] = *(a04 + 7); +#endif + + a01 += 4 * lda; + a02 += 4 * lda; + a03 += 4 * lda; + a04 += 4 * lda; + + b += 32; + } + + X += 4; + i --; + } while (i > 0); + } + + i = (m & 3); + if (i) { + + if (X < posY) { + + for (ii = 0; ii < i; ii++){ + b[ 0] = 
*(a01 + 0); + b[ 1] = *(a01 + 1); + b[ 2] = *(a02 + 0); + b[ 3] = *(a02 + 1); + b[ 4] = *(a03 + 0); + b[ 5] = *(a03 + 1); + b[ 6] = *(a04 + 0); + b[ 7] = *(a04 + 1); + + a01 += 2; + a02 += 2; + a03 += 2; + a04 += 2; + b += 8; + } + } else + if (X > posY) { + /* a01 += i * lda; + a02 += i * lda; + a03 += i * lda; + a04 += i * lda; */ + b += 8 * i; + } else { +#ifdef UNIT + b[ 0] = ONE; + b[ 1] = ZERO; +#else + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); +#endif + b[ 2] = *(a02 + 0); + b[ 3] = *(a02 + 1); + b[ 4] = *(a03 + 0); + b[ 5] = *(a03 + 1); + b[ 6] = *(a04 + 0); + b[ 7] = *(a04 + 1); + b += 8; + + if(i >= 2) { + b[ 0] = ZERO; + b[ 1] = ZERO; +#ifdef UNIT + b[ 2] = ONE; + b[ 3] = ZERO; +#else + b[ 2] = *(a02 + 2); + b[ 3] = *(a02 + 3); +#endif + b[ 4] = *(a03 + 2); + b[ 5] = *(a03 + 3); + b[ 6] = *(a04 + 2); + b[ 7] = *(a04 + 3); + b += 8; + } + + if (i >= 3) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; +#ifdef UNIT + b[ 4] = ONE; + b[ 5] = ZERO; +#else + b[ 4] = *(a03 + 4); + b[ 5] = *(a03 + 5); +#endif + b[ 6] = *(a04 + 4); + b[ 7] = *(a04 + 5); + b += 8; + } + } + } + + posY += 4; + } + + if (n & 2){ + X = posX; + + if (posX <= posY) { + a01 = a + posX * 2 + (posY + 0) * lda; + a02 = a + posX * 2 + (posY + 1) * lda; + } else { + a01 = a + posY * 2 + (posX + 0) * lda; + a02 = a + posY * 2 + (posX + 1) * lda; + } + + i = (m >> 1); + if (i > 0) { + do { + if (X < posY) { + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); + b[ 2] = *(a02 + 0); + b[ 3] = *(a02 + 1); + b[ 4] = *(a01 + 2); + b[ 5] = *(a01 + 3); + b[ 6] = *(a02 + 2); + b[ 7] = *(a02 + 3); + + a01 += 4; + a02 += 4; + b += 8; + } else + if (X > posY) { + a01 += 2 * lda; + a02 += 2 * lda; + b += 8; + } else { +#ifdef UNIT + b[ 0] = ONE; + b[ 1] = ZERO; +#else + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); +#endif + b[ 2] = *(a02 + 0); + b[ 3] = *(a02 + 1); + + b[ 4] = ZERO; + b[ 5] = ZERO; +#ifdef UNIT + b[ 6] = ONE; + b[ 7] = ZERO; +#else + b[ 6] = *(a02 + 2); + b[ 7] = *(a02 + 3); +#endif + 
+ a01 += 2 * lda; + a02 += 2 * lda; + b += 8; + } + + X += 2; + i --; + } while (i > 0); + } + + if (m & 1) { + + if (X < posY) { + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); + b[ 2] = *(a02 + 0); + b[ 3] = *(a02 + 1); + /* a01 += 2; + a02 += 2; */ + b += 4; + } else + if (X > posY) { + /* a01 += 2 * lda; + a02 += 2 * lda; */ + b += 4; + } else { +#ifdef UNIT + b[ 0] = ONE; + b[ 1] = ZERO; + b[ 2] = *(a02 + 0); + b[ 3] = *(a02 + 1); +#else + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); + b[ 2] = *(a02 + 0); + b[ 3] = *(a02 + 1); +#endif + b += 2; + } + } + posY += 2; + } + + if (n & 1){ + X = posX; + + if (posX <= posY) { + a01 = a + posX * 2 + (posY + 0) * lda; + } else { + a01 = a + posY * 2 + (posX + 0) * lda; + } + + i = m; + if (m > 0) { + do { + if (X < posY) { + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); + a01 += 2; + b += 2; + } else + if (X > posY) { + a01 += lda; + b += 2; + } else { +#ifdef UNIT + b[ 0] = ONE; + b[ 1] = ZERO; +#else + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); +#endif + a01 += lda; + b += 2; + } + + X += 1; + i --; + } while (i > 0); + } + } + + return 0; +} diff --git a/kernel/generic/ztrmm_utcopy_16.c b/kernel/generic/ztrmm_utcopy_16.c new file mode 100644 index 000000000..5aba3727a --- /dev/null +++ b/kernel/generic/ztrmm_utcopy_16.c @@ -0,0 +1,2318 @@ +/******************************************************************************* +Copyright (c) 2024, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. 
Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, js; + BLASLONG X, ii; + + FLOAT *a01, *a02, *a03, *a04, *a05, *a06, *a07, *a08; + FLOAT *a09, *a10, *a11, *a12, *a13, *a14, *a15, *a16; + + lda += lda; + + js = (n >> 4); + + if (js > 0){ + do { + X = posX; + + if (posX <= posY) { + a01 = a + posX * 2 + (posY + 0) * lda; + a02 = a + posX * 2 + (posY + 1) * lda; + a03 = a + posX * 2 + (posY + 2) * lda; + a04 = a + posX * 2 + (posY + 3) * lda; + a05 = a + posX * 2 + (posY + 4) * lda; + a06 = a + posX * 2 + (posY + 5) * lda; + a07 = a + posX * 2 + (posY + 6) * lda; + a08 = a + posX * 2 + (posY + 7) * lda; + a09 = a + posX * 2 + (posY + 8) * lda; + a10 = a + posX * 2 + (posY + 9) * lda; + a11 = a + posX * 2 + (posY + 10) * lda; + a12 = a + posX * 2 + (posY + 11) * lda; + a13 = a + posX * 2 + (posY + 12) * lda; + a14 = a + posX * 2 + (posY + 13) * lda; + 
a15 = a + posX * 2 + (posY + 14) * lda; + a16 = a + posX * 2 + (posY + 15) * lda; + } else { + a01 = a + posY * 2 + (posX + 0) * lda; + a02 = a + posY * 2 + (posX + 1) * lda; + a03 = a + posY * 2 + (posX + 2) * lda; + a04 = a + posY * 2 + (posX + 3) * lda; + a05 = a + posY * 2 + (posX + 4) * lda; + a06 = a + posY * 2 + (posX + 5) * lda; + a07 = a + posY * 2 + (posX + 6) * lda; + a08 = a + posY * 2 + (posX + 7) * lda; + a09 = a + posY * 2 + (posX + 8) * lda; + a10 = a + posY * 2 + (posX + 9) * lda; + a11 = a + posY * 2 + (posX + 10) * lda; + a12 = a + posY * 2 + (posX + 11) * lda; + a13 = a + posY * 2 + (posX + 12) * lda; + a14 = a + posY * 2 + (posX + 13) * lda; + a15 = a + posY * 2 + (posX + 14) * lda; + a16 = a + posY * 2 + (posX + 15) * lda; + } + + i = (m >> 4); + if (i > 0) { + do { + if (X < posY) { + a01 += 32; + a02 += 32; + a03 += 32; + a04 += 32; + a05 += 32; + a06 += 32; + a07 += 32; + a08 += 32; + a09 += 32; + a10 += 32; + a11 += 32; + a12 += 32; + a13 += 32; + a14 += 32; + a15 += 32; + a16 += 32; + + b += 512; + } else + if (X > posY) { + for (ii = 0; ii < 16; ii++){ + + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); + b[ 2] = *(a01 + 2); + b[ 3] = *(a01 + 3); + b[ 4] = *(a01 + 4); + b[ 5] = *(a01 + 5); + b[ 6] = *(a01 + 6); + b[ 7] = *(a01 + 7); + + b[ 8] = *(a01 + 8); + b[ 9] = *(a01 + 9); + b[ 10] = *(a01 + 10); + b[ 11] = *(a01 + 11); + b[ 12] = *(a01 + 12); + b[ 13] = *(a01 + 13); + b[ 14] = *(a01 + 14); + b[ 15] = *(a01 + 15); + + b[ 16] = *(a01 + 16); + b[ 17] = *(a01 + 17); + b[ 18] = *(a01 + 18); + b[ 19] = *(a01 + 19); + b[ 20] = *(a01 + 20); + b[ 21] = *(a01 + 21); + b[ 22] = *(a01 + 22); + b[ 23] = *(a01 + 23); + + b[ 24] = *(a01 + 24); + b[ 25] = *(a01 + 25); + b[ 26] = *(a01 + 26); + b[ 27] = *(a01 + 27); + b[ 28] = *(a01 + 28); + b[ 29] = *(a01 + 29); + b[ 30] = *(a01 + 30); + b[ 31] = *(a01 + 31); + + a01 += lda; + b += 32; + } + + a02 += 16 * lda; + a03 += 16 * lda; + a04 += 16 * lda; + a05 += 16 * lda; + a06 += 16 * lda; + a07 += 16 * lda; 
+ a08 += 16 * lda; + a09 += 16 * lda; + a10 += 16 * lda; + a11 += 16 * lda; + a12 += 16 * lda; + a13 += 16 * lda; + a14 += 16 * lda; + a15 += 16 * lda; + a16 += 16 * lda; + + } else { +#ifdef UNIT + b[ 0] = ONE; + b[ 1] = ZERO; +#else + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); +#endif + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b[ 8] = ZERO; + b[ 9] = ZERO; + b[ 10] = ZERO; + b[ 11] = ZERO; + b[ 12] = ZERO; + b[ 13] = ZERO; + b[ 14] = ZERO; + b[ 15] = ZERO; + b[ 16] = ZERO; + b[ 17] = ZERO; + b[ 18] = ZERO; + b[ 19] = ZERO; + b[ 20] = ZERO; + b[ 21] = ZERO; + b[ 22] = ZERO; + b[ 23] = ZERO; + b[ 24] = ZERO; + b[ 25] = ZERO; + b[ 26] = ZERO; + b[ 27] = ZERO; + b[ 28] = ZERO; + b[ 29] = ZERO; + b[ 30] = ZERO; + b[ 31] = ZERO; + + b[ 32] = *(a02 + 0); + b[ 33] = *(a02 + 1); +#ifdef UNIT + b[ 34] = ONE; + b[ 35] = ZERO; +#else + b[ 34] = *(a02 + 2); + b[ 35] = *(a02 + 3); +#endif + b[ 36] = ZERO; + b[ 37] = ZERO; + b[ 38] = ZERO; + b[ 39] = ZERO; + b[ 40] = ZERO; + b[ 41] = ZERO; + b[ 42] = ZERO; + b[ 43] = ZERO; + b[ 44] = ZERO; + b[ 45] = ZERO; + b[ 46] = ZERO; + b[ 47] = ZERO; + b[ 48] = ZERO; + b[ 49] = ZERO; + b[ 50] = ZERO; + b[ 51] = ZERO; + b[ 52] = ZERO; + b[ 53] = ZERO; + b[ 54] = ZERO; + b[ 55] = ZERO; + b[ 56] = ZERO; + b[ 57] = ZERO; + b[ 58] = ZERO; + b[ 59] = ZERO; + b[ 60] = ZERO; + b[ 61] = ZERO; + b[ 62] = ZERO; + b[ 63] = ZERO; + + b[ 64] = *(a03 + 0); + b[ 65] = *(a03 + 1); + b[ 66] = *(a03 + 2); + b[ 67] = *(a03 + 3); +#ifdef UNIT + b[ 68] = ONE; + b[ 69] = ZERO; +#else + b[ 68] = *(a03 + 4); + b[ 69] = *(a03 + 5); +#endif + b[ 70] = ZERO; + b[ 71] = ZERO; + b[ 72] = ZERO; + b[ 73] = ZERO; + b[ 74] = ZERO; + b[ 75] = ZERO; + b[ 76] = ZERO; + b[ 77] = ZERO; + b[ 78] = ZERO; + b[ 79] = ZERO; + b[ 80] = ZERO; + b[ 81] = ZERO; + b[ 82] = ZERO; + b[ 83] = ZERO; + b[ 84] = ZERO; + b[ 85] = ZERO; + b[ 86] = ZERO; + b[ 87] = ZERO; + b[ 88] = ZERO; + b[ 89] = ZERO; + b[ 90] = ZERO; + b[ 91] = ZERO; + b[ 
92] = ZERO; + b[ 93] = ZERO; + b[ 94] = ZERO; + b[ 95] = ZERO; + + b[ 96] = *(a04 + 0); + b[ 97] = *(a04 + 1); + b[ 98] = *(a04 + 2); + b[ 99] = *(a04 + 3); + b[100] = *(a04 + 4); + b[101] = *(a04 + 5); +#ifdef UNIT + b[102] = ONE; + b[103] = ZERO; +#else + b[102] = *(a04 + 6); + b[103] = *(a04 + 7); +#endif + b[104] = ZERO; + b[105] = ZERO; + b[106] = ZERO; + b[107] = ZERO; + b[108] = ZERO; + b[109] = ZERO; + b[110] = ZERO; + b[111] = ZERO; + b[112] = ZERO; + b[113] = ZERO; + b[114] = ZERO; + b[115] = ZERO; + b[116] = ZERO; + b[117] = ZERO; + b[118] = ZERO; + b[119] = ZERO; + b[120] = ZERO; + b[121] = ZERO; + b[122] = ZERO; + b[123] = ZERO; + b[124] = ZERO; + b[125] = ZERO; + b[126] = ZERO; + b[127] = ZERO; + + b[128] = *(a05 + 0); + b[129] = *(a05 + 1); + b[130] = *(a05 + 2); + b[131] = *(a05 + 3); + b[132] = *(a05 + 4); + b[133] = *(a05 + 5); + b[134] = *(a05 + 6); + b[135] = *(a05 + 7); +#ifdef UNIT + b[136] = ONE; + b[137] = ZERO; +#else + b[136] = *(a05 + 8); + b[137] = *(a05 + 9); +#endif + b[138] = ZERO; + b[139] = ZERO; + b[140] = ZERO; + b[141] = ZERO; + b[142] = ZERO; + b[143] = ZERO; + b[144] = ZERO; + b[145] = ZERO; + b[146] = ZERO; + b[147] = ZERO; + b[148] = ZERO; + b[149] = ZERO; + b[150] = ZERO; + b[151] = ZERO; + b[152] = ZERO; + b[153] = ZERO; + b[154] = ZERO; + b[155] = ZERO; + b[156] = ZERO; + b[157] = ZERO; + b[158] = ZERO; + b[159] = ZERO; + + b[160] = *(a06 + 0); + b[161] = *(a06 + 1); + b[162] = *(a06 + 2); + b[163] = *(a06 + 3); + b[164] = *(a06 + 4); + b[165] = *(a06 + 5); + b[166] = *(a06 + 6); + b[167] = *(a06 + 7); + b[168] = *(a06 + 8); + b[169] = *(a06 + 9); +#ifdef UNIT + b[170] = ONE; + b[171] = ZERO; +#else + b[170] = *(a06 + 10); + b[171] = *(a06 + 11); +#endif + b[172] = ZERO; + b[173] = ZERO; + b[174] = ZERO; + b[175] = ZERO; + b[176] = ZERO; + b[177] = ZERO; + b[178] = ZERO; + b[179] = ZERO; + b[180] = ZERO; + b[181] = ZERO; + b[182] = ZERO; + b[183] = ZERO; + b[184] = ZERO; + b[185] = ZERO; + b[186] = ZERO; + b[187] = ZERO; + 
b[188] = ZERO; + b[189] = ZERO; + b[190] = ZERO; + b[191] = ZERO; + + b[192] = *(a07 + 0); + b[193] = *(a07 + 1); + b[194] = *(a07 + 2); + b[195] = *(a07 + 3); + b[196] = *(a07 + 4); + b[197] = *(a07 + 5); + b[198] = *(a07 + 6); + b[199] = *(a07 + 7); + b[200] = *(a07 + 8); + b[201] = *(a07 + 9); + b[202] = *(a07 + 10); + b[203] = *(a07 + 11); +#ifdef UNIT + b[204] = ONE; + b[205] = ZERO; +#else + b[204] = *(a07 + 12); + b[205] = *(a07 + 13); +#endif + b[206] = ZERO; + b[207] = ZERO; + b[208] = ZERO; + b[209] = ZERO; + b[210] = ZERO; + b[211] = ZERO; + b[212] = ZERO; + b[213] = ZERO; + b[214] = ZERO; + b[215] = ZERO; + b[216] = ZERO; + b[217] = ZERO; + b[218] = ZERO; + b[219] = ZERO; + b[220] = ZERO; + b[221] = ZERO; + b[222] = ZERO; + b[223] = ZERO; + + b[224] = *(a08 + 0); + b[225] = *(a08 + 1); + b[226] = *(a08 + 2); + b[227] = *(a08 + 3); + b[228] = *(a08 + 4); + b[229] = *(a08 + 5); + b[230] = *(a08 + 6); + b[231] = *(a08 + 7); + b[232] = *(a08 + 8); + b[233] = *(a08 + 9); + b[234] = *(a08 + 10); + b[235] = *(a08 + 11); + b[236] = *(a08 + 12); + b[237] = *(a08 + 13); +#ifdef UNIT + b[238] = ONE; + b[239] = ZERO; +#else + b[238] = *(a08 + 14); + b[239] = *(a08 + 15); +#endif + b[240] = ZERO; + b[241] = ZERO; + b[242] = ZERO; + b[243] = ZERO; + b[244] = ZERO; + b[245] = ZERO; + b[246] = ZERO; + b[247] = ZERO; + b[248] = ZERO; + b[249] = ZERO; + b[250] = ZERO; + b[251] = ZERO; + b[252] = ZERO; + b[253] = ZERO; + b[254] = ZERO; + b[255] = ZERO; + + b[256] = *(a09 + 0); + b[257] = *(a09 + 1); + b[258] = *(a09 + 2); + b[259] = *(a09 + 3); + b[260] = *(a09 + 4); + b[261] = *(a09 + 5); + b[262] = *(a09 + 6); + b[263] = *(a09 + 7); + b[264] = *(a09 + 8); + b[265] = *(a09 + 9); + b[266] = *(a09 + 10); + b[267] = *(a09 + 11); + b[268] = *(a09 + 12); + b[269] = *(a09 + 13); + b[270] = *(a09 + 14); + b[271] = *(a09 + 15); +#ifdef UNIT + b[272] = ONE; + b[273] = ZERO; +#else + b[272] = *(a09 + 16); + b[273] = *(a09 + 17); +#endif + b[274] = ZERO; + b[275] = ZERO; + b[276] = 
ZERO; + b[277] = ZERO; + b[278] = ZERO; + b[279] = ZERO; + b[280] = ZERO; + b[281] = ZERO; + b[282] = ZERO; + b[283] = ZERO; + b[284] = ZERO; + b[285] = ZERO; + b[286] = ZERO; + b[287] = ZERO; + + b[288] = *(a10 + 0); + b[289] = *(a10 + 1); + b[290] = *(a10 + 2); + b[291] = *(a10 + 3); + b[292] = *(a10 + 4); + b[293] = *(a10 + 5); + b[294] = *(a10 + 6); + b[295] = *(a10 + 7); + b[296] = *(a10 + 8); + b[297] = *(a10 + 9); + b[298] = *(a10 + 10); + b[299] = *(a10 + 11); + b[300] = *(a10 + 12); + b[301] = *(a10 + 13); + b[302] = *(a10 + 14); + b[303] = *(a10 + 15); + b[304] = *(a10 + 16); + b[305] = *(a10 + 17); +#ifdef UNIT + b[306] = ONE; + b[307] = ZERO; +#else + b[306] = *(a10 + 18); + b[307] = *(a10 + 19); +#endif + b[308] = ZERO; + b[309] = ZERO; + b[310] = ZERO; + b[311] = ZERO; + b[312] = ZERO; + b[313] = ZERO; + b[314] = ZERO; + b[315] = ZERO; + b[316] = ZERO; + b[317] = ZERO; + b[318] = ZERO; + b[319] = ZERO; + + b[320] = *(a11 + 0); + b[321] = *(a11 + 1); + b[322] = *(a11 + 2); + b[323] = *(a11 + 3); + b[324] = *(a11 + 4); + b[325] = *(a11 + 5); + b[326] = *(a11 + 6); + b[327] = *(a11 + 7); + b[328] = *(a11 + 8); + b[329] = *(a11 + 9); + b[330] = *(a11 + 10); + b[331] = *(a11 + 11); + b[332] = *(a11 + 12); + b[333] = *(a11 + 13); + b[334] = *(a11 + 14); + b[335] = *(a11 + 15); + b[336] = *(a11 + 16); + b[337] = *(a11 + 17); + b[338] = *(a11 + 18); + b[339] = *(a11 + 19); +#ifdef UNIT + b[340] = ONE; + b[341] = ZERO; +#else + b[340] = *(a11 + 20); + b[341] = *(a11 + 21); +#endif + b[342] = ZERO; + b[343] = ZERO; + b[344] = ZERO; + b[345] = ZERO; + b[346] = ZERO; + b[347] = ZERO; + b[348] = ZERO; + b[349] = ZERO; + b[350] = ZERO; + b[351] = ZERO; + + b[352] = *(a12 + 0); + b[353] = *(a12 + 1); + b[354] = *(a12 + 2); + b[355] = *(a12 + 3); + b[356] = *(a12 + 4); + b[357] = *(a12 + 5); + b[358] = *(a12 + 6); + b[359] = *(a12 + 7); + b[360] = *(a12 + 8); + b[361] = *(a12 + 9); + b[362] = *(a12 + 10); + b[363] = *(a12 + 11); + b[364] = *(a12 + 12); + b[365] = 
*(a12 + 13); + b[366] = *(a12 + 14); + b[367] = *(a12 + 15); + b[368] = *(a12 + 16); + b[369] = *(a12 + 17); + b[370] = *(a12 + 18); + b[371] = *(a12 + 19); + b[372] = *(a12 + 20); + b[373] = *(a12 + 21); +#ifdef UNIT + b[374] = ONE; + b[375] = ZERO; +#else + b[374] = *(a12 + 22); + b[375] = *(a12 + 23); +#endif + b[376] = ZERO; + b[377] = ZERO; + b[378] = ZERO; + b[379] = ZERO; + b[380] = ZERO; + b[381] = ZERO; + b[382] = ZERO; + b[383] = ZERO; + + b[384] = *(a13 + 0); + b[385] = *(a13 + 1); + b[386] = *(a13 + 2); + b[387] = *(a13 + 3); + b[388] = *(a13 + 4); + b[389] = *(a13 + 5); + b[390] = *(a13 + 6); + b[391] = *(a13 + 7); + b[392] = *(a13 + 8); + b[393] = *(a13 + 9); + b[394] = *(a13 + 10); + b[395] = *(a13 + 11); + b[396] = *(a13 + 12); + b[397] = *(a13 + 13); + b[398] = *(a13 + 14); + b[399] = *(a13 + 15); + b[400] = *(a13 + 16); + b[401] = *(a13 + 17); + b[402] = *(a13 + 18); + b[403] = *(a13 + 19); + b[404] = *(a13 + 20); + b[405] = *(a13 + 21); + b[406] = *(a13 + 22); + b[407] = *(a13 + 23); +#ifdef UNIT + b[408] = ONE; + b[409] = ZERO; +#else + b[408] = *(a13 + 24); + b[409] = *(a13 + 25); +#endif + b[410] = ZERO; + b[411] = ZERO; + b[412] = ZERO; + b[413] = ZERO; + b[414] = ZERO; + b[415] = ZERO; + + b[416] = *(a14 + 0); + b[417] = *(a14 + 1); + b[418] = *(a14 + 2); + b[419] = *(a14 + 3); + b[420] = *(a14 + 4); + b[421] = *(a14 + 5); + b[422] = *(a14 + 6); + b[423] = *(a14 + 7); + b[424] = *(a14 + 8); + b[425] = *(a14 + 9); + b[426] = *(a14 + 10); + b[427] = *(a14 + 11); + b[428] = *(a14 + 12); + b[429] = *(a14 + 13); + b[430] = *(a14 + 14); + b[431] = *(a14 + 15); + b[432] = *(a14 + 16); + b[433] = *(a14 + 17); + b[434] = *(a14 + 18); + b[435] = *(a14 + 19); + b[436] = *(a14 + 20); + b[437] = *(a14 + 21); + b[438] = *(a14 + 22); + b[439] = *(a14 + 23); + b[440] = *(a14 + 24); + b[441] = *(a14 + 25); +#ifdef UNIT + b[442] = ONE; + b[443] = ZERO; +#else + b[442] = *(a14 + 26); + b[443] = *(a14 + 27); +#endif + b[444] = ZERO; + b[445] = ZERO; + b[446] = 
ZERO; + b[447] = ZERO; + + b[448] = *(a15 + 0); + b[449] = *(a15 + 1); + b[450] = *(a15 + 2); + b[451] = *(a15 + 3); + b[452] = *(a15 + 4); + b[453] = *(a15 + 5); + b[454] = *(a15 + 6); + b[455] = *(a15 + 7); + b[456] = *(a15 + 8); + b[457] = *(a15 + 9); + b[458] = *(a15 + 10); + b[459] = *(a15 + 11); + b[460] = *(a15 + 12); + b[461] = *(a15 + 13); + b[462] = *(a15 + 14); + b[463] = *(a15 + 15); + b[464] = *(a15 + 16); + b[465] = *(a15 + 17); + b[466] = *(a15 + 18); + b[467] = *(a15 + 19); + b[468] = *(a15 + 20); + b[469] = *(a15 + 21); + b[470] = *(a15 + 22); + b[471] = *(a15 + 23); + b[472] = *(a15 + 24); + b[473] = *(a15 + 25); + b[474] = *(a15 + 26); + b[475] = *(a15 + 27); +#ifdef UNIT + b[476] = ONE; + b[477] = ZERO; +#else + b[476] = *(a15 + 28); + b[477] = *(a15 + 29); +#endif + b[478] = ZERO; + b[479] = ZERO; + + b[480] = *(a16 + 0); + b[481] = *(a16 + 1); + b[482] = *(a16 + 2); + b[483] = *(a16 + 3); + b[484] = *(a16 + 4); + b[485] = *(a16 + 5); + b[486] = *(a16 + 6); + b[487] = *(a16 + 7); + b[488] = *(a16 + 8); + b[489] = *(a16 + 9); + b[490] = *(a16 + 10); + b[491] = *(a16 + 11); + b[492] = *(a16 + 12); + b[493] = *(a16 + 13); + b[494] = *(a16 + 14); + b[495] = *(a16 + 15); + b[496] = *(a16 + 16); + b[497] = *(a16 + 17); + b[498] = *(a16 + 18); + b[499] = *(a16 + 19); + b[500] = *(a16 + 20); + b[501] = *(a16 + 21); + b[502] = *(a16 + 22); + b[503] = *(a16 + 23); + b[504] = *(a16 + 24); + b[505] = *(a16 + 25); + b[506] = *(a16 + 26); + b[507] = *(a16 + 27); + b[508] = *(a16 + 28); + b[509] = *(a16 + 29); +#ifdef UNIT + b[510] = ONE; + b[511] = ZERO; +#else + b[510] = *(a16 + 30); + b[511] = *(a16 + 31); +#endif + + a01 += 16 * lda; + a02 += 16 * lda; + a03 += 16 * lda; + a04 += 16 * lda; + a05 += 16 * lda; + a06 += 16 * lda; + a07 += 16 * lda; + a08 += 16 * lda; + a09 += 16 * lda; + a10 += 16 * lda; + a11 += 16 * lda; + a12 += 16 * lda; + a13 += 16 * lda; + a14 += 16 * lda; + a15 += 16 * lda; + a16 += 16 * lda; + b += 512; + } + + X += 16; + i --; + } 
while (i > 0); + } + + i = (m & 15); + if (i) { + + if (X < posY) { + // a01 += 2 * i; + // a02 += 2 * i; + // a03 += 2 * i; + // a04 += 2 * i; + // a05 += 2 * i; + // a06 += 2 * i; + // a07 += 2 * i; + // a08 += 2 * i; + // a09 += 2 * i; + // a10 += 2 * i; + // a11 += 2 * i; + // a12 += 2 * i; + // a13 += 2 * i; + // a14 += 2 * i; + // a15 += 2 * i; + // a16 += 2 * i; + b += 32 * i; + + } else + if (X > posY) { + for (ii = 0; ii < i; ii++){ + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); + b[ 2] = *(a01 + 2); + b[ 3] = *(a01 + 3); + b[ 4] = *(a01 + 4); + b[ 5] = *(a01 + 5); + b[ 6] = *(a01 + 6); + b[ 7] = *(a01 + 7); + b[ 8] = *(a01 + 8); + b[ 9] = *(a01 + 9); + b[ 10] = *(a01 + 10); + b[ 11] = *(a01 + 11); + b[ 12] = *(a01 + 12); + b[ 13] = *(a01 + 13); + b[ 14] = *(a01 + 14); + b[ 15] = *(a01 + 15); + + b[ 16] = *(a01 + 16); + b[ 17] = *(a01 + 17); + b[ 18] = *(a01 + 18); + b[ 19] = *(a01 + 19); + b[ 20] = *(a01 + 20); + b[ 21] = *(a01 + 21); + b[ 22] = *(a01 + 22); + b[ 23] = *(a01 + 23); + b[ 24] = *(a01 + 24); + b[ 25] = *(a01 + 25); + b[ 26] = *(a01 + 26); + b[ 27] = *(a01 + 27); + b[ 28] = *(a01 + 28); + b[ 29] = *(a01 + 29); + b[ 30] = *(a01 + 30); + b[ 31] = *(a01 + 31); + + a01 += lda; + a02 += lda; + a03 += lda; + a04 += lda; + a05 += lda; + a06 += lda; + a07 += lda; + a08 += lda; + a09 += lda; + a10 += lda; + a11 += lda; + a12 += lda; + a13 += lda; + a14 += lda; + a15 += lda; + a16 += lda; + b += 32; + } + } else { +#ifdef UNIT + b[ 0] = ONE; + b[ 1] = ZERO; +#else + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); +#endif + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b[ 8] = ZERO; + b[ 9] = ZERO; + b[ 10] = ZERO; + b[ 11] = ZERO; + b[ 12] = ZERO; + b[ 13] = ZERO; + b[ 14] = ZERO; + b[ 15] = ZERO; + b[ 16] = ZERO; + b[ 17] = ZERO; + b[ 18] = ZERO; + b[ 19] = ZERO; + b[ 20] = ZERO; + b[ 21] = ZERO; + b[ 22] = ZERO; + b[ 23] = ZERO; + b[ 24] = ZERO; + b[ 25] = ZERO; + b[ 26] = ZERO; + b[ 27] = ZERO; + b[ 28] = 
ZERO; + b[ 29] = ZERO; + b[ 30] = ZERO; + b[ 31] = ZERO; + b += 32; + + if (i >= 2) { + b[ 0] = *(a02 + 0); + b[ 1] = *(a02 + 1); +#ifdef UNIT + b[ 2] = ONE; + b[ 3] = ZERO; +#else + b[ 2] = *(a02 + 2); + b[ 3] = *(a02 + 3); +#endif + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b[ 8] = ZERO; + b[ 9] = ZERO; + b[ 10] = ZERO; + b[ 11] = ZERO; + b[ 12] = ZERO; + b[ 13] = ZERO; + b[ 14] = ZERO; + b[ 15] = ZERO; + b[ 16] = ZERO; + b[ 17] = ZERO; + b[ 18] = ZERO; + b[ 19] = ZERO; + b[ 20] = ZERO; + b[ 21] = ZERO; + b[ 22] = ZERO; + b[ 23] = ZERO; + b[ 24] = ZERO; + b[ 25] = ZERO; + b[ 26] = ZERO; + b[ 27] = ZERO; + b[ 28] = ZERO; + b[ 29] = ZERO; + b[ 30] = ZERO; + b[ 31] = ZERO; + b += 32; + } + + if (i >= 3) { + b[ 0] = *(a03 + 0); + b[ 1] = *(a03 + 1); + b[ 2] = *(a03 + 2); + b[ 3] = *(a03 + 3); +#ifdef UNIT + b[ 4] = ONE; + b[ 5] = ZERO; +#else + b[ 4] = *(a03 + 4); + b[ 5] = *(a03 + 5); +#endif + b[ 6] = ZERO; + b[ 7] = ZERO; + b[ 8] = ZERO; + b[ 9] = ZERO; + b[ 10] = ZERO; + b[ 11] = ZERO; + b[ 12] = ZERO; + b[ 13] = ZERO; + b[ 14] = ZERO; + b[ 15] = ZERO; + b[ 16] = ZERO; + b[ 17] = ZERO; + b[ 18] = ZERO; + b[ 19] = ZERO; + b[ 20] = ZERO; + b[ 21] = ZERO; + b[ 22] = ZERO; + b[ 23] = ZERO; + b[ 24] = ZERO; + b[ 25] = ZERO; + b[ 26] = ZERO; + b[ 27] = ZERO; + b[ 28] = ZERO; + b[ 29] = ZERO; + b[ 30] = ZERO; + b[ 31] = ZERO; + b += 32; + } + + if (i >= 4) { + b[ 0] = *(a04 + 0); + b[ 1] = *(a04 + 1); + b[ 2] = *(a04 + 2); + b[ 3] = *(a04 + 3); + b[ 4] = *(a04 + 4); + b[ 5] = *(a04 + 5); +#ifdef UNIT + b[ 6] = ONE; + b[ 7] = ZERO; +#else + b[ 6] = *(a04 + 6); + b[ 7] = *(a04 + 7); +#endif + b[ 8] = ZERO; + b[ 9] = ZERO; + b[ 10] = ZERO; + b[ 11] = ZERO; + b[ 12] = ZERO; + b[ 13] = ZERO; + b[ 14] = ZERO; + b[ 15] = ZERO; + b[ 16] = ZERO; + b[ 17] = ZERO; + b[ 18] = ZERO; + b[ 19] = ZERO; + b[ 20] = ZERO; + b[ 21] = ZERO; + b[ 22] = ZERO; + b[ 23] = ZERO; + b[ 24] = ZERO; + b[ 25] = ZERO; + b[ 26] = ZERO; + b[ 27] = ZERO; + b[ 28] = ZERO; + b[ 29] = 
ZERO; + b[ 30] = ZERO; + b[ 31] = ZERO; + b += 32; + } + + if (i >= 5) { + b[ 0] = *(a05 + 0); + b[ 1] = *(a05 + 1); + b[ 2] = *(a05 + 2); + b[ 3] = *(a05 + 3); + b[ 4] = *(a05 + 4); + b[ 5] = *(a05 + 5); + b[ 6] = *(a05 + 6); + b[ 7] = *(a05 + 7); +#ifdef UNIT + b[ 8] = ONE; + b[ 9] = ZERO; +#else + b[ 8] = *(a05 + 8); + b[ 9] = *(a05 + 9); +#endif + b[ 10] = ZERO; + b[ 11] = ZERO; + b[ 12] = ZERO; + b[ 13] = ZERO; + b[ 14] = ZERO; + b[ 15] = ZERO; + b[ 16] = ZERO; + b[ 17] = ZERO; + b[ 18] = ZERO; + b[ 19] = ZERO; + b[ 20] = ZERO; + b[ 21] = ZERO; + b[ 22] = ZERO; + b[ 23] = ZERO; + b[ 24] = ZERO; + b[ 25] = ZERO; + b[ 26] = ZERO; + b[ 27] = ZERO; + b[ 28] = ZERO; + b[ 29] = ZERO; + b[ 30] = ZERO; + b[ 31] = ZERO; + b += 32; + } + + if (i >= 6) { + b[ 0] = *(a06 + 0); + b[ 1] = *(a06 + 1); + b[ 2] = *(a06 + 2); + b[ 3] = *(a06 + 3); + b[ 4] = *(a06 + 4); + b[ 5] = *(a06 + 5); + b[ 6] = *(a06 + 6); + b[ 7] = *(a06 + 7); + b[ 8] = *(a06 + 8); + b[ 9] = *(a06 + 9); +#ifdef UNIT + b[10] = ONE; + b[11] = ZERO; +#else + b[10] = *(a06 + 10); + b[11] = *(a06 + 11); +#endif + b[ 12] = ZERO; + b[ 13] = ZERO; + b[ 14] = ZERO; + b[ 15] = ZERO; + b[ 16] = ZERO; + b[ 17] = ZERO; + b[ 18] = ZERO; + b[ 19] = ZERO; + b[ 20] = ZERO; + b[ 21] = ZERO; + b[ 22] = ZERO; + b[ 23] = ZERO; + b[ 24] = ZERO; + b[ 25] = ZERO; + b[ 26] = ZERO; + b[ 27] = ZERO; + b[ 28] = ZERO; + b[ 29] = ZERO; + b[ 30] = ZERO; + b[ 31] = ZERO; + b += 32; + } + + if (i >= 7) { + b[ 0] = *(a07 + 0); + b[ 1] = *(a07 + 1); + b[ 2] = *(a07 + 2); + b[ 3] = *(a07 + 3); + b[ 4] = *(a07 + 4); + b[ 5] = *(a07 + 5); + b[ 6] = *(a07 + 6); + b[ 7] = *(a07 + 7); + b[ 8] = *(a07 + 8); + b[ 9] = *(a07 + 9); + b[10] = *(a07 + 10); + b[11] = *(a07 + 11); +#ifdef UNIT + b[12] = ONE; + b[13] = ZERO; +#else + b[12] = *(a07 + 12); + b[13] = *(a07 + 13); +#endif + b[ 14] = ZERO; + b[ 15] = ZERO; + b[ 16] = ZERO; + b[ 17] = ZERO; + b[ 18] = ZERO; + b[ 19] = ZERO; + b[ 20] = ZERO; + b[ 21] = ZERO; + b[ 22] = ZERO; + b[ 23] = ZERO; + 
b[ 24] = ZERO; + b[ 25] = ZERO; + b[ 26] = ZERO; + b[ 27] = ZERO; + b[ 28] = ZERO; + b[ 29] = ZERO; + b[ 30] = ZERO; + b[ 31] = ZERO; + b += 32; + } + + if (i >= 8) { + b[ 0] = *(a08 + 0); + b[ 1] = *(a08 + 1); + b[ 2] = *(a08 + 2); + b[ 3] = *(a08 + 3); + b[ 4] = *(a08 + 4); + b[ 5] = *(a08 + 5); + b[ 6] = *(a08 + 6); + b[ 7] = *(a08 + 7); + b[ 8] = *(a08 + 8); + b[ 9] = *(a08 + 9); + b[ 10] = *(a08 + 10); + b[ 11] = *(a08 + 11); + b[ 12] = *(a08 + 12); + b[ 13] = *(a08 + 13); +#ifdef UNIT + b[ 14] = ONE; + b[ 15] = ZERO; +#else + b[ 14] = *(a08 + 14); + b[ 15] = *(a08 + 15); +#endif + b[ 16] = ZERO; + b[ 17] = ZERO; + b[ 18] = ZERO; + b[ 19] = ZERO; + b[ 20] = ZERO; + b[ 21] = ZERO; + b[ 22] = ZERO; + b[ 23] = ZERO; + b[ 24] = ZERO; + b[ 25] = ZERO; + b[ 26] = ZERO; + b[ 27] = ZERO; + b[ 28] = ZERO; + b[ 29] = ZERO; + b[ 30] = ZERO; + b[ 31] = ZERO; + b += 32; + } + + if (i >= 9) { + b[ 0] = *(a09 + 0); + b[ 1] = *(a09 + 1); + b[ 2] = *(a09 + 2); + b[ 3] = *(a09 + 3); + b[ 4] = *(a09 + 4); + b[ 5] = *(a09 + 5); + b[ 6] = *(a09 + 6); + b[ 7] = *(a09 + 7); + b[ 8] = *(a09 + 8); + b[ 9] = *(a09 + 9); + b[ 10] = *(a09 + 10); + b[ 11] = *(a09 + 11); + b[ 12] = *(a09 + 12); + b[ 13] = *(a09 + 13); + b[ 14] = *(a09 + 14); + b[ 15] = *(a09 + 15); +#ifdef UNIT + b[ 16] = ONE; + b[ 17] = ZERO; +#else + b[ 16] = *(a09 + 16); + b[ 17] = *(a09 + 17); +#endif + b[ 18] = ZERO; + b[ 19] = ZERO; + b[ 20] = ZERO; + b[ 21] = ZERO; + b[ 22] = ZERO; + b[ 23] = ZERO; + b[ 24] = ZERO; + b[ 25] = ZERO; + b[ 26] = ZERO; + b[ 27] = ZERO; + b[ 28] = ZERO; + b[ 29] = ZERO; + b[ 30] = ZERO; + b[ 31] = ZERO; + b += 32; + } + + if (i >= 10) { + b[ 0] = *(a10 + 0); + b[ 1] = *(a10 + 1); + b[ 2] = *(a10 + 2); + b[ 3] = *(a10 + 3); + b[ 4] = *(a10 + 4); + b[ 5] = *(a10 + 5); + b[ 6] = *(a10 + 6); + b[ 7] = *(a10 + 7); + b[ 8] = *(a10 + 8); + b[ 9] = *(a10 + 9); + b[ 10] = *(a10 + 10); + b[ 11] = *(a10 + 11); + b[ 12] = *(a10 + 12); + b[ 13] = *(a10 + 13); + b[ 14] = *(a10 + 14); + b[ 15] = *(a10 
+ 15); + b[ 16] = *(a10 + 16); + b[ 17] = *(a10 + 17); +#ifdef UNIT + b[ 18] = ONE; + b[ 19] = ZERO; +#else + b[ 18] = *(a10 + 18); + b[ 19] = *(a10 + 19); +#endif + b[ 20] = ZERO; + b[ 21] = ZERO; + b[ 22] = ZERO; + b[ 23] = ZERO; + b[ 24] = ZERO; + b[ 25] = ZERO; + b[ 26] = ZERO; + b[ 27] = ZERO; + b[ 28] = ZERO; + b[ 29] = ZERO; + b[ 30] = ZERO; + b[ 31] = ZERO; + b += 32; + } + + if (i >= 11) { + b[ 0] = *(a11 + 0); + b[ 1] = *(a11 + 1); + b[ 2] = *(a11 + 2); + b[ 3] = *(a11 + 3); + b[ 4] = *(a11 + 4); + b[ 5] = *(a11 + 5); + b[ 6] = *(a11 + 6); + b[ 7] = *(a11 + 7); + b[ 8] = *(a11 + 8); + b[ 9] = *(a11 + 9); + b[ 10] = *(a11 + 10); + b[ 11] = *(a11 + 11); + b[ 12] = *(a11 + 12); + b[ 13] = *(a11 + 13); + b[ 14] = *(a11 + 14); + b[ 15] = *(a11 + 15); + b[ 16] = *(a11 + 16); + b[ 17] = *(a11 + 17); + b[ 18] = *(a11 + 18); + b[ 19] = *(a11 + 19); +#ifdef UNIT + b[ 20] = ONE; + b[ 21] = ZERO; +#else + b[ 20] = *(a11 + 20); + b[ 21] = *(a11 + 21); +#endif + b[ 22] = ZERO; + b[ 23] = ZERO; + b[ 24] = ZERO; + b[ 25] = ZERO; + b[ 26] = ZERO; + b[ 27] = ZERO; + b[ 28] = ZERO; + b[ 29] = ZERO; + b[ 30] = ZERO; + b[ 31] = ZERO; + b += 32; + } + + if (i >= 12) { + b[ 0] = *(a12 + 0); + b[ 1] = *(a12 + 1); + b[ 2] = *(a12 + 2); + b[ 3] = *(a12 + 3); + b[ 4] = *(a12 + 4); + b[ 5] = *(a12 + 5); + b[ 6] = *(a12 + 6); + b[ 7] = *(a12 + 7); + b[ 8] = *(a12 + 8); + b[ 9] = *(a12 + 9); + b[ 10] = *(a12 + 10); + b[ 11] = *(a12 + 11); + b[ 12] = *(a12 + 12); + b[ 13] = *(a12 + 13); + b[ 14] = *(a12 + 14); + b[ 15] = *(a12 + 15); + b[ 16] = *(a12 + 16); + b[ 17] = *(a12 + 17); + b[ 18] = *(a12 + 18); + b[ 19] = *(a12 + 19); + b[ 20] = *(a12 + 20); + b[ 21] = *(a12 + 21); +#ifdef UNIT + b[ 22] = ONE; + b[ 23] = ZERO; +#else + b[ 22] = *(a12 + 22); + b[ 23] = *(a12 + 23); +#endif + b[ 24] = ZERO; + b[ 25] = ZERO; + b[ 26] = ZERO; + b[ 27] = ZERO; + b[ 28] = ZERO; + b[ 29] = ZERO; + b[ 30] = ZERO; + b[ 31] = ZERO; + b += 32; + } + + if (i >= 13) { + b[ 0] = *(a13 + 0); + b[ 1] = *(a13 
+ 1); + b[ 2] = *(a13 + 2); + b[ 3] = *(a13 + 3); + b[ 4] = *(a13 + 4); + b[ 5] = *(a13 + 5); + b[ 6] = *(a13 + 6); + b[ 7] = *(a13 + 7); + b[ 8] = *(a13 + 8); + b[ 9] = *(a13 + 9); + b[ 10] = *(a13 + 10); + b[ 11] = *(a13 + 11); + b[ 12] = *(a13 + 12); + b[ 13] = *(a13 + 13); + b[ 14] = *(a13 + 14); + b[ 15] = *(a13 + 15); + b[ 16] = *(a13 + 16); + b[ 17] = *(a13 + 17); + b[ 18] = *(a13 + 18); + b[ 19] = *(a13 + 19); + b[ 20] = *(a13 + 20); + b[ 21] = *(a13 + 21); + b[ 22] = *(a13 + 22); + b[ 23] = *(a13 + 23); +#ifdef UNIT + b[ 24] = ONE; + b[ 25] = ZERO; +#else + b[ 24] = *(a13 + 24); + b[ 25] = *(a13 + 25); +#endif + b[ 26] = ZERO; + b[ 27] = ZERO; + b[ 28] = ZERO; + b[ 29] = ZERO; + b[ 30] = ZERO; + b[ 31] = ZERO; + b += 32; + } + + if (i >= 14) { + b[ 0] = *(a14 + 0); + b[ 1] = *(a14 + 1); + b[ 2] = *(a14 + 2); + b[ 3] = *(a14 + 3); + b[ 4] = *(a14 + 4); + b[ 5] = *(a14 + 5); + b[ 6] = *(a14 + 6); + b[ 7] = *(a14 + 7); + b[ 8] = *(a14 + 8); + b[ 9] = *(a14 + 9); + b[ 10] = *(a14 + 10); + b[ 11] = *(a14 + 11); + b[ 12] = *(a14 + 12); + b[ 13] = *(a14 + 13); + b[ 14] = *(a14 + 14); + b[ 15] = *(a14 + 15); + b[ 16] = *(a14 + 16); + b[ 17] = *(a14 + 17); + b[ 18] = *(a14 + 18); + b[ 19] = *(a14 + 19); + b[ 20] = *(a14 + 20); + b[ 21] = *(a14 + 21); + b[ 22] = *(a14 + 22); + b[ 23] = *(a14 + 23); + b[ 24] = *(a14 + 24); + b[ 25] = *(a14 + 25); +#ifdef UNIT + b[ 26] = ONE; + b[ 27] = ZERO; +#else + b[ 26] = *(a14 + 26); + b[ 27] = *(a14 + 27); +#endif + b[ 28] = ZERO; + b[ 29] = ZERO; + b[ 30] = ZERO; + b[ 31] = ZERO; + b += 32; + } + + if (i >= 15) { + b[ 0] = *(a15 + 0); + b[ 1] = *(a15 + 1); + b[ 2] = *(a15 + 2); + b[ 3] = *(a15 + 3); + b[ 4] = *(a15 + 4); + b[ 5] = *(a15 + 5); + b[ 6] = *(a15 + 6); + b[ 7] = *(a15 + 7); + b[ 8] = *(a15 + 8); + b[ 9] = *(a15 + 9); + b[ 10] = *(a15 + 10); + b[ 11] = *(a15 + 11); + b[ 12] = *(a15 + 12); + b[ 13] = *(a15 + 13); + b[ 14] = *(a15 + 14); + b[ 15] = *(a15 + 15); + b[ 16] = *(a15 + 16); + b[ 17] = *(a15 + 17); + b[ 18] 
= *(a15 + 18); + b[ 19] = *(a15 + 19); + b[ 20] = *(a15 + 20); + b[ 21] = *(a15 + 21); + b[ 22] = *(a15 + 22); + b[ 23] = *(a15 + 23); + b[ 24] = *(a15 + 24); + b[ 25] = *(a15 + 25); + b[ 26] = *(a15 + 26); + b[ 27] = *(a15 + 27); +#ifdef UNIT + b[ 28] = ONE; + b[ 29] = ZERO; +#else + b[ 28] = *(a15 + 28); + b[ 29] = *(a15 + 29); +#endif + b[ 30] = ZERO; + b[ 31] = ZERO; + b += 32; + } + } + } + + posY += 16; + js --; + } while (js > 0); + } /* End of main loop */ + + + if (n & 8){ + X = posX; + + if (posX <= posY) { + a01 = a + posX * 2 + (posY + 0) * lda; + a02 = a + posX * 2 + (posY + 1) * lda; + a03 = a + posX * 2 + (posY + 2) * lda; + a04 = a + posX * 2 + (posY + 3) * lda; + a05 = a + posX * 2 + (posY + 4) * lda; + a06 = a + posX * 2 + (posY + 5) * lda; + a07 = a + posX * 2 + (posY + 6) * lda; + a08 = a + posX * 2 + (posY + 7) * lda; + } else { + a01 = a + posY * 2 + (posX + 0) * lda; + a02 = a + posY * 2 + (posX + 1) * lda; + a03 = a + posY * 2 + (posX + 2) * lda; + a04 = a + posY * 2 + (posX + 3) * lda; + a05 = a + posY * 2 + (posX + 4) * lda; + a06 = a + posY * 2 + (posX + 5) * lda; + a07 = a + posY * 2 + (posX + 6) * lda; + a08 = a + posY * 2 + (posX + 7) * lda; + } + + i = (m >> 3); + if (i > 0) { + do { + if (X < posY) { + a01 += 16; + a02 += 16; + a03 += 16; + a04 += 16; + a05 += 16; + a06 += 16; + a07 += 16; + a08 += 16; + b += 128; + } else + if (X > posY) { + for (ii = 0; ii < 8; ii++){ + + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); + b[ 2] = *(a01 + 2); + b[ 3] = *(a01 + 3); + b[ 4] = *(a01 + 4); + b[ 5] = *(a01 + 5); + b[ 6] = *(a01 + 6); + b[ 7] = *(a01 + 7); + + b[ 8] = *(a01 + 8); + b[ 9] = *(a01 + 9); + b[ 10] = *(a01 + 10); + b[ 11] = *(a01 + 11); + b[ 12] = *(a01 + 12); + b[ 13] = *(a01 + 13); + b[ 14] = *(a01 + 14); + b[ 15] = *(a01 + 15); + + a01 += lda; + b += 16; + } + a02 += 8 * lda; + a03 += 8 * lda; + a04 += 8 * lda; + a05 += 8 * lda; + a06 += 8 * lda; + a07 += 8 * lda; + a08 += 8 * lda; + + } else { +#ifdef UNIT + b[ 0] = ONE; + b[ 1] 
= ZERO; +#else + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); +#endif + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + + b[ 8] = ZERO; + b[ 9] = ZERO; + b[ 10] = ZERO; + b[ 11] = ZERO; + b[ 12] = ZERO; + b[ 13] = ZERO; + b[ 14] = ZERO; + b[ 15] = ZERO; + + b[ 16] = *(a02 + 0); + b[ 17] = *(a02 + 1); +#ifdef UNIT + b[ 18] = ONE; + b[ 19] = ZERO; +#else + b[ 18] = *(a02 + 2); + b[ 19] = *(a02 + 3); +#endif + b[ 20] = ZERO; + b[ 21] = ZERO; + b[ 22] = ZERO; + b[ 23] = ZERO; + b[ 24] = ZERO; + b[ 25] = ZERO; + b[ 26] = ZERO; + b[ 27] = ZERO; + b[ 28] = ZERO; + b[ 29] = ZERO; + b[ 30] = ZERO; + b[ 31] = ZERO; + + b[ 32] = *(a03 + 0); + b[ 33] = *(a03 + 1); + b[ 34] = *(a03 + 2); + b[ 35] = *(a03 + 3); +#ifdef UNIT + b[ 36] = ONE; + b[ 37] = ZERO; +#else + b[ 36] = *(a03 + 4); + b[ 37] = *(a03 + 5); +#endif + b[ 38] = ZERO; + b[ 39] = ZERO; + b[ 40] = ZERO; + b[ 41] = ZERO; + b[ 42] = ZERO; + b[ 43] = ZERO; + b[ 44] = ZERO; + b[ 45] = ZERO; + b[ 46] = ZERO; + b[ 47] = ZERO; + + b[ 48] = *(a04 + 0); + b[ 49] = *(a04 + 1); + b[ 50] = *(a04 + 2); + b[ 51] = *(a04 + 3); + b[ 52] = *(a04 + 4); + b[ 53] = *(a04 + 5); +#ifdef UNIT + b[ 54] = ONE; + b[ 55] = ZERO; +#else + b[ 54] = *(a04 + 6); + b[ 55] = *(a04 + 7); +#endif + b[ 56] = ZERO; + b[ 57] = ZERO; + b[ 58] = ZERO; + b[ 59] = ZERO; + b[ 60] = ZERO; + b[ 61] = ZERO; + b[ 62] = ZERO; + b[ 63] = ZERO; + + b[ 64] = *(a05 + 0); + b[ 65] = *(a05 + 1); + b[ 66] = *(a05 + 2); + b[ 67] = *(a05 + 3); + b[ 68] = *(a05 + 4); + b[ 69] = *(a05 + 5); + b[ 70] = *(a05 + 6); + b[ 71] = *(a05 + 7); +#ifdef UNIT + b[ 72] = ONE; + b[ 73] = ZERO; +#else + b[ 72] = *(a05 + 8); + b[ 73] = *(a05 + 9); +#endif + b[ 74] = ZERO; + b[ 75] = ZERO; + b[ 76] = ZERO; + b[ 77] = ZERO; + b[ 78] = ZERO; + b[ 79] = ZERO; + + b[ 80] = *(a06 + 0); + b[ 81] = *(a06 + 1); + b[ 82] = *(a06 + 2); + b[ 83] = *(a06 + 3); + b[ 84] = *(a06 + 4); + b[ 85] = *(a06 + 5); + b[ 86] = *(a06 + 6); + b[ 87] = *(a06 + 7); + b[ 
88] = *(a06 + 8); + b[ 89] = *(a06 + 9); +#ifdef UNIT + b[ 90] = ONE; + b[ 91] = ZERO; +#else + b[ 90] = *(a06 + 10); + b[ 91] = *(a06 + 11); +#endif + b[ 92] = ZERO; + b[ 93] = ZERO; + b[ 94] = ZERO; + b[ 95] = ZERO; + + b[ 96] = *(a07 + 0); + b[ 97] = *(a07 + 1); + b[ 98] = *(a07 + 2); + b[ 99] = *(a07 + 3); + b[100] = *(a07 + 4); + b[101] = *(a07 + 5); + b[102] = *(a07 + 6); + b[103] = *(a07 + 7); + b[104] = *(a07 + 8); + b[105] = *(a07 + 9); + b[106] = *(a07 + 10); + b[107] = *(a07 + 11); +#ifdef UNIT + b[108] = ONE; + b[109] = ZERO; +#else + b[108] = *(a07 + 12); + b[109] = *(a07 + 13); +#endif + b[110] = ZERO; + b[111] = ZERO; + + b[112] = *(a08 + 0); + b[113] = *(a08 + 1); + b[114] = *(a08 + 2); + b[115] = *(a08 + 3); + b[116] = *(a08 + 4); + b[117] = *(a08 + 5); + b[118] = *(a08 + 6); + b[119] = *(a08 + 7); + b[120] = *(a08 + 8); + b[121] = *(a08 + 9); + b[122] = *(a08 + 10); + b[123] = *(a08 + 11); + b[124] = *(a08 + 12); + b[125] = *(a08 + 13); +#ifdef UNIT + b[126] = ONE; + b[127] = ZERO; +#else + b[126] = *(a08 + 14); + b[127] = *(a08 + 15); +#endif + + a01 += 8 * lda; + a02 += 8 * lda; + a03 += 8 * lda; + a04 += 8 * lda; + a05 += 8 * lda; + a06 += 8 * lda; + a07 += 8 * lda; + a08 += 8 * lda; + b += 128; + } + + X += 8; + i --; + } while (i > 0); + } + + i = (m & 7); + if (i) { + + if (X < posY) { + /* a01 += 2 * i; + a02 += 2 * i; + a03 += 2 * i; + a04 += 2 * i; + a05 += 2 * i; + a06 += 2 * i; + a07 += 2 * i; + a08 += 2 * i; */ + b += 16 * i; + } else + if (X > posY) { + for (ii = 0; ii < i; ii++){ + + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); + b[ 2] = *(a01 + 2); + b[ 3] = *(a01 + 3); + b[ 4] = *(a01 + 4); + b[ 5] = *(a01 + 5); + b[ 6] = *(a01 + 6); + b[ 7] = *(a01 + 7); + + b[ 8] = *(a01 + 8); + b[ 9] = *(a01 + 9); + b[ 10] = *(a01 + 10); + b[ 11] = *(a01 + 11); + b[ 12] = *(a01 + 12); + b[ 13] = *(a01 + 13); + b[ 14] = *(a01 + 14); + b[ 15] = *(a01 + 15); + + a01 += lda; + a02 += lda; + a03 += lda; + a04 += lda; + a05 += lda; + a06 += lda; + a07 += 
lda; + a08 += lda; + b += 16; + } + } else { +#ifdef UNIT + b[ 0] = ONE; + b[ 1] = ZERO; +#else + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); +#endif + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b[ 8] = ZERO; + b[ 9] = ZERO; + b[10] = ZERO; + b[11] = ZERO; + b[12] = ZERO; + b[13] = ZERO; + b[14] = ZERO; + b[15] = ZERO; + b += 16; + + if(i >= 2) { + b[ 0] = *(a02 + 0); + b[ 1] = *(a02 + 1); +#ifdef UNIT + b[ 2] = ONE; + b[ 3] = ZERO; +#else + b[ 2] = *(a02 + 2); + b[ 3] = *(a02 + 3); +#endif + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b[ 8] = ZERO; + b[ 9] = ZERO; + b[10] = ZERO; + b[11] = ZERO; + b[12] = ZERO; + b[13] = ZERO; + b[14] = ZERO; + b[15] = ZERO; + b += 16; + } + + if (i >= 3) { + b[ 0] = *(a03 + 0); + b[ 1] = *(a03 + 1); + b[ 2] = *(a03 + 2); + b[ 3] = *(a03 + 3); +#ifdef UNIT + b[ 4] = ONE; + b[ 5] = ZERO; +#else + b[ 4] = *(a03 + 4); + b[ 5] = *(a03 + 5); +#endif + b[ 6] = ZERO; + b[ 7] = ZERO; + b[ 8] = ZERO; + b[ 9] = ZERO; + b[10] = ZERO; + b[11] = ZERO; + b[12] = ZERO; + b[13] = ZERO; + b[14] = ZERO; + b[15] = ZERO; + b += 16; + } + + if (i >= 4) { + b[ 0] = *(a04 + 0); + b[ 1] = *(a04 + 1); + b[ 2] = *(a04 + 2); + b[ 3] = *(a04 + 3); + b[ 4] = *(a04 + 4); + b[ 5] = *(a04 + 5); +#ifdef UNIT + b[ 6] = ONE; + b[ 7] = ZERO; +#else + b[ 6] = *(a04 + 6); + b[ 7] = *(a04 + 7); +#endif + b[ 8] = ZERO; + b[ 9] = ZERO; + b[10] = ZERO; + b[11] = ZERO; + b[12] = ZERO; + b[13] = ZERO; + b[14] = ZERO; + b[15] = ZERO; + b += 16; + } + + if (i >= 5) { + b[ 0] = *(a05 + 0); + b[ 1] = *(a05 + 1); + b[ 2] = *(a05 + 2); + b[ 3] = *(a05 + 3); + b[ 4] = *(a05 + 4); + b[ 5] = *(a05 + 5); + b[ 6] = *(a05 + 6); + b[ 7] = *(a05 + 7); +#ifdef UNIT + b[ 8] = ONE; + b[ 9] = ZERO; +#else + b[ 8] = *(a05 + 8); + b[ 9] = *(a05 + 9); +#endif + b[10] = ZERO; + b[11] = ZERO; + b[12] = ZERO; + b[13] = ZERO; + b[14] = ZERO; + b[15] = ZERO; + b += 16; + } + + if (i >= 6) { + b[ 0] = *(a06 + 0); + b[ 1] = *(a06 + 
1); + b[ 2] = *(a06 + 2); + b[ 3] = *(a06 + 3); + b[ 4] = *(a06 + 4); + b[ 5] = *(a06 + 5); + b[ 6] = *(a06 + 6); + b[ 7] = *(a06 + 7); + b[ 8] = *(a06 + 8); + b[ 9] = *(a06 + 9); +#ifdef UNIT + b[10] = ONE; + b[11] = ZERO; +#else + b[10] = *(a06 + 10); + b[11] = *(a06 + 11); +#endif + b[12] = ZERO; + b[13] = ZERO; + b[14] = ZERO; + b[15] = ZERO; + b += 16; + } + + if (i >= 7) { + b[ 0] = *(a07 + 0); + b[ 1] = *(a07 + 1); + b[ 2] = *(a07 + 2); + b[ 3] = *(a07 + 3); + b[ 4] = *(a07 + 4); + b[ 5] = *(a07 + 5); + b[ 6] = *(a07 + 6); + b[ 7] = *(a07 + 7); + b[ 8] = *(a07 + 8); + b[ 9] = *(a07 + 9); + b[10] = *(a07 + 10); + b[11] = *(a07 + 11); +#ifdef UNIT + b[12] = ONE; + b[13] = ZERO; +#else + b[12] = *(a07 + 12); + b[13] = *(a07 + 13); +#endif + b[14] = ZERO; + b[15] = ZERO; + b += 16; + } + } + } + + posY += 8; + } + + + if (n & 4){ + X = posX; + + if (posX <= posY) { + a01 = a + posX * 2 + (posY + 0) * lda; + a02 = a + posX * 2 + (posY + 1) * lda; + a03 = a + posX * 2 + (posY + 2) * lda; + a04 = a + posX * 2 + (posY + 3) * lda; + } else { + a01 = a + posY * 2 + (posX + 0) * lda; + a02 = a + posY * 2 + (posX + 1) * lda; + a03 = a + posY * 2 + (posX + 2) * lda; + a04 = a + posY * 2 + (posX + 3) * lda; + } + + i = (m >> 2); + if (i > 0) { + do { + if (X < posY) { + a01 += 8; + a02 += 8; + a03 += 8; + a04 += 8; + b += 32; + } else + if (X > posY) { + + for (ii = 0; ii < 4; ii++){ + + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); + b[ 2] = *(a01 + 2); + b[ 3] = *(a01 + 3); + b[ 4] = *(a01 + 4); + b[ 5] = *(a01 + 5); + b[ 6] = *(a01 + 6); + b[ 7] = *(a01 + 7); + + a01 += lda; + b += 8; + } + + a02 += 4 * lda; + a03 += 4 * lda; + a04 += 4 * lda; + + } else { +#ifdef UNIT + b[ 0] = ONE; + b[ 1] = ZERO; +#else + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); +#endif + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + + b[ 8] = *(a02 + 0); + b[ 9] = *(a02 + 1); +#ifdef UNIT + b[ 10] = ONE; + b[ 11] = ZERO; +#else + b[ 10] = *(a02 + 2); + 
b[ 11] = *(a02 + 3); +#endif + b[ 12] = ZERO; + b[ 13] = ZERO; + b[ 14] = ZERO; + b[ 15] = ZERO; + + b[ 16] = *(a03 + 0); + b[ 17] = *(a03 + 1); + b[ 18] = *(a03 + 2); + b[ 19] = *(a03 + 3); +#ifdef UNIT + b[ 20] = ONE; + b[ 21] = ZERO; +#else + b[ 20] = *(a03 + 4); + b[ 21] = *(a03 + 5); +#endif + b[ 22] = ZERO; + b[ 23] = ZERO; + + b[ 24] = *(a04 + 0); + b[ 25] = *(a04 + 1); + b[ 26] = *(a04 + 2); + b[ 27] = *(a04 + 3); + b[ 28] = *(a04 + 4); + b[ 29] = *(a04 + 5); +#ifdef UNIT + b[ 30] = ONE; + b[ 31] = ZERO; +#else + b[ 30] = *(a04 + 6); + b[ 31] = *(a04 + 7); +#endif + + a01 += 4 * lda; + a02 += 4 * lda; + a03 += 4 * lda; + a04 += 4 * lda; + b += 32; + } + + X += 4; + i --; + } while (i > 0); + } + + i = (m & 3); + if (i) { + + if (X < posY) { + /* a01 += 2 * i; + a02 += 2 * i; + a03 += 2 * i; + a04 += 2 * i; */ + b += 8 * i; + } else + if (X > posY) { + + for (ii = 0; ii < i; ii++){ + + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); + b[ 2] = *(a01 + 2); + b[ 3] = *(a01 + 3); + b[ 4] = *(a01 + 4); + b[ 5] = *(a01 + 5); + b[ 6] = *(a01 + 6); + b[ 7] = *(a01 + 7); + + a01 += lda; + a02 += lda; + a03 += lda; + a04 += lda; + b += 8; + } + } else { + +#ifdef UNIT + b[ 0] = ONE; + b[ 1] = ZERO; +#else + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); +#endif + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b += 8; + + if(i >= 2) { + b[ 0] = *(a02 + 0); + b[ 1] = *(a02 + 1); +#ifdef UNIT + b[ 2] = ONE; + b[ 3] = ZERO; +#else + b[ 2] = *(a02 + 2); + b[ 3] = *(a02 + 3); +#endif + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b += 8; + } + + if (i >= 3) { + b[ 0] = *(a03 + 0); + b[ 1] = *(a03 + 1); + b[ 2] = *(a03 + 2); + b[ 3] = *(a03 + 3); +#ifdef UNIT + b[ 4] = ONE; + b[ 5] = ZERO; +#else + b[ 4] = *(a03 + 4); + b[ 5] = *(a03 + 5); +#endif + b[ 6] = ZERO; + b[ 7] = ZERO; + b += 8; + } + } + } + + posY += 4; + } + + + if (n & 2){ + X = posX; + + if (posX <= posY) { + a01 = a + posX * 2 + (posY + 0) * lda; + a02 
= a + posX * 2 + (posY + 1) * lda; + } else { + a01 = a + posY * 2 + (posX + 0) * lda; + a02 = a + posY * 2 + (posX + 1) * lda; + } + + i = (m >> 1); + if (i > 0) { + do { + if (X < posY) { + a01 += 4; + a02 += 4; + b += 8; + } else + if (X > posY) { + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); + b[ 2] = *(a01 + 2); + b[ 3] = *(a01 + 3); + b[ 4] = *(a02 + 0); + b[ 5] = *(a02 + 1); + b[ 6] = *(a02 + 2); + b[ 7] = *(a02 + 3); + + a01 += 2 * lda; + a02 += 2 * lda; + b += 8; + } else { + +#ifdef UNIT + b[ 0] = ONE; + b[ 1] = ZERO; +#else + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); +#endif + b[ 2] = ZERO; + b[ 3] = ZERO; + + b[ 4] = *(a02 + 0); + b[ 5] = *(a02 + 1); +#ifdef UNIT + b[ 6] = ONE; + b[ 7] = ZERO; +#else + b[ 6] = *(a02 + 2); + b[ 7] = *(a02 + 3); +#endif + + a01 += 2 * lda; + a02 += 2 * lda; + b += 8; + } + + X += 2; + i --; + } while (i > 0); + } + + i = (m & 1); + if (i) { + + if (X < posY) { + b += 4; + } else + if (X > posY) { + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); + b[ 2] = *(a01 + 2); + b[ 3] = *(a01 + 3); + b += 4; + } +#if 1 + } +#else + } else { +#ifdef UNIT + b[ 0] = ONE; + b[ 1] = ZERO; +#else + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); +#endif + b[ 2] = *(a02 + 0); + b[ 3] = *(a02 + 1); + b += 4; + } +#endif + posY += 2; + } + + if (n & 1){ + X = posX; + + if (posX <= posY) { + a01 = a + posX * 2 + (posY + 0) * lda; + } else { + a01 = a + posY * 2 + (posX + 0) * lda; + } + + i = m; + if (m > 0) { + do { + if (X < posY) { + a01 += 2; + } else { +#ifdef UNIT + if (X > posY) { +#endif + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); +#ifdef UNIT + } else { + b[ 0] = ONE; + b[ 1] = ZERO; + } +#endif + a01 += lda; + } + b += 2; + X ++; + i --; + } while (i > 0); + } + } + + return 0; +} diff --git a/kernel/generic/ztrsm_lncopy_16.c b/kernel/generic/ztrsm_lncopy_16.c new file mode 100644 index 000000000..4fd72c13e --- /dev/null +++ b/kernel/generic/ztrsm_lncopy_16.c @@ -0,0 +1,308 @@ 
+/******************************************************************************* +Copyright (c) 2024, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*******************************************************************************/ + +#include +#include "common.h" + /* ztrsm_lncopy_16: packs m index steps of `a` into `b`, working through panels of 16, then 8/4/2/1, lda-strided vectors. Every element is a pair of FLOATs (presumably real and imaginary parts - confirm). Inside the diagonal block of a panel the entries before position (ii - jj) are copied, the entry at (ii - jj) is handed to compinv(), and later entries are left unwritten. compinv() is defined outside this file (presumably stores the reciprocal of the diagonal entry, or a unit value - confirm). Always returns 0. */ +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ + + BLASLONG i, ii, j, jj, k; + + FLOAT *a1, *a2, *a3, *a4, *a5, *a6, *a7, *a8; + FLOAT *a9, *a10, *a11, *a12, *a13, *a14, *a15, *a16; + + FLOAT data1, data2; + + lda *= 2; /* elements are FLOAT pairs; presumably converts lda from element units to FLOAT units */ + jj = offset; /* jj is the value of ii at which the current panel's diagonal block starts */ + + j = (n >> 4); /* number of full 16-wide panels */ + while (j > 0){ + + a1 = a + 0 * lda; + a2 = a + 1 * lda; + a3 = a + 2 * lda; + a4 = a + 3 * lda; + a5 = a + 4 * lda; + a6 = a + 5 * lda; + a7 = a + 6 * lda; + a8 = a + 7 * lda; + a9 = a + 8 * lda; + a10 = a + 9 * lda; + a11 = a + 10 * lda; + a12 = a + 11 * lda; + a13 = a + 12 * lda; + a14 = a + 13 * lda; + a15 = a + 14 * lda; + a16 = a + 15 * lda; + + a += 16 * lda; + ii = 0; + + for (i = 0; i < m; i++) { + + if ((ii >= jj ) && (ii - jj < 16)) { /* diagonal block: copy the (ii - jj) leading entries, then compinv() the next one */ + for (k = 0; k < ii - jj; k ++) { + *(b + k * 2 + 0) = *(a1 + k * lda + 0); + *(b + k * 2 + 1) = *(a1 + k * lda + 1); + } + data1 = *(a1 + (ii - jj) * lda + 0); + data2 = *(a1 + (ii - jj) * lda + 1); + + compinv(b + (ii - jj) * 2, data1, data2); + } + + if (ii - jj >= 16) { /* past the diagonal block: copy one entry from each of the 16 vectors */ + *(b + 0) = *(a1 + 0); + *(b + 1) = *(a1 + 1); + *(b + 2) = *(a2 + 0); + *(b + 3) = *(a2 + 1); + *(b + 4) = *(a3 + 0); + *(b + 5) = *(a3 + 1); + *(b + 6) = *(a4 + 0); + *(b + 7) = *(a4 + 1); + *(b + 8) = *(a5 + 0); + *(b + 9) = *(a5 + 1); + *(b + 10) = *(a6 + 0); + *(b + 11) = *(a6 + 1); + *(b + 12) = *(a7 + 0); + *(b + 13) = *(a7 + 1); + *(b + 14) = *(a8 + 0); + *(b + 15) = *(a8 + 1); + *(b + 16) = *(a9 + 0); + *(b + 17) = *(a9 + 1); + *(b + 18) = *(a10 + 0); + *(b + 19) = *(a10 + 1); + *(b + 20) = *(a11 + 0); + *(b + 21) = *(a11 + 1); + *(b + 22) = *(a12 + 0); + *(b + 23) = *(a12 + 1); + *(b + 24) = *(a13 + 0); + *(b + 25) = *(a13 + 1); + *(b + 26) = *(a14 + 0); + *(b + 27) = *(a14 + 1); + *(b + 28) = *(a15 + 0); + *(b + 29) = *(a15 + 1); + *(b + 30) = *(a16 + 0); + *(b + 31) = *(a16 + 1); + } + + a1 += 2; + a2 += 2; + a3 += 2; + a4 += 2; + a5 +=
2; + a6 += 2; + a7 += 2; + a8 += 2; + a9 += 2; + a10 += 2; + a11 += 2; + a12 += 2; + a13 += 2; + a14 += 2; + a15 += 2; + a16 += 2; + b += 32; /* b advances by a full 16-element slot even when nothing was written (ii < jj) */ + ii ++; + } + + jj += 16; /* the next panel's diagonal block starts 16 steps later */ + j --; + } + + if (n & 8) { /* same scheme for a leftover panel of 8 vectors */ + a1 = a + 0 * lda; + a2 = a + 1 * lda; + a3 = a + 2 * lda; + a4 = a + 3 * lda; + a5 = a + 4 * lda; + a6 = a + 5 * lda; + a7 = a + 6 * lda; + a8 = a + 7 * lda; + + a += 8 * lda; + ii = 0; + + for (i = 0; i < m; i++) { + + if ((ii >= jj ) && (ii - jj < 8)) { + for (k = 0; k < ii - jj; k ++) { + *(b + k * 2 + 0) = *(a1 + k * lda + 0); + *(b + k * 2 + 1) = *(a1 + k * lda + 1); + } + data1 = *(a1 + (ii - jj) * lda + 0); + data2 = *(a1 + (ii - jj) * lda + 1); + + compinv(b + (ii - jj) * 2, data1, data2); + } + + if (ii - jj >= 8) { + *(b + 0) = *(a1 + 0); + *(b + 1) = *(a1 + 1); + *(b + 2) = *(a2 + 0); + *(b + 3) = *(a2 + 1); + *(b + 4) = *(a3 + 0); + *(b + 5) = *(a3 + 1); + *(b + 6) = *(a4 + 0); + *(b + 7) = *(a4 + 1); + *(b + 8) = *(a5 + 0); + *(b + 9) = *(a5 + 1); + *(b + 10) = *(a6 + 0); + *(b + 11) = *(a6 + 1); + *(b + 12) = *(a7 + 0); + *(b + 13) = *(a7 + 1); + *(b + 14) = *(a8 + 0); + *(b + 15) = *(a8 + 1); + } + + a1 += 2; + a2 += 2; + a3 += 2; + a4 += 2; + a5 += 2; + a6 += 2; + a7 += 2; + a8 += 2; + b += 16; + ii ++; + } + + jj += 8; + } + + if (n & 4) { /* leftover panel of 4 vectors */ + + a1 = a + 0 * lda; + a2 = a + 1 * lda; + a3 = a + 2 * lda; + a4 = a + 3 * lda; + a += 4 * lda; + ii = 0; + + for (i = 0; i < m; i++) { + + if ((ii >= jj ) && (ii - jj < 4)) { + for (k = 0; k < ii - jj; k ++) { + *(b + k * 2 + 0) = *(a1 + k * lda + 0); + *(b + k * 2 + 1) = *(a1 + k * lda + 1); + } + data1 = *(a1 + (ii - jj) * lda + 0); + data2 = *(a1 + (ii - jj) * lda + 1); + + compinv(b + (ii - jj) * 2, data1, data2); + } + + if (ii - jj >= 4) { + *(b + 0) = *(a1 + 0); + *(b + 1) = *(a1 + 1); + *(b + 2) = *(a2 + 0); + *(b + 3) = *(a2 + 1); + *(b + 4) = *(a3 + 0); + *(b + 5) = *(a3 + 1); + *(b + 6) = *(a4 + 0); + *(b + 7) = *(a4 + 1); + } + + a1 += 2; + a2 += 2; + a3 += 2; + a4 += 2; + b += 8; + ii ++; + } + + jj
+= 4; + } + + if (n & 2) { /* leftover panel of 2 vectors */ + + a1 = a + 0 * lda; + a2 = a + 1 * lda; + a += 2 * lda; + ii = 0; + + for (i = 0; i < m; i++) { + + if ((ii >= jj ) && (ii - jj < 2)) { + for (k = 0; k < ii - jj; k ++) { + *(b + k * 2 + 0) = *(a1 + k * lda + 0); + *(b + k * 2 + 1) = *(a1 + k * lda + 1); + } + data1 = *(a1 + (ii - jj) * lda + 0); + data2 = *(a1 + (ii - jj) * lda + 1); + + compinv(b + (ii - jj) * 2, data1, data2); + } + + if (ii - jj >= 2) { + *(b + 0) = *(a1 + 0); + *(b + 1) = *(a1 + 1); + *(b + 2) = *(a2 + 0); + *(b + 3) = *(a2 + 1); + } + + a1 += 2; + a2 += 2; + b += 4; + ii ++; + } + + jj += 2; + } + + if (n & 1) { /* last single vector; here the k loop below can never iterate because ii - jj is 0 */ + + a1 = a + 0 * lda; + ii = 0; + + for (i = 0; i < m; i++) { + + if ((ii >= jj ) && (ii - jj < 1)) { + for (k = 0; k < ii - jj; k ++) { + *(b + k * 2 + 0) = *(a1 + k * lda + 0); + *(b + k * 2 + 1) = *(a1 + k * lda + 1); + } + data1 = *(a1 + (ii - jj) * lda + 0); + data2 = *(a1 + (ii - jj) * lda + 1); + + compinv(b + (ii - jj) * 2, data1, data2); + } + + if (ii - jj >= 1) { + *(b + 0) = *(a1 + 0); + *(b + 1) = *(a1 + 1); + } + + a1 += 2; + b += 2; + ii ++; + } + } + + return 0; +} diff --git a/kernel/generic/ztrsm_ltcopy_16.c b/kernel/generic/ztrsm_ltcopy_16.c new file mode 100644 index 000000000..e9aeae1ad --- /dev/null +++ b/kernel/generic/ztrsm_ltcopy_16.c @@ -0,0 +1,264 @@ +/******************************************************************************* +Copyright (c) 2024, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3.
Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ + +#include +#include "common.h" + /* ztrsm_ltcopy_16: packs m lda-strided vectors of `a` into `b`, taking 16, then 8/4/2/1, contiguous elements from each. Every element is a pair of FLOATs (presumably real and imaginary parts - confirm). Inside the diagonal block the entry at position (ii - jj) is handed to compinv() and the entries after it are copied; entries before it are left unwritten. Vectors before the diagonal block (ii < jj) are copied whole, vectors after it are skipped. compinv() is defined outside this file (presumably stores the reciprocal of the diagonal entry, or a unit value - confirm). Always returns 0. */ +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ + + BLASLONG i, ii, j, jj, k; + + FLOAT *a1; + FLOAT data1, data2; + + lda *= 2; /* elements are FLOAT pairs; presumably converts lda from element units to FLOAT units */ + jj = offset; /* jj is the value of ii at which the current panel's diagonal block starts */ + + j = (n >> 4); /* number of full 16-wide panels */ + while (j > 0){ + + a1 = a; + a += 32; /* the next panel starts 16 elements (32 FLOATs) further along */ + ii = 0; + + for (i = 0; i < m; i++) { + + if ((ii >= jj ) && (ii - jj < 16)) { /* diagonal block: compinv() entry (ii - jj), then copy the entries after it */ + + data1 = *(a1 + (ii - jj) * 2 + 0); + data2 = *(a1 + (ii - jj) * 2 + 1); + + compinv(b + (ii - jj) * 2, data1, data2); + + for (k = ii - jj + 1; k < 16; k ++) { + *(b + k * 2 + 0) = *(a1 + k * 2 + 0); + *(b + k * 2 + 1) = *(a1 + k * 2 + 1); + } + + } + + if (ii - jj < 0) { /* before the diagonal block: copy all 16 elements (32 contiguous FLOATs) */ + *(b + 0) = *(a1 + 0); + *(b + 1) = *(a1 + 1); + *(b + 2) = *(a1 + 2); + *(b + 3) = *(a1 + 3); + *(b + 4) = *(a1 + 4); + *(b + 5) = *(a1 + 5); + *(b + 6) = *(a1 + 6); + *(b + 7) = *(a1 + 7); + *(b + 8) = *(a1 + 8); + *(b + 9) = *(a1 + 9); + *(b + 10) = *(a1 + 10); + *(b + 11) = *(a1 + 11); + *(b + 12) = *(a1 + 12); + *(b +
13) = *(a1 + 13); + *(b + 14) = *(a1 + 14); + *(b + 15) = *(a1 + 15); + *(b + 16) = *(a1 + 16); + *(b + 17) = *(a1 + 17); + *(b + 18) = *(a1 + 18); + *(b + 19) = *(a1 + 19); + *(b + 20) = *(a1 + 20); + *(b + 21) = *(a1 + 21); + *(b + 22) = *(a1 + 22); + *(b + 23) = *(a1 + 23); + *(b + 24) = *(a1 + 24); + *(b + 25) = *(a1 + 25); + *(b + 26) = *(a1 + 26); + *(b + 27) = *(a1 + 27); + *(b + 28) = *(a1 + 28); + *(b + 29) = *(a1 + 29); + *(b + 30) = *(a1 + 30); + *(b + 31) = *(a1 + 31); + } + + b += 32; /* b advances by a full 16-element slot even when nothing was written */ + a1 += lda; /* step to the next lda-strided vector */ + ii ++; + } + + jj += 16; /* the next panel's diagonal block starts 16 steps later */ + j --; + } + + j = (n & 8); /* same scheme for a leftover panel of 8 elements */ + if (j > 0) { + a1 = a; + a += 16; + ii = 0; + + for (i = 0; i < m; i++) { + + if ((ii >= jj ) && (ii - jj < 8)) { + + data1 = *(a1 + (ii - jj) * 2 + 0); + data2 = *(a1 + (ii - jj) * 2 + 1); + + compinv(b + (ii - jj) * 2, data1, data2); + + for (k = ii - jj + 1; k < 8; k ++) { + *(b + k * 2 + 0) = *(a1 + k * 2 + 0); + *(b + k * 2 + 1) = *(a1 + k * 2 + 1); + } + + } + + if (ii - jj < 0) { + *(b + 0) = *(a1 + 0); + *(b + 1) = *(a1 + 1); + *(b + 2) = *(a1 + 2); + *(b + 3) = *(a1 + 3); + *(b + 4) = *(a1 + 4); + *(b + 5) = *(a1 + 5); + *(b + 6) = *(a1 + 6); + *(b + 7) = *(a1 + 7); + *(b + 8) = *(a1 + 8); + *(b + 9) = *(a1 + 9); + *(b + 10) = *(a1 + 10); + *(b + 11) = *(a1 + 11); + *(b + 12) = *(a1 + 12); + *(b + 13) = *(a1 + 13); + *(b + 14) = *(a1 + 14); + *(b + 15) = *(a1 + 15); + } + + b += 16; + a1 += lda; + ii ++; + } + + jj += 8; + } + + j = (n & 4); /* leftover panel of 4 elements */ + if (j > 0) { + + a1 = a; + a += 8; + ii = 0; + + for (i = 0; i < m; i++) { + + if ((ii >= jj ) && (ii - jj < 4)) { + + data1 = *(a1 + (ii - jj) * 2 + 0); + data2 = *(a1 + (ii - jj) * 2 + 1); + + compinv(b + (ii - jj) * 2, data1, data2); + + for (k = ii - jj + 1; k < 4; k ++) { + *(b + k * 2 + 0) = *(a1 + k * 2 + 0); + *(b + k * 2 + 1) = *(a1 + k * 2 + 1); + } + + } + + if (ii - jj < 0) { + *(b + 0) = *(a1 + 0); + *(b + 1) = *(a1 + 1); + *(b + 2) = *(a1 + 2); + *(b + 3) = *(a1 + 3); + *(b + 4) = *(a1 + 4); + *(b + 5) = *(a1 + 5); + *(b + 6) = *(a1 + 6); +
*(b + 7) = *(a1 + 7); + } + + b += 8; + a1 += lda; + ii ++; + } + + jj += 4; + } + + j = (n & 2); /* leftover panel of 2 elements */ + if (j > 0) { + + a1 = a; + a += 4; + ii = 0; + + for (i = 0; i < m; i++) { + + if ((ii >= jj ) && (ii - jj < 2)) { + + data1 = *(a1 + (ii - jj) * 2 + 0); + data2 = *(a1 + (ii - jj) * 2 + 1); + + compinv(b + (ii - jj) * 2, data1, data2); + + for (k = ii - jj + 1; k < 2; k ++) { + *(b + k * 2 + 0) = *(a1 + k * 2 + 0); + *(b + k * 2 + 1) = *(a1 + k * 2 + 1); + } + + } + + if (ii - jj < 0) { + *(b + 0) = *(a1 + 0); + *(b + 1) = *(a1 + 1); + *(b + 2) = *(a1 + 2); + *(b + 3) = *(a1 + 3); + } + + b += 4; + a1 += lda; + ii ++; + } + + jj += 2; + } + + j = (n & 1); /* last single element per vector */ + if (j > 0) { + + a1 = a; + ii = 0; + + for (i = 0; i < m; i++) { + + if ((ii >= jj ) && (ii - jj < 1)) { + data1 = *(a1 + (ii - jj) * 2 + 0); + data2 = *(a1 + (ii - jj) * 2 + 1); + + compinv(b + (ii - jj) * 2, data1, data2); + } + + if (ii - jj < 0) { + *(b + 0) = *(a1 + 0); + *(b + 1) = *(a1 + 1); + } + + b += 2; + a1 += lda; + ii ++; + } + } + + return 0; +} diff --git a/kernel/generic/ztrsm_uncopy_16.c b/kernel/generic/ztrsm_uncopy_16.c new file mode 100644 index 000000000..e84d96891 --- /dev/null +++ b/kernel/generic/ztrsm_uncopy_16.c @@ -0,0 +1,313 @@ +/******************************************************************************* +Copyright (c) 2024, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3.
Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ + +#include +#include "common.h" + /* ztrsm_uncopy_16: packs m index steps of `a` into `b`, working through panels of 16, then 8/4/2/1, lda-strided vectors. Every element is a pair of FLOATs (presumably real and imaginary parts - confirm). Inside the diagonal block the entry at position (ii - jj) is handed to compinv() and the entries after it are copied; entries before it are left unwritten. Steps before the diagonal block (ii < jj) are copied whole, steps after it are skipped. compinv() is defined outside this file (presumably stores the reciprocal of the diagonal entry, or a unit value - confirm). Always returns 0. */ +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ + + BLASLONG i, ii, j, jj, k; + + FLOAT *a1, *a2, *a3, *a4, *a5, *a6, *a7, *a8; + FLOAT *a9, *a10, *a11, *a12, *a13, *a14, *a15, *a16; + + FLOAT data1, data2; + + lda *= 2; /* elements are FLOAT pairs; presumably converts lda from element units to FLOAT units */ + jj = offset; /* jj is the value of ii at which the current panel's diagonal block starts */ + + j = (n >> 4); /* number of full 16-wide panels */ + while (j > 0){ + + a1 = a + 0 * lda; + a2 = a + 1 * lda; + a3 = a + 2 * lda; + a4 = a + 3 * lda; + a5 = a + 4 * lda; + a6 = a + 5 * lda; + a7 = a + 6 * lda; + a8 = a + 7 * lda; + a9 = a + 8 * lda; + a10 = a + 9 * lda; + a11 = a + 10 * lda; + a12 = a + 11 * lda; + a13 = a + 12 * lda; + a14 = a + 13 * lda; + a15 = a + 14 * lda; + a16 = a + 15 * lda; + + a += 16 * lda; + ii = 0; + + for (i = 0; i < m; i++) { + + if ((ii >= jj ) && (ii - jj < 16)) { /* diagonal block: compinv() entry (ii - jj), then copy the entries after it */ + + data1 = *(a1 + (ii - jj) * lda + 0); + data2 = *(a1 + (ii - jj) * lda + 1); + + compinv(b + (ii - jj) * 2, data1, data2); + + for (k = ii - jj + 1; k < 16; k ++) { + *(b + k * 2
+ 0) = *(a1 + k * lda + 0); + *(b + k * 2 + 1) = *(a1 + k * lda + 1); + } + } + + if (ii - jj < 0) { /* before the diagonal block: copy one entry from each of the 16 vectors */ + *(b + 0) = *(a1 + 0); + *(b + 1) = *(a1 + 1); + *(b + 2) = *(a2 + 0); + *(b + 3) = *(a2 + 1); + *(b + 4) = *(a3 + 0); + *(b + 5) = *(a3 + 1); + *(b + 6) = *(a4 + 0); + *(b + 7) = *(a4 + 1); + *(b + 8) = *(a5 + 0); + *(b + 9) = *(a5 + 1); + *(b + 10) = *(a6 + 0); + *(b + 11) = *(a6 + 1); + *(b + 12) = *(a7 + 0); + *(b + 13) = *(a7 + 1); + *(b + 14) = *(a8 + 0); + *(b + 15) = *(a8 + 1); + *(b + 16) = *(a9 + 0); + *(b + 17) = *(a9 + 1); + *(b + 18) = *(a10 + 0); + *(b + 19) = *(a10 + 1); + *(b + 20) = *(a11 + 0); + *(b + 21) = *(a11 + 1); + *(b + 22) = *(a12 + 0); + *(b + 23) = *(a12 + 1); + *(b + 24) = *(a13 + 0); + *(b + 25) = *(a13 + 1); + *(b + 26) = *(a14 + 0); + *(b + 27) = *(a14 + 1); + *(b + 28) = *(a15 + 0); + *(b + 29) = *(a15 + 1); + *(b + 30) = *(a16 + 0); + *(b + 31) = *(a16 + 1); + } + + a1 += 2; + a2 += 2; + a3 += 2; + a4 += 2; + a5 += 2; + a6 += 2; + a7 += 2; + a8 += 2; + a9 += 2; + a10 += 2; + a11 += 2; + a12 += 2; + a13 += 2; + a14 += 2; + a15 += 2; + a16 += 2; + b += 32; /* b advances by a full 16-element slot even when nothing was written */ + ii ++; + } + + jj += 16; /* the next panel's diagonal block starts 16 steps later */ + j --; + } + + if (n & 8) { /* same scheme for a leftover panel of 8 vectors */ + a1 = a + 0 * lda; + a2 = a + 1 * lda; + a3 = a + 2 * lda; + a4 = a + 3 * lda; + a5 = a + 4 * lda; + a6 = a + 5 * lda; + a7 = a + 6 * lda; + a8 = a + 7 * lda; + + a += 8 * lda; + ii = 0; + + for (i = 0; i < m; i++) { + + if ((ii >= jj ) && (ii - jj < 8)) { + + data1 = *(a1 + (ii - jj) * lda + 0); + data2 = *(a1 + (ii - jj) * lda + 1); + + compinv(b + (ii - jj) * 2, data1, data2); + + for (k = ii - jj + 1; k < 8; k ++) { + *(b + k * 2 + 0) = *(a1 + k * lda + 0); + *(b + k * 2 + 1) = *(a1 + k * lda + 1); + } + } + + if (ii - jj < 0) { + *(b + 0) = *(a1 + 0); + *(b + 1) = *(a1 + 1); + *(b + 2) = *(a2 + 0); + *(b + 3) = *(a2 + 1); + *(b + 4) = *(a3 + 0); + *(b + 5) = *(a3 + 1); + *(b + 6) = *(a4 + 0); + *(b + 7) = *(a4 + 1); + *(b + 8) = *(a5 + 0); + *(b + 9) = *(a5 + 1); + *(b + 10) = *(a6 + 0); + *(b + 11) = *(a6 + 1); + *(b +
12) = *(a7 + 0); + *(b + 13) = *(a7 + 1); + *(b + 14) = *(a8 + 0); + *(b + 15) = *(a8 + 1); + } + + a1 += 2; + a2 += 2; + a3 += 2; + a4 += 2; + a5 += 2; + a6 += 2; + a7 += 2; + a8 += 2; + b += 16; + ii ++; + } + + jj += 8; + } + + if (n & 4) { /* leftover panel of 4 vectors */ + + a1 = a + 0 * lda; + a2 = a + 1 * lda; + a3 = a + 2 * lda; + a4 = a + 3 * lda; + a += 4 * lda; + ii = 0; + + for (i = 0; i < m; i++) { + + if ((ii >= jj ) && (ii - jj < 4)) { + data1 = *(a1 + (ii - jj) * lda + 0); + data2 = *(a1 + (ii - jj) * lda + 1); + + compinv(b + (ii - jj) * 2, data1, data2); + + for (k = ii - jj + 1; k < 4; k ++) { + *(b + k * 2 + 0) = *(a1 + k * lda + 0); + *(b + k * 2 + 1) = *(a1 + k * lda + 1); + } + } + + if (ii - jj < 0) { + *(b + 0) = *(a1 + 0); + *(b + 1) = *(a1 + 1); + *(b + 2) = *(a2 + 0); + *(b + 3) = *(a2 + 1); + *(b + 4) = *(a3 + 0); + *(b + 5) = *(a3 + 1); + *(b + 6) = *(a4 + 0); + *(b + 7) = *(a4 + 1); + } + + a1 += 2; + a2 += 2; + a3 += 2; + a4 += 2; + b += 8; + ii ++; + } + + jj += 4; + } + + if (n & 2) { /* leftover panel of 2 vectors */ + + a1 = a + 0 * lda; + a2 = a + 1 * lda; + a += 2 * lda; + ii = 0; + + for (i = 0; i < m; i++) { + + if ((ii >= jj ) && (ii - jj < 2)) { + data1 = *(a1 + (ii - jj) * lda + 0); + data2 = *(a1 + (ii - jj) * lda + 1); + + compinv(b + (ii - jj) * 2, data1, data2); + for (k = ii - jj + 1; k < 2; k ++) { + *(b + k * 2 + 0) = *(a1 + k * lda + 0); + *(b + k * 2 + 1) = *(a1 + k * lda + 1); + } + } + + if (ii - jj < 0) { + *(b + 0) = *(a1 + 0); + *(b + 1) = *(a1 + 1); + *(b + 2) = *(a2 + 0); + *(b + 3) = *(a2 + 1); + } + + a1 += 2; + a2 += 2; + b += 4; + ii ++; + } + + jj += 2; + } + + if (n & 1) { /* last single vector; the k loop below can never iterate because ii - jj is 0 */ + + a1 = a + 0 * lda; + ii = 0; + + for (i = 0; i < m; i++) { + + if ((ii >= jj ) && (ii - jj < 1)) { + data1 = *(a1 + (ii - jj) * lda + 0); + data2 = *(a1 + (ii - jj) * lda + 1); + + compinv(b + (ii - jj) * 2, data1, data2); + for (k = ii - jj + 1; k < 1; k ++) { + *(b + k * 2 + 0) = *(a1 + k * lda + 0); + *(b + k * 2 + 1) = *(a1 + k * lda + 1); + } + } + + if (ii - jj < 0) { + *(b + 0) = *(a1 +
0); + *(b + 1) = *(a1 + 1); + } + + a1 += 2; + b += 2; + ii ++; + } + } + + return 0; +} diff --git a/kernel/generic/ztrsm_utcopy_16.c b/kernel/generic/ztrsm_utcopy_16.c new file mode 100644 index 000000000..efcea5c3f --- /dev/null +++ b/kernel/generic/ztrsm_utcopy_16.c @@ -0,0 +1,261 @@ +/******************************************************************************* +Copyright (c) 2024, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/ + +#include +#include "common.h" + /* ztrsm_utcopy_16: packs m lda-strided vectors of `a` into `b`, taking 16, then 8/4/2/1, contiguous elements from each. Every element is a pair of FLOATs (presumably real and imaginary parts - confirm). Inside the diagonal block the entries before position (ii - jj) are copied and the entry at (ii - jj) is handed to compinv(); later entries are left unwritten. Vectors past the diagonal block are copied whole, vectors before it (ii < jj) are skipped. compinv() is defined outside this file (presumably stores the reciprocal of the diagonal entry, or a unit value - confirm). Always returns 0. */ +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ + + BLASLONG i, ii, j, jj, k; + + FLOAT *a1, data1, data2; + + lda *= 2; /* elements are FLOAT pairs; presumably converts lda from element units to FLOAT units */ + + jj = offset; /* jj is the value of ii at which the current panel's diagonal block starts */ + + j = (n >> 4); /* number of full 16-wide panels */ + while (j > 0){ + + a1 = a; + a += 32; /* the next panel starts 16 elements (32 FLOATs) further along */ + ii = 0; + + for (i = 0; i < m; i++) { + + if ((ii >= jj ) && (ii - jj < 16)) { /* diagonal block: copy the (ii - jj) leading entries, then compinv() the next one */ + for (k = 0; k < ii - jj; k ++) { + *(b + k * 2 + 0) = *(a1 + k * 2 + 0); + *(b + k * 2 + 1) = *(a1 + k * 2 + 1); + } + + data1 = *(a1 + (ii - jj) * 2 + 0); + data2 = *(a1 + (ii - jj) * 2 + 1); + + compinv(b + (ii - jj) * 2, data1, data2); + } + + if (ii - jj >= 16) { /* past the diagonal block: copy all 16 elements (32 contiguous FLOATs) */ + *(b + 0) = *(a1 + 0); + *(b + 1) = *(a1 + 1); + *(b + 2) = *(a1 + 2); + *(b + 3) = *(a1 + 3); + *(b + 4) = *(a1 + 4); + *(b + 5) = *(a1 + 5); + *(b + 6) = *(a1 + 6); + *(b + 7) = *(a1 + 7); + *(b + 8) = *(a1 + 8); + *(b + 9) = *(a1 + 9); + *(b + 10) = *(a1 + 10); + *(b + 11) = *(a1 + 11); + *(b + 12) = *(a1 + 12); + *(b + 13) = *(a1 + 13); + *(b + 14) = *(a1 + 14); + *(b + 15) = *(a1 + 15); + *(b + 16) = *(a1 + 16); + *(b + 17) = *(a1 + 17); + *(b + 18) = *(a1 + 18); + *(b + 19) = *(a1 + 19); + *(b + 20) = *(a1 + 20); + *(b + 21) = *(a1 + 21); + *(b + 22) = *(a1 + 22); + *(b + 23) = *(a1 + 23); + *(b + 24) = *(a1 + 24); + *(b + 25) = *(a1 + 25); + *(b + 26) = *(a1 + 26); + *(b + 27) = *(a1 + 27); + *(b + 28) = *(a1 + 28); + *(b + 29) = *(a1 + 29); + *(b + 30) = *(a1 + 30); + *(b + 31) = *(a1 + 31); + } + + b += 32; /* b advances by a full 16-element slot even when nothing was written */ + a1 += lda; /* step to the next lda-strided vector */ + ii ++; + } + + jj += 16; /* the next panel's diagonal block starts 16 steps later */ + j --; + } + + j = (n & 8); /* same scheme for a leftover panel of 8 elements */ + if (j > 0) { + a1 = a; + a += 16; + ii = 0; + + for (i = 0; i < m; i++) { + + if ((ii >= jj ) && (ii - jj < 8)) { + for (k = 0; k < ii - jj; k ++) { + *(b + k * 2 + 0) = *(a1 + k * 2 + 0); + *(b + k * 2 + 1) = *(a1 + k * 2 + 1); + } + + data1 = *(a1 + (ii - jj) * 2 + 0); + data2 = *(a1 + (ii - jj) * 2 + 1); + + compinv(b + (ii - jj) * 2, data1, data2); + } + + if (ii - jj >= 8) { + *(b + 0) =
*(a1 + 0); + *(b + 1) = *(a1 + 1); + *(b + 2) = *(a1 + 2); + *(b + 3) = *(a1 + 3); + *(b + 4) = *(a1 + 4); + *(b + 5) = *(a1 + 5); + *(b + 6) = *(a1 + 6); + *(b + 7) = *(a1 + 7); + *(b + 8) = *(a1 + 8); + *(b + 9) = *(a1 + 9); + *(b + 10) = *(a1 + 10); + *(b + 11) = *(a1 + 11); + *(b + 12) = *(a1 + 12); + *(b + 13) = *(a1 + 13); + *(b + 14) = *(a1 + 14); + *(b + 15) = *(a1 + 15); + } + + b += 16; + a1 += lda; + ii ++; + } + + jj += 8; + } + + j = (n & 4); /* leftover panel of 4 elements */ + if (j > 0) { + + a1 = a; + a += 8; + ii = 0; + + for (i = 0; i < m; i++) { + + if ((ii >= jj ) && (ii - jj < 4)) { + for (k = 0; k < ii - jj; k ++) { + *(b + k * 2 + 0) = *(a1 + k * 2 + 0); + *(b + k * 2 + 1) = *(a1 + k * 2 + 1); + } + + data1 = *(a1 + (ii - jj) * 2 + 0); + data2 = *(a1 + (ii - jj) * 2 + 1); + + compinv(b + (ii - jj) * 2, data1, data2); + } + + if (ii - jj >= 4) { + *(b + 0) = *(a1 + 0); + *(b + 1) = *(a1 + 1); + *(b + 2) = *(a1 + 2); + *(b + 3) = *(a1 + 3); + *(b + 4) = *(a1 + 4); + *(b + 5) = *(a1 + 5); + *(b + 6) = *(a1 + 6); + *(b + 7) = *(a1 + 7); + } + + b += 8; + a1 += lda; + ii ++; + } + + jj += 4; + } + + j = (n & 2); /* leftover panel of 2 elements */ + if (j > 0) { + + a1 = a; + a += 4; + ii = 0; + + for (i = 0; i < m; i++) { + + if ((ii >= jj ) && (ii - jj < 2)) { + for (k = 0; k < ii - jj; k ++) { + *(b + k * 2 + 0) = *(a1 + k * 2 + 0); + *(b + k * 2 + 1) = *(a1 + k * 2 + 1); + } + + data1 = *(a1 + (ii - jj) * 2 + 0); + data2 = *(a1 + (ii - jj) * 2 + 1); + + compinv(b + (ii - jj) * 2, data1, data2); + } + + if (ii - jj >= 2) { + *(b + 0) = *(a1 + 0); + *(b + 1) = *(a1 + 1); + *(b + 2) = *(a1 + 2); + *(b + 3) = *(a1 + 3); + } + + b += 4; + a1 += lda; + ii ++; + } + + jj += 2; + } + + j = (n & 1); /* last single element per vector; the k loop below can never iterate because ii - jj is 0 */ + if (j > 0) { + + a1 = a; + ii = 0; + + for (i = 0; i < m; i++) { + + if ((ii >= jj ) && (ii - jj < 1)) { + for (k = 0; k < ii - jj; k ++) { + *(b + k * 2 + 0) = *(a1 + k * 2 + 0); + *(b + k * 2 + 1) = *(a1 + k * 2 + 1); + } + + data1 = *(a1 + (ii - jj) * 2 + 0); + data2 = *(a1 + (ii - jj) * 2 + 1); + + compinv(b + (ii - jj)
* 2, data1, data2); + } + + if (ii - jj >= 1) { + *(b + 0) = *(a1 + 0); + *(b + 1) = *(a1 + 1); + } + + b += 2; + a1 += lda; + ii ++; + } + } + + return 0; +} diff --git a/kernel/loongarch64/KERNEL.LOONGSON3R5 b/kernel/loongarch64/KERNEL.LOONGSON3R5 index 17d15656a..ce9268b93 100644 --- a/kernel/loongarch64/KERNEL.LOONGSON3R5 +++ b/kernel/loongarch64/KERNEL.LOONGSON3R5 @@ -111,9 +111,13 @@ SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) SGEMVNKERNEL = sgemv_n_8_lasx.S SGEMVTKERNEL = sgemv_t_8_lasx.S -CGEMMKERNEL = cgemm_kernel_2x2_lsx.S -CGEMMONCOPY = cgemm_ncopy_2_lsx.S -CGEMMOTCOPY = cgemm_tcopy_2_lsx.S +CGEMMKERNEL = cgemm_kernel_16x4_lasx.S +CGEMMINCOPY = cgemm_ncopy_16_lasx.S +CGEMMITCOPY = cgemm_tcopy_16_lasx.S +CGEMMONCOPY = cgemm_ncopy_4_lasx.S +CGEMMOTCOPY = cgemm_tcopy_4_lasx.S +CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) +CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) diff --git a/kernel/loongarch64/cgemm_kernel_16x4_lasx.S b/kernel/loongarch64/cgemm_kernel_16x4_lasx.S new file mode 100644 index 000000000..249abe102 --- /dev/null +++ b/kernel/loongarch64/cgemm_kernel_16x4_lasx.S @@ -0,0 +1,3757 @@ +/******************************************************************************* +Copyright (c) 2024, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3.
Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ +#define ASSEMBLER + +#include "common.h" + + +/* Function parameters */ +#define M $r4 // param 1: bm +#define N $r5 // param 2: bn +#define K $r6 // param 3: bk +#define ALPHA_R $f0 // param 4: alphar +#define ALPHA_I $f1 // param 5: alphai +#define A $r7 // param 6: ba +#define B $r8 // param 7: bb +#define C $r9 // param 8: bc +#define LDC $r10 // param 9: ldc + +#if defined (TRMMKERNEL) +#define OFFSET $r11 // param 10: offset +#endif +#define OFF $r26 + +#define I $r12 +#define J $r13 +#define L $r14 +#define TL $r15 +#define A0 $r16 +#define B0 $r17 +#define C0 $r18 +#define C1 $r19 +#define C2 $r20 +#define C3 $r23 +#define T0 $r24 +#define T1 $r25 +#define T2 $r26 +#define T3 $r27 + +#define a1 $f2 +#define a2 $f3 +#define a3 $f4 +#define a4 $f5 +#define a5 $f6 +#define a6 $f7 +#define a7 $f8 +#define a8 $f9 +#define b1 $f10 +#define b2 $f11 +#define b3 $f12 +#define b4 $f13 +#define b5 $f14 +#define b6 $f15 +#define b7 $f16 +#define b8 
$f17 +#define c11 $f18 +#define c12 $f19 +#define c21 $f20 +#define c22 $f21 +#define c31 $f22 +#define c32 $f23 +#define c41 $f24 +#define c42 $f25 + +/* LASX vectors */ +#define U0 $xr30 +#define U1 $xr31 +#define U2 $xr2 +#define U3 $xr3 +#define U4 $xr4 +#define U5 $xr5 +#define U6 $xr6 +#define U7 $xr7 +#define U8 $xr8 +#define U9 $xr9 +#define U10 $xr10 +#define U11 $xr11 +#define U12 $xr12 +#define U13 $xr13 +#define U14 $xr14 +#define U15 $xr15 +#define D0 $xr16 +#define D1 $xr17 +#define D2 $xr18 +#define D3 $xr19 +#define D4 $xr20 +#define D5 $xr21 +#define D6 $xr22 +#define D7 $xr23 +#define D8 $xr24 +#define D9 $xr25 +#define D10 $xr26 +#define D11 $xr27 +#define D12 $xr28 +#define D13 $xr29 +#define VALPHAR $xr28 +#define VALPHAI $xr29 + + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) +#define XVMADD1 XVFMADD +#define XVMADD2 XVFMADD +#define XVMADD3 XVNMSUB +#define XVMADD4 XVFMADD + +#define VMADD1 VFMADD +#define VMADD2 VFMADD +#define VMADD3 VNMSUB +#define VMADD4 VFMADD + +#define MADD1 MADD +#define MADD2 MADD +#define MADD3 NMSUB +#define MADD4 MADD +#endif + +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) +#define XVMADD1 XVFMADD +#define XVMADD2 XVFMADD +#define XVMADD3 XVFMADD +#define XVMADD4 XVNMSUB + +#define VMADD1 VFMADD +#define VMADD2 VFMADD +#define VMADD3 VFMADD +#define VMADD4 VNMSUB + +#define MADD1 MADD +#define MADD2 MADD +#define MADD3 MADD +#define MADD4 NMSUB +#endif + +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) +#define XVMADD1 XVFMADD +#define XVMADD2 XVNMSUB +#define XVMADD3 XVFMADD +#define XVMADD4 XVFMADD + +#define VMADD1 VFMADD +#define VMADD2 VNMSUB +#define VMADD3 VFMADD +#define VMADD4 VFMADD + +#define MADD1 MADD +#define MADD2 NMSUB +#define MADD3 MADD +#define MADD4 MADD +#endif + +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) +#define XVMADD1 XVFMADD +#define XVMADD2 XVNMSUB +#define XVMADD3 XVNMSUB +#define XVMADD4 XVNMSUB + +#define VMADD1 VFMADD 
+#define VMADD2 VNMSUB +#define VMADD3 VNMSUB +#define VMADD4 VNMSUB + +#define MADD1 MADD +#define MADD2 NMSUB +#define MADD3 NMSUB +#define MADD4 NMSUB +#endif + + PROLOGUE + + addi.d $sp, $sp, -128 + SDARG $r23, $sp, 0 + SDARG $r24, $sp, 8 + SDARG $r25, $sp, 16 + SDARG $r26, $sp, 24 + SDARG $r27, $sp, 32 + ST $f23, $sp, 40 + ST $f24, $sp, 48 + ST $f25, $sp, 56 + ST $f26, $sp, 64 + ST $f27, $sp, 72 + ST $f28, $sp, 80 + ST $f29, $sp, 88 + ST $f30, $sp, 96 + ST $f31, $sp, 104 + ST ALPHA_R,$sp, 112 + ST ALPHA_I,$sp, 120 + + xvldrepl.w VALPHAR, $sp, 112 + xvldrepl.w VALPHAI, $sp, 120 + +#if defined (TRMMKERNEL) && !defined(LEFT) + sub.d OFF, $r0, OFFSET +#else + xor OFF, OFF, OFF +#endif + + slli.d LDC, LDC, 2 + + move J, $r0 + srai.d T0, N, 2 //bn/4 + beq J, T0, .L19 + +.L10: /* for(j=0; j0) */ + xvld U0, S1, 0x00 //1 2 3 4 5 6 7 8 + xvld U1, S2, 0x00 //9 10 11 12 13 14 15 16 + + xvand.v D0, U0, U0 + xvand.v D1, U1, U1 + + xvshuf4i.d D0, U1, 0x88 //1 2 9 10 5 6 13 14 + xvshuf4i.d D1, U0, 0x77 //3 4 11 12 7 8 15 16 + + xvand.v U4, D0, D0 + + xvpermi.q U4, D1, 0x02 //1 2 9 10 3 4 11 12 + xvpermi.q D1, D0, 0x31 //5 6 13 14 7 8 15 16 + + xvst U4, TD, 0x00 + xvst D1, TD, 0x20 + + addi.d S1, S1, 0x20 // a_offset + addi.d S2, S2, 0x20 + addi.d TD, TD, 0x40 // b_offset + + addi.d I, I, -1 + blt ZERO, I, .L_N11 + +.L_N10: /* if(m&2) */ + andi I, M, 0x02 + beq I, ZERO, .L_N130 + + vld $vr0, S1, 0x00 + vld $vr1, S2, 0x00 + vand.v $vr8, $vr1, $vr1 + + vpermi.w $vr8, $vr0, 0x44 + vpermi.w $vr1, $vr0, 0xee + + vst $vr8, TD, 0x00 + vst $vr1, TD, 0x10 + + addi.d S1, S1, 0x10 // a_offset + addi.d S2, S2, 0x10 + addi.d TD, TD, 0x20 // b_offset + +.L_N130: /* if(m&1) */ + andi I, M, 0x01 + beq I, ZERO, .L_N20 + + fld.s F0, S1, 0x00 + fld.s F1, S1, 0x04 + + fld.s F2, S2, 0x00 + fld.s F3, S2, 0x04 + + fst.s F0, TD, 0x00 + fst.s F1, TD, 0x04 + fst.s F2, TD, 0x08 + fst.s F3, TD, 0x0c + + addi.d TD, TD, 0x10 + +.L_N20: /* if(n&1) */ + andi I, N, 0x01 + beq I, ZERO, .L_N00 + + move S1, TS + 
srai.d I, M, 0x02 + + beq I, ZERO, .L_N30 + +.L_N21: /* if(i>0) */ + xvld U0, S1, 0x00 + + xvst U0, TD, 0x00 + + addi.d S1, S1, 0x20 // aoffset1 + addi.d TD, TD, 0x20 // b_offset + + addi.d I, I, -1 + blt ZERO, I, .L_N21 + +.L_N30: /* if(m&2) */ + andi I, M, 0x02 + beq I, ZERO, .L_N330 + + vld $vr0, S1, 0x00 + + vst $vr0, TD, 0x00 + + addi.d S1, S1, 0x10 // aoffset1 + addi.d TD, TD, 0x10 // b_offset + +.L_N330: /* if(m&1) */ + andi I, M, 0x01 + beq I, ZERO, .L_N00 + + fld.s F0, S1, 0x00 + fld.s F1, S1, 0x04 + + fst.s F0, TD, 0x00 + fst.s F1, TD, 0x04 + +.L_N00: + LDARG $r23, $sp, 0 + addi.d $sp, $sp, 8 + jirl $r0, $r1, 0x00 + + EPILOGUE \ No newline at end of file diff --git a/kernel/loongarch64/cgemm_tcopy_16_lasx.S b/kernel/loongarch64/cgemm_tcopy_16_lasx.S new file mode 100644 index 000000000..7d9eb94c8 --- /dev/null +++ b/kernel/loongarch64/cgemm_tcopy_16_lasx.S @@ -0,0 +1,741 @@ +/******************************************************************************* +Copyright (c) 2024, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ +#define ASSEMBLER + +#include "common.h" +/* Function parameters */ +#define M $r4 // param 1: m +#define N $r5 // param 2: n +#define SRC $r6 // param 3: src +#define LDA $r7 // param 4: lda +#define DST $r8 // param 5: dst + +#define I $r9 +#define J $r10 +#define S0 $r11 +#define S1 $r12 +#define S2 $r13 +#define S3 $r14 +#define S4 $r15 +#define S5 $r16 +#define S6 $r17 +#define S7 $r18 +#define S8 $r19 +#define P0 $r20 +#define P1 $r23 +#define P2 $r24 +#define P3 $r25 +#define P4 $r26 +#define P5 $r27 +#define T0 $r28 +#define T1 $r29 +#define TL $r7 +#define ZERO $r0 + +#define F0 $f0 +#define F1 $f1 +#define F2 $f2 +#define F3 $f3 +#define F4 $f4 +#define F5 $f5 +#define F6 $f6 +#define F7 $f7 +#define F8 $f8 +#define F9 $f9 +#define F10 $f10 +#define F11 $f11 +#define F12 $f12 +#define F13 $f13 +#define F14 $f14 +#define F15 $f15 +/* LASX vectors */ +#define U0 $xr0 +#define U1 $xr1 +#define U2 $xr2 +#define U3 $xr3 +#define U4 $xr4 +#define U5 $xr5 +#define U6 $xr6 +#define U7 $xr7 + + PROLOGUE + + addi.d $sp, $sp, -56 + SDARG $r23, $sp, 0 + SDARG $r24, $sp, 8 + SDARG $r25, $sp, 16 + SDARG $r26, $sp, 24 + SDARG $r27, $sp, 32 + SDARG $r28, $sp, 40 + SDARG $r29, $sp, 48 + + move S0, SRC + move P0, DST + + srai.d T0, N, 0x04 + srai.d T1, N, 0x03 + slli.d T0, T0, 0x04 + slli.d T1, T1, 0x03 + mul.d P2, M, T0 + mul.d P3, M, T1 + slli.d P2, P2, 0x03 + slli.d P3, P3, 
0x03 + add.d P2, DST, P2 + add.d P3, DST, P3 + + srai.d T0, N, 0x02 + srai.d T1, N, 0x01 + slli.d T0, T0, 0x02 + slli.d T1, T1, 0x01 + mul.d P4, M, T0 + mul.d P5, M, T1 + slli.d P4, P4, 0x03 + slli.d P5, P5, 0x03 + add.d P4, DST, P4 + add.d P5, DST, P5 + + slli.d TL, LDA, 0x03 + srai.d J, M, 0x03 + slli.d T0, TL, 0x01 + slli.d T1, M, 0x07 + beq ZERO, J, .L_M7 + +.L_J1: /* J-- */ + move S1, S0 + add.d S2, S0, TL + add.d S3, S1, T0 + add.d S4, S2, T0 + add.d S5, S3, T0 + add.d S6, S4, T0 + add.d S7, S5, T0 + add.d S8, S6, T0 + add.d S0, S7, T0 + + move P1, P0 + addi.d P0, P0, 0x400 + + srai.d I, N, 0x04 + addi.d J, J, -1 + beq ZERO, I, .L_N15 + +.L_I1: /* I-- */ + xvld U0, S1, 0x00 + xvld U1, S1, 0x20 + xvld U2, S1, 0x40 + xvld U3, S1, 0x60 + xvld U4, S2, 0x00 + xvld U5, S2, 0x20 + xvld U6, S2, 0x40 + xvld U7, S2, 0x60 + + xvst U0, P1, 0x00 + xvst U1, P1, 0x20 + xvst U2, P1, 0x40 + xvst U3, P1, 0x60 + xvst U4, P1, 0x80 + xvst U5, P1, 0xA0 + xvst U6, P1, 0xC0 + xvst U7, P1, 0xE0 + + xvld U0, S3, 0x00 + xvld U1, S3, 0x20 + xvld U2, S3, 0x40 + xvld U3, S3, 0x60 + xvld U4, S4, 0x00 + xvld U5, S4, 0x20 + xvld U6, S4, 0x40 + xvld U7, S4, 0x60 + + xvst U0, P1, 0x100 + xvst U1, P1, 0x120 + xvst U2, P1, 0x140 + xvst U3, P1, 0x160 + xvst U4, P1, 0x180 + xvst U5, P1, 0x1A0 + xvst U6, P1, 0x1C0 + xvst U7, P1, 0x1E0 + + xvld U0, S5, 0x00 + xvld U1, S5, 0x20 + xvld U2, S5, 0x40 + xvld U3, S5, 0x60 + xvld U4, S6, 0x00 + xvld U5, S6, 0x20 + xvld U6, S6, 0x40 + xvld U7, S6, 0x60 + + xvst U0, P1, 0x200 + xvst U1, P1, 0x220 + xvst U2, P1, 0x240 + xvst U3, P1, 0x260 + xvst U4, P1, 0x280 + xvst U5, P1, 0x2A0 + xvst U6, P1, 0x2C0 + xvst U7, P1, 0x2E0 + + xvld U0, S7, 0x00 + xvld U1, S7, 0x20 + xvld U2, S7, 0x40 + xvld U3, S7, 0x60 + xvld U4, S8, 0x00 + xvld U5, S8, 0x20 + xvld U6, S8, 0x40 + xvld U7, S8, 0x60 + + xvst U0, P1, 0x300 + xvst U1, P1, 0x320 + xvst U2, P1, 0x340 + xvst U3, P1, 0x360 + xvst U4, P1, 0x380 + xvst U5, P1, 0x3A0 + xvst U6, P1, 0x3C0 + xvst U7, P1, 0x3E0 + + addi.d 
S1, S1, 0x80 + addi.d S2, S2, 0x80 + addi.d S3, S3, 0x80 + addi.d S4, S4, 0x80 + addi.d S5, S5, 0x80 + addi.d S6, S6, 0x80 + addi.d S7, S7, 0x80 + addi.d S8, S8, 0x80 + addi.d I, I, -1 + add.d P1, P1, T1 + blt ZERO, I, .L_I1 + +.L_N15: + andi I, N, 0x08 + beq ZERO, I, .L_N7 + + xvld U0, S1, 0x00 + xvld U1, S1, 0x20 + xvld U2, S2, 0x00 + xvld U3, S2, 0x20 + xvld U4, S3, 0x00 + xvld U5, S3, 0x20 + xvld U6, S4, 0x00 + xvld U7, S4, 0x20 + + xvst U0, P2, 0x00 + xvst U1, P2, 0x20 + xvst U2, P2, 0x40 + xvst U3, P2, 0x60 + xvst U4, P2, 0x80 + xvst U5, P2, 0xA0 + xvst U6, P2, 0xC0 + xvst U7, P2, 0xE0 + + xvld U0, S5, 0x00 + xvld U1, S5, 0x20 + xvld U2, S6, 0x00 + xvld U3, S6, 0x20 + xvld U4, S7, 0x00 + xvld U5, S7, 0x20 + xvld U6, S8, 0x00 + xvld U7, S8, 0x20 + + xvst U0, P2, 0x100 + xvst U1, P2, 0x120 + xvst U2, P2, 0x140 + xvst U3, P2, 0x160 + xvst U4, P2, 0x180 + xvst U5, P2, 0x1A0 + xvst U6, P2, 0x1C0 + xvst U7, P2, 0x1E0 + + addi.d S1, S1, 0x40 + addi.d S2, S2, 0x40 + addi.d S3, S3, 0x40 + addi.d S4, S4, 0x40 + addi.d S5, S5, 0x40 + addi.d S6, S6, 0x40 + addi.d S7, S7, 0x40 + addi.d S8, S8, 0x40 + addi.d P2, P2, 0x200 + +.L_N7: + andi I, N, 0x04 + beq ZERO, I, .L_N3 + + xvld U0, S1, 0x00 + xvld U1, S2, 0x00 + xvld U2, S3, 0x00 + xvld U3, S4, 0x00 + xvld U4, S5, 0x00 + xvld U5, S6, 0x00 + xvld U6, S7, 0x00 + xvld U7, S8, 0x00 + + xvst U0, P3, 0x00 + xvst U1, P3, 0x20 + xvst U2, P3, 0x40 + xvst U3, P3, 0x60 + xvst U4, P3, 0x80 + xvst U5, P3, 0xA0 + xvst U6, P3, 0xC0 + xvst U7, P3, 0xE0 + + addi.d S1, S1, 0x20 + addi.d S2, S2, 0x20 + addi.d S3, S3, 0x20 + addi.d S4, S4, 0x20 + addi.d S5, S5, 0x20 + addi.d S6, S6, 0x20 + addi.d S7, S7, 0x20 + addi.d S8, S8, 0x20 + addi.d P3, P3, 0x100 + +.L_N3: + andi I, N, 0x02 + beq ZERO, I, .L_N1 + + xvld U0, S1, 0x00 + xvld U1, S2, 0x00 + xvld U2, S3, 0x00 + xvld U3, S4, 0x00 + xvld U4, S5, 0x00 + xvld U5, S6, 0x00 + xvld U6, S7, 0x00 + xvld U7, S8, 0x00 + + xvpermi.q U0, U1, 0x02 + xvpermi.q U2, U3, 0x02 + xvpermi.q U4, U5, 0x02 + 
xvpermi.q U6, U7, 0x02 + + xvst U0, P4, 0x00 + xvst U2, P4, 0x20 + xvst U4, P4, 0x40 + xvst U6, P4, 0x60 + + addi.d S1, S1, 0x10 + addi.d S2, S2, 0x10 + addi.d S3, S3, 0x10 + addi.d S4, S4, 0x10 + addi.d S5, S5, 0x10 + addi.d S6, S6, 0x10 + addi.d S7, S7, 0x10 + addi.d S8, S8, 0x10 + addi.d P4, P4, 0x80 + +.L_N1: + andi I, N, 0x01 + beq ZERO, I, .L_N0 + + fld.s F0, S1, 0x00 + fld.s F1, S1, 0x04 + + fld.s F2, S2, 0x00 + fld.s F3, S2, 0x04 + + fld.s F4, S3, 0x00 + fld.s F5, S3, 0x04 + + fld.s F6, S4, 0x00 + fld.s F7, S4, 0x04 + + fld.s F8, S5, 0x00 + fld.s F9, S5, 0x04 + + fld.s F10, S6, 0x00 + fld.s F11, S6, 0x04 + + fld.s F12, S7, 0x00 + fld.s F13, S7, 0x04 + + fld.s F14, S8, 0x00 + fld.s F15, S8, 0x04 + + fst.s F0, P5, 0x00 + fst.s F1, P5, 0x04 + fst.s F2, P5, 0x08 + fst.s F3, P5, 0x0c + fst.s F4, P5, 0x10 + fst.s F5, P5, 0x14 + fst.s F6, P5, 0x18 + fst.s F7, P5, 0x1c + fst.s F8, P5, 0x20 + fst.s F9, P5, 0x24 + fst.s F10, P5, 0x28 + fst.s F11, P5, 0x2c + fst.s F12, P5, 0x30 + fst.s F13, P5, 0x34 + fst.s F14, P5, 0x38 + fst.s F15, P5, 0x3c + + addi.d S1, S1, 0x08 + addi.d S2, S2, 0x08 + addi.d S3, S3, 0x08 + addi.d S4, S4, 0x08 + addi.d S5, S5, 0x08 + addi.d S6, S6, 0x08 + addi.d S7, S7, 0x08 + addi.d S8, S8, 0x08 + addi.d P5, P5, 0x40 + +.L_N0: + blt ZERO, J, .L_J1 + +.L_M7: + andi J, M, 0x04 + beq ZERO, J, .L_M3 + + move S1, S0 + add.d S2, S0, TL + add.d S3, S1, T0 + add.d S4, S2, T0 + add.d S0, S3, T0 + + move P1, P0 + addi.d P0, P0, 0x200 + + srai.d I, N, 0x04 + beq ZERO, I, .L_4N15 + +.L_4I1: /* I-- */ + xvld U0, S1, 0x00 + xvld U1, S1, 0x20 + xvld U2, S1, 0x40 + xvld U3, S1, 0x60 + xvld U4, S2, 0x00 + xvld U5, S2, 0x20 + xvld U6, S2, 0x40 + xvld U7, S2, 0x60 + + xvst U0, P1, 0x00 + xvst U1, P1, 0x20 + xvst U2, P1, 0x40 + xvst U3, P1, 0x60 + xvst U4, P1, 0x80 + xvst U5, P1, 0xA0 + xvst U6, P1, 0xC0 + xvst U7, P1, 0xE0 + + xvld U0, S3, 0x00 + xvld U1, S3, 0x20 + xvld U2, S3, 0x40 + xvld U3, S3, 0x60 + xvld U4, S4, 0x00 + xvld U5, S4, 0x20 + xvld U6, S4, 0x40 + 
xvld U7, S4, 0x60 + + xvst U0, P1, 0x100 + xvst U1, P1, 0x120 + xvst U2, P1, 0x140 + xvst U3, P1, 0x160 + xvst U4, P1, 0x180 + xvst U5, P1, 0x1A0 + xvst U6, P1, 0x1C0 + xvst U7, P1, 0x1E0 + + addi.d S1, S1, 0x80 + addi.d S2, S2, 0x80 + addi.d S3, S3, 0x80 + addi.d S4, S4, 0x80 + addi.d I, I, -1 + add.d P1, P1, T1 + blt ZERO, I, .L_4I1 + +.L_4N15: + andi I, N, 0x08 + beq ZERO, I, .L_4N7 + + xvld U0, S1, 0x00 + xvld U1, S1, 0x20 + xvld U2, S2, 0x00 + xvld U3, S2, 0x20 + xvld U4, S3, 0x00 + xvld U5, S3, 0x20 + xvld U6, S4, 0x00 + xvld U7, S4, 0x20 + + xvst U0, P2, 0x00 + xvst U1, P2, 0x20 + xvst U2, P2, 0x40 + xvst U3, P2, 0x60 + xvst U4, P2, 0x80 + xvst U5, P2, 0xA0 + xvst U6, P2, 0xC0 + xvst U7, P2, 0xE0 + + addi.d S1, S1, 0x40 + addi.d S2, S2, 0x40 + addi.d S3, S3, 0x40 + addi.d S4, S4, 0x40 + addi.d P2, P2, 0x100 + +.L_4N7: + andi I, N, 0x04 + beq ZERO, I, .L_4N3 + + xvld U0, S1, 0x00 + xvld U1, S2, 0x00 + xvld U2, S3, 0x00 + xvld U3, S4, 0x00 + + xvst U0, P3, 0x00 + xvst U1, P3, 0x20 + xvst U2, P3, 0x40 + xvst U3, P3, 0x60 + + addi.d S1, S1, 0x20 + addi.d S2, S2, 0x20 + addi.d S3, S3, 0x20 + addi.d S4, S4, 0x20 + addi.d P3, P3, 0x80 + +.L_4N3: + andi I, N, 0x02 + beq ZERO, I, .L_4N1 + + xvld U0, S1, 0x00 + xvld U1, S2, 0x00 + xvld U2, S3, 0x00 + xvld U3, S4, 0x00 + + xvpermi.q U0, U1, 0x02 + xvpermi.q U2, U3, 0x02 + + xvst U0, P4, 0x00 + xvst U2, P4, 0x20 + + addi.d S1, S1, 0x10 + addi.d S2, S2, 0x10 + addi.d S3, S3, 0x10 + addi.d S4, S4, 0x10 + addi.d P4, P4, 0x40 + +.L_4N1: + andi I, N, 0x01 + beq ZERO, I, .L_M3 + + fld.d F0, S1, 0x00 + fld.d F1, S2, 0x00 + fld.d F2, S3, 0x00 + fld.d F3, S4, 0x00 + + fst.d F0, P5, 0x00 + fst.d F1, P5, 0x08 + fst.d F2, P5, 0x10 + fst.d F3, P5, 0x18 + + addi.d S1, S1, 0x08 + addi.d S2, S2, 0x08 + addi.d S3, S3, 0x08 + addi.d S4, S4, 0x08 + addi.d P5, P5, 0x20 + +.L_M3: + andi J, M, 0x02 + beq ZERO, J, .L_M1 + + move S1, S0 + add.d S2, S0, TL + add.d S0, S0, T0 + + move P1, P0 + addi.d P0, P0, 0x100 + + srai.d I, N, 0x04 + beq 
ZERO, I, .L_2N15 + +.L_2I1: /* I-- */ + xvld U0, S1, 0x00 + xvld U1, S1, 0x20 + xvld U2, S1, 0x40 + xvld U3, S1, 0x60 + xvld U4, S2, 0x00 + xvld U5, S2, 0x20 + xvld U6, S2, 0x40 + xvld U7, S2, 0x60 + + xvst U0, P1, 0x00 + xvst U1, P1, 0x20 + xvst U2, P1, 0x40 + xvst U3, P1, 0x60 + xvst U4, P1, 0x80 + xvst U5, P1, 0xA0 + xvst U6, P1, 0xC0 + xvst U7, P1, 0xE0 + + addi.d S1, S1, 0x80 + addi.d S2, S2, 0x80 + addi.d I, I, -1 + add.d P1, P1, T1 + blt ZERO, I, .L_2I1 + +.L_2N15: + andi I, N, 0x08 + beq ZERO, I, .L_2N7 + + xvld U0, S1, 0x00 + xvld U1, S1, 0x20 + xvld U2, S2, 0x00 + xvld U3, S2, 0x20 + + xvst U0, P2, 0x00 + xvst U1, P2, 0x20 + xvst U2, P2, 0x40 + xvst U3, P2, 0x60 + + addi.d S1, S1, 0x40 + addi.d S2, S2, 0x40 + addi.d P2, P2, 0x80 + +.L_2N7: + andi I, N, 0x04 + beq ZERO, I, .L_2N3 + + xvld U0, S1, 0x00 + xvld U1, S2, 0x00 + + xvst U0, P3, 0x00 + xvst U1, P3, 0x20 + + addi.d S1, S1, 0x20 + addi.d S2, S2, 0x20 + addi.d P3, P3, 0x40 + +.L_2N3: + andi I, N, 0x02 + beq ZERO, I, .L_2N1 + + xvld U0, S1, 0x00 + xvld U1, S2, 0x00 + + xvpermi.q U0, U1, 0x02 + + xvst U0, P4, 0x00 + + addi.d S1, S1, 0x10 + addi.d S2, S2, 0x10 + addi.d P4, P4, 0x20 + +.L_2N1: + andi I, N, 0x01 + beq ZERO, I, .L_M1 + + fld.d F0, S1, 0x00 + fld.d F1, S2, 0x00 + + fst.d F0, P5, 0x00 + fst.d F1, P5, 0x08 + + addi.d S1, S1, 0x08 + addi.d S2, S2, 0x08 + addi.d P5, P5, 0x10 + +.L_M1: + andi J, M, 0x01 + beq ZERO, J, .L_M0 + + move S1, S0 + add.d S2, S0, TL + + move P1, P0 + addi.d P0, P0, 0x80 + + srai.d I, N, 0x04 + beq ZERO, I, .L_1N15 + +.L_1I1: /* I-- */ + xvld U0, S1, 0x00 + xvld U1, S1, 0x20 + xvld U2, S1, 0x40 + xvld U3, S1, 0x60 + + xvst U0, P1, 0x00 + xvst U1, P1, 0x20 + xvst U2, P1, 0x40 + xvst U3, P1, 0x60 + + addi.d S1, S1, 0x80 + addi.d I, I, -1 + add.d P1, P1, T1 + blt ZERO, I, .L_1I1 + +.L_1N15: + andi I, N, 0x08 + beq ZERO, I, .L_1N7 + + xvld U0, S1, 0x00 + xvld U1, S1, 0x20 + + xvst U0, P2, 0x00 + xvst U1, P2, 0x20 + + addi.d S1, S1, 0x40 + addi.d P2, P2, 0x40 + +.L_1N7: + 
andi I, N, 0x04 + beq ZERO, I, .L_1N3 + + xvld U0, S1, 0x00 + + xvst U0, P3, 0x00 + + addi.d S1, S1, 0x20 + addi.d P3, P3, 0x20 + +.L_1N3: + andi I, N, 0x02 + beq ZERO, I, .L_1N1 + + fld.d F0, S1, 0x00 + fld.d F1, S1, 0x08 + + fst.d F0, P4, 0x00 + fst.d F1, P4, 0x08 + + addi.d S1, S1, 0x10 + addi.d P4, P4, 0x10 + +.L_1N1: + andi I, N, 0x01 + beq ZERO, I, .L_M0 + + fld.d F0, S1, 0x00 + + fst.d F0, P5, 0x00 + + addi.d S1, S1, 0x08 + addi.d P5, P5, 0x08 + +.L_M0: + LDARG $r23, $sp, 0 + LDARG $r24, $sp, 8 + LDARG $r25, $sp, 16 + LDARG $r26, $sp, 24 + LDARG $r27, $sp, 32 + LDARG $r28, $sp, 40 + LDARG $r29, $sp, 48 + addi.d $sp, $sp, 56 + jirl $r0, $r1, 0x00 + + EPILOGUE \ No newline at end of file diff --git a/kernel/loongarch64/cgemm_tcopy_4_lasx.S b/kernel/loongarch64/cgemm_tcopy_4_lasx.S new file mode 100644 index 000000000..9ff8a35b8 --- /dev/null +++ b/kernel/loongarch64/cgemm_tcopy_4_lasx.S @@ -0,0 +1,306 @@ +/******************************************************************************* +Copyright (c) 2024, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ +#define ASSEMBLER + +#include "common.h" + +/* Function parameters */ +#define M $r4 // param 1: m +#define N $r5 // param 2: n +#define SRC $r6 // param 3: src +#define LDA $r7 // param 4: lda +#define DST $r8 // param 5: dst + +#define I $r9 +#define J $r10 +#define S1 $r12 +#define S2 $r13 +#define S3 $r14 +#define S4 $r15 +#define TD $r16 +#define TS $r17 +#define TL $r18 +#define T0 $r19 +#define S8 $r20 +#define S9 $r23 +#define S10 $r11 +#define ZERO $r0 + +#define F0 $f0 +#define F1 $f1 +#define F2 $f2 +#define F3 $f3 +#define F4 $f4 +#define F5 $f5 +#define F6 $f6 +#define F7 $f7 + +/* LASX vectors */ +#define U0 $xr0 +#define U1 $xr1 +#define U2 $xr2 +#define U3 $xr3 +#define U4 $xr4 +#define U5 $xr5 +#define U6 $xr6 +#define U7 $xr7 +#define U8 $xr8 +#define U9 $xr9 +#define U10 $xr10 +#define U11 $xr11 +#define U12 $xr12 +#define U13 $xr13 +#define U14 $xr14 +#define U15 $xr15 + + + PROLOGUE + + addi.d $sp, $sp, -8 + SDARG $r23, $sp, 0 + + move TS, SRC //aoffset + move TD, DST //boffset + slli.d TL, LDA, 0x02 //lda + slli.d TL, TL, 0x01 //lda + + ori T0, ZERO, 0x03 + andn T0, N, T0 + mul.w T0, M, T0 + slli.d T0, T0, 0x01 + slli.d T0, T0, 0x02 + add.d S9, DST, T0 //boffset2 + + ori T0, ZERO, 0x01 + andn T0, N, T0 + mul.w T0, M, T0 + slli.d T0, T0, 0x01 + slli.d T0, T0, 0x02 + add.d S10, DST, T0 //boffset3 + + srai.d J, M, 0x02 //j + + beq J, ZERO, .L_M1 + 
+.L_J1: /* if(j>0) j--*/ + move S1, TS //aoffset1 + add.d S2, S1, TL + add.d S3, S2, TL + add.d S4, S3, TL + + slli.d T0, TL, 0x02 + add.d TS, TS, T0 + + move S8, TD //boffset1 + addi.d TD, TD, 0x80 + + srai.d I, N, 0x02 + + beq ZERO, I, .L_JN1 + +.L_JI1: /* if(i>0) i--*/ + xvld U0, S1, 0x00 + xvld U1, S2, 0x00 + xvld U2, S3, 0x00 + xvld U3, S4, 0x00 + + xvst U0, S8, 0x00 + xvst U1, S8, 0x20 + xvst U2, S8, 0x40 + xvst U3, S8, 0x60 + + addi.d S1, S1, 0x20 + addi.d S2, S2, 0x20 + addi.d S3, S3, 0x20 + addi.d S4, S4, 0x20 + slli.d T0, M, 0x05 + add.d S8, S8, T0 + + addi.d I, I, -1 + blt ZERO, I, .L_JI1 + +.L_JN1: /* if(n&2) */ + andi I, N, 0x02 + beq ZERO, I, .L_JN2 + + vld $vr0, S1, 0x00 + vld $vr1, S2, 0x00 + vld $vr2, S3, 0x00 + vld $vr3, S4, 0x00 + + vst $vr0, S9, 0x00 + vst $vr1, S9, 0x10 + vst $vr2, S9, 0x20 + vst $vr3, S9, 0x30 + + addi.d S1, S1, 0x10 + addi.d S2, S2, 0x10 + addi.d S3, S3, 0x10 + addi.d S4, S4, 0x10 + addi.d S9, S9, 0x40 + +.L_JN2: /* if(n&1) */ + andi I, N, 0x01 + beq ZERO, I, .L_J0 + + fld.s F0, S1, 0x00 + fld.s F1, S1, 0x04 + + fld.s F2, S2, 0x00 + fld.s F3, S2, 0x04 + + fld.s F4, S3, 0x00 + fld.s F5, S3, 0x04 + + fld.s F6, S4, 0x00 + fld.s F7, S4, 0x04 + + fst.s F0, S10, 0x00 + fst.s F1, S10, 0x04 + fst.s F2, S10, 0x08 + fst.s F3, S10, 0x0c + fst.s F4, S10, 0x10 + fst.s F5, S10, 0x14 + fst.s F6, S10, 0x18 + fst.s F7, S10, 0x1c + + addi.d S10, S10, 0x20 + +.L_J0: + addi.d J, J, -1 + blt ZERO, J, .L_J1 + +.L_M1: /* if(m&2) */ + andi I, M, 0x02 + beq ZERO, I, .L_M2 + + move S1, TS //aoffset1 + add.d S2, S1, TL + + slli.d T0, TL, 0x01 + add.d TS, TS, T0 + + move S8, TD //boffset1 + addi.d TD, TD, 0x40 + + srai.d I, N, 0x02 + beq ZERO, I, .L_M1N1 + +.L_M1I1: /* if(i>0) */ + xvld U0, S1, 0x00 + xvld U1, S2, 0x00 + + xvst U0, S8, 0x00 + xvst U1, S8, 0x20 + + addi.d S1, S1, 0x20 + addi.d S2, S2, 0x20 + slli.d T0, M, 0x05 + add.d S8, S8, T0 + + addi.d I, I, -1 + blt ZERO, I, .L_M1I1 + +.L_M1N1: /* if(n&2) */ + andi I, N, 0x02 + beq ZERO, I, .L_M1N2 
+ + vld $vr0, S1, 0x00 + vld $vr1, S2, 0x00 + + vst $vr0, S9, 0x00 + vst $vr1, S9, 0x10 + + addi.d S1, S1, 0x10 + addi.d S2, S2, 0x10 + addi.d S9, S9, 0x20 + +.L_M1N2: /* if(n&1) */ + andi I, N, 0x01 + beq ZERO, I, .L_M2 + + fld.s F0, S1, 0x00 + fld.s F1, S1, 0x04 + + fld.s F2, S2, 0x00 + fld.s F3, S2, 0x04 + + fst.s F0, S10, 0x00 + fst.s F1, S10, 0x04 + fst.s F2, S10, 0x08 + fst.s F3, S10, 0x0c + + addi.d S10, S10, 0x10 + +.L_M2: /* if(m&1) */ + andi I, M, 0x01 + beq ZERO, I, .L_M0 + + move S1, TS //aoffset1 + move S8, TD //boffset1 + + srai.d I, N, 0x02 + beq ZERO, I, .L_M2N1 + +.L_M2I1: /* if(i>0) */ + xvld U0, S1, 0x00 + + xvst U0, S8, 0x00 + + addi.d S1, S1, 0x20 + slli.d T0, M, 0x05 + add.d S8, S8, T0 + + addi.d I, I, -1 + blt ZERO, I, .L_M2I1 + +.L_M2N1: /* if(n&2) */ + andi I, N, 0x02 + beq ZERO, I, .L_M2N2 + + vld $vr0, S1, 0x00 + + vst $vr0, S9, 0x00 + + addi.d S1, S1, 0x10 + +.L_M2N2: /* if(n&1) */ + andi I, N, 0x01 + beq ZERO, I, .L_M0 + + fld.s F0, S1, 0x00 + fld.s F1, S1, 0x04 + + fst.s F0, S10, 0x00 + fst.s F1, S10, 0x04 + +.L_M0: + LDARG $r23, $sp, 0 + addi.d $sp, $sp, 8 + jirl $r0, $r1, 0x00 + + EPILOGUE \ No newline at end of file diff --git a/param.h b/param.h index 5d2e960a2..8bdc03380 100644 --- a/param.h +++ b/param.h @@ -2845,21 +2845,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define DGEMM_DEFAULT_UNROLL_M 2 #define SGEMM_DEFAULT_UNROLL_N 8 #define SGEMM_DEFAULT_UNROLL_M 2 +#define CGEMM_DEFAULT_UNROLL_N 4 +#define CGEMM_DEFAULT_UNROLL_M 1 +#define ZGEMM_DEFAULT_UNROLL_N 4 +#define ZGEMM_DEFAULT_UNROLL_M 1 #else #define DGEMM_DEFAULT_UNROLL_N 4 #define DGEMM_DEFAULT_UNROLL_M 16 #define SGEMM_DEFAULT_UNROLL_N 8 #define SGEMM_DEFAULT_UNROLL_M 16 +#define CGEMM_DEFAULT_UNROLL_N 4 +#define CGEMM_DEFAULT_UNROLL_M 16 +#define ZGEMM_DEFAULT_UNROLL_N 4 +#define ZGEMM_DEFAULT_UNROLL_M 8 #endif #define QGEMM_DEFAULT_UNROLL_N 2 -#define CGEMM_DEFAULT_UNROLL_N 2 -#define ZGEMM_DEFAULT_UNROLL_N 4 #define XGEMM_DEFAULT_UNROLL_N 1 #define QGEMM_DEFAULT_UNROLL_M 2 -#define CGEMM_DEFAULT_UNROLL_M 2 -#define ZGEMM_DEFAULT_UNROLL_M 8 #define XGEMM_DEFAULT_UNROLL_M 1 #define SGEMM_DEFAULT_P 256 From bf2310442b7eda57ad8089878518b3f43733efaf Mon Sep 17 00:00:00 2001 From: Chip-Kerchner Date: Wed, 21 Feb 2024 13:26:28 -0600 Subject: [PATCH 09/21] Fix get_num_cores for AIX. --- getarch.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/getarch.c b/getarch.c index f879e6bbb..2b5459a5f 100644 --- a/getarch.c +++ b/getarch.c @@ -90,7 +90,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include #include #endif -#if defined(AIX) +#if defined(_AIX) +#include +#include #include #endif @@ -1870,11 +1872,13 @@ static int get_num_cores(void) { return count; -#elif defined(AIX) +#elif defined(_AIX) //returns the number of processors which are currently online count = sysconf(_SC_NPROCESSORS_ONLN); if (count <= 0) count = 2; - + + return count; + #else return 2; #endif From 9b24b3141985de115e6191b376b435117a32404a Mon Sep 17 00:00:00 2001 From: frjohnst Date: Wed, 21 Feb 2024 15:52:29 -0500 Subject: [PATCH 10/21] resolve second_ conflict which breaks xlf timef --- lapack-netlib/SRC/Makefile | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/lapack-netlib/SRC/Makefile b/lapack-netlib/SRC/Makefile index de2242701..205a32d31 100644 --- a/lapack-netlib/SRC/Makefile +++ b/lapack-netlib/SRC/Makefile @@ -101,8 +101,10 @@ SCLAUX = la_constants.o \ slaset.o slasq1.o slasq2.o slasq3.o slasq4.o slasq5.o slasq6.o \ slasr.o slasrt.o slassq.o slasv2.o spttrf.o sstebz.o sstedc.o \ ssteqr.o ssterf.o slaisnan.o sisnan.o \ - slartgp.o slartgs.o scombssq.o ../INSTALL/sroundup_lwork.o \ - ../INSTALL/second_$(TIMER).o + slartgp.o slartgs.o scombssq.o ../INSTALL/sroundup_lwork.o +ifneq ($(F_COMPILER), IBM) +SCLAUX += ../INSTALL/second_$(TIMER).o +endif endif ifneq "$(or $(BUILD_DOUBLE),$(BUILD_COMPLEX16))" "" @@ -124,7 +126,10 @@ DZLAUX = la_constants.o\ dlasr.o dlasrt.o dlassq.o dlasv2.o dpttrf.o dstebz.o dstedc.o \ dsteqr.o dsterf.o dlaisnan.o disnan.o \ dlartgp.o dlartgs.o ../INSTALL/droundup_lwork.o \ - ../INSTALL/dlamch.o ../INSTALL/dsecnd_$(TIMER).o + ../INSTALL/dlamch.o +ifneq ($(F_COMPILER), IBM) +DZLAUX += ../INSTALL/dsecnd_$(TIMER).o +endif endif #ifeq ($(BUILD_SINGLE),1) From d51ffec3a20299532a1ef5a403df2760f7b5b653 Mon Sep 17 00:00:00 2001 From: gxw Date: Thu, 22 Feb 2024 10:46:45 +0800 Subject: [PATCH 11/21] LoongArch64: Opt cgemv with LASX --- kernel/loongarch64/KERNEL.LOONGSON3R5 | 3 + kernel/loongarch64/cgemv_n_8_lasx.S | 383 
++++++++++++++++++++++++++ kernel/loongarch64/cgemv_t_8_lasx.S | 342 +++++++++++++++++++++++ kernel/loongarch64/loongarch64_asm.S | 240 ++++++++++++++++ 4 files changed, 968 insertions(+) create mode 100644 kernel/loongarch64/cgemv_n_8_lasx.S create mode 100644 kernel/loongarch64/cgemv_t_8_lasx.S diff --git a/kernel/loongarch64/KERNEL.LOONGSON3R5 b/kernel/loongarch64/KERNEL.LOONGSON3R5 index ce9268b93..3b2ee6e55 100644 --- a/kernel/loongarch64/KERNEL.LOONGSON3R5 +++ b/kernel/loongarch64/KERNEL.LOONGSON3R5 @@ -121,6 +121,9 @@ CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) +CGEMVNKERNEL = cgemv_n_8_lasx.S +CGEMVTKERNEL = cgemv_t_8_lasx.S + CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c diff --git a/kernel/loongarch64/cgemv_n_8_lasx.S b/kernel/loongarch64/cgemv_n_8_lasx.S new file mode 100644 index 000000000..b078e3227 --- /dev/null +++ b/kernel/loongarch64/cgemv_n_8_lasx.S @@ -0,0 +1,383 @@ +/******************************************************************************* +Copyright (c) 2024, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ +#define ASSEMBLER + +#include "common.h" +#include "loongarch64_asm.S" + +/********************************************************************* +* 2024/02/20 guxiwei +* UTEST : OK +* CTEST : OK +* TEST : OK +* +* +*********************************************************************/ + +/* int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, + * FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) + */ +#define M $r4 +#define N $r5 +#define ALPHA_R $f0 +#define ALPHA_I $f1 +#define A $r7 +#define LDA $r8 +#define X $r9 +#define INC_X $r10 +#define Y $r11 +#define INC_Y $r6 + +#define J $r12 +#define I $r13 +#define K $r14 +#define Y_ORG $r15 +#define OFFSET $r16 +#define K_LDA $r17 +#define M8 $r18 +#define T0 $r19 +#define PA0 $r20 +#define PA1 $r23 +#define PA2 $r24 +#define PA3 $r25 +#define PA4 $r26 +#define PA5 $r27 +#define PA6 $r28 +#define PA7 $r29 + +#define VALPHA $xr1 +#define X0 $xr2 +#define X1 $xr3 +#define X2 $xr4 +#define X3 $xr5 +#define X4 $xr6 +#define X5 $xr7 +#define X6 $xr8 +#define X7 $xr9 +#define Y0 $xr10 +#define Y1 $xr11 +#define A0 $xr12 
+#define A1 $xr13 +#define A2 $xr14 +#define A3 $xr15 +#define A4 $xr16 +#define A5 $xr17 +#define A6 $xr18 +#define A7 $xr19 +#define A8 $xr20 +#define A9 $xr21 +#define A10 $xr22 +#define A11 $xr23 +#define A12 $xr24 +#define A13 $xr25 +#define A14 $xr26 +#define A15 $xr27 +#define TMP0 $xr28 +#define TMP1 $xr29 +#define TMP2 $xr30 + +#if !defined(CONJ) +#if !defined(XCONJ) +#define GXCONJ 0 +#define GCONJ 0 +#else +#define GXCONJ 1 +#define GCONJ 0 +#endif +#else +#if !defined(XCONJ) +#define GXCONJ 0 +#define GCONJ 1 +#else +#define GXCONJ 1 +#define GCONJ 1 +#endif +#endif + +.macro CLOAD_X_8 + GLDREPL xv, d, X0, X, 0x00, X1, X, 0x08, X2, X, 0x10, X3, X, 0x18, \ + X4, X, 0x20, X5, X, 0x28, X6, X, 0x30, X7, X, 0x38 + GCOMPLEXMUL GXCONJ, \ + xvf, s, X0, X0, VALPHA, TMP0, TMP1, TMP2, \ + X1, X1, VALPHA, TMP0, TMP1, TMP2, \ + X2, X2, VALPHA, TMP0, TMP1, TMP2, \ + X3, X3, VALPHA, TMP0, TMP1, TMP2, \ + X4, X4, VALPHA, TMP0, TMP1, TMP2, \ + X5, X5, VALPHA, TMP0, TMP1, TMP2, \ + X6, X6, VALPHA, TMP0, TMP1, TMP2, \ + X7, X7, VALPHA, TMP0, TMP1, TMP2 +.endm + +.macro CLOAD_X_8_GAP + xvldrepl.d X0, X, 0x00 + PTR_ADD T0, X, INC_X + xvldrepl.d X1, T0, 0x00 + PTR_ADD T0, T0, INC_X + xvldrepl.d X2, T0, 0x00 + PTR_ADD T0, T0, INC_X + xvldrepl.d X3, T0, 0x00 + PTR_ADD T0, T0, INC_X + xvldrepl.d X4, T0, 0x00 + PTR_ADD T0, T0, INC_X + xvldrepl.d X5, T0, 0x00 + PTR_ADD T0, T0, INC_X + xvldrepl.d X6, T0, 0x00 + PTR_ADD T0, T0, INC_X + xvldrepl.d X7, T0, 0x00 + + GCOMPLEXMUL GXCONJ, \ + xvf, s, X0, X0, VALPHA, TMP0, TMP1, TMP2, \ + X1, X1, VALPHA, TMP0, TMP1, TMP2, \ + X2, X2, VALPHA, TMP0, TMP1, TMP2, \ + X3, X3, VALPHA, TMP0, TMP1, TMP2, \ + X4, X4, VALPHA, TMP0, TMP1, TMP2, \ + X5, X5, VALPHA, TMP0, TMP1, TMP2, \ + X6, X6, VALPHA, TMP0, TMP1, TMP2, \ + X7, X7, VALPHA, TMP0, TMP1, TMP2 +.endm + +.macro CLOAD_Y_8 + GLD xv, , Y0, Y, 0, Y1, Y, 0x20 +.endm + +.macro CLOAD_Y_8_GAP + fld.d $f10, Y, 0 + fldx.d $f13, Y, INC_Y + PTR_ALSL T0, INC_Y, Y, 1 + fld.d $f14, T0, 0 + fldx.d $f15, 
T0, INC_Y + PTR_ALSL T0, INC_Y, Y, 2 + fld.d $f11, T0, 0 + fldx.d $f17, T0, INC_Y + PTR_ADD T0, T0, INC_Y + PTR_ADD T0, T0, INC_Y + fld.d $f18, T0, 0 + fldx.d $f19, T0, INC_Y + GINSVE0 xv, d, Y0, A1, 1, Y0, A2, 2, Y0, A3, 3, Y1, A5, 1, Y1, A6, 2, Y1, A7, 3 +.endm + +.macro CSTORE_Y_8_GAP + xvstelm.d Y0, Y, 0, 0 + PTR_ADD T0, Y, INC_Y + xvstelm.d Y0, T0, 0, 1 + PTR_ADD T0, T0, INC_Y + xvstelm.d Y0, T0, 0, 2 + PTR_ADD T0, T0, INC_Y + xvstelm.d Y0, T0, 0, 3 + + PTR_ADD T0, T0, INC_Y + xvstelm.d Y1, T0, 0, 0 + PTR_ADD T0, T0, INC_Y + xvstelm.d Y1, T0, 0, 1 + PTR_ADD T0, T0, INC_Y + xvstelm.d Y1, T0, 0, 2 + PTR_ADD T0, T0, INC_Y + xvstelm.d Y1, T0, 0, 3 +.endm + +.macro CGEMV_N_8x8 + GLD_INC xv, , 0x20, \ + A0, PA0, 0, A1, PA0, 0, \ + A2, PA1, 0, A3, PA1, 0, \ + A4, PA2, 0, A5, PA2, 0, \ + A6, PA3, 0, A7, PA3, 0, \ + A8, PA4, 0, A9, PA4, 0, \ + A10, PA5, 0, A11, PA5, 0, \ + A12, PA6, 0, A13, PA6, 0, \ + A14, PA7, 0, A15, PA7, 0 + + GCOMPLEXMADD GXCONJ, GCONJ, \ + xvf, s, Y0, X0, A0, Y0, TMP0, TMP1, TMP2, Y1, X0, A1, Y1, TMP0, TMP1, TMP2, \ + Y0, X1, A2, Y0, TMP0, TMP1, TMP2, Y1, X1, A3, Y1, TMP0, TMP1, TMP2, \ + Y0, X2, A4, Y0, TMP0, TMP1, TMP2, Y1, X2, A5, Y1, TMP0, TMP1, TMP2, \ + Y0, X3, A6, Y0, TMP0, TMP1, TMP2, Y1, X3, A7, Y1, TMP0, TMP1, TMP2, \ + Y0, X4, A8, Y0, TMP0, TMP1, TMP2, Y1, X4, A9, Y1, TMP0, TMP1, TMP2, \ + Y0, X5, A10, Y0, TMP0, TMP1, TMP2, Y1, X5, A11, Y1, TMP0, TMP1, TMP2, \ + Y0, X6, A12, Y0, TMP0, TMP1, TMP2, Y1, X6, A13, Y1, TMP0, TMP1, TMP2, \ + Y0, X7, A14, Y0, TMP0, TMP1, TMP2, Y1, X7, A15, Y1, TMP0, TMP1, TMP2 +.endm + +.macro CSTORE_Y_8 + GST xv, , Y0, Y, 0, Y1, Y, 0x20 +.endm + +.macro CLOAD_X_1 + GLDREPL xv, d, X0, X, 0x00 + GCOMPLEXMUL GXCONJ, \ + xvf, s, X0, X0, VALPHA, TMP0, TMP1, TMP2 +.endm + +.macro CLOAD_Y_1 + fld.d $f10, Y, 0 +.endm + +.macro CGEMV_N_1x8 + GLD_INC f, d, 0x08, $f12, PA0, 0, $f14, PA1, 0, $f16, PA2, 0, $f18, PA3, 0, \ + $f20, PA4, 0, $f22, PA5, 0, $f24, PA6, 0, $f26, PA7, 0 + GCOMPLEXMADD GXCONJ, GCONJ, \ + xvf, s, 
Y0, X0, A0, Y0, TMP0, TMP1, TMP2, \ + Y0, X1, A2, Y0, TMP0, TMP1, TMP2, \ + Y0, X2, A4, Y0, TMP0, TMP1, TMP2, \ + Y0, X3, A6, Y0, TMP0, TMP1, TMP2, \ + Y0, X4, A8, Y0, TMP0, TMP1, TMP2, \ + Y0, X5, A10, Y0, TMP0, TMP1, TMP2, \ + Y0, X6, A12, Y0, TMP0, TMP1, TMP2, \ + Y0, X7, A14, Y0, TMP0, TMP1, TMP2 +.endm + +.macro CSTORE_Y_1 + fst.d $f10, Y, 0 +.endm + +.macro CGEMV_N_1x1 + fld.d $f12, PA0, 0 + PTR_ADDI PA0, PA0, 0x08 + GCOMPLEXMADD GXCONJ, GCONJ, \ + xvf, s, Y0, X0, A0, Y0, TMP0, TMP1, TMP2 +.endm + +.macro CGEMV_N_LASX XW:req, X_8:req, X_1:req, Y_8:req, Y_1:req + PTR_SRLI J, N, 3 + beqz J, .L_\XW\()_N_7 + PTR_SLLI K_LDA, LDA, 3 + PTR_SUB K_LDA, K_LDA, M8 +.L_\XW\()_N_L8: + CLOAD_\X_8 + xor K, K, K + move Y, Y_ORG + PTR_SRLI I, M, 3 + beqz I, .L_\XW\()_M_7 +.align 5 +.L_\XW\()_M_L8: + CLOAD_\Y_8 + CGEMV_N_8x8 + CSTORE_\Y_8 + PTR_ADDI I, I, -1 + PTR_ALSL Y, INC_Y, Y, 3 + PTR_ADDI K, K, 8 + bnez I, .L_\XW\()_M_L8 +.L_\XW\()_M_7: + andi I, M, 7 + beqz I, .L_\XW\()_M_END +.align 5 +.L_\XW\()_M_L1: + CLOAD_\Y_1 + CGEMV_N_1x8 + CSTORE_\Y_1 + PTR_ADDI I, I, -1 + PTR_ADD Y, Y, INC_Y + PTR_ADDI K, K, 1 + bnez I, .L_\XW\()_M_L1 +.L_\XW\()_M_END: + PTR_ADDI J, J, -1 +#if __loongarch_grlen == 64 + GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA, \ + PA4, PA4, K_LDA, PA5, PA5, K_LDA, PA6, PA6, K_LDA, PA7, PA7, K_LDA +#elif __loongarch_grlen == 32 + GADD , w, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA, \ + PA4, PA4, K_LDA, PA5, PA5, K_LDA, PA6, PA6, K_LDA, PA7, PA7, K_LDA +#else + GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA, \ + PA4, PA4, K_LDA, PA5, PA5, K_LDA, PA6, PA6, K_LDA, PA7, PA7, K_LDA +#endif + PTR_ALSL X, INC_X, X, 3 + bnez J, .L_\XW\()_N_L8 +.L_\XW\()_N_7: + andi J, N, 7 + beqz J, .L_END +.L_\XW\()_N_L1: + CLOAD_\X_1 + xor K, K, K + move Y, Y_ORG + move I, M + beqz I, .L_END +.align 5 +.L_\XW\()_N_1_M_L1: + CLOAD_\Y_1 + CGEMV_N_1x1 + CSTORE_\Y_1 + PTR_ADDI I, I, -1 + PTR_ADD Y, 
Y, INC_Y + PTR_ADDI K, K, 1 + bnez I, .L_\XW\()_N_1_M_L1 +.L_\XW\()_N_1_M_END: + PTR_ADDI J, J, -1 + PTR_SUB K_LDA, LDA, M8 + PTR_ADD PA0, PA0, K_LDA + PTR_ADD X, X, INC_X + bnez J, .L_\XW\()_N_L1 + + b .L_END +.endm + + PROLOGUE + PTR_LD INC_Y, $sp, 0 + push_if_used 17 + 7, 31 + PTR_ADDI K, $r0, 0x01 + PTR_SUB I, INC_X, K + PTR_SUB J, INC_Y, K + maskeqz I, K, I /* if(inc_x == 1) I = 0; else I = 1; */ + maskeqz J, K, J /* if(inc_y == 1) j = 0; else j = 1; */ + PTR_ALSL I, I, J, 1 + GSLLI , d, LDA, LDA, 3, INC_X, INC_X, 3, INC_Y, INC_Y, 3, M8, M, 3 + // Init VALPHA + xvpackev.w $xr0, $xr1, $xr0 + xvreplve0.d VALPHA, $xr0 + move Y_ORG, Y + move PA0, A +#if __loongarch_grlen == 64 + GADD , d, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA, \ + PA5, PA4, LDA, PA6, PA5, LDA, PA7, PA6, LDA +#elif __loongarch_grlen == 32 + GADD , w, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA, \ + PA5, PA4, LDA, PA6, PA5, LDA, PA7, PA6, LDA +#else + GADD , d, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA, \ + PA5, PA4, LDA, PA6, PA5, LDA, PA7, PA6, LDA +#endif + la.local T0, .L_GAP_TABLE + PTR_ALSL I, I, T0, 1 + ld.h K, I, 0 // Obtain the offset address + PTR_ADD T0, T0, K + jirl $r0, T0, 0 +.L_GAP_TABLE: + .hword .L_GAP_0_0 - .L_GAP_TABLE + .hword .L_GAP_0_1 - .L_GAP_TABLE + .hword .L_GAP_1_0 - .L_GAP_TABLE + .hword .L_GAP_1_1 - .L_GAP_TABLE +.L_GAP_0_0: /* if (inc_x == 1) && (incy == 1) */ + CGEMV_N_LASX GAP_0_0, X_8, X_1, Y_8, Y_1 +.L_GAP_0_1: /* if (inc_x == 1) && (incy != 1) */ + CGEMV_N_LASX GAP_0_1, X_8, X_1, Y_8_GAP, Y_1 +.L_GAP_1_0: /* if (inc_x != 1) && (incy == 1) */ + CGEMV_N_LASX GAP_1_0, X_8_GAP, X_1, Y_8, Y_1 +.L_GAP_1_1: /* if (inc_x != 1) && (incy != 1) */ + CGEMV_N_LASX GAP_1_1, X_8_GAP, X_1, Y_8_GAP, Y_1 +.L_END: + pop_if_used 17 + 7, 31 + jirl $r0, $r1, 0x0 + EPILOGUE diff --git a/kernel/loongarch64/cgemv_t_8_lasx.S b/kernel/loongarch64/cgemv_t_8_lasx.S new file mode 100644 index 000000000..94e4bd2eb --- /dev/null +++ 
b/kernel/loongarch64/cgemv_t_8_lasx.S @@ -0,0 +1,342 @@ +/******************************************************************************* +Copyright (c) 2024, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*******************************************************************************/ +#define ASSEMBLER + +#include "common.h" +#include "loongarch64_asm.S" + +/********************************************************************* +* 2024/02/20 guxiwei +* UTEST : OK +* CTEST : OK +* TEST : OK +* +* +*********************************************************************/ + +/* int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, + * FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) + */ +#define M $r4 +#define N $r5 +#define ALPHA_R $f0 +#define ALPHA_I $f1 +#define A $r7 +#define LDA $r8 +#define X $r9 +#define INC_X $r10 +#define Y $r11 +#define INC_Y $r6 + +#define J $r12 +#define I $r13 +#define K $r14 +#define PY0 $r14 +#define X_ORG $r15 +#define PY1 $r16 +#define K_LDA $r17 +#define PY2 $r18 +#define T0 $r19 +#define PA0 $r20 +#define PA1 $r23 +#define PA2 $r24 +#define PA3 $r25 +#define PA4 $r26 +#define PA5 $r27 +#define PA6 $r28 +#define PA7 $r29 +#define M8 $r30 + +#define VALPHA $xr0 +#define X0 $xr1 +#define X1 $xr2 +#define A0 $xr3 +#define A1 $xr4 +#define A2 $xr5 +#define A3 $xr6 +#define A4 $xr7 +#define A5 $xr8 +#define A6 $xr9 +#define A7 $xr10 +#define A8 $xr11 +#define A9 $xr12 +#define A10 $xr13 +#define A11 $xr14 +#define A12 $xr15 +#define A13 $xr16 +#define A14 $xr17 +#define A15 $xr18 +#define TP0 $xr19 +#define TP1 $xr20 +#define TP2 $xr21 +#define TP3 $xr22 +#define TP4 $xr23 +#define TP5 $xr24 +#define TP6 $xr25 +#define TP7 $xr26 +#define TMP0 $xr27 +#define TMP1 $xr28 +#define TMP2 $xr29 +#define Y0 $xr3 +#define Y1 $xr4 +#define Y2 $xr5 +#define Y3 $xr6 +#define Y4 $xr7 +#define Y5 $xr8 +#define Y6 $xr9 +#define Y7 $xr10 + +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) +#define GXCONJ1 0 +#define GCONJ1 0 +#else +#define GXCONJ1 1 +#define GCONJ1 0 +#endif + +#if !defined(XCONJ) +#define GXCONJ2 0 +#define GCONJ2 0 +#else +#define
GXCONJ2 0 +#define GCONJ2 1 +#endif + +.macro ZERO_Y8 + GXOR xv, v, TP0, TP0, TP0, TP1, TP1, TP1, TP2, TP2, TP2, TP3, TP3, TP3, \ + TP4, TP4, TP4, TP5, TP5, TP5, TP6, TP6, TP6, TP7, TP7, TP7 +.endm + +.macro ZERO_Y1 + GXOR xv, v, TP0, TP0, TP0 +.endm + +.macro CLOAD_X8 + GLD xv, , X0, X, 0x00, X1, X, 0x20 +.endm + +.macro CLOAD_X8_GAP + fld.d $f1, X, 0x00 + fldx.d $f2, X, INC_X + PTR_ALSL T0, INC_X, X, 1 + fld.d $f3, T0, 0x00 + fldx.d $f4, T0, INC_X + GINSVE0 xv, d, X0, X1, 1, X0, A0, 2, X0, A1, 3 + PTR_ALSL T0, INC_X, X, 2 + fld.d $f2, T0, 0x00 + fldx.d $f3, T0, INC_X + PTR_ALSL T0, INC_X, T0, 1 + fld.d $f4, T0, 0x00 + fldx.d $f5, T0, INC_X + GINSVE0 xv, d, X1, A0, 1, X1, A1, 2, X1, A2, 3 +.endm + +.macro CGEMV_T_8x8 + GLD_INC xv, , 0x20, \ + A0, PA0, 0, A1, PA0, 0, \ + A2, PA1, 0, A3, PA1, 0, \ + A4, PA2, 0, A5, PA2, 0, \ + A6, PA3, 0, A7, PA3, 0, \ + A8, PA4, 0, A9, PA4, 0, \ + A10, PA5, 0, A11, PA5, 0, \ + A12, PA6, 0, A13, PA6, 0, \ + A14, PA7, 0, A15, PA7, 0 + GCOMPLEXMADD GXCONJ1, GCONJ1, \ + xvf, s, TP0, A0, X0, TP0, TMP0, TMP1, TMP2, TP0, A1, X1, TP0, TMP0, TMP1, TMP2, \ + TP1, A2, X0, TP1, TMP0, TMP1, TMP2, TP1, A3, X1, TP1, TMP0, TMP1, TMP2, \ + TP2, A4, X0, TP2, TMP0, TMP1, TMP2, TP2, A5, X1, TP2, TMP0, TMP1, TMP2, \ + TP3, A6, X0, TP3, TMP0, TMP1, TMP2, TP3, A7, X1, TP3, TMP0, TMP1, TMP2, \ + TP4, A8, X0, TP4, TMP0, TMP1, TMP2, TP4, A9, X1, TP4, TMP0, TMP1, TMP2, \ + TP5, A10, X0, TP5, TMP0, TMP1, TMP2, TP5, A11, X1, TP5, TMP0, TMP1, TMP2, \ + TP6, A12, X0, TP6, TMP0, TMP1, TMP2, TP6, A13, X1, TP6, TMP0, TMP1, TMP2, \ + TP7, A14, X0, TP7, TMP0, TMP1, TMP2, TP7, A15, X1, TP7, TMP0, TMP1, TMP2 +.endm + +.macro CGEMV_T_LASX XW:req, X8:req + PTR_SRLI J, N, 3 + beqz J, .L_\XW\()_N_7 + PTR_SLLI K_LDA, LDA, 3 + PTR_SUB K_LDA, K_LDA, M8 +.L_\XW\()_N_L8: + ZERO_Y8 + move X, X_ORG + PTR_SRLI I, M, 3 + beqz I, .L_\XW\()_M_7 +.align 5 +.L_\XW\()_M_L8: + CLOAD_\X8 + CGEMV_T_8x8 + PTR_ADDI I, I, -1 + PTR_ALSL X, INC_X, X, 3 + bnez I, .L_\XW\()_M_L8 +.L_\XW\()_M_7: + 
// Accumulated + GCOMPLEXACC xvf, s, Y0, TP0, Y1, TP1, Y2, TP2, Y3, TP3, Y4, TP4, \ + Y5, TP5, Y6, TP6, Y7, TP7 + andi I, M, 7 + beqz I, .L_\XW\()_M_END +.align 5 +.L_\XW\()_M_L1: + fld.d $f1, X, 0x00 + fld.d $f11, PA0, 0x00 + fld.d $f12, PA1, 0x00 + fld.d $f13, PA2, 0x00 + fld.d $f14, PA3, 0x00 + fld.d $f15, PA4, 0x00 + fld.d $f16, PA5, 0x00 + fld.d $f17, PA6, 0x00 + fld.d $f18, PA7, 0x00 +#if __loongarch_grlen == 64 + GADDI , d, PA0, PA0, 0x08, PA1, PA1, 0x08, PA2, PA2, 0x08, PA3, PA3, 0x08, \ + PA4, PA4, 0x08, PA5, PA5, 0x08, PA6, PA6, 0x08, PA7, PA7, 0x08 +#elif __loongarch_grlen == 32 + GADDI , w, PA0, PA0, 0x08, PA1, PA1, 0x08, PA2, PA2, 0x08, PA3, PA3, 0x08, \ + PA4, PA4, 0x08, PA5, PA5, 0x08, PA6, PA6, 0x08, PA7, PA7, 0x08 +#else + GADDI , d, PA0, PA0, 0x08, PA1, PA1, 0x08, PA2, PA2, 0x08, PA3, PA3, 0x08, \ + PA4, PA4, 0x08, PA5, PA5, 0x08, PA6, PA6, 0x08, PA7, PA7, 0x08 +#endif + + GCOMPLEXMADD GXCONJ1, GCONJ1, \ + xvf, s, A0, A8, X0, A0, TMP0, TMP1, TMP2, A1, A9, X0, A1, TMP0, TMP1, TMP2, \ + A2, A10, X0, A2, TMP0, TMP1, TMP2, A3, A11, X0, A3, TMP0, TMP1, TMP2, \ + A4, A12, X0, A4, TMP0, TMP1, TMP2, A5, A13, X0, A5, TMP0, TMP1, TMP2, \ + A6, A14, X0, A6, TMP0, TMP1, TMP2, A7, A15, X0, A7, TMP0, TMP1, TMP2 + + PTR_ADDI I, I, -1 + PTR_ADD X, X, INC_X + bnez I, .L_\XW\()_M_L1 +.L_\XW\()_M_END: + fld.d $f11, Y, 0x00 + fldx.d $f12, Y, INC_Y + PTR_ALSL PY0, INC_Y, Y, 1 + fld.d $f13, PY0, 0x00 + fldx.d $f14, PY0, INC_Y + PTR_ALSL PY1, INC_Y, Y, 2 + fld.d $f15, PY1, 0x00 + fldx.d $f16, PY1, INC_Y + PTR_ALSL PY2, INC_Y, PY1, 1 + fld.d $f17, PY2, 0x00 + fldx.d $f18, PY2, INC_Y + + GCOMPLEXMADD GXCONJ2, GCONJ2, \ + xvf, s, A8, VALPHA, A0, A8, TMP0, TMP1, TMP2, A9, VALPHA, A1, A9, TMP0, TMP1, TMP2,\ + A10, VALPHA, A2, A10, TMP0, TMP1, TMP2, A11, VALPHA, A3, A11, TMP0, TMP1, TMP2,\ + A12, VALPHA, A4, A12, TMP0, TMP1, TMP2, A13, VALPHA, A5, A13, TMP0, TMP1, TMP2,\ + A14, VALPHA, A6, A14, TMP0, TMP1, TMP2, A15, VALPHA, A7, A15, TMP0, TMP1, TMP2 + + PTR_ADDI J, J, -1 
+#if __loongarch_grlen == 64 + GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA, \ + PA4, PA4, K_LDA, PA5, PA5, K_LDA, PA6, PA6, K_LDA, PA7, PA7, K_LDA +#elif __loongarch_grlen == 32 + GADD , w, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA, \ + PA4, PA4, K_LDA, PA5, PA5, K_LDA, PA6, PA6, K_LDA, PA7, PA7, K_LDA +#else + GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA, \ + PA4, PA4, K_LDA, PA5, PA5, K_LDA, PA6, PA6, K_LDA, PA7, PA7, K_LDA +#endif + fst.d $f11, Y, 0x00 + fstx.d $f12, Y, INC_Y + fst.d $f13, PY0, 0x00 + fstx.d $f14, PY0, INC_Y + fst.d $f15, PY1, 0x00 + fstx.d $f16, PY1, INC_Y + fst.d $f17, PY2, 0x00 + fstx.d $f18, PY2, INC_Y + PTR_ALSL Y, INC_Y, Y, 3 + bnez J, .L_\XW\()_N_L8 +.L_\XW\()_N_7: + andi J, N, 7 + beqz J, .L_END + PTR_SUB K_LDA, LDA, M8 +.L_\XW\()_N_1: + ZERO_Y1 + move X, X_ORG + move I, M + beqz I, .L_END +.align 5 +.L_\XW\()_N_1_M_L1: + fld.d $f3, PA0, 0x00 + fld.d $f1, X, 0x00 + GCOMPLEXMADD GXCONJ1, GCONJ1, \ + xvf, s, TP0, A0, X0, TP0, TMP0, TMP1, TMP2 + PTR_ADDI I, I, -1 + PTR_ADD X, X, INC_X + PTR_ADDI PA0, PA0, 0x08 + bnez I, .L_\XW\()_N_1_M_L1 +.L_\XW\()_N_1_M_END: + PTR_ADDI J, J, -1 + fld.d $f3, Y, 0x00 + GCOMPLEXMADD GXCONJ2, GCONJ2, \ + xvf, s, A0, VALPHA, TP0, A0, TMP0, TMP1, TMP2 + fst.d $f3, Y, 0x00 + PTR_ADD PA0, PA0, K_LDA + PTR_ADD Y, Y, INC_Y + bnez J, .L_\XW\()_N_1 + + b .L_END +.endm + + PROLOGUE + PTR_LD INC_Y, $sp, 0 + push_if_used 17 + 8, 30 + PTR_ADDI K, $r0, 0x01 + PTR_SUB I, INC_X, K + maskeqz I, K, I /* if(inc_x == 1) I = 0; else I = 1; */ + GSLLI , d, LDA, LDA, 3, INC_X, INC_X, 3, INC_Y, INC_Y, 3, M8, M, 3 + // Init VALPHA + xvpackev.w $xr0, $xr1, $xr0 + xvreplve0.d VALPHA, $xr0 + move X_ORG, X + move PA0, A +#if __loongarch_grlen == 64 + GADD , d, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA, \ + PA5, PA4, LDA, PA6, PA5, LDA, PA7, PA6, LDA +#elif __loongarch_grlen == 32 + GADD , w, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, 
LDA, PA4, PA3, LDA, \ + PA5, PA4, LDA, PA6, PA5, LDA, PA7, PA6, LDA +#else + GADD , d, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA, \ + PA5, PA4, LDA, PA6, PA5, LDA, PA7, PA6, LDA +#endif + la.local T0, .L_GAP_TABLE + PTR_ALSL I, I, T0, 1 + ld.h K, I, 0 + PTR_ADD T0, T0, K + jirl $r0, T0, 0 +.L_GAP_TABLE: + .hword .L_GAP_0 - .L_GAP_TABLE + .hword .L_GAP_1 - .L_GAP_TABLE +.L_GAP_0: /* if (incx == 1) */ + CGEMV_T_LASX GAP_0, X8 +.L_GAP_1: /* if (incx != 1) */ + CGEMV_T_LASX GAP_1, X8_GAP +.L_END: + pop_if_used 17 + 8, 30 + jirl $r0, $r1, 0x0 + EPILOGUE diff --git a/kernel/loongarch64/loongarch64_asm.S b/kernel/loongarch64/loongarch64_asm.S index 694dcdaa9..fee46d63e 100644 --- a/kernel/loongarch64/loongarch64_asm.S +++ b/kernel/loongarch64/loongarch64_asm.S @@ -384,6 +384,246 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endif .endm +// +// GCOMPLEXACC: Complex accumulate the values of vector registers +// pre_op: xvf or vf, differentiate between LSX or LASX instruction +// suf_op: s or d, differentiate between single precision or double precision complex numbers +// Note: When "pre_op = xvf && suf_op = s", in will be modified. +// +.macro GCOMPLEXACC pre_op:req, suf_op:req, out:req, in:req, more:vararg +.ifeqs "\pre_op", "xvf" + xvpermi.q \out, \in, 0x01 +.ifeqs "\suf_op", "s" + \pre_op\()add.\suf_op \in, \out, \in + xvpackod.d \out, \in, \in + \pre_op\()add.\suf_op \out, \out, \in +.else + \pre_op\()add.\suf_op \out, \out, \in +.endif +.endif + +.ifeqs "\pre_op", "vf" +.ifeqs "\suf_op", "s" + vpackod.d \out, \in, \in + \pre_op\()add.\suf_op \out, \out, \in +.endif +.endif + +.ifnb \more + GCOMPLEXACC \pre_op, \suf_op, \more +.endif +.endm + +// +// GCOMPLEXMUL: Complex multiplication, out = in0 * in1 +// xconj: default value 0. 
+// if !(xconj) +// out_r = in0_r * in1_r - in0_i * in1_i; +// out_i = in0_r * in1_i + in0_i * in1_r; +// else +// out_r = in0_r * in1_r + in0_i * in1_i; +// out_i = in0_r * in1_i - in0_i * in1_r; +// pre_op: xvf or vf, differentiate between LSX or LASX instruction +// suf_op: s or d, differentiate between single precision or double precision complex numbers +// +.macro GCOMPLEXMUL xconj=0, pre_op:req, suf_op:req, out:req, in0:req, in1:req, tmp0:req, tmp1:req, tmp2:req, more:vararg +.ifeqs "\pre_op", "xvf" + xvxor.v \tmp1, \tmp1, \tmp1 +.ifeqs "\suf_op", "s" + xvpackev.w \tmp0, \in0, \in0 +.else + xvpackev.d \tmp0, \in0, \in0 +.endif +.else + vxor.v \tmp1, \tmp1, \tmp1 +.ifeqs "\suf_op", "s" + vpackev.w \tmp0, \in0, \in0 +.else + vpackev.d \tmp0, \in0, \in0 +.endif +.endif + + \pre_op\()sub.\suf_op \tmp1, \tmp1, \in0 + +.ifeqs "\pre_op", "xvf" +.ifeqs "\suf_op", "s" +.ifeqs "\xconj", "0" + xvpackod.w \tmp1, \in0, \tmp1 +.else + xvpackod.w \tmp1, \tmp1, \in0 +.endif + xvshuf4i.w \tmp2, \in1, 0xb1 +.else +.ifeqs "\xconj", "0" + xvpackod.d \tmp1, \in0, \tmp1 +.else + xvpackod.d \tmp1, \tmp1, \in0 +.endif + xvshuf4i.d \tmp2, \in1, 0x0b +.endif +.else +.ifeqs "\suf_op", "s" +.ifeqs "\xconj", "0" + vpackod.w \tmp1, \in0, \tmp1 +.else + vpackod.w \tmp1, \tmp1, \in0 +.endif + vshuf4i.w \tmp2, \in1, 0xb1 +.else +.ifeqs "\xconj", "0" + vpackod.d \tmp1, \in0, \tmp1 +.else + vpackod.d \tmp1, \tmp1, \in0 +.endif + vshuf4i.d \tmp2, \in1, 0x0b +.endif +.endif + + \pre_op\()mul.\suf_op \out, \tmp0, \in1 + \pre_op\()madd.\suf_op \out, \tmp1, \tmp2, \out + +.ifnb \more + GCOMPLEXMUL \xconj, \pre_op, \suf_op, \more +.endif +.endm + +// +// GCOMPLEXMADD: Complex multiply-accumulate, out = in0 * in1 + in2 +// xconj: default value 0 +// conj: default value 0 +// if !(CONJ) +// if !(XCONJ) +// out_r = in0_r * in1_r - in0_i * in1_i + in2_r; +// out_i = in0_r * in1_i + in0_i * in1_r + in2_i; +// else +// out_r = in0_r * in1_r + in0_i * in1_i + in2_r; +// out_i = in0_r * in1_i - in0_i * 
in1_r + in2_i; +// else +// if !(XCONJ) +// out_r = in0_r * in1_r + in0_i * in1_i + in2_r; +// out_i = in2_i - (in0_r * in1_i - in0_i * in1_r); +// else +// out_r = in0_r * in1_r - in0_i * in1_i + in2_r; +// out_i = in2_i - (in0_r * in1_i + in0_i * in1_r); +// pre_op: xvf or vf, differentiate between LSX or LASX instruction +// suf_op: s or d, differentiate between single precision or double precision complex numbers +// +.macro GCOMPLEXMADD xconj=0, conj=0, pre_op:req, suf_op:req, out:req, in0:req, in1:req, in2:req, tmp0:req, tmp1:req, tmp2:req, more:vararg +.ifeqs "\pre_op", "xvf" + xvxor.v \tmp1, \tmp1, \tmp1 +.ifeqs "\suf_op", "s" + xvpackev.w \tmp0, \in0, \in0 +.else + xvpackev.d \tmp0, \in0, \in0 +.endif +.else + vxor.v \tmp1, \tmp1, \tmp1 +.ifeqs "\suf_op", "s" + vpackev.w \tmp0, \in0, \in0 +.else + vpackev.d \tmp0, \in0, \in0 +.endif +.endif + + \pre_op\()madd.\suf_op \tmp2, \tmp0, \in1, \in2 +.ifeqs "\conj", "1" + \pre_op\()nmsub.\suf_op \tmp0, \tmp0, \in1, \in2 +.ifeqs "\pre_op", "xvf" +.ifeqs "\suf_op", "s" + xvshuf4i.w \tmp0, \tmp0, 0xb1 + xvpackev.w \out, \tmp0, \tmp2 +.else + xvshuf4i.d \tmp0, \tmp0, 0x0b + xvpackev.d \out, \tmp0, \tmp2 +.endif +.else +.ifeqs "\suf_op", "s" + vshuf4i.w \tmp0, \tmp0, 0xb1 + vpackev.w \out, \tmp0, \tmp2 +.else + vshuf4i.d \tmp0, \tmp0, 0x0b + vpackev.d \out, \tmp0, \tmp2 +.endif +.endif /* pre_op = xvf */ +.else + \pre_op\()add.\suf_op \out, \tmp2, \tmp1 +.endif /* conj = 1 */ + + \pre_op\()sub.\suf_op \tmp1, \tmp1, \in0 + +.ifeqs "\pre_op", "xvf" +.ifeqs "\suf_op", "s" +.ifeqs "\conj", "0" +.ifeqs "\xconj", "0" + xvpackod.w \tmp1, \in0, \tmp1 +.else + xvpackod.w \tmp1, \tmp1, \in0 +.endif +.else +.ifeqs "\xconj", "0" + xvpackod.w \tmp1, \in0, \in0 +.else + xvpackod.w \tmp1, \tmp1, \tmp1 +.endif +.endif + xvshuf4i.w \tmp2, \in1, 0xb1 +.else +.ifeqs "\conj", "0" +.ifeqs "\xconj", "0" + xvpackod.d \tmp1, \in0, \tmp1 +.else + xvpackod.d \tmp1, \tmp1, \in0 +.endif +.else +.ifeqs "\xconj", "0" + xvpackod.d \tmp1, \in0, \in0 
+.else + xvpackod.d \tmp1, \tmp1, \tmp1 +.endif +.endif + xvshuf4i.d \tmp2, \in1, 0x0b +.endif +.else +.ifeqs "\suf_op", "s" +.ifeqs "\conj", "0" +.ifeqs "\xconj", "0" + vpackod.w \tmp1, \in0, \tmp1 +.else + vpackod.w \tmp1, \tmp1, \in0 +.endif +.else +.ifeqs "\xconj", "0" + vpackod.w \tmp1, \in0, \in0 +.else + vpackod.w \tmp1, \tmp1, \tmp1 +.endif +.endif + vshuf4i.w \tmp2, \in1, 0xb1 +.else +.ifeqs "\conj", "0" +.ifeqs "\xconj", "0" + vpackod.d \tmp1, \in0, \tmp1 +.else + vpackod.d \tmp1, \tmp1, \in0 +.endif +.else +.ifeqs "\xconj", "0" + vpackod.d \tmp1, \in0, \in0 +.else + vpackod.d \tmp1, \tmp1, \tmp1 +.endif +.endif + vshuf4i.d \tmp2, \in1, 0x0b +.endif +.endif + + \pre_op\()madd.\suf_op \out, \tmp1, \tmp2, \out + +.ifnb \more + GCOMPLEXMADD \xconj, \conj, \pre_op, \suf_op, \more +.endif +.endm + // // Media Related Macros // From 990507e3b8f833de26da1b2ab6dff0972ae65c3f Mon Sep 17 00:00:00 2001 From: gxw Date: Thu, 22 Feb 2024 11:41:15 +0800 Subject: [PATCH 12/21] LoongArch64: Opt zgemv with LASX --- kernel/loongarch64/KERNEL.LOONGSON3R5 | 3 + kernel/loongarch64/zgemv_n_4_lasx.S | 343 ++++++++++++++++++++++++++ kernel/loongarch64/zgemv_t_4_lasx.S | 299 ++++++++++++++++++++++ 3 files changed, 645 insertions(+) create mode 100644 kernel/loongarch64/zgemv_n_4_lasx.S create mode 100644 kernel/loongarch64/zgemv_t_4_lasx.S diff --git a/kernel/loongarch64/KERNEL.LOONGSON3R5 b/kernel/loongarch64/KERNEL.LOONGSON3R5 index 3b2ee6e55..9b55d1bbb 100644 --- a/kernel/loongarch64/KERNEL.LOONGSON3R5 +++ b/kernel/loongarch64/KERNEL.LOONGSON3R5 @@ -139,6 +139,9 @@ ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) +ZGEMVNKERNEL = zgemv_n_4_lasx.S +ZGEMVTKERNEL = zgemv_t_4_lasx.S + ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c diff --git a/kernel/loongarch64/zgemv_n_4_lasx.S 
b/kernel/loongarch64/zgemv_n_4_lasx.S new file mode 100644 index 000000000..98b1a6f7d --- /dev/null +++ b/kernel/loongarch64/zgemv_n_4_lasx.S @@ -0,0 +1,343 @@ +/******************************************************************************* +Copyright (c) 2024, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*******************************************************************************/ +#define ASSEMBLER + +#include "common.h" +#include "loongarch64_asm.S" + +/********************************************************************* +* 2024/02/20 guxiwei +* UTEST : OK +* CTEST : OK +* TEST : OK +* +* +*********************************************************************/ + +/* int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, + * FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) + */ +#define M $r4 +#define N $r5 +#define ALPHA_R $f0 +#define ALPHA_I $f1 +#define A $r7 +#define LDA $r8 +#define X $r9 +#define INC_X $r10 +#define Y $r11 +#define INC_Y $r6 + +#define J $r12 +#define I $r13 +#define K $r14 +#define Y_ORG $r15 +#define OFFSET $r16 +#define K_LDA $r17 +#define M16 $r18 +#define T0 $r19 +#define PA0 $r20 +#define PA1 $r23 +#define PA2 $r24 +#define PA3 $r25 +#define PA4 $r26 +#define PA5 $r27 +#define PA6 $r28 +#define PA7 $r29 + +#define VALPHA $xr1 +#define X0 $xr2 +#define X1 $xr3 +#define X2 $xr4 +#define X3 $xr5 +#define X4 $xr6 +#define X5 $xr7 +#define X6 $xr8 +#define X7 $xr9 +#define Y0 $xr10 +#define Y1 $xr11 +#define A0 $xr12 +#define A1 $xr13 +#define A2 $xr14 +#define A3 $xr15 +#define A4 $xr16 +#define A5 $xr17 +#define A6 $xr18 +#define A7 $xr19 +#define A8 $xr20 +#define A9 $xr21 +#define A10 $xr22 +#define A11 $xr23 +#define A12 $xr24 +#define A13 $xr25 +#define A14 $xr26 +#define A15 $xr27 +#define TMP0 $xr28 +#define TMP1 $xr29 +#define TMP2 $xr30 + +#if !defined(CONJ) +#if !defined(XCONJ) +#define GXCONJ 0 +#define GCONJ 0 +#else +#define GXCONJ 1 +#define GCONJ 0 +#endif +#else +#if !defined(XCONJ) +#define GXCONJ 0 +#define GCONJ 1 +#else +#define GXCONJ 1 +#define GCONJ 1 +#endif +#endif + +.macro ZLOAD_X_4 + GLD xv, , X0, X, 0x00, X1, X, 0x10, X2, X, 0x20, X3, X, 0x30 + GPERMI xv, q, X0, X0, 0, X1, X1, 0, X2, X2, 0, X3, X3, 0 + GCOMPLEXMUL GXCONJ, \ + xvf, d, X0, 
X0, VALPHA, TMP0, TMP1, TMP2, \ + X1, X1, VALPHA, TMP0, TMP1, TMP2, \ + X2, X2, VALPHA, TMP0, TMP1, TMP2, \ + X3, X3, VALPHA, TMP0, TMP1, TMP2 +.endm + +.macro ZLOAD_X_4_GAP + xvld X0, X, 0 + xvpermi.q X0, X0, 0 + + PTR_ADD T0, X, INC_X + xvld X1, T0, 0 + xvpermi.q X1, X1, 0 + + PTR_ADD T0, T0, INC_X + xvld X2, T0, 0 + xvpermi.q X2, X2, 0 + + PTR_ADD T0, T0, INC_X + xvld X3, T0, 0 + xvpermi.q X3, X3, 0 + + GCOMPLEXMUL GXCONJ, \ + xvf, d, X0, X0, VALPHA, TMP0, TMP1, TMP2, \ + X1, X1, VALPHA, TMP0, TMP1, TMP2, \ + X2, X2, VALPHA, TMP0, TMP1, TMP2, \ + X3, X3, VALPHA, TMP0, TMP1, TMP2 +.endm + +.macro ZLOAD_Y_4 + GLD xv, , Y0, Y, 0, Y1, Y, 0x20 +.endm + +.macro ZLOAD_Y_4_GAP + vld $vr10, Y, 0 + vldx $vr13, Y, INC_Y + PTR_ALSL T0, INC_Y, Y, 1 + vld $vr11, T0, 0 + vldx $vr14, T0, INC_Y + GPERMI xv, q, Y0, A1, 0x02, Y1, A2, 0x02 +.endm + +.macro ZGEMV_N_4x4 + GLD_INC xv, , 0x20, \ + A0, PA0, 0, A1, PA0, 0, \ + A2, PA1, 0, A3, PA1, 0, \ + A4, PA2, 0, A5, PA2, 0, \ + A6, PA3, 0, A7, PA3, 0 + GCOMPLEXMADD GXCONJ, GCONJ, \ + xvf, d, Y0, X0, A0, Y0, TMP0, TMP1, TMP2, Y1, X0, A1, Y1, TMP0, TMP1, TMP2, \ + Y0, X1, A2, Y0, TMP0, TMP1, TMP2, Y1, X1, A3, Y1, TMP0, TMP1, TMP2, \ + Y0, X2, A4, Y0, TMP0, TMP1, TMP2, Y1, X2, A5, Y1, TMP0, TMP1, TMP2, \ + Y0, X3, A6, Y0, TMP0, TMP1, TMP2, Y1, X3, A7, Y1, TMP0, TMP1, TMP2 +.endm + +.macro ZSTORE_Y_4 + GST xv, , Y0, Y, 0, Y1, Y, 0x20 +.endm + +.macro ZSTORE_Y_4_GAP + xvstelm.d Y0, Y, 0, 0 + xvstelm.d Y0, Y, 0x08, 1 + PTR_ADD T0, Y, INC_Y + xvstelm.d Y0, T0, 0, 2 + xvstelm.d Y0, T0, 0x08, 3 + PTR_ADD T0, T0, INC_Y + xvstelm.d Y1, T0, 0, 0 + xvstelm.d Y1, T0, 0x08, 1 + PTR_ADD T0, T0, INC_Y + xvstelm.d Y1, T0, 0, 2 + xvstelm.d Y1, T0, 0x08, 3 +.endm + +.macro ZLOAD_Y_1 + vld $vr10, Y, 0 +.endm + +.macro ZGEMV_N_1x4 + GLD_INC v, , 0x10, $vr12, PA0, 0, $vr14, PA1, 0, $vr16, PA2, 0, $vr18, PA3, 0 + GCOMPLEXMADD GXCONJ, GCONJ, \ + xvf, d, Y0, X0, A0, Y0, TMP0, TMP1, TMP2, \ + Y0, X1, A2, Y0, TMP0, TMP1, TMP2, \ + Y0, X2, A4, Y0, TMP0, TMP1, 
TMP2, \ + Y0, X3, A6, Y0, TMP0, TMP1, TMP2 +.endm + +.macro ZSTORE_Y_1 + vst $vr10, Y, 0 +.endm + +.macro ZLOAD_X_1 + GLD xv, , X0, X, 0x00 + GPERMI xv, q, X0, X0, 0 + GCOMPLEXMUL GXCONJ, \ + xvf, d, X0, X0, VALPHA, TMP0, TMP1, TMP2 +.endm + +.macro ZGEMV_N_1x1 + GLD_INC v, , 0x10, $vr12, PA0, 0 + GCOMPLEXMADD GXCONJ, GCONJ, \ + xvf, d, Y0, X0, A0, Y0, TMP0, TMP1, TMP2 +.endm + +.macro ZGEMV_N_LASX XW:req, X_4:req, X_1:req, Y_4:req, Y_1:req + PTR_SRLI J, N, 2 + beqz J, .L_\XW\()_N_3 + PTR_SLLI K_LDA, LDA, 2 + PTR_SUB K_LDA, K_LDA, M16 +.L_\XW\()_N_L4: + ZLOAD_\X_4 + xor K, K, K + move Y, Y_ORG + PTR_SRLI I, M, 2 + beqz I, .L_\XW\()_M_3 +.align 5 +.L_\XW\()_M_L4: + ZLOAD_\Y_4 + ZGEMV_N_4x4 + ZSTORE_\Y_4 + PTR_ADDI I, I, -1 + PTR_ALSL Y, INC_Y, Y, 2 + PTR_ADDI K, K, 4 + bnez I, .L_\XW\()_M_L4 +.L_\XW\()_M_3: + andi I, M, 3 + beqz I, .L_\XW\()_M_END +.align 5 +.L_\XW\()_M_L1: + ZLOAD_\Y_1 + ZGEMV_N_1x4 + ZSTORE_\Y_1 + PTR_ADDI I, I, -1 + PTR_ADD Y, Y, INC_Y + PTR_ADDI K, K, 1 + bnez I, .L_\XW\()_M_L1 +.L_\XW\()_M_END: + PTR_ADDI J, J, -1 +#if __loongarch_grlen == 64 + GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA +#elif __loongarch_grlen == 32 + GADD , w, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA +#else + GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA +#endif + PTR_ALSL X, INC_X, X, 2 + bnez J, .L_\XW\()_N_L4 +.L_\XW\()_N_3: + andi J, N, 3 + beqz J, .L_END +.L_\XW\()_N_L1: + ZLOAD_\X_1 + xor K, K, K + move Y, Y_ORG + move I, M + beqz I, .L_END +.align 5 +.L_\XW\()_N_1_M_L1: + ZLOAD_\Y_1 + ZGEMV_N_1x1 + ZSTORE_\Y_1 + PTR_ADDI I, I, -1 + PTR_ADD Y, Y, INC_Y + PTR_ADDI K, K, 1 + bnez I, .L_\XW\()_N_1_M_L1 +.L_\XW\()_N_1_M_END: + PTR_ADDI J, J, -1 + PTR_SUB K_LDA, LDA, M16 + PTR_ADD PA0, PA0, K_LDA + PTR_ADD X, X, INC_X + bnez J, .L_\XW\()_N_L1 + + b .L_END +.endm + + PROLOGUE + PTR_LD INC_Y, $sp, 0 + push_if_used 17 + 7, 31 + PTR_ADDI K, $r0, 0x01 + PTR_SUB I, INC_X, K + PTR_SUB J, 
INC_Y, K + maskeqz I, K, I /* if(inc_x == 1) I = 0; else I = 1; */ + maskeqz J, K, J /* if(inc_y == 1) j = 0; else j = 1; */ + PTR_ALSL I, I, J, 1 + GSLLI , d, LDA, LDA, 4, INC_X, INC_X, 4, INC_Y, INC_Y, 4, M16, M, 4 + // Init VALPHA + xvpackev.d $xr0, $xr1, $xr0 + xvreplve0.q VALPHA, $xr0 + move Y_ORG, Y + move PA0, A +#if __loongarch_grlen == 64 + GADD , d, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA +#elif __loongarch_grlen == 32 + GADD , w, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA +#else + GADD , d, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA +#endif + la.local T0, .L_GAP_TABLE + PTR_ALSL I, I, T0, 1 + ld.h K, I, 0 // Obtain the offset address + PTR_ADD T0, T0, K + jirl $r0, T0, 0 +.L_GAP_TABLE: + .hword .L_GAP_0_0 - .L_GAP_TABLE + .hword .L_GAP_0_1 - .L_GAP_TABLE + .hword .L_GAP_1_0 - .L_GAP_TABLE + .hword .L_GAP_1_1 - .L_GAP_TABLE +.L_GAP_0_0: /* if (inc_x == 1) && (incy == 1) */ + ZGEMV_N_LASX GAP_0_0, X_4, X_1, Y_4, Y_1 +.L_GAP_0_1: /* if (inc_x == 1) && (incy != 1) */ + ZGEMV_N_LASX GAP_0_1, X_4, X_1, Y_4_GAP, Y_1 +.L_GAP_1_0: /* if (inc_x != 1) && (incy == 1) */ + ZGEMV_N_LASX GAP_1_0, X_4_GAP, X_1, Y_4, Y_1 +.L_GAP_1_1: /* if (inc_x != 1) && (incy != 1) */ + ZGEMV_N_LASX GAP_1_1, X_4_GAP, X_1, Y_4_GAP, Y_1 +.L_END: + pop_if_used 17 + 7, 31 + jirl $r0, $r1, 0x0 + EPILOGUE + diff --git a/kernel/loongarch64/zgemv_t_4_lasx.S b/kernel/loongarch64/zgemv_t_4_lasx.S new file mode 100644 index 000000000..4d33b8f96 --- /dev/null +++ b/kernel/loongarch64/zgemv_t_4_lasx.S @@ -0,0 +1,299 @@ +/******************************************************************************* +Copyright (c) 2024, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*******************************************************************************/ +#define ASSEMBLER + +#include "common.h" +#include "loongarch64_asm.S" + +/********************************************************************* +* 2024/02/20 guxiwei +* UTEST : OK +* CTEST : OK +* TEST : OK +* +* +*********************************************************************/ + +/* int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, + * FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) + */ +#define M $r4 +#define N $r5 +#define ALPHA_R $f0 +#define ALPHA_I $f1 +#define A $r7 +#define LDA $r8 +#define X $r9 +#define INC_X $r10 +#define Y $r11 +#define INC_Y $r6 + +#define J $r12 +#define I $r13 +#define K $r14 +#define PY0 $r14 +#define X_ORG $r15 +#define PY1 $r16 +#define K_LDA $r17 +#define PY2 $r18 +#define T0 $r19 +#define PA0 $r20 +#define PA1 $r23 +#define PA2 $r24 +#define PA3 $r25 +#define PA4 $r26 +#define PA5 $r27 +#define PA6 $r28 +#define PA7 $r29 +#define M16 $r30 + +#define VALPHA $xr0 +#define X0 $xr1 +#define X1 $xr2 +#define A0 $xr3 +#define A1 $xr4 +#define A2 $xr5 +#define A3 $xr6 +#define A4 $xr7 +#define A5 $xr8 +#define A6 $xr9 +#define A7 $xr10 +#define A8 $xr11 +#define A9 $xr12 +#define A10 $xr13 +#define A11 $xr14 +#define A12 $xr15 +#define A13 $xr16 +#define A14 $xr17 +#define A15 $xr18 +#define TP0 $xr19 +#define TP1 $xr20 +#define TP2 $xr21 +#define TP3 $xr22 +#define TP4 $xr23 +#define TP5 $xr24 +#define TP6 $xr25 +#define TP7 $xr26 +#define TMP0 $xr27 +#define TMP1 $xr28 +#define TMP2 $xr29 +#define Y0 $xr3 +#define Y1 $xr4 +#define Y2 $xr5 +#define Y3 $xr6 +#define Y4 $xr7 +#define Y5 $xr8 +#define Y6 $xr9 +#define Y7 $xr10 + +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) +#define GXCONJ1 0 +#define GCONJ1 0 +#else +#define GXCONJ1 1 +#define GCONJ1 0 +#endif + +#if !defined(XCONJ) +#define GXCONJ2 0 +#define GCONJ2 0 +#else +#define 
GXCONJ2 0 +#define GCONJ2 1 +#endif + +.macro ZERO_Y4 + GXOR xv, v, TP0, TP0, TP0, TP1, TP1, TP1, TP2, TP2, TP2, TP3, TP3, TP3 +.endm + +.macro ZERO_Y1 + GXOR xv, v, TP0, TP0, TP0 +.endm + +.macro ZLOAD_X4 + GLD xv, , X0, X, 0x00, X1, X, 0x20 +.endm + +.macro ZLOAD_X4_GAP + xvld X0, X, 0 + + PTR_ADD T0, X, INC_X + xvld A0, T0, 0 + xvpermi.q X0, A0, 0x02 + + PTR_ADD T0, T0, INC_X + xvld X1, T0, 0 + + PTR_ADD T0, T0, INC_X + xvld A0, T0, 0 + xvpermi.q X1, A0, 0x02 +.endm + +.macro ZGEMV_T_4x4 + GLD_INC xv, , 0x20, \ + A0, PA0, 0, A1, PA0, 0, \ + A2, PA1, 0, A3, PA1, 0, \ + A4, PA2, 0, A5, PA2, 0, \ + A6, PA3, 0, A7, PA3, 0 + GCOMPLEXMADD GXCONJ1, GCONJ1, \ + xvf, d, TP0, A0, X0, TP0, TMP0, TMP1, TMP2, TP0, A1, X1, TP0, TMP0, TMP1, TMP2, \ + TP1, A2, X0, TP1, TMP0, TMP1, TMP2, TP1, A3, X1, TP1, TMP0, TMP1, TMP2, \ + TP2, A4, X0, TP2, TMP0, TMP1, TMP2, TP2, A5, X1, TP2, TMP0, TMP1, TMP2, \ + TP3, A6, X0, TP3, TMP0, TMP1, TMP2, TP3, A7, X1, TP3, TMP0, TMP1, TMP2 +.endm + +.macro ZGEMV_T_LASX XW:req, X4:req + PTR_SRLI J, N, 2 + beqz J, .L_\XW\()_N_3 + PTR_SLLI K_LDA, LDA, 2 + PTR_SUB K_LDA, K_LDA, M16 +.L_\XW\()_N_L4: + ZERO_Y4 + move X, X_ORG + PTR_SRLI I, M, 2 + beqz I, .L_\XW\()_M_3 +.align 5 +.L_\XW\()_M_L4: + ZLOAD_\X4 + ZGEMV_T_4x4 + PTR_ADDI I, I, -1 + PTR_ALSL X, INC_X, X, 2 + bnez I, .L_\XW\()_M_L4 +.L_\XW\()_M_3: + // Accumulated + GCOMPLEXACC xvf, d, Y0, TP0, Y1, TP1, Y2, TP2, Y3, TP3 + andi I, M, 3 + beqz I, .L_\XW\()_M_END +.align 5 +.L_\XW\()_M_L1: + GLD xv, , X0, X, 0x00, A8, PA0, 0x00, A9, PA1, 0x00, A10, PA2, 0x00, A11, PA3, 0x00 +#if __loongarch_grlen == 64 + GADDI , d, PA0, PA0, 0x10, PA1, PA1, 0x10, PA2, PA2, 0x10, PA3, PA3, 0x10 +#elif __loongarch_grlen == 32 + GADDI , w, PA0, PA0, 0x10, PA1, PA1, 0x10, PA2, PA2, 0x10, PA3, PA3, 0x10 +#else + GADDI , d, PA0, PA0, 0x10, PA1, PA1, 0x10, PA2, PA2, 0x10, PA3, PA3, 0x10 +#endif + + GCOMPLEXMADD GXCONJ1, GCONJ1, \ + xvf, d, A0, A8, X0, A0, TMP0, TMP1, TMP2, A1, A9, X0, A1, TMP0, TMP1, TMP2, \ + A2, A10, 
X0, A2, TMP0, TMP1, TMP2, A3, A11, X0, A3, TMP0, TMP1, TMP2 + + PTR_ADDI I, I, -1 + PTR_ADD X, X, INC_X + bnez I, .L_\XW\()_M_L1 +.L_\XW\()_M_END: + xvld A8, Y, 0x00 + xvldx A9, Y, INC_Y + PTR_ALSL PY0, INC_Y, Y, 1 + xvld A10, PY0, 0x00 + xvldx A11, PY0, INC_Y + + GCOMPLEXMADD GXCONJ2, GCONJ2, \ + xvf, d, A8, VALPHA, A0, A8, TMP0, TMP1, TMP2, A9, VALPHA, A1, A9, TMP0, TMP1, TMP2,\ + A10, VALPHA, A2, A10, TMP0, TMP1, TMP2, A11, VALPHA, A3, A11, TMP0, TMP1, TMP2 + + PTR_ADDI J, J, -1 +#if __loongarch_grlen == 64 + GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA +#elif __loongarch_grlen == 32 + GADD , w, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA +#else + GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA +#endif + vst $vr11, Y, 0x00 + vstx $vr12, Y, INC_Y + vst $vr13, PY0, 0x00 + vstx $vr14, PY0, INC_Y + PTR_ALSL Y, INC_Y, Y, 2 + bnez J, .L_\XW\()_N_L4 +.L_\XW\()_N_3: + andi J, N, 3 + beqz J, .L_END + PTR_SUB K_LDA, LDA, M16 +.L_\XW\()_N_1: + ZERO_Y1 + move X, X_ORG + move I, M + beqz I, .L_END +.align 5 +.L_\XW\()_N_1_M_L1: + GLD xv, , A0, PA0, 0x00, X0, X, 0x00 + GCOMPLEXMADD GXCONJ1, GCONJ1, \ + xvf, d, TP0, A0, X0, TP0, TMP0, TMP1, TMP2 + PTR_ADDI I, I, -1 + PTR_ADD X, X, INC_X + PTR_ADDI PA0, PA0, 0x10 + bnez I, .L_\XW\()_N_1_M_L1 +.L_\XW\()_N_1_M_END: + PTR_ADDI J, J, -1 + xvld A0, Y, 0x00 + GCOMPLEXMADD GXCONJ2, GCONJ2, \ + xvf, d, A0, VALPHA, TP0, A0, TMP0, TMP1, TMP2 + vst $vr3, Y, 0x00 + PTR_ADD PA0, PA0, K_LDA + PTR_ADD Y, Y, INC_Y + bnez J, .L_\XW\()_N_1 + + b .L_END +.endm + + PROLOGUE + PTR_LD INC_Y, $sp, 0 + push_if_used 17 + 8, 30 + PTR_ADDI K, $r0, 0x01 + PTR_SUB I, INC_X, K + maskeqz I, K, I /* if(inc_x == 1) I = 0; else I = 1; */ + GSLLI , d, LDA, LDA, 4, INC_X, INC_X, 4, INC_Y, INC_Y, 4, M16, M, 4 + // Init VALPHA + xvpackev.d $xr0, $xr1, $xr0 + xvreplve0.q VALPHA, $xr0 + move X_ORG, X + move PA0, A +#if __loongarch_grlen == 64 + GADD , d, PA1, PA0, LDA, PA2, PA1, LDA, 
PA3, PA2, LDA, PA4, PA3, LDA +#elif __loongarch_grlen == 32 + GADD , w, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA +#else + GADD , d, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA +#endif + la.local T0, .L_GAP_TABLE + PTR_ALSL I, I, T0, 1 + ld.h K, I, 0 + PTR_ADD T0, T0, K + jirl $r0, T0, 0 +.L_GAP_TABLE: + .hword .L_GAP_0 - .L_GAP_TABLE + .hword .L_GAP_1 - .L_GAP_TABLE +.L_GAP_0: /* if (incx == 1) */ + ZGEMV_T_LASX GAP_0, X4 +.L_GAP_1: /* if (incx != 1) */ + ZGEMV_T_LASX GAP_1, X4_GAP +.L_END: + pop_if_used 17 + 8, 30 + jirl $r0, $r1, 0x0 + EPILOGUE From 892f8ff3e55e24fda9af3f6364319cce3f60116b Mon Sep 17 00:00:00 2001 From: Ayappan Perumal Date: Thu, 22 Feb 2024 07:05:37 -0600 Subject: [PATCH 13/21] Shared library support for AIX --- Makefile | 3 +++ Makefile.system | 4 ---- exports/Makefile | 18 ++++++++++++++++++ 3 files changed, 21 insertions(+), 4 deletions(-) diff --git a/Makefile b/Makefile index b344abcd2..19bab2915 100644 --- a/Makefile +++ b/Makefile @@ -152,6 +152,9 @@ endif ifeq ($(OSNAME), CYGWIN_NT) @$(MAKE) -C exports dll endif +ifeq ($(OSNAME), AIX) + @$(MAKE) -C exports so +endif endif tests : shared diff --git a/Makefile.system b/Makefile.system index 0088eaff5..f7ccc7746 100644 --- a/Makefile.system +++ b/Makefile.system @@ -1707,11 +1707,7 @@ endif LIBDLLNAME = $(LIBPREFIX).dll IMPLIBNAME = lib$(LIBNAMEBASE).dll.a -ifneq ($(OSNAME), AIX) LIBSONAME = $(LIBNAME:.$(LIBSUFFIX)=.so) -else -LIBSONAME = $(LIBNAME:.$(LIBSUFFIX)=.a) -endif LIBDYNNAME = $(LIBNAME:.$(LIBSUFFIX)=.dylib) LIBDEFNAME = $(LIBNAME:.$(LIBSUFFIX)=.def) LIBEXPNAME = $(LIBNAME:.$(LIBSUFFIX)=.exp) diff --git a/exports/Makefile b/exports/Makefile index 7682f851d..cf948ccb2 100644 --- a/exports/Makefile +++ b/exports/Makefile @@ -73,6 +73,10 @@ endif endif endif +ifeq ($(F_COMPILER)$(OSNAME), IBMAIX) +EXTRALIB += -lxlf90 +endif + ifeq ($(C_COMPILER), PGI) EXTRALIB += -pgf90libs endif @@ -248,6 +252,20 @@ endif ifeq ($(OSNAME), AIX) +so : ../$(LIBSONAME) 
linktest.c
+	$(CC) $(CFLAGS) $(LDFLAGS) -w -o linktest linktest.c ../$(LIBSONAME) $(EXTRALIB) && echo OK.
+	rm -f linktest
+
+../$(LIBSONAME) : aix.exp
+	$(CC) $(CFLAGS) $(LDFLAGS) -shared -o ../$(LIBSONAME) \
+	-Wl,-bE:aix.exp -Wl,-bbigtoc ../$(LIBNAME) $(EXTRALIB)
+
+aix.exp : ../$(LIBNAME) # regenerate the export list whenever the static archive changes
+	/usr/bin/nm -X32_64 -PCpgl ../$(LIBNAME) | /usr/bin/awk '{ if ((($$ 2 == "T") \
+	|| ($$ 2 == "D") || ($$ 2 == "B") || ($$ 2 == "W") || ($$ 2 == "V") || ($$ 2 == "Z")) && (substr($$ 1,1,1) != ".")) \
+	{ if (($$ 2 == "W") || ($$ 2 == "V") || ($$ 2 == "Z")) { print $$ 1 " weak" } else { print $$ 1 } } }' | \
+	/usr/bin/sort -u > aix.exp
+
 ifeq ($(COMPILER_F77), xlf)
 
 goto32.$(SUFFIX) : ../$(LIBNAME) aix.def

From 82b81c0bbee5d1344dcd2fb0f50cba6a7c0ded9f Mon Sep 17 00:00:00 2001
From: Martin Kroeker
Date: Thu, 22 Feb 2024 22:11:50 +0100
Subject: [PATCH 14/21] Dont fail if there is no Fortran compiler

---
 cmake/f_check.cmake | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/cmake/f_check.cmake b/cmake/f_check.cmake
index df3a4858d..4c4f5ac04 100644
--- a/cmake/f_check.cmake
+++ b/cmake/f_check.cmake
@@ -64,6 +64,7 @@ else ()
     "#define NEEDBUNDERSCORE 1\n")
 endif()
 
+if (CMAKE_Fortran_COMPILER)
 get_filename_component(F_COMPILER ${CMAKE_Fortran_COMPILER} NAME_WE)
 string(TOUPPER ${F_COMPILER} F_COMPILER)
-
+endif()

From 8fc2c2db043eda5e415c1673779c8bdcd91870fd Mon Sep 17 00:00:00 2001
From: Martin Kroeker
Date: Thu, 22 Feb 2024 22:14:13 +0100
Subject: [PATCH 15/21] Fix missing support for INTERFACE64 on ARM64 and MIPS64

---
 cmake/fc.cmake | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/cmake/fc.cmake b/cmake/fc.cmake
index 5c30be843..bc85a2921 100644
--- a/cmake/fc.cmake
+++ b/cmake/fc.cmake
@@ -6,9 +6,6 @@
 if (${F_COMPILER} STREQUAL "FLANG" AND NOT CMAKE_Fortran_COMPILER_ID STREQUAL "LLVMFlang")
   # This is for classic Flang. LLVM Flang is handled with gfortran below.
set(CCOMMON_OPT "${CCOMMON_OPT} -DF_INTERFACE_FLANG")
-  if (BINARY64 AND INTERFACE64)
-    set(FCOMMON_OPT "${FCOMMON_OPT} -i8")
-  endif ()
   if (USE_OPENMP)
     set(FCOMMON_OPT "${FCOMMON_OPT} -fopenmp")
   endif ()
@@ -55,6 +52,9 @@ if (${F_COMPILER} STREQUAL "GFORTRAN" OR ${F_COMPILER} STREQUAL "F95" OR CMAKE_F
   if (MIPS64)
     if (BINARY64)
       set(FCOMMON_OPT "${FCOMMON_OPT} -mabi=64")
+      if (INTERFACE64)
+        set(FCOMMON_OPT "${FCOMMON_OPT} -fdefault-integer-8")
+      endif ()
     else ()
       set(FCOMMON_OPT "${FCOMMON_OPT} -mabi=n32")
     endif ()
@@ -83,6 +83,9 @@ if (${F_COMPILER} STREQUAL "GFORTRAN" OR ${F_COMPILER} STREQUAL "F95" OR CMAKE_F
       endif ()
     endif ()
   endif ()
+  if (ARM64 AND INTERFACE64)
+    set(FCOMMON_OPT "${FCOMMON_OPT} -fdefault-integer-8")
+  endif ()
 else ()
   if (BINARY64)
     set(FCOMMON_OPT "${FCOMMON_OPT} -m64")

From 3516fff378cf5d9153d18c86d9a117b0976e777d Mon Sep 17 00:00:00 2001
From: Martin Kroeker
Date: Thu, 22 Feb 2024 22:15:28 +0100
Subject: [PATCH 16/21] Avoid linking both libgomp and libomp in mixed clang/gfortran builds

---
 ctest/CMakeLists.txt | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/ctest/CMakeLists.txt b/ctest/CMakeLists.txt
index 91338b73b..6e0a7f309 100644
--- a/ctest/CMakeLists.txt
+++ b/ctest/CMakeLists.txt
@@ -40,6 +40,10 @@ else()
       c_${float_char}blas1.c)
 endif()
   target_link_libraries(x${float_char}cblat1 ${OpenBLAS_LIBNAME})
+  if (USE_OPENMP AND ("${CMAKE_Fortran_COMPILER_ID}" STREQUAL "GNU") AND ("${CMAKE_C_COMPILER_ID}" STREQUAL "Clang")) # quoted: an unset compiler ID must not break if()
+    string(REGEX REPLACE "-fopenmp" "" CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS}")
+    target_link_libraries(x${float_char}cblat1 omp pthread)
+  endif()
   if(${CMAKE_SYSTEM_NAME} MATCHES "Linux" OR ${CMAKE_SYSTEM_NAME} MATCHES "FreeBSD" OR ${CMAKE_SYSTEM_NAME} MATCHES "QNX")
     target_link_libraries(x${float_char}cblat1 m)
   endif()
@@ -65,6 +69,10 @@ else()
       constant.c)
 endif()
   target_link_libraries(x${float_char}cblat2 ${OpenBLAS_LIBNAME})
+  if (USE_OPENMP AND ("${CMAKE_Fortran_COMPILER_ID}" STREQUAL "GNU") AND ("${CMAKE_C_COMPILER_ID}" STREQUAL "Clang")) # quoted: an unset compiler ID must not break if()
+    string(REGEX REPLACE "-fopenmp" "" CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS}")
+    target_link_libraries(x${float_char}cblat2 omp pthread)
+  endif()
   if(${CMAKE_SYSTEM_NAME} MATCHES "Linux" OR ${CMAKE_SYSTEM_NAME} MATCHES "FreeBSD" OR ${CMAKE_SYSTEM_NAME} MATCHES "QNX")
     target_link_libraries(x${float_char}cblat2 m)
   endif()
@@ -90,6 +98,10 @@ else()
       constant.c)
 endif()
   target_link_libraries(x${float_char}cblat3 ${OpenBLAS_LIBNAME})
+  if (USE_OPENMP AND ("${CMAKE_Fortran_COMPILER_ID}" STREQUAL "GNU") AND ("${CMAKE_C_COMPILER_ID}" STREQUAL "Clang")) # quoted: an unset compiler ID must not break if()
+    string(REGEX REPLACE "-fopenmp" "" CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS}")
+    target_link_libraries(x${float_char}cblat3 omp pthread)
+  endif()
   if(${CMAKE_SYSTEM_NAME} MATCHES "Linux" OR ${CMAKE_SYSTEM_NAME} MATCHES "FreeBSD" OR ${CMAKE_SYSTEM_NAME} MATCHES "QNX")
     target_link_libraries(x${float_char}cblat3 m)
   endif()

From 4adfe4d53185233bfbeeb362734a2d80814d4457 Mon Sep 17 00:00:00 2001
From: Martin Kroeker
Date: Thu, 22 Feb 2024 22:16:01 +0100
Subject: [PATCH 17/21] Avoid linking both libgomp and libomp in mixed clang/gfortran builds

---
 test/CMakeLists.txt | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index d68b12d87..ace20dffc 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -21,10 +21,14 @@ endif()
 if (BUILD_COMPLEX16)
 	list (APPEND OpenBLAS_Tests zblat1 zblat2 zblat3)
 endif()
-
+message (STATUS CCOMP ${CMAKE_C_COMPILER_ID} FCOMP ${CMAKE_Fortran_COMPILER_ID})
 foreach(test_bin ${OpenBLAS_Tests})
 add_executable(${test_bin} ${test_bin}.f)
 target_link_libraries(${test_bin} ${OpenBLAS_LIBNAME})
+if (USE_OPENMP AND ("${CMAKE_Fortran_COMPILER_ID}" STREQUAL "GNU") AND ("${CMAKE_C_COMPILER_ID}" STREQUAL "Clang")) # quoted: an unset compiler ID must not break if()
+ string(REGEX REPLACE "-fopenmp" "" CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS}")
+target_link_libraries(${test_bin} omp pthread)
+endif()
 endforeach()
 
 # $1 exec, $2 input, $3 output_result

From ca121eb5eda1c635f1353d261de5783e079c21c1 Mon Sep 17 00:00:00 2001
From: Martin Kroeker
Date: Thu, 22 Feb 2024 22:17:05 +0100
Subject: [PATCH 18/21] Avoid linking both libgomp and libomp in mixed clang/gfortran builds

---
 lapack-netlib/TESTING/EIG/CMakeLists.txt | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/lapack-netlib/TESTING/EIG/CMakeLists.txt b/lapack-netlib/TESTING/EIG/CMakeLists.txt
index e7236677a..b69417853 100644
--- a/lapack-netlib/TESTING/EIG/CMakeLists.txt
+++ b/lapack-netlib/TESTING/EIG/CMakeLists.txt
@@ -107,6 +107,10 @@ set(ZDMDEIGTST zchkdmd.f90)
 macro(add_eig_executable name)
   add_executable(${name} ${ARGN})
   target_link_libraries(${name} openblas${SUFFIX64_UNDERSCORE})
+if (USE_OPENMP AND ("${CMAKE_Fortran_COMPILER_ID}" STREQUAL "GNU") AND ("${CMAKE_C_COMPILER_ID}" STREQUAL "Clang")) # quoted: an unset compiler ID must not break if()
+ string(REGEX REPLACE "-fopenmp" "" CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS}")
+target_link_libraries(${name} omp pthread)
+endif()
 #${TMGLIB} ../${LAPACK_LIBRARIES} ${BLAS_LIBRARIES})
 endmacro()
 

From be20588a3c7c0725cd38846ce3408604fbfb2c95 Mon Sep 17 00:00:00 2001
From: Martin Kroeker
Date: Thu, 22 Feb 2024 22:17:48 +0100
Subject: [PATCH 19/21] Avoid linking both libgomp and libomp in mixed clang/gfortran builds

---
 lapack-netlib/TESTING/LIN/CMakeLists.txt | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/lapack-netlib/TESTING/LIN/CMakeLists.txt b/lapack-netlib/TESTING/LIN/CMakeLists.txt
index 143fd0597..9ae0cea79 100644
--- a/lapack-netlib/TESTING/LIN/CMakeLists.txt
+++ b/lapack-netlib/TESTING/LIN/CMakeLists.txt
@@ -240,6 +240,10 @@ set(ZLINTSTRFP zchkrfp.f zdrvrfp.f zdrvrf1.f zdrvrf2.f zdrvrf3.f zdrvrf4.f zerrr
 macro(add_lin_executable name)
   add_executable(${name} ${ARGN})
   target_link_libraries(${name} openblas${SUFFIX64_UNDERSCORE})
+  if (USE_OPENMP AND ("${CMAKE_Fortran_COMPILER_ID}" STREQUAL "GNU") AND ("${CMAKE_C_COMPILER_ID}" STREQUAL "Clang")) # quoted: an unset compiler ID must not break if()
+    string(REGEX REPLACE "-fopenmp" "" CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS}")
+    target_link_libraries(${name} omp
pthread) + endif() #${TMGLIB} ${LAPACK_LIBRARIES} ${BLAS_LIBRARIES}) endmacro() From 16b488cabe8c6113ba41bf140f4dfe3cace6b9ac Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 22 Feb 2024 22:38:05 +0100 Subject: [PATCH 20/21] CI: Add various Apple M1 build configurations to gh workflow --- .github/workflows/apple_m.yml | 149 ++++++++++++++++++++++++++++++++++ 1 file changed, 149 insertions(+) create mode 100644 .github/workflows/apple_m.yml diff --git a/.github/workflows/apple_m.yml b/.github/workflows/apple_m.yml new file mode 100644 index 000000000..e34eada86 --- /dev/null +++ b/.github/workflows/apple_m.yml @@ -0,0 +1,149 @@ +name: apple m + +on: [push, pull_request] + +concurrency: + group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} + cancel-in-progress: true + +permissions: + contents: read # to fetch code (actions/checkout) + +jobs: + build: + if: "github.repository == 'OpenMathLib/OpenBLAS'" + runs-on: macos-14 + + strategy: + fail-fast: false + matrix: + build: [cmake, make] + fortran: [gfortran] + openmp: [0, 1] + ilp64: [0, 1] + + steps: + - name: Checkout repository + uses: actions/checkout@v3 + + - name: Print system information + run: | + if [ "$RUNNER_OS" == "Linux" ]; then + cat /proc/cpuinfo + elif [ "$RUNNER_OS" == "macOS" ]; then + sysctl -a | grep machdep.cpu + else + echo "::error::$RUNNER_OS not supported" + exit 1 + fi + + - name: Install Dependencies + run: | + if [ "$RUNNER_OS" == "Linux" ]; then + sudo apt-get install -y gfortran cmake ccache libtinfo5 + elif [ "$RUNNER_OS" == "macOS" ]; then + # It looks like "gfortran" isn't working correctly unless "gcc" is re-installed. 
+ brew reinstall gcc + brew install coreutils cmake ccache + brew install llvm + else + echo "::error::$RUNNER_OS not supported" + exit 1 + fi + + - name: Compilation cache + uses: actions/cache@v3 + with: + path: ~/.ccache + # We include the commit sha in the cache key, as new cache entries are + # only created if there is no existing entry for the key yet. + # GNU make and cmake call the compilers differently. It looks like + # that causes the cache to mismatch. Keep the ccache for both build + # tools separate to avoid polluting each other. + key: ccache-${{ runner.os }}-${{ matrix.build }}-${{ matrix.fortran }}-${{ github.ref }}-${{ github.sha }} + # Restore a matching ccache cache entry. Prefer same branch and same Fortran compiler. + restore-keys: | + ccache-${{ runner.os }}-${{ matrix.build }}-${{matrix.fortran }}-${{ github.ref }} + ccache-${{ runner.os }}-${{ matrix.build }}-${{matrix.fortran }} + ccache-${{ runner.os }}-${{ matrix.build }} + + - name: Configure ccache + run: | + if [ "${{ matrix.build }}" = "make" ]; then + # Add ccache to path + if [ "$RUNNER_OS" = "Linux" ]; then + echo "/usr/lib/ccache" >> $GITHUB_PATH + elif [ "$RUNNER_OS" = "macOS" ]; then + echo "$(brew --prefix)/opt/ccache/libexec" >> $GITHUB_PATH + echo "/opt/homebrew/opt/llvm/bin" >>$GITHUB_PATH + echo "" >>$GITHUB_PATH + else + echo "::error::$RUNNER_OS not supported" + exit 1 + fi + fi + # Limit the maximum size and switch on compression to avoid exceeding the total disk or cache quota (5 GB). 
+ test -d ~/.ccache || mkdir -p ~/.ccache + echo "max_size = 300M" > ~/.ccache/ccache.conf + echo "compression = true" >> ~/.ccache/ccache.conf + ccache -s + + - name: Build OpenBLAS + run: | + export LDFLAGS="-L/opt/homebrew/opt/llvm/lib" + export CPPFLAGS="-I/opt/homebrew/opt/llvm/include" + export CC="/opt/homebrew/opt/llvm/bin/clang" + case "${{ matrix.build }}" in + "make") + make -j$(nproc) DYNAMIC_ARCH=1 USE_OPENMP=${{matrix.openmp}} INTERFACE64=${{matrix.ilp64}} FC="ccache ${{ matrix.fortran }}" + ;; + "cmake") + export LDFLAGS="$LDFLAGS -Wl,-ld_classic" + mkdir build && cd build + cmake -DDYNAMIC_ARCH=1 \ + -DUSE_OPENMP=${{matrix.openmp}} \ + -DINTERFACE64=${{matrix.ilp64}} \ + -DNOFORTRAN=0 \ + -DBUILD_WITHOUT_LAPACK=0 \ + -DCMAKE_VERBOSE_MAKEFILE=ON \ + -DCMAKE_BUILD_TYPE=Release \ + -DCMAKE_Fortran_COMPILER=${{ matrix.fortran }} \ + -DCMAKE_C_COMPILER_LAUNCHER=ccache \ + -DCMAKE_Fortran_COMPILER_LAUNCHER=ccache \ + .. + cmake --build . + ;; + *) + echo "::error::Configuration not supported" + exit 1 + ;; + esac + + - name: Show ccache status + continue-on-error: true + run: ccache -s + + - name: Run tests + timeout-minutes: 60 + run: | + case "${{ matrix.build }}" in + "make") + MAKE_FLAGS='DYNAMIC_ARCH=1 USE_OPENMP=0' + echo "::group::Tests in 'test' directory" + make -C test $MAKE_FLAGS FC="ccache ${{ matrix.fortran }}" + echo "::endgroup::" + echo "::group::Tests in 'ctest' directory" + make -C ctest $MAKE_FLAGS FC="ccache ${{ matrix.fortran }}" + echo "::endgroup::" + echo "::group::Tests in 'utest' directory" + make -C utest $MAKE_FLAGS FC="ccache ${{ matrix.fortran }}" + echo "::endgroup::" + ;; + "cmake") + cd build && ctest + ;; + *) + echo "::error::Configuration not supported" + exit 1 + ;; + esac From 5b953f2f8d3a3f138e4be4ed28624ee25826c968 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 22 Feb 2024 22:41:08 +0100 Subject: [PATCH 21/21] Disable most AppleM1 builds (replaced by gh workflows) --- .cirrus.yml | 70 
++++++++++++++++++++++++++--------------------------- 1 file changed, 35 insertions(+), 35 deletions(-) diff --git a/.cirrus.yml b/.cirrus.yml index b4c4870d0..9a898f421 100644 --- a/.cirrus.yml +++ b/.cirrus.yml @@ -1,44 +1,44 @@ macos_instance: image: ghcr.io/cirruslabs/macos-monterey-xcode:latest -task: - name: AppleM1/LLVM - compile_script: - - brew install llvm - - export PATH=/opt/homebrew/opt/llvm/bin:$PATH - - export LDFLAGS="-L/opt/homebrew/opt/llvm/lib" - - export CPPFLAGS="-I/opt/homebrew/opt/llvm/include" - - make TARGET=VORTEX USE_OPENMP=1 CC=clang +#task: +# name: AppleM1/LLVM +# compile_script: +# - brew install llvm +# - export PATH=/opt/homebrew/opt/llvm/bin:$PATH +# - export LDFLAGS="-L/opt/homebrew/opt/llvm/lib" +# - export CPPFLAGS="-I/opt/homebrew/opt/llvm/include" +# - make TARGET=VORTEX USE_OPENMP=1 CC=clang -task: - name: AppleM1/LLVM/ILP64 - compile_script: - - brew install llvm - - export PATH=/opt/homebrew/opt/llvm/bin:$PATH - - export LDFLAGS="-L/opt/homebrew/opt/llvm/lib" - - export CPPFLAGS="-I/opt/homebrew/opt/llvm/include" - - make TARGET=VORTEX USE_OPENMP=1 CC=clang INTERFACE64=1 +#task: +# name: AppleM1/LLVM/ILP64 +# compile_script: +# - brew install llvm +# - export PATH=/opt/homebrew/opt/llvm/bin:$PATH +# - export LDFLAGS="-L/opt/homebrew/opt/llvm/lib" +# - export CPPFLAGS="-I/opt/homebrew/opt/llvm/include" +# - make TARGET=VORTEX USE_OPENMP=1 CC=clang INTERFACE64=1 -task: - name: AppleM1/LLVM/CMAKE - compile_script: - - brew install llvm - - export PATH=/opt/homebrew/opt/llvm/bin:$PATH - - export LDFLAGS="-L/opt/homebrew/opt/llvm/lib" - - export CPPFLAGS="-I/opt/homebrew/opt/llvm/include" - - mkdir build - - cd build - - cmake -DTARGET=VORTEX -DCMAKE_C_COMPILER=clang -DBUILD_SHARED_LIBS=ON .. 
- - make -j 4 +#task: +# name: AppleM1/LLVM/CMAKE +# compile_script: +# - brew install llvm +# - export PATH=/opt/homebrew/opt/llvm/bin:$PATH +# - export LDFLAGS="-L/opt/homebrew/opt/llvm/lib" +# - export CPPFLAGS="-I/opt/homebrew/opt/llvm/include" +# - mkdir build +# - cd build +# - cmake -DTARGET=VORTEX -DCMAKE_C_COMPILER=clang -DBUILD_SHARED_LIBS=ON .. +# - make -j 4 -task: - name: AppleM1/GCC/MAKE/OPENMP - compile_script: - - brew install gcc@11 - - export PATH=/opt/homebrew/bin:$PATH - - export LDFLAGS="-L/opt/homebrew/lib" - - export CPPFLAGS="-I/opt/homebrew/include" - - make CC=gcc-11 FC=gfortran-11 USE_OPENMP=1 +#task: +# name: AppleM1/GCC/MAKE/OPENMP +# compile_script: +# - brew install gcc@11 +# - export PATH=/opt/homebrew/bin:$PATH +# - export LDFLAGS="-L/opt/homebrew/lib" +# - export CPPFLAGS="-I/opt/homebrew/include" +# - make CC=gcc-11 FC=gfortran-11 USE_OPENMP=1 macos_instance: image: ghcr.io/cirruslabs/macos-monterey-xcode:latest