Merge pull request #3536 from xianyi/develop
Update from develop for release 0.3.20
This commit is contained in:
		
						commit
						15ff556862
					
				|  | @ -251,12 +251,14 @@ if (APPLE AND DYNAMIC_ARCH AND BUILD_SHARED_LIBS) | |||
|   set (CMAKE_Fortran_USE_RESPONSE_FILE_FOR_OBJECTS 1) | ||||
|   set (CMAKE_Fortran_CREATE_SHARED_LIBRARY | ||||
|  "sh -c 'cat ${CMAKE_BINARY_DIR}/CMakeFiles/openblas_shared.dir/objects*.rsp | xargs -n 1024 ar -ru libopenblas.a && exit 0' " | ||||
|  "sh -c 'ar -ru libopenblas.a ${CMAKE_BINARY_DIR}/driver/others/CMakeFiles/driver_others.dir/xerbla.c.o && exit 0' " | ||||
|  "sh -c 'echo \"\" | ${CMAKE_Fortran_COMPILER} -o dummy.o -c -x f95-cpp-input - '" | ||||
|  "sh -c '${CMAKE_Fortran_COMPILER} -fpic -shared -Wl,-all_load -Wl,-force_load,libopenblas.a -Wl,-noall_load dummy.o -o ${CMAKE_LIBRARY_OUTPUT_DIRECTORY}/libopenblas.${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.dylib'" | ||||
|  "sh -c 'ls -l ${CMAKE_BINARY_DIR}/lib'") | ||||
|   else () | ||||
|   set (CMAKE_C_CREATE_SHARED_LIBRARY | ||||
|    "sh -c 'cat ${CMAKE_BINARY_DIR}/CMakeFiles/openblas_shared.dir/objects*.rsp | xargs -n 1024 ar -ru libopenblas.a && exit 0' " | ||||
|    "sh -c 'ar -ru libopenblas.a ${CMAKE_BINARY_DIR}/driver/others/CMakeFiles/driver_others.dir/xerbla.c.o && exit 0' " | ||||
|    "sh -c '${CMAKE_C_COMPILER} -fpic -shared -Wl,-all_load -Wl,-force_load,libopenblas.a -Wl,-noall_load -o ${CMAKE_LIBRARY_OUTPUT_DIRECTORY}/libopenblas.${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.dylib'") | ||||
|   endif () | ||||
| endif() | ||||
|  |  | |||
|  | @ -201,3 +201,9 @@ In chronological order: | |||
| * Bine Brank <https://github.com/binebrank> | ||||
|   * [2021-10-27] Add vector-length-agnostic DGEMM kernels for Arm SVE | ||||
|   * [2021-11-20] Vector-length-agnostic Arm SVE copy routines for DGEMM, DTRMM, DSYMM | ||||
|   * [2021-11-12] SVE kernels for SGEMM, STRMM and corresponding SVE copy functions | ||||
|   * [2022-01-06] SVE kernels for CGEMM, ZGEMM, CTRMM, ZTRMM and corresponding SVE copy functions | ||||
|   * [2022-01-18] SVE kernels and copy functions for TRSM | ||||
| 
 | ||||
| * Ilya Kurdyukov <https://github.com/ilyakurdyukov> | ||||
|   * [2021-02-21] Add basic support for the Elbrus E2000 architecture | ||||
|  |  | |||
|  | @ -1,4 +1,39 @@ | |||
| OpenBLAS ChangeLog | ||||
| ==================================================================== | ||||
| Version 0.3.20 | ||||
|  20-Feb-2022 | ||||
| 
 | ||||
| general: | ||||
|  - some code cleanup, with added casts etc. | ||||
|  - fixed obtaining the cpu count with OpenMP and OMP_PROC_BIND unset | ||||
|  - fixed pivot index calculation by ?LASWP for negative increments other than one | ||||
|  - fixed input argument check in LAPACK ?GEQRT2 | ||||
|  - improved the check for a Fortran compiler in CMAKE builds | ||||
|  - disabled building OpenBLAS' optimized versions of LAPACK complex SPMV,SPR,SYMV,SYR with NO_LAPACK=1 | ||||
|  - fixed building of LAPACK on certain distributed filesystems with parallel gmake | ||||
|  - fixed building the shared library on MacOS with classic flang | ||||
| 
 | ||||
| x86_64: | ||||
|  - fixed cross-compilation with CMAKE for CORE2 target | ||||
|  - fixed miscompilation of AVX512 code in DYNAMIC_ARCH builds | ||||
|  - added support for the "incidental" AVX512 hardware in Alder Lake when enabled in BIOS | ||||
| 
 | ||||
| E2K: | ||||
|  - add new architecture (Russian Elbrus E2000 family) | ||||
| 
 | ||||
| SPARC: | ||||
|  - fix IMIN/IMAX | ||||
| 
 | ||||
| ARMV8: | ||||
|  - added SVE-enabled CGEMM and ZGEMM kernels for ARMV8SVE and A64FX | ||||
|  - added support for Neoverse N2 and V1 cpus | ||||
| 
 | ||||
| MIPS,MIPS64: | ||||
|  - fixed autodetection of MSA capability | ||||
| 
 | ||||
| LOONGARCH64: | ||||
|  - added an optimized DGEMM kernel | ||||
| 
 | ||||
| ==================================================================== | ||||
| Version 0.3.19 | ||||
|  19-Dec-2021 | ||||
|  |  | |||
|  | @ -78,6 +78,66 @@ endif | |||
| endif | ||||
| endif | ||||
| 
 | ||||
| # Use a72 tunings because Neoverse-V1 is only available
 | ||||
| # in GCC>=9.4
 | ||||
| ifeq ($(CORE), NEOVERSEV1) | ||||
| ifeq (1, $(filter 1,$(GCCVERSIONGTEQ7) $(ISCLANG))) | ||||
| ifeq ($(GCCVERSIONGTEQ9), 1) | ||||
| ifeq (1, $(filter 1,$(GCCMINORVERSIONGTEQ4) $(GCCVERSIONGTEQ10))) | ||||
| CCOMMON_OPT += -march=armv8.4-a -mtune=neoverse-v1 | ||||
| ifneq ($(F_COMPILER), NAG) | ||||
| FCOMMON_OPT += -march=armv8.4-a -mtune=neoverse-v1 | ||||
| endif | ||||
| else | ||||
| CCOMMON_OPT += -march=armv8.4-a -mtune=native | ||||
| ifneq ($(F_COMPILER), NAG) | ||||
| FCOMMON_OPT += -march=armv8.4-a -mtune=native | ||||
| endif | ||||
| endif | ||||
| else | ||||
| CCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a72 | ||||
| ifneq ($(F_COMPILER), NAG) | ||||
| FCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a72 | ||||
| endif | ||||
| endif | ||||
| else | ||||
| CCOMMON_OPT += -march=armv8-a -mtune=cortex-a72 | ||||
| ifneq ($(F_COMPILER), NAG) | ||||
| FCOMMON_OPT += -march=armv8-a -mtune=cortex-a72 | ||||
| endif | ||||
| endif | ||||
| endif | ||||
| 
 | ||||
| # Use a72 tunings because Neoverse-N2 is only available
 | ||||
| # in GCC>=9.4
 | ||||
| ifeq ($(CORE), NEOVERSEN2) | ||||
| ifeq (1, $(filter 1,$(GCCVERSIONGTEQ7) $(ISCLANG))) | ||||
| ifeq ($(GCCVERSIONGTEQ9), 1) | ||||
| ifeq (1, $(filter 1,$(GCCMINORVERSIONGTEQ4) $(GCCVERSIONGTEQ10))) | ||||
| CCOMMON_OPT += -march=armv8.5-a -mtune=neoverse-n2 | ||||
| ifneq ($(F_COMPILER), NAG) | ||||
| FCOMMON_OPT += -march=armv8.5-a -mtune=neoverse-n2 | ||||
| endif | ||||
| else | ||||
| CCOMMON_OPT += -march=armv8.5-a -mtune=native | ||||
| ifneq ($(F_COMPILER), NAG) | ||||
| FCOMMON_OPT += -march=armv8.5-a -mtune=native | ||||
| endif | ||||
| endif | ||||
| else | ||||
| CCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a72 | ||||
| ifneq ($(F_COMPILER), NAG) | ||||
| FCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a72 | ||||
| endif | ||||
| endif | ||||
| else | ||||
| CCOMMON_OPT += -march=armv8-a -mtune=cortex-a72 | ||||
| ifneq ($(F_COMPILER), NAG) | ||||
| FCOMMON_OPT += -march=armv8-a -mtune=cortex-a72 | ||||
| endif | ||||
| endif | ||||
| endif | ||||
| 
 | ||||
| # Use a53 tunings because a55 is only available in GCC>=8.1
 | ||||
| ifeq ($(CORE), CORTEXA55) | ||||
| ifeq (1, $(filter 1,$(GCCVERSIONGTEQ7) $(ISCLANG))) | ||||
|  |  | |||
|  | @ -0,0 +1 @@ | |||
| COPT	= -Wall -O2 # -DGEMMTEST | ||||
|  | @ -3,6 +3,10 @@ | |||
| export BINARY | ||||
| export USE_OPENMP | ||||
| 
 | ||||
| ifdef DYNAMIC_ARCH | ||||
| override HOST_CFLAGS += -DDYNAMIC_ARCH | ||||
| endif | ||||
| 
 | ||||
| ifdef TARGET_CORE | ||||
| TARGET_MAKE = Makefile_kernel.conf | ||||
| TARGET_CONF = config_kernel.h | ||||
|  |  | |||
|  | @ -3,7 +3,7 @@ | |||
| #
 | ||||
| 
 | ||||
| # This library's version
 | ||||
| VERSION = 0.3.19 | ||||
| VERSION = 0.3.19.dev | ||||
| 
 | ||||
| # If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a
 | ||||
| # and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library
 | ||||
|  |  | |||
|  | @ -277,7 +277,7 @@ HAVE_GAS := $(shell $(AS) -v < /dev/null 2>&1 | grep GNU 2>&1 >/dev/null ; echo | |||
| GETARCH_FLAGS += -DHAVE_GAS=$(HAVE_GAS) | ||||
| 
 | ||||
| # Generating Makefile.conf and config.h
 | ||||
| DUMMY := $(shell $(MAKE) -C $(TOPDIR) -f Makefile.prebuild CC="$(CC)" FC="$(FC)" HOSTCC="$(HOSTCC)" HOST_CFLAGS="$(GETARCH_FLAGS)" CFLAGS="$(CFLAGS)" BINARY=$(BINARY) USE_OPENMP=$(USE_OPENMP) TARGET_CORE=$(TARGET_CORE) ONLY_CBLAS=$(ONLY_CBLAS) TARGET=$(TARGET) all) | ||||
| DUMMY := $(shell $(MAKE) -C $(TOPDIR) -f Makefile.prebuild CC="$(CC)" FC="$(FC)" HOSTCC="$(HOSTCC)" HOST_CFLAGS="$(GETARCH_FLAGS)" CFLAGS="$(CFLAGS)" BINARY=$(BINARY) USE_OPENMP=$(USE_OPENMP) DYNAMIC_ARCH=$(DYNAMIC_ARCH) TARGET_CORE=$(TARGET_CORE) ONLY_CBLAS=$(ONLY_CBLAS) TARGET=$(TARGET) all) | ||||
| 
 | ||||
| ifndef TARGET_CORE | ||||
| include $(TOPDIR)/Makefile.conf | ||||
|  | @ -374,6 +374,7 @@ else | |||
| endif | ||||
| GCCMINORVERSIONGTEQ1 := $(shell expr `$(CC) $(GCCDUMPVERSION_PARAM) | cut -f2 -d.` \>= 1) | ||||
| GCCMINORVERSIONGTEQ2 := $(shell expr `$(CC) $(GCCDUMPVERSION_PARAM) | cut -f2 -d.` \>= 2) | ||||
| GCCMINORVERSIONGTEQ4 := $(shell expr `$(CC) $(GCCDUMPVERSION_PARAM) | cut -f2 -d.` \>= 4) | ||||
| GCCMINORVERSIONGTEQ7 := $(shell expr `$(CC) $(GCCDUMPVERSION_PARAM) | cut -f2 -d.` \>= 7) | ||||
| endif | ||||
| 
 | ||||
|  | @ -654,6 +655,8 @@ DYNAMIC_CORE += CORTEXA57 | |||
| DYNAMIC_CORE += CORTEXA72 | ||||
| DYNAMIC_CORE += CORTEXA73 | ||||
| DYNAMIC_CORE += NEOVERSEN1 | ||||
| DYNAMIC_CORE += NEOVERSEV1 | ||||
| DYNAMIC_CORE += NEOVERSEN2 | ||||
| DYNAMIC_CORE += CORTEXA55 | ||||
| DYNAMIC_CORE += FALKOR | ||||
| DYNAMIC_CORE += THUNDERX | ||||
|  |  | |||
|  | @ -93,6 +93,8 @@ CORTEXA57 | |||
| CORTEXA72 | ||||
| CORTEXA73 | ||||
| NEOVERSEN1 | ||||
| NEOVERSEV1 | ||||
| NEOVERSEN2 | ||||
| CORTEXA55 | ||||
| EMAG8180 | ||||
| FALKOR | ||||
|  | @ -113,3 +115,7 @@ C910V | |||
| 
 | ||||
| 11.LOONGARCH64: | ||||
| LOONGSON3R5 | ||||
| 
 | ||||
| 12. Elbrus E2000: | ||||
| E2K | ||||
| 
 | ||||
|  |  | |||
|  | @ -224,7 +224,7 @@ jobs: | |||
| 
 | ||||
| - job: OSX_IOS_ARMV8 | ||||
|   pool: | ||||
|      vmImage: 'macOS-10.15' | ||||
|      vmImage: 'macOS-11' | ||||
|   variables: | ||||
|      CC: /Applications/Xcode_12.4.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang | ||||
|      CFLAGS: -O2 -Wno-macro-redefined -isysroot /Applications/Xcode_12.4.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS14.4.sdk -arch arm64 -miphoneos-version-min=10.0 | ||||
|  |  | |||
							
								
								
									
										7
									
								
								c_check
								
								
								
								
							
							
						
						
									
										7
									
								
								c_check
								
								
								
								
							|  | @ -84,6 +84,7 @@ $os = Haiku           if ($data =~ /OS_HAIKU/); | |||
| 
 | ||||
| $architecture = x86          if ($data =~ /ARCH_X86/); | ||||
| $architecture = x86_64       if ($data =~ /ARCH_X86_64/); | ||||
| $architecture = e2k          if ($data =~ /ARCH_E2K/); | ||||
| $architecture = power        if ($data =~ /ARCH_POWER/); | ||||
| $architecture = mips         if ($data =~ /ARCH_MIPS/); | ||||
| $architecture = mips64       if ($data =~ /ARCH_MIPS64/); | ||||
|  | @ -124,6 +125,11 @@ if ($architecture eq "zarch") { | |||
|     $binary = 64; | ||||
| } | ||||
| 
 | ||||
| if ($architecture eq "e2k") { | ||||
|     $defined = 1; | ||||
|     $binary = 64; | ||||
| } | ||||
| 
 | ||||
| if ($architecture eq "alpha") { | ||||
|     $defined = 1; | ||||
|     $binary = 64; | ||||
|  | @ -223,6 +229,7 @@ if (($architecture eq "mips") || ($architecture eq "mips64")) { | |||
| 
 | ||||
| $architecture = x86          if ($data =~ /ARCH_X86/); | ||||
| $architecture = x86_64       if ($data =~ /ARCH_X86_64/); | ||||
| $architecture = e2k          if ($data =~ /ARCH_E2K/); | ||||
| $architecture = power        if ($data =~ /ARCH_POWER/); | ||||
| $architecture = mips         if ($data =~ /ARCH_MIPS/); | ||||
| $architecture = mips64       if ($data =~ /ARCH_MIPS64/); | ||||
|  |  | |||
|  | @ -44,7 +44,7 @@ endif () | |||
| 
 | ||||
| if (DYNAMIC_ARCH) | ||||
|   if (ARM64) | ||||
| 	  set(DYNAMIC_CORE ARMV8 CORTEXA53 CORTEXA55 CORTEXA57 CORTEXA72 CORTEXA73 FALKOR THUNDERX THUNDERX2T99 TSV110 EMAG8180 NEOVERSEN1 THUNDERX3T110) | ||||
| 	  set(DYNAMIC_CORE ARMV8 CORTEXA53 CORTEXA55 CORTEXA57 CORTEXA72 CORTEXA73 FALKOR THUNDERX THUNDERX2T99 TSV110 EMAG8180 NEOVERSEN1 NEOVERSEV1 NEOVERSEN2 THUNDERX3T110) | ||||
|     if (DYNAMIC_LIST) | ||||
| 	    set(DYNAMIC_CORE ARMV8 ${DYNAMIC_LIST}) | ||||
|     endif () | ||||
|  |  | |||
|  | @ -20,19 +20,16 @@ | |||
| # NEEDBUNDERSCORE | ||||
| # NEED2UNDERSCORES | ||||
| 
 | ||||
| if (NOT NO_LAPACK) | ||||
|   include(CheckLanguage) | ||||
|   check_language(Fortran) | ||||
|   if(CMAKE_Fortran_COMPILER) | ||||
|     enable_language(Fortran) | ||||
|   else() | ||||
|   message(STATUS "No Fortran compiler found, can build only BLAS but not LAPACK") | ||||
| include(CheckLanguage) | ||||
| check_language(Fortran) | ||||
| if(CMAKE_Fortran_COMPILER) | ||||
|   enable_language(Fortran) | ||||
| else() | ||||
|   if (NOT NO_LAPACK) | ||||
|     message(STATUS "No Fortran compiler found, can build only BLAS but not LAPACK") | ||||
|   endif() | ||||
|   set (NOFORTRAN 1) | ||||
|   set (NO_LAPACK 1) | ||||
|   endif() | ||||
| else() | ||||
|   include(CMakeForceCompiler) | ||||
|   CMAKE_FORCE_Fortran_COMPILER(gfortran GNU) | ||||
| endif() | ||||
| 
 | ||||
| if (NOT ONLY_CBLAS) | ||||
|  |  | |||
|  | @ -127,6 +127,10 @@ if (DEFINED CORE AND CMAKE_CROSSCOMPILING AND NOT (${HOST_OS} STREQUAL "WINDOWSS | |||
|       "#define DLOCAL_BUFFER_SIZE\t16384\n" | ||||
|       "#define CLOCAL_BUFFER_SIZE\t16384\n" | ||||
|       "#define ZLOCAL_BUFFER_SIZE\t16384\n") | ||||
|       set(HAVE_SSE 1) | ||||
|       set(HAVE_SSE2 1) | ||||
|       set(HAVE_SSE3 1) | ||||
|       set(HAVE_SSSE3 1) | ||||
|       set(SGEMM_UNROLL_M 8) | ||||
|       set(SGEMM_UNROLL_N 4) | ||||
|       set(DGEMM_UNROLL_M 4) | ||||
|  | @ -243,11 +247,11 @@ endif () | |||
|       "#define L1_CODE_ASSOCIATIVE\t4\n" | ||||
|       "#define L1_DATA_SIZE\t65536\n" | ||||
|       "#define L1_DATA_LINESIZE\t64\n" | ||||
|       "#define L1_DATA_ASSOCIATIVE\t2\n" | ||||
|       "#define L1_DATA_ASSOCIATIVE\t4\n" | ||||
|       "#define L2_SIZE\t1048576\n\n" | ||||
|       "#define L2_LINESIZE\t64\n" | ||||
|       "#define L2_ASSOCIATIVE\t16\n" | ||||
|       "#define DTB_DEFAULT_ENTRIES\t64\n" | ||||
|       "#define L2_ASSOCIATIVE\t8\n" | ||||
|       "#define DTB_DEFAULT_ENTRIES\t48\n" | ||||
|       "#define DTB_SIZE\t4096\n" | ||||
|       "#define HAVE_VFPV4\n" | ||||
|       "#define HAVE_VFPV3\n" | ||||
|  | @ -263,6 +267,62 @@ endif () | |||
|     set(ZGEMM_UNROLL_M 4) | ||||
|     set(ZGEMM_UNROLL_N 4) | ||||
|     set(SYMV_P 16) | ||||
|   elseif ("${TCORE}" STREQUAL "NEOVERSEV1") | ||||
|     file(APPEND ${TARGET_CONF_TEMP} | ||||
|       "#define L1_CODE_SIZE\t65536\n" | ||||
|       "#define L1_CODE_LINESIZE\t64\n" | ||||
|       "#define L1_CODE_ASSOCIATIVE\t4\n" | ||||
|       "#define L1_DATA_SIZE\t65536\n" | ||||
|       "#define L1_DATA_LINESIZE\t64\n" | ||||
|       "#define L1_DATA_ASSOCIATIVE\t4\n" | ||||
|       "#define L2_SIZE\t1048576\n\n" | ||||
|       "#define L2_LINESIZE\t64\n" | ||||
|       "#define L2_ASSOCIATIVE\t8\n" | ||||
|       "#define DTB_DEFAULT_ENTRIES\t48\n" | ||||
|       "#define DTB_SIZE\t4096\n" | ||||
|       "#define HAVE_VFPV4\n" | ||||
|       "#define HAVE_VFPV3\n" | ||||
|       "#define HAVE_VFP\n" | ||||
|       "#define HAVE_NEON\n" | ||||
|       "#define HAVE_SVE\n" | ||||
|       "#define ARMV8\n") | ||||
|     set(SGEMM_UNROLL_M 16) | ||||
|     set(SGEMM_UNROLL_N 4) | ||||
|     set(DGEMM_UNROLL_M 8) | ||||
|     set(DGEMM_UNROLL_N 4) | ||||
|     set(CGEMM_UNROLL_M 8) | ||||
|     set(CGEMM_UNROLL_N 4) | ||||
|     set(ZGEMM_UNROLL_M 4) | ||||
|     set(ZGEMM_UNROLL_N 4) | ||||
|     set(SYMV_P 16) | ||||
|   elseif ("${TCORE}" STREQUAL "NEOVERSEN2") | ||||
|     file(APPEND ${TARGET_CONF_TEMP} | ||||
|       "#define L1_CODE_SIZE\t65536\n" | ||||
|       "#define L1_CODE_LINESIZE\t64\n" | ||||
|       "#define L1_CODE_ASSOCIATIVE\t4\n" | ||||
|       "#define L1_DATA_SIZE\t65536\n" | ||||
|       "#define L1_DATA_LINESIZE\t64\n" | ||||
|       "#define L1_DATA_ASSOCIATIVE\t4\n" | ||||
|       "#define L2_SIZE\t1048576\n\n" | ||||
|       "#define L2_LINESIZE\t64\n" | ||||
|       "#define L2_ASSOCIATIVE\t8\n" | ||||
|       "#define DTB_DEFAULT_ENTRIES\t48\n" | ||||
|       "#define DTB_SIZE\t4096\n" | ||||
|       "#define HAVE_VFPV4\n" | ||||
|       "#define HAVE_VFPV3\n" | ||||
|       "#define HAVE_VFP\n" | ||||
|       "#define HAVE_NEON\n" | ||||
|       "#define HAVE_SVE\n" | ||||
|       "#define ARMV8\n") | ||||
|     set(SGEMM_UNROLL_M 16) | ||||
|     set(SGEMM_UNROLL_N 4) | ||||
|     set(DGEMM_UNROLL_M 8) | ||||
|     set(DGEMM_UNROLL_N 4) | ||||
|     set(CGEMM_UNROLL_M 8) | ||||
|     set(CGEMM_UNROLL_N 4) | ||||
|     set(ZGEMM_UNROLL_M 4) | ||||
|     set(ZGEMM_UNROLL_N 4) | ||||
|     set(SYMV_P 16) | ||||
|   elseif ("${TCORE}" STREQUAL "FALKOR") | ||||
|     file(APPEND ${TARGET_CONF_TEMP} | ||||
|       "#define L1_CODE_SIZE\t65536\n" | ||||
|  |  | |||
|  | @ -125,7 +125,7 @@ macro(ParseMakefileVars MAKEFILE_IN) | |||
|     if (NOT "${line_match}" STREQUAL "") | ||||
|       #message(STATUS "${CMAKE_MATCH_1} first: ${CMAKE_MATCH_2}") | ||||
|       set (ElseSeen 0) | ||||
|       if (DEFINED ${CMAKE_MATCH_2}) | ||||
|       if (${CMAKE_MATCH_2}) | ||||
|         if (${CMAKE_MATCH_1} STREQUAL "ifdef") | ||||
|           #message (STATUS "condition is true") | ||||
|           set (IfElse 1) | ||||
|  |  | |||
							
								
								
									
										4
									
								
								common.h
								
								
								
								
							
							
						
						
									
										4
									
								
								common.h
								
								
								
								
							|  | @ -474,6 +474,10 @@ please https://github.com/xianyi/OpenBLAS/issues/246 | |||
| #include "common_loongarch64.h" | ||||
| #endif | ||||
| 
 | ||||
| #ifdef ARCH_E2K | ||||
| #include "common_e2k.h" | ||||
| #endif | ||||
| 
 | ||||
| #ifndef ASSEMBLER | ||||
| #ifdef OS_WINDOWSSTORE | ||||
| typedef char env_var_t[MAX_PATH]; | ||||
|  |  | |||
|  | @ -0,0 +1,64 @@ | |||
| /*****************************************************************************
 | ||||
| Copyright (c) 2011-2016, The OpenBLAS Project | ||||
| All rights reserved. | ||||
| 
 | ||||
| Redistribution and use in source and binary forms, with or without | ||||
| modification, are permitted provided that the following conditions are | ||||
| met: | ||||
| 
 | ||||
|    1. Redistributions of source code must retain the above copyright | ||||
|       notice, this list of conditions and the following disclaimer. | ||||
| 
 | ||||
|    2. Redistributions in binary form must reproduce the above copyright | ||||
|       notice, this list of conditions and the following disclaimer in | ||||
|       the documentation and/or other materials provided with the | ||||
|       distribution. | ||||
|    3. Neither the name of the OpenBLAS project nor the names of  | ||||
|       its contributors may be used to endorse or promote products  | ||||
|       derived from this software without specific prior written  | ||||
|       permission. | ||||
| 
 | ||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | ||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | ||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | ||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE | ||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | ||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | ||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | ||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | ||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | ||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | ||||
| **********************************************************************************/ | ||||
| 
 | ||||
| #ifndef COMMON_E2K | ||||
| #define COMMON_E2K | ||||
| 
 | ||||
| #ifdef ASSEMBLER | ||||
| #error | ||||
| #endif | ||||
| 
 | ||||
| #define MB do { __asm__ __volatile__("": : :"memory"); } while (0) | ||||
| #define WMB do { __asm__ __volatile__("": : :"memory"); } while (0) | ||||
| #define RMB | ||||
| 
 | ||||
| #define INLINE __attribute__((__always_inline__)) inline | ||||
| 
 | ||||
| static inline int blas_quickdivide(blasint x, blasint y) { | ||||
|   return x / y; | ||||
| } | ||||
| 
 | ||||
| #ifndef PAGESIZE | ||||
| #define PAGESIZE	( 4 << 10) | ||||
| #endif | ||||
| #define HUGE_PAGESIZE	( 2 << 20) | ||||
| 
 | ||||
| #ifndef BUFFERSIZE | ||||
| #define BUFFER_SIZE	(32 << 20) | ||||
| #else | ||||
| #define BUFFER_SIZE	(32 << BUFFERSIZE) | ||||
| #endif | ||||
| 
 | ||||
| #define SEEK_ADDRESS | ||||
| 
 | ||||
| #endif | ||||
| 
 | ||||
|  | @ -2611,7 +2611,7 @@ | |||
| 
 | ||||
| #ifndef ASSEMBLER | ||||
| #if defined(ARCH_X86) || defined(ARCH_X86_64) || defined(ARCH_IA64) || defined(ARCH_MIPS64) || defined(ARCH_ARM64)\ | ||||
| || defined(ARCH_LOONGARCH64) | ||||
| || defined(ARCH_LOONGARCH64) || defined(ARCH_E2K) | ||||
| extern BLASLONG gemm_offset_a; | ||||
| extern BLASLONG gemm_offset_b; | ||||
| extern BLASLONG sbgemm_p; | ||||
|  |  | |||
|  | @ -43,6 +43,8 @@ size_t length64=sizeof(value64); | |||
| #define CPU_CORTEXA72     4 | ||||
| #define CPU_CORTEXA73     5 | ||||
| #define CPU_NEOVERSEN1    11 | ||||
| #define CPU_NEOVERSEV1    16 | ||||
| #define CPU_NEOVERSEN2    17 | ||||
| // Qualcomm
 | ||||
| #define CPU_FALKOR        6 | ||||
| // Cavium
 | ||||
|  | @ -71,6 +73,8 @@ static char *cpuname[] = { | |||
|   "TSV110", | ||||
|   "EMAG8180", | ||||
|   "NEOVERSEN1", | ||||
|   "NEOVERSEV1", | ||||
|   "NEOVERSEN2", | ||||
|   "THUNDERX3T110", | ||||
|   "VORTEX", | ||||
|   "CORTEXA55", | ||||
|  | @ -90,6 +94,8 @@ static char *cpuname_lower[] = { | |||
|   "tsv110", | ||||
|   "emag8180", | ||||
|   "neoversen1", | ||||
|   "neoversev1", | ||||
|   "neoversen2", | ||||
|   "thunderx3t110", | ||||
|   "vortex", | ||||
|   "cortexa55", | ||||
|  | @ -170,6 +176,10 @@ int detect(void) | |||
|         return CPU_CORTEXA73; | ||||
|       else if (strstr(cpu_part, "0xd0c")) | ||||
|         return CPU_NEOVERSEN1; | ||||
|       else if (strstr(cpu_part, "0xd40")) | ||||
|         return CPU_NEOVERSEV1; | ||||
|       else if (strstr(cpu_part, "0xd49")) | ||||
|         return CPU_NEOVERSEN2; | ||||
|       else if (strstr(cpu_part, "0xd05")) | ||||
| 	return CPU_CORTEXA55; | ||||
|     } | ||||
|  | @ -338,11 +348,41 @@ void get_cpuconfig(void) | |||
| 		printf("#define L1_DATA_ASSOCIATIVE 4\n"); | ||||
| 		printf("#define L2_SIZE 1048576\n"); | ||||
| 		printf("#define L2_LINESIZE 64\n"); | ||||
| 		printf("#define L2_ASSOCIATIVE 16\n"); | ||||
| 		printf("#define DTB_DEFAULT_ENTRIES 64\n"); | ||||
| 		printf("#define L2_ASSOCIATIVE 8\n"); | ||||
| 		printf("#define DTB_DEFAULT_ENTRIES 48\n"); | ||||
| 		printf("#define DTB_SIZE 4096\n"); | ||||
| 		break; | ||||
| 
 | ||||
| 	    case CPU_NEOVERSEV1: | ||||
|                 printf("#define %s\n", cpuname[d]); | ||||
|                 printf("#define L1_CODE_SIZE 65536\n"); | ||||
|                 printf("#define L1_CODE_LINESIZE 64\n"); | ||||
|                 printf("#define L1_CODE_ASSOCIATIVE 4\n"); | ||||
|                 printf("#define L1_DATA_SIZE 65536\n"); | ||||
|                 printf("#define L1_DATA_LINESIZE 64\n"); | ||||
|                 printf("#define L1_DATA_ASSOCIATIVE 4\n"); | ||||
|                 printf("#define L2_SIZE 1048576\n"); | ||||
|                 printf("#define L2_LINESIZE 64\n"); | ||||
|                 printf("#define L2_ASSOCIATIVE 8\n"); | ||||
|                 printf("#define DTB_DEFAULT_ENTRIES 48\n"); | ||||
|                 printf("#define DTB_SIZE 4096\n"); | ||||
|                 break; | ||||
| 
 | ||||
| 	    case CPU_NEOVERSEN2: | ||||
|                 printf("#define %s\n", cpuname[d]); | ||||
|                 printf("#define L1_CODE_SIZE 65536\n"); | ||||
|                 printf("#define L1_CODE_LINESIZE 64\n"); | ||||
|                 printf("#define L1_CODE_ASSOCIATIVE 4\n"); | ||||
|                 printf("#define L1_DATA_SIZE 65536\n"); | ||||
|                 printf("#define L1_DATA_LINESIZE 64\n"); | ||||
|                 printf("#define L1_DATA_ASSOCIATIVE 4\n"); | ||||
|                 printf("#define L2_SIZE 1048576\n"); | ||||
|                 printf("#define L2_LINESIZE 64\n"); | ||||
|                 printf("#define L2_ASSOCIATIVE 8\n"); | ||||
|                 printf("#define DTB_DEFAULT_ENTRIES 48\n"); | ||||
|                 printf("#define DTB_SIZE 4096\n"); | ||||
|                 break; | ||||
| 
 | ||||
| 	    case CPU_FALKOR: | ||||
| 	        printf("#define FALKOR\n"); | ||||
| 	        printf("#define L1_CODE_SIZE 65536\n"); | ||||
|  |  | |||
|  | @ -165,7 +165,7 @@ void get_cpuconfig(void){ | |||
|   }else{ | ||||
|     printf("#define UNKNOWN\n"); | ||||
|   } | ||||
|   if (!get_feature(msa)) printf("#define NO_MSA\n"); | ||||
|   if (!get_feature("msa")) printf("#define NO_MSA\n"); | ||||
| } | ||||
| 
 | ||||
| void get_libname(void){ | ||||
|  | @ -193,7 +193,7 @@ int get_feature(char *search) | |||
|         while (fgets(buffer, sizeof(buffer), infile)) | ||||
|         { | ||||
| 
 | ||||
|                 if (!strncmp("Features", buffer, 8)) | ||||
|                 if (!strncmp("Features", buffer, 8) || !strncmp("ASEs implemented", buffer, 16)) | ||||
|                 { | ||||
|                         p = strchr(buffer, ':') + 2; | ||||
|                         break; | ||||
|  | @ -207,7 +207,7 @@ int get_feature(char *search) | |||
|         t = strtok(p," "); | ||||
|         while( t = strtok(NULL," ")) | ||||
|         { | ||||
|                 if (!strcmp(t, search))   { return(1); } | ||||
|                 if (strstr(t, search))   { return(1); } | ||||
|         } | ||||
| 
 | ||||
| #endif | ||||
|  |  | |||
|  | @ -201,7 +201,7 @@ void get_cpuconfig(void){ | |||
|     printf("#define DTB_SIZE 4096\n"); | ||||
|     printf("#define L2_ASSOCIATIVE 8\n"); | ||||
|   } | ||||
|   if (!get_feature(msa)) printf("#define NO_MSA\n"); | ||||
|   if (!get_feature("msa")) printf("#define NO_MSA\n"); | ||||
| } | ||||
| 
 | ||||
| void get_libname(void){ | ||||
|  | @ -233,7 +233,7 @@ int get_feature(char *search) | |||
|         while (fgets(buffer, sizeof(buffer), infile)) | ||||
|         { | ||||
| 
 | ||||
|                 if (!strncmp("Features", buffer, 8)) | ||||
|                 if (!strncmp("Features", buffer, 8) || !strncmp("ASEs implemented", buffer, 16)) | ||||
|                 { | ||||
|                         p = strchr(buffer, ':') + 2; | ||||
|                         break; | ||||
|  | @ -247,7 +247,7 @@ int get_feature(char *search) | |||
|         t = strtok(p," "); | ||||
|         while( t = strtok(NULL," ")) | ||||
|         { | ||||
|                 if (!strcmp(t, search))   { return(1); } | ||||
|                 if (strstr(t, search))   { return(1); } | ||||
|         } | ||||
| 
 | ||||
| #endif | ||||
|  |  | |||
							
								
								
									
										11
									
								
								cpuid_x86.c
								
								
								
								
							
							
						
						
									
										11
									
								
								cpuid_x86.c
								
								
								
								
							|  | @ -323,9 +323,11 @@ int get_vendor(void){ | |||
| 
 | ||||
| int get_cputype(int gettype){ | ||||
|   int eax, ebx, ecx, edx; | ||||
| /*
 | ||||
|   int extend_family, family; | ||||
|   int extend_model, model; | ||||
|   int type, stepping; | ||||
| */ | ||||
|   int feature = 0; | ||||
| 
 | ||||
|   cpuid(1, &eax, &ebx, &ecx, &edx); | ||||
|  | @ -428,7 +430,8 @@ int get_cacheinfo(int type, cache_info_t *cacheinfo){ | |||
|   cpuid(0, &cpuid_level, &ebx, &ecx, &edx); | ||||
| 
 | ||||
|   if (cpuid_level > 1) { | ||||
|     int numcalls =0 ; | ||||
|     int numcalls; | ||||
|      | ||||
|     cpuid(2, &eax, &ebx, &ecx, &edx); | ||||
|     numcalls = BITMASK(eax, 0, 0xff); //FIXME some systems may require repeated calls to read all entries
 | ||||
|     info[ 0] = BITMASK(eax,  8, 0xff); | ||||
|  | @ -1492,6 +1495,10 @@ int get_cpuname(void){ | |||
|         switch (model) { | ||||
|         case 7: // Alder Lake desktop
 | ||||
|         case 10: // Alder Lake mobile
 | ||||
| 	  if(support_avx512_bf16()) | ||||
|             return CPUTYPE_COOPERLAKE;	 | ||||
|           if(support_avx512()) | ||||
|             return CPUTYPE_SKYLAKEX; | ||||
|           if(support_avx2()) | ||||
|             return CPUTYPE_HASWELL; | ||||
|           if(support_avx()) | ||||
|  | @ -1637,7 +1644,6 @@ int get_cpuname(void){ | |||
| 	  else | ||||
| 	    return CPUTYPE_BARCELONA; | ||||
|         } | ||||
| 	break;	       | ||||
|       case 10: // Zen3		      
 | ||||
| 	if(support_avx()) | ||||
| #ifndef NO_AVX2 | ||||
|  | @ -2193,7 +2199,6 @@ int get_coretype(void){ | |||
| 	  else | ||||
| 	    return CORE_NEHALEM; | ||||
| #endif	 | ||||
|         break;    	 | ||||
| 
 | ||||
|       case 7: | ||||
|         if (model == 10)  | ||||
|  |  | |||
							
								
								
									
										4
									
								
								ctest.c
								
								
								
								
							
							
						
						
									
										4
									
								
								ctest.c
								
								
								
								
							|  | @ -165,3 +165,7 @@ ARCH_LOONGARCH64 | |||
| HAVE_C11 | ||||
| #endif | ||||
| 
 | ||||
| #if defined(__e2k__) | ||||
| ARCH_E2K | ||||
| #endif | ||||
| 
 | ||||
|  |  | |||
|  | @ -64,9 +64,9 @@ CBLASOBJS += \ | |||
| 	chpmv_U.$(SUFFIX) chpmv_L.$(SUFFIX) chpmv_V.$(SUFFIX) chpmv_M.$(SUFFIX) \
 | ||||
| 	chpr_U.$(SUFFIX)  chpr_L.$(SUFFIX)  chpr_V.$(SUFFIX)  chpr_M.$(SUFFIX) \
 | ||||
| 	chpr2_U.$(SUFFIX) chpr2_L.$(SUFFIX) chpr2_V.$(SUFFIX) chpr2_M.$(SUFFIX) \
 | ||||
| 	csbmv_U.$(SUFFIX) csbmv_L.$(SUFFIX) cspmv_U.$(SUFFIX) cspmv_L.$(SUFFIX) \
 | ||||
| 	cspr_U.$(SUFFIX)  cspr_L.$(SUFFIX)  cspr2_U.$(SUFFIX) cspr2_L.$(SUFFIX) \
 | ||||
| 	csyr_U.$(SUFFIX)  csyr_L.$(SUFFIX)  csyr2_U.$(SUFFIX) csyr2_L.$(SUFFIX) \
 | ||||
| 	csbmv_U.$(SUFFIX) csbmv_L.$(SUFFIX) \
 | ||||
| 	cspr2_U.$(SUFFIX) cspr2_L.$(SUFFIX) \
 | ||||
| 	csyr2_U.$(SUFFIX) csyr2_L.$(SUFFIX) \
 | ||||
| 	ctbmv_NUU.$(SUFFIX) ctbmv_NUN.$(SUFFIX) ctbmv_NLU.$(SUFFIX) ctbmv_NLN.$(SUFFIX) \
 | ||||
| 	ctbmv_TUU.$(SUFFIX) ctbmv_TUN.$(SUFFIX) ctbmv_TLU.$(SUFFIX) ctbmv_TLN.$(SUFFIX) \
 | ||||
| 	ctbmv_RUU.$(SUFFIX) ctbmv_RUN.$(SUFFIX) ctbmv_RLU.$(SUFFIX) ctbmv_RLN.$(SUFFIX) \
 | ||||
|  | @ -92,6 +92,13 @@ CBLASOBJS += \ | |||
| 	ctrsv_RUU.$(SUFFIX) ctrsv_RUN.$(SUFFIX) ctrsv_RLU.$(SUFFIX) ctrsv_RLN.$(SUFFIX) \
 | ||||
| 	ctrsv_CUU.$(SUFFIX) ctrsv_CUN.$(SUFFIX) ctrsv_CLU.$(SUFFIX) ctrsv_CLN.$(SUFFIX) | ||||
| 
 | ||||
| ifndef NO_LAPACK | ||||
| CBLASOBJS += \
 | ||||
| 	cspmv_U.$(SUFFIX) cspmv_L.$(SUFFIX) \
 | ||||
| 	cspr_U.$(SUFFIX)  cspr_L.$(SUFFIX)  \
 | ||||
| 	csyr_U.$(SUFFIX)  csyr_L.$(SUFFIX)   | ||||
| endif | ||||
| 
 | ||||
| ZBLASOBJS += \
 | ||||
| 	zgbmv_n.$(SUFFIX) zgbmv_t.$(SUFFIX) zgbmv_r.$(SUFFIX) zgbmv_c.$(SUFFIX) \
 | ||||
| 	zgbmv_o.$(SUFFIX) zgbmv_u.$(SUFFIX) zgbmv_s.$(SUFFIX) zgbmv_d.$(SUFFIX) \
 | ||||
|  |  | |||
|  | @ -209,7 +209,8 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){ | |||
| 	    /* REAL / Double */ | ||||
| 	    void (*afunc)(BLASLONG, BLASLONG, BLASLONG, double, | ||||
| 			  double *, BLASLONG, double *, BLASLONG, | ||||
| 			  double *, BLASLONG, void *) = func; | ||||
| 			  double *, BLASLONG, void *) =  (void (*)(BLASLONG, BLASLONG, BLASLONG, double, double *, BLASLONG,  | ||||
| 			  double *, BLASLONG, double *, BLASLONG, void *)) func; | ||||
| 
 | ||||
| 	    afunc(args -> m, args -> n, args -> k, | ||||
| 		  ((double *)args -> alpha)[0], | ||||
|  | @ -220,7 +221,10 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){ | |||
|             /* REAL / Single */ | ||||
|             void (*afunc)(BLASLONG, BLASLONG, BLASLONG, float, | ||||
|                           float *, BLASLONG, float *, BLASLONG, | ||||
|                           float *, BLASLONG, void *) = func; | ||||
|                           float *, BLASLONG, void *) = (void (*) | ||||
|                           (BLASLONG, BLASLONG, BLASLONG, float, | ||||
|                           float *, BLASLONG, float *, BLASLONG, | ||||
|                           float *, BLASLONG, void *)) func; | ||||
| 
 | ||||
|             afunc(args -> m, args -> n, args -> k, | ||||
|                   ((float *)args -> alpha)[0], | ||||
|  | @ -232,7 +236,9 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){ | |||
|             /* REAL / BFLOAT16 */ | ||||
|             void (*afunc)(BLASLONG, BLASLONG, BLASLONG, bfloat16, | ||||
|                           bfloat16 *, BLASLONG, bfloat16 *, BLASLONG, | ||||
|                           bfloat16 *, BLASLONG, void *) = func; | ||||
|                           bfloat16 *, BLASLONG, void *) = (void (*)(BLASLONG, BLASLONG, BLASLONG, bfloat16, | ||||
|                           bfloat16 *, BLASLONG, bfloat16 *, BLASLONG, | ||||
|                           bfloat16 *, BLASLONG, void *)) func; | ||||
| 
 | ||||
|             afunc(args -> m, args -> n, args -> k, | ||||
|                   ((bfloat16 *)args -> alpha)[0], | ||||
|  | @ -243,7 +249,9 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){ | |||
|             /* REAL / BLAS_STOBF16 */ | ||||
|             void (*afunc)(BLASLONG, BLASLONG, BLASLONG, float, | ||||
|                           float *, BLASLONG, bfloat16 *, BLASLONG, | ||||
|                           float *, BLASLONG, void *) = func; | ||||
|                           float *, BLASLONG, void *) = (void (*)(BLASLONG, BLASLONG, BLASLONG, float, | ||||
|                           float *, BLASLONG, bfloat16 *, BLASLONG, | ||||
|                           float *, BLASLONG, void *)) func; | ||||
| 
 | ||||
|             afunc(args -> m, args -> n, args -> k, | ||||
|                   ((float *)args -> alpha)[0], | ||||
|  | @ -254,7 +262,9 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){ | |||
|             /* REAL / BLAS_DTOBF16 */ | ||||
|             void (*afunc)(BLASLONG, BLASLONG, BLASLONG, double, | ||||
|                           double *, BLASLONG, bfloat16 *, BLASLONG, | ||||
|                           double *, BLASLONG, void *) = func; | ||||
|                           double *, BLASLONG, void *) = (void (*)(BLASLONG, BLASLONG, BLASLONG, double, | ||||
|                           double *, BLASLONG, bfloat16 *, BLASLONG, | ||||
|                           double *, BLASLONG, void *)) func; | ||||
| 
 | ||||
|             afunc(args -> m, args -> n, args -> k, | ||||
|                   ((double *)args -> alpha)[0], | ||||
|  | @ -271,7 +281,9 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){ | |||
| 	  /* COMPLEX / Extended Double */ | ||||
| 	  void (*afunc)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, | ||||
| 			xdouble *, BLASLONG, xdouble *, BLASLONG, | ||||
| 			xdouble *, BLASLONG, void *) = func; | ||||
| 			xdouble *, BLASLONG, void *) = (void (*)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, | ||||
|                         xdouble *, BLASLONG, xdouble *, BLASLONG, | ||||
|                         xdouble *, BLASLONG, void *)) func; | ||||
| 
 | ||||
| 	  afunc(args -> m, args -> n, args -> k, | ||||
| 		((xdouble *)args -> alpha)[0], | ||||
|  | @ -285,7 +297,9 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){ | |||
| 	    /* COMPLEX / Double */ | ||||
| 	  void (*afunc)(BLASLONG, BLASLONG, BLASLONG, double, double, | ||||
| 			double *, BLASLONG, double *, BLASLONG, | ||||
| 			double *, BLASLONG, void *) = func; | ||||
| 			double *, BLASLONG, void *) = (void (*)(BLASLONG, BLASLONG, BLASLONG, double, double, | ||||
|                         double *, BLASLONG, double *, BLASLONG, | ||||
|                         double *, BLASLONG, void *)) func; | ||||
| 
 | ||||
| 	  afunc(args -> m, args -> n, args -> k, | ||||
| 		((double *)args -> alpha)[0], | ||||
|  | @ -297,7 +311,9 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){ | |||
| 	    /* COMPLEX / Single */ | ||||
| 	  void (*afunc)(BLASLONG, BLASLONG, BLASLONG, float, float, | ||||
| 			float *, BLASLONG, float *, BLASLONG, | ||||
| 			float *, BLASLONG, void *) = func; | ||||
| 			float *, BLASLONG, void *) = (void (*)(BLASLONG, BLASLONG, BLASLONG, float, float, | ||||
|                         float *, BLASLONG, float *, BLASLONG, | ||||
|                         float *, BLASLONG, void *)) func; | ||||
| 
 | ||||
| 	  afunc(args -> m, args -> n, args -> k, | ||||
| 		((float *)args -> alpha)[0], | ||||
|  | @ -425,7 +441,7 @@ blas_queue_t *tscq; | |||
| #endif | ||||
| 
 | ||||
|     if (queue) { | ||||
|       int (*routine)(blas_arg_t *, void *, void *, void *, void *, BLASLONG) = queue -> routine; | ||||
|       int (*routine)(blas_arg_t *, void *, void *, void *, void *, BLASLONG) = (int (*)(blas_arg_t *, void *, void *, void *, void *, BLASLONG))queue -> routine; | ||||
| 
 | ||||
|       atomic_store_queue(&thread_status[cpu].queue, (blas_queue_t *)1); | ||||
| 
 | ||||
|  | @ -503,7 +519,7 @@ blas_queue_t *tscq; | |||
| 	legacy_exec(routine, queue -> mode, queue -> args, sb); | ||||
|       } else | ||||
| 	if (queue -> mode & BLAS_PTHREAD) { | ||||
| 	  void (*pthreadcompat)(void *) = queue -> routine; | ||||
| 	  void (*pthreadcompat)(void *) = (void(*)(void*))queue -> routine; | ||||
| 	  (pthreadcompat)(queue -> args); | ||||
| 	} else | ||||
| 	  (routine)(queue -> args, queue -> range_m, queue -> range_n, sa, sb, queue -> position); | ||||
|  | @ -871,13 +887,13 @@ int exec_blas(BLASLONG num, blas_queue_t *queue){ | |||
|   fprintf(STDERR, "\n"); | ||||
| #endif | ||||
| 
 | ||||
|   routine = queue -> routine; | ||||
|   routine = (int (*)(blas_arg_t *, void *, void *, double *, double *, BLASLONG))queue -> routine; | ||||
| 
 | ||||
|   if (queue -> mode & BLAS_LEGACY) { | ||||
|     legacy_exec(routine, queue -> mode, queue -> args, queue -> sb); | ||||
|   } else | ||||
|     if (queue -> mode & BLAS_PTHREAD) { | ||||
|       void (*pthreadcompat)(void *) = queue -> routine; | ||||
|       void (*pthreadcompat)(void *) = (void (*)(void*))queue -> routine; | ||||
|       (pthreadcompat)(queue -> args); | ||||
|     } else | ||||
|       (routine)(queue -> args, queue -> range_m, queue -> range_n, | ||||
|  |  | |||
|  | @ -708,8 +708,11 @@ static gotoblas_t *get_coretype(void){ | |||
| 	 | ||||
|       case 9: | ||||
|         if (model == 7 || model == 10) { // Alder Lake
 | ||||
| 	   if(support_avx512_bf16()) | ||||
|              return &gotoblas_COOPERLAKE; | ||||
|           if (support_avx512())  | ||||
| 	    return &gotoblas_SKYLAKEX; | ||||
|           if(support_avx2()){ | ||||
|             openblas_warning(FALLBACK_VERBOSE, HASWELL_FALLBACK); | ||||
|             return &gotoblas_HASWELL; | ||||
|           } | ||||
|           if(support_avx()) { | ||||
|  |  | |||
|  | @ -147,6 +147,8 @@ static char *corename[] = { | |||
|   "tsv110", | ||||
|   "emag8180", | ||||
|   "neoversen1", | ||||
|   "neoversev1", | ||||
|   "neoversen2", | ||||
|   "thunderx3t110", | ||||
|   "cortexa55", | ||||
|   "unknown" | ||||
|  |  | |||
|  | @ -232,11 +232,11 @@ int get_num_procs(void); | |||
| #else | ||||
| int get_num_procs(void) { | ||||
|   static int nums = 0; | ||||
| 
 | ||||
|   int ret; | ||||
| #if defined(__GLIBC_PREREQ) | ||||
|   cpu_set_t cpuset,*cpusetp; | ||||
|   size_t size; | ||||
|   int ret; | ||||
| 
 | ||||
| #if !__GLIBC_PREREQ(2, 7) | ||||
|   int i; | ||||
| #if !__GLIBC_PREREQ(2, 6) | ||||
|  | @ -249,7 +249,8 @@ int get_num_procs(void) { | |||
| 
 | ||||
| #if defined(USE_OPENMP) | ||||
| #if _OPENMP >= 201511 | ||||
|     nums = omp_get_num_places(); | ||||
|     ret = omp_get_num_places(); | ||||
|     if (ret >0 ) nums = ret; | ||||
| #endif | ||||
|     return nums; | ||||
| #endif | ||||
|  | @ -1800,11 +1801,12 @@ int get_num_procs(void); | |||
| int get_num_procs(void) { | ||||
| 
 | ||||
|   static int nums = 0; | ||||
|   int ret; | ||||
| 	 | ||||
| #if defined(__GLIBC_PREREQ) | ||||
|   cpu_set_t cpuset,*cpusetp; | ||||
|   size_t size; | ||||
|   int ret; | ||||
| 
 | ||||
| #if !__GLIBC_PREREQ(2, 7) | ||||
|   int i; | ||||
| #if !__GLIBC_PREREQ(2, 6) | ||||
|  | @ -1818,7 +1820,8 @@ int get_num_procs(void) { | |||
| #if defined(USE_OPENMP) | ||||
| /*  if (omp_get_proc_bind() != omp_proc_bind_false) */ | ||||
| #if _OPENMP >= 201511 | ||||
|     nums = omp_get_num_places();	   | ||||
|     ret = omp_get_num_places(); | ||||
|     if (ret >0 ) nums = ret; | ||||
| #endif | ||||
|     return nums; | ||||
| #endif | ||||
|  |  | |||
|  | @ -142,10 +142,14 @@ ifneq (,$(filter 1 2,$(NOFORTRAN))) | |||
| else | ||||
| ifeq ($(F_COMPILER), INTEL) | ||||
| 	$(FC) $(FFLAGS) $(LDFLAGS) -all-load -headerpad_max_install_names -install_name "$(CURDIR)/../$(INTERNALNAME)" -dynamiclib -o ../$(LIBDYNNAME) $< -Wl,-exported_symbols_list,osx.def | ||||
| else | ||||
| ifeq ($(F_COMPILER), FLANG) | ||||
| 	$(FC) $(FFLAGS) $(LDFLAGS) -fno-fortran-main -Mnomain -all_load -headerpad_max_install_names -install_name "$(CURDIR)/../$(INTERNALNAME)" -dynamiclib -o ../$(LIBDYNNAME) $< -Wl,-exported_symbols_list,osx.def  $(FEXTRALIB) | ||||
| else | ||||
| 	$(FC) $(FFLAGS) $(LDFLAGS) -all_load -headerpad_max_install_names -install_name "$(CURDIR)/../$(INTERNALNAME)" -dynamiclib -o ../$(LIBDYNNAME) $< -Wl,-exported_symbols_list,osx.def  $(FEXTRALIB) | ||||
| endif | ||||
| endif | ||||
| endif | ||||
| 
 | ||||
| dllinit.$(SUFFIX) : dllinit.c | ||||
| 	$(CC) $(CFLAGS) -c -o $(@F) -s $< | ||||
|  |  | |||
							
								
								
									
										1
									
								
								f_check
								
								
								
								
							
							
						
						
									
										1
									
								
								f_check
								
								
								
								
							|  | @ -361,6 +361,7 @@ if ($link ne "") { | |||
| 	    ($flags =~ /^\-l/) | ||||
| 	    && ($flags !~ /ibrary/) | ||||
| 	    && ($flags !~ /gfortranbegin/) | ||||
| 	    && ($flags !~ /flangmain/) | ||||
| 	    && ($flags !~ /frtbegin/) | ||||
| 	    && ($flags !~ /pathfstart/) | ||||
| 	    && ($flags !~ /crt[0-9]/) | ||||
|  |  | |||
							
								
								
									
										48
									
								
								getarch.c
								
								
								
								
							
							
						
						
									
										48
									
								
								getarch.c
								
								
								
								
							|  | @ -1302,12 +1302,47 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
|        "-DL2_SIZE=1048576 -DL2_LINESIZE=64 -DL2_ASSOCIATIVE=16 " \ | ||||
|        "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ | ||||
|        "-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DARMV8 " \ | ||||
|        "-march=armv8.2-a -mtune=cortex-a72" | ||||
|        "-march=armv8.2-a -mtune=neoverse-n1" | ||||
| #define LIBNAME   "neoversen1" | ||||
| #define CORENAME  "NEOVERSEN1" | ||||
| #else | ||||
| #endif | ||||
| 
 | ||||
| #ifdef FORCE_NEOVERSEV1 | ||||
| #define FORCE | ||||
| #define ARCHITECTURE    "ARM64" | ||||
| #define SUBARCHITECTURE "NEOVERSEV1" | ||||
| #define SUBDIRNAME      "arm64" | ||||
| #define ARCHCONFIG   "-DNEOVERSEV1 " \ | ||||
|        "-DL1_CODE_SIZE=65536 -DL1_CODE_LINESIZE=64 -DL1_CODE_ASSOCIATIVE=4 " \ | ||||
|        "-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=64 -DL1_DATA_ASSOCIATIVE=4 " \ | ||||
|        "-DL2_SIZE=1048576 -DL2_LINESIZE=64 -DL2_ASSOCIATIVE=16 " \ | ||||
|        "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ | ||||
|        "-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DHAVE_SVE -DARMV8 " \ | ||||
|        "-march=armv8.4-a -mtune=neoverse-v1" | ||||
| #define LIBNAME   "neoversev1" | ||||
| #define CORENAME  "NEOVERSEV1" | ||||
| #else | ||||
| #endif | ||||
| 
 | ||||
| 
 | ||||
| #ifdef FORCE_NEOVERSEN2 | ||||
| #define FORCE | ||||
| #define ARCHITECTURE    "ARM64" | ||||
| #define SUBARCHITECTURE "NEOVERSEN2" | ||||
| #define SUBDIRNAME      "arm64" | ||||
| #define ARCHCONFIG   "-DNEOVERSEN2 " \ | ||||
|        "-DL1_CODE_SIZE=65536 -DL1_CODE_LINESIZE=64 -DL1_CODE_ASSOCIATIVE=4 " \ | ||||
|        "-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=64 -DL1_DATA_ASSOCIATIVE=4 " \ | ||||
|        "-DL2_SIZE=1048576 -DL2_LINESIZE=64 -DL2_ASSOCIATIVE=16 " \ | ||||
|        "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ | ||||
|        "-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DHAVE_SVE -DARMV8 " \ | ||||
|        "-march=armv8.5-a -mtune=neoverse-n2" | ||||
| #define LIBNAME   "neoversen2" | ||||
| #define CORENAME  "NEOVERSEN2" | ||||
| #else | ||||
| #endif | ||||
| 
 | ||||
| #ifdef FORCE_CORTEXA55 | ||||
| #define FORCE | ||||
| #define ARCHITECTURE    "ARM64" | ||||
|  | @ -1501,6 +1536,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #endif | ||||
| 
 | ||||
| 
 | ||||
| #if defined(FORCE_E2K) || defined(__e2k__) | ||||
| #define FORCE | ||||
| #define ARCHITECTURE "E2K" | ||||
| #define ARCHCONFIG   "-DGENERIC " \ | ||||
| 		     "-DL1_DATA_SIZE=16384 -DL1_DATA_LINESIZE=64 " \ | ||||
| 		     "-DL2_SIZE=524288 -DL2_LINESIZE=64 " \ | ||||
| 		     "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=8 " | ||||
| #define LIBNAME   "generic" | ||||
| #define CORENAME  "generic" | ||||
| #endif | ||||
| 
 | ||||
| #ifndef FORCE | ||||
| 
 | ||||
| #ifdef USER_TARGET | ||||
|  |  | |||
|  | @ -28,14 +28,21 @@ set(BLAS1_MANGLED_SOURCES | |||
| # these all have 'z' sources for complex versions | ||||
| set(BLAS2_SOURCES | ||||
|   gemv.c ger.c | ||||
|   trsv.c trmv.c symv.c | ||||
|   syr.c syr2.c gbmv.c | ||||
|   sbmv.c spmv.c | ||||
|   spr.c spr2.c | ||||
|   trsv.c trmv.c  | ||||
|   syr2.c gbmv.c | ||||
|   sbmv.c  | ||||
|   spr2.c | ||||
|   tbsv.c tbmv.c | ||||
|   tpsv.c tpmv.c | ||||
| ) | ||||
| 
 | ||||
| set(BLAS2_REAL_ONLY_SOURCES | ||||
|   symv.c syr.c spmv.c spr.c | ||||
| ) | ||||
| set(BLAS2_COMPLEX_LAPACK_SOURCES | ||||
|   symv.c syr.c spmv.c spr.c | ||||
| ) | ||||
| 
 | ||||
| set(BLAS2_COMPLEX_ONLY_MANGLED_SOURCES | ||||
|   hemv.c hbmv.c | ||||
|   her.c her2.c | ||||
|  | @ -78,6 +85,10 @@ foreach (CBLAS_FLAG ${CBLAS_FLAGS}) | |||
|   GenerateNamedObjects("${BLAS1_REAL_ONLY_SOURCES}" "" "" ${CBLAS_FLAG} "" "" false 1) | ||||
|   GenerateNamedObjects("${BLAS1_MANGLED_SOURCES}" "" "" ${CBLAS_FLAG} "" "" false ${MANGLE_COMPLEX}) | ||||
|   GenerateNamedObjects("${BLAS2_SOURCES}" "" "" ${CBLAS_FLAG} "" "" false ${MANGLE_COMPLEX}) | ||||
|   GenerateNamedObjects("${BLAS2_REAL_ONLY_SOURCES}" "" "" ${CBLAS_FLAG} "" "" false 1) | ||||
|   if (NOT DEFINED NO_LAPACK) | ||||
|   GenerateNamedObjects("${BLAS2_COMPLEX_LAPACK_SOURCES}" "" "" ${CBLAS_FLAG} "" "" false ${MANGLE_COMPLEX}) | ||||
|   endif () | ||||
|   GenerateNamedObjects("${BLAS2_COMPLEX_ONLY_MANGLED_SOURCES}" "" "" ${CBLAS_FLAG} "" "" false 4) | ||||
|   GenerateNamedObjects("${BLAS3_SOURCES}" "" "" ${CBLAS_FLAG} "" "" false ${DISABLE_COMPLEX}) | ||||
|   GenerateNamedObjects("${BLAS3_MANGLED_SOURCES}" "" "" ${CBLAS_FLAG} "" "" false ${MANGLE_COMPLEX}) | ||||
|  |  | |||
|  | @ -1016,11 +1016,13 @@ dsymv.$(SUFFIX) dsymv.$(PSUFFIX) : symv.c | |||
| qsymv.$(SUFFIX) qsymv.$(PSUFFIX) : symv.c | ||||
| 	$(CC) -c $(CFLAGS) $< -o $(@F) | ||||
| 
 | ||||
| ifndef NO_LAPACK | ||||
| csymv.$(SUFFIX) csymv.$(PSUFFIX) : zsymv.c | ||||
| 	$(CC) -c $(CFLAGS) $< -o $(@F) | ||||
| 
 | ||||
| zsymv.$(SUFFIX) zsymv.$(PSUFFIX) : zsymv.c | ||||
| 	$(CC) -c $(CFLAGS) $< -o $(@F) | ||||
| endif | ||||
| 
 | ||||
| xsymv.$(SUFFIX) xsymv.$(PSUFFIX) : zsymv.c | ||||
| 	$(CC) -c $(CFLAGS) $< -o $(@F) | ||||
|  | @ -1034,11 +1036,13 @@ dsyr.$(SUFFIX) dsyr.$(PSUFFIX) : syr.c | |||
| qsyr.$(SUFFIX) qsyr.$(PSUFFIX) : syr.c | ||||
| 	$(CC) -c $(CFLAGS) $< -o $(@F) | ||||
| 
 | ||||
| ifndef NO_LAPACK | ||||
| csyr.$(SUFFIX) csyr.$(PSUFFIX) : zsyr.c | ||||
| 	$(CC) -c $(CFLAGS) $< -o $(@F) | ||||
| 
 | ||||
| zsyr.$(SUFFIX) zsyr.$(PSUFFIX) : zsyr.c | ||||
| 	$(CC) -c $(CFLAGS) $< -o $(@F) | ||||
| endif | ||||
| 
 | ||||
| xsyr.$(SUFFIX) xsyr.$(PSUFFIX) : zsyr.c | ||||
| 	$(CC) -c $(CFLAGS) $< -o $(@F) | ||||
|  | @ -1106,11 +1110,13 @@ dspmv.$(SUFFIX) dspmv.$(PSUFFIX) : spmv.c | |||
| qspmv.$(SUFFIX) qspmv.$(PSUFFIX) : spmv.c | ||||
| 	$(CC) -c $(CFLAGS) $< -o $(@F) | ||||
| 
 | ||||
| ifndef NO_LAPACK | ||||
| cspmv.$(SUFFIX) cspmv.$(PSUFFIX) : zspmv.c | ||||
| 	$(CC) -c $(CFLAGS) $< -o $(@F) | ||||
| 
 | ||||
| zspmv.$(SUFFIX) zspmv.$(PSUFFIX) : zspmv.c | ||||
| 	$(CC) -c $(CFLAGS) $< -o $(@F) | ||||
| endif | ||||
| 
 | ||||
| xspmv.$(SUFFIX) xspmv.$(PSUFFIX) : zspmv.c | ||||
| 	$(CC) -c $(CFLAGS) $< -o $(@F) | ||||
|  | @ -1124,11 +1130,13 @@ dspr.$(SUFFIX) dspr.$(PSUFFIX) : spr.c | |||
| qspr.$(SUFFIX) qspr.$(PSUFFIX) : spr.c | ||||
| 	$(CC) -c $(CFLAGS) $< -o $(@F) | ||||
| 
 | ||||
| ifndef NO_LAPACK | ||||
| cspr.$(SUFFIX) cspr.$(PSUFFIX) : zspr.c | ||||
| 	$(CC) -c $(CFLAGS) $< -o $(@F) | ||||
| 
 | ||||
| zspr.$(SUFFIX) zspr.$(PSUFFIX) : zspr.c | ||||
| 	$(CC) -c $(CFLAGS) $< -o $(@F) | ||||
| endif | ||||
| 
 | ||||
| xspr.$(SUFFIX) xspr.$(PSUFFIX) : zspr.c | ||||
| 	$(CC) -c $(CFLAGS) $< -o $(@F) | ||||
|  |  | |||
|  | @ -115,7 +115,7 @@ void CNAME(blasint n, FLOAT alpha, FLOAT *x, blasint incx, FLOAT *y, blasint inc | |||
| #endif | ||||
| 
 | ||||
|     blas_level1_thread(mode, n, 0, 0, &alpha, | ||||
| 		       x, incx, y, incy, NULL, 0, (void *)AXPYU_K, nthreads); | ||||
| 		       x, incx, y, incy, NULL, 0,  (int (*)(void))AXPYU_K, nthreads); | ||||
| 
 | ||||
|   } | ||||
| #endif | ||||
|  |  | |||
|  | @ -102,7 +102,7 @@ void CNAME(blasint n, FLOAT alpha, FLOAT *x, blasint incx){ | |||
| #else | ||||
| 		       &alpha, | ||||
| #endif | ||||
| 		       x, incx, NULL, 0, NULL, 0, (void *)SCAL_K, nthreads); | ||||
| 		       x, incx, NULL, 0, NULL, 0,  (int (*)(void))SCAL_K, nthreads); | ||||
| 
 | ||||
|   } | ||||
| #endif | ||||
|  |  | |||
|  | @ -128,9 +128,9 @@ void CNAME(blasint n, FLOAT *ALPHA, FLOAT *x, blasint incx, FLOAT *y, blasint in | |||
| 
 | ||||
|     blas_level1_thread(mode, n, 0, 0, ALPHA, x, incx, y, incy, NULL, 0, | ||||
| #ifndef CONJ | ||||
| 		       (void *)AXPYU_K, | ||||
|                        (int (*)(void))AXPYU_K, | ||||
| #else | ||||
| 		       (void *)AXPYC_K, | ||||
|                        (int (*)(void))AXPYC_K, | ||||
| #endif | ||||
| 		       nthreads); | ||||
|   } | ||||
|  |  | |||
|  | @ -108,7 +108,7 @@ void CNAME(blasint n, FLOAT alpha_r, void *vx, blasint incx){ | |||
|     mode  =  BLAS_SINGLE | BLAS_COMPLEX; | ||||
| #endif | ||||
| 
 | ||||
|     blas_level1_thread(mode, n, 0, 0,  alpha, x, incx, NULL, 0, NULL, 0, (void *)SCAL_K, nthreads); | ||||
|     blas_level1_thread(mode, n, 0, 0,  alpha, x, incx, NULL, 0, NULL, 0, (int (*)(void))SCAL_K, nthreads); | ||||
| 
 | ||||
|   } | ||||
| #endif | ||||
|  |  | |||
|  | @ -323,55 +323,93 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS) | |||
| 
 | ||||
| 
 | ||||
|         #hemm | ||||
|       GenerateNamedObjects("generic/zhemm_utcopy_${${float_char}GEMM_UNROLL_M}.c" "" "hemm_iutcopy" false "" "" false ${float_type}) | ||||
|       GenerateNamedObjects("generic/zhemm_ltcopy_${${float_char}GEMM_UNROLL_M}.c" "LOWER" "hemm_iltcopy" false "" "" false ${float_type}) | ||||
| if (NOT DEFINED ${float_char}HEMMUTCOPY_M) | ||||
|     set(HEMMUTCOPY_M "generic/zhemm_utcopy_${${float_char}GEMM_UNROLL_M}.c") | ||||
|     set(HEMMLTCOPY_M "generic/zhemm_ltcopy_${${float_char}GEMM_UNROLL_M}.c") | ||||
| else () | ||||
|     set(HEMMUTCOPY_M "${KERNELDIR}/${${float_char}HEMMUTCOPY_M}") | ||||
|     set(HEMMLTCOPY_M "${KERNELDIR}/${${float_char}HEMMLTCOPY_M}") | ||||
| endif() | ||||
|       GenerateNamedObjects(${HEMMUTCOPY_M} "" "hemm_iutcopy" false "" "" false ${float_type}) | ||||
|       GenerateNamedObjects(${HEMMLTCOPY_M} "LOWER" "hemm_iltcopy" false "" "" false ${float_type}) | ||||
|       GenerateNamedObjects("generic/zhemm_utcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER" "hemm_outcopy" false "" "" false ${float_type}) | ||||
|       GenerateNamedObjects("generic/zhemm_ltcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;LOWER" "hemm_oltcopy" false "" "" false ${float_type}) | ||||
| 
 | ||||
|       # symm for c and z | ||||
| if (NOT DEFINED ${float_char}SYMMUCOPY_M) | ||||
| 	set(SYMMUCOPY_M "generic/zsymm_ucopy_${${float_char}GEMM_UNROLL_M}.c") | ||||
| 	set(SYMMLCOPY_M "generic/zsymm_lcopy_${${float_char}GEMM_UNROLL_M}.c") | ||||
| else () | ||||
| 	set(SYMMUCOPY_M "${KERNELDIR}/${${float_char}SYMMUCOPY_M}") | ||||
| 	set(SYMMLCOPY_M "${KERNELDIR}/${${float_char}SYMMLCOPY_M}") | ||||
| endif() | ||||
|       GenerateNamedObjects("generic/zsymm_ucopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER" "symm_outcopy" false "" "" false ${float_type}) | ||||
|       GenerateNamedObjects("generic/zsymm_ucopy_${${float_char}GEMM_UNROLL_M}.c" "" "symm_iutcopy" false "" "" false ${float_type}) | ||||
|       GenerateNamedObjects(${SYMMUCOPY_M} "" "symm_iutcopy" false "" "" false ${float_type}) | ||||
| 
 | ||||
|       GenerateNamedObjects("generic/zsymm_lcopy_${${float_char}GEMM_UNROLL_N}.c" "LOWER;OUTER" "symm_oltcopy" false "" "" false ${float_type}) | ||||
|       GenerateNamedObjects("generic/zsymm_lcopy_${${float_char}GEMM_UNROLL_M}.c" "LOWER" "symm_iltcopy" false "" "" false ${float_type}) | ||||
|       GenerateNamedObjects(${SYMMLCOPY_M} "LOWER" "symm_iltcopy" false "" "" false ${float_type}) | ||||
| 
 | ||||
|       GenerateNamedObjects("generic/ztrmm_uncopy_${${float_char}GEMM_UNROLL_M}.c" "UNIT" "trmm_iunucopy" false "" "" false ${float_type}) | ||||
|       GenerateNamedObjects("generic/ztrmm_uncopy_${${float_char}GEMM_UNROLL_M}.c" "" "trmm_iunncopy" false "" "" false ${float_type}) | ||||
| 
 | ||||
| if (NOT DEFINED ${float_char}TRMMUNCOPY_M) | ||||
| 	set(TRMMUNCOPY_M "generic/ztrmm_uncopy_${${float_char}GEMM_UNROLL_M}.c") | ||||
| 	set(TRMMLNCOPY_M "generic/ztrmm_lncopy_${${float_char}GEMM_UNROLL_M}.c") | ||||
| 	set(TRMMUTCOPY_M "generic/ztrmm_utcopy_${${float_char}GEMM_UNROLL_M}.c") | ||||
| 	set(TRMMLTCOPY_M "generic/ztrmm_ltcopy_${${float_char}GEMM_UNROLL_M}.c") | ||||
| else () | ||||
| 	set(TRMMUNCOPY_M "${KERNELDIR}/${${float_char}TRMMUNCOPY_M}") | ||||
| 	set(TRMMLNCOPY_M "${KERNELDIR}/${${float_char}TRMMLNCOPY_M}") | ||||
| 	set(TRMMUTCOPY_M "${KERNELDIR}/${${float_char}TRMMUTCOPY_M}") | ||||
| 	set(TRMMLTCOPY_M "${KERNELDIR}/${${float_char}TRMMLTCOPY_M}") | ||||
| endif () | ||||
|       GenerateNamedObjects(${TRMMUNCOPY_M} "UNIT" "trmm_iunucopy" false "" "" false ${float_type}) | ||||
|       GenerateNamedObjects(${TRMMUNCOPY_M} "" "trmm_iunncopy" false "" "" false ${float_type}) | ||||
|       GenerateNamedObjects("generic/ztrmm_uncopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;UNIT" "trmm_ounucopy" false "" "" false ${float_type}) | ||||
|       GenerateNamedObjects("generic/ztrmm_uncopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER" "trmm_ounncopy" false "" "" false ${float_type}) | ||||
| 
 | ||||
|       GenerateNamedObjects("generic/ztrmm_lncopy_${${float_char}GEMM_UNROLL_M}.c" "LOWER;UNIT" "trmm_ilnucopy" false "" "" false ${float_type}) | ||||
|       GenerateNamedObjects("generic/ztrmm_lncopy_${${float_char}GEMM_UNROLL_M}.c" "LOWER" "trmm_ilnncopy" false "" "" false ${float_type}) | ||||
|       GenerateNamedObjects(${TRMMLNCOPY_M} "LOWER;UNIT" "trmm_ilnucopy" false "" "" false ${float_type}) | ||||
|       GenerateNamedObjects(${TRMMLNCOPY_M} "LOWER" "trmm_ilnncopy" false "" "" false ${float_type}) | ||||
|       GenerateNamedObjects("generic/ztrmm_lncopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;LOWER;UNIT" "trmm_olnucopy" false "" "" false ${float_type}) | ||||
|       GenerateNamedObjects("generic/ztrmm_lncopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;LOWER" "trmm_olnncopy" false "" "" false ${float_type}) | ||||
| 
 | ||||
|       GenerateNamedObjects("generic/ztrmm_utcopy_${${float_char}GEMM_UNROLL_M}.c" "UNIT" "trmm_iutucopy" false "" "" false ${float_type}) | ||||
|       GenerateNamedObjects("generic/ztrmm_utcopy_${${float_char}GEMM_UNROLL_M}.c" "" "trmm_iutncopy" false "" "" false ${float_type}) | ||||
|       GenerateNamedObjects(${TRMMUTCOPY_M} "UNIT" "trmm_iutucopy" false "" "" false ${float_type}) | ||||
|       GenerateNamedObjects(${TRMMUTCOPY_M} "" "trmm_iutncopy" false "" "" false ${float_type}) | ||||
|       GenerateNamedObjects("generic/ztrmm_utcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;UNIT" "trmm_outucopy" false "" "" false ${float_type}) | ||||
|       GenerateNamedObjects("generic/ztrmm_utcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER" "trmm_outncopy" false "" "" false ${float_type}) | ||||
| 
 | ||||
|       GenerateNamedObjects("generic/ztrmm_ltcopy_${${float_char}GEMM_UNROLL_M}.c" "LOWER;UNIT" "trmm_iltucopy" false "" "" false ${float_type}) | ||||
|       GenerateNamedObjects("generic/ztrmm_ltcopy_${${float_char}GEMM_UNROLL_M}.c" "LOWER" "trmm_iltncopy" false "" "" false ${float_type}) | ||||
|       GenerateNamedObjects(${TRMMLTCOPY_M} "LOWER;UNIT" "trmm_iltucopy" false "" "" false ${float_type}) | ||||
|       GenerateNamedObjects(${TRMMLTCOPY_M} "LOWER" "trmm_iltncopy" false "" "" false ${float_type}) | ||||
|       GenerateNamedObjects("generic/ztrmm_ltcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;LOWER;UNIT" "trmm_oltucopy" false "" "" false ${float_type}) | ||||
|       GenerateNamedObjects("generic/ztrmm_ltcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;LOWER" "trmm_oltncopy" false "" "" false ${float_type}) | ||||
| 
 | ||||
|       GenerateNamedObjects("generic/ztrsm_uncopy_${${float_char}GEMM_UNROLL_M}.c" "UNIT" "trsm_iunucopy" false "" "" false ${float_type}) | ||||
|       GenerateNamedObjects("generic/ztrsm_uncopy_${${float_char}GEMM_UNROLL_M}.c" "" "trsm_iunncopy" false "" "" false ${float_type}) | ||||
| 
 | ||||
| if (NOT DEFINED ZTRSMCOPYLN_M) | ||||
|   set(ZTRSMUNCOPY_M "generic/ztrsm_uncopy_${${float_char}GEMM_UNROLL_M}.c") | ||||
|   set(ZTRSMLNCOPY_M "generic/ztrsm_lncopy_${${float_char}GEMM_UNROLL_M}.c") | ||||
|   set(ZTRSMUTCOPY_M "generic/ztrsm_utcopy_${${float_char}GEMM_UNROLL_M}.c") | ||||
|   set(ZTRSMLTCOPY_M "generic/ztrsm_ltcopy_${${float_char}GEMM_UNROLL_M}.c") | ||||
| else () | ||||
|   set(ZTRSMUNCOPY_M "${KERNELDIR}/${ZTRSMCOPYUN_M}") | ||||
|   set(ZTRSMLNCOPY_M "${KERNELDIR}/${ZTRSMCOPYLN_M}") | ||||
|   set(ZTRSMUTCOPY_M "${KERNELDIR}/${ZTRSMCOPYUT_M}") | ||||
|   set(ZTRSMLTCOPY_M "${KERNELDIR}/${ZTRSMCOPYLT_M}") | ||||
| endif () | ||||
|       GenerateNamedObjects(${ZTRSMUNCOPY_M} "UNIT" "trsm_iunucopy" false "" "" false ${float_type}) | ||||
|       GenerateNamedObjects(${ZTRSMUNCOPY_M} "" "trsm_iunncopy" false "" "" false ${float_type}) | ||||
|       GenerateNamedObjects("generic/ztrsm_uncopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;UNIT" "trsm_ounucopy" false "" "" false ${float_type}) | ||||
|       GenerateNamedObjects("generic/ztrsm_uncopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER" "trsm_ounncopy" false "" "" false ${float_type}) | ||||
| 
 | ||||
|       GenerateNamedObjects("generic/ztrsm_lncopy_${${float_char}GEMM_UNROLL_M}.c" "LOWER;UNIT" "trsm_ilnucopy" false "" "" false ${float_type}) | ||||
|       GenerateNamedObjects("generic/ztrsm_lncopy_${${float_char}GEMM_UNROLL_M}.c" "LOWER" "trsm_ilnncopy" false "" "" false ${float_type}) | ||||
|       GenerateNamedObjects(${ZTRSMLNCOPY_M} "LOWER;UNIT" "trsm_ilnucopy" false "" "" false ${float_type}) | ||||
|       GenerateNamedObjects(${ZTRSMLNCOPY_M} "LOWER" "trsm_ilnncopy" false "" "" false ${float_type}) | ||||
|       GenerateNamedObjects("generic/ztrsm_lncopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;LOWER;UNIT" "trsm_olnucopy" false "" "" false ${float_type}) | ||||
|       GenerateNamedObjects("generic/ztrsm_lncopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;LOWER" "trsm_olnncopy" false "" "" false ${float_type}) | ||||
| 
 | ||||
|       GenerateNamedObjects("generic/ztrsm_utcopy_${${float_char}GEMM_UNROLL_M}.c" "UNIT" "trsm_iutucopy" false "" "" false ${float_type}) | ||||
|       GenerateNamedObjects("generic/ztrsm_utcopy_${${float_char}GEMM_UNROLL_M}.c" "" "trsm_iutncopy" false "" "" false ${float_type}) | ||||
|       GenerateNamedObjects(${ZTRSMUTCOPY_M} "UNIT" "trsm_iutucopy" false "" "" false ${float_type}) | ||||
|       GenerateNamedObjects(${ZTRSMUTCOPY_M} "" "trsm_iutncopy" false "" "" false ${float_type}) | ||||
|       GenerateNamedObjects("generic/ztrsm_utcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;UNIT" "trsm_outucopy" false "" "" false ${float_type}) | ||||
|       GenerateNamedObjects("generic/ztrsm_utcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER" "trsm_outncopy" false "" "" false ${float_type}) | ||||
| 
 | ||||
|       GenerateNamedObjects("generic/ztrsm_ltcopy_${${float_char}GEMM_UNROLL_M}.c" "LOWER;UNIT" "trsm_iltucopy" false "" "" false ${float_type}) | ||||
|       GenerateNamedObjects("generic/ztrsm_ltcopy_${${float_char}GEMM_UNROLL_M}.c" "LOWER" "trsm_iltncopy" false "" "" false ${float_type}) | ||||
|       GenerateNamedObjects(${ZTRSMLTCOPY_M} "LOWER;UNIT" "trsm_iltucopy" false "" "" false ${float_type}) | ||||
|       GenerateNamedObjects(${ZTRSMLTCOPY_M} "LOWER" "trsm_iltncopy" false "" "" false ${float_type}) | ||||
|       GenerateNamedObjects("generic/ztrsm_ltcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;LOWER;UNIT" "trsm_oltucopy" false "" "" false ${float_type}) | ||||
|       GenerateNamedObjects("generic/ztrsm_ltcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;LOWER" "trsm_oltncopy" false "" "" false ${float_type}) | ||||
| 
 | ||||
|  | @ -465,23 +503,35 @@ endif () | |||
|       GenerateNamedObjects("generic/trmm_ltcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;LOWER;UNIT" "trmm_oltucopy" false "" "" false ${float_type}) | ||||
|       GenerateNamedObjects("generic/trmm_ltcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;LOWER" "trmm_oltncopy" false "" "" false ${float_type}) | ||||
| 
 | ||||
|       GenerateNamedObjects("generic/trsm_uncopy_${${float_char}GEMM_UNROLL_M}.c" "UNIT" "trsm_iunucopy" false "" "" false ${float_type}) | ||||
|       GenerateNamedObjects("generic/trsm_uncopy_${${float_char}GEMM_UNROLL_M}.c" "" "trsm_iunncopy" false "" "" false ${float_type}) | ||||
| 
 | ||||
| if (NOT DEFINED TRSMCOPYLN_M) | ||||
|   set(TRSMUNCOPY_M "generic/trsm_uncopy_${${float_char}GEMM_UNROLL_M}.c") | ||||
|   set(TRSMLNCOPY_M "generic/trsm_lncopy_${${float_char}GEMM_UNROLL_M}.c") | ||||
|   set(TRSMUTCOPY_M "generic/trsm_utcopy_${${float_char}GEMM_UNROLL_M}.c") | ||||
|   set(TRSMLTCOPY_M "generic/trsm_ltcopy_${${float_char}GEMM_UNROLL_M}.c") | ||||
| else () | ||||
|   set(TRSMUNCOPY_M "${KERNELDIR}/${TRSMCOPYUN_M}") | ||||
|   set(TRSMLNCOPY_M "${KERNELDIR}/${TRSMCOPYLN_M}") | ||||
|   set(TRSMUTCOPY_M "${KERNELDIR}/${TRSMCOPYUT_M}") | ||||
|   set(TRSMLTCOPY_M "${KERNELDIR}/${TRSMCOPYLT_M}") | ||||
| endif () | ||||
|       # Register the TRSM copy objects for the current float type. The outer | ||||
|       # (trsm_o*) objects always use the generic source for GEMM_UNROLL_N; the | ||||
|       # inner (trsm_i*) objects use the TRSM??COPY_M source variables. | ||||
|       # NOTE(review): trsm_ilnucopy/ilnncopy, trsm_iutucopy/iutncopy and | ||||
|       # trsm_iltucopy/iltncopy each appear twice below (once with the generic | ||||
|       # GEMM_UNROLL_M source, once with the TRSM??COPY_M variable). This looks | ||||
|       # like old and new diff lines shown together - confirm which are live. | ||||
|       GenerateNamedObjects(${TRSMUNCOPY_M} "UNIT" "trsm_iunucopy" false "" "" false ${float_type}) | ||||
|       GenerateNamedObjects(${TRSMUNCOPY_M} "" "trsm_iunncopy" false "" "" false ${float_type}) | ||||
|       GenerateNamedObjects("generic/trsm_uncopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;UNIT" "trsm_ounucopy" false "" "" false ${float_type}) | ||||
|       GenerateNamedObjects("generic/trsm_uncopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER" "trsm_ounncopy" false "" "" false ${float_type}) | ||||
| 
 | ||||
|       GenerateNamedObjects("generic/trsm_lncopy_${${float_char}GEMM_UNROLL_M}.c" "LOWER;UNIT" "trsm_ilnucopy" false "" "" false ${float_type}) | ||||
|       GenerateNamedObjects("generic/trsm_lncopy_${${float_char}GEMM_UNROLL_M}.c" "LOWER" "trsm_ilnncopy" false "" "" false ${float_type}) | ||||
|       GenerateNamedObjects(${TRSMLNCOPY_M} "LOWER;UNIT" "trsm_ilnucopy" false "" "" false ${float_type}) | ||||
|       GenerateNamedObjects(${TRSMLNCOPY_M} "LOWER" "trsm_ilnncopy" false "" "" false ${float_type}) | ||||
|       GenerateNamedObjects("generic/trsm_lncopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;LOWER;UNIT" "trsm_olnucopy" false "" "" false ${float_type}) | ||||
|       GenerateNamedObjects("generic/trsm_lncopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;LOWER" "trsm_olnncopy" false "" "" false ${float_type}) | ||||
| 
 | ||||
|       GenerateNamedObjects("generic/trsm_utcopy_${${float_char}GEMM_UNROLL_M}.c" "UNIT" "trsm_iutucopy" false "" "" false ${float_type}) | ||||
|       GenerateNamedObjects("generic/trsm_utcopy_${${float_char}GEMM_UNROLL_M}.c" "" "trsm_iutncopy" false "" "" false ${float_type}) | ||||
|       GenerateNamedObjects(${TRSMUTCOPY_M} "UNIT" "trsm_iutucopy" false "" "" false ${float_type}) | ||||
|       GenerateNamedObjects(${TRSMUTCOPY_M} "" "trsm_iutncopy" false "" "" false ${float_type}) | ||||
|       GenerateNamedObjects("generic/trsm_utcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;UNIT" "trsm_outucopy" false "" "" false ${float_type}) | ||||
|       GenerateNamedObjects("generic/trsm_utcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER" "trsm_outncopy" false "" "" false ${float_type}) | ||||
| 
 | ||||
|       GenerateNamedObjects("generic/trsm_ltcopy_${${float_char}GEMM_UNROLL_M}.c" "LOWER;UNIT" "trsm_iltucopy" false "" "" false ${float_type}) | ||||
|       GenerateNamedObjects("generic/trsm_ltcopy_${${float_char}GEMM_UNROLL_M}.c" "LOWER" "trsm_iltncopy" false "" "" false ${float_type}) | ||||
|       GenerateNamedObjects(${TRSMLTCOPY_M} "LOWER;UNIT" "trsm_iltucopy" false "" "" false ${float_type}) | ||||
|       GenerateNamedObjects(${TRSMLTCOPY_M} "LOWER" "trsm_iltncopy" false "" "" false ${float_type}) | ||||
|       GenerateNamedObjects("generic/trsm_ltcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;LOWER;UNIT" "trsm_oltucopy" false "" "" false ${float_type}) | ||||
|       GenerateNamedObjects("generic/trsm_ltcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;LOWER" "trsm_oltncopy" false "" "" false ${float_type}) | ||||
| 
 | ||||
|  |  | |||
|  | @ -617,6 +617,10 @@ $(KDIR)zgemm_beta$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_BETA) | |||
| $(KDIR)xgemm_beta$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMM_BETA) | ||||
| 	$(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX $< -o $@ | ||||
| 
 | ||||
| # Force USE_TRMM to 1 when building for the E2K architecture. | ||||
| ifeq ($(ARCH), E2K) | ||||
| USE_TRMM = 1 | ||||
| endif | ||||
| 
 | ||||
| 
 | ||||
| ifeq ($(BUILD_BFLOAT16), 1) | ||||
| 
 | ||||
|  | @ -1691,29 +1695,61 @@ $(KDIR)qtrmm_oltucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_ltcopy_$(QGEMM_UNROLL_N | |||
| $(KDIR)qtrmm_oltncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_ltcopy_$(QGEMM_UNROLL_N).c | ||||
| 	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ | ||||
| 
 | ||||
| # ctrmm_iunucopy (-DUNIT) and ctrmm_iunncopy (-UUNIT): compile | ||||
| # $(KERNELDIR)/$(CTRMMUNCOPY_M) when that variable is defined, otherwise | ||||
| # generic/ztrmm_uncopy_$(CGEMM_UNROLL_M).c. | ||||
| ifdef CTRMMUNCOPY_M | ||||
| $(KDIR)ctrmm_iunucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMUNCOPY_M) | ||||
| 	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ | ||||
| 
 | ||||
| $(KDIR)ctrmm_iunncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMUNCOPY_M) | ||||
| 	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ | ||||
| else | ||||
| $(KDIR)ctrmm_iunucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_uncopy_$(CGEMM_UNROLL_M).c | ||||
| 	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ | ||||
| 
 | ||||
| $(KDIR)ctrmm_iunncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_uncopy_$(CGEMM_UNROLL_M).c | ||||
| 	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ | ||||
| endif | ||||
| 
 | ||||
| $(KDIR)ctrmm_ilnucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_lncopy_$(CGEMM_UNROLL_M).c | ||||
| ifdef CTRMMLNCOPY_M | ||||
| $(KDIR)ctrmm_ilnucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMLNCOPY_M) | ||||
| 	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ | ||||
| 
 | ||||
| $(KDIR)ctrmm_ilnncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_lncopy_$(CGEMM_UNROLL_M).c | ||||
| $(KDIR)ctrmm_ilnncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMLNCOPY_M) | ||||
| 	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ | ||||
| else | ||||
| # Generic fallback for ctrmm_ilnucopy. LOWER is defined here so the flags match | ||||
| # the $(CTRMMLNCOPY_M) rule for the same target and the ztrmm_ilnucopy fallback. | ||||
| $(KDIR)ctrmm_ilnucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_lncopy_$(CGEMM_UNROLL_M).c | ||||
| 	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ | ||||
| 
 | ||||
| # Generic fallback for ctrmm_ilnncopy. LOWER is defined here so the flags match | ||||
| # the $(CTRMMLNCOPY_M) rule for the same target and the ztrmm_ilnncopy fallback. | ||||
| $(KDIR)ctrmm_ilnncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_lncopy_$(CGEMM_UNROLL_M).c | ||||
| 	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ | ||||
| endif | ||||
| 
 | ||||
| # ctrmm_iutucopy (-DUNIT) and ctrmm_iutncopy (-UUNIT): compile | ||||
| # $(KERNELDIR)/$(CTRMMUTCOPY_M) when that variable is defined, otherwise | ||||
| # generic/ztrmm_utcopy_$(CGEMM_UNROLL_M).c. | ||||
| ifdef CTRMMUTCOPY_M | ||||
| $(KDIR)ctrmm_iutucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMUTCOPY_M) | ||||
| 	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ | ||||
| 
 | ||||
| $(KDIR)ctrmm_iutncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMUTCOPY_M) | ||||
| 	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ | ||||
| else | ||||
| $(KDIR)ctrmm_iutucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_utcopy_$(CGEMM_UNROLL_M).c | ||||
| 	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ | ||||
| 
 | ||||
| $(KDIR)ctrmm_iutncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_utcopy_$(CGEMM_UNROLL_M).c | ||||
| 	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ | ||||
| endif | ||||
| 
 | ||||
| $(KDIR)ctrmm_iltucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_ltcopy_$(CGEMM_UNROLL_M).c | ||||
| ifdef CTRMMLTCOPY_M | ||||
| $(KDIR)ctrmm_iltucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMLTCOPY_M) | ||||
| 	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ | ||||
| 
 | ||||
| $(KDIR)ctrmm_iltncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_ltcopy_$(CGEMM_UNROLL_M).c | ||||
| $(KDIR)ctrmm_iltncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMLTCOPY_M) | ||||
| 	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ | ||||
| else | ||||
| # Generic fallback for ctrmm_iltucopy. LOWER is defined here so the flags match | ||||
| # the $(CTRMMLTCOPY_M) rule for the same target and the ztrmm_iltucopy fallback. | ||||
| $(KDIR)ctrmm_iltucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_ltcopy_$(CGEMM_UNROLL_M).c | ||||
| 	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ | ||||
| 
 | ||||
| # Generic fallback for ctrmm_iltncopy. LOWER is defined here so the flags match | ||||
| # the $(CTRMMLTCOPY_M) rule for the same target and the ztrmm_iltncopy fallback. | ||||
| $(KDIR)ctrmm_iltncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_ltcopy_$(CGEMM_UNROLL_M).c | ||||
| 	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ | ||||
| endif | ||||
| 
 | ||||
| $(KDIR)ctrmm_ounucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_uncopy_$(CGEMM_UNROLL_N).c | ||||
| 	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ | ||||
|  | @ -1739,29 +1775,61 @@ $(KDIR)ctrmm_oltucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_ltcopy_$(CGEMM_UNROLL_ | |||
| $(KDIR)ctrmm_oltncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_ltcopy_$(CGEMM_UNROLL_N).c | ||||
| 	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ | ||||
| 
 | ||||
| # ztrmm_iunucopy (-DUNIT) and ztrmm_iunncopy (-UUNIT): compile | ||||
| # $(KERNELDIR)/$(ZTRMMUNCOPY_M) when that variable is defined, otherwise | ||||
| # generic/ztrmm_uncopy_$(ZGEMM_UNROLL_M).c. | ||||
| ifdef ZTRMMUNCOPY_M | ||||
| $(KDIR)ztrmm_iunucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMUNCOPY_M) | ||||
| 	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ | ||||
| 
 | ||||
| $(KDIR)ztrmm_iunncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMUNCOPY_M) | ||||
| 	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ | ||||
| else | ||||
| $(KDIR)ztrmm_iunucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_uncopy_$(ZGEMM_UNROLL_M).c | ||||
| 	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ | ||||
| 
 | ||||
| $(KDIR)ztrmm_iunncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_uncopy_$(ZGEMM_UNROLL_M).c | ||||
| 	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ | ||||
| endif | ||||
| 
 | ||||
| # ztrmm_ilnucopy (-DUNIT) and ztrmm_ilnncopy (-UUNIT), both with -DLOWER: | ||||
| # compile $(KERNELDIR)/$(ZTRMMLNCOPY_M) when that variable is defined, | ||||
| # otherwise generic/ztrmm_lncopy_$(ZGEMM_UNROLL_M).c. | ||||
| ifdef ZTRMMLNCOPY_M | ||||
| $(KDIR)ztrmm_ilnucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMLNCOPY_M) | ||||
| 	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ | ||||
| 
 | ||||
| $(KDIR)ztrmm_ilnncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMLNCOPY_M) | ||||
| 	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ | ||||
| else | ||||
| $(KDIR)ztrmm_ilnucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_lncopy_$(ZGEMM_UNROLL_M).c | ||||
| 	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ | ||||
| 
 | ||||
| $(KDIR)ztrmm_ilnncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_lncopy_$(ZGEMM_UNROLL_M).c | ||||
| 	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ | ||||
| endif | ||||
| 
 | ||||
| # ztrmm_iutucopy (-DUNIT) and ztrmm_iutncopy (-UUNIT): compile | ||||
| # $(KERNELDIR)/$(ZTRMMUTCOPY_M) when that variable is defined, otherwise | ||||
| # generic/ztrmm_utcopy_$(ZGEMM_UNROLL_M).c. | ||||
| ifdef ZTRMMUTCOPY_M | ||||
| $(KDIR)ztrmm_iutucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMUTCOPY_M) | ||||
| 	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ | ||||
| 
 | ||||
| $(KDIR)ztrmm_iutncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMUTCOPY_M) | ||||
| 	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ | ||||
| else | ||||
| $(KDIR)ztrmm_iutucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_utcopy_$(ZGEMM_UNROLL_M).c | ||||
| 	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ | ||||
| 
 | ||||
| $(KDIR)ztrmm_iutncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_utcopy_$(ZGEMM_UNROLL_M).c | ||||
| 	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ | ||||
| endif | ||||
| 
 | ||||
| # ztrmm_iltucopy (-DUNIT) and ztrmm_iltncopy (-UUNIT), both with -DLOWER: | ||||
| # compile $(KERNELDIR)/$(ZTRMMLTCOPY_M) when that variable is defined, | ||||
| # otherwise generic/ztrmm_ltcopy_$(ZGEMM_UNROLL_M).c. | ||||
| ifdef ZTRMMLTCOPY_M | ||||
| $(KDIR)ztrmm_iltucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMLTCOPY_M) | ||||
| 	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ | ||||
| 
 | ||||
| $(KDIR)ztrmm_iltncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMLTCOPY_M) | ||||
| 	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ | ||||
| else | ||||
| $(KDIR)ztrmm_iltucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_ltcopy_$(ZGEMM_UNROLL_M).c | ||||
| 	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ | ||||
| 
 | ||||
| $(KDIR)ztrmm_iltncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_ltcopy_$(ZGEMM_UNROLL_M).c | ||||
| 	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ | ||||
| endif | ||||
| 
 | ||||
| $(KDIR)ztrmm_ounucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_uncopy_$(ZGEMM_UNROLL_N).c | ||||
| 	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ | ||||
|  | @ -1897,11 +1965,21 @@ $(KDIR)csymm_outcopy$(TSUFFIX).$(SUFFIX) : generic/zsymm_ucopy_$(CGEMM_UNROLL_N) | |||
| $(KDIR)csymm_oltcopy$(TSUFFIX).$(SUFFIX) : generic/zsymm_lcopy_$(CGEMM_UNROLL_N).c | ||||
| 	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -DLOWER $< -o $@ | ||||
| 
 | ||||
| # csymm_iutcopy: compile $(KERNELDIR)/$(CSYMMUCOPY_M) when that variable is | ||||
| # defined, otherwise generic/zsymm_ucopy_$(CGEMM_UNROLL_M).c (same flags). | ||||
| ifdef CSYMMUCOPY_M | ||||
| $(KDIR)csymm_iutcopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CSYMMUCOPY_M) | ||||
| 	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER $< -o $@ | ||||
| else | ||||
| $(KDIR)csymm_iutcopy$(TSUFFIX).$(SUFFIX) : generic/zsymm_ucopy_$(CGEMM_UNROLL_M).c | ||||
| 	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER $< -o $@ | ||||
| endif | ||||
| 
 | ||||
| # csymm_iltcopy: compile $(KERNELDIR)/$(CSYMMLCOPY_M) when that variable is | ||||
| # defined, otherwise generic/zsymm_lcopy_$(CGEMM_UNROLL_M).c (same flags). | ||||
| ifdef CSYMMLCOPY_M | ||||
| $(KDIR)csymm_iltcopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CSYMMLCOPY_M) | ||||
| 	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER $< -o $@ | ||||
| else | ||||
| $(KDIR)csymm_iltcopy$(TSUFFIX).$(SUFFIX) : generic/zsymm_lcopy_$(CGEMM_UNROLL_M).c | ||||
| 	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER $< -o $@ | ||||
| endif | ||||
| 
 | ||||
| $(KDIR)zsymm_outcopy$(TSUFFIX).$(SUFFIX) : generic/zsymm_ucopy_$(ZGEMM_UNROLL_N).c | ||||
| 	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -ULOWER $< -o $@ | ||||
|  | @ -1909,11 +1987,21 @@ $(KDIR)zsymm_outcopy$(TSUFFIX).$(SUFFIX) : generic/zsymm_ucopy_$(ZGEMM_UNROLL_N) | |||
| $(KDIR)zsymm_oltcopy$(TSUFFIX).$(SUFFIX) : generic/zsymm_lcopy_$(ZGEMM_UNROLL_N).c | ||||
| 	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -DLOWER $< -o $@ | ||||
| 
 | ||||
| # zsymm_iutcopy: compile $(KERNELDIR)/$(ZSYMMUCOPY_M) when that variable is | ||||
| # defined, otherwise generic/zsymm_ucopy_$(ZGEMM_UNROLL_M).c (same flags). | ||||
| ifdef ZSYMMUCOPY_M | ||||
| $(KDIR)zsymm_iutcopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZSYMMUCOPY_M) | ||||
| 	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER $< -o $@ | ||||
| else | ||||
| $(KDIR)zsymm_iutcopy$(TSUFFIX).$(SUFFIX) : generic/zsymm_ucopy_$(ZGEMM_UNROLL_M).c | ||||
| 	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER $< -o $@ | ||||
| endif | ||||
| 
 | ||||
| # zsymm_iltcopy: compile $(KERNELDIR)/$(ZSYMMLCOPY_M) when that variable is | ||||
| # defined, otherwise generic/zsymm_lcopy_$(ZGEMM_UNROLL_M).c (same flags). | ||||
| ifdef ZSYMMLCOPY_M | ||||
| $(KDIR)zsymm_iltcopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZSYMMLCOPY_M) | ||||
| 	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER $< -o $@ | ||||
| else | ||||
| $(KDIR)zsymm_iltcopy$(TSUFFIX).$(SUFFIX) : generic/zsymm_lcopy_$(ZGEMM_UNROLL_M).c | ||||
| 	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER $< -o $@ | ||||
| endif | ||||
| 
 | ||||
| $(KDIR)xsymm_outcopy$(TSUFFIX).$(SUFFIX) : generic/zsymm_ucopy_$(XGEMM_UNROLL_N).c | ||||
| 	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -ULOWER $< -o $@ | ||||
|  | @ -1933,11 +2021,21 @@ $(KDIR)chemm_outcopy$(TSUFFIX).$(SUFFIX) : generic/zhemm_utcopy_$(CGEMM_UNROLL_N | |||
| $(KDIR)chemm_oltcopy$(TSUFFIX).$(SUFFIX) : generic/zhemm_ltcopy_$(CGEMM_UNROLL_N).c | ||||
| 	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER $< -DLOWER -o $@ | ||||
| 
 | ||||
| # chemm_iutcopy: compile $(KERNELDIR)/$(CHEMMUTCOPY_M) when that variable is | ||||
| # defined, otherwise generic/zhemm_utcopy_$(CGEMM_UNROLL_M).c (same flags). | ||||
| ifdef CHEMMUTCOPY_M | ||||
| $(KDIR)chemm_iutcopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CHEMMUTCOPY_M) | ||||
| 	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER $< -ULOWER -o $@ | ||||
| else | ||||
| $(KDIR)chemm_iutcopy$(TSUFFIX).$(SUFFIX) : generic/zhemm_utcopy_$(CGEMM_UNROLL_M).c | ||||
| 	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER $< -ULOWER -o $@ | ||||
| endif | ||||
| 
 | ||||
| # chemm_iltcopy: compile $(KERNELDIR)/$(CHEMMLTCOPY_M) when that variable is | ||||
| # defined, otherwise generic/zhemm_ltcopy_$(CGEMM_UNROLL_M).c (same flags). | ||||
| ifdef CHEMMLTCOPY_M | ||||
| $(KDIR)chemm_iltcopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CHEMMLTCOPY_M) | ||||
| 	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER $< -DLOWER -o $@ | ||||
| else | ||||
| $(KDIR)chemm_iltcopy$(TSUFFIX).$(SUFFIX) : generic/zhemm_ltcopy_$(CGEMM_UNROLL_M).c | ||||
| 	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER $< -DLOWER -o $@ | ||||
| endif | ||||
| 
 | ||||
| $(KDIR)zhemm_outcopy$(TSUFFIX).$(SUFFIX) : generic/zhemm_utcopy_$(ZGEMM_UNROLL_N).c | ||||
| 	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER $< -ULOWER -o $@ | ||||
|  | @ -1945,11 +2043,21 @@ $(KDIR)zhemm_outcopy$(TSUFFIX).$(SUFFIX) : generic/zhemm_utcopy_$(ZGEMM_UNROLL_N | |||
| $(KDIR)zhemm_oltcopy$(TSUFFIX).$(SUFFIX) : generic/zhemm_ltcopy_$(ZGEMM_UNROLL_N).c | ||||
| 	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER $< -DLOWER -o $@ | ||||
| 
 | ||||
| # zhemm_iutcopy: compile $(KERNELDIR)/$(ZHEMMUTCOPY_M) when that variable is | ||||
| # defined, otherwise generic/zhemm_utcopy_$(ZGEMM_UNROLL_M).c (same flags). | ||||
| ifdef ZHEMMUTCOPY_M | ||||
| $(KDIR)zhemm_iutcopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZHEMMUTCOPY_M) | ||||
| 	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER $< -ULOWER -o $@ | ||||
| else | ||||
| $(KDIR)zhemm_iutcopy$(TSUFFIX).$(SUFFIX) : generic/zhemm_utcopy_$(ZGEMM_UNROLL_M).c | ||||
| 	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER $< -ULOWER -o $@ | ||||
| endif | ||||
| 
 | ||||
| # zhemm_iltcopy: compile $(KERNELDIR)/$(ZHEMMLTCOPY_M) when that variable is | ||||
| # defined, otherwise generic/zhemm_ltcopy_$(ZGEMM_UNROLL_M).c (same flags). | ||||
| ifdef ZHEMMLTCOPY_M | ||||
| $(KDIR)zhemm_iltcopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZHEMMLTCOPY_M) | ||||
| 	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER $< -DLOWER -o $@ | ||||
| else | ||||
| $(KDIR)zhemm_iltcopy$(TSUFFIX).$(SUFFIX) : generic/zhemm_ltcopy_$(ZGEMM_UNROLL_M).c | ||||
| 	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER $< -DLOWER -o $@ | ||||
| endif | ||||
| 
 | ||||
| $(KDIR)xhemm_outcopy$(TSUFFIX).$(SUFFIX) : generic/zhemm_utcopy_$(XGEMM_UNROLL_N).c | ||||
| 	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER $< -ULOWER -o $@ | ||||
|  | @ -2287,29 +2395,61 @@ $(KDIR)xhemm3m_iucopyi$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(XGEMM3M_UNR | |||
| $(KDIR)xhemm3m_ilcopyi$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(XGEMM3M_UNROLL_M).c | ||||
| 	$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ | ||||
| 
 | ||||
| # strsm_iunucopy (-DUNIT) and strsm_iunncopy (-UUNIT): compile | ||||
| # $(KERNELDIR)/$(TRSMCOPYUN_M) when that variable is defined, otherwise | ||||
| # generic/trsm_uncopy_$(SGEMM_UNROLL_M).c. | ||||
| ifdef TRSMCOPYUN_M | ||||
| $(KDIR)strsm_iunucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(TRSMCOPYUN_M) | ||||
| 	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ | ||||
| 
 | ||||
| $(KDIR)strsm_iunncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(TRSMCOPYUN_M) | ||||
| 	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ | ||||
| else | ||||
| $(KDIR)strsm_iunucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_uncopy_$(SGEMM_UNROLL_M).c | ||||
| 	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ | ||||
| 
 | ||||
| $(KDIR)strsm_iunncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_uncopy_$(SGEMM_UNROLL_M).c | ||||
| 	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ | ||||
| endif | ||||
| 
 | ||||
| # strsm_ilnucopy (-DUNIT) and strsm_ilnncopy (-UUNIT), both with -DLOWER: | ||||
| # compile $(KERNELDIR)/$(TRSMCOPYLN_M) when that variable is defined, | ||||
| # otherwise generic/trsm_lncopy_$(SGEMM_UNROLL_M).c. | ||||
| ifdef TRSMCOPYLN_M | ||||
| $(KDIR)strsm_ilnucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(TRSMCOPYLN_M) | ||||
| 	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ | ||||
| 
 | ||||
| $(KDIR)strsm_ilnncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(TRSMCOPYLN_M) | ||||
| 	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ | ||||
| else | ||||
| $(KDIR)strsm_ilnucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_lncopy_$(SGEMM_UNROLL_M).c | ||||
| 	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ | ||||
| 
 | ||||
| $(KDIR)strsm_ilnncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_lncopy_$(SGEMM_UNROLL_M).c | ||||
| 	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ | ||||
| endif | ||||
| 
 | ||||
| # strsm_iutucopy (-DUNIT) and strsm_iutncopy (-UUNIT): compile | ||||
| # $(KERNELDIR)/$(TRSMCOPYUT_M) when that variable is defined, otherwise | ||||
| # generic/trsm_utcopy_$(SGEMM_UNROLL_M).c. | ||||
| ifdef TRSMCOPYUT_M | ||||
| $(KDIR)strsm_iutucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(TRSMCOPYUT_M) | ||||
| 	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ | ||||
| 
 | ||||
| $(KDIR)strsm_iutncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(TRSMCOPYUT_M) | ||||
| 	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ | ||||
| else | ||||
| $(KDIR)strsm_iutucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_utcopy_$(SGEMM_UNROLL_M).c | ||||
| 	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ | ||||
| 
 | ||||
| $(KDIR)strsm_iutncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_utcopy_$(SGEMM_UNROLL_M).c | ||||
| 	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ | ||||
| endif | ||||
| 
 | ||||
| # strsm_iltucopy (-DUNIT) and strsm_iltncopy (-UUNIT), both with -DLOWER: | ||||
| # compile $(KERNELDIR)/$(TRSMCOPYLT_M) when that variable is defined, | ||||
| # otherwise generic/trsm_ltcopy_$(SGEMM_UNROLL_M).c. | ||||
| ifdef TRSMCOPYLT_M | ||||
| $(KDIR)strsm_iltucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(TRSMCOPYLT_M) | ||||
| 	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ | ||||
| 
 | ||||
| $(KDIR)strsm_iltncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(TRSMCOPYLT_M) | ||||
| 	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ | ||||
| else | ||||
| $(KDIR)strsm_iltucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_ltcopy_$(SGEMM_UNROLL_M).c | ||||
| 	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ | ||||
| 
 | ||||
| $(KDIR)strsm_iltncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_ltcopy_$(SGEMM_UNROLL_M).c | ||||
| 	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ | ||||
| endif | ||||
| 
 | ||||
| $(KDIR)strsm_ounucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_uncopy_$(SGEMM_UNROLL_N).c | ||||
| 	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ | ||||
|  | @ -2335,29 +2475,61 @@ $(KDIR)strsm_oltucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_ltcopy_$(SGEMM_UNROLL_N | |||
| $(KDIR)strsm_oltncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_ltcopy_$(SGEMM_UNROLL_N).c | ||||
| 	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ | ||||
| 
 | ||||
| # dtrsm_iunucopy (-DUNIT) and dtrsm_iunncopy (-UUNIT), built with -DDOUBLE: | ||||
| # compile $(KERNELDIR)/$(TRSMCOPYUN_M) when that variable is defined, | ||||
| # otherwise generic/trsm_uncopy_$(DGEMM_UNROLL_M).c. | ||||
| ifdef TRSMCOPYUN_M | ||||
| $(KDIR)dtrsm_iunucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(TRSMCOPYUN_M) | ||||
| 	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ | ||||
| 
 | ||||
| $(KDIR)dtrsm_iunncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(TRSMCOPYUN_M) | ||||
| 	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ | ||||
| else | ||||
| $(KDIR)dtrsm_iunucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_uncopy_$(DGEMM_UNROLL_M).c | ||||
| 	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ | ||||
| 
 | ||||
| $(KDIR)dtrsm_iunncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_uncopy_$(DGEMM_UNROLL_M).c | ||||
| 	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ | ||||
| endif | ||||
| 
 | ||||
| # dtrsm_ilnucopy (-DUNIT) and dtrsm_ilnncopy (-UUNIT), built with -DDOUBLE | ||||
| # and -DLOWER: compile $(KERNELDIR)/$(TRSMCOPYLN_M) when that variable is | ||||
| # defined, otherwise generic/trsm_lncopy_$(DGEMM_UNROLL_M).c. | ||||
| ifdef TRSMCOPYLN_M | ||||
| $(KDIR)dtrsm_ilnucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(TRSMCOPYLN_M) | ||||
| 	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ | ||||
| 
 | ||||
| $(KDIR)dtrsm_ilnncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(TRSMCOPYLN_M) | ||||
| 	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ | ||||
| else | ||||
| $(KDIR)dtrsm_ilnucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_lncopy_$(DGEMM_UNROLL_M).c | ||||
| 	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ | ||||
| 
 | ||||
| $(KDIR)dtrsm_ilnncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_lncopy_$(DGEMM_UNROLL_M).c | ||||
| 	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ | ||||
| endif | ||||
| 
 | ||||
| # dtrsm_iutucopy (-DUNIT) and dtrsm_iutncopy (-UUNIT), built with -DDOUBLE: | ||||
| # compile $(KERNELDIR)/$(TRSMCOPYUT_M) when that variable is defined, | ||||
| # otherwise generic/trsm_utcopy_$(DGEMM_UNROLL_M).c. | ||||
| ifdef TRSMCOPYUT_M | ||||
| $(KDIR)dtrsm_iutucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(TRSMCOPYUT_M) | ||||
| 	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ | ||||
| 
 | ||||
| $(KDIR)dtrsm_iutncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(TRSMCOPYUT_M) | ||||
| 	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ | ||||
| else | ||||
| $(KDIR)dtrsm_iutucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_utcopy_$(DGEMM_UNROLL_M).c | ||||
| 	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ | ||||
| 
 | ||||
| $(KDIR)dtrsm_iutncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_utcopy_$(DGEMM_UNROLL_M).c | ||||
| 	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ | ||||
| endif | ||||
| 
 | ||||
| # dtrsm_iltucopy (-DUNIT) and dtrsm_iltncopy (-UUNIT), built with -DDOUBLE | ||||
| # and -DLOWER: compile $(KERNELDIR)/$(TRSMCOPYLT_M) when that variable is | ||||
| # defined, otherwise generic/trsm_ltcopy_$(DGEMM_UNROLL_M).c. | ||||
| ifdef TRSMCOPYLT_M | ||||
| $(KDIR)dtrsm_iltucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(TRSMCOPYLT_M) | ||||
| 	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ | ||||
| 
 | ||||
| $(KDIR)dtrsm_iltncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(TRSMCOPYLT_M) | ||||
| 	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ | ||||
| else | ||||
| $(KDIR)dtrsm_iltucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_ltcopy_$(DGEMM_UNROLL_M).c | ||||
| 	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ | ||||
| 
 | ||||
| $(KDIR)dtrsm_iltncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_ltcopy_$(DGEMM_UNROLL_M).c | ||||
| 	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ | ||||
| endif | ||||
| 
 | ||||
| $(KDIR)dtrsm_ounucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_uncopy_$(DGEMM_UNROLL_N).c | ||||
| 	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ | ||||
|  | @ -2431,29 +2603,61 @@ $(KDIR)qtrsm_oltucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_ltcopy_$(QGEMM_UNROLL_N | |||
| $(KDIR)qtrsm_oltncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_ltcopy_$(QGEMM_UNROLL_N).c | ||||
| 	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ | ||||
| 
 | ||||
| # ctrsm_iunucopy (-DUNIT) and ctrsm_iunncopy (-UUNIT), built with -UDOUBLE: | ||||
| # compile $(KERNELDIR)/$(ZTRSMCOPYUN_M) when that variable is defined, | ||||
| # otherwise generic/ztrsm_uncopy_$(CGEMM_UNROLL_M).c. | ||||
| ifdef ZTRSMCOPYUN_M | ||||
| $(KDIR)ctrsm_iunucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMCOPYUN_M) | ||||
| 	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ | ||||
| 
 | ||||
| $(KDIR)ctrsm_iunncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMCOPYUN_M) | ||||
| 	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ | ||||
| else | ||||
| $(KDIR)ctrsm_iunucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_uncopy_$(CGEMM_UNROLL_M).c | ||||
| 	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ | ||||
| 
 | ||||
| $(KDIR)ctrsm_iunncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_uncopy_$(CGEMM_UNROLL_M).c | ||||
| 	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ | ||||
| endif | ||||
| 
 | ||||
| # ctrsm_ilnucopy (-DUNIT) and ctrsm_ilnncopy (-UUNIT), built with -UDOUBLE | ||||
| # and -DLOWER: compile $(KERNELDIR)/$(ZTRSMCOPYLN_M) when that variable is | ||||
| # defined, otherwise generic/ztrsm_lncopy_$(CGEMM_UNROLL_M).c. | ||||
| ifdef ZTRSMCOPYLN_M | ||||
| $(KDIR)ctrsm_ilnucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMCOPYLN_M) | ||||
| 	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ | ||||
| 
 | ||||
| $(KDIR)ctrsm_ilnncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMCOPYLN_M) | ||||
| 	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ | ||||
| else | ||||
| $(KDIR)ctrsm_ilnucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_lncopy_$(CGEMM_UNROLL_M).c | ||||
| 	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ | ||||
| 
 | ||||
| $(KDIR)ctrsm_ilnncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_lncopy_$(CGEMM_UNROLL_M).c | ||||
| 	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ | ||||
| endif | ||||
| 
 | ||||
| # ctrsm_iutucopy (-DUNIT) and ctrsm_iutncopy (-UUNIT), built with -UDOUBLE: | ||||
| # compile $(KERNELDIR)/$(ZTRSMCOPYUT_M) when that variable is defined, | ||||
| # otherwise generic/ztrsm_utcopy_$(CGEMM_UNROLL_M).c. | ||||
| ifdef ZTRSMCOPYUT_M | ||||
| $(KDIR)ctrsm_iutucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMCOPYUT_M) | ||||
| 	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ | ||||
| 
 | ||||
| $(KDIR)ctrsm_iutncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMCOPYUT_M) | ||||
| 	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ | ||||
| else | ||||
| $(KDIR)ctrsm_iutucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_utcopy_$(CGEMM_UNROLL_M).c | ||||
| 	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ | ||||
| 
 | ||||
| $(KDIR)ctrsm_iutncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_utcopy_$(CGEMM_UNROLL_M).c | ||||
| 	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ | ||||
| endif | ||||
| 
 | ||||
| # ctrsm_iltucopy (-DUNIT) and ctrsm_iltncopy (-UUNIT), built with -UDOUBLE | ||||
| # and -DLOWER: compile $(KERNELDIR)/$(ZTRSMCOPYLT_M) when that variable is | ||||
| # defined, otherwise generic/ztrsm_ltcopy_$(CGEMM_UNROLL_M).c. | ||||
| ifdef ZTRSMCOPYLT_M | ||||
| $(KDIR)ctrsm_iltucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMCOPYLT_M) | ||||
| 	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ | ||||
| 
 | ||||
| $(KDIR)ctrsm_iltncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMCOPYLT_M) | ||||
| 	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ | ||||
| else | ||||
| $(KDIR)ctrsm_iltucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_ltcopy_$(CGEMM_UNROLL_M).c | ||||
| 	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ | ||||
| 
 | ||||
| $(KDIR)ctrsm_iltncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_ltcopy_$(CGEMM_UNROLL_M).c | ||||
| 	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ | ||||
| endif | ||||
| 
 | ||||
| $(KDIR)ctrsm_ounucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_uncopy_$(CGEMM_UNROLL_N).c | ||||
| 	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ | ||||
|  | @ -2479,29 +2683,61 @@ $(KDIR)ctrsm_oltucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_ltcopy_$(CGEMM_UNROLL_ | |||
| $(KDIR)ctrsm_oltncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_ltcopy_$(CGEMM_UNROLL_N).c | ||||
| 	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ | ||||
| 
 | ||||
| # ztrsm_iunucopy (-DUNIT) and ztrsm_iunncopy (-UUNIT), built with -DDOUBLE: | ||||
| # compile $(KERNELDIR)/$(ZTRSMCOPYUN_M) when that variable is defined, | ||||
| # otherwise generic/ztrsm_uncopy_$(ZGEMM_UNROLL_M).c. | ||||
| ifdef ZTRSMCOPYUN_M | ||||
| $(KDIR)ztrsm_iunucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMCOPYUN_M) | ||||
| 	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ | ||||
| 
 | ||||
| $(KDIR)ztrsm_iunncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMCOPYUN_M) | ||||
| 	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ | ||||
| else | ||||
| $(KDIR)ztrsm_iunucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_uncopy_$(ZGEMM_UNROLL_M).c | ||||
| 	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ | ||||
| 
 | ||||
| $(KDIR)ztrsm_iunncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_uncopy_$(ZGEMM_UNROLL_M).c | ||||
| 	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ | ||||
| endif | ||||
| 
 | ||||
| # ztrsm_ilnucopy (-DUNIT) and ztrsm_ilnncopy (-UUNIT), built with -DDOUBLE | ||||
| # and -DLOWER: compile $(KERNELDIR)/$(ZTRSMCOPYLN_M) when that variable is | ||||
| # defined, otherwise generic/ztrsm_lncopy_$(ZGEMM_UNROLL_M).c. | ||||
| ifdef ZTRSMCOPYLN_M | ||||
| $(KDIR)ztrsm_ilnucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMCOPYLN_M) | ||||
| 	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ | ||||
| 
 | ||||
| $(KDIR)ztrsm_ilnncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMCOPYLN_M) | ||||
| 	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ | ||||
| else | ||||
| $(KDIR)ztrsm_ilnucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_lncopy_$(ZGEMM_UNROLL_M).c | ||||
| 	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ | ||||
| 
 | ||||
| $(KDIR)ztrsm_ilnncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_lncopy_$(ZGEMM_UNROLL_M).c | ||||
| 	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ | ||||
| endif | ||||
| 
 | ||||
| # ztrsm_iutucopy (-DUNIT) and ztrsm_iutncopy (-UUNIT), built with -DDOUBLE: | ||||
| # compile $(KERNELDIR)/$(ZTRSMCOPYUT_M) when that variable is defined, | ||||
| # otherwise generic/ztrsm_utcopy_$(ZGEMM_UNROLL_M).c. | ||||
| ifdef ZTRSMCOPYUT_M | ||||
| $(KDIR)ztrsm_iutucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMCOPYUT_M) | ||||
| 	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ | ||||
| 
 | ||||
| $(KDIR)ztrsm_iutncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMCOPYUT_M) | ||||
| 	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ | ||||
| else | ||||
| $(KDIR)ztrsm_iutucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_utcopy_$(ZGEMM_UNROLL_M).c | ||||
| 	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ | ||||
| 
 | ||||
| $(KDIR)ztrsm_iutncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_utcopy_$(ZGEMM_UNROLL_M).c | ||||
| 	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ | ||||
| endif | ||||
| 
 | ||||
| # ztrsm_iltucopy (-DUNIT) and ztrsm_iltncopy (-UUNIT), built with -DDOUBLE | ||||
| # and -DLOWER: compile $(KERNELDIR)/$(ZTRSMCOPYLT_M) when that variable is | ||||
| # defined, otherwise generic/ztrsm_ltcopy_$(ZGEMM_UNROLL_M).c. | ||||
| ifdef ZTRSMCOPYLT_M | ||||
| $(KDIR)ztrsm_iltucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMCOPYLT_M) | ||||
| 	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ | ||||
| 
 | ||||
| $(KDIR)ztrsm_iltncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMCOPYLT_M) | ||||
| 	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ | ||||
| else | ||||
| $(KDIR)ztrsm_iltucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_ltcopy_$(ZGEMM_UNROLL_M).c | ||||
| 	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ | ||||
| 
 | ||||
| $(KDIR)ztrsm_iltncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_ltcopy_$(ZGEMM_UNROLL_M).c | ||||
| 	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ | ||||
| endif | ||||
| 
 | ||||
| $(KDIR)ztrsm_ounucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_uncopy_$(ZGEMM_UNROLL_N).c | ||||
| 	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ | ||||
|  |  | |||
|  | @ -20,25 +20,36 @@ IDMAXKERNEL  = ../arm/imax.c | |||
| ISMINKERNEL  = ../arm/imin.c | ||||
| IDMINKERNEL  = ../arm/imin.c | ||||
| 
 | ||||
| STRSMKERNEL_LN	=  ../generic/trsm_kernel_LN.c | ||||
| STRSMKERNEL_LT	=  ../generic/trsm_kernel_LT.c | ||||
| STRSMKERNEL_RN	=  ../generic/trsm_kernel_RN.c | ||||
| STRSMKERNEL_RT	=  ../generic/trsm_kernel_RT.c | ||||
| STRSMKERNEL_LN	= trsm_kernel_LN_sve.c | ||||
| STRSMKERNEL_LT	= trsm_kernel_LT_sve.c | ||||
| STRSMKERNEL_RN	= trsm_kernel_RN_sve.c | ||||
| STRSMKERNEL_RT	= trsm_kernel_RT_sve.c | ||||
| 
 | ||||
| DTRSMKERNEL_LN	= ../generic/trsm_kernel_LN.c | ||||
| DTRSMKERNEL_LT	= ../generic/trsm_kernel_LT.c | ||||
| DTRSMKERNEL_RN	= ../generic/trsm_kernel_RN.c | ||||
| DTRSMKERNEL_RT	= ../generic/trsm_kernel_RT.c | ||||
| DTRSMKERNEL_LN	= trsm_kernel_LN_sve.c | ||||
| DTRSMKERNEL_LT	= trsm_kernel_LT_sve.c | ||||
| DTRSMKERNEL_RN	= trsm_kernel_RN_sve.c | ||||
| DTRSMKERNEL_RT	= trsm_kernel_RT_sve.c | ||||
| 
 | ||||
| CTRSMKERNEL_LN	= ../generic/trsm_kernel_LN.c | ||||
| CTRSMKERNEL_LT	= ../generic/trsm_kernel_LT.c | ||||
| CTRSMKERNEL_RN	= ../generic/trsm_kernel_RN.c | ||||
| CTRSMKERNEL_RT	= ../generic/trsm_kernel_RT.c | ||||
| TRSMCOPYLN_M    = trsm_lncopy_sve.c | ||||
| TRSMCOPYLT_M    = trsm_ltcopy_sve.c | ||||
| TRSMCOPYUN_M    = trsm_uncopy_sve.c | ||||
| TRSMCOPYUT_M    = trsm_utcopy_sve.c | ||||
| 
 | ||||
| CTRSMKERNEL_LN	= trsm_kernel_LN_sve.c | ||||
| CTRSMKERNEL_LT	= trsm_kernel_LT_sve.c | ||||
| CTRSMKERNEL_RN	= trsm_kernel_RN_sve.c | ||||
| CTRSMKERNEL_RT	= trsm_kernel_RT_sve.c | ||||
| 
 | ||||
| ZTRSMKERNEL_LN	= trsm_kernel_LN_sve.c | ||||
| ZTRSMKERNEL_LT	= trsm_kernel_LT_sve.c | ||||
| ZTRSMKERNEL_RN	= trsm_kernel_RN_sve.c | ||||
| ZTRSMKERNEL_RT	= trsm_kernel_RT_sve.c | ||||
| 
 | ||||
| ZTRSMCOPYLN_M    = ztrsm_lncopy_sve.c | ||||
| ZTRSMCOPYLT_M    = ztrsm_ltcopy_sve.c | ||||
| ZTRSMCOPYUN_M    = ztrsm_uncopy_sve.c | ||||
| ZTRSMCOPYUT_M    = ztrsm_utcopy_sve.c | ||||
| 
 | ||||
| ZTRSMKERNEL_LN	= ../generic/trsm_kernel_LN.c | ||||
| ZTRSMKERNEL_LT	= ../generic/trsm_kernel_LT.c | ||||
| ZTRSMKERNEL_RN	= ../generic/trsm_kernel_RN.c | ||||
| ZTRSMKERNEL_RT	= ../generic/trsm_kernel_RT.c | ||||
| 
 | ||||
| SAMAXKERNEL  = amax.S | ||||
| DAMAXKERNEL  = amax.S | ||||
|  | @ -156,28 +167,50 @@ DTRMMLTCOPY_M  =  trmm_ltcopy_sve_v1.c | |||
| DSYMMUCOPY_M    =  symm_ucopy_sve.c | ||||
| DSYMMLCOPY_M    =  symm_lcopy_sve.c | ||||
| 
 | ||||
| CGEMMKERNEL    =  cgemm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S | ||||
| CTRMMKERNEL    =  ctrmm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S | ||||
| ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N)) | ||||
| CGEMMINCOPY    =  ../generic/zgemm_ncopy_$(CGEMM_UNROLL_M).c | ||||
| CGEMMITCOPY    =  ../generic/zgemm_tcopy_$(CGEMM_UNROLL_M).c | ||||
| CGEMMKERNEL    =  cgemm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S | ||||
| CTRMMKERNEL    =  ctrmm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S | ||||
| 
 | ||||
| CGEMMINCOPY    =  cgemm_ncopy_sve_v1.c | ||||
| CGEMMITCOPY    =  cgemm_tcopy_sve_v1.c | ||||
| CGEMMONCOPY    =  ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c | ||||
| CGEMMOTCOPY    =  ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c | ||||
| 
 | ||||
| CGEMMINCOPYOBJ =  cgemm_incopy$(TSUFFIX).$(SUFFIX) | ||||
| CGEMMITCOPYOBJ =  cgemm_itcopy$(TSUFFIX).$(SUFFIX) | ||||
| endif | ||||
| CGEMMONCOPY    =  ../generic/zgemm_ncopy_$(CGEMM_UNROLL_N).c | ||||
| CGEMMOTCOPY    =  ../generic/zgemm_tcopy_$(CGEMM_UNROLL_N).c | ||||
| CGEMMONCOPYOBJ =  cgemm_oncopy$(TSUFFIX).$(SUFFIX) | ||||
| CGEMMOTCOPYOBJ =  cgemm_otcopy$(TSUFFIX).$(SUFFIX) | ||||
| 
 | ||||
| ZGEMMKERNEL    =  zgemm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S | ||||
| ZTRMMKERNEL    =  ztrmm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S | ||||
| ifneq ($(ZGEMM_UNROLL_M), $(ZGEMM_UNROLL_N)) | ||||
| ZGEMMINCOPY    =  ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_M).c | ||||
| ZGEMMITCOPY    =  ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_M).c | ||||
| ZGEMMINCOPYOBJ =  zgemm_incopy$(TSUFFIX).$(SUFFIX) | ||||
| ZGEMMITCOPYOBJ =  zgemm_itcopy$(TSUFFIX).$(SUFFIX) | ||||
| endif | ||||
| CTRMMUNCOPY_M  =  ztrmm_uncopy_sve_v1.c | ||||
| CTRMMLNCOPY_M  =  ztrmm_lncopy_sve_v1.c | ||||
| CTRMMUTCOPY_M  =  ztrmm_utcopy_sve_v1.c | ||||
| CTRMMLTCOPY_M  =  ztrmm_ltcopy_sve_v1.c | ||||
| 
 | ||||
| CHEMMLTCOPY_M    =  zhemm_ltcopy_sve.c | ||||
| CHEMMUTCOPY_M    =  zhemm_utcopy_sve.c | ||||
| 
 | ||||
| CSYMMUCOPY_M    =  zsymm_ucopy_sve.c | ||||
| CSYMMLCOPY_M    =  zsymm_lcopy_sve.c | ||||
| 
 | ||||
| ZGEMMKERNEL    =  zgemm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S | ||||
| ZTRMMKERNEL    =  ztrmm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S | ||||
| 
 | ||||
| ZGEMMINCOPY    =  zgemm_ncopy_sve_v1.c | ||||
| ZGEMMITCOPY    =  zgemm_tcopy_sve_v1.c | ||||
| ZGEMMONCOPY    =  ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c | ||||
| ZGEMMOTCOPY    =  ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c | ||||
| 
 | ||||
| ZGEMMINCOPYOBJ =  zgemm_incopy$(TSUFFIX).$(SUFFIX) | ||||
| ZGEMMITCOPYOBJ =  zgemm_itcopy$(TSUFFIX).$(SUFFIX) | ||||
| ZGEMMONCOPYOBJ =  zgemm_oncopy$(TSUFFIX).$(SUFFIX) | ||||
| ZGEMMOTCOPYOBJ =  zgemm_otcopy$(TSUFFIX).$(SUFFIX) | ||||
| 
 | ||||
| ZTRMMUNCOPY_M  =  ztrmm_uncopy_sve_v1.c | ||||
| ZTRMMLNCOPY_M  =  ztrmm_lncopy_sve_v1.c | ||||
| ZTRMMUTCOPY_M  =  ztrmm_utcopy_sve_v1.c | ||||
| ZTRMMLTCOPY_M  =  ztrmm_ltcopy_sve_v1.c | ||||
| 
 | ||||
| ZHEMMLTCOPY_M    =  zhemm_ltcopy_sve.c | ||||
| ZHEMMUTCOPY_M    =  zhemm_utcopy_sve.c | ||||
| 
 | ||||
| ZSYMMUCOPY_M    =  zsymm_ucopy_sve.c | ||||
| ZSYMMLCOPY_M    =  zsymm_lcopy_sve.c | ||||
|  |  | |||
|  | @ -20,25 +20,36 @@ IDMAXKERNEL  = ../arm/imax.c | |||
| ISMINKERNEL  = ../arm/imin.c | ||||
| IDMINKERNEL  = ../arm/imin.c | ||||
| 
 | ||||
| STRSMKERNEL_LN	=  ../generic/trsm_kernel_LN.c | ||||
| STRSMKERNEL_LT	=  ../generic/trsm_kernel_LT.c | ||||
| STRSMKERNEL_RN	=  ../generic/trsm_kernel_RN.c | ||||
| STRSMKERNEL_RT	=  ../generic/trsm_kernel_RT.c | ||||
| STRSMKERNEL_LN	= trsm_kernel_LN_sve.c | ||||
| STRSMKERNEL_LT	= trsm_kernel_LT_sve.c | ||||
| STRSMKERNEL_RN	= trsm_kernel_RN_sve.c | ||||
| STRSMKERNEL_RT	= trsm_kernel_RT_sve.c | ||||
| 
 | ||||
| DTRSMKERNEL_LN	= ../generic/trsm_kernel_LN.c | ||||
| DTRSMKERNEL_LT	= ../generic/trsm_kernel_LT.c | ||||
| DTRSMKERNEL_RN	= ../generic/trsm_kernel_RN.c | ||||
| DTRSMKERNEL_RT	= ../generic/trsm_kernel_RT.c | ||||
| DTRSMKERNEL_LN	= trsm_kernel_LN_sve.c | ||||
| DTRSMKERNEL_LT	= trsm_kernel_LT_sve.c | ||||
| DTRSMKERNEL_RN	= trsm_kernel_RN_sve.c | ||||
| DTRSMKERNEL_RT	= trsm_kernel_RT_sve.c | ||||
| 
 | ||||
| CTRSMKERNEL_LN	= ../generic/trsm_kernel_LN.c | ||||
| CTRSMKERNEL_LT	= ../generic/trsm_kernel_LT.c | ||||
| CTRSMKERNEL_RN	= ../generic/trsm_kernel_RN.c | ||||
| CTRSMKERNEL_RT	= ../generic/trsm_kernel_RT.c | ||||
| TRSMCOPYLN_M    = trsm_lncopy_sve.c | ||||
| TRSMCOPYLT_M    = trsm_ltcopy_sve.c | ||||
| TRSMCOPYUN_M    = trsm_uncopy_sve.c | ||||
| TRSMCOPYUT_M    = trsm_utcopy_sve.c | ||||
| 
 | ||||
| CTRSMKERNEL_LN	= trsm_kernel_LN_sve.c | ||||
| CTRSMKERNEL_LT	= trsm_kernel_LT_sve.c | ||||
| CTRSMKERNEL_RN	= trsm_kernel_RN_sve.c | ||||
| CTRSMKERNEL_RT	= trsm_kernel_RT_sve.c | ||||
| 
 | ||||
| ZTRSMKERNEL_LN	= trsm_kernel_LN_sve.c | ||||
| ZTRSMKERNEL_LT	= trsm_kernel_LT_sve.c | ||||
| ZTRSMKERNEL_RN	= trsm_kernel_RN_sve.c | ||||
| ZTRSMKERNEL_RT	= trsm_kernel_RT_sve.c | ||||
| 
 | ||||
| ZTRSMCOPYLN_M    = ztrsm_lncopy_sve.c | ||||
| ZTRSMCOPYLT_M    = ztrsm_ltcopy_sve.c | ||||
| ZTRSMCOPYUN_M    = ztrsm_uncopy_sve.c | ||||
| ZTRSMCOPYUT_M    = ztrsm_utcopy_sve.c | ||||
| 
 | ||||
| ZTRSMKERNEL_LN	= ../generic/trsm_kernel_LN.c | ||||
| ZTRSMKERNEL_LT	= ../generic/trsm_kernel_LT.c | ||||
| ZTRSMKERNEL_RN	= ../generic/trsm_kernel_RN.c | ||||
| ZTRSMKERNEL_RT	= ../generic/trsm_kernel_RT.c | ||||
| 
 | ||||
| SAMAXKERNEL  = amax.S | ||||
| DAMAXKERNEL  = amax.S | ||||
|  | @ -140,8 +151,8 @@ DTRMMKERNEL    =  dtrmm_kernel_sve_v1x$(DGEMM_UNROLL_N).S | |||
| 
 | ||||
| DGEMMINCOPY    =  dgemm_ncopy_sve_v1.c | ||||
| DGEMMITCOPY    =  dgemm_tcopy_sve_v1.c | ||||
| DGEMMONCOPY    =  ../generic/gemm_ncopy_$(DGEMM_UNROLL_N).c | ||||
| DGEMMOTCOPY    =  ../generic/gemm_tcopy_$(DGEMM_UNROLL_N).c | ||||
| DGEMMONCOPY    =  dgemm_ncopy_$(DGEMM_UNROLL_N).S | ||||
| DGEMMOTCOPY    =  dgemm_tcopy_$(DGEMM_UNROLL_N).S | ||||
| 
 | ||||
| DGEMMINCOPYOBJ =  dgemm_incopy$(TSUFFIX).$(SUFFIX) | ||||
| DGEMMITCOPYOBJ =  dgemm_itcopy$(TSUFFIX).$(SUFFIX) | ||||
|  | @ -156,28 +167,50 @@ DTRMMLTCOPY_M  =  trmm_ltcopy_sve_v1.c | |||
| DSYMMUCOPY_M    =  symm_ucopy_sve.c | ||||
| DSYMMLCOPY_M    =  symm_lcopy_sve.c | ||||
| 
 | ||||
| CGEMMKERNEL    =  cgemm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S | ||||
| CTRMMKERNEL    =  ctrmm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S | ||||
| ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N)) | ||||
| CGEMMINCOPY    =  ../generic/zgemm_ncopy_$(CGEMM_UNROLL_M).c | ||||
| CGEMMITCOPY    =  ../generic/zgemm_tcopy_$(CGEMM_UNROLL_M).c | ||||
| CGEMMKERNEL    =  cgemm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S | ||||
| CTRMMKERNEL    =  ctrmm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S | ||||
| 
 | ||||
| CGEMMINCOPY    =  cgemm_ncopy_sve_v1.c | ||||
| CGEMMITCOPY    =  cgemm_tcopy_sve_v1.c | ||||
| CGEMMONCOPY    =  ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c | ||||
| CGEMMOTCOPY    =  ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c | ||||
| 
 | ||||
| CGEMMINCOPYOBJ =  cgemm_incopy$(TSUFFIX).$(SUFFIX) | ||||
| CGEMMITCOPYOBJ =  cgemm_itcopy$(TSUFFIX).$(SUFFIX) | ||||
| endif | ||||
| CGEMMONCOPY    =  ../generic/zgemm_ncopy_$(CGEMM_UNROLL_N).c | ||||
| CGEMMOTCOPY    =  ../generic/zgemm_tcopy_$(CGEMM_UNROLL_N).c | ||||
| CGEMMONCOPYOBJ =  cgemm_oncopy$(TSUFFIX).$(SUFFIX) | ||||
| CGEMMOTCOPYOBJ =  cgemm_otcopy$(TSUFFIX).$(SUFFIX) | ||||
| 
 | ||||
| ZGEMMKERNEL    =  zgemm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S | ||||
| ZTRMMKERNEL    =  ztrmm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S | ||||
| ifneq ($(ZGEMM_UNROLL_M), $(ZGEMM_UNROLL_N)) | ||||
| ZGEMMINCOPY    =  ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_M).c | ||||
| ZGEMMITCOPY    =  ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_M).c | ||||
| ZGEMMINCOPYOBJ =  zgemm_incopy$(TSUFFIX).$(SUFFIX) | ||||
| ZGEMMITCOPYOBJ =  zgemm_itcopy$(TSUFFIX).$(SUFFIX) | ||||
| endif | ||||
| CTRMMUNCOPY_M  =  ztrmm_uncopy_sve_v1.c | ||||
| CTRMMLNCOPY_M  =  ztrmm_lncopy_sve_v1.c | ||||
| CTRMMUTCOPY_M  =  ztrmm_utcopy_sve_v1.c | ||||
| CTRMMLTCOPY_M  =  ztrmm_ltcopy_sve_v1.c | ||||
| 
 | ||||
| CHEMMLTCOPY_M    =  zhemm_ltcopy_sve.c | ||||
| CHEMMUTCOPY_M    =  zhemm_utcopy_sve.c | ||||
| 
 | ||||
| CSYMMUCOPY_M    =  zsymm_ucopy_sve.c | ||||
| CSYMMLCOPY_M    =  zsymm_lcopy_sve.c | ||||
| 
 | ||||
| ZGEMMKERNEL    =  zgemm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S | ||||
| ZTRMMKERNEL    =  ztrmm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S | ||||
| 
 | ||||
| ZGEMMINCOPY    =  zgemm_ncopy_sve_v1.c | ||||
| ZGEMMITCOPY    =  zgemm_tcopy_sve_v1.c | ||||
| ZGEMMONCOPY    =  ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c | ||||
| ZGEMMOTCOPY    =  ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c | ||||
| 
 | ||||
| ZGEMMINCOPYOBJ =  zgemm_incopy$(TSUFFIX).$(SUFFIX) | ||||
| ZGEMMITCOPYOBJ =  zgemm_itcopy$(TSUFFIX).$(SUFFIX) | ||||
| ZGEMMONCOPYOBJ =  zgemm_oncopy$(TSUFFIX).$(SUFFIX) | ||||
| ZGEMMOTCOPYOBJ =  zgemm_otcopy$(TSUFFIX).$(SUFFIX) | ||||
| 
 | ||||
| ZTRMMUNCOPY_M  =  ztrmm_uncopy_sve_v1.c | ||||
| ZTRMMLNCOPY_M  =  ztrmm_lncopy_sve_v1.c | ||||
| ZTRMMUTCOPY_M  =  ztrmm_utcopy_sve_v1.c | ||||
| ZTRMMLTCOPY_M  =  ztrmm_ltcopy_sve_v1.c | ||||
| 
 | ||||
| ZHEMMLTCOPY_M    =  zhemm_ltcopy_sve.c | ||||
| ZHEMMUTCOPY_M    =  zhemm_utcopy_sve.c | ||||
| 
 | ||||
| ZSYMMUCOPY_M    =  zsymm_ucopy_sve.c | ||||
| ZSYMMLCOPY_M    =  zsymm_lcopy_sve.c | ||||
|  |  | |||
|  | @ -0,0 +1,189 @@ | |||
| SAMINKERNEL  = ../arm/amin.c | ||||
| DAMINKERNEL  = ../arm/amin.c | ||||
| CAMINKERNEL  = ../arm/zamin.c | ||||
| ZAMINKERNEL  = ../arm/zamin.c | ||||
| 
 | ||||
| SMAXKERNEL   = ../arm/max.c | ||||
| DMAXKERNEL   = ../arm/max.c | ||||
| 
 | ||||
| SMINKERNEL   = ../arm/min.c | ||||
| DMINKERNEL   = ../arm/min.c | ||||
| 
 | ||||
| ISAMINKERNEL = ../arm/iamin.c | ||||
| IDAMINKERNEL = ../arm/iamin.c | ||||
| ICAMINKERNEL = ../arm/izamin.c | ||||
| IZAMINKERNEL = ../arm/izamin.c | ||||
| 
 | ||||
| ISMAXKERNEL  = ../arm/imax.c | ||||
| IDMAXKERNEL  = ../arm/imax.c | ||||
| 
 | ||||
| ISMINKERNEL  = ../arm/imin.c | ||||
| IDMINKERNEL  = ../arm/imin.c | ||||
| 
 | ||||
| STRSMKERNEL_LN	=  ../generic/trsm_kernel_LN.c | ||||
| STRSMKERNEL_LT	=  ../generic/trsm_kernel_LT.c | ||||
| STRSMKERNEL_RN	=  ../generic/trsm_kernel_RN.c | ||||
| STRSMKERNEL_RT	=  ../generic/trsm_kernel_RT.c | ||||
| 
 | ||||
| DTRSMKERNEL_LN	= ../generic/trsm_kernel_LN.c | ||||
| DTRSMKERNEL_LT	= ../generic/trsm_kernel_LT.c | ||||
| DTRSMKERNEL_RN	= ../generic/trsm_kernel_RN.c | ||||
| DTRSMKERNEL_RT	= ../generic/trsm_kernel_RT.c | ||||
| 
 | ||||
| CTRSMKERNEL_LN	= ../generic/trsm_kernel_LN.c | ||||
| CTRSMKERNEL_LT	= ../generic/trsm_kernel_LT.c | ||||
| CTRSMKERNEL_RN	= ../generic/trsm_kernel_RN.c | ||||
| CTRSMKERNEL_RT	= ../generic/trsm_kernel_RT.c | ||||
| 
 | ||||
| ZTRSMKERNEL_LN	= ../generic/trsm_kernel_LN.c | ||||
| ZTRSMKERNEL_LT	= ../generic/trsm_kernel_LT.c | ||||
| ZTRSMKERNEL_RN	= ../generic/trsm_kernel_RN.c | ||||
| ZTRSMKERNEL_RT	= ../generic/trsm_kernel_RT.c | ||||
| 
 | ||||
| SAMAXKERNEL  = amax.S | ||||
| DAMAXKERNEL  = amax.S | ||||
| CAMAXKERNEL  = zamax.S | ||||
| ZAMAXKERNEL  = zamax.S | ||||
| 
 | ||||
| SAXPYKERNEL  = axpy.S | ||||
| DAXPYKERNEL  = daxpy_thunderx2t99.S | ||||
| CAXPYKERNEL  = zaxpy.S | ||||
| ZAXPYKERNEL  = zaxpy.S | ||||
| 
 | ||||
| SROTKERNEL   = rot.S | ||||
| DROTKERNEL   = rot.S | ||||
| CROTKERNEL   = zrot.S | ||||
| ZROTKERNEL   = zrot.S | ||||
| 
 | ||||
| SSCALKERNEL  = scal.S | ||||
| DSCALKERNEL  = scal.S | ||||
| CSCALKERNEL  = zscal.S | ||||
| ZSCALKERNEL  = zscal.S | ||||
| 
 | ||||
| SGEMVNKERNEL = gemv_n.S | ||||
| DGEMVNKERNEL = gemv_n.S | ||||
| CGEMVNKERNEL = zgemv_n.S | ||||
| ZGEMVNKERNEL = zgemv_n.S | ||||
| 
 | ||||
| SGEMVTKERNEL = gemv_t.S | ||||
| DGEMVTKERNEL = gemv_t.S | ||||
| CGEMVTKERNEL = zgemv_t.S | ||||
| ZGEMVTKERNEL = zgemv_t.S | ||||
| 
 | ||||
| 
 | ||||
| SASUMKERNEL    = sasum_thunderx2t99.c | ||||
| DASUMKERNEL    = dasum_thunderx2t99.c | ||||
| CASUMKERNEL    = casum_thunderx2t99.c | ||||
| ZASUMKERNEL    = zasum_thunderx2t99.c | ||||
| 
 | ||||
| SCOPYKERNEL    = copy_thunderx2t99.c | ||||
| DCOPYKERNEL    = copy_thunderx2t99.c | ||||
| CCOPYKERNEL    = copy_thunderx2t99.c | ||||
| ZCOPYKERNEL    = copy_thunderx2t99.c | ||||
| 
 | ||||
| SSWAPKERNEL    = swap_thunderx2t99.S | ||||
| DSWAPKERNEL    = swap_thunderx2t99.S | ||||
| CSWAPKERNEL    = swap_thunderx2t99.S | ||||
| ZSWAPKERNEL    = swap_thunderx2t99.S | ||||
| 
 | ||||
| ISAMAXKERNEL   = iamax_thunderx2t99.c | ||||
| IDAMAXKERNEL   = iamax_thunderx2t99.c | ||||
| ICAMAXKERNEL   = izamax_thunderx2t99.c | ||||
| IZAMAXKERNEL   = izamax_thunderx2t99.c | ||||
| 
 | ||||
| SNRM2KERNEL    = scnrm2_thunderx2t99.c | ||||
| DNRM2KERNEL    = dznrm2_thunderx2t99.c | ||||
| CNRM2KERNEL    = scnrm2_thunderx2t99.c | ||||
| ZNRM2KERNEL    = dznrm2_thunderx2t99.c | ||||
| 
 | ||||
| DDOTKERNEL     = dot_thunderx2t99.c | ||||
| SDOTKERNEL     = dot_thunderx2t99.c | ||||
| CDOTKERNEL     = zdot_thunderx2t99.c | ||||
| ZDOTKERNEL     = zdot_thunderx2t99.c | ||||
| DSDOTKERNEL    = dot.S | ||||
| 
 | ||||
| DGEMM_BETA     = dgemm_beta.S | ||||
| SGEMM_BETA     = sgemm_beta.S | ||||
| 
 | ||||
| # SGEMM/STRMM kernel and copy-routine selection, keyed on SGEMM_UNROLL_M / SGEMM_UNROLL_N. | ||||
| # The IN/IT copy sources and their object names are set only when the two unroll factors differ. | ||||
| # An assembly tcopy is used for unroll 16 and an assembly ncopy for unroll 4; | ||||
| # any other unroll value uses ../generic/gemm_tcopy_<unroll>.c or ../generic/gemm_ncopy_<unroll>.c. | ||||
| SGEMMKERNEL    =  sgemm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S | ||||
| STRMMKERNEL    =  strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S | ||||
| ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N)) | ||||
| ifeq ($(SGEMM_UNROLL_M), 16) | ||||
| SGEMMITCOPY    =  sgemm_tcopy_$(SGEMM_UNROLL_M).S | ||||
| else | ||||
| SGEMMITCOPY    =  ../generic/gemm_tcopy_$(SGEMM_UNROLL_M).c | ||||
| endif | ||||
| ifeq ($(SGEMM_UNROLL_M), 4) | ||||
| SGEMMINCOPY    =  sgemm_ncopy_$(SGEMM_UNROLL_M).S | ||||
| else | ||||
| SGEMMINCOPY    =  ../generic/gemm_ncopy_$(SGEMM_UNROLL_M).c | ||||
| endif | ||||
| SGEMMINCOPYOBJ =  sgemm_incopy$(TSUFFIX).$(SUFFIX) | ||||
| SGEMMITCOPYOBJ =  sgemm_itcopy$(TSUFFIX).$(SUFFIX) | ||||
| endif | ||||
| # The ON/OT copy routines are always set, using the same 16/4 rule applied to SGEMM_UNROLL_N. | ||||
| ifeq ($(SGEMM_UNROLL_N), 16) | ||||
| SGEMMOTCOPY    =  sgemm_tcopy_$(SGEMM_UNROLL_N).S | ||||
| else | ||||
| SGEMMOTCOPY    =  ../generic/gemm_tcopy_$(SGEMM_UNROLL_N).c | ||||
| endif | ||||
| ifeq ($(SGEMM_UNROLL_N), 4) | ||||
| SGEMMONCOPY    =  sgemm_ncopy_$(SGEMM_UNROLL_N).S | ||||
| else | ||||
| SGEMMONCOPY    =  ../generic/gemm_ncopy_$(SGEMM_UNROLL_N).c | ||||
| endif | ||||
| SGEMMONCOPYOBJ =  sgemm_oncopy$(TSUFFIX).$(SUFFIX) | ||||
| SGEMMOTCOPYOBJ =  sgemm_otcopy$(TSUFFIX).$(SUFFIX) | ||||
| 
 | ||||
| DGEMMKERNEL    =  dgemm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S | ||||
| DTRMMKERNEL    =  dtrmm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S | ||||
| 
 | ||||
| ifneq ($(DGEMM_UNROLL_M), $(DGEMM_UNROLL_N)) | ||||
| 
 | ||||
| ifeq ($(DGEMM_UNROLL_M), 8) | ||||
| DGEMMINCOPY    =  dgemm_ncopy_$(DGEMM_UNROLL_M).S | ||||
| DGEMMITCOPY    =  dgemm_tcopy_$(DGEMM_UNROLL_M).S | ||||
| else | ||||
| DGEMMINCOPY    =  ../generic/gemm_ncopy_$(DGEMM_UNROLL_M).c | ||||
| DGEMMITCOPY    =  ../generic/gemm_tcopy_$(DGEMM_UNROLL_M).c | ||||
| endif | ||||
| 
 | ||||
| DGEMMINCOPYOBJ =  dgemm_incopy$(TSUFFIX).$(SUFFIX) | ||||
| DGEMMITCOPYOBJ =  dgemm_itcopy$(TSUFFIX).$(SUFFIX) | ||||
| endif | ||||
| 
 | ||||
| ifeq ($(DGEMM_UNROLL_N), 4) | ||||
| DGEMMONCOPY    =  dgemm_ncopy_$(DGEMM_UNROLL_N).S | ||||
| DGEMMOTCOPY    =  dgemm_tcopy_$(DGEMM_UNROLL_N).S | ||||
| else | ||||
| DGEMMONCOPY    =  ../generic/gemm_ncopy_$(DGEMM_UNROLL_N).c | ||||
| DGEMMOTCOPY    =  ../generic/gemm_tcopy_$(DGEMM_UNROLL_N).c | ||||
| endif | ||||
| 
 | ||||
| DGEMMONCOPYOBJ =  dgemm_oncopy$(TSUFFIX).$(SUFFIX) | ||||
| DGEMMOTCOPYOBJ =  dgemm_otcopy$(TSUFFIX).$(SUFFIX) | ||||
| 
 | ||||
| # CGEMM/CTRMM kernel and copy-routine selection, keyed on CGEMM_UNROLL_M / CGEMM_UNROLL_N. | ||||
| # All copy routines come from the generic zgemm_ncopy/zgemm_tcopy sources; the IN/IT variants | ||||
| # and their object names are set only when the two unroll factors differ. | ||||
| CGEMMKERNEL    =  cgemm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S | ||||
| CTRMMKERNEL    =  ctrmm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S | ||||
| ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N)) | ||||
| CGEMMINCOPY    =  ../generic/zgemm_ncopy_$(CGEMM_UNROLL_M).c | ||||
| CGEMMITCOPY    =  ../generic/zgemm_tcopy_$(CGEMM_UNROLL_M).c | ||||
| CGEMMINCOPYOBJ =  cgemm_incopy$(TSUFFIX).$(SUFFIX) | ||||
| CGEMMITCOPYOBJ =  cgemm_itcopy$(TSUFFIX).$(SUFFIX) | ||||
| endif | ||||
| CGEMMONCOPY    =  ../generic/zgemm_ncopy_$(CGEMM_UNROLL_N).c | ||||
| CGEMMOTCOPY    =  ../generic/zgemm_tcopy_$(CGEMM_UNROLL_N).c | ||||
| CGEMMONCOPYOBJ =  cgemm_oncopy$(TSUFFIX).$(SUFFIX) | ||||
| CGEMMOTCOPYOBJ =  cgemm_otcopy$(TSUFFIX).$(SUFFIX) | ||||
| 
 | ||||
| # ZGEMM/ZTRMM kernel and copy-routine selection, keyed on ZGEMM_UNROLL_M / ZGEMM_UNROLL_N. | ||||
| # All copy routines come from the generic zgemm_ncopy/zgemm_tcopy sources; the IN/IT variants | ||||
| # and their object names are set only when the two unroll factors differ. | ||||
| ZGEMMKERNEL    =  zgemm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S | ||||
| ZTRMMKERNEL    =  ztrmm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S | ||||
| ifneq ($(ZGEMM_UNROLL_M), $(ZGEMM_UNROLL_N)) | ||||
| ZGEMMINCOPY    =  ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_M).c | ||||
| ZGEMMITCOPY    =  ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_M).c | ||||
| ZGEMMINCOPYOBJ =  zgemm_incopy$(TSUFFIX).$(SUFFIX) | ||||
| ZGEMMITCOPYOBJ =  zgemm_itcopy$(TSUFFIX).$(SUFFIX) | ||||
| endif | ||||
| ZGEMMONCOPY    =  ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c | ||||
| ZGEMMOTCOPY    =  ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c | ||||
| ZGEMMONCOPYOBJ =  zgemm_oncopy$(TSUFFIX).$(SUFFIX) | ||||
| ZGEMMOTCOPYOBJ =  zgemm_otcopy$(TSUFFIX).$(SUFFIX) | ||||
|  | @ -0,0 +1,189 @@ | |||
| SAMINKERNEL  = ../arm/amin.c | ||||
| DAMINKERNEL  = ../arm/amin.c | ||||
| CAMINKERNEL  = ../arm/zamin.c | ||||
| ZAMINKERNEL  = ../arm/zamin.c | ||||
| 
 | ||||
| SMAXKERNEL   = ../arm/max.c | ||||
| DMAXKERNEL   = ../arm/max.c | ||||
| 
 | ||||
| SMINKERNEL   = ../arm/min.c | ||||
| DMINKERNEL   = ../arm/min.c | ||||
| 
 | ||||
| ISAMINKERNEL = ../arm/iamin.c | ||||
| IDAMINKERNEL = ../arm/iamin.c | ||||
| ICAMINKERNEL = ../arm/izamin.c | ||||
| IZAMINKERNEL = ../arm/izamin.c | ||||
| 
 | ||||
| ISMAXKERNEL  = ../arm/imax.c | ||||
| IDMAXKERNEL  = ../arm/imax.c | ||||
| 
 | ||||
| ISMINKERNEL  = ../arm/imin.c | ||||
| IDMINKERNEL  = ../arm/imin.c | ||||
| 
 | ||||
| STRSMKERNEL_LN	=  ../generic/trsm_kernel_LN.c | ||||
| STRSMKERNEL_LT	=  ../generic/trsm_kernel_LT.c | ||||
| STRSMKERNEL_RN	=  ../generic/trsm_kernel_RN.c | ||||
| STRSMKERNEL_RT	=  ../generic/trsm_kernel_RT.c | ||||
| 
 | ||||
| DTRSMKERNEL_LN	= ../generic/trsm_kernel_LN.c | ||||
| DTRSMKERNEL_LT	= ../generic/trsm_kernel_LT.c | ||||
| DTRSMKERNEL_RN	= ../generic/trsm_kernel_RN.c | ||||
| DTRSMKERNEL_RT	= ../generic/trsm_kernel_RT.c | ||||
| 
 | ||||
| CTRSMKERNEL_LN	= ../generic/trsm_kernel_LN.c | ||||
| CTRSMKERNEL_LT	= ../generic/trsm_kernel_LT.c | ||||
| CTRSMKERNEL_RN	= ../generic/trsm_kernel_RN.c | ||||
| CTRSMKERNEL_RT	= ../generic/trsm_kernel_RT.c | ||||
| 
 | ||||
| ZTRSMKERNEL_LN	= ../generic/trsm_kernel_LN.c | ||||
| ZTRSMKERNEL_LT	= ../generic/trsm_kernel_LT.c | ||||
| ZTRSMKERNEL_RN	= ../generic/trsm_kernel_RN.c | ||||
| ZTRSMKERNEL_RT	= ../generic/trsm_kernel_RT.c | ||||
| 
 | ||||
| SAMAXKERNEL  = amax.S | ||||
| DAMAXKERNEL  = amax.S | ||||
| CAMAXKERNEL  = zamax.S | ||||
| ZAMAXKERNEL  = zamax.S | ||||
| 
 | ||||
| SAXPYKERNEL  = axpy.S | ||||
| DAXPYKERNEL  = daxpy_thunderx2t99.S | ||||
| CAXPYKERNEL  = zaxpy.S | ||||
| ZAXPYKERNEL  = zaxpy.S | ||||
| 
 | ||||
| SROTKERNEL   = rot.S | ||||
| DROTKERNEL   = rot.S | ||||
| CROTKERNEL   = zrot.S | ||||
| ZROTKERNEL   = zrot.S | ||||
| 
 | ||||
| SSCALKERNEL  = scal.S | ||||
| DSCALKERNEL  = scal.S | ||||
| CSCALKERNEL  = zscal.S | ||||
| ZSCALKERNEL  = zscal.S | ||||
| 
 | ||||
| SGEMVNKERNEL = gemv_n.S | ||||
| DGEMVNKERNEL = gemv_n.S | ||||
| CGEMVNKERNEL = zgemv_n.S | ||||
| ZGEMVNKERNEL = zgemv_n.S | ||||
| 
 | ||||
| SGEMVTKERNEL = gemv_t.S | ||||
| DGEMVTKERNEL = gemv_t.S | ||||
| CGEMVTKERNEL = zgemv_t.S | ||||
| ZGEMVTKERNEL = zgemv_t.S | ||||
| 
 | ||||
| 
 | ||||
| SASUMKERNEL    = sasum_thunderx2t99.c | ||||
| DASUMKERNEL    = dasum_thunderx2t99.c | ||||
| CASUMKERNEL    = casum_thunderx2t99.c | ||||
| ZASUMKERNEL    = zasum_thunderx2t99.c | ||||
| 
 | ||||
| SCOPYKERNEL    = copy_thunderx2t99.c | ||||
| DCOPYKERNEL    = copy_thunderx2t99.c | ||||
| CCOPYKERNEL    = copy_thunderx2t99.c | ||||
| ZCOPYKERNEL    = copy_thunderx2t99.c | ||||
| 
 | ||||
| SSWAPKERNEL    = swap_thunderx2t99.S | ||||
| DSWAPKERNEL    = swap_thunderx2t99.S | ||||
| CSWAPKERNEL    = swap_thunderx2t99.S | ||||
| ZSWAPKERNEL    = swap_thunderx2t99.S | ||||
| 
 | ||||
| ISAMAXKERNEL   = iamax_thunderx2t99.c | ||||
| IDAMAXKERNEL   = iamax_thunderx2t99.c | ||||
| ICAMAXKERNEL   = izamax_thunderx2t99.c | ||||
| IZAMAXKERNEL   = izamax_thunderx2t99.c | ||||
| 
 | ||||
| SNRM2KERNEL    = scnrm2_thunderx2t99.c | ||||
| DNRM2KERNEL    = dznrm2_thunderx2t99.c | ||||
| CNRM2KERNEL    = scnrm2_thunderx2t99.c | ||||
| ZNRM2KERNEL    = dznrm2_thunderx2t99.c | ||||
| 
 | ||||
| DDOTKERNEL     = dot_thunderx2t99.c | ||||
| SDOTKERNEL     = dot_thunderx2t99.c | ||||
| CDOTKERNEL     = zdot_thunderx2t99.c | ||||
| ZDOTKERNEL     = zdot_thunderx2t99.c | ||||
| DSDOTKERNEL    = dot.S | ||||
| 
 | ||||
| DGEMM_BETA     = dgemm_beta.S | ||||
| SGEMM_BETA     = sgemm_beta.S | ||||
| 
 | ||||
| # SGEMM/STRMM kernel and copy-routine selection, keyed on SGEMM_UNROLL_M / SGEMM_UNROLL_N. | ||||
| # The IN/IT copy sources and their object names are set only when the two unroll factors differ. | ||||
| # An assembly tcopy is used for unroll 16 and an assembly ncopy for unroll 4; | ||||
| # any other unroll value uses ../generic/gemm_tcopy_<unroll>.c or ../generic/gemm_ncopy_<unroll>.c. | ||||
| SGEMMKERNEL    =  sgemm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S | ||||
| STRMMKERNEL    =  strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S | ||||
| ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N)) | ||||
| ifeq ($(SGEMM_UNROLL_M), 16) | ||||
| SGEMMITCOPY    =  sgemm_tcopy_$(SGEMM_UNROLL_M).S | ||||
| else | ||||
| SGEMMITCOPY    =  ../generic/gemm_tcopy_$(SGEMM_UNROLL_M).c | ||||
| endif | ||||
| ifeq ($(SGEMM_UNROLL_M), 4) | ||||
| SGEMMINCOPY    =  sgemm_ncopy_$(SGEMM_UNROLL_M).S | ||||
| else | ||||
| SGEMMINCOPY    =  ../generic/gemm_ncopy_$(SGEMM_UNROLL_M).c | ||||
| endif | ||||
| SGEMMINCOPYOBJ =  sgemm_incopy$(TSUFFIX).$(SUFFIX) | ||||
| SGEMMITCOPYOBJ =  sgemm_itcopy$(TSUFFIX).$(SUFFIX) | ||||
| endif | ||||
| # The ON/OT copy routines are always set, using the same 16/4 rule applied to SGEMM_UNROLL_N. | ||||
| ifeq ($(SGEMM_UNROLL_N), 16) | ||||
| SGEMMOTCOPY    =  sgemm_tcopy_$(SGEMM_UNROLL_N).S | ||||
| else | ||||
| SGEMMOTCOPY    =  ../generic/gemm_tcopy_$(SGEMM_UNROLL_N).c | ||||
| endif | ||||
| ifeq ($(SGEMM_UNROLL_N), 4) | ||||
| SGEMMONCOPY    =  sgemm_ncopy_$(SGEMM_UNROLL_N).S | ||||
| else | ||||
| SGEMMONCOPY    =  ../generic/gemm_ncopy_$(SGEMM_UNROLL_N).c | ||||
| endif | ||||
| SGEMMONCOPYOBJ =  sgemm_oncopy$(TSUFFIX).$(SUFFIX) | ||||
| SGEMMOTCOPYOBJ =  sgemm_otcopy$(TSUFFIX).$(SUFFIX) | ||||
| 
 | ||||
| DGEMMKERNEL    =  dgemm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S | ||||
| DTRMMKERNEL    =  dtrmm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S | ||||
| 
 | ||||
| ifneq ($(DGEMM_UNROLL_M), $(DGEMM_UNROLL_N)) | ||||
| 
 | ||||
| ifeq ($(DGEMM_UNROLL_M), 8) | ||||
| DGEMMINCOPY    =  dgemm_ncopy_$(DGEMM_UNROLL_M).S | ||||
| DGEMMITCOPY    =  dgemm_tcopy_$(DGEMM_UNROLL_M).S | ||||
| else | ||||
| DGEMMINCOPY    =  ../generic/gemm_ncopy_$(DGEMM_UNROLL_M).c | ||||
| DGEMMITCOPY    =  ../generic/gemm_tcopy_$(DGEMM_UNROLL_M).c | ||||
| endif | ||||
| 
 | ||||
| DGEMMINCOPYOBJ =  dgemm_incopy$(TSUFFIX).$(SUFFIX) | ||||
| DGEMMITCOPYOBJ =  dgemm_itcopy$(TSUFFIX).$(SUFFIX) | ||||
| endif | ||||
| 
 | ||||
| ifeq ($(DGEMM_UNROLL_N), 4) | ||||
| DGEMMONCOPY    =  dgemm_ncopy_$(DGEMM_UNROLL_N).S | ||||
| DGEMMOTCOPY    =  dgemm_tcopy_$(DGEMM_UNROLL_N).S | ||||
| else | ||||
| DGEMMONCOPY    =  ../generic/gemm_ncopy_$(DGEMM_UNROLL_N).c | ||||
| DGEMMOTCOPY    =  ../generic/gemm_tcopy_$(DGEMM_UNROLL_N).c | ||||
| endif | ||||
| 
 | ||||
| DGEMMONCOPYOBJ =  dgemm_oncopy$(TSUFFIX).$(SUFFIX) | ||||
| DGEMMOTCOPYOBJ =  dgemm_otcopy$(TSUFFIX).$(SUFFIX) | ||||
| 
 | ||||
| # CGEMM/CTRMM kernel and copy-routine selection, keyed on CGEMM_UNROLL_M / CGEMM_UNROLL_N. | ||||
| # All copy routines come from the generic zgemm_ncopy/zgemm_tcopy sources; the IN/IT variants | ||||
| # and their object names are set only when the two unroll factors differ. | ||||
| CGEMMKERNEL    =  cgemm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S | ||||
| CTRMMKERNEL    =  ctrmm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S | ||||
| ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N)) | ||||
| CGEMMINCOPY    =  ../generic/zgemm_ncopy_$(CGEMM_UNROLL_M).c | ||||
| CGEMMITCOPY    =  ../generic/zgemm_tcopy_$(CGEMM_UNROLL_M).c | ||||
| CGEMMINCOPYOBJ =  cgemm_incopy$(TSUFFIX).$(SUFFIX) | ||||
| CGEMMITCOPYOBJ =  cgemm_itcopy$(TSUFFIX).$(SUFFIX) | ||||
| endif | ||||
| CGEMMONCOPY    =  ../generic/zgemm_ncopy_$(CGEMM_UNROLL_N).c | ||||
| CGEMMOTCOPY    =  ../generic/zgemm_tcopy_$(CGEMM_UNROLL_N).c | ||||
| CGEMMONCOPYOBJ =  cgemm_oncopy$(TSUFFIX).$(SUFFIX) | ||||
| CGEMMOTCOPYOBJ =  cgemm_otcopy$(TSUFFIX).$(SUFFIX) | ||||
| 
 | ||||
| # ZGEMM/ZTRMM kernel and copy-routine selection, keyed on ZGEMM_UNROLL_M / ZGEMM_UNROLL_N. | ||||
| # All copy routines come from the generic zgemm_ncopy/zgemm_tcopy sources; the IN/IT variants | ||||
| # and their object names are set only when the two unroll factors differ. | ||||
| ZGEMMKERNEL    =  zgemm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S | ||||
| ZTRMMKERNEL    =  ztrmm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S | ||||
| ifneq ($(ZGEMM_UNROLL_M), $(ZGEMM_UNROLL_N)) | ||||
| ZGEMMINCOPY    =  ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_M).c | ||||
| ZGEMMITCOPY    =  ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_M).c | ||||
| ZGEMMINCOPYOBJ =  zgemm_incopy$(TSUFFIX).$(SUFFIX) | ||||
| ZGEMMITCOPYOBJ =  zgemm_itcopy$(TSUFFIX).$(SUFFIX) | ||||
| endif | ||||
| ZGEMMONCOPY    =  ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c | ||||
| ZGEMMOTCOPY    =  ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c | ||||
| ZGEMMONCOPYOBJ =  zgemm_oncopy$(TSUFFIX).$(SUFFIX) | ||||
| ZGEMMOTCOPYOBJ =  zgemm_otcopy$(TSUFFIX).$(SUFFIX) | ||||
|  | @ -0,0 +1,874 @@ | |||
| /******************************************************************************* | ||||
| Copyright (c) 2015, The OpenBLAS Project | ||||
| All rights reserved. | ||||
| Redistribution and use in source and binary forms, with or without | ||||
| modification, are permitted provided that the following conditions are | ||||
| met: | ||||
| 1. Redistributions of source code must retain the above copyright | ||||
| notice, this list of conditions and the following disclaimer. | ||||
| 2. Redistributions in binary form must reproduce the above copyright | ||||
| notice, this list of conditions and the following disclaimer in | ||||
| the documentation and/or other materials provided with the | ||||
| distribution. | ||||
| 3. Neither the name of the OpenBLAS project nor the names of | ||||
| its contributors may be used to endorse or promote products | ||||
| derived from this software without specific prior written permission. | ||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | ||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | ||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | ||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | ||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | ||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | ||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 | ||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | ||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | ||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | ||||
| *******************************************************************************/ | ||||
| 
 | ||||
| #define ASSEMBLER | ||||
| #include "common.h" | ||||
| 
 | ||||
| /*                   X0          X1          X2          s0        X3        x4       x5           x6 */ | ||||
| /*int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha0,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc) */ | ||||
| 
 | ||||
| #define origM		x0 | ||||
| #define origN		x1 | ||||
| #define origK		x2 | ||||
| #define origPA		x3 | ||||
| #define origPB		x4 | ||||
| #define pC		x5 | ||||
| #define LDC		x6 | ||||
| #define temp		x7 | ||||
| #define counterL	x8 | ||||
| #define counterI	x9 | ||||
| #define counterJ	x10 | ||||
| #define pB		x11 | ||||
| #define pCRow0		x12 | ||||
| #define pCRow1		x13 | ||||
| #define pCRow2		x14 | ||||
| #define pCRow3		x15 | ||||
| #define pA		x16 | ||||
| #define lanes		x17 | ||||
| 
 | ||||
| #define alphaR		w19 | ||||
| #define alphaI		w20 | ||||
| 
 | ||||
| #define alphaz_R	z6.s | ||||
| #define alphaz_I	z7.s | ||||
| #define alpha0_R	s4 | ||||
| #define alpha0_I	s5 | ||||
| 
 | ||||
| 
 | ||||
| #define A_PRE_SIZE	2560 | ||||
| #define B_PRE_SIZE	448 | ||||
| #define C_PRE_SIZE	128 | ||||
| 
 | ||||
| // Choose, per build-variant macro, whether each OP_rr/OP_ii/OP_ri/OP_ir alias | ||||
| // expands to fmla (multiply-accumulate) or fmls (multiply-subtract). | ||||
| // NOTE(review): the rr/ii/ri/ir suffixes presumably name the real/imaginary | ||||
| // parts of the two operands being multiplied - confirm against the kernel macros. | ||||
| // There is no #else branch: if none of the sixteen variant macros is defined, | ||||
| // the OP_* names are left undefined. | ||||
| #if defined(NN) || defined(NT) || defined(TN) || defined(TT) | ||||
| #define OP_rr		fmla | ||||
| #define OP_ii		fmls | ||||
| #define OP_ri		fmla | ||||
| #define OP_ir		fmla | ||||
| #elif defined(NR) || defined(NC) || defined(TR) || defined(TC) | ||||
| #define OP_rr		fmla | ||||
| #define OP_ii		fmla | ||||
| #define OP_ri		fmls | ||||
| #define OP_ir		fmla | ||||
| #elif defined(RN) || defined(RT) || defined(CN) || defined(CT) | ||||
| #define OP_rr		fmla | ||||
| #define OP_ii		fmla | ||||
| #define OP_ri		fmla | ||||
| #define OP_ir		fmls | ||||
| #elif defined(RR) || defined(RC) || defined(CR) || defined(CC) | ||||
| #define OP_rr		fmla | ||||
| #define OP_ii		fmls | ||||
| #define OP_ri		fmls | ||||
| #define OP_ir		fmls | ||||
| #endif | ||||
| 
 | ||||
| // 00 origM | ||||
| // 01 origN | ||||
| // 02 origK | ||||
| // 03 origPA | ||||
| // 04 origPB | ||||
| // 05 pC | ||||
| // 06 origLDC -> LDC | ||||
| // 07 offset -> temp | ||||
| // 08 counterL | ||||
| // 09 counterI | ||||
| // 10 counterJ | ||||
| // 11 pB | ||||
| // 12 pCRow0 | ||||
| // 13 pCRow1 | ||||
| // 14 pCRow2 | ||||
| // 15 pCRow3 | ||||
| // 16 pA | ||||
| // 17 lanes | ||||
| // 18 must save | ||||
| // 19 must save alphaR | ||||
| // 20 must save alphaI | ||||
| // 21 must save | ||||
| // 22 must save | ||||
| // 23 must save | ||||
| // 24 must save | ||||
| // 25 must save | ||||
| // 26 must save | ||||
| // 27 must save | ||||
| // 28 must save | ||||
| // 29 frame | ||||
| // 30 link | ||||
| // 31 sp | ||||
| 
 | ||||
| //v00 ALPHA_R -> pA00_R, pA01_R | ||||
| //v01 ALPHA_I -> pA00_I, pA01_I | ||||
| //v02 pA02_R, pA03_R | ||||
| //v03 pA02_I, pA03_I | ||||
| //v04 pA10_R, pA11_R | ||||
| //v05 pA10_I, pA11_I | ||||
| //v06 pA12_R, pA13_R | ||||
| //v07 pA12_I, pA13_I | ||||
| //v08 must save pB00_R, pB01_R | ||||
| //v09 must save pB00_I, pB01_I | ||||
| //v10 must save pB02_R, pB03_R OR ALPHA0_R | ||||
| //v11 must save pB02_I, pB03_I OR ALPHA0_I | ||||
| //v12 must save pB10_R, pB11_R | ||||
| //v13 must save pB10_I, pB11_I | ||||
| //v14 must save pB12_R, pB13_R OR ALPHA1_R | ||||
| //v15 must save pB12_I, pB13_I OR ALPHA1_I | ||||
| //v16 pC0R | ||||
| //v17 pC0I | ||||
| //v18 pC1R | ||||
| //v19 pC1I | ||||
| //v20 pC2R | ||||
| //v21 pC2I | ||||
| //v22 pC3R | ||||
| //v23 pC3I | ||||
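| //v24 C values loaded from pCRow0, real parts (SAVE macros) | ||||
| //v25 C values loaded from pCRow0, imag parts (SAVE macros) | ||||
| //v26 C values loaded from pCRow1, real parts | ||||
| //v27 C values loaded from pCRow1, imag parts | ||||
| //v28 C values loaded from pCRow2, real parts | ||||
| //v29 C values loaded from pCRow2, imag parts | ||||
| //v30 C values loaded from pCRow3, real parts | ||||
| //v31 C values loaded from pCRow3, imag parts | ||||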
| 
 | ||||
| /******************************************************************************* | ||||
| * Macro definitions | ||||
| *******************************************************************************/ | ||||
| 
 | ||||
| .macro INITv1x4	// Zero z16-z23, the real/imag accumulator pairs of the four columns of a v1x4 tile
 | ||||
| 	dup		z16.s, #0 | ||||
| 	dup		z17.s, #0 | ||||
| 	dup		z18.s, #0 | ||||
| 	dup		z19.s, #0 | ||||
| 	dup		z20.s, #0 | ||||
| 	dup		z21.s, #0 | ||||
| 	dup		z22.s, #0 | ||||
| 	dup		z23.s, #0 | ||||
.endm | ||||
| 
 | ||||
| .macro KERNELv1x4_I	// First step of the pipelined K loop: one multiply step plus preload of the next A vector and next B values
 | ||||
| 	ld2w	{z0.s, z1.s}, p1/z, [pA] | ||||
| 	add	pA, pA, lanes, lsl #3    // pA += lanes*2*4 | ||||
| 	ld2w	{z2.s, z3.s}, p1/z, [pA] // next one | ||||
| 	add	pA, pA, lanes, lsl #3    // pA += lanes*2*4 | ||||
| // Broadcast the eight B floats of this k step: z8/z9 feed z16/z17, z10/z11 feed z18/z19, and so on
 | ||||
|     ld1rw  z8.s, p0/z,  [pB] | ||||
|     ld1rw  z9.s, p0/z,  [pB, 4] | ||||
|     ld1rw  z10.s, p0/z, [pB, 8] | ||||
|     ld1rw  z11.s, p0/z, [pB, 12] | ||||
|     ld1rw  z12.s, p0/z, [pB, 16] | ||||
|     ld1rw  z13.s, p0/z, [pB, 20] | ||||
|     ld1rw  z14.s, p0/z, [pB, 24] | ||||
|     ld1rw  z15.s, p0/z, [pB, 28] | ||||
| 
 | ||||
|     add pB, pB, 32 | ||||
| // Accumulate z0/z1 times B; each B register is reloaded for the next step right after its last use
 | ||||
| 	fmla	z16.s, p1/m, z0.s, z8.s | ||||
| 	OP_ir	z17.s, p1/m, z1.s, z8.s | ||||
|     ld1rw  z8.s, p0/z,  [pB] | ||||
| #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | ||||
|     defined(RR) || defined(RC) || defined(CR) || defined(CC) | ||||
| 	#eor	z17.16b, z17.16b, z17.16b | ||||
| 	fmls	z17.s, p1/m, z0.s, z9.s | ||||
| #else | ||||
| 	fmla	z17.s, p1/m, z0.s, z9.s | ||||
| #endif | ||||
| 	OP_ii	z16.s, p1/m, z1.s, z9.s | ||||
|     ld1rw  z9.s, p0/z,  [pB, 4] | ||||
| // The #if/#else blocks pick fmls for the same variants in which OP_ri is fmls, and fmla otherwise
 | ||||
| // NOTE(review): the "#eor" lines look like disabled leftovers - confirm the toolchain ignores them
 | ||||
| 	fmla	z18.s, p1/m, z0.s, z10.s | ||||
| 	OP_ir	z19.s, p1/m, z1.s, z10.s | ||||
|     ld1rw  z10.s, p0/z,  [pB, 8] | ||||
| 	OP_ii	z18.s, p1/m, z1.s, z11.s | ||||
| #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | ||||
|     defined(RR) || defined(RC) || defined(CR) || defined(CC) | ||||
| 	#eor	z19.16b, z21.16b, z21.16b | ||||
| 	fmls	z19.s, p1/m, z0.s, z11.s | ||||
| #else | ||||
| 	fmla	z19.s, p1/m, z0.s, z11.s | ||||
| #endif | ||||
|     ld1rw  z11.s, p0/z,  [pB, 12] | ||||
| 
 | ||||
| 
 | ||||
| 	fmla	z20.s, p1/m, z0.s, z12.s | ||||
| 	OP_ir	z21.s, p1/m, z1.s, z12.s | ||||
|     ld1rw  z12.s, p0/z,  [pB, 16] | ||||
| #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | ||||
|     defined(RR) || defined(RC) || defined(CR) || defined(CC) | ||||
| 	#eor	z21.16b, z23.16b, z23.16b | ||||
| 	fmls	z21.s, p1/m, z0.s, z13.s | ||||
| #else | ||||
| 	fmla	z21.s, p1/m, z0.s, z13.s | ||||
| #endif | ||||
| 	OP_ii	z20.s, p1/m, z1.s, z13.s | ||||
|     ld1rw  z13.s, p0/z,  [pB, 20] | ||||
| 
 | ||||
| 
 | ||||
| 	fmla	z22.s, p1/m, z0.s, z14.s | ||||
| 	OP_ir	z23.s, p1/m, z1.s, z14.s | ||||
|     ld1rw  z14.s, p0/z,  [pB, 24] | ||||
| #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | ||||
|     defined(RR) || defined(RC) || defined(CR) || defined(CC) | ||||
| 	#eor	z23.16b, z19.16b, z19.16b | ||||
| 	fmls	z23.s, p1/m, z0.s, z15.s | ||||
| #else | ||||
| 	fmla	z23.s, p1/m, z0.s, z15.s | ||||
| #endif | ||||
| 	OP_ii	z22.s, p1/m, z1.s, z15.s | ||||
|     ld1rw  z15.s, p0/z,  [pB, 28] | ||||
| // Second pB advance: z8-z15 now hold the B values of the following k step
 | ||||
|     add pB, pB, 32 | ||||
| 
 | ||||
| 	prfm	PLDL1KEEP, [pA, #A_PRE_SIZE+64] | ||||
| .endm | ||||
| 
 | ||||
| .macro KERNELv1x4_M1	// Pipelined step: multiply A in z0/z1 by B in z8-z15 while loading the next A vector into z2/z3
 | ||||
| 	ld2w	{z2.s, z3.s}, p1/z, [pA] | ||||
| 	add	pA, pA, lanes, lsl #3	// pA = pA + lanes * 2 * 4 | ||||
| // Each B register is refilled from pB for the next step as soon as both of its uses are issued
 | ||||
| 	OP_rr	z16.s, p1/m, z0.s, z8.s | ||||
| 	OP_ir	z17.s, p1/m, z1.s, z8.s | ||||
|     ld1rw  z8.s, p0/z,  [pB] | ||||
| 	OP_ii	z16.s, p1/m, z1.s, z9.s | ||||
| 	OP_ri	z17.s, p1/m, z0.s, z9.s | ||||
|     ld1rw  z9.s, p0/z,  [pB, 4] | ||||
| 
 | ||||
| 	OP_rr	z18.s, p1/m, z0.s, z10.s | ||||
| 	OP_ir	z19.s, p1/m, z1.s, z10.s | ||||
|     ld1rw  z10.s, p0/z,  [pB, 8] | ||||
| 	OP_ii	z18.s, p1/m, z1.s, z11.s | ||||
| 	OP_ri	z19.s, p1/m, z0.s, z11.s | ||||
|     ld1rw  z11.s, p0/z,  [pB, 12] | ||||
| 
 | ||||
| 	OP_rr	z20.s, p1/m, z0.s, z12.s | ||||
| 	OP_ir	z21.s, p1/m, z1.s, z12.s | ||||
|     ld1rw  z12.s, p0/z,  [pB, 16] | ||||
| 	OP_ii	z20.s, p1/m, z1.s, z13.s | ||||
| 	OP_ri	z21.s, p1/m, z0.s, z13.s | ||||
|     ld1rw  z13.s, p0/z,  [pB, 20] | ||||
| 
 | ||||
| 	OP_rr	z22.s, p1/m, z0.s, z14.s | ||||
| 	OP_ir	z23.s, p1/m, z1.s, z14.s | ||||
|     ld1rw  z14.s, p0/z,  [pB, 24] | ||||
| 	OP_ii	z22.s, p1/m, z1.s, z15.s | ||||
| 	OP_ri	z23.s, p1/m, z0.s, z15.s | ||||
|     ld1rw  z15.s, p0/z,  [pB, 28] | ||||
| // Step over the eight B floats just loaded, then prefetch ahead in A
 | ||||
|     add pB, pB, 32 | ||||
| 	prfm	PLDL1KEEP, [pA, #A_PRE_SIZE] | ||||
| 
 | ||||
| 	prfm	PLDL1KEEP, [pA, #A_PRE_SIZE+64] | ||||
| .endm | ||||
| 
 | ||||
| .macro KERNELv1x4_M2	// Pipelined step: multiply A in z2/z3 by B in z8-z15 while loading the next A vector into z0/z1
 | ||||
| 	ld2w	{z0.s, z1.s}, p1/z, [pA] | ||||
| 	add	pA, pA, lanes, lsl #3	// pA = pA + lanes *2 * 4 | ||||
| // Same schedule as KERNELv1x4_M1 with the two A register pairs swapped
 | ||||
| 	OP_rr	z16.s, p1/m, z2.s, z8.s | ||||
| 	OP_ir	z17.s, p1/m, z3.s, z8.s | ||||
|     ld1rw  z8.s, p0/z,  [pB] | ||||
| 	OP_ii	z16.s, p1/m, z3.s, z9.s | ||||
| 	OP_ri	z17.s, p1/m, z2.s, z9.s | ||||
|     ld1rw  z9.s, p0/z,  [pB, 4] | ||||
| 
 | ||||
| 	OP_rr	z18.s, p1/m, z2.s, z10.s | ||||
| 	OP_ir	z19.s, p1/m, z3.s, z10.s | ||||
|     ld1rw  z10.s, p0/z,  [pB, 8] | ||||
| 	OP_ii	z18.s, p1/m, z3.s, z11.s | ||||
| 	OP_ri	z19.s, p1/m, z2.s, z11.s | ||||
|     ld1rw  z11.s, p0/z,  [pB, 12] | ||||
| 
 | ||||
| 	OP_rr	z20.s, p1/m, z2.s, z12.s | ||||
| 	OP_ir	z21.s, p1/m, z3.s, z12.s | ||||
|     ld1rw  z12.s, p0/z,  [pB, 16] | ||||
| 	OP_ii	z20.s, p1/m, z3.s, z13.s | ||||
| 	OP_ri	z21.s, p1/m, z2.s, z13.s | ||||
|     ld1rw  z13.s, p0/z,  [pB, 20] | ||||
| 
 | ||||
| 	OP_rr	z22.s, p1/m, z2.s, z14.s | ||||
| 	OP_ir	z23.s, p1/m, z3.s, z14.s | ||||
|     ld1rw  z14.s, p0/z,  [pB, 24] | ||||
| 	OP_ii	z22.s, p1/m, z3.s, z15.s | ||||
| 	OP_ri	z23.s, p1/m, z2.s, z15.s | ||||
|     ld1rw  z15.s, p0/z,  [pB, 28] | ||||
| // This variant prefetches ahead in B (before and after the pB advance) instead of A
 | ||||
| 	prfm	PLDL1KEEP, [pB, #B_PRE_SIZE] | ||||
| 
 | ||||
|     add pB, pB, 32 | ||||
| 
 | ||||
| 	prfm	PLDL1KEEP, [pB, #B_PRE_SIZE+64] | ||||
| .endm | ||||
| 
 | ||||
| .macro KERNELv1x4_E	// Pipeline drain: multiply the already loaded A (z2/z3) by B (z8-z15); loads nothing and leaves pA/pB alone
 | ||||
| 	OP_rr	z16.s, p1/m, z2.s, z8.s | ||||
| 	OP_ir	z17.s, p1/m, z3.s, z8.s | ||||
| 	OP_ii	z16.s, p1/m, z3.s, z9.s | ||||
| 	OP_ri	z17.s, p1/m, z2.s, z9.s | ||||
| 
 | ||||
| 	OP_rr	z18.s, p1/m, z2.s, z10.s | ||||
| 	OP_ir	z19.s, p1/m, z3.s, z10.s | ||||
| 	OP_ii	z18.s, p1/m, z3.s, z11.s | ||||
| 	OP_ri	z19.s, p1/m, z2.s, z11.s | ||||
| 
 | ||||
| 	OP_rr	z20.s, p1/m, z2.s, z12.s | ||||
| 	OP_ir	z21.s, p1/m, z3.s, z12.s | ||||
| 	OP_ii	z20.s, p1/m, z3.s, z13.s | ||||
| 	OP_ri	z21.s, p1/m, z2.s, z13.s | ||||
| 
 | ||||
| 	OP_rr	z22.s, p1/m, z2.s, z14.s | ||||
| 	OP_ir	z23.s, p1/m, z3.s, z14.s | ||||
| 	OP_ii	z22.s, p1/m, z3.s, z15.s | ||||
| 	OP_ri	z23.s, p1/m, z2.s, z15.s | ||||
| 
 | ||||
| 	prfm	PLDL1KEEP, [pB, #B_PRE_SIZE] | ||||
| 
 | ||||
| 	prfm	PLDL1KEEP, [pB, #B_PRE_SIZE+64] | ||||
| 
 | ||||
| .endm | ||||
| 
 | ||||
| .macro KERNELv1x4_SUB	// Self-contained single k step (no pipelining): loads its own A vector and eight B floats
 | ||||
| 	ld2w	{z0.s, z1.s}, p1/z, [pA] | ||||
| 	add	pA, pA, lanes, lsl #3	// pA = pA + lanes* 2 * 4 | ||||
| // B values for the first two accumulator pairs
 | ||||
|     ld1rw  z8.s, p0/z,  [pB] | ||||
|     ld1rw  z9.s, p0/z,  [pB, 4] | ||||
|     ld1rw  z10.s, p0/z,  [pB, 8] | ||||
|     ld1rw  z11.s, p0/z,  [pB, 12] | ||||
| 
 | ||||
| 	OP_rr	z16.s, p1/m, z0.s, z8.s | ||||
| 	OP_ir	z17.s, p1/m, z1.s, z8.s | ||||
| 	OP_ii	z16.s, p1/m, z1.s, z9.s | ||||
| 	OP_ri	z17.s, p1/m, z0.s, z9.s | ||||
| // B values for the last two accumulator pairs
 | ||||
|     ld1rw  z12.s, p0/z,  [pB, 16] | ||||
|     ld1rw  z13.s, p0/z,  [pB, 20] | ||||
|     ld1rw  z14.s, p0/z,  [pB, 24] | ||||
|     ld1rw  z15.s, p0/z,  [pB, 28] | ||||
| 
 | ||||
| 	OP_rr	z18.s, p1/m, z0.s, z10.s | ||||
| 	OP_ir	z19.s, p1/m, z1.s, z10.s | ||||
| 	OP_ii	z18.s, p1/m, z1.s, z11.s | ||||
| 	OP_ri	z19.s, p1/m, z0.s, z11.s | ||||
| 
 | ||||
|     add pB, pB, 32 | ||||
| 
 | ||||
| 	OP_rr	z20.s, p1/m, z0.s, z12.s | ||||
| 	OP_ir	z21.s, p1/m, z1.s, z12.s | ||||
| 	OP_ii	z20.s, p1/m, z1.s, z13.s | ||||
| 	OP_ri	z21.s, p1/m, z0.s, z13.s | ||||
| 
 | ||||
| 	OP_rr	z22.s, p1/m, z0.s, z14.s | ||||
| 	OP_ir	z23.s, p1/m, z1.s, z14.s | ||||
| 	OP_ii	z22.s, p1/m, z1.s, z15.s | ||||
| 	OP_ri	z23.s, p1/m, z0.s, z15.s | ||||
| 
 | ||||
| 	prfm	PLDL1KEEP, [pB, #B_PRE_SIZE] | ||||
| 	prfm	PLDL1KEEP, [pA, #A_PRE_SIZE] | ||||
| .endm | ||||
| 
 | ||||
| .macro SAVEv1x4	// C += alpha * accumulator for the four columns at pCRow0..pCRow3, then advance each pointer by lanes * 8 bytes
 | ||||
| 	prfm	PLDL2KEEP, [pCRow0, #C_PRE_SIZE] | ||||
| // Per column: real += accR*alphaR - accI*alphaI and imag += accR*alphaI + accI*alphaR
 | ||||
| 	ld2w	{z24.s, z25.s}, p1/z, [pCRow0] | ||||
| 	fmla	z24.s, p1/m, z16.s, alphaz_R | ||||
| 	fmls	z24.s, p1/m, z17.s, alphaz_I | ||||
| 	fmla	z25.s, p1/m, z16.s, alphaz_I | ||||
| 	fmla	z25.s, p1/m, z17.s, alphaz_R | ||||
| 	st2w 	{z24.s, z25.s}, p1, [pCRow0] | ||||
| 
 | ||||
| 	add	pCRow0, pCRow0, lanes, lsl #3 | ||||
| 
 | ||||
| 	ld2w	{z26.s, z27.s}, p1/z, [pCRow1] | ||||
| 	fmla	z26.s, p1/m, z18.s, alphaz_R | ||||
| 	fmls	z26.s, p1/m, z19.s, alphaz_I | ||||
| 	fmla	z27.s, p1/m, z18.s, alphaz_I | ||||
| 	fmla	z27.s, p1/m, z19.s, alphaz_R | ||||
| 	st2w 	{z26.s, z27.s}, p1, [pCRow1] | ||||
| 
 | ||||
| 	add	pCRow1, pCRow1, lanes, lsl #3 | ||||
| 	prfm	PLDL2KEEP, [pCRow1, #C_PRE_SIZE] | ||||
| 
 | ||||
| 	ld2w	{z28.s, z29.s}, p1/z, [pCRow2] | ||||
| 	fmla	z28.s, p1/m, z20.s, alphaz_R | ||||
| 	fmls	z28.s, p1/m, z21.s, alphaz_I | ||||
| 	fmla	z29.s, p1/m, z20.s, alphaz_I | ||||
| 	fmla	z29.s, p1/m, z21.s, alphaz_R | ||||
| 	st2w 	{z28.s, z29.s}, p1, [pCRow2] | ||||
| 
 | ||||
| 	add	pCRow2, pCRow2, lanes, lsl #3 | ||||
| 
 | ||||
| 	ld2w	{z30.s, z31.s}, p1/z, [pCRow3] | ||||
| 	fmla	z30.s, p1/m, z22.s, alphaz_R | ||||
| 	fmls	z30.s, p1/m, z23.s, alphaz_I | ||||
| 	fmla	z31.s, p1/m, z22.s, alphaz_I | ||||
| 	fmla	z31.s, p1/m, z23.s, alphaz_R | ||||
| 	st2w 	{z30.s, z31.s}, p1, [pCRow3] | ||||
| 
 | ||||
| 	prfm	PLDL2KEEP, [pCRow3, #C_PRE_SIZE] | ||||
| 
 | ||||
| 	add	pCRow3, pCRow3, lanes, lsl #3	// pC = pC + lanes  * 2 *4 | ||||
| 
 | ||||
| 	prfm	PLDL2KEEP, [pCRow3, #C_PRE_SIZE] | ||||
| 
 | ||||
| .endm | ||||
| 
 | ||||
| /******************************************************************************/ | ||||
| 
 | ||||
| 
 | ||||
| .macro INITv1x2	// Zero z16-z19, the real/imag accumulator pairs of the two columns of a v1x2 tile
 | ||||
| 	dup		z16.s, #0 | ||||
| 	dup		z17.s, #0 | ||||
| 	dup		z18.s, #0 | ||||
| 	dup		z19.s, #0 | ||||
| .endm | ||||
| 
 | ||||
| .macro KERNELv1x2_SUB	// One k step of the v1x2 tile: load one A vector and four B floats, accumulate into z16-z19
 | ||||
| 	ld2w	{z0.s, z1.s}, p1/z, [pA] | ||||
| 	add	pA, pA, lanes, lsl #3	// pA = pA + lanes* 2 * 4 | ||||
| // z8/z9 feed z16/z17 and z10/z11 feed z18/z19
 | ||||
|     ld1rw  z8.s, p0/z,  [pB] | ||||
|     ld1rw  z9.s, p0/z,  [pB, 4] | ||||
|     ld1rw  z10.s, p0/z,  [pB, 8] | ||||
|     ld1rw  z11.s, p0/z,  [pB, 12] | ||||
| 
 | ||||
| 	OP_rr	z16.s, p1/m, z0.s, z8.s | ||||
| 	OP_ir	z17.s, p1/m, z1.s, z8.s | ||||
| 	OP_ii	z16.s, p1/m, z1.s, z9.s | ||||
| 	OP_ri	z17.s, p1/m, z0.s, z9.s | ||||
| 
 | ||||
| 	OP_rr	z18.s, p1/m, z0.s, z10.s | ||||
| 	OP_ir	z19.s, p1/m, z1.s, z10.s | ||||
| 	OP_ii	z18.s, p1/m, z1.s, z11.s | ||||
| 	OP_ri	z19.s, p1/m, z0.s, z11.s | ||||
| // Step over the four B floats consumed above
 | ||||
|     add pB, pB, 16 | ||||
| .endm | ||||
| 
 | ||||
| .macro SAVEv1x2
 | ||||
| 	// C += alpha * accumulator for the two columns at pCRow0 and pCRow1; | ||||
| 	// both pointers are then advanced by lanes * 8 bytes. | ||||
| 	prfm	PLDL2KEEP, [pCRow0, #C_PRE_SIZE] | ||||
| 
 | ||||
| 	// Column at pCRow0: real += accR*alphaR - accI*alphaI, imag += accR*alphaI + accI*alphaR | ||||
| 	ld2w	{z24.s, z25.s}, p1/z, [pCRow0] | ||||
| 	fmla	z24.s, p1/m, z16.s, alphaz_R | ||||
| 	fmls	z24.s, p1/m, z17.s, alphaz_I | ||||
| 	fmla	z25.s, p1/m, z16.s, alphaz_I | ||||
| 	fmla	z25.s, p1/m, z17.s, alphaz_R | ||||
| 	st2w 	{z24.s, z25.s}, p1, [pCRow0] | ||||
| 
 | ||||
| 	add	pCRow0, pCRow0, lanes, lsl #3 | ||||
| 
 | ||||
| 	// Column at pCRow1, same update using accumulators z18/z19 | ||||
| 	ld2w	{z26.s, z27.s}, p1/z, [pCRow1] | ||||
| 	fmla	z26.s, p1/m, z18.s, alphaz_R | ||||
| 	fmls	z26.s, p1/m, z19.s, alphaz_I | ||||
| 	fmla	z27.s, p1/m, z18.s, alphaz_I | ||||
| 	fmla	z27.s, p1/m, z19.s, alphaz_R | ||||
| 	st2w 	{z26.s, z27.s}, p1, [pCRow1] | ||||
| 
 | ||||
| 	add	pCRow1, pCRow1, lanes, lsl #3 | ||||
| 	prfm	PLDL2KEEP, [pCRow1, #C_PRE_SIZE] | ||||
| 
 | ||||
| 	// Prefetch ahead of the advanced pCRow0. The previous target, pCRow2, is | ||||
| 	// never assigned on the two-column path, so that hint used a stale address. | ||||
| 	prfm	PLDL2KEEP, [pCRow0, #C_PRE_SIZE] | ||||
| 
 | ||||
| .endm | ||||
| 
 | ||||
| /******************************************************************************/ | ||||
| 
 | ||||
| 
 | ||||
| .macro INITv1x1	// Zero z16/z17, the real/imag accumulators of the single column of a v1x1 tile
 | ||||
| 	dup		z16.s, #0 | ||||
| 	dup		z17.s, #0 | ||||
| .endm | ||||
| 
 | ||||
| 
 | ||||
| .macro KERNELv1x1_SUB	// One k step of the v1x1 tile: load one A vector and two B floats, accumulate into z16/z17
 | ||||
| 	ld2w	{z0.s, z1.s}, p1/z, [pA] | ||||
| 	add	pA, pA, lanes, lsl #3	// pA = pA + lanes* 2 * 4 | ||||
| // z8 and z9 are broadcasts of the two consecutive floats at pB
 | ||||
|     ld1rw  z8.s, p0/z,  [pB] | ||||
|     ld1rw  z9.s, p0/z,  [pB, 4] | ||||
| 
 | ||||
|     add pB, pB, 8 | ||||
| 
 | ||||
| 	OP_rr	z16.s, p1/m, z0.s, z8.s | ||||
| 	OP_ir	z17.s, p1/m, z1.s, z8.s | ||||
| 	OP_ii	z16.s, p1/m, z1.s, z9.s | ||||
| 	OP_ri	z17.s, p1/m, z0.s, z9.s | ||||
| .endm | ||||
| 
 | ||||
| .macro SAVEv1x1
 | ||||
| 	// C += alpha * accumulator for the single column at pCRow0; | ||||
| 	// pCRow0 is then advanced by lanes * 8 bytes. | ||||
| 	prfm	PLDL2KEEP, [pCRow0, #C_PRE_SIZE] | ||||
| 
 | ||||
| 	// real += accR*alphaR - accI*alphaI, imag += accR*alphaI + accI*alphaR | ||||
| 	ld2w	{z24.s, z25.s}, p1/z, [pCRow0] | ||||
| 	fmla	z24.s, p1/m, z16.s, alphaz_R | ||||
| 	fmls	z24.s, p1/m, z17.s, alphaz_I | ||||
| 	fmla	z25.s, p1/m, z16.s, alphaz_I | ||||
| 	fmla	z25.s, p1/m, z17.s, alphaz_R | ||||
| 	st2w 	{z24.s, z25.s}, p1, [pCRow0] | ||||
| 
 | ||||
| 	add	pCRow0, pCRow0, lanes, lsl #3	// pC = pC + lanes  * 2 *4 | ||||
| 
 | ||||
| 	// Prefetch ahead of the advanced pCRow0. The previous target, pCRow3, is | ||||
| 	// never assigned on the one-column path, so that hint used a stale address. | ||||
| 	prfm	PLDL2KEEP, [pCRow0, #C_PRE_SIZE] | ||||
| 
 | ||||
| .endm | ||||
| 
 | ||||
| /******************************************************************************/ | ||||
| 
 | ||||
| /******************************************************************************* | ||||
| * End of macro definitions | ||||
| *******************************************************************************/ | ||||
| 
 | ||||
| 	PROLOGUE | ||||
| // Kernel entry: save registers, broadcast alpha, then walk N in panels of 4, 2 and 1 columns; always returns 0 in x0
 | ||||
| 	.align 5
 | ||||
| 	add	sp, sp, #-(11 * 16) | ||||
| 	stp	d8, d9, [sp, #(0 * 16)] | ||||
| 	stp	d10, d11, [sp, #(1 * 16)] | ||||
| 	stp	d12, d13, [sp, #(2 * 16)] | ||||
| 	stp	d14, d15, [sp, #(3 * 16)] | ||||
| 	stp	d16, d17, [sp, #(4 * 16)] | ||||
| 	stp	x18, x19, [sp, #(5 * 16)] | ||||
| 	stp	x20, x21, [sp, #(6 * 16)] | ||||
| 	stp	x22, x23, [sp, #(7 * 16)] | ||||
| 	stp	x24, x25, [sp, #(8 * 16)] | ||||
| 	stp	x26, x27, [sp, #(9 * 16)] | ||||
| 	str	x28, [sp, #(10 * 16)] | ||||
| 
 | ||||
| 	prfm	PLDL1KEEP, [origPB] | ||||
| 	prfm	PLDL1KEEP, [origPA] | ||||
| // alpha arrives in s0 (real) and s1 (imaginary); broadcast each part to every vector lane
 | ||||
| 	fmov	alphaR, s0 | ||||
| 	dup	    alphaz_R, alphaR | ||||
| 	fmov	alphaI, s1 | ||||
| 	dup	    alphaz_I, alphaI | ||||
| 
 | ||||
| 	lsl	LDC, LDC, #3			// ldc = ldc * 2 * 4 | ||||
|     ptrue p0.s                  // create true predicate  | ||||
| 
 | ||||
| 	mov	pB, origPB | ||||
| 
 | ||||
| // Loop over N | ||||
| 	mov	counterJ, origN | ||||
| 	asr 	counterJ, counterJ, #2		// J = J / 4 | ||||
| 	cmp 	counterJ, #0 | ||||
| 	ble	.Lcgemm_kernel_L2_BEGIN | ||||
| 
 | ||||
| /******************************************************************************/ | ||||
| .Lcgemm_kernel_L4_BEGIN: | ||||
| 	mov	pCRow0, pC | ||||
| 	add	pCRow1, pCRow0, LDC | ||||
| 	add	pCRow2, pCRow1, LDC | ||||
| 	add	pCRow3, pCRow2, LDC | ||||
| 
 | ||||
| 	add	pC, pCRow3, LDC | ||||
| 
 | ||||
| 	mov	pA, origPA			// pA = start of A array | ||||
| 
 | ||||
| .Lcgemm_kernel_L4_Mv1_BEGIN: | ||||
| 
 | ||||
| /* Loop over M is done in an SVE fashion. This has the benefit of the last M%SVE_LEN iterations being done in a single sweep */ | ||||
|     mov counterI, #0 | ||||
|     whilelt p1.s, counterI, origM    | ||||
|     cntp lanes, p0, p1.s                        // lanes contain number of active SVE lanes in M dimension | ||||
| 
 | ||||
| 	.align 5
 | ||||
| .Lcgemm_kernel_L4_Mv1_20: | ||||
| 
 | ||||
| 	mov	pB, origPB | ||||
|     INITv1x4                     // fill with zeros | ||||
| // counterL = K / 8; every unrolled pass below covers 8 k steps and the pipelined path needs at least two passes
 | ||||
| 	asr 	counterL , origK, #3 | ||||
| 	cmp	counterL , #2 | ||||
| 	blt	.Lcgemm_kernel_L4_Mv1_32 | ||||
| 
 | ||||
| 	KERNELv1x4_I | ||||
| 	KERNELv1x4_M2 | ||||
| 	KERNELv1x4_M1 | ||||
| 	KERNELv1x4_M2 | ||||
| 	KERNELv1x4_M1 | ||||
| 	KERNELv1x4_M2 | ||||
| 	KERNELv1x4_M1 | ||||
| 	KERNELv1x4_M2 | ||||
| 
 | ||||
| 	subs	counterL, counterL, #2		// subtract 2 | ||||
| 	ble	.Lcgemm_kernel_L4_Mv1_22a | ||||
| 
 | ||||
| 	.align 5
 | ||||
| .Lcgemm_kernel_L4_Mv1_22: | ||||
| // Steady state: one pass of 8 pipelined k steps per iteration
 | ||||
| 	KERNELv1x4_M1 | ||||
| 	KERNELv1x4_M2 | ||||
| 	KERNELv1x4_M1 | ||||
| 	KERNELv1x4_M2 | ||||
| 	KERNELv1x4_M1 | ||||
| 	KERNELv1x4_M2 | ||||
| 	KERNELv1x4_M1 | ||||
| 	KERNELv1x4_M2 | ||||
| 
 | ||||
| 	subs	counterL, counterL, #1 | ||||
| 	bgt	.Lcgemm_kernel_L4_Mv1_22 | ||||
| 
 | ||||
| 	.align 5
 | ||||
| .Lcgemm_kernel_L4_Mv1_22a: | ||||
| // Final pass: ends with KERNELv1x4_E so nothing is loaded past the last k step of the unrolled part
 | ||||
| 	KERNELv1x4_M1 | ||||
| 	KERNELv1x4_M2 | ||||
| 	KERNELv1x4_M1 | ||||
| 	KERNELv1x4_M2 | ||||
| 	KERNELv1x4_M1 | ||||
| 	KERNELv1x4_M2 | ||||
| 	KERNELv1x4_M1 | ||||
| 	KERNELv1x4_E | ||||
| 
 | ||||
| 	b	 .Lcgemm_kernel_L4_Mv1_44 | ||||
| 
 | ||||
| 	.align 5
 | ||||
| .Lcgemm_kernel_L4_Mv1_32: | ||||
| // counterL < 2 here: run one 8-step pass when its low bit is set, otherwise go straight to the K % 8 tail
 | ||||
| 	tst	counterL, #1 | ||||
| 	ble	.Lcgemm_kernel_L4_Mv1_40 | ||||
| 
 | ||||
| 	KERNELv1x4_I | ||||
| 	KERNELv1x4_M2 | ||||
| 	KERNELv1x4_M1 | ||||
| 	KERNELv1x4_M2 | ||||
| 	KERNELv1x4_M1 | ||||
| 	KERNELv1x4_M2 | ||||
| 	KERNELv1x4_M1 | ||||
| 	KERNELv1x4_E | ||||
| 
 | ||||
| 	b	.Lcgemm_kernel_L4_Mv1_44 | ||||
| 
 | ||||
| 
 | ||||
| .Lcgemm_kernel_L4_Mv1_40: | ||||
| // NOTE(review): the accumulators were already zeroed at .Lcgemm_kernel_L4_Mv1_20, so this INIT looks redundant
 | ||||
| 	INITv1x4 | ||||
| 
 | ||||
| .Lcgemm_kernel_L4_Mv1_44: | ||||
| // Tail: the remaining K % 8 steps, one KERNELv1x4_SUB each
 | ||||
| 	ands	counterL , origK, #7 | ||||
| 	ble	.Lcgemm_kernel_L4_Mv1_100 | ||||
| 
 | ||||
| 	.align 5
 | ||||
| .Lcgemm_kernel_L4_Mv1_46: | ||||
| 	KERNELv1x4_SUB | ||||
| 
 | ||||
| 	subs	counterL, counterL, #1 | ||||
| 	bne	.Lcgemm_kernel_L4_Mv1_46 | ||||
| 
 | ||||
| .Lcgemm_kernel_L4_Mv1_100: | ||||
| 	prfm	PLDL1KEEP, [pA] | ||||
| 	prfm	PLDL1KEEP, [pA, #64] | ||||
| 	prfm	PLDL1KEEP, [origPB] | ||||
| 
 | ||||
| 	SAVEv1x4 | ||||
| 
 | ||||
| .Lcgemm_kernel_L4_Mv1_END: | ||||
| // Next group of rows: bump counterI by the vector length in words and loop while any lane is still active
 | ||||
|     incw    counterI | ||||
|     whilelt p1.s, counterI, origM             //SVE instruction | ||||
|     cntp lanes, p0, p1.s                        // lanes contain number of active SVE lanes in M dimension | ||||
|     b.any   .Lcgemm_kernel_L4_Mv1_20    | ||||
| 
 | ||||
| 
 | ||||
| 
 | ||||
| .Lcgemm_kernel_L4_END: | ||||
| 
 | ||||
| 	lsl	temp, origK, #5 | ||||
| 	add	origPB, origPB, temp		// B = B + K * 4 * 4 * 2 | ||||
| 
 | ||||
| 	subs	counterJ, counterJ , #1		// j-- | ||||
| 	bgt	.Lcgemm_kernel_L4_BEGIN | ||||
| 
 | ||||
| 
 | ||||
| /******************************************************************************/ | ||||
| 
 | ||||
| .Lcgemm_kernel_L2_BEGIN:   // fewer than 4 columns left in N direction | ||||
| 
 | ||||
| 	mov	counterJ , origN | ||||
| 	tst	counterJ , #3 | ||||
| 	ble	.Lcgemm_kernel_L999 | ||||
| // Handle a pair of columns when bit 1 of N is set
 | ||||
| 	tst	counterJ , #2 | ||||
| 	ble	.Lcgemm_kernel_L1_BEGIN | ||||
| 
 | ||||
| 	mov	pCRow0, pC			// pCRow0 = pC | ||||
| 	add	pCRow1, pCRow0, LDC | ||||
| 
 | ||||
| 	add	pC,pC,LDC, lsl #1 | ||||
| 
 | ||||
| 	mov	pA, origPA			// pA = A | ||||
| 
 | ||||
| 
 | ||||
| 
 | ||||
| .Lcgemm_kernel_L2_Mv1_BEGIN: | ||||
| 
 | ||||
|     mov counterI, #0 | ||||
|     whilelt p1.s, counterI, origM               //SVE instruction | ||||
|     cntp lanes, p0, p1.s | ||||
| 
 | ||||
| 
 | ||||
| .Lcgemm_kernel_L2_Mv1_20: | ||||
| 
 | ||||
| 	INITv1x2 | ||||
| 
 | ||||
| 	mov	pB, origPB | ||||
| 	asr	counterL , origK, #3		// counterL = counterL / 8 | ||||
| 	cmp	counterL,#0 | ||||
| 	ble	.Lcgemm_kernel_L2_Mv1_40 | ||||
| 	.align 5
 | ||||
| 
 | ||||
| .Lcgemm_kernel_L2_Mv1_22: | ||||
| 	KERNELv1x2_SUB | ||||
| 	KERNELv1x2_SUB | ||||
| 	KERNELv1x2_SUB | ||||
| 	KERNELv1x2_SUB | ||||
| 
 | ||||
| 	KERNELv1x2_SUB | ||||
| 	KERNELv1x2_SUB | ||||
| 	KERNELv1x2_SUB | ||||
| 	KERNELv1x2_SUB | ||||
| 
 | ||||
| 	subs	counterL, counterL, #1 | ||||
| 	bgt	.Lcgemm_kernel_L2_Mv1_22 | ||||
| 
 | ||||
| 
 | ||||
| .Lcgemm_kernel_L2_Mv1_40: | ||||
| 
 | ||||
| 	ands	counterL , origK, #7		// counterL = counterL % 8 | ||||
| 	ble	.Lcgemm_kernel_L2_Mv1_100 | ||||
| 
 | ||||
| .Lcgemm_kernel_L2_Mv1_42: | ||||
| 
 | ||||
| 	KERNELv1x2_SUB | ||||
| 
 | ||||
| 	subs	counterL, counterL, #1 | ||||
| 	bgt	.Lcgemm_kernel_L2_Mv1_42 | ||||
| 
 | ||||
| .Lcgemm_kernel_L2_Mv1_100: | ||||
| 
 | ||||
| 	SAVEv1x2 | ||||
| 
 | ||||
| .Lcgemm_kernel_L2_Mv1_END: | ||||
| 
 | ||||
| 
 | ||||
|     incw    counterI | ||||
|     whilelt p1.s, counterI, origM             //SVE instruction | ||||
|     cntp lanes, p0, p1.s | ||||
|     b.any   .Lcgemm_kernel_L2_Mv1_20    | ||||
| 
 | ||||
| 
 | ||||
| .Lcgemm_kernel_L2_END: | ||||
| 	lsl	temp, origK, #4 | ||||
| 	add	origPB, origPB, temp // B = B + K * 2 * 4 * 2 | ||||
| 
 | ||||
| /******************************************************************************/ | ||||
| 
 | ||||
| .Lcgemm_kernel_L1_BEGIN: | ||||
| // Handle the last single column when N is odd
 | ||||
| 	mov	counterJ , origN | ||||
| 	tst	counterJ , #1 | ||||
| 	ble	.Lcgemm_kernel_L999 // done | ||||
| 
 | ||||
| 
 | ||||
| 	mov	pCRow0, pC			// pCRow0 = C | ||||
| 	add	pC , pC , LDC			// Update pC to point to next | ||||
| 
 | ||||
| 	mov	pA, origPA			// pA = A | ||||
| 
 | ||||
| .Lcgemm_kernel_L1_Mv1_BEGIN: | ||||
| 
 | ||||
|     mov counterI, #0 | ||||
|     whilelt p1.s, counterI, origM               //SVE instruction | ||||
|     cntp lanes, p0, p1.s | ||||
| 
 | ||||
| 
 | ||||
| .Lcgemm_kernel_L1_Mv1_20: | ||||
| 
 | ||||
| 	INITv1x1 | ||||
| 
 | ||||
| 	mov	pB, origPB | ||||
| 	asr	counterL , origK, #3		// counterL = counterL / 8 | ||||
| 	cmp	counterL , #0 | ||||
| 	ble	.Lcgemm_kernel_L1_Mv1_40 | ||||
| 	.align 5
 | ||||
| 
 | ||||
| .Lcgemm_kernel_L1_Mv1_22: | ||||
| 	KERNELv1x1_SUB | ||||
| 	KERNELv1x1_SUB | ||||
| 	KERNELv1x1_SUB | ||||
| 	KERNELv1x1_SUB | ||||
| 
 | ||||
| 	KERNELv1x1_SUB | ||||
| 	KERNELv1x1_SUB | ||||
| 	KERNELv1x1_SUB | ||||
| 	KERNELv1x1_SUB | ||||
| 
 | ||||
| 	subs	counterL, counterL, #1 | ||||
| 	bgt	.Lcgemm_kernel_L1_Mv1_22 | ||||
| 
 | ||||
| 
 | ||||
| .Lcgemm_kernel_L1_Mv1_40: | ||||
| 
 | ||||
| 	ands	counterL , origK, #7		// counterL = counterL % 8 | ||||
| 	ble	.Lcgemm_kernel_L1_Mv1_100 | ||||
| 
 | ||||
| .Lcgemm_kernel_L1_Mv1_42: | ||||
| 
 | ||||
| 	KERNELv1x1_SUB | ||||
| 
 | ||||
| 	subs	counterL, counterL, #1 | ||||
| 	bgt	.Lcgemm_kernel_L1_Mv1_42 | ||||
| 
 | ||||
| .Lcgemm_kernel_L1_Mv1_100: | ||||
| 
 | ||||
| 	SAVEv1x1 | ||||
| 
 | ||||
| .Lcgemm_kernel_L1_Mv1_END: | ||||
| 
 | ||||
|     incw    counterI | ||||
|     whilelt p1.s, counterI, origM             //SVE instruction | ||||
|     cntp lanes, p0, p1.s | ||||
|     b.any   .Lcgemm_kernel_L1_Mv1_20    | ||||
| 
 | ||||
| .Lcgemm_kernel_L1_END: | ||||
| 
 | ||||
| /******************************************************************************/ | ||||
| 
 | ||||
| .Lcgemm_kernel_L999: | ||||
| 	mov	x0, #0				// set return value | ||||
| 	ldp	d8, d9, [sp, #(0 * 16)] | ||||
| 	ldp	d10, d11, [sp, #(1 * 16)] | ||||
| 	ldp	d12, d13, [sp, #(2 * 16)] | ||||
| 	ldp	d14, d15, [sp, #(3 * 16)] | ||||
| 	ldp	d16, d17, [sp, #(4 * 16)] | ||||
| 	ldp	x18, x19, [sp, #(5 * 16)] | ||||
| 	ldp	x20, x21, [sp, #(6 * 16)] | ||||
| 	ldp	x22, x23, [sp, #(7 * 16)] | ||||
| 	ldp	x24, x25, [sp, #(8 * 16)] | ||||
| 	ldp	x26, x27, [sp, #(9 * 16)] | ||||
| 	ldr	x28, [sp, #(10 * 16)] | ||||
| 	add	sp, sp, #(11*16) | ||||
| 	ret | ||||
| 
 | ||||
| 	EPILOGUE | ||||
| 
 | ||||
|  | @ -0,0 +1,79 @@ | |||
| /*********************************************************************/ | ||||
| /* Copyright 2009, 2010 The University of Texas at Austin.           */ | ||||
| /* All rights reserved.                                              */ | ||||
| /*                                                                   */ | ||||
| /* Redistribution and use in source and binary forms, with or        */ | ||||
| /* without modification, are permitted provided that the following   */ | ||||
| /* conditions are met:                                               */ | ||||
| /*                                                                   */ | ||||
| /*   1. Redistributions of source code must retain the above         */ | ||||
| /*      copyright notice, this list of conditions and the following  */ | ||||
| /*      disclaimer.                                                  */ | ||||
| /*                                                                   */ | ||||
| /*   2. Redistributions in binary form must reproduce the above      */ | ||||
| /*      copyright notice, this list of conditions and the following  */ | ||||
| /*      disclaimer in the documentation and/or other materials       */ | ||||
| /*      provided with the distribution.                              */ | ||||
| /*                                                                   */ | ||||
| /*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */ | ||||
| /*    AUSTIN  ``AS IS''  AND ANY  EXPRESS OR  IMPLIED WARRANTIES,    */ | ||||
| /*    INCLUDING, BUT  NOT LIMITED  TO, THE IMPLIED  WARRANTIES OF    */ | ||||
| /*    MERCHANTABILITY  AND FITNESS FOR  A PARTICULAR  PURPOSE ARE    */ | ||||
| /*    DISCLAIMED.  IN  NO EVENT SHALL THE UNIVERSITY  OF TEXAS AT    */ | ||||
| /*    AUSTIN OR CONTRIBUTORS BE  LIABLE FOR ANY DIRECT, INDIRECT,    */ | ||||
| /*    INCIDENTAL,  SPECIAL, EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES    */ | ||||
| /*    (INCLUDING, BUT  NOT LIMITED TO,  PROCUREMENT OF SUBSTITUTE    */ | ||||
| /*    GOODS  OR  SERVICES; LOSS  OF  USE,  DATA,  OR PROFITS;  OR    */ | ||||
| /*    BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF    */ | ||||
| /*    LIABILITY, WHETHER  IN CONTRACT, STRICT  LIABILITY, OR TORT    */ | ||||
| /*    (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY OUT    */ | ||||
| /*    OF  THE  USE OF  THIS  SOFTWARE,  EVEN  IF ADVISED  OF  THE    */ | ||||
| /*    POSSIBILITY OF SUCH DAMAGE.                                    */ | ||||
| /*                                                                   */ | ||||
| /* The views and conclusions contained in the software and           */ | ||||
| /* documentation are those of the authors and should not be          */ | ||||
| /* interpreted as representing official policies, either expressed   */ | ||||
| /* or implied, of The University of Texas at Austin.                 */ | ||||
| /*********************************************************************/ | ||||
| 
 | ||||
| #include <stdio.h> | ||||
| #include "common.h" | ||||
| #include <arm_sve.h> | ||||
| 
 | ||||
| // TODO: write in assembly with proper unrolling of inner loop
 | ||||
| /* | ||||
|  * Copy an m-by-n block of interleaved (real, imag) single-precision complex | ||||
|  * values from a into the contiguous buffer b. | ||||
|  * | ||||
|  * Element (i, j) is read from a[(i + j * lda) * 2] (real part) and the float | ||||
|  * that follows it (imaginary part).  The j range is processed in panels of up | ||||
|  * to svcntw() entries; for each panel and each i in [0, m) the `active` | ||||
|  * values of that i are gathered and stored back to back, so b advances by | ||||
|  * active * 2 elements per i. | ||||
|  * | ||||
|  * The gather index vector holds 32-bit offsets (lane k reads at k * lda * 2), | ||||
|  * so lda * 2 * (svcntw() - 1) has to fit in an int32_t. | ||||
|  * | ||||
|  * Nothing is copied when n <= 0 or m <= 0.  Always returns 0. | ||||
|  */ | ||||
| int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ | ||||
| 
 | ||||
|     svbool_t all_lanes = svptrue_b32(); | ||||
|     svint32_t lda_vec = svindex_s32(0, lda * 2); | ||||
| 
 | ||||
|     IFLOAT *aoffset = a; | ||||
|     IFLOAT *boffset = b; | ||||
| 
 | ||||
|     BLASLONG j = 0; | ||||
|     svbool_t pg = svwhilelt_b32(j, n); | ||||
| 
 | ||||
|     while (svptest_any(all_lanes, pg)) { | ||||
| 
 | ||||
|         /* Number of entries handled by this panel (at most svcntw()). */ | ||||
|         uint32_t active = svcntp_b32(all_lanes, pg); | ||||
|         IFLOAT *aoffset1 = aoffset; | ||||
| 
 | ||||
|         /* Count in BLASLONG so that m is never narrowed to 32 bits. */ | ||||
|         for (BLASLONG i = 0; i < m; i++) { | ||||
|             svfloat32_t a_vec_real = svld1_gather_index(pg, (float *) aoffset1, lda_vec); | ||||
|             svfloat32_t a_vec_imag = svld1_gather_index(pg, ((float *) aoffset1) + 1, lda_vec); | ||||
|             svst2_f32(pg, (float *) boffset, svcreate2(a_vec_real, a_vec_imag)); | ||||
|             aoffset1 += 2; | ||||
|             boffset += active * 2; | ||||
|         } | ||||
|         aoffset += active * lda * 2; | ||||
| 
 | ||||
|         j += svcntw(); | ||||
|         pg = svwhilelt_b32(j, n); | ||||
|     } | ||||
| 
 | ||||
|     return 0; | ||||
| } | ||||
|  | @ -0,0 +1,75 @@ | |||
| /*********************************************************************/ | ||||
| /* Copyright 2009, 2010 The University of Texas at Austin.           */ | ||||
| /* All rights reserved.                                              */ | ||||
| /*                                                                   */ | ||||
| /* Redistribution and use in source and binary forms, with or        */ | ||||
| /* without modification, are permitted provided that the following   */ | ||||
| /* conditions are met:                                               */ | ||||
| /*                                                                   */ | ||||
| /*   1. Redistributions of source code must retain the above         */ | ||||
| /*      copyright notice, this list of conditions and the following  */ | ||||
| /*      disclaimer.                                                  */ | ||||
| /*                                                                   */ | ||||
| /*   2. Redistributions in binary form must reproduce the above      */ | ||||
| /*      copyright notice, this list of conditions and the following  */ | ||||
| /*      disclaimer in the documentation and/or other materials       */ | ||||
| /*      provided with the distribution.                              */ | ||||
| /*                                                                   */ | ||||
| /*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */ | ||||
| /*    AUSTIN  ``AS IS''  AND ANY  EXPRESS OR  IMPLIED WARRANTIES,    */ | ||||
| /*    INCLUDING, BUT  NOT LIMITED  TO, THE IMPLIED  WARRANTIES OF    */ | ||||
| /*    MERCHANTABILITY  AND FITNESS FOR  A PARTICULAR  PURPOSE ARE    */ | ||||
| /*    DISCLAIMED.  IN  NO EVENT SHALL THE UNIVERSITY  OF TEXAS AT    */ | ||||
| /*    AUSTIN OR CONTRIBUTORS BE  LIABLE FOR ANY DIRECT, INDIRECT,    */ | ||||
| /*    INCIDENTAL,  SPECIAL, EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES    */ | ||||
| /*    (INCLUDING, BUT  NOT LIMITED TO,  PROCUREMENT OF SUBSTITUTE    */ | ||||
| /*    GOODS  OR  SERVICES; LOSS  OF  USE,  DATA,  OR PROFITS;  OR    */ | ||||
| /*    BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF    */ | ||||
| /*    LIABILITY, WHETHER  IN CONTRACT, STRICT  LIABILITY, OR TORT    */ | ||||
| /*    (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY OUT    */ | ||||
| /*    OF  THE  USE OF  THIS  SOFTWARE,  EVEN  IF ADVISED  OF  THE    */ | ||||
| /*    POSSIBILITY OF SUCH DAMAGE.                                    */ | ||||
| /*                                                                   */ | ||||
| /* The views and conclusions contained in the software and           */ | ||||
| /* documentation are those of the authors and should not be          */ | ||||
| /* interpreted as representing official policies, either expressed   */ | ||||
| /* or implied, of The University of Texas at Austin.                 */ | ||||
| /*********************************************************************/ | ||||
| 
 | ||||
| #include <stdio.h> | ||||
| #include "common.h" | ||||
| #include <arm_sve.h> | ||||
| 
 | ||||
| // TODO: write in assembly with proper unrolling of inner loop
 | ||||
| /* | ||||
|  * Copy an m-by-n block of interleaved (real, imag) single-precision complex | ||||
|  * values from a into the contiguous buffer b. | ||||
|  * | ||||
|  * Element (i, j) is read from a[(i * lda + j) * 2] (real part) and the float | ||||
|  * that follows it (imaginary part).  The j range is processed in panels of up | ||||
|  * to svcntw() entries; for each panel and each i in [0, m) the `active` | ||||
|  * consecutive complex values are loaded and stored back to back, so b | ||||
|  * advances by active * 2 elements per i. | ||||
|  * | ||||
|  * Nothing is copied when n <= 0 or m <= 0.  Always returns 0. | ||||
|  */ | ||||
| int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ | ||||
| 
 | ||||
|     svbool_t all_lanes = svptrue_b32(); | ||||
| 
 | ||||
|     IFLOAT *aoffset = a; | ||||
|     IFLOAT *boffset = b; | ||||
| 
 | ||||
|     BLASLONG j = 0; | ||||
|     svbool_t pg = svwhilelt_b32(j, n); | ||||
| 
 | ||||
|     while (svptest_any(all_lanes, pg)) { | ||||
| 
 | ||||
|         /* Number of entries handled by this panel (at most svcntw()). */ | ||||
|         uint32_t active = svcntp_b32(all_lanes, pg); | ||||
|         IFLOAT *aoffset1 = aoffset; | ||||
| 
 | ||||
|         /* Count in BLASLONG so that m is never narrowed to 32 bits. */ | ||||
|         for (BLASLONG i = 0; i < m; i++) { | ||||
|             svfloat32x2_t a_vec = svld2(pg, (float *)aoffset1); | ||||
|             svst2_f32(pg, (float *) boffset, a_vec); | ||||
|             aoffset1 += lda * 2; | ||||
|             boffset += active * 2; | ||||
|         } | ||||
|         aoffset += active * 2; | ||||
| 
 | ||||
|         j += svcntw(); | ||||
|         pg = svwhilelt_b32(j, n); | ||||
|     } | ||||
| 
 | ||||
|     return 0; | ||||
| } | ||||
										
											
												File diff suppressed because it is too large
												Load Diff
											
										
									
								
							|  | @ -0,0 +1,320 @@ | |||
| /*********************************************************************/ | ||||
| /* Copyright 2009, 2010 The University of Texas at Austin.           */ | ||||
| /* All rights reserved.                                              */ | ||||
| /*                                                                   */ | ||||
| /* Redistribution and use in source and binary forms, with or        */ | ||||
| /* without modification, are permitted provided that the following   */ | ||||
| /* conditions are met:                                               */ | ||||
| /*                                                                   */ | ||||
| /*   1. Redistributions of source code must retain the above         */ | ||||
| /*      copyright notice, this list of conditions and the following  */ | ||||
| /*      disclaimer.                                                  */ | ||||
| /*                                                                   */ | ||||
| /*   2. Redistributions in binary form must reproduce the above      */ | ||||
| /*      copyright notice, this list of conditions and the following  */ | ||||
| /*      disclaimer in the documentation and/or other materials       */ | ||||
| /*      provided with the distribution.                              */ | ||||
| /*                                                                   */ | ||||
| /*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */ | ||||
| /*    AUSTIN  ``AS IS''  AND ANY  EXPRESS OR  IMPLIED WARRANTIES,    */ | ||||
| /*    INCLUDING, BUT  NOT LIMITED  TO, THE IMPLIED  WARRANTIES OF    */ | ||||
| /*    MERCHANTABILITY  AND FITNESS FOR  A PARTICULAR  PURPOSE ARE    */ | ||||
| /*    DISCLAIMED.  IN  NO EVENT SHALL THE UNIVERSITY  OF TEXAS AT    */ | ||||
| /*    AUSTIN OR CONTRIBUTORS BE  LIABLE FOR ANY DIRECT, INDIRECT,    */ | ||||
| /*    INCIDENTAL,  SPECIAL, EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES    */ | ||||
| /*    (INCLUDING, BUT  NOT LIMITED TO,  PROCUREMENT OF SUBSTITUTE    */ | ||||
| /*    GOODS  OR  SERVICES; LOSS  OF  USE,  DATA,  OR PROFITS;  OR    */ | ||||
| /*    BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF    */ | ||||
| /*    LIABILITY, WHETHER  IN CONTRACT, STRICT  LIABILITY, OR TORT    */ | ||||
| /*    (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY OUT    */ | ||||
| /*    OF  THE  USE OF  THIS  SOFTWARE,  EVEN  IF ADVISED  OF  THE    */ | ||||
| /*    POSSIBILITY OF SUCH DAMAGE.                                    */ | ||||
| /*                                                                   */ | ||||
| /* The views and conclusions contained in the software and           */ | ||||
| /* documentation are those of the authors and should not be          */ | ||||
| /* interpreted as representing official policies, either expressed   */ | ||||
| /* or implied, of The University of Texas at Austin.                 */ | ||||
| /*********************************************************************/ | ||||
| 
 | ||||
| #include "common.h" | ||||
| #include "arm_sve.h" | ||||
| 
 | ||||
| static FLOAT dm1 = -1.; | ||||
| 
 | ||||
| #ifdef CONJ | ||||
| #define GEMM_KERNEL   GEMM_KERNEL_L | ||||
| #else | ||||
| #define GEMM_KERNEL   GEMM_KERNEL_N | ||||
| #endif | ||||
| 
 | ||||
| #if GEMM_DEFAULT_UNROLL_N == 1 | ||||
| #define GEMM_UNROLL_N_SHIFT 0 | ||||
| #endif | ||||
| 
 | ||||
| #if GEMM_DEFAULT_UNROLL_N == 2 | ||||
| #define GEMM_UNROLL_N_SHIFT 1 | ||||
| #endif | ||||
| 
 | ||||
| #if GEMM_DEFAULT_UNROLL_N == 4 | ||||
| #define GEMM_UNROLL_N_SHIFT 2 | ||||
| #endif | ||||
| 
 | ||||
| #if GEMM_DEFAULT_UNROLL_N == 8 | ||||
| #define GEMM_UNROLL_N_SHIFT 3 | ||||
| #endif | ||||
| 
 | ||||
| #if GEMM_DEFAULT_UNROLL_N == 16 | ||||
| #define GEMM_UNROLL_N_SHIFT 4 | ||||
| #endif | ||||
| 
 | ||||
| #ifndef COMPLEX | ||||
| 
 | ||||
| static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { | ||||
| 
 | ||||
|   FLOAT aa,  bb; | ||||
| 
 | ||||
|   int i, j, k; | ||||
| 
 | ||||
|   a += (m - 1) * m; | ||||
|   b += (m - 1) * n; | ||||
| 
 | ||||
|   for (i = m - 1; i >= 0; i--) { | ||||
| 
 | ||||
|     aa = *(a + i); | ||||
| 
 | ||||
|     for (j = 0; j < n; j ++) { | ||||
|       bb = *(c + i + j * ldc); | ||||
|       bb *= aa; | ||||
|       *b             = bb; | ||||
|       *(c + i + j * ldc) = bb; | ||||
|       b ++; | ||||
| 
 | ||||
|       for (k = 0; k < i; k ++){ | ||||
|         *(c + k + j * ldc) -= bb * *(a + k); | ||||
|       } | ||||
| 
 | ||||
|     } | ||||
|     a -= m; | ||||
|     b -= 2 * n; | ||||
|   } | ||||
| 
 | ||||
| } | ||||
| 
 | ||||
| #else | ||||
| 
 | ||||
| static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { | ||||
| 
 | ||||
|   FLOAT aa1, aa2; | ||||
|   FLOAT bb1, bb2; | ||||
|   FLOAT cc1, cc2; | ||||
| 
 | ||||
|   int i, j, k; | ||||
| 
 | ||||
|   ldc *= 2; | ||||
|   a += (m - 1) * m * 2; | ||||
|   b += (m - 1) * n * 2; | ||||
| 
 | ||||
|   for (i = m - 1; i >= 0; i--) { | ||||
| 
 | ||||
|     aa1 = *(a + i * 2 + 0); | ||||
|     aa2 = *(a + i * 2 + 1); | ||||
| 
 | ||||
|     for (j = 0; j < n; j ++) { | ||||
|       bb1 = *(c + i * 2 + 0 + j * ldc); | ||||
|       bb2 = *(c + i * 2 + 1 + j * ldc); | ||||
| 
 | ||||
| #ifndef CONJ | ||||
|       cc1 = aa1 * bb1 - aa2 * bb2; | ||||
|       cc2 = aa1 * bb2 + aa2 * bb1; | ||||
| #else | ||||
|       cc1 = aa1 * bb1 + aa2 * bb2; | ||||
|       cc2 = aa1 * bb2 - aa2 * bb1; | ||||
| #endif | ||||
| 
 | ||||
| 
 | ||||
|       *(b + 0) = cc1; | ||||
|       *(b + 1) = cc2; | ||||
|       *(c + i * 2 + 0 + j * ldc) = cc1; | ||||
|       *(c + i * 2 + 1 + j * ldc) = cc2; | ||||
|       b += 2; | ||||
| 
 | ||||
|       for (k = 0; k < i; k ++){ | ||||
| #ifndef CONJ | ||||
|         *(c + k * 2 + 0 + j * ldc) -= cc1 * *(a + k * 2 + 0) - cc2 * *(a + k * 2 + 1); | ||||
|         *(c + k * 2 + 1 + j * ldc) -= cc1 * *(a + k * 2 + 1) + cc2 * *(a + k * 2 + 0); | ||||
| #else | ||||
|         *(c + k * 2 + 0 + j * ldc) -=   cc1 * *(a + k * 2 + 0) + cc2 * *(a + k * 2 + 1); | ||||
|         *(c + k * 2 + 1 + j * ldc) -= - cc1 * *(a + k * 2 + 1) + cc2 * *(a + k * 2 + 0); | ||||
| #endif | ||||
|       } | ||||
| 
 | ||||
|     } | ||||
|     a -= m * 2; | ||||
|     b -= 4 * n; | ||||
|   } | ||||
| 
 | ||||
| } | ||||
| 
 | ||||
| #endif | ||||
| 
 | ||||
| 
 | ||||
| int CNAME(BLASLONG m, BLASLONG n, BLASLONG k,  FLOAT dummy1, | ||||
| #ifdef COMPLEX | ||||
|     FLOAT dummy2, | ||||
| #endif | ||||
|     FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG offset){ | ||||
| 
 | ||||
|   BLASLONG i, j; | ||||
|   FLOAT *aa, *cc; | ||||
|   BLASLONG  kk; | ||||
| #ifdef DOUBLE | ||||
|   int sve_size = svcntd(); | ||||
| #else | ||||
|   int sve_size = svcntw(); | ||||
| #endif | ||||
| 
 | ||||
| #if 0 | ||||
|   fprintf(stderr, "TRSM KERNEL LN : m = %3ld  n = %3ld  k = %3ld offset = %3ld\n", | ||||
|       m, n, k, offset); | ||||
| #endif | ||||
| 
 | ||||
|   j = (n >> GEMM_UNROLL_N_SHIFT); | ||||
| 
 | ||||
|   while (j > 0) { | ||||
| 
 | ||||
|     kk = m + offset; | ||||
| 
 | ||||
|     i = m % sve_size; | ||||
|     if (i) { | ||||
|       aa = a + (m - i) * k * COMPSIZE; | ||||
|       cc = c + (m - i)     * COMPSIZE; | ||||
| 
 | ||||
|       if (k - kk > 0) { | ||||
|         GEMM_KERNEL(i, GEMM_UNROLL_N, k - kk, dm1, | ||||
| #ifdef COMPLEX | ||||
|             ZERO, | ||||
| #endif | ||||
|             aa + i             * kk * COMPSIZE, | ||||
|             b  + GEMM_UNROLL_N * kk * COMPSIZE, | ||||
|             cc, | ||||
|             ldc); | ||||
|       } | ||||
| 
 | ||||
|       solve(i, GEMM_UNROLL_N, | ||||
|           aa + (kk - i) * i             * COMPSIZE, | ||||
|           b  + (kk - i) * GEMM_UNROLL_N * COMPSIZE, | ||||
|           cc, ldc); | ||||
| 
 | ||||
|       kk -= i; | ||||
| 
 | ||||
|     } | ||||
| 
 | ||||
|     int mod = i; | ||||
|     i = sve_size; | ||||
|     if (i <= m) { | ||||
|       aa = a + (m - mod - sve_size) * k * COMPSIZE; | ||||
|       cc = c + (m - mod - sve_size)     * COMPSIZE; | ||||
| 
 | ||||
|       do { | ||||
|         if (k - kk > 0) { | ||||
|           GEMM_KERNEL(sve_size, GEMM_UNROLL_N, k - kk, dm1, | ||||
| #ifdef COMPLEX | ||||
|               ZERO, | ||||
| #endif | ||||
|               aa + sve_size * kk * COMPSIZE, | ||||
|               b +  GEMM_UNROLL_N * kk * COMPSIZE, | ||||
|               cc, | ||||
|               ldc); | ||||
|         } | ||||
| 
 | ||||
|         solve(sve_size, GEMM_UNROLL_N, | ||||
|             aa + (kk - sve_size) * sve_size * COMPSIZE, | ||||
|             b  + (kk - sve_size) * GEMM_UNROLL_N * COMPSIZE, | ||||
|             cc, ldc); | ||||
| 
 | ||||
|         aa -= sve_size * k * COMPSIZE; | ||||
|         cc -= sve_size     * COMPSIZE; | ||||
|         kk -= sve_size; | ||||
| 
 | ||||
|         i += sve_size; | ||||
|       } while (i <= m); | ||||
|     } | ||||
| 
 | ||||
| 
 | ||||
|     b += GEMM_UNROLL_N * k * COMPSIZE; | ||||
|     c += GEMM_UNROLL_N * ldc * COMPSIZE; | ||||
|     j --; | ||||
|   } | ||||
| 
 | ||||
|   if (n & (GEMM_UNROLL_N - 1)) { | ||||
| 
 | ||||
|     j = (GEMM_UNROLL_N >> 1); | ||||
|     while (j > 0) { | ||||
|       if (n & j) { | ||||
| 
 | ||||
|         kk = m + offset; | ||||
| 
 | ||||
|         i = m % sve_size; | ||||
|         if (i) { | ||||
|           aa = a + (m - i) * k * COMPSIZE; | ||||
|           cc = c + (m - i)     * COMPSIZE; | ||||
| 
 | ||||
|           if (k - kk > 0) { | ||||
|             GEMM_KERNEL(i, j, k - kk, dm1, | ||||
| #ifdef COMPLEX | ||||
|                 ZERO, | ||||
| #endif | ||||
|                 aa + i * kk * COMPSIZE, | ||||
|                 b  + j * kk * COMPSIZE, | ||||
|                 cc, ldc); | ||||
|           } | ||||
| 
 | ||||
|           solve(i, j, | ||||
|               aa + (kk - i) * i * COMPSIZE, | ||||
|               b  + (kk - i) * j * COMPSIZE, | ||||
|               cc, ldc); | ||||
| 
 | ||||
|           kk -= i; | ||||
| 
 | ||||
|         } | ||||
| 
 | ||||
|         int mod = i; | ||||
|         i = sve_size; | ||||
|         if (i <= m) { | ||||
|           aa = a + (m - mod - sve_size) * k * COMPSIZE; | ||||
|           cc = c + (m - mod - sve_size)     * COMPSIZE; | ||||
| 
 | ||||
|           do { | ||||
|             if (k - kk > 0) { | ||||
|               GEMM_KERNEL(sve_size, j, k - kk, dm1, | ||||
| #ifdef COMPLEX | ||||
|                   ZERO, | ||||
| #endif | ||||
|                   aa + sve_size * kk * COMPSIZE, | ||||
|                   b +  j             * kk * COMPSIZE, | ||||
|                   cc, | ||||
|                   ldc); | ||||
|             } | ||||
| 
 | ||||
|             solve(sve_size, j, | ||||
|                 aa + (kk - sve_size) * sve_size * COMPSIZE, | ||||
|                 b  + (kk - sve_size) * j             * COMPSIZE, | ||||
|                 cc, ldc); | ||||
| 
 | ||||
|             aa -= sve_size * k * COMPSIZE; | ||||
|             cc -= sve_size     * COMPSIZE; | ||||
|             kk -= sve_size; | ||||
| 
 | ||||
|             i += sve_size; | ||||
|           } while (i <= m); | ||||
|         } | ||||
| 
 | ||||
|         b += j * k   * COMPSIZE; | ||||
|         c += j * ldc * COMPSIZE; | ||||
|       } | ||||
|       j >>= 1; | ||||
|     } | ||||
|   } | ||||
| 
 | ||||
|   return 0; | ||||
| } | ||||
|  | @ -0,0 +1,295 @@ | |||
| /*********************************************************************/ | ||||
| /* Copyright 2009, 2010 The University of Texas at Austin.           */ | ||||
| /* All rights reserved.                                              */ | ||||
| /*                                                                   */ | ||||
| /* Redistribution and use in source and binary forms, with or        */ | ||||
| /* without modification, are permitted provided that the following   */ | ||||
| /* conditions are met:                                               */ | ||||
| /*                                                                   */ | ||||
| /*   1. Redistributions of source code must retain the above         */ | ||||
| /*      copyright notice, this list of conditions and the following  */ | ||||
| /*      disclaimer.                                                  */ | ||||
| /*                                                                   */ | ||||
| /*   2. Redistributions in binary form must reproduce the above      */ | ||||
| /*      copyright notice, this list of conditions and the following  */ | ||||
| /*      disclaimer in the documentation and/or other materials       */ | ||||
| /*      provided with the distribution.                              */ | ||||
| /*                                                                   */ | ||||
| /*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */ | ||||
| /*    AUSTIN  ``AS IS''  AND ANY  EXPRESS OR  IMPLIED WARRANTIES,    */ | ||||
| /*    INCLUDING, BUT  NOT LIMITED  TO, THE IMPLIED  WARRANTIES OF    */ | ||||
| /*    MERCHANTABILITY  AND FITNESS FOR  A PARTICULAR  PURPOSE ARE    */ | ||||
| /*    DISCLAIMED.  IN  NO EVENT SHALL THE UNIVERSITY  OF TEXAS AT    */ | ||||
| /*    AUSTIN OR CONTRIBUTORS BE  LIABLE FOR ANY DIRECT, INDIRECT,    */ | ||||
| /*    INCIDENTAL,  SPECIAL, EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES    */ | ||||
| /*    (INCLUDING, BUT  NOT LIMITED TO,  PROCUREMENT OF SUBSTITUTE    */ | ||||
| /*    GOODS  OR  SERVICES; LOSS  OF  USE,  DATA,  OR PROFITS;  OR    */ | ||||
| /*    BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF    */ | ||||
| /*    LIABILITY, WHETHER  IN CONTRACT, STRICT  LIABILITY, OR TORT    */ | ||||
| /*    (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY OUT    */ | ||||
| /*    OF  THE  USE OF  THIS  SOFTWARE,  EVEN  IF ADVISED  OF  THE    */ | ||||
| /*    POSSIBILITY OF SUCH DAMAGE.                                    */ | ||||
| /*                                                                   */ | ||||
| /* The views and conclusions contained in the software and           */ | ||||
| /* documentation are those of the authors and should not be          */ | ||||
| /* interpreted as representing official policies, either expressed   */ | ||||
| /* or implied, of The University of Texas at Austin.                 */ | ||||
| /*********************************************************************/ | ||||
| 
 | ||||
| #include "common.h" | ||||
| #include "arm_sve.h" | ||||
| 
 | ||||
| static FLOAT dm1 = -1.; | ||||
| 
 | ||||
| #ifdef CONJ | ||||
| #define GEMM_KERNEL   GEMM_KERNEL_L | ||||
| #else | ||||
| #define GEMM_KERNEL   GEMM_KERNEL_N | ||||
| #endif | ||||
| 
 | ||||
| #if GEMM_DEFAULT_UNROLL_N == 1 | ||||
| #define GEMM_UNROLL_N_SHIFT 0 | ||||
| #endif | ||||
| 
 | ||||
| #if GEMM_DEFAULT_UNROLL_N == 2 | ||||
| #define GEMM_UNROLL_N_SHIFT 1 | ||||
| #endif | ||||
| 
 | ||||
| #if GEMM_DEFAULT_UNROLL_N == 4 | ||||
| #define GEMM_UNROLL_N_SHIFT 2 | ||||
| #endif | ||||
| 
 | ||||
| #if GEMM_DEFAULT_UNROLL_N == 8 | ||||
| #define GEMM_UNROLL_N_SHIFT 3 | ||||
| #endif | ||||
| 
 | ||||
| #if GEMM_DEFAULT_UNROLL_N == 16 | ||||
| #define GEMM_UNROLL_N_SHIFT 4 | ||||
| #endif | ||||
| 
 | ||||
| #ifndef COMPLEX | ||||
| 
 | ||||
| static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { | ||||
| 
 | ||||
|   FLOAT aa, bb; | ||||
| 
 | ||||
|   int i, j, k; | ||||
| 
 | ||||
|   for (i = 0; i < m; i++) { | ||||
| 
 | ||||
|     aa = *(a + i); | ||||
| 
 | ||||
|     for (j = 0; j < n; j ++) { | ||||
|       bb = *(c + i + j * ldc); | ||||
|       bb *= aa; | ||||
|       *b             = bb; | ||||
|       *(c + i + j * ldc) = bb; | ||||
|       b ++; | ||||
| 
 | ||||
|       for (k = i + 1; k < m; k ++){ | ||||
| 	*(c + k + j * ldc) -= bb * *(a + k); | ||||
|       } | ||||
| 
 | ||||
|     } | ||||
|     a += m; | ||||
|   } | ||||
| } | ||||
| 
 | ||||
| #else | ||||
| 
 | ||||
| static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { | ||||
| 
 | ||||
|   FLOAT aa1, aa2; | ||||
|   FLOAT bb1, bb2; | ||||
|   FLOAT cc1, cc2; | ||||
| 
 | ||||
|   int i, j, k; | ||||
| 
 | ||||
|   ldc *= 2; | ||||
| 
 | ||||
|   for (i = 0; i < m; i++) { | ||||
| 
 | ||||
|     aa1 = *(a + i * 2 + 0); | ||||
|     aa2 = *(a + i * 2 + 1); | ||||
| 
 | ||||
|     for (j = 0; j < n; j ++) { | ||||
|       bb1 = *(c + i * 2 + 0 + j * ldc); | ||||
|       bb2 = *(c + i * 2 + 1 + j * ldc); | ||||
| 
 | ||||
| #ifndef CONJ | ||||
|       cc1 = aa1 * bb1 - aa2 * bb2; | ||||
|       cc2 = aa1 * bb2 + aa2 * bb1; | ||||
| #else | ||||
|       cc1 = aa1 * bb1 + aa2 * bb2; | ||||
|       cc2 = aa1 * bb2 - aa2 * bb1; | ||||
| #endif | ||||
| 
 | ||||
|       *(b + 0) = cc1; | ||||
|       *(b + 1) = cc2; | ||||
|       *(c + i * 2 + 0 + j * ldc) = cc1; | ||||
|       *(c + i * 2 + 1 + j * ldc) = cc2; | ||||
|       b += 2; | ||||
| 
 | ||||
|       for (k = i + 1; k < m; k ++){ | ||||
| #ifndef CONJ | ||||
| 	*(c + k * 2 + 0 + j * ldc) -= cc1 * *(a + k * 2 + 0) - cc2 * *(a + k * 2 + 1); | ||||
| 	*(c + k * 2 + 1 + j * ldc) -= cc1 * *(a + k * 2 + 1) + cc2 * *(a + k * 2 + 0); | ||||
| #else | ||||
| 	*(c + k * 2 + 0 + j * ldc) -= cc1 * *(a + k * 2 + 0) + cc2 * *(a + k * 2 + 1); | ||||
| 	*(c + k * 2 + 1 + j * ldc) -= -cc1 * *(a + k * 2 + 1) + cc2 * *(a + k * 2 + 0); | ||||
| #endif | ||||
|       } | ||||
| 
 | ||||
|     } | ||||
|     a += m * 2; | ||||
|   } | ||||
| } | ||||
| 
 | ||||
| #endif | ||||
| 
 | ||||
| 
 | ||||
| int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, | ||||
| #ifdef COMPLEX | ||||
| 	   FLOAT dummy2, | ||||
| #endif | ||||
| 	   FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG offset){ | ||||
| 
 | ||||
|   FLOAT *aa, *cc; | ||||
|   BLASLONG  kk; | ||||
|   BLASLONG i, j, jj; | ||||
| #ifdef DOUBLE | ||||
|   int sve_size = svcntd(); | ||||
| #else | ||||
|   int sve_size = svcntw(); | ||||
| #endif | ||||
| 
 | ||||
| #if 0 | ||||
|   fprintf(stderr, "TRSM KERNEL LT : m = %3ld  n = %3ld  k = %3ld offset = %3ld\n", | ||||
| 	  m, n, k, offset); | ||||
| #endif | ||||
| 
 | ||||
|   jj = 0; | ||||
| 
 | ||||
|   j = (n >> GEMM_UNROLL_N_SHIFT); | ||||
| 
 | ||||
|   while (j > 0) { | ||||
| 
 | ||||
|     kk = offset; | ||||
|     aa = a; | ||||
|     cc = c; | ||||
| 
 | ||||
|     i = sve_size; | ||||
| 
 | ||||
|     while (i <= m) { | ||||
| 
 | ||||
|       if (kk > 0) { | ||||
|         GEMM_KERNEL(sve_size, GEMM_UNROLL_N, kk, dm1, | ||||
| #ifdef COMPLEX | ||||
|             ZERO, | ||||
| #endif | ||||
|             aa, b, cc, ldc); | ||||
|       } | ||||
| 
 | ||||
|       solve(sve_size, GEMM_UNROLL_N, | ||||
|           aa + kk * sve_size * COMPSIZE, | ||||
|           b  + kk * GEMM_UNROLL_N * COMPSIZE, | ||||
|           cc, ldc); | ||||
| 
 | ||||
|       aa += sve_size * k * COMPSIZE; | ||||
|       cc += sve_size     * COMPSIZE; | ||||
|       kk += sve_size; | ||||
|       i += sve_size; | ||||
|     } | ||||
| 
 | ||||
|     i = m % sve_size; | ||||
|     if (i) { | ||||
|       if (kk > 0) { | ||||
|         GEMM_KERNEL(i, GEMM_UNROLL_N, kk, dm1, | ||||
| #ifdef COMPLEX | ||||
|             ZERO, | ||||
| #endif | ||||
|             aa, b, cc, ldc); | ||||
|       } | ||||
|       solve(i, GEMM_UNROLL_N, | ||||
|           aa + kk * i             * COMPSIZE, | ||||
|           b  + kk * GEMM_UNROLL_N * COMPSIZE, | ||||
|           cc, ldc); | ||||
| 
 | ||||
|       aa += i * k * COMPSIZE; | ||||
|       cc += i     * COMPSIZE; | ||||
|       kk += i; | ||||
| 
 | ||||
|     } | ||||
| 
 | ||||
|     b += GEMM_UNROLL_N * k   * COMPSIZE; | ||||
|     c += GEMM_UNROLL_N * ldc * COMPSIZE; | ||||
|     j --; | ||||
|     jj += sve_size; | ||||
|   } | ||||
| 
 | ||||
|   if (n & (GEMM_UNROLL_N - 1)) { | ||||
| 
 | ||||
|     j = (GEMM_UNROLL_N >> 1); | ||||
|     while (j > 0) { | ||||
|       if (n & j) { | ||||
| 
 | ||||
|         kk = offset; | ||||
|         aa = a; | ||||
|         cc = c; | ||||
| 
 | ||||
|         i = sve_size; | ||||
| 
 | ||||
|         while (i <= m) { | ||||
|           if (kk > 0) { | ||||
|             GEMM_KERNEL(sve_size, j, kk, dm1, | ||||
| #ifdef COMPLEX | ||||
|                 ZERO, | ||||
| #endif | ||||
|                 aa, | ||||
|                 b, | ||||
|                 cc, | ||||
|                 ldc); | ||||
|           } | ||||
| 
 | ||||
|           solve(sve_size, j, | ||||
|               aa + kk * sve_size * COMPSIZE, | ||||
|               b  + kk * j             * COMPSIZE, cc, ldc); | ||||
| 
 | ||||
|           aa += sve_size * k * COMPSIZE; | ||||
|           cc += sve_size     * COMPSIZE; | ||||
|           kk += sve_size; | ||||
|           i += sve_size; | ||||
|         } | ||||
| 
 | ||||
|         i = m % sve_size; | ||||
|         if (i) { | ||||
|           if (kk > 0) { | ||||
|             GEMM_KERNEL(i, j, kk, dm1, | ||||
| #ifdef COMPLEX | ||||
|                 ZERO, | ||||
| #endif | ||||
|                 aa, | ||||
|                 b, | ||||
|                 cc, | ||||
|                 ldc); | ||||
|           } | ||||
| 
 | ||||
|           solve(i, j, | ||||
|               aa + kk * i * COMPSIZE, | ||||
|               b  + kk * j * COMPSIZE, cc, ldc); | ||||
| 
 | ||||
|           aa += i * k * COMPSIZE; | ||||
|           cc += i     * COMPSIZE; | ||||
|           kk += i; | ||||
| 
 | ||||
|         } | ||||
| 
 | ||||
|         b += j * k   * COMPSIZE; | ||||
|         c += j * ldc * COMPSIZE; | ||||
|       } | ||||
|       j >>= 1; | ||||
|     } | ||||
|   } | ||||
| 
 | ||||
|   return 0; | ||||
| } | ||||
|  | @ -0,0 +1,293 @@ | |||
| /*********************************************************************/ | ||||
| /* Copyright 2009, 2010 The University of Texas at Austin.           */ | ||||
| /* All rights reserved.                                              */ | ||||
| /*                                                                   */ | ||||
| /* Redistribution and use in source and binary forms, with or        */ | ||||
| /* without modification, are permitted provided that the following   */ | ||||
| /* conditions are met:                                               */ | ||||
| /*                                                                   */ | ||||
| /*   1. Redistributions of source code must retain the above         */ | ||||
| /*      copyright notice, this list of conditions and the following  */ | ||||
| /*      disclaimer.                                                  */ | ||||
| /*                                                                   */ | ||||
| /*   2. Redistributions in binary form must reproduce the above      */ | ||||
| /*      copyright notice, this list of conditions and the following  */ | ||||
| /*      disclaimer in the documentation and/or other materials       */ | ||||
| /*      provided with the distribution.                              */ | ||||
| /*                                                                   */ | ||||
| /*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */ | ||||
| /*    AUSTIN  ``AS IS''  AND ANY  EXPRESS OR  IMPLIED WARRANTIES,    */ | ||||
| /*    INCLUDING, BUT  NOT LIMITED  TO, THE IMPLIED  WARRANTIES OF    */ | ||||
| /*    MERCHANTABILITY  AND FITNESS FOR  A PARTICULAR  PURPOSE ARE    */ | ||||
| /*    DISCLAIMED.  IN  NO EVENT SHALL THE UNIVERSITY  OF TEXAS AT    */ | ||||
| /*    AUSTIN OR CONTRIBUTORS BE  LIABLE FOR ANY DIRECT, INDIRECT,    */ | ||||
| /*    INCIDENTAL,  SPECIAL, EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES    */ | ||||
| /*    (INCLUDING, BUT  NOT LIMITED TO,  PROCUREMENT OF SUBSTITUTE    */ | ||||
| /*    GOODS  OR  SERVICES; LOSS  OF  USE,  DATA,  OR PROFITS;  OR    */ | ||||
| /*    BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF    */ | ||||
| /*    LIABILITY, WHETHER  IN CONTRACT, STRICT  LIABILITY, OR TORT    */ | ||||
| /*    (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY OUT    */ | ||||
| /*    OF  THE  USE OF  THIS  SOFTWARE,  EVEN  IF ADVISED  OF  THE    */ | ||||
| /*    POSSIBILITY OF SUCH DAMAGE.                                    */ | ||||
| /*                                                                   */ | ||||
| /* The views and conclusions contained in the software and           */ | ||||
| /* documentation are those of the authors and should not be          */ | ||||
| /* interpreted as representing official policies, either expressed   */ | ||||
| /* or implied, of The University of Texas at Austin.                 */ | ||||
| /*********************************************************************/ | ||||
| 
 | ||||
| #include "common.h" | ||||
| #include "arm_sve.h" | ||||
| 
 | ||||
| static FLOAT dm1 = -1.; | ||||
| 
 | ||||
| #ifdef CONJ | ||||
| #define GEMM_KERNEL   GEMM_KERNEL_R | ||||
| #else | ||||
| #define GEMM_KERNEL   GEMM_KERNEL_N | ||||
| #endif | ||||
| 
 | ||||
| #if GEMM_DEFAULT_UNROLL_N == 1 | ||||
| #define GEMM_UNROLL_N_SHIFT 0 | ||||
| #endif | ||||
| 
 | ||||
| #if GEMM_DEFAULT_UNROLL_N == 2 | ||||
| #define GEMM_UNROLL_N_SHIFT 1 | ||||
| #endif | ||||
| 
 | ||||
| #if GEMM_DEFAULT_UNROLL_N == 4 | ||||
| #define GEMM_UNROLL_N_SHIFT 2 | ||||
| #endif | ||||
| 
 | ||||
| #if GEMM_DEFAULT_UNROLL_N == 8 | ||||
| #define GEMM_UNROLL_N_SHIFT 3 | ||||
| #endif | ||||
| 
 | ||||
| #if GEMM_DEFAULT_UNROLL_N == 16 | ||||
| #define GEMM_UNROLL_N_SHIFT 4 | ||||
| #endif | ||||
| 
 | ||||
| #ifndef COMPLEX | ||||
| 
 | ||||
| static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { | ||||
| 
 | ||||
|   FLOAT aa, bb; | ||||
| 
 | ||||
|   int i, j, k; | ||||
| 
 | ||||
|   for (i = 0; i < n; i++) { | ||||
| 
 | ||||
|     bb = *(b + i); | ||||
| 
 | ||||
|     for (j = 0; j < m; j ++) { | ||||
|       aa = *(c + j + i * ldc); | ||||
|       aa *= bb; | ||||
|       *a  = aa; | ||||
|       *(c + j + i * ldc) = aa; | ||||
|       a ++; | ||||
| 
 | ||||
|       for (k = i + 1; k < n; k ++){ | ||||
| 	*(c + j + k * ldc) -= aa * *(b + k); | ||||
|       } | ||||
| 
 | ||||
|     } | ||||
|     b += n; | ||||
|   } | ||||
| } | ||||
| 
 | ||||
| #else | ||||
| 
 | ||||
| static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { | ||||
| 
 | ||||
|   FLOAT aa1, aa2; | ||||
|   FLOAT bb1, bb2; | ||||
|   FLOAT cc1, cc2; | ||||
| 
 | ||||
|   int i, j, k; | ||||
| 
 | ||||
|   ldc *= 2; | ||||
| 
 | ||||
|   for (i = 0; i < n; i++) { | ||||
| 
 | ||||
|     bb1 = *(b + i * 2 + 0); | ||||
|     bb2 = *(b + i * 2 + 1); | ||||
| 
 | ||||
|     for (j = 0; j < m; j ++) { | ||||
|       aa1 = *(c + j * 2 + 0 + i * ldc); | ||||
|       aa2 = *(c + j * 2 + 1 + i * ldc); | ||||
| 
 | ||||
| #ifndef CONJ | ||||
|       cc1 = aa1 * bb1 - aa2 * bb2; | ||||
|       cc2 = aa1 * bb2 + aa2 * bb1; | ||||
| #else | ||||
|       cc1 =  aa1 * bb1 + aa2 * bb2; | ||||
|       cc2 = -aa1 * bb2 + aa2 * bb1; | ||||
| #endif | ||||
| 
 | ||||
|       *(a + 0) = cc1; | ||||
|       *(a + 1) = cc2; | ||||
|       *(c + j * 2 + 0 + i * ldc) = cc1; | ||||
|       *(c + j * 2 + 1 + i * ldc) = cc2; | ||||
|       a += 2; | ||||
| 
 | ||||
|       for (k = i + 1; k < n; k ++){ | ||||
| #ifndef CONJ | ||||
| 	*(c + j * 2 + 0 + k * ldc) -= cc1 * *(b + k * 2 + 0) - cc2 * *(b + k * 2 + 1); | ||||
| 	*(c + j * 2 + 1 + k * ldc) -= cc1 * *(b + k * 2 + 1) + cc2 * *(b + k * 2 + 0); | ||||
| #else | ||||
| 	*(c + j * 2 + 0 + k * ldc) -=   cc1 * *(b + k * 2 + 0) + cc2 * *(b + k * 2 + 1); | ||||
| 	*(c + j * 2 + 1 + k * ldc) -= - cc1 * *(b + k * 2 + 1) + cc2 * *(b + k * 2 + 0); | ||||
| #endif | ||||
|       } | ||||
| 
 | ||||
|     } | ||||
|     b += n * 2; | ||||
|   } | ||||
| } | ||||
| 
 | ||||
| #endif | ||||
| 
 | ||||
| 
 | ||||
| int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, | ||||
| #ifdef COMPLEX | ||||
| 	   FLOAT dummy2, | ||||
| #endif | ||||
| 	   FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG offset){ | ||||
| 
 | ||||
|   FLOAT *aa, *cc; | ||||
|   BLASLONG  kk; | ||||
|   BLASLONG i, j, jj; | ||||
| #ifdef DOUBLE | ||||
|   int sve_size = svcntd(); | ||||
| #else | ||||
|   int sve_size = svcntw(); | ||||
| #endif | ||||
| 
 | ||||
| #if 0 | ||||
|   fprintf(stderr, "TRSM RN KERNEL m = %3ld  n = %3ld  k = %3ld offset = %3ld\n", | ||||
| 	  m, n, k, offset); | ||||
| #endif | ||||
| 
 | ||||
|   jj = 0; | ||||
|   j = (n >> GEMM_UNROLL_N_SHIFT); | ||||
|   kk = -offset; | ||||
| 
 | ||||
|   while (j > 0) { | ||||
| 
 | ||||
|     aa = a; | ||||
|     cc = c; | ||||
| 
 | ||||
|     i = sve_size; | ||||
| 
 | ||||
|     if (i <= m) { | ||||
|       do { | ||||
| 	if (kk > 0) { | ||||
| 	  GEMM_KERNEL(sve_size, GEMM_UNROLL_N, kk, dm1, | ||||
| #ifdef COMPLEX | ||||
| 		      ZERO, | ||||
| #endif | ||||
| 		      aa, b, cc, ldc); | ||||
| 	} | ||||
| 
 | ||||
| 	solve(sve_size, GEMM_UNROLL_N, | ||||
| 	      aa + kk * sve_size * COMPSIZE, | ||||
| 	      b  + kk * GEMM_UNROLL_N * COMPSIZE, | ||||
| 	      cc, ldc); | ||||
| 
 | ||||
| 	aa += sve_size * k * COMPSIZE; | ||||
| 	cc += sve_size     * COMPSIZE; | ||||
| 	i += sve_size; | ||||
|       } while (i <= m); | ||||
|     } | ||||
| 
 | ||||
| 
 | ||||
|     i = m % sve_size; | ||||
|     if (i) { | ||||
|       if (kk > 0) { | ||||
|         GEMM_KERNEL(i, GEMM_UNROLL_N, kk, dm1, | ||||
| #ifdef COMPLEX | ||||
|             ZERO, | ||||
| #endif | ||||
|             aa, b, cc, ldc); | ||||
|       } | ||||
|       solve(i, GEMM_UNROLL_N, | ||||
|           aa + kk * i             * COMPSIZE, | ||||
|           b  + kk * GEMM_UNROLL_N * COMPSIZE, | ||||
|           cc, ldc); | ||||
| 
 | ||||
|       aa += i * k * COMPSIZE; | ||||
|       cc += i     * COMPSIZE; | ||||
| 
 | ||||
|     } | ||||
| 
 | ||||
|     kk += GEMM_UNROLL_N; | ||||
|     b += GEMM_UNROLL_N * k   * COMPSIZE; | ||||
|     c += GEMM_UNROLL_N * ldc * COMPSIZE; | ||||
|     j --; | ||||
|     jj += sve_size; | ||||
|   } | ||||
| 
 | ||||
|   if (n & (GEMM_UNROLL_N - 1)) { | ||||
| 
 | ||||
|     j = (GEMM_UNROLL_N >> 1); | ||||
|     while (j > 0) { | ||||
|       if (n & j) { | ||||
| 
 | ||||
| 	aa = a; | ||||
| 	cc = c; | ||||
| 
 | ||||
|   i = sve_size; | ||||
| 
 | ||||
| 	while (i <= m) { | ||||
| 	  if (kk > 0) { | ||||
| 	    GEMM_KERNEL(sve_size, j, kk, dm1, | ||||
| #ifdef COMPLEX | ||||
| 			ZERO, | ||||
| #endif | ||||
| 			aa, | ||||
| 			b, | ||||
| 			cc, | ||||
| 			ldc); | ||||
| 	  } | ||||
| 
 | ||||
| 	  solve(sve_size, j, | ||||
| 		aa + kk * sve_size * COMPSIZE, | ||||
| 		b  + kk * j             * COMPSIZE, cc, ldc); | ||||
| 
 | ||||
| 	  aa += sve_size * k * COMPSIZE; | ||||
| 	  cc += sve_size     * COMPSIZE; | ||||
| 	  i += sve_size; | ||||
| 	} | ||||
| 
 | ||||
|   i = m % sve_size; | ||||
|   if (i) { | ||||
| 	      if (kk > 0) { | ||||
| 		GEMM_KERNEL(i, j, kk, dm1, | ||||
| #ifdef COMPLEX | ||||
| 			    ZERO, | ||||
| #endif | ||||
| 			    aa, | ||||
| 			    b, | ||||
| 			    cc, | ||||
| 			    ldc); | ||||
| 	      } | ||||
| 
 | ||||
| 	      solve(i, j, | ||||
| 		    aa + kk * i * COMPSIZE, | ||||
| 		    b  + kk * j * COMPSIZE, cc, ldc); | ||||
| 
 | ||||
| 	      aa += i * k * COMPSIZE; | ||||
| 	      cc += i     * COMPSIZE; | ||||
| 
 | ||||
|   } | ||||
| 
 | ||||
| 	b += j * k   * COMPSIZE; | ||||
| 	c += j * ldc * COMPSIZE; | ||||
| 	kk += j; | ||||
|       } | ||||
|       j >>= 1; | ||||
|     } | ||||
|   } | ||||
| 
 | ||||
|   return 0; | ||||
| } | ||||
|  | @ -0,0 +1,317 @@ | |||
| /*********************************************************************/ | ||||
| /* Copyright 2009, 2010 The University of Texas at Austin.           */ | ||||
| /* All rights reserved.                                              */ | ||||
| /*                                                                   */ | ||||
| /* Redistribution and use in source and binary forms, with or        */ | ||||
| /* without modification, are permitted provided that the following   */ | ||||
| /* conditions are met:                                               */ | ||||
| /*                                                                   */ | ||||
| /*   1. Redistributions of source code must retain the above         */ | ||||
| /*      copyright notice, this list of conditions and the following  */ | ||||
| /*      disclaimer.                                                  */ | ||||
| /*                                                                   */ | ||||
| /*   2. Redistributions in binary form must reproduce the above      */ | ||||
| /*      copyright notice, this list of conditions and the following  */ | ||||
| /*      disclaimer in the documentation and/or other materials       */ | ||||
| /*      provided with the distribution.                              */ | ||||
| /*                                                                   */ | ||||
| /*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */ | ||||
| /*    AUSTIN  ``AS IS''  AND ANY  EXPRESS OR  IMPLIED WARRANTIES,    */ | ||||
| /*    INCLUDING, BUT  NOT LIMITED  TO, THE IMPLIED  WARRANTIES OF    */ | ||||
| /*    MERCHANTABILITY  AND FITNESS FOR  A PARTICULAR  PURPOSE ARE    */ | ||||
| /*    DISCLAIMED.  IN  NO EVENT SHALL THE UNIVERSITY  OF TEXAS AT    */ | ||||
| /*    AUSTIN OR CONTRIBUTORS BE  LIABLE FOR ANY DIRECT, INDIRECT,    */ | ||||
| /*    INCIDENTAL,  SPECIAL, EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES    */ | ||||
| /*    (INCLUDING, BUT  NOT LIMITED TO,  PROCUREMENT OF SUBSTITUTE    */ | ||||
| /*    GOODS  OR  SERVICES; LOSS  OF  USE,  DATA,  OR PROFITS;  OR    */ | ||||
| /*    BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF    */ | ||||
| /*    LIABILITY, WHETHER  IN CONTRACT, STRICT  LIABILITY, OR TORT    */ | ||||
| /*    (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY OUT    */ | ||||
| /*    OF  THE  USE OF  THIS  SOFTWARE,  EVEN  IF ADVISED  OF  THE    */ | ||||
| /*    POSSIBILITY OF SUCH DAMAGE.                                    */ | ||||
| /*                                                                   */ | ||||
| /* The views and conclusions contained in the software and           */ | ||||
| /* documentation are those of the authors and should not be          */ | ||||
| /* interpreted as representing official policies, either expressed   */ | ||||
| /* or implied, of The University of Texas at Austin.                 */ | ||||
| /*********************************************************************/ | ||||
| 
 | ||||
| #include "common.h" | ||||
| #include "arm_sve.h" | ||||
| 
 | ||||
| static FLOAT dm1 = -1.; | ||||
| 
 | ||||
| #ifdef CONJ | ||||
| #define GEMM_KERNEL   GEMM_KERNEL_R | ||||
| #else | ||||
| #define GEMM_KERNEL   GEMM_KERNEL_N | ||||
| #endif | ||||
| 
 | ||||
| #if GEMM_DEFAULT_UNROLL_N == 1 | ||||
| #define GEMM_UNROLL_N_SHIFT 0 | ||||
| #endif | ||||
| 
 | ||||
| #if GEMM_DEFAULT_UNROLL_N == 2 | ||||
| #define GEMM_UNROLL_N_SHIFT 1 | ||||
| #endif | ||||
| 
 | ||||
| #if GEMM_DEFAULT_UNROLL_N == 4 | ||||
| #define GEMM_UNROLL_N_SHIFT 2 | ||||
| #endif | ||||
| 
 | ||||
| #if GEMM_DEFAULT_UNROLL_N == 8 | ||||
| #define GEMM_UNROLL_N_SHIFT 3 | ||||
| #endif | ||||
| 
 | ||||
| #if GEMM_DEFAULT_UNROLL_N == 16 | ||||
| #define GEMM_UNROLL_N_SHIFT 4 | ||||
| #endif | ||||
| 
 | ||||
| 
 | ||||
| #ifndef COMPLEX | ||||
| 
 | ||||
| static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { | ||||
| 
 | ||||
|   FLOAT aa,  bb; | ||||
| 
 | ||||
|   int i, j, k; | ||||
| 
 | ||||
|   a += (n - 1) * m; | ||||
|   b += (n - 1) * n; | ||||
| 
 | ||||
|   for (i = n - 1; i >= 0; i--) { | ||||
| 
 | ||||
|     bb = *(b + i); | ||||
| 
 | ||||
|     for (j = 0; j < m; j ++) { | ||||
|       aa = *(c + j + i * ldc); | ||||
|       aa *= bb; | ||||
|       *a   = aa; | ||||
|       *(c + j + i * ldc) = aa; | ||||
|       a ++; | ||||
| 
 | ||||
|       for (k = 0; k < i; k ++){ | ||||
| 	*(c + j + k * ldc) -= aa * *(b + k); | ||||
|       } | ||||
| 
 | ||||
|     } | ||||
|     b -= n; | ||||
|     a -= 2 * m; | ||||
|   } | ||||
| 
 | ||||
| } | ||||
| 
 | ||||
| #else | ||||
| 
 | ||||
| static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { | ||||
| 
 | ||||
|   FLOAT aa1, aa2; | ||||
|   FLOAT bb1, bb2; | ||||
|   FLOAT cc1, cc2; | ||||
| 
 | ||||
|   int i, j, k; | ||||
| 
 | ||||
|   ldc *= 2; | ||||
| 
 | ||||
|   a += (n - 1) * m * 2; | ||||
|   b += (n - 1) * n * 2; | ||||
| 
 | ||||
|   for (i = n - 1; i >= 0; i--) { | ||||
| 
 | ||||
|     bb1 = *(b + i * 2 + 0); | ||||
|     bb2 = *(b + i * 2 + 1); | ||||
| 
 | ||||
|     for (j = 0; j < m; j ++) { | ||||
| 
 | ||||
|       aa1 = *(c + j * 2 + 0 + i * ldc); | ||||
|       aa2 = *(c + j * 2 + 1 + i * ldc); | ||||
| 
 | ||||
| #ifndef CONJ | ||||
|       cc1 = aa1 * bb1 - aa2 * bb2; | ||||
|       cc2 = aa1 * bb2 + aa2 * bb1; | ||||
| #else | ||||
|       cc1 =  aa1 * bb1  + aa2 * bb2; | ||||
|       cc2 = - aa1 * bb2 + aa2 * bb1; | ||||
| #endif | ||||
| 
 | ||||
|       *(a + 0) = cc1; | ||||
|       *(a + 1) = cc2; | ||||
| 
 | ||||
|       *(c + j * 2 + 0 + i * ldc) = cc1; | ||||
|       *(c + j * 2 + 1 + i * ldc) = cc2; | ||||
|       a += 2; | ||||
| 
 | ||||
|       for (k = 0; k < i; k ++){ | ||||
| #ifndef CONJ | ||||
| 	*(c + j * 2 + 0 + k * ldc) -= cc1 * *(b + k * 2 + 0) - cc2 * *(b + k * 2 + 1); | ||||
| 	*(c + j * 2 + 1 + k * ldc) -= cc1 * *(b + k * 2 + 1) + cc2 * *(b + k * 2 + 0); | ||||
| #else | ||||
| 	*(c + j * 2 + 0 + k * ldc) -=   cc1 * *(b + k * 2 + 0) + cc2 * *(b + k * 2 + 1); | ||||
| 	*(c + j * 2 + 1 + k * ldc) -=  -cc1 * *(b + k * 2 + 1) + cc2 * *(b + k * 2 + 0); | ||||
| #endif | ||||
|       } | ||||
| 
 | ||||
|     } | ||||
|     b -= n * 2; | ||||
|     a -= 4 * m; | ||||
|   } | ||||
| 
 | ||||
| } | ||||
| 
 | ||||
| #endif | ||||
| 
 | ||||
| int CNAME(BLASLONG m, BLASLONG n, BLASLONG k,  FLOAT dummy1, | ||||
| #ifdef COMPLEX | ||||
|     FLOAT dummy2, | ||||
| #endif | ||||
|     FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG offset){ | ||||
| 
 | ||||
|   BLASLONG i, j; | ||||
|   FLOAT *aa, *cc; | ||||
|   BLASLONG  kk; | ||||
| #ifdef DOUBLE | ||||
|   int sve_size = svcntd(); | ||||
| #else | ||||
|   int sve_size = svcntw(); | ||||
| #endif | ||||
| 
 | ||||
| #if 0 | ||||
|   fprintf(stderr, "TRSM RT KERNEL m = %3ld  n = %3ld  k = %3ld offset = %3ld\n", | ||||
|       m, n, k, offset); | ||||
| #endif | ||||
| 
 | ||||
|   kk = n - offset; | ||||
|   c += n * ldc * COMPSIZE; | ||||
|   b += n * k   * COMPSIZE; | ||||
| 
 | ||||
|   if (n & (GEMM_UNROLL_N - 1)) { | ||||
| 
 | ||||
|     j = 1; | ||||
|     while (j < GEMM_UNROLL_N) { | ||||
|       if (n & j) { | ||||
| 
 | ||||
|         aa  = a; | ||||
|         b -= j * k  * COMPSIZE; | ||||
|         c -= j * ldc* COMPSIZE; | ||||
|         cc  = c; | ||||
| 
 | ||||
|         i = sve_size; | ||||
|         if (i <= m) { | ||||
| 
 | ||||
|           do { | ||||
|             if (k - kk > 0) { | ||||
|               GEMM_KERNEL(sve_size, j, k - kk, dm1, | ||||
| #ifdef COMPLEX | ||||
|                   ZERO, | ||||
| #endif | ||||
|                   aa + sve_size * kk * COMPSIZE, | ||||
|                   b  +  j            * kk * COMPSIZE, | ||||
|                   cc, | ||||
|                   ldc); | ||||
|             } | ||||
| 
 | ||||
|             solve(sve_size, j, | ||||
|                 aa + (kk - j) * sve_size * COMPSIZE, | ||||
|                 b  + (kk - j) * j             * COMPSIZE, | ||||
|                 cc, ldc); | ||||
| 
 | ||||
|             aa += sve_size * k * COMPSIZE; | ||||
|             cc += sve_size     * COMPSIZE; | ||||
|             i += sve_size; | ||||
|           } while (i <= m); | ||||
|         } | ||||
| 
 | ||||
|         i = m % sve_size; | ||||
|         if (i) { | ||||
|           if (k - kk > 0) { | ||||
|             GEMM_KERNEL(i, j, k - kk, dm1, | ||||
| #ifdef COMPLEX | ||||
|                 ZERO, | ||||
| #endif | ||||
|                 aa + i * kk * COMPSIZE, | ||||
|                 b  + j * kk * COMPSIZE, | ||||
|                 cc, ldc); | ||||
|           } | ||||
| 
 | ||||
|           solve(i, j, | ||||
|               aa + (kk - j) * i * COMPSIZE, | ||||
|               b  + (kk - j) * j * COMPSIZE, | ||||
|               cc, ldc); | ||||
| 
 | ||||
|           aa += i * k * COMPSIZE; | ||||
|           cc += i     * COMPSIZE; | ||||
| 
 | ||||
|         } | ||||
|         kk -= j; | ||||
|       } | ||||
|       j <<= 1; | ||||
|     } | ||||
|   } | ||||
| 
 | ||||
|   j = (n >> GEMM_UNROLL_N_SHIFT); | ||||
| 
 | ||||
|   if (j > 0) { | ||||
| 
 | ||||
|     do { | ||||
|       aa  = a; | ||||
|       b -= GEMM_UNROLL_N * k   * COMPSIZE; | ||||
|       c -= GEMM_UNROLL_N * ldc * COMPSIZE; | ||||
|       cc  = c; | ||||
| 
 | ||||
|       i = sve_size; | ||||
|       if (i <= m) { | ||||
| 	do { | ||||
| 	  if (k - kk > 0) { | ||||
| 	    GEMM_KERNEL(sve_size, GEMM_UNROLL_N, k - kk, dm1, | ||||
| #ifdef COMPLEX | ||||
| 			ZERO, | ||||
| #endif | ||||
| 			aa + sve_size * kk * COMPSIZE, | ||||
| 			b  + GEMM_UNROLL_N * kk * COMPSIZE, | ||||
| 			cc, | ||||
| 			ldc); | ||||
| 	  } | ||||
| 
 | ||||
| 	  solve(sve_size, GEMM_UNROLL_N, | ||||
| 		aa + (kk - GEMM_UNROLL_N) * sve_size * COMPSIZE, | ||||
| 		b  + (kk - GEMM_UNROLL_N) * GEMM_UNROLL_N * COMPSIZE, | ||||
| 		cc, ldc); | ||||
| 
 | ||||
| 	  aa += sve_size * k * COMPSIZE; | ||||
| 	  cc += sve_size     * COMPSIZE; | ||||
| 	  i += sve_size; | ||||
| 	} while (i <= m); | ||||
|       } | ||||
| 
 | ||||
|       i = m % sve_size; | ||||
|       if (i) { | ||||
| 	    if (k - kk > 0) { | ||||
| 	      GEMM_KERNEL(i, GEMM_UNROLL_N, k - kk, dm1, | ||||
| #ifdef COMPLEX | ||||
| 			  ZERO, | ||||
| #endif | ||||
| 			  aa + i             * kk * COMPSIZE, | ||||
| 			  b  + GEMM_UNROLL_N * kk * COMPSIZE, | ||||
| 			  cc, | ||||
| 			  ldc); | ||||
| 	    } | ||||
| 
 | ||||
| 	    solve(i, GEMM_UNROLL_N, | ||||
| 		  aa + (kk - GEMM_UNROLL_N) * i             * COMPSIZE, | ||||
| 		  b  + (kk - GEMM_UNROLL_N) * GEMM_UNROLL_N * COMPSIZE, | ||||
| 		  cc, ldc); | ||||
| 
 | ||||
| 	    aa += i * k * COMPSIZE; | ||||
| 	    cc += i     * COMPSIZE; | ||||
| 
 | ||||
|       } | ||||
| 
 | ||||
|       kk -= GEMM_UNROLL_N; | ||||
|       j --; | ||||
|     } while (j > 0); | ||||
|   } | ||||
| 
 | ||||
|   return 0; | ||||
| } | ||||
| 
 | ||||
| 
 | ||||
|  | @ -0,0 +1,119 @@ | |||
| /*********************************************************************/ | ||||
| /* Copyright 2009, 2010 The University of Texas at Austin.           */ | ||||
| /* All rights reserved.                                              */ | ||||
| /*                                                                   */ | ||||
| /* Redistribution and use in source and binary forms, with or        */ | ||||
| /* without modification, are permitted provided that the following   */ | ||||
| /* conditions are met:                                               */ | ||||
| /*                                                                   */ | ||||
| /*   1. Redistributions of source code must retain the above         */ | ||||
| /*      copyright notice, this list of conditions and the following  */ | ||||
| /*      disclaimer.                                                  */ | ||||
| /*                                                                   */ | ||||
| /*   2. Redistributions in binary form must reproduce the above      */ | ||||
| /*      copyright notice, this list of conditions and the following  */ | ||||
| /*      disclaimer in the documentation and/or other materials       */ | ||||
| /*      provided with the distribution.                              */ | ||||
| /*                                                                   */ | ||||
| /*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */ | ||||
| /*    AUSTIN  ``AS IS''  AND ANY  EXPRESS OR  IMPLIED WARRANTIES,    */ | ||||
| /*    INCLUDING, BUT  NOT LIMITED  TO, THE IMPLIED  WARRANTIES OF    */ | ||||
| /*    MERCHANTABILITY  AND FITNESS FOR  A PARTICULAR  PURPOSE ARE    */ | ||||
| /*    DISCLAIMED.  IN  NO EVENT SHALL THE UNIVERSITY  OF TEXAS AT    */ | ||||
| /*    AUSTIN OR CONTRIBUTORS BE  LIABLE FOR ANY DIRECT, INDIRECT,    */ | ||||
| /*    INCIDENTAL,  SPECIAL, EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES    */ | ||||
| /*    (INCLUDING, BUT  NOT LIMITED TO,  PROCUREMENT OF SUBSTITUTE    */ | ||||
| /*    GOODS  OR  SERVICES; LOSS  OF  USE,  DATA,  OR PROFITS;  OR    */ | ||||
| /*    BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF    */ | ||||
| /*    LIABILITY, WHETHER  IN CONTRACT, STRICT  LIABILITY, OR TORT    */ | ||||
| /*    (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY OUT    */ | ||||
| /*    OF  THE  USE OF  THIS  SOFTWARE,  EVEN  IF ADVISED  OF  THE    */ | ||||
| /*    POSSIBILITY OF SUCH DAMAGE.                                    */ | ||||
| /*                                                                   */ | ||||
| /* The views and conclusions contained in the software and           */ | ||||
| /* documentation are those of the authors and should not be          */ | ||||
| /* interpreted as representing official policies, either expressed   */ | ||||
| /* or implied, of The University of Texas at Austin.                 */ | ||||
| /*********************************************************************/ | ||||
| 
 | ||||
| #include <stdio.h> | ||||
| #include "common.h" | ||||
| #include "arm_sve.h" | ||||
| 
 | ||||
| #ifndef UNIT | ||||
| #define INV(a) (ONE / (a)) | ||||
| #else | ||||
| #define INV(a) (ONE) | ||||
| #endif | ||||
| 
 | ||||
| int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ | ||||
| 
 | ||||
|   BLASLONG i, ii, jj; | ||||
| 
 | ||||
|   FLOAT *ao; | ||||
| 
 | ||||
|   jj = offset; | ||||
| #ifdef DOUBLE | ||||
|   int64_t js = 0; | ||||
|   svint64_t index = svindex_s64(0LL, lda); | ||||
|   svbool_t pn = svwhilelt_b64(js, n); | ||||
|   int n_active = svcntp_b64(svptrue_b64(), pn); | ||||
| #else | ||||
|   int32_t N = n; | ||||
|   int32_t js = 0; | ||||
|   svint32_t index = svindex_s32(0, lda); | ||||
|   svbool_t pn = svwhilelt_b32(js, N); | ||||
|   int n_active = svcntp_b32(svptrue_b32(), pn); | ||||
| #endif | ||||
|   do { | ||||
| 
 | ||||
|     ao = a; | ||||
| 
 | ||||
|     i = 0; | ||||
|     ii = 0; | ||||
|     do { | ||||
| 
 | ||||
|       if (ii == jj) { | ||||
|         for (int j = 0; j < n_active; j++) { | ||||
|           for (int k = 0; k < j; k++) { | ||||
|             *(b + j * n_active + k) = *(ao + k * lda + j); | ||||
|           } | ||||
|           *(b + j * n_active + j) = INV(*(ao + j * lda + j)); | ||||
|         } | ||||
|         ao += n_active; | ||||
|         b += n_active * n_active; | ||||
|         i += n_active; | ||||
|         ii += n_active; | ||||
|       } else { | ||||
|         if (ii > jj) { | ||||
| #ifdef DOUBLE | ||||
|           svfloat64_t aj_vec = svld1_gather_index(pn, ao, index); | ||||
| #else | ||||
|           svfloat32_t aj_vec = svld1_gather_index(pn, ao, index); | ||||
| #endif | ||||
|           svst1(pn, b, aj_vec); | ||||
|         } | ||||
|         ao++; | ||||
|         b += n_active; | ||||
|         i++; | ||||
|         ii++; | ||||
|       } | ||||
|     } while (i < m); | ||||
| 
 | ||||
| 
 | ||||
|     a += n_active * lda; | ||||
|     jj += n_active; | ||||
| 
 | ||||
|     js += n_active; | ||||
| #ifdef DOUBLE | ||||
|     pn = svwhilelt_b64(js, n); | ||||
|     n_active = svcntp_b64(svptrue_b64(), pn); | ||||
|   } while (svptest_any(svptrue_b64(), pn)); | ||||
| #else | ||||
|     pn = svwhilelt_b32(js, N); | ||||
|     n_active = svcntp_b32(svptrue_b32(), pn); | ||||
|   } while (svptest_any(svptrue_b32(), pn)); | ||||
| #endif | ||||
| 
 | ||||
| return 0; | ||||
| } | ||||
|  | @ -0,0 +1,117 @@ | |||
| /*********************************************************************/ | ||||
| /* Copyright 2009, 2010 The University of Texas at Austin.           */ | ||||
| /* All rights reserved.                                              */ | ||||
| /*                                                                   */ | ||||
| /* Redistribution and use in source and binary forms, with or        */ | ||||
| /* without modification, are permitted provided that the following   */ | ||||
| /* conditions are met:                                               */ | ||||
| /*                                                                   */ | ||||
| /*   1. Redistributions of source code must retain the above         */ | ||||
| /*      copyright notice, this list of conditions and the following  */ | ||||
| /*      disclaimer.                                                  */ | ||||
| /*                                                                   */ | ||||
| /*   2. Redistributions in binary form must reproduce the above      */ | ||||
| /*      copyright notice, this list of conditions and the following  */ | ||||
| /*      disclaimer in the documentation and/or other materials       */ | ||||
| /*      provided with the distribution.                              */ | ||||
| /*                                                                   */ | ||||
| /*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */ | ||||
| /*    AUSTIN  ``AS IS''  AND ANY  EXPRESS OR  IMPLIED WARRANTIES,    */ | ||||
| /*    INCLUDING, BUT  NOT LIMITED  TO, THE IMPLIED  WARRANTIES OF    */ | ||||
| /*    MERCHANTABILITY  AND FITNESS FOR  A PARTICULAR  PURPOSE ARE    */ | ||||
| /*    DISCLAIMED.  IN  NO EVENT SHALL THE UNIVERSITY  OF TEXAS AT    */ | ||||
| /*    AUSTIN OR CONTRIBUTORS BE  LIABLE FOR ANY DIRECT, INDIRECT,    */ | ||||
| /*    INCIDENTAL,  SPECIAL, EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES    */ | ||||
| /*    (INCLUDING, BUT  NOT LIMITED TO,  PROCUREMENT OF SUBSTITUTE    */ | ||||
| /*    GOODS  OR  SERVICES; LOSS  OF  USE,  DATA,  OR PROFITS;  OR    */ | ||||
| /*    BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF    */ | ||||
| /*    LIABILITY, WHETHER  IN CONTRACT, STRICT  LIABILITY, OR TORT    */ | ||||
| /*    (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY OUT    */ | ||||
| /*    OF  THE  USE OF  THIS  SOFTWARE,  EVEN  IF ADVISED  OF  THE    */ | ||||
| /*    POSSIBILITY OF SUCH DAMAGE.                                    */ | ||||
| /*                                                                   */ | ||||
| /* The views and conclusions contained in the software and           */ | ||||
| /* documentation are those of the authors and should not be          */ | ||||
| /* interpreted as representing official policies, either expressed   */ | ||||
| /* or implied, of The University of Texas at Austin.                 */ | ||||
| /*********************************************************************/ | ||||
| 
 | ||||
| #include <stdio.h> | ||||
| #include "common.h" | ||||
| #include "arm_sve.h" | ||||
| 
 | ||||
| #ifndef UNIT | ||||
| #define INV(a) (ONE / (a)) | ||||
| #else | ||||
| #define INV(a) (ONE) | ||||
| #endif | ||||
| 
 | ||||
| int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ | ||||
| 
 | ||||
|   BLASLONG i, ii, jj; | ||||
| 
 | ||||
|   FLOAT *ao; | ||||
| 
 | ||||
|   jj = offset; | ||||
| #ifdef DOUBLE | ||||
|   int64_t js = 0; | ||||
|   svbool_t pn = svwhilelt_b64(js, n); | ||||
|   int n_active = svcntp_b64(svptrue_b64(), pn); | ||||
| #else | ||||
|   int32_t N = n; | ||||
|   int32_t js = 0; | ||||
|   svbool_t pn = svwhilelt_b32(js, N); | ||||
|   int n_active = svcntp_b32(svptrue_b32(), pn); | ||||
| #endif | ||||
|   do { | ||||
| 
 | ||||
|     ao = a; | ||||
| 
 | ||||
|     i = 0; | ||||
|     ii = 0; | ||||
|     do { | ||||
| 
 | ||||
|       if (ii == jj) { | ||||
|         for (int j = 0; j < n_active; j++) { | ||||
|           *(b + j * n_active + j) = INV(*(ao + j * lda + j)); | ||||
|           for (int k = j+1; k < n_active; k++) { | ||||
|             *(b + j * n_active + k) = *(ao + j * lda + k); | ||||
|           } | ||||
|         } | ||||
|         b += n_active * n_active; | ||||
|         ao += lda * n_active; | ||||
|         i += n_active; | ||||
|         ii += n_active; | ||||
|       } else { | ||||
|         if (ii < jj) { | ||||
| #ifdef DOUBLE | ||||
|           svfloat64_t aj_vec = svld1(pn, ao); | ||||
| #else | ||||
|           svfloat32_t aj_vec = svld1(pn, ao); | ||||
| #endif | ||||
|           svst1(pn, b, aj_vec); | ||||
|         } | ||||
|         ao += lda; | ||||
|         b += n_active; | ||||
|         i ++; | ||||
|         ii ++; | ||||
|       } | ||||
|     } while (i < m); | ||||
| 
 | ||||
| 
 | ||||
|     a += n_active; | ||||
|     jj += n_active; | ||||
| 
 | ||||
|     js += n_active; | ||||
| #ifdef DOUBLE | ||||
|     pn = svwhilelt_b64(js, n); | ||||
|     n_active = svcntp_b64(svptrue_b64(), pn); | ||||
|   } while (svptest_any(svptrue_b64(), pn)); | ||||
| #else | ||||
|     pn = svwhilelt_b32(js, N); | ||||
|     n_active = svcntp_b32(svptrue_b32(), pn); | ||||
|   } while (svptest_any(svptrue_b32(), pn)); | ||||
| #endif | ||||
| 
 | ||||
| return 0; | ||||
| } | ||||
|  | @ -0,0 +1,119 @@ | |||
| /*********************************************************************/ | ||||
| /* Copyright 2009, 2010 The University of Texas at Austin.           */ | ||||
| /* All rights reserved.                                              */ | ||||
| /*                                                                   */ | ||||
| /* Redistribution and use in source and binary forms, with or        */ | ||||
| /* without modification, are permitted provided that the following   */ | ||||
| /* conditions are met:                                               */ | ||||
| /*                                                                   */ | ||||
| /*   1. Redistributions of source code must retain the above         */ | ||||
| /*      copyright notice, this list of conditions and the following  */ | ||||
| /*      disclaimer.                                                  */ | ||||
| /*                                                                   */ | ||||
| /*   2. Redistributions in binary form must reproduce the above      */ | ||||
| /*      copyright notice, this list of conditions and the following  */ | ||||
| /*      disclaimer in the documentation and/or other materials       */ | ||||
| /*      provided with the distribution.                              */ | ||||
| /*                                                                   */ | ||||
| /*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */ | ||||
| /*    AUSTIN  ``AS IS''  AND ANY  EXPRESS OR  IMPLIED WARRANTIES,    */ | ||||
| /*    INCLUDING, BUT  NOT LIMITED  TO, THE IMPLIED  WARRANTIES OF    */ | ||||
| /*    MERCHANTABILITY  AND FITNESS FOR  A PARTICULAR  PURPOSE ARE    */ | ||||
| /*    DISCLAIMED.  IN  NO EVENT SHALL THE UNIVERSITY  OF TEXAS AT    */ | ||||
| /*    AUSTIN OR CONTRIBUTORS BE  LIABLE FOR ANY DIRECT, INDIRECT,    */ | ||||
| /*    INCIDENTAL,  SPECIAL, EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES    */ | ||||
| /*    (INCLUDING, BUT  NOT LIMITED TO,  PROCUREMENT OF SUBSTITUTE    */ | ||||
| /*    GOODS  OR  SERVICES; LOSS  OF  USE,  DATA,  OR PROFITS;  OR    */ | ||||
| /*    BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF    */ | ||||
| /*    LIABILITY, WHETHER  IN CONTRACT, STRICT  LIABILITY, OR TORT    */ | ||||
| /*    (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY OUT    */ | ||||
| /*    OF  THE  USE OF  THIS  SOFTWARE,  EVEN  IF ADVISED  OF  THE    */ | ||||
| /*    POSSIBILITY OF SUCH DAMAGE.                                    */ | ||||
| /*                                                                   */ | ||||
| /* The views and conclusions contained in the software and           */ | ||||
| /* documentation are those of the authors and should not be          */ | ||||
| /* interpreted as representing official policies, either expressed   */ | ||||
| /* or implied, of The University of Texas at Austin.                 */ | ||||
| /*********************************************************************/ | ||||
| 
 | ||||
| #include <stdio.h> | ||||
| #include "common.h" | ||||
| #include "arm_sve.h" | ||||
| 
 | ||||
| #ifndef UNIT | ||||
| #define INV(a) (ONE / (a)) | ||||
| #else | ||||
| #define INV(a) (ONE) | ||||
| #endif | ||||
| 
 | ||||
| int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ | ||||
| 
 | ||||
|   BLASLONG i, ii, jj; | ||||
| 
 | ||||
|   FLOAT *ao; | ||||
| 
 | ||||
|   jj = offset; | ||||
| #ifdef DOUBLE | ||||
|   int64_t js = 0; | ||||
|   svint64_t index = svindex_s64(0LL, lda); | ||||
|   svbool_t pn = svwhilelt_b64(js, n); | ||||
|   int n_active = svcntp_b64(svptrue_b64(), pn); | ||||
| #else | ||||
|   int32_t N = n; | ||||
|   int32_t js = 0; | ||||
|   svint32_t index = svindex_s32(0, lda); | ||||
|   svbool_t pn = svwhilelt_b32(js, N); | ||||
|   int n_active = svcntp_b32(svptrue_b32(), pn); | ||||
| #endif | ||||
|   do { | ||||
| 
 | ||||
|     ao = a; | ||||
| 
 | ||||
|     i = 0; | ||||
|     ii = 0; | ||||
|     do { | ||||
| 
 | ||||
|       if (ii == jj) { | ||||
|         for (int j = 0; j < n_active; j++) { | ||||
|           *(b + j * n_active + j) = INV(*(ao + j * lda + j)); | ||||
|           for (int k = j+1; k < n_active; k++) { | ||||
|             *(b + j * n_active + k) = *(ao + k * lda + j); | ||||
|           } | ||||
|         } | ||||
|         ao += n_active; | ||||
|         b += n_active * n_active; | ||||
|         i += n_active; | ||||
|         ii += n_active; | ||||
|       } else { | ||||
|         if (ii < jj) { | ||||
| #ifdef DOUBLE | ||||
|           svfloat64_t aj_vec = svld1_gather_index(pn, ao, index); | ||||
| #else | ||||
|           svfloat32_t aj_vec = svld1_gather_index(pn, ao, index); | ||||
| #endif | ||||
|           svst1(pn, b, aj_vec); | ||||
|         } | ||||
|         ao++; | ||||
|         b += n_active; | ||||
|         i++; | ||||
|         ii++; | ||||
|       } | ||||
|     } while (i < m); | ||||
| 
 | ||||
| 
 | ||||
|     a += n_active * lda; | ||||
|     jj += n_active; | ||||
| 
 | ||||
|     js += n_active; | ||||
| #ifdef DOUBLE | ||||
|     pn = svwhilelt_b64(js, n); | ||||
|     n_active = svcntp_b64(svptrue_b64(), pn); | ||||
|   } while (svptest_any(svptrue_b64(), pn)); | ||||
| #else | ||||
|     pn = svwhilelt_b32(js, N); | ||||
|     n_active = svcntp_b32(svptrue_b32(), pn); | ||||
|   } while (svptest_any(svptrue_b32(), pn)); | ||||
| #endif | ||||
| 
 | ||||
| return 0; | ||||
| } | ||||
|  | @ -0,0 +1,117 @@ | |||
| /*********************************************************************/ | ||||
| /* Copyright 2009, 2010 The University of Texas at Austin.           */ | ||||
| /* All rights reserved.                                              */ | ||||
| /*                                                                   */ | ||||
| /* Redistribution and use in source and binary forms, with or        */ | ||||
| /* without modification, are permitted provided that the following   */ | ||||
| /* conditions are met:                                               */ | ||||
| /*                                                                   */ | ||||
| /*   1. Redistributions of source code must retain the above         */ | ||||
| /*      copyright notice, this list of conditions and the following  */ | ||||
| /*      disclaimer.                                                  */ | ||||
| /*                                                                   */ | ||||
| /*   2. Redistributions in binary form must reproduce the above      */ | ||||
| /*      copyright notice, this list of conditions and the following  */ | ||||
| /*      disclaimer in the documentation and/or other materials       */ | ||||
| /*      provided with the distribution.                              */ | ||||
| /*                                                                   */ | ||||
| /*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */ | ||||
| /*    AUSTIN  ``AS IS''  AND ANY  EXPRESS OR  IMPLIED WARRANTIES,    */ | ||||
| /*    INCLUDING, BUT  NOT LIMITED  TO, THE IMPLIED  WARRANTIES OF    */ | ||||
| /*    MERCHANTABILITY  AND FITNESS FOR  A PARTICULAR  PURPOSE ARE    */ | ||||
| /*    DISCLAIMED.  IN  NO EVENT SHALL THE UNIVERSITY  OF TEXAS AT    */ | ||||
| /*    AUSTIN OR CONTRIBUTORS BE  LIABLE FOR ANY DIRECT, INDIRECT,    */ | ||||
| /*    INCIDENTAL,  SPECIAL, EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES    */ | ||||
| /*    (INCLUDING, BUT  NOT LIMITED TO,  PROCUREMENT OF SUBSTITUTE    */ | ||||
| /*    GOODS  OR  SERVICES; LOSS  OF  USE,  DATA,  OR PROFITS;  OR    */ | ||||
| /*    BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF    */ | ||||
| /*    LIABILITY, WHETHER  IN CONTRACT, STRICT  LIABILITY, OR TORT    */ | ||||
| /*    (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY OUT    */ | ||||
| /*    OF  THE  USE OF  THIS  SOFTWARE,  EVEN  IF ADVISED  OF  THE    */ | ||||
| /*    POSSIBILITY OF SUCH DAMAGE.                                    */ | ||||
| /*                                                                   */ | ||||
| /* The views and conclusions contained in the software and           */ | ||||
| /* documentation are those of the authors and should not be          */ | ||||
| /* interpreted as representing official policies, either expressed   */ | ||||
| /* or implied, of The University of Texas at Austin.                 */ | ||||
| /*********************************************************************/ | ||||
| 
 | ||||
| #include <stdio.h> | ||||
| #include "common.h" | ||||
| #include "arm_sve.h" | ||||
| 
 | ||||
| #ifndef UNIT | ||||
| #define INV(a) (ONE / (a)) | ||||
| #else | ||||
| #define INV(a) (ONE) | ||||
| #endif | ||||
| 
 | ||||
| int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ | ||||
| /*
   * Packs data from a (line stride lda) into the contiguous buffer b, one
   * panel at a time.  A panel covers n_active consecutive elements of every
   * line, where n_active is the number of SVE lanes enabled in the predicate
   * pn for that panel.  Within a panel the lines are walked from index 0
   * until the index reaches or passes m:
   *   - index == jj: an n_active x n_active tile is emitted.  Only entries
   *     with k < j are copied and entry (j, j) is stored through INV(); the
   *     other tile positions in b are not written.
   *   - index  > jj: the active elements of the line are copied with a
   *     predicated vector load/store.
   *   - index  < jj: nothing is stored, but b still advances by n_active.
   * jj starts at offset and grows by n_active per panel.  Always returns 0.
   */
 | ||||
|   BLASLONG i, ii, jj; | ||||
| 
 | ||||
|   FLOAT *ao; | ||||
| 
 | ||||
|   jj = offset; /* line index at which the first panel emits its tile */ | ||||
| #ifdef DOUBLE | ||||
|   int64_t js = 0; | ||||
|   svbool_t pn = svwhilelt_b64(js, n); | ||||
|   int n_active = svcntp_b64(svptrue_b64(), pn); | ||||
| #else | ||||
|   int32_t N = n; | ||||
|   int32_t js = 0; | ||||
|   svbool_t pn = svwhilelt_b32(js, N); | ||||
|   int n_active = svcntp_b32(svptrue_b32(), pn); | ||||
| #endif | ||||
|   do { /* one iteration per panel; the closing while sits in the #ifdef below */ | ||||
| 
 | ||||
|     ao = a; | ||||
| 
 | ||||
|     i = 0; | ||||
|     ii = 0; /* i and ii are always advanced together in this loop */ | ||||
|     do { | ||||
| 
 | ||||
|       if (ii == jj) { /* tile branch: consumes n_active lines in one step */ | ||||
|         for (int j = 0; j < n_active; j++) { | ||||
|           for (int k = 0; k < j; k++) { | ||||
|             *(b + j * n_active + k) = *(ao + j * lda + k); | ||||
|           } | ||||
|           *(b + j * n_active + j) = INV(*(ao + j * lda + j)); | ||||
|         } | ||||
|         ao += lda * n_active; | ||||
|         b += n_active * n_active; | ||||
|         i += n_active; | ||||
|         ii += n_active; | ||||
|       } else { | ||||
|         if (ii > jj) { /* lines past the tile are copied as-is */ | ||||
| #ifdef DOUBLE | ||||
|           svfloat64_t aj_vec = svld1(pn, ao); | ||||
| #else | ||||
|           svfloat32_t aj_vec = svld1(pn, ao); | ||||
| #endif | ||||
|           svst1(pn, b, aj_vec); | ||||
|         } | ||||
|         ao += lda; /* advance even when the line was skipped (ii < jj) */ | ||||
|         b += n_active; | ||||
|         i ++; | ||||
|         ii ++; | ||||
|       }  | ||||
|     } while (i < m); | ||||
| 
 | ||||
| 
 | ||||
|     a += n_active; /* next panel starts n_active elements further along each line */ | ||||
|     jj += n_active; | ||||
| 
 | ||||
|     js += n_active; | ||||
| #ifdef DOUBLE | ||||
|     pn = svwhilelt_b64(js, n); /* predicate for the next panel; all-false once js >= n */ | ||||
|     n_active = svcntp_b64(svptrue_b64(), pn); | ||||
|   } while (svptest_any(svptrue_b64(), pn)); | ||||
| #else | ||||
|     pn = svwhilelt_b32(js, N); /* predicate for the next panel; all-false once js >= N */ | ||||
|     n_active = svcntp_b32(svptrue_b32(), pn); | ||||
|   } while (svptest_any(svptrue_b32(), pn)); | ||||
| #endif | ||||
| 
 | ||||
| return 0; | ||||
| } | ||||
|  | @ -0,0 +1,874 @@ | |||
| /******************************************************************************* | ||||
| Copyright (c) 2015, The OpenBLAS Project | ||||
| All rights reserved. | ||||
| Redistribution and use in source and binary forms, with or without | ||||
| modification, are permitted provided that the following conditions are | ||||
| met: | ||||
| 1. Redistributions of source code must retain the above copyright | ||||
| notice, this list of conditions and the following disclaimer. | ||||
| 2. Redistributions in binary form must reproduce the above copyright | ||||
| notice, this list of conditions and the following disclaimer in | ||||
| the documentation and/or other materials provided with the | ||||
| distribution. | ||||
| 3. Neither the name of the OpenBLAS project nor the names of | ||||
| its contributors may be used to endorse or promote products | ||||
| derived from this software without specific prior written permission. | ||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | ||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | ||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | ||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | ||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | ||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | ||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 | ||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | ||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | ||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | ||||
| *******************************************************************************/ | ||||
| 
 | ||||
| #define ASSEMBLER | ||||
| #include "common.h" | ||||
| 
 | ||||
| /*                   x0          x1          x2          d0            d1            x3        x4        x5       x6 */ | ||||
| /*int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha_r,FLOAT alpha_i,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc) */ | ||||
| // Integer register aliases.  x0-x6 are read as inputs; x7-x17 are working registers.
 | ||||
| #define origM		x0 | ||||
| #define origN		x1 | ||||
| #define origK		x2 | ||||
| #define origPA		x3 | ||||
| #define origPB		x4 | ||||
| #define pC		x5 | ||||
| #define LDC		x6 | ||||
| #define temp		x7 | ||||
| #define counterL	x8 | ||||
| #define counterI	x9 | ||||
| #define counterJ	x10 | ||||
| #define pB		x11 | ||||
| #define pCRow0		x12 | ||||
| #define pCRow1		x13 | ||||
| #define pCRow2		x14 | ||||
| #define pCRow3		x15 | ||||
| #define pA		x16 | ||||
| #define lanes		x17 | ||||
| // alphaR/alphaI receive the bit patterns of d0/d1 and are then broadcast into alphaz_R/alphaz_I.
 | ||||
| #define alphaR		x19 | ||||
| #define alphaI		x20 | ||||
| 
 | ||||
| #define alphaz_R	z6.d | ||||
| #define alphaz_I	z7.d | ||||
| #define alpha0_R	d6 | ||||
| #define alpha0_I	d7 | ||||
| 
 | ||||
| // Byte offsets added to pA / pB / pCRowN in the prfm prefetch instructions below.
 | ||||
| #define A_PRE_SIZE	2560 | ||||
| #define B_PRE_SIZE	448 | ||||
| #define C_PRE_SIZE	128 | ||||
| // OP_xy selects add (fmla) or subtract (fmls) for the partial product (x part of A) * (y part of B):
// rr and ii feed the real accumulator, ir and ri feed the imaginary accumulator.
// Group 1: A*B.  Group 2: A*conj(B).  Group 3: conj(A)*B.  Group 4: conj(A)*conj(B).
 | ||||
| #if defined(NN) || defined(NT) || defined(TN) || defined(TT) | ||||
| #define OP_rr		fmla | ||||
| #define OP_ii		fmls | ||||
| #define OP_ri		fmla | ||||
| #define OP_ir		fmla | ||||
| #elif defined(NR) || defined(NC) || defined(TR) || defined(TC) | ||||
| #define OP_rr		fmla | ||||
| #define OP_ii		fmla | ||||
| #define OP_ri		fmls | ||||
| #define OP_ir		fmla | ||||
| #elif defined(RN) || defined(RT) || defined(CN) || defined(CT) | ||||
| #define OP_rr		fmla | ||||
| #define OP_ii		fmla | ||||
| #define OP_ri		fmla | ||||
| #define OP_ir		fmls | ||||
| #elif defined(RR) || defined(RC) || defined(CR) || defined(CC) | ||||
| #define OP_rr		fmla | ||||
| #define OP_ii		fmls | ||||
| #define OP_ri		fmls | ||||
| #define OP_ir		fmls | ||||
| #endif | ||||
| 
 | ||||
| // 00 origM | ||||
| // 01 origN | ||||
| // 02 origK | ||||
| // 03 origPA | ||||
| // 04 origPB | ||||
| // 05 pC | ||||
| // 06 origLDC -> LDC | ||||
| // 07 offset -> temp | ||||
| // 08 counterL | ||||
| // 09 counterI | ||||
| // 10 counterJ | ||||
| // 11 pB | ||||
| // 12 pCRow0 | ||||
| // 13 pCRow1 | ||||
| // 14 pCRow2 | ||||
| // 15 pCRow3 | ||||
| // 16 pA | ||||
| // 17 lanes (number of active SVE lanes in the current M block) | ||||
| // 18 must save | ||||
| // 19 must save alphaR | ||||
| // 20 must save alphaI | ||||
| // 21 must save | ||||
| // 22 must save | ||||
| // 23 must save | ||||
| // 24 must save | ||||
| // 25 must save | ||||
| // 26 must save | ||||
| // 27 must save | ||||
| // 28 must save | ||||
| // 29 frame | ||||
| // 30 link | ||||
| // 31 sp | ||||
| 
 | ||||
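| //v00 ALPHA_R on entry -> A real parts (loaded via pA) | ||||
| //v01 ALPHA_I on entry -> A imaginary parts (loaded via pA) | ||||
| //v02 A real parts, alternate buffer used by KERNELv1x4_M2/_E | ||||
| //v03 A imaginary parts, alternate buffer used by KERNELv1x4_M2/_E | ||||
| //v04 unused | ||||
| //v05 unused | ||||
| //v06 alphaz_R (alpha real part broadcast to all lanes) | ||||
| //v07 alphaz_I (alpha imaginary part broadcast to all lanes) | ||||
| //v08 must save B column 0 real, broadcast from [pB] | ||||
| //v09 must save B column 0 imaginary, broadcast from [pB, 8] | ||||
| //v10 must save B column 1 real, broadcast from [pB, 16] | ||||
| //v11 must save B column 1 imaginary, broadcast from [pB, 24] | ||||
| //v12 must save B column 2 real, broadcast from [pB, 32] | ||||
| //v13 must save B column 2 imaginary, broadcast from [pB, 40] | ||||
| //v14 must save B column 3 real, broadcast from [pB, 48] | ||||
| //v15 must save B column 3 imaginary, broadcast from [pB, 56] | ||||
| //v16 accumulator, column 0 real | ||||
| //v17 accumulator, column 0 imaginary | ||||
| //v18 accumulator, column 1 real | ||||
| //v19 accumulator, column 1 imaginary | ||||
| //v20 accumulator, column 2 real | ||||
| //v21 accumulator, column 2 imaginary | ||||
| //v22 accumulator, column 3 real | ||||
| //v23 accumulator, column 3 imaginary | ||||
| //v24 C values at pCRow0, real | ||||
| //v25 C values at pCRow0, imaginary | ||||
| //v26 C values at pCRow1, real | ||||
| //v27 C values at pCRow1, imaginary | ||||
| //v28 C values at pCRow2, real | ||||
| //v29 C values at pCRow2, imaginary | ||||
| //v30 C values at pCRow3, real | ||||
| //v31 C values at pCRow3, imaginary | ||||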
| 
 | ||||
| /******************************************************************************* | ||||
| * Macro definitions | ||||
| *******************************************************************************/ | ||||
| 
 | ||||
| .macro INITv1x4
// Zero the eight accumulators z16-z23 (real/imaginary pairs for four columns).
 | ||||
| 	dup		z16.d, #0 | ||||
| 	dup		z17.d, #0 | ||||
| 	dup		z18.d, #0 | ||||
| 	dup		z19.d, #0 | ||||
| 	dup		z20.d, #0 | ||||
| 	dup		z21.d, #0 | ||||
| 	dup		z22.d, #0 | ||||
| 	dup		z23.d, #0 | ||||
| .endm | ||||
| 
 | ||||
| .macro KERNELv1x4_I
// First step of the unrolled four-column loop.  Loads two k-steps of A
// (z0/z1 and z2/z3), loads one k-step of B into z8-z15, performs the update
// for the first k-step and reloads z8-z15 with the next B values, so that
// KERNELv1x4_M2 can follow directly.  pA advances by two steps, pB by 128 bytes.
// The rr products use fmla directly, and the ri products are chosen by the
// #if blocks below with the same sign that OP_ri has for each variant.
// NOTE(review): the "#eor" lines look like disabled leftovers; confirm the
// toolchain treats them as comments.
 | ||||
| 	ld2d	{z0.d, z1.d}, p1/z, [pA] | ||||
| 	add	pA, pA, lanes, lsl #4    // pA += lanes*2*8 | ||||
| 	ld2d	{z2.d, z3.d}, p1/z, [pA] // next one | ||||
| 	add	pA, pA, lanes, lsl #4    // pA += lanes*2*8 | ||||
| // ld1rd broadcasts one double from B to every lane (real, imag for columns 0..3).
 | ||||
|     ld1rd  z8.d, p0/z,  [pB] | ||||
|     ld1rd  z9.d, p0/z,  [pB, 8] | ||||
|     ld1rd  z10.d, p0/z, [pB, 16] | ||||
|     ld1rd  z11.d, p0/z, [pB, 24] | ||||
|     ld1rd  z12.d, p0/z, [pB, 32] | ||||
|     ld1rd  z13.d, p0/z, [pB, 40] | ||||
|     ld1rd  z14.d, p0/z, [pB, 48] | ||||
|     ld1rd  z15.d, p0/z, [pB, 56] | ||||
| 
 | ||||
|     add pB, pB, 64 | ||||
| // Column 0: each B register is reloaded for the next k-step right after its last use.
 | ||||
| 	fmla	z16.d, p1/m, z0.d, z8.d | ||||
| 	OP_ir	z17.d, p1/m, z1.d, z8.d | ||||
|     ld1rd  z8.d, p0/z,  [pB] | ||||
| #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | ||||
|     defined(RR) || defined(RC) || defined(CR) || defined(CC) | ||||
| 	#eor	z17.16b, z17.16b, z17.16b | ||||
| 	fmls	z17.d, p1/m, z0.d, z9.d | ||||
| #else | ||||
| 	fmla	z17.d, p1/m, z0.d, z9.d | ||||
| #endif | ||||
| 	OP_ii	z16.d, p1/m, z1.d, z9.d | ||||
|     ld1rd  z9.d, p0/z,  [pB, 8] | ||||
| // Column 1.
 | ||||
| 
 | ||||
| 	fmla	z18.d, p1/m, z0.d, z10.d | ||||
| 	OP_ir	z19.d, p1/m, z1.d, z10.d | ||||
|     ld1rd  z10.d, p0/z,  [pB, 16] | ||||
| 	OP_ii	z18.d, p1/m, z1.d, z11.d | ||||
| #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | ||||
|     defined(RR) || defined(RC) || defined(CR) || defined(CC) | ||||
| 	#eor	z19.16b, z21.16b, z21.16b | ||||
| 	fmls	z19.d, p1/m, z0.d, z11.d | ||||
| #else | ||||
| 	fmla	z19.d, p1/m, z0.d, z11.d | ||||
| #endif | ||||
|     ld1rd  z11.d, p0/z,  [pB, 24] | ||||
| // Column 2.
 | ||||
| 
 | ||||
| 	fmla	z20.d, p1/m, z0.d, z12.d | ||||
| 	OP_ir	z21.d, p1/m, z1.d, z12.d | ||||
|     ld1rd  z12.d, p0/z,  [pB, 32] | ||||
| #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | ||||
|     defined(RR) || defined(RC) || defined(CR) || defined(CC) | ||||
| 	#eor	z21.16b, z23.16b, z23.16b | ||||
| 	fmls	z21.d, p1/m, z0.d, z13.d | ||||
| #else | ||||
| 	fmla	z21.d, p1/m, z0.d, z13.d | ||||
| #endif | ||||
| 	OP_ii	z20.d, p1/m, z1.d, z13.d | ||||
|     ld1rd  z13.d, p0/z,  [pB, 40] | ||||
| // Column 3.
 | ||||
| 
 | ||||
| 	fmla	z22.d, p1/m, z0.d, z14.d | ||||
| 	OP_ir	z23.d, p1/m, z1.d, z14.d | ||||
|     ld1rd  z14.d, p0/z,  [pB, 48] | ||||
| #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | ||||
|     defined(RR) || defined(RC) || defined(CR) || defined(CC) | ||||
| 	#eor	z23.16b, z19.16b, z19.16b | ||||
| 	fmls	z23.d, p1/m, z0.d, z15.d | ||||
| #else | ||||
| 	fmla	z23.d, p1/m, z0.d, z15.d | ||||
| #endif | ||||
| 	OP_ii	z22.d, p1/m, z1.d, z15.d | ||||
|     ld1rd  z15.d, p0/z,  [pB, 56] | ||||
| 
 | ||||
|     add pB, pB, 64 | ||||
| 
 | ||||
| 	prfm	PLDL1KEEP, [pA, #A_PRE_SIZE+64] | ||||
| .endm | ||||
| 
 | ||||
| .macro KERNELv1x4_M1
// Middle step, phase 1: computes with A in z0/z1 and the B values already in
// z8-z15, while loading the next A step into z2/z3 and the next B values into
// z8-z15.  Expects z0/z1 to have been filled by KERNELv1x4_I or KERNELv1x4_M2.
 | ||||
| 	ld2d	{z2.d, z3.d}, p1/z, [pA] | ||||
| 	add	pA, pA, lanes, lsl #4	// pA = pA + lanes * 2 * 8 | ||||
| 
 | ||||
| 	OP_rr	z16.d, p1/m, z0.d, z8.d | ||||
| 	OP_ir	z17.d, p1/m, z1.d, z8.d | ||||
|     ld1rd  z8.d, p0/z,  [pB] | ||||
| 	OP_ii	z16.d, p1/m, z1.d, z9.d | ||||
| 	OP_ri	z17.d, p1/m, z0.d, z9.d | ||||
|     ld1rd  z9.d, p0/z,  [pB, 8] | ||||
| 
 | ||||
| 	OP_rr	z18.d, p1/m, z0.d, z10.d | ||||
| 	OP_ir	z19.d, p1/m, z1.d, z10.d | ||||
|     ld1rd  z10.d, p0/z,  [pB, 16] | ||||
| 	OP_ii	z18.d, p1/m, z1.d, z11.d | ||||
| 	OP_ri	z19.d, p1/m, z0.d, z11.d | ||||
|     ld1rd  z11.d, p0/z,  [pB, 24] | ||||
| 
 | ||||
| 	OP_rr	z20.d, p1/m, z0.d, z12.d | ||||
| 	OP_ir	z21.d, p1/m, z1.d, z12.d | ||||
|     ld1rd  z12.d, p0/z,  [pB, 32] | ||||
| 	OP_ii	z20.d, p1/m, z1.d, z13.d | ||||
| 	OP_ri	z21.d, p1/m, z0.d, z13.d | ||||
|     ld1rd  z13.d, p0/z,  [pB, 40] | ||||
| 
 | ||||
| 	OP_rr	z22.d, p1/m, z0.d, z14.d | ||||
| 	OP_ir	z23.d, p1/m, z1.d, z14.d | ||||
|     ld1rd  z14.d, p0/z,  [pB, 48] | ||||
| 	OP_ii	z22.d, p1/m, z1.d, z15.d | ||||
| 	OP_ri	z23.d, p1/m, z0.d, z15.d | ||||
|     ld1rd  z15.d, p0/z,  [pB, 56] | ||||
| 
 | ||||
|     add pB, pB, 64 | ||||
| 	prfm	PLDL1KEEP, [pA, #A_PRE_SIZE] | ||||
| 
 | ||||
| 	prfm	PLDL1KEEP, [pA, #A_PRE_SIZE+64] | ||||
| .endm | ||||
| 
 | ||||
| .macro KERNELv1x4_M2
// Middle step, phase 2: mirror of KERNELv1x4_M1.  Computes with A in z2/z3
// while loading the next A step into z0/z1 and the next B values into z8-z15.
 | ||||
| 	ld2d	{z0.d, z1.d}, p1/z, [pA] | ||||
| 	add	pA, pA, lanes, lsl #4	// pA = pA + lanes *2 * 8 | ||||
| 
 | ||||
| 	OP_rr	z16.d, p1/m, z2.d, z8.d | ||||
| 	OP_ir	z17.d, p1/m, z3.d, z8.d | ||||
|     ld1rd  z8.d, p0/z,  [pB] | ||||
| 	OP_ii	z16.d, p1/m, z3.d, z9.d | ||||
| 	OP_ri	z17.d, p1/m, z2.d, z9.d | ||||
|     ld1rd  z9.d, p0/z,  [pB, 8] | ||||
| 
 | ||||
| 	OP_rr	z18.d, p1/m, z2.d, z10.d | ||||
| 	OP_ir	z19.d, p1/m, z3.d, z10.d | ||||
|     ld1rd  z10.d, p0/z,  [pB, 16] | ||||
| 	OP_ii	z18.d, p1/m, z3.d, z11.d | ||||
| 	OP_ri	z19.d, p1/m, z2.d, z11.d | ||||
|     ld1rd  z11.d, p0/z,  [pB, 24] | ||||
| 
 | ||||
| 	OP_rr	z20.d, p1/m, z2.d, z12.d | ||||
| 	OP_ir	z21.d, p1/m, z3.d, z12.d | ||||
|     ld1rd  z12.d, p0/z,  [pB, 32] | ||||
| 	OP_ii	z20.d, p1/m, z3.d, z13.d | ||||
| 	OP_ri	z21.d, p1/m, z2.d, z13.d | ||||
|     ld1rd  z13.d, p0/z,  [pB, 40] | ||||
| 
 | ||||
| 	OP_rr	z22.d, p1/m, z2.d, z14.d | ||||
| 	OP_ir	z23.d, p1/m, z3.d, z14.d | ||||
|     ld1rd  z14.d, p0/z,  [pB, 48] | ||||
| 	OP_ii	z22.d, p1/m, z3.d, z15.d | ||||
| 	OP_ri	z23.d, p1/m, z2.d, z15.d | ||||
|     ld1rd  z15.d, p0/z,  [pB, 56] | ||||
| 
 | ||||
| 	prfm	PLDL1KEEP, [pB, #B_PRE_SIZE] | ||||
| 
 | ||||
|     add pB, pB, 64 | ||||
| 
 | ||||
| 	prfm	PLDL1KEEP, [pB, #B_PRE_SIZE+64] | ||||
| .endm | ||||
| 
 | ||||
| .macro KERNELv1x4_E
// Final step of the unrolled loop: consumes A in z2/z3 and B in z8-z15
// without loading anything further and without advancing pA or pB.
 | ||||
| 	OP_rr	z16.d, p1/m, z2.d, z8.d | ||||
| 	OP_ir	z17.d, p1/m, z3.d, z8.d | ||||
| 	OP_ii	z16.d, p1/m, z3.d, z9.d | ||||
| 	OP_ri	z17.d, p1/m, z2.d, z9.d | ||||
| 
 | ||||
| 	OP_rr	z18.d, p1/m, z2.d, z10.d | ||||
| 	OP_ir	z19.d, p1/m, z3.d, z10.d | ||||
| 	OP_ii	z18.d, p1/m, z3.d, z11.d | ||||
| 	OP_ri	z19.d, p1/m, z2.d, z11.d | ||||
| 
 | ||||
| 	OP_rr	z20.d, p1/m, z2.d, z12.d | ||||
| 	OP_ir	z21.d, p1/m, z3.d, z12.d | ||||
| 	OP_ii	z20.d, p1/m, z3.d, z13.d | ||||
| 	OP_ri	z21.d, p1/m, z2.d, z13.d | ||||
| 
 | ||||
| 	OP_rr	z22.d, p1/m, z2.d, z14.d | ||||
| 	OP_ir	z23.d, p1/m, z3.d, z14.d | ||||
| 	OP_ii	z22.d, p1/m, z3.d, z15.d | ||||
| 	OP_ri	z23.d, p1/m, z2.d, z15.d | ||||
| 
 | ||||
| 	prfm	PLDL1KEEP, [pB, #B_PRE_SIZE] | ||||
| 
 | ||||
| 	prfm	PLDL1KEEP, [pB, #B_PRE_SIZE+64] | ||||
| 
 | ||||
| .endm | ||||
| 
 | ||||
| .macro KERNELv1x4_SUB
// Self-contained single k-step for four columns: loads A into z0/z1 and B
// into z8-z15, accumulates into z16-z23, and advances pA by lanes*16 bytes
// and pB by 64 bytes.  Used for the K % 8 remainder.
 | ||||
| 	ld2d	{z0.d, z1.d}, p1/z, [pA] | ||||
| 	add	pA, pA, lanes, lsl #4	// pA = pA + lanes* 2  * 8 | ||||
| 
 | ||||
|     ld1rd  z8.d, p0/z,  [pB] | ||||
|     ld1rd  z9.d, p0/z,  [pB, 8] | ||||
|     ld1rd  z10.d, p0/z,  [pB, 16] | ||||
|     ld1rd  z11.d, p0/z,  [pB, 24] | ||||
| 
 | ||||
| 	OP_rr	z16.d, p1/m, z0.d, z8.d | ||||
| 	OP_ir	z17.d, p1/m, z1.d, z8.d | ||||
| 	OP_ii	z16.d, p1/m, z1.d, z9.d | ||||
| 	OP_ri	z17.d, p1/m, z0.d, z9.d | ||||
| 
 | ||||
|     ld1rd  z12.d, p0/z,  [pB, 32] | ||||
|     ld1rd  z13.d, p0/z,  [pB, 40] | ||||
|     ld1rd  z14.d, p0/z,  [pB, 48] | ||||
|     ld1rd  z15.d, p0/z,  [pB, 56] | ||||
| 
 | ||||
| 	OP_rr	z18.d, p1/m, z0.d, z10.d | ||||
| 	OP_ir	z19.d, p1/m, z1.d, z10.d | ||||
| 	OP_ii	z18.d, p1/m, z1.d, z11.d | ||||
| 	OP_ri	z19.d, p1/m, z0.d, z11.d | ||||
| 
 | ||||
|     add pB, pB, 64 | ||||
| 
 | ||||
| 	OP_rr	z20.d, p1/m, z0.d, z12.d | ||||
| 	OP_ir	z21.d, p1/m, z1.d, z12.d | ||||
| 	OP_ii	z20.d, p1/m, z1.d, z13.d | ||||
| 	OP_ri	z21.d, p1/m, z0.d, z13.d | ||||
| 
 | ||||
| 	OP_rr	z22.d, p1/m, z0.d, z14.d | ||||
| 	OP_ir	z23.d, p1/m, z1.d, z14.d | ||||
| 	OP_ii	z22.d, p1/m, z1.d, z15.d | ||||
| 	OP_ri	z23.d, p1/m, z0.d, z15.d | ||||
| 
 | ||||
| 	prfm	PLDL1KEEP, [pB, #B_PRE_SIZE] | ||||
| 	prfm	PLDL1KEEP, [pA, #A_PRE_SIZE] | ||||
| .endm | ||||
| 
 | ||||
| .macro SAVEv1x4
// Adds alpha * accumulator to C for four columns (pCRow0..pCRow3).  Per column:
//   C_real += acc_real * alpha_R - acc_imag * alpha_I
//   C_imag += acc_real * alpha_I + acc_imag * alpha_R
// ld2d/st2d de-interleave and re-interleave the (real, imag) pairs under
// predicate p1; each row pointer then advances by lanes*16 bytes.
 | ||||
| 	prfm	PLDL2KEEP, [pCRow0, #C_PRE_SIZE] | ||||
| 
 | ||||
| 	ld2d	{z24.d, z25.d}, p1/z, [pCRow0] | ||||
| 	fmla	z24.d, p1/m, z16.d, alphaz_R | ||||
| 	fmls	z24.d, p1/m, z17.d, alphaz_I | ||||
| 	fmla	z25.d, p1/m, z16.d, alphaz_I | ||||
| 	fmla	z25.d, p1/m, z17.d, alphaz_R | ||||
| 	st2d 	{z24.d, z25.d}, p1, [pCRow0] | ||||
| 
 | ||||
| 	add	pCRow0, pCRow0, lanes, lsl #4 | ||||
| 
 | ||||
| 	ld2d	{z26.d, z27.d}, p1/z, [pCRow1] | ||||
| 	fmla	z26.d, p1/m, z18.d, alphaz_R | ||||
| 	fmls	z26.d, p1/m, z19.d, alphaz_I | ||||
| 	fmla	z27.d, p1/m, z18.d, alphaz_I | ||||
| 	fmla	z27.d, p1/m, z19.d, alphaz_R | ||||
| 	st2d 	{z26.d, z27.d}, p1, [pCRow1] | ||||
| 
 | ||||
| 	add	pCRow1, pCRow1, lanes, lsl #4 | ||||
| 	prfm	PLDL2KEEP, [pCRow1, #C_PRE_SIZE] | ||||
| 
 | ||||
| 	ld2d	{z28.d, z29.d}, p1/z, [pCRow2] | ||||
| 	fmla	z28.d, p1/m, z20.d, alphaz_R | ||||
| 	fmls	z28.d, p1/m, z21.d, alphaz_I | ||||
| 	fmla	z29.d, p1/m, z20.d, alphaz_I | ||||
| 	fmla	z29.d, p1/m, z21.d, alphaz_R | ||||
| 	st2d 	{z28.d, z29.d}, p1, [pCRow2] | ||||
| 
 | ||||
| 	add	pCRow2, pCRow2, lanes, lsl #4 | ||||
| 
 | ||||
| 	ld2d	{z30.d, z31.d}, p1/z, [pCRow3] | ||||
| 	fmla	z30.d, p1/m, z22.d, alphaz_R | ||||
| 	fmls	z30.d, p1/m, z23.d, alphaz_I | ||||
| 	fmla	z31.d, p1/m, z22.d, alphaz_I | ||||
| 	fmla	z31.d, p1/m, z23.d, alphaz_R | ||||
| 	st2d 	{z30.d, z31.d}, p1, [pCRow3] | ||||
| 
 | ||||
| 	prfm	PLDL2KEEP, [pCRow3, #C_PRE_SIZE] | ||||
| 
 | ||||
| 	add	pCRow3, pCRow3, lanes, lsl #4	// pC = pC + lanes  * 2 *8 | ||||
| 
 | ||||
| 	prfm	PLDL2KEEP, [pCRow3, #C_PRE_SIZE] | ||||
| 
 | ||||
| .endm | ||||
| 
 | ||||
| /******************************************************************************/ | ||||
| 
 | ||||
| 
 | ||||
| .macro INITv1x2
// Zero the four accumulators z16-z19 (real/imaginary pairs for two columns).
 | ||||
| 	dup		z16.d, #0 | ||||
| 	dup		z17.d, #0 | ||||
| 	dup		z18.d, #0 | ||||
| 	dup		z19.d, #0 | ||||
| .endm | ||||
| 
 | ||||
| .macro KERNELv1x2_SUB
// Single k-step for two columns: loads A into z0/z1 and four B doubles into
// z8-z11, accumulates into z16-z19, advances pA by lanes*16 bytes and pB by 32.
 | ||||
| 	ld2d	{z0.d, z1.d}, p1/z, [pA] | ||||
| 	add	pA, pA, lanes, lsl #4	// pA = pA + lanes* 2  * 8 | ||||
| 
 | ||||
|     ld1rd  z8.d, p0/z,  [pB] | ||||
|     ld1rd  z9.d, p0/z,  [pB, 8] | ||||
|     ld1rd  z10.d, p0/z,  [pB, 16] | ||||
|     ld1rd  z11.d, p0/z,  [pB, 24] | ||||
| 
 | ||||
| 	OP_rr	z16.d, p1/m, z0.d, z8.d | ||||
| 	OP_ir	z17.d, p1/m, z1.d, z8.d | ||||
| 	OP_ii	z16.d, p1/m, z1.d, z9.d | ||||
| 	OP_ri	z17.d, p1/m, z0.d, z9.d | ||||
| 
 | ||||
| 	OP_rr	z18.d, p1/m, z0.d, z10.d | ||||
| 	OP_ir	z19.d, p1/m, z1.d, z10.d | ||||
| 	OP_ii	z18.d, p1/m, z1.d, z11.d | ||||
| 	OP_ri	z19.d, p1/m, z0.d, z11.d | ||||
| 
 | ||||
|     add pB, pB, 32 | ||||
| .endm | ||||
| 
 | ||||
| .macro SAVEv1x2
// Adds alpha * accumulator to C for two columns (pCRow0, pCRow1) using the
// same complex multiply as SAVEv1x4, then advances both row pointers by
// lanes*16 bytes.
// NOTE(review): the last prefetch uses pCRow2, which the two-column path in
// this file does not assign; prfm is only a hint, but the address is stale.
 | ||||
| 	prfm	PLDL2KEEP, [pCRow0, #C_PRE_SIZE] | ||||
| 
 | ||||
| 	ld2d	{z24.d, z25.d}, p1/z, [pCRow0] | ||||
| 	fmla	z24.d, p1/m, z16.d, alphaz_R | ||||
| 	fmls	z24.d, p1/m, z17.d, alphaz_I | ||||
| 	fmla	z25.d, p1/m, z16.d, alphaz_I | ||||
| 	fmla	z25.d, p1/m, z17.d, alphaz_R | ||||
| 	st2d 	{z24.d, z25.d}, p1, [pCRow0] | ||||
| 
 | ||||
| 	add	pCRow0, pCRow0, lanes, lsl #4 | ||||
| 
 | ||||
| 	ld2d	{z26.d, z27.d}, p1/z, [pCRow1] | ||||
| 	fmla	z26.d, p1/m, z18.d, alphaz_R | ||||
| 	fmls	z26.d, p1/m, z19.d, alphaz_I | ||||
| 	fmla	z27.d, p1/m, z18.d, alphaz_I | ||||
| 	fmla	z27.d, p1/m, z19.d, alphaz_R | ||||
| 	st2d 	{z26.d, z27.d}, p1, [pCRow1] | ||||
| 
 | ||||
| 	add	pCRow1, pCRow1, lanes, lsl #4 | ||||
| 	prfm	PLDL2KEEP, [pCRow1, #C_PRE_SIZE] | ||||
| 
 | ||||
| 	prfm	PLDL2KEEP, [pCRow2, #C_PRE_SIZE] | ||||
| 
 | ||||
| .endm | ||||
| 
 | ||||
| /******************************************************************************/ | ||||
| 
 | ||||
| 
 | ||||
| .macro INITv1x1
// Zero the two accumulators z16/z17 (real and imaginary parts for one column).
 | ||||
| 	dup		z16.d, #0 | ||||
| 	dup		z17.d, #0 | ||||
| .endm | ||||
| 
 | ||||
| 
 | ||||
| .macro KERNELv1x1_SUB
// Single k-step for one column: loads A into z0/z1 and two B doubles into
// z8/z9, accumulates into z16/z17, advances pA by lanes*16 bytes and pB by 16.
 | ||||
| 	ld2d	{z0.d, z1.d}, p1/z, [pA] | ||||
| 	add	pA, pA, lanes, lsl #4	// pA = pA + lanes* 2  * 8 | ||||
| 
 | ||||
|     ld1rd  z8.d, p0/z,  [pB] | ||||
|     ld1rd  z9.d, p0/z,  [pB, 8] | ||||
| 
 | ||||
|     add pB, pB, 16 | ||||
| 
 | ||||
| 	OP_rr	z16.d, p1/m, z0.d, z8.d | ||||
| 	OP_ir	z17.d, p1/m, z1.d, z8.d | ||||
| 	OP_ii	z16.d, p1/m, z1.d, z9.d | ||||
| 	OP_ri	z17.d, p1/m, z0.d, z9.d | ||||
| .endm | ||||
| 
 | ||||
| .macro SAVEv1x1
// Adds alpha * accumulator to C for one column (pCRow0) using the same
// complex multiply as SAVEv1x4, then advances pCRow0 by lanes*16 bytes.
// NOTE(review): the last prefetch uses pCRow3, which the one-column path in
// this file does not assign; prfm is only a hint, but the address is stale.
 | ||||
| 	prfm	PLDL2KEEP, [pCRow0, #C_PRE_SIZE] | ||||
| 
 | ||||
| 	ld2d	{z24.d, z25.d}, p1/z, [pCRow0] | ||||
| 	fmla	z24.d, p1/m, z16.d, alphaz_R | ||||
| 	fmls	z24.d, p1/m, z17.d, alphaz_I | ||||
| 	fmla	z25.d, p1/m, z16.d, alphaz_I | ||||
| 	fmla	z25.d, p1/m, z17.d, alphaz_R | ||||
| 	st2d 	{z24.d, z25.d}, p1, [pCRow0] | ||||
| 
 | ||||
| 	add	pCRow0, pCRow0, lanes, lsl #4	// pC = pC + lanes  * 2 *8 | ||||
| 
 | ||||
| 	prfm	PLDL2KEEP, [pCRow3, #C_PRE_SIZE] | ||||
| 
 | ||||
| .endm | ||||
| 
 | ||||
| /******************************************************************************/ | ||||
| 
 | ||||
| /******************************************************************************* | ||||
| * End of macro definitions | ||||
| *******************************************************************************/ | ||||
| 
 | ||||
| 	PROLOGUE | ||||
| // Kernel body.  N is processed in groups of 4 columns, then 2, then 1.
// Inside each group M is processed in predicated SVE-width blocks, and K is
// processed in unrolled groups of 8 steps plus a K % 8 remainder loop.
// The prologue reserves 11*16 bytes of stack and stores d8-d17 and x18-x28.
 | ||||
| 	.align 5
 | ||||
| 	add	sp, sp, #-(11 * 16) | ||||
| 	stp	d8, d9, [sp, #(0 * 16)] | ||||
| 	stp	d10, d11, [sp, #(1 * 16)] | ||||
| 	stp	d12, d13, [sp, #(2 * 16)] | ||||
| 	stp	d14, d15, [sp, #(3 * 16)] | ||||
| 	stp	d16, d17, [sp, #(4 * 16)] | ||||
| 	stp	x18, x19, [sp, #(5 * 16)] | ||||
| 	stp	x20, x21, [sp, #(6 * 16)] | ||||
| 	stp	x22, x23, [sp, #(7 * 16)] | ||||
| 	stp	x24, x25, [sp, #(8 * 16)] | ||||
| 	stp	x26, x27, [sp, #(9 * 16)] | ||||
| 	str	x28, [sp, #(10 * 16)] | ||||
| 
 | ||||
| 	prfm	PLDL1KEEP, [origPB] | ||||
| 	prfm	PLDL1KEEP, [origPA] | ||||
| // Move alpha (d0 = real, d1 = imaginary) through x19/x20 and broadcast it to every lane of z6/z7.
 | ||||
| 	fmov	alphaR, d0 | ||||
| 	dup	    alphaz_R, alphaR | ||||
| 	fmov	alphaI, d1 | ||||
| 	dup	    alphaz_I, alphaI | ||||
| 
 | ||||
| 	lsl	LDC, LDC, #4			// ldc = ldc * 2 * 8 | ||||
|     ptrue p0.d                  // create true predicate  | ||||
| 
 | ||||
| 	mov	pB, origPB | ||||
| 
 | ||||
| // Loop over N | ||||
| 	mov	counterJ, origN | ||||
| 	asr 	counterJ, counterJ, #2		// J = J / 4 | ||||
| 	cmp 	counterJ, #0 | ||||
| 	ble	.Lzgemm_kernel_L2_BEGIN | ||||
| 
 | ||||
| /******************************************************************************/ | ||||
| .Lzgemm_kernel_L4_BEGIN: | ||||
| 	mov	pCRow0, pC | ||||
| 	add	pCRow1, pCRow0, LDC | ||||
| 	add	pCRow2, pCRow1, LDC | ||||
| 	add	pCRow3, pCRow2, LDC | ||||
| 
 | ||||
| 	add	pC, pCRow3, LDC | ||||
| 
 | ||||
| 	mov	pA, origPA			// pA = start of A array | ||||
| 
 | ||||
| .Lzgemm_kernel_L4_Mv1_BEGIN: | ||||
| 
 | ||||
| /* Loop over M is done in an SVE fashion. This has the benefit of the last M%SVE_LEN iterations being done in a single sweep */ | ||||
|     mov counterI, #0 | ||||
|     whilelt p1.d, counterI, origM    | ||||
|     cntp lanes, p0, p1.d                        // lanes contain number of active SVE lanes in M dimension | ||||
| 
 | ||||
| 	.align 5
 | ||||
| .Lzgemm_kernel_L4_Mv1_20: | ||||
| 
 | ||||
| 	mov	pB, origPB | ||||
|     INITv1x4                     // fill with zeros | ||||
| // counterL = K / 8.  The pipelined path below needs at least two groups of 8 steps.
 | ||||
| 	asr 	counterL , origK, #3 | ||||
| 	cmp	counterL , #2 | ||||
| 	blt	.Lzgemm_kernel_L4_Mv1_32 | ||||
| 
 | ||||
| 	KERNELv1x4_I | ||||
| 	KERNELv1x4_M2 | ||||
| 	KERNELv1x4_M1 | ||||
| 	KERNELv1x4_M2 | ||||
| 	KERNELv1x4_M1 | ||||
| 	KERNELv1x4_M2 | ||||
| 	KERNELv1x4_M1 | ||||
| 	KERNELv1x4_M2 | ||||
| // The first group (above) and the last group (at _22a) are outside the loop, hence the subtraction of 2.
 | ||||
| 	subs	counterL, counterL, #2		// subtract 2 | ||||
| 	ble	.Lzgemm_kernel_L4_Mv1_22a | ||||
| 
 | ||||
| 	.align 5
 | ||||
| .Lzgemm_kernel_L4_Mv1_22: | ||||
| 
 | ||||
| 	KERNELv1x4_M1 | ||||
| 	KERNELv1x4_M2 | ||||
| 	KERNELv1x4_M1 | ||||
| 	KERNELv1x4_M2 | ||||
| 	KERNELv1x4_M1 | ||||
| 	KERNELv1x4_M2 | ||||
| 	KERNELv1x4_M1 | ||||
| 	KERNELv1x4_M2 | ||||
| 
 | ||||
| 	subs	counterL, counterL, #1 | ||||
| 	bgt	.Lzgemm_kernel_L4_Mv1_22 | ||||
| 
 | ||||
| 	.align 5
 | ||||
| .Lzgemm_kernel_L4_Mv1_22a: | ||||
| // Last group of 8 steps: ends with KERNELv1x4_E, which issues no further loads.
 | ||||
| 	KERNELv1x4_M1 | ||||
| 	KERNELv1x4_M2 | ||||
| 	KERNELv1x4_M1 | ||||
| 	KERNELv1x4_M2 | ||||
| 	KERNELv1x4_M1 | ||||
| 	KERNELv1x4_M2 | ||||
| 	KERNELv1x4_M1 | ||||
| 	KERNELv1x4_E | ||||
| 
 | ||||
| 	b	 .Lzgemm_kernel_L4_Mv1_44 | ||||
| 
 | ||||
| 	.align 5
 | ||||
| .Lzgemm_kernel_L4_Mv1_32: | ||||
| // Reached when K / 8 is 0 or 1: run a single group of 8 steps if there is one.
 | ||||
| 	tst	counterL, #1 | ||||
| 	ble	.Lzgemm_kernel_L4_Mv1_40 | ||||
| 
 | ||||
| 	KERNELv1x4_I | ||||
| 	KERNELv1x4_M2 | ||||
| 	KERNELv1x4_M1 | ||||
| 	KERNELv1x4_M2 | ||||
| 	KERNELv1x4_M1 | ||||
| 	KERNELv1x4_M2 | ||||
| 	KERNELv1x4_M1 | ||||
| 	KERNELv1x4_E | ||||
| 
 | ||||
| 	b	.Lzgemm_kernel_L4_Mv1_44 | ||||
| 
 | ||||
| 
 | ||||
| .Lzgemm_kernel_L4_Mv1_40: | ||||
| // The accumulators were already zeroed at .Lzgemm_kernel_L4_Mv1_20; this zeroes them again.
 | ||||
| 	INITv1x4 | ||||
| 
 | ||||
| .Lzgemm_kernel_L4_Mv1_44: | ||||
| // Remainder: K % 8 single steps.
 | ||||
| 	ands	counterL , origK, #7 | ||||
| 	ble	.Lzgemm_kernel_L4_Mv1_100 | ||||
| 
 | ||||
| 	.align 5
 | ||||
| .Lzgemm_kernel_L4_Mv1_46: | ||||
| 	KERNELv1x4_SUB | ||||
| 
 | ||||
| 	subs	counterL, counterL, #1 | ||||
| 	bne	.Lzgemm_kernel_L4_Mv1_46 | ||||
| 
 | ||||
| .Lzgemm_kernel_L4_Mv1_100: | ||||
| 	prfm	PLDL1KEEP, [pA] | ||||
| 	prfm	PLDL1KEEP, [pA, #64] | ||||
| 	prfm	PLDL1KEEP, [origPB] | ||||
| 
 | ||||
| 	SAVEv1x4 | ||||
| 
 | ||||
| .Lzgemm_kernel_L4_Mv1_END: | ||||
| // Advance counterI by one vector of doublewords and rebuild the M predicate; loop while any lane is active.
 | ||||
|     incd    counterI | ||||
|     whilelt p1.d, counterI, origM             //SVE instruction | ||||
|     cntp lanes, p0, p1.d                        // lanes contain number of active SVE lanes in M dimension | ||||
|     b.any   .Lzgemm_kernel_L4_Mv1_20    | ||||
| 
 | ||||
| 
 | ||||
| 
 | ||||
| .Lzgemm_kernel_L4_END: | ||||
| 
 | ||||
| 	lsl	temp, origK, #6 | ||||
| 	add	origPB, origPB, temp		// B = B + K * 4 * 8 * 2 | ||||
| 
 | ||||
| 	subs	counterJ, counterJ , #1		// j-- | ||||
| 	bgt	.Lzgemm_kernel_L4_BEGIN | ||||
| 
 | ||||
| 
 | ||||
| /******************************************************************************/ | ||||
| 
 | ||||
| .Lzgemm_kernel_L2_BEGIN:   // fewer than 4 columns left in N direction | ||||
| 
 | ||||
| 	mov	counterJ , origN | ||||
| 	tst	counterJ , #3 | ||||
| 	ble	.Lzgemm_kernel_L999 | ||||
| 
 | ||||
| 	tst	counterJ , #2 | ||||
| 	ble	.Lzgemm_kernel_L1_BEGIN | ||||
| 
 | ||||
| 	mov	pCRow0, pC			// pCRow0 = pC | ||||
| 	add	pCRow1, pCRow0, LDC | ||||
| 
 | ||||
| 	add	pC,pC,LDC, lsl #1 | ||||
| 
 | ||||
| 	mov	pA, origPA			// pA = A | ||||
| 
 | ||||
| 
 | ||||
| 
 | ||||
| .Lzgemm_kernel_L2_Mv1_BEGIN: | ||||
| 
 | ||||
|     mov counterI, #0 | ||||
|     whilelt p1.d, counterI, origM               //SVE instruction | ||||
|     cntp lanes, p0, p1.d | ||||
| 
 | ||||
| 
 | ||||
| .Lzgemm_kernel_L2_Mv1_20: | ||||
| 
 | ||||
| 	INITv1x2 | ||||
| 
 | ||||
| 	mov	pB, origPB | ||||
| 	asr	counterL , origK, #3		// counterL = counterL / 8 | ||||
| 	cmp	counterL,#0 | ||||
| 	ble	.Lzgemm_kernel_L2_Mv1_40 | ||||
| 	.align 5
 | ||||
| 
 | ||||
| .Lzgemm_kernel_L2_Mv1_22: | ||||
| 	KERNELv1x2_SUB | ||||
| 	KERNELv1x2_SUB | ||||
| 	KERNELv1x2_SUB | ||||
| 	KERNELv1x2_SUB | ||||
| 
 | ||||
| 	KERNELv1x2_SUB | ||||
| 	KERNELv1x2_SUB | ||||
| 	KERNELv1x2_SUB | ||||
| 	KERNELv1x2_SUB | ||||
| 
 | ||||
| 	subs	counterL, counterL, #1 | ||||
| 	bgt	.Lzgemm_kernel_L2_Mv1_22 | ||||
| 
 | ||||
| 
 | ||||
| .Lzgemm_kernel_L2_Mv1_40: | ||||
| 
 | ||||
| 	ands	counterL , origK, #7		// counterL = counterL % 8 | ||||
| 	ble	.Lzgemm_kernel_L2_Mv1_100 | ||||
| 
 | ||||
| .Lzgemm_kernel_L2_Mv1_42: | ||||
| 
 | ||||
| 	KERNELv1x2_SUB | ||||
| 
 | ||||
| 	subs	counterL, counterL, #1 | ||||
| 	bgt	.Lzgemm_kernel_L2_Mv1_42 | ||||
| 
 | ||||
| .Lzgemm_kernel_L2_Mv1_100: | ||||
| 
 | ||||
| 	SAVEv1x2 | ||||
| 
 | ||||
| .Lzgemm_kernel_L2_Mv1_END: | ||||
| 
 | ||||
| 
 | ||||
|     incd    counterI | ||||
|     whilelt p1.d, counterI, origM             //SVE instruction | ||||
|     cntp lanes, p0, p1.d | ||||
|     b.any   .Lzgemm_kernel_L2_Mv1_20    | ||||
| 
 | ||||
| 
 | ||||
| .Lzgemm_kernel_L2_END: | ||||
| 	lsl	temp, origK, #5 | ||||
| 	add	origPB, origPB, temp // B = B + K * 2 * 8 * 2 | ||||
| 
 | ||||
| /******************************************************************************/ | ||||
| 
 | ||||
| .Lzgemm_kernel_L1_BEGIN: | ||||
| 
 | ||||
| 	mov	counterJ , origN | ||||
| 	tst	counterJ , #1 | ||||
| 	ble	.Lzgemm_kernel_L999 // done | ||||
| 
 | ||||
| 
 | ||||
| 	mov	pCRow0, pC			// pCRow0 = C | ||||
| 	add	pC , pC , LDC			// Update pC to point to next | ||||
| 
 | ||||
| 	mov	pA, origPA			// pA = A | ||||
| 
 | ||||
| .Lzgemm_kernel_L1_Mv1_BEGIN: | ||||
| 
 | ||||
|     mov counterI, #0 | ||||
|     whilelt p1.d, counterI, origM               //SVE instruction | ||||
|     cntp lanes, p0, p1.d | ||||
| 
 | ||||
| 
 | ||||
| .Lzgemm_kernel_L1_Mv1_20: | ||||
| 
 | ||||
| 	INITv1x1 | ||||
| 
 | ||||
| 	mov	pB, origPB | ||||
| 	asr	counterL , origK, #3		// counterL = counterL / 8 | ||||
| 	cmp	counterL , #0 | ||||
| 	ble	.Lzgemm_kernel_L1_Mv1_40 | ||||
| 	.align 5
 | ||||
| 
 | ||||
| .Lzgemm_kernel_L1_Mv1_22: | ||||
| 	KERNELv1x1_SUB | ||||
| 	KERNELv1x1_SUB | ||||
| 	KERNELv1x1_SUB | ||||
| 	KERNELv1x1_SUB | ||||
| 
 | ||||
| 	KERNELv1x1_SUB | ||||
| 	KERNELv1x1_SUB | ||||
| 	KERNELv1x1_SUB | ||||
| 	KERNELv1x1_SUB | ||||
| 
 | ||||
| 	subs	counterL, counterL, #1 | ||||
| 	bgt	.Lzgemm_kernel_L1_Mv1_22 | ||||
| 
 | ||||
| 
 | ||||
| .Lzgemm_kernel_L1_Mv1_40: | ||||
| 
 | ||||
| 	ands	counterL , origK, #7		// counterL = counterL % 8 | ||||
| 	ble	.Lzgemm_kernel_L1_Mv1_100 | ||||
| 
 | ||||
| .Lzgemm_kernel_L1_Mv1_42: | ||||
| 
 | ||||
| 	KERNELv1x1_SUB | ||||
| 
 | ||||
| 	subs	counterL, counterL, #1 | ||||
| 	bgt	.Lzgemm_kernel_L1_Mv1_42 | ||||
| 
 | ||||
| .Lzgemm_kernel_L1_Mv1_100: | ||||
| 
 | ||||
| 	SAVEv1x1 | ||||
| 
 | ||||
| .Lzgemm_kernel_L1_Mv1_END: | ||||
| 
 | ||||
|     incd    counterI | ||||
|     whilelt p1.d, counterI, origM             //SVE instruction | ||||
|     cntp lanes, p0, p1.d | ||||
|     b.any   .Lzgemm_kernel_L1_Mv1_20    | ||||
| 
 | ||||
| .Lzgemm_kernel_L1_END: | ||||
| 
 | ||||
| /******************************************************************************/ | ||||
| // Epilogue: return 0 in x0, reload the registers stored in the prologue and release the stack frame.
 | ||||
| .Lzgemm_kernel_L999: | ||||
| 	mov	x0, #0				// set return value | ||||
| 	ldp	d8, d9, [sp, #(0 * 16)] | ||||
| 	ldp	d10, d11, [sp, #(1 * 16)] | ||||
| 	ldp	d12, d13, [sp, #(2 * 16)] | ||||
| 	ldp	d14, d15, [sp, #(3 * 16)] | ||||
| 	ldp	d16, d17, [sp, #(4 * 16)] | ||||
| 	ldp	x18, x19, [sp, #(5 * 16)] | ||||
| 	ldp	x20, x21, [sp, #(6 * 16)] | ||||
| 	ldp	x22, x23, [sp, #(7 * 16)] | ||||
| 	ldp	x24, x25, [sp, #(8 * 16)] | ||||
| 	ldp	x26, x27, [sp, #(9 * 16)] | ||||
| 	ldr	x28, [sp, #(10 * 16)] | ||||
| 	add	sp, sp, #(11*16) | ||||
| 	ret | ||||
| 
 | ||||
| 	EPILOGUE | ||||
| 
 | ||||
|  | @ -0,0 +1,79 @@ | |||
| /*********************************************************************/ | ||||
| /* Copyright 2009, 2010 The University of Texas at Austin.           */ | ||||
| /* All rights reserved.                                              */ | ||||
| /*                                                                   */ | ||||
| /* Redistribution and use in source and binary forms, with or        */ | ||||
| /* without modification, are permitted provided that the following   */ | ||||
| /* conditions are met:                                               */ | ||||
| /*                                                                   */ | ||||
| /*   1. Redistributions of source code must retain the above         */ | ||||
| /*      copyright notice, this list of conditions and the following  */ | ||||
| /*      disclaimer.                                                  */ | ||||
| /*                                                                   */ | ||||
| /*   2. Redistributions in binary form must reproduce the above      */ | ||||
| /*      copyright notice, this list of conditions and the following  */ | ||||
| /*      disclaimer in the documentation and/or other materials       */ | ||||
| /*      provided with the distribution.                              */ | ||||
| /*                                                                   */ | ||||
| /*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */ | ||||
| /*    AUSTIN  ``AS IS''  AND ANY  EXPRESS OR  IMPLIED WARRANTIES,    */ | ||||
| /*    INCLUDING, BUT  NOT LIMITED  TO, THE IMPLIED  WARRANTIES OF    */ | ||||
| /*    MERCHANTABILITY  AND FITNESS FOR  A PARTICULAR  PURPOSE ARE    */ | ||||
| /*    DISCLAIMED.  IN  NO EVENT SHALL THE UNIVERSITY  OF TEXAS AT    */ | ||||
| /*    AUSTIN OR CONTRIBUTORS BE  LIABLE FOR ANY DIRECT, INDIRECT,    */ | ||||
| /*    INCIDENTAL,  SPECIAL, EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES    */ | ||||
| /*    (INCLUDING, BUT  NOT LIMITED TO,  PROCUREMENT OF SUBSTITUTE    */ | ||||
| /*    GOODS  OR  SERVICES; LOSS  OF  USE,  DATA,  OR PROFITS;  OR    */ | ||||
| /*    BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF    */ | ||||
| /*    LIABILITY, WHETHER  IN CONTRACT, STRICT  LIABILITY, OR TORT    */ | ||||
| /*    (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY OUT    */ | ||||
| /*    OF  THE  USE OF  THIS  SOFTWARE,  EVEN  IF ADVISED  OF  THE    */ | ||||
| /*    POSSIBILITY OF SUCH DAMAGE.                                    */ | ||||
| /*                                                                   */ | ||||
| /* The views and conclusions contained in the software and           */ | ||||
| /* documentation are those of the authors and should not be          */ | ||||
| /* interpreted as representing official policies, either expressed   */ | ||||
| /* or implied, of The University of Texas at Austin.                 */ | ||||
| /*********************************************************************/ | ||||
| 
 | ||||
| #include <stdio.h> | ||||
| #include "common.h" | ||||
| #include <arm_sve.h> | ||||
| 
 | ||||
| // TODO: write in assembly with proper unrolling of inner loop
 | ||||
| int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ | ||||
| /*
   * Packs interleaved (real, imag) double pairs from a into b.
   * The n lines of a, spaced 2 * lda doubles apart, are handled in groups of
   * up to svcntd() lines.  For each group and each of the m positions along
   * the lines, the real and imaginary parts of the active lines are gathered
   * and stored to b as interleaved pairs, so b grows by 2 * active doubles
   * per position.  Always returns 0.
   * NOTE(review): a and b are accessed through (double *) casts, so this
   * presumably requires IFLOAT to be double - confirm against the build.
   */
 | ||||
|     BLASLONG j; | ||||
|     IFLOAT *aoffset, *aoffset1, *boffset; | ||||
| 
 | ||||
|     svint64_t lda_vec = svindex_s64(0LL, lda * 2); /* gather indices 0, 2*lda, 4*lda, ... */ | ||||
| 
 | ||||
|     aoffset = a; | ||||
|     boffset = b; | ||||
| 
 | ||||
|     j = 0; | ||||
|     svbool_t pg = svwhilelt_b64(j, n); /* lanes for which j + lane < n */ | ||||
|     uint64_t active = svcntp_b64(svptrue_b64(), pg); | ||||
|     do { | ||||
| 
 | ||||
|         aoffset1 = aoffset; | ||||
| 
 | ||||
|         uint64_t i_cnt = m; | ||||
|         while (i_cnt--) { | ||||
|             svfloat64_t a_vec_real = svld1_gather_index(pg, (double *) aoffset1, lda_vec); | ||||
|             svfloat64_t a_vec_imag = svld1_gather_index(pg, ((double *) aoffset1) + 1, lda_vec); | ||||
|             svst2_f64(pg, (double *) boffset, svcreate2(a_vec_real, a_vec_imag)); | ||||
|             aoffset1 += 2; /* next (real, imag) pair along each line */ | ||||
|             boffset += active * 2; | ||||
|         } | ||||
|         aoffset += active * lda * 2; /* skip the lines just packed */ | ||||
| 
 | ||||
|         j += svcntd(); | ||||
|         pg = svwhilelt_b64(j, n); | ||||
|         active = svcntp_b64(svptrue_b64(), pg); | ||||
| 
 | ||||
| 
 | ||||
|     } while (svptest_any(svptrue_b64(), pg)); | ||||
| 
 | ||||
|     return 0; | ||||
| } | ||||
|  | @ -0,0 +1,75 @@ | |||
| /*********************************************************************/ | ||||
| /* Copyright 2009, 2010 The University of Texas at Austin.           */ | ||||
| /* All rights reserved.                                              */ | ||||
| /*                                                                   */ | ||||
| /* Redistribution and use in source and binary forms, with or        */ | ||||
| /* without modification, are permitted provided that the following   */ | ||||
| /* conditions are met:                                               */ | ||||
| /*                                                                   */ | ||||
| /*   1. Redistributions of source code must retain the above         */ | ||||
| /*      copyright notice, this list of conditions and the following  */ | ||||
| /*      disclaimer.                                                  */ | ||||
| /*                                                                   */ | ||||
| /*   2. Redistributions in binary form must reproduce the above      */ | ||||
| /*      copyright notice, this list of conditions and the following  */ | ||||
| /*      disclaimer in the documentation and/or other materials       */ | ||||
| /*      provided with the distribution.                              */ | ||||
| /*                                                                   */ | ||||
| /*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */ | ||||
| /*    AUSTIN  ``AS IS''  AND ANY  EXPRESS OR  IMPLIED WARRANTIES,    */ | ||||
| /*    INCLUDING, BUT  NOT LIMITED  TO, THE IMPLIED  WARRANTIES OF    */ | ||||
| /*    MERCHANTABILITY  AND FITNESS FOR  A PARTICULAR  PURPOSE ARE    */ | ||||
| /*    DISCLAIMED.  IN  NO EVENT SHALL THE UNIVERSITY  OF TEXAS AT    */ | ||||
| /*    AUSTIN OR CONTRIBUTORS BE  LIABLE FOR ANY DIRECT, INDIRECT,    */ | ||||
| /*    INCIDENTAL,  SPECIAL, EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES    */ | ||||
| /*    (INCLUDING, BUT  NOT LIMITED TO,  PROCUREMENT OF SUBSTITUTE    */ | ||||
| /*    GOODS  OR  SERVICES; LOSS  OF  USE,  DATA,  OR PROFITS;  OR    */ | ||||
| /*    BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF    */ | ||||
| /*    LIABILITY, WHETHER  IN CONTRACT, STRICT  LIABILITY, OR TORT    */ | ||||
| /*    (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY OUT    */ | ||||
| /*    OF  THE  USE OF  THIS  SOFTWARE,  EVEN  IF ADVISED  OF  THE    */ | ||||
| /*    POSSIBILITY OF SUCH DAMAGE.                                    */ | ||||
| /*                                                                   */ | ||||
| /* The views and conclusions contained in the software and           */ | ||||
| /* documentation are those of the authors and should not be          */ | ||||
| /* interpreted as representing official policies, either expressed   */ | ||||
| /* or implied, of The University of Texas at Austin.                 */ | ||||
| /*********************************************************************/ | ||||
| 
 | ||||
| #include <stdio.h> | ||||
| #include "common.h" | ||||
| #include <arm_sve.h> | ||||
| 
 | ||||
| // TODO: write in assembly with proper unrolling of inner loop
 | ||||
| int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ | ||||
| 
 | ||||
|     BLASLONG j; | ||||
|     IFLOAT *aoffset, *aoffset1, *boffset; | ||||
| 
 | ||||
|     aoffset = a; | ||||
|     boffset = b; | ||||
| 
 | ||||
|     j = 0; | ||||
|     svbool_t pg = svwhilelt_b64(j, n); | ||||
|     uint64_t active = svcntp_b64(svptrue_b64(), pg); | ||||
|     do { | ||||
| 
 | ||||
|         aoffset1 = aoffset; | ||||
| 
 | ||||
|         uint64_t i_cnt = m; | ||||
|         while (i_cnt--) { | ||||
|             svfloat64x2_t a_vec = svld2(pg, (double *)aoffset1); | ||||
|             svst2_f64(pg, (double *) boffset, a_vec); | ||||
|             aoffset1 += lda * 2; | ||||
|             boffset += active * 2; | ||||
|         } | ||||
|         aoffset += active * 2; | ||||
| 
 | ||||
|         j += svcntd(); | ||||
|         pg = svwhilelt_b64(j, n); | ||||
|         active = svcntp_b64(svptrue_b64(), pg); | ||||
| 
 | ||||
|     } while (svptest_any(svptrue_b64(), pg)); | ||||
| 
 | ||||
|     return 0; | ||||
| } | ||||
|  | @ -0,0 +1,172 @@ | |||
| /*********************************************************************/ | ||||
| /* Copyright 2009, 2010 The University of Texas at Austin.           */ | ||||
| /* All rights reserved.                                              */ | ||||
| /*                                                                   */ | ||||
| /* Redistribution and use in source and binary forms, with or        */ | ||||
| /* without modification, are permitted provided that the following   */ | ||||
| /* conditions are met:                                               */ | ||||
| /*                                                                   */ | ||||
| /*   1. Redistributions of source code must retain the above         */ | ||||
| /*      copyright notice, this list of conditions and the following  */ | ||||
| /*      disclaimer.                                                  */ | ||||
| /*                                                                   */ | ||||
| /*   2. Redistributions in binary form must reproduce the above      */ | ||||
| /*      copyright notice, this list of conditions and the following  */ | ||||
| /*      disclaimer in the documentation and/or other materials       */ | ||||
| /*      provided with the distribution.                              */ | ||||
| /*                                                                   */ | ||||
| /*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */ | ||||
| /*    AUSTIN  ``AS IS''  AND ANY  EXPRESS OR  IMPLIED WARRANTIES,    */ | ||||
| /*    INCLUDING, BUT  NOT LIMITED  TO, THE IMPLIED  WARRANTIES OF    */ | ||||
| /*    MERCHANTABILITY  AND FITNESS FOR  A PARTICULAR  PURPOSE ARE    */ | ||||
| /*    DISCLAIMED.  IN  NO EVENT SHALL THE UNIVERSITY  OF TEXAS AT    */ | ||||
| /*    AUSTIN OR CONTRIBUTORS BE  LIABLE FOR ANY DIRECT, INDIRECT,    */ | ||||
| /*    INCIDENTAL,  SPECIAL, EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES    */ | ||||
| /*    (INCLUDING, BUT  NOT LIMITED TO,  PROCUREMENT OF SUBSTITUTE    */ | ||||
| /*    GOODS  OR  SERVICES; LOSS  OF  USE,  DATA,  OR PROFITS;  OR    */ | ||||
| /*    BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF    */ | ||||
| /*    LIABILITY, WHETHER  IN CONTRACT, STRICT  LIABILITY, OR TORT    */ | ||||
| /*    (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY OUT    */ | ||||
| /*    OF  THE  USE OF  THIS  SOFTWARE,  EVEN  IF ADVISED  OF  THE    */ | ||||
| /*    POSSIBILITY OF SUCH DAMAGE.                                    */ | ||||
| /*                                                                   */ | ||||
| /* The views and conclusions contained in the software and           */ | ||||
| /* documentation are those of the authors and should not be          */ | ||||
| /* interpreted as representing official policies, either expressed   */ | ||||
| /* or implied, of The University of Texas at Austin.                 */ | ||||
| /*********************************************************************/ | ||||
| 
 | ||||
| #include <stdio.h> | ||||
| #include "common.h" | ||||
| #include <arm_sve.h> | ||||
| 
 | ||||
| int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ | ||||
| 
 | ||||
| #if defined(DOUBLE) | ||||
|   BLASLONG offset, i; | ||||
| 
 | ||||
|   lda *= 2; | ||||
| 
 | ||||
|   uint64_t sve_size = svcntd(); | ||||
|   svint64_t posY_vec = svdup_s64(posY); | ||||
|   svint64_t posX_vec = svdup_s64(posX); | ||||
|   svint64_t lda_vec = svdup_s64(lda); | ||||
|   svint64_t one_vec = svdup_s64(1LL); | ||||
| 
 | ||||
|   int64_t j = 0; | ||||
|   svbool_t pg = svwhilelt_b64(j, n); | ||||
|   int64_t active = svcntp_b64(svptrue_b64(), pg); | ||||
|   svint64_t index_neg = svindex_s64(0LL, -1LL); | ||||
|   svint64_t index = svindex_s64(0LL, 1LL); | ||||
| 
 | ||||
|   do { | ||||
|     offset = posX - posY; | ||||
|     svint64_t vec_off = svdup_s64(offset); | ||||
|     svbool_t cmp = svcmpgt(pg, vec_off, index_neg); | ||||
| 
 | ||||
|     svint64_t temp = svadd_z(pg, posX_vec, index); | ||||
|     svint64_t temp1 = svmul_z(pg, temp, 2); | ||||
|     temp1 = svmla_z(pg, temp1, posY_vec, lda_vec); | ||||
|     svint64_t temp2 = svmul_z(pg, temp, lda_vec); | ||||
|     temp2 = svmla_z(pg, temp2, posY_vec, 2); | ||||
|     svint64_t gat_ind = svsel(cmp, temp1, temp2); | ||||
| 
 | ||||
|     i = m; | ||||
|     while (i>0) { | ||||
|         svfloat64_t data_vec_real = svld1_gather_index(pg, a, gat_ind); | ||||
|         svfloat64_t data_vec_imag = svld1_gather_index(pg, a+1, gat_ind); | ||||
| 
 | ||||
|         gat_ind = svadd_m(cmp, gat_ind, lda_vec); | ||||
|         gat_ind = svadd_m(svnot_z(pg, cmp) , gat_ind, 2); | ||||
|         if (offset <= 0) { | ||||
|             svbool_t off_g = svwhilelt_b64(offset, 0LL); | ||||
|             data_vec_imag = svneg_m(data_vec_imag, off_g, data_vec_imag); | ||||
|         } | ||||
| 
 | ||||
|         svst2(pg, b, svcreate2(data_vec_real, data_vec_imag)); | ||||
|         // dealing with ZERO separately
 | ||||
|         if (offset > -active && offset < 1)  | ||||
|             b[ -2*offset + 1 ] = ZERO; | ||||
| 
 | ||||
|         b += active * 2; | ||||
|         offset --; | ||||
|         vec_off = svsub_z(pg, vec_off, one_vec); | ||||
|         cmp = svcmpgt(pg, vec_off, index_neg); | ||||
| 
 | ||||
|         i--; | ||||
|     } | ||||
| 
 | ||||
|     posX += sve_size; | ||||
|     posX_vec = svdup_s64(posX); | ||||
|     j += sve_size; | ||||
|     pg = svwhilelt_b64(j, n); | ||||
|     active = svcntp_b64(svptrue_b64(), pg); | ||||
|   } while (svptest_any(svptrue_b64(), pg)); | ||||
| 
 | ||||
| #else | ||||
| 
 | ||||
|   int offset, i; | ||||
| 
 | ||||
|   lda *= 2; | ||||
| 
 | ||||
|   uint32_t sve_size = svcntw(); | ||||
|   svint32_t posY_vec = svdup_s32(posY); | ||||
|   svint32_t posX_vec = svdup_s32(posX); | ||||
|   svint32_t lda_vec = svdup_s32(lda); | ||||
|   svint32_t one_vec = svdup_s32(1); | ||||
| 
 | ||||
|   int32_t j = 0; | ||||
|   int32_t N = n; | ||||
|   svbool_t pg = svwhilelt_b32(j, N); | ||||
|   int32_t active = svcntp_b32(svptrue_b32(), pg); | ||||
|   svint32_t index_neg = svindex_s32(0, -1); | ||||
|   svint32_t index = svindex_s32(0, 1); | ||||
| 
 | ||||
|   do { | ||||
|     offset = posX - posY; | ||||
|     svint32_t vec_off = svdup_s32(offset); | ||||
|     svbool_t cmp = svcmpgt(pg, vec_off, index_neg); | ||||
| 
 | ||||
|     svint32_t temp = svadd_z(pg, posX_vec, index); | ||||
|     svint32_t temp1 = svmul_z(pg, temp, 2); | ||||
|     temp1 = svmla_z(pg, temp1, posY_vec, lda_vec); | ||||
|     svint32_t temp2 = svmul_z(pg, temp, lda_vec); | ||||
|     temp2 = svmla_z(pg, temp2, posY_vec, 2); | ||||
|     svint32_t gat_ind = svsel(cmp, temp1, temp2); | ||||
| 
 | ||||
|     i = m; | ||||
|     while (i>0) { | ||||
|         svfloat32_t data_vec_real = svld1_gather_index(pg, a, gat_ind); | ||||
|         svfloat32_t data_vec_imag = svld1_gather_index(pg, a+1, gat_ind); | ||||
| 
 | ||||
|         gat_ind = svadd_m(cmp, gat_ind, lda_vec); | ||||
|         gat_ind = svadd_m(svnot_z(pg, cmp) , gat_ind, 2); | ||||
|         if (offset <= 0) { | ||||
|             svbool_t off_g = svwhilelt_b32(offset, 0); | ||||
|             data_vec_imag = svneg_m(data_vec_imag, off_g, data_vec_imag); | ||||
|         } | ||||
| 
 | ||||
|         svst2(pg, b, svcreate2(data_vec_real, data_vec_imag)); | ||||
|         // dealing with ZERO separately
 | ||||
|         if (offset > -active && offset < 1)  | ||||
|             b[ -2*offset + 1 ] = ZERO; | ||||
| 
 | ||||
|         b += active * 2; | ||||
|         offset --; | ||||
|         vec_off = svsub_z(pg, vec_off, one_vec); | ||||
|         cmp = svcmpgt(pg, vec_off, index_neg); | ||||
| 
 | ||||
|         i--; | ||||
|     } | ||||
| 
 | ||||
|     posX += sve_size; | ||||
|     posX_vec = svdup_s32(posX); | ||||
|     j += sve_size; | ||||
|     pg = svwhilelt_b32(j, N); | ||||
|     active = svcntp_b32(svptrue_b32(), pg); | ||||
|   } while (svptest_any(svptrue_b32(), pg)); | ||||
| 
 | ||||
| #endif | ||||
| 
 | ||||
|   return 0; | ||||
| } | ||||
|  | @ -0,0 +1,172 @@ | |||
| /*********************************************************************/ | ||||
| /* Copyright 2009, 2010 The University of Texas at Austin.           */ | ||||
| /* All rights reserved.                                              */ | ||||
| /*                                                                   */ | ||||
| /* Redistribution and use in source and binary forms, with or        */ | ||||
| /* without modification, are permitted provided that the following   */ | ||||
| /* conditions are met:                                               */ | ||||
| /*                                                                   */ | ||||
| /*   1. Redistributions of source code must retain the above         */ | ||||
| /*      copyright notice, this list of conditions and the following  */ | ||||
| /*      disclaimer.                                                  */ | ||||
| /*                                                                   */ | ||||
| /*   2. Redistributions in binary form must reproduce the above      */ | ||||
| /*      copyright notice, this list of conditions and the following  */ | ||||
| /*      disclaimer in the documentation and/or other materials       */ | ||||
| /*      provided with the distribution.                              */ | ||||
| /*                                                                   */ | ||||
| /*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */ | ||||
| /*    AUSTIN  ``AS IS''  AND ANY  EXPRESS OR  IMPLIED WARRANTIES,    */ | ||||
| /*    INCLUDING, BUT  NOT LIMITED  TO, THE IMPLIED  WARRANTIES OF    */ | ||||
| /*    MERCHANTABILITY  AND FITNESS FOR  A PARTICULAR  PURPOSE ARE    */ | ||||
| /*    DISCLAIMED.  IN  NO EVENT SHALL THE UNIVERSITY  OF TEXAS AT    */ | ||||
| /*    AUSTIN OR CONTRIBUTORS BE  LIABLE FOR ANY DIRECT, INDIRECT,    */ | ||||
| /*    INCIDENTAL,  SPECIAL, EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES    */ | ||||
| /*    (INCLUDING, BUT  NOT LIMITED TO,  PROCUREMENT OF SUBSTITUTE    */ | ||||
| /*    GOODS  OR  SERVICES; LOSS  OF  USE,  DATA,  OR PROFITS;  OR    */ | ||||
| /*    BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF    */ | ||||
| /*    LIABILITY, WHETHER  IN CONTRACT, STRICT  LIABILITY, OR TORT    */ | ||||
| /*    (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY OUT    */ | ||||
| /*    OF  THE  USE OF  THIS  SOFTWARE,  EVEN  IF ADVISED  OF  THE    */ | ||||
| /*    POSSIBILITY OF SUCH DAMAGE.                                    */ | ||||
| /*                                                                   */ | ||||
| /* The views and conclusions contained in the software and           */ | ||||
| /* documentation are those of the authors and should not be          */ | ||||
| /* interpreted as representing official policies, either expressed   */ | ||||
| /* or implied, of The University of Texas at Austin.                 */ | ||||
| /*********************************************************************/ | ||||
| 
 | ||||
| #include <stdio.h> | ||||
| #include "common.h" | ||||
| #include <arm_sve.h> | ||||
| 
 | ||||
| int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ | ||||
| 
 | ||||
| #if defined(DOUBLE) | ||||
|   BLASLONG offset, i; | ||||
| 
 | ||||
|   lda *= 2; | ||||
| 
 | ||||
|   uint64_t sve_size = svcntd(); | ||||
|   svint64_t posY_vec = svdup_s64(posY); | ||||
|   svint64_t posX_vec = svdup_s64(posX); | ||||
|   svint64_t lda_vec = svdup_s64(lda); | ||||
|   svint64_t one_vec = svdup_s64(1LL); | ||||
| 
 | ||||
|   int64_t j = 0; | ||||
|   svbool_t pg = svwhilelt_b64(j, n); | ||||
|   int64_t active = svcntp_b64(svptrue_b64(), pg); | ||||
|   svint64_t index_neg = svindex_s64(0LL, -1LL); | ||||
|   svint64_t index = svindex_s64(0LL, 1LL); | ||||
| 
 | ||||
|   do { | ||||
|     offset = posX - posY; | ||||
|     svint64_t vec_off = svdup_s64(offset); | ||||
|     svbool_t cmp = svcmpgt(pg, vec_off, index_neg); | ||||
| 
 | ||||
|     svint64_t temp = svadd_z(pg, posX_vec, index); | ||||
|     svint64_t temp1 = svmul_z(pg, temp, lda); | ||||
|     temp1 = svmla_z(pg, temp1, posY_vec, 2); | ||||
|     svint64_t temp2 = svmul_z(pg, temp, 2); | ||||
|     temp2 = svmla_z(pg, temp2, posY_vec, lda); | ||||
|     svint64_t gat_ind = svsel(cmp, temp1, temp2); | ||||
| 
 | ||||
|     i = m; | ||||
|     while (i>0) { | ||||
|         svfloat64_t data_vec_real = svld1_gather_index(pg, a, gat_ind); | ||||
|         svfloat64_t data_vec_imag = svld1_gather_index(pg, a+1, gat_ind); | ||||
| 
 | ||||
|         gat_ind = svadd_m(cmp, gat_ind, 2); | ||||
|         gat_ind = svadd_m(svnot_z(pg, cmp) , gat_ind, lda_vec); | ||||
|         data_vec_imag = svneg_z(pg, data_vec_imag); | ||||
|         if (offset <= 0) { | ||||
|             svbool_t off_g = svwhilelt_b64(offset, 0LL); | ||||
|             data_vec_imag = svneg_m(data_vec_imag, off_g, data_vec_imag); | ||||
|         } | ||||
| 
 | ||||
|         svst2(pg, b, svcreate2(data_vec_real, data_vec_imag)); | ||||
|         // dealing with ZERO separately
 | ||||
|         if (offset > -active && offset < 1)  | ||||
|             b[ -2*offset + 1 ] = ZERO; | ||||
| 
 | ||||
|         b += active * 2; | ||||
|         offset --; | ||||
|         vec_off = svsub_z(pg, vec_off, one_vec); | ||||
|         cmp = svcmpgt(pg, vec_off, index_neg); | ||||
| 
 | ||||
|         i--; | ||||
|     } | ||||
| 
 | ||||
|     posX += sve_size; | ||||
|     posX_vec = svdup_s64(posX); | ||||
|     j += sve_size; | ||||
|     pg = svwhilelt_b64(j, n); | ||||
|     active = svcntp_b64(svptrue_b64(), pg); | ||||
|   } while (svptest_any(svptrue_b64(), pg)); | ||||
| #else | ||||
|   int offset, i; | ||||
| 
 | ||||
|   lda *= 2; | ||||
| 
 | ||||
|   uint32_t sve_size = svcntw(); | ||||
|   svint32_t posY_vec = svdup_s32(posY); | ||||
|   svint32_t posX_vec = svdup_s32(posX); | ||||
|   svint32_t lda_vec = svdup_s32(lda); | ||||
|   svint32_t one_vec = svdup_s32(1); | ||||
| 
 | ||||
|   int32_t j = 0; | ||||
|   int32_t N = n; | ||||
|   svbool_t pg = svwhilelt_b32(j, N); | ||||
|   int32_t active = svcntp_b32(svptrue_b32(), pg); | ||||
|   svint32_t index_neg = svindex_s32(0, -1); | ||||
|   svint32_t index = svindex_s32(0, 1); | ||||
| 
 | ||||
|   do { | ||||
|     offset = posX - posY; | ||||
|     svint32_t vec_off = svdup_s32(offset); | ||||
|     svbool_t cmp = svcmpgt(pg, vec_off, index_neg); | ||||
| 
 | ||||
|     svint32_t temp = svadd_z(pg, posX_vec, index); | ||||
|     svint32_t temp1 = svmul_z(pg, temp, lda); | ||||
|     temp1 = svmla_z(pg, temp1, posY_vec, 2); | ||||
|     svint32_t temp2 = svmul_z(pg, temp, 2); | ||||
|     temp2 = svmla_z(pg, temp2, posY_vec, lda); | ||||
|     svint32_t gat_ind = svsel(cmp, temp1, temp2); | ||||
| 
 | ||||
|     i = m; | ||||
|     while (i>0) { | ||||
|         svfloat32_t data_vec_real = svld1_gather_index(pg, a, gat_ind); | ||||
|         svfloat32_t data_vec_imag = svld1_gather_index(pg, a+1, gat_ind); | ||||
| 
 | ||||
|         gat_ind = svadd_m(cmp, gat_ind, 2); | ||||
|         gat_ind = svadd_m(svnot_z(pg, cmp) , gat_ind, lda_vec); | ||||
|         data_vec_imag = svneg_z(pg, data_vec_imag); | ||||
|         if (offset <= 0) { | ||||
|             svbool_t off_g = svwhilelt_b32(offset, 0); | ||||
|             data_vec_imag = svneg_m(data_vec_imag, off_g, data_vec_imag); | ||||
|         } | ||||
| 
 | ||||
|         svst2(pg, b, svcreate2(data_vec_real, data_vec_imag)); | ||||
|         // dealing with ZERO separately
 | ||||
|         if (offset > -active && offset < 1)  | ||||
|             b[ -2*offset + 1 ] = ZERO; | ||||
| 
 | ||||
|         b += active * 2; | ||||
|         offset --; | ||||
|         vec_off = svsub_z(pg, vec_off, one_vec); | ||||
|         cmp = svcmpgt(pg, vec_off, index_neg); | ||||
| 
 | ||||
|         i--; | ||||
|     } | ||||
| 
 | ||||
|     posX += sve_size; | ||||
|     posX_vec = svdup_s32(posX); | ||||
|     j += sve_size; | ||||
|     pg = svwhilelt_b32(j, N); | ||||
|     active = svcntp_b32(svptrue_b32(), pg); | ||||
|   } while (svptest_any(svptrue_b32(), pg)); | ||||
| 
 | ||||
| #endif | ||||
| 
 | ||||
|   return 0; | ||||
| } | ||||
|  | @ -0,0 +1,150 @@ | |||
| /*********************************************************************/ | ||||
| /* Copyright 2009, 2010 The University of Texas at Austin.           */ | ||||
| /* All rights reserved.                                              */ | ||||
| /*                                                                   */ | ||||
| /* Redistribution and use in source and binary forms, with or        */ | ||||
| /* without modification, are permitted provided that the following   */ | ||||
| /* conditions are met:                                               */ | ||||
| /*                                                                   */ | ||||
| /*   1. Redistributions of source code must retain the above         */ | ||||
| /*      copyright notice, this list of conditions and the following  */ | ||||
| /*      disclaimer.                                                  */ | ||||
| /*                                                                   */ | ||||
| /*   2. Redistributions in binary form must reproduce the above      */ | ||||
| /*      copyright notice, this list of conditions and the following  */ | ||||
| /*      disclaimer in the documentation and/or other materials       */ | ||||
| /*      provided with the distribution.                              */ | ||||
| /*                                                                   */ | ||||
| /*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */ | ||||
| /*    AUSTIN  ``AS IS''  AND ANY  EXPRESS OR  IMPLIED WARRANTIES,    */ | ||||
| /*    INCLUDING, BUT  NOT LIMITED  TO, THE IMPLIED  WARRANTIES OF    */ | ||||
| /*    MERCHANTABILITY  AND FITNESS FOR  A PARTICULAR  PURPOSE ARE    */ | ||||
| /*    DISCLAIMED.  IN  NO EVENT SHALL THE UNIVERSITY  OF TEXAS AT    */ | ||||
| /*    AUSTIN OR CONTRIBUTORS BE  LIABLE FOR ANY DIRECT, INDIRECT,    */ | ||||
| /*    INCIDENTAL,  SPECIAL, EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES    */ | ||||
| /*    (INCLUDING, BUT  NOT LIMITED TO,  PROCUREMENT OF SUBSTITUTE    */ | ||||
| /*    GOODS  OR  SERVICES; LOSS  OF  USE,  DATA,  OR PROFITS;  OR    */ | ||||
| /*    BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF    */ | ||||
| /*    LIABILITY, WHETHER  IN CONTRACT, STRICT  LIABILITY, OR TORT    */ | ||||
| /*    (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY OUT    */ | ||||
| /*    OF  THE  USE OF  THIS  SOFTWARE,  EVEN  IF ADVISED  OF  THE    */ | ||||
| /*    POSSIBILITY OF SUCH DAMAGE.                                    */ | ||||
| /*                                                                   */ | ||||
| /* The views and conclusions contained in the software and           */ | ||||
| /* documentation are those of the authors and should not be          */ | ||||
| /* interpreted as representing official policies, either expressed   */ | ||||
| /* or implied, of The University of Texas at Austin.                 */ | ||||
| /*********************************************************************/ | ||||
| 
 | ||||
| #include <stdio.h> | ||||
| #include "common.h" | ||||
| #include <arm_sve.h> | ||||
| 
 | ||||
| int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ | ||||
| 
 | ||||
|   BLASLONG i, offset; | ||||
|   lda *= 2; | ||||
| 
 | ||||
| #if defined(DOUBLE) | ||||
|   uint64_t sve_size = svcntd(); | ||||
|   svint64_t posY_vec = svdup_s64(posY); | ||||
|   svint64_t posX_vec = svdup_s64(posX); | ||||
|   svint64_t lda_vec = svdup_s64(lda); | ||||
|   svint64_t one_vec = svdup_s64(1LL); | ||||
| 
 | ||||
|   int64_t j = 0; | ||||
|   svbool_t pg = svwhilelt_b64(j, n); | ||||
|   int64_t active = svcntp_b64(svptrue_b64(), pg); | ||||
|   svint64_t index_neg = svindex_s64(0LL, -1LL); | ||||
|   svint64_t index = svindex_s64(0LL, 1LL); | ||||
|   do { | ||||
|     offset = posX - posY; | ||||
|     svint64_t vec_off = svdup_s64(offset); | ||||
|     svbool_t cmp = svcmpgt(pg, vec_off, index_neg); | ||||
| 
 | ||||
|     svint64_t temp = svadd_z(pg, posX_vec, index); | ||||
|     svint64_t temp1 = svmul_z(pg, temp, 2); | ||||
|     temp1 = svmla_z(pg, temp1, posY_vec, lda_vec); | ||||
|     svint64_t temp2 = svmul_z(pg, temp, lda_vec); | ||||
|     temp2 = svmla_z(pg, temp2, posY_vec, 2); | ||||
|     svint64_t gat_ind = svsel(cmp, temp1, temp2); | ||||
| 
 | ||||
|     i = m; | ||||
|     while (i>0) { | ||||
|         svfloat64_t data_vec_real = svld1_gather_index(pg, a, gat_ind); | ||||
|         svfloat64_t data_vec_imag = svld1_gather_index(pg, a+1, gat_ind); | ||||
| 
 | ||||
|         gat_ind = svadd_m(cmp, gat_ind, lda_vec); | ||||
|         gat_ind = svadd_m(svnot_z(pg, cmp) , gat_ind, 2); | ||||
| 
 | ||||
|         svst2(pg, b, svcreate2(data_vec_real, data_vec_imag)); | ||||
| 
 | ||||
|         b += active * 2; | ||||
|         offset --; | ||||
|         vec_off = svsub_z(pg, vec_off, one_vec); | ||||
|         cmp = svcmpgt(pg, vec_off, index_neg); | ||||
|          | ||||
|         i--; | ||||
|     } | ||||
| 
 | ||||
|     posX += sve_size; | ||||
|     posX_vec = svdup_s64(posX); | ||||
|     j += sve_size; | ||||
|     pg = svwhilelt_b64(j, n); | ||||
|     active = svcntp_b64(svptrue_b64(), pg); | ||||
|   } while (svptest_any(svptrue_b64(), pg)); | ||||
| 
 | ||||
| #else | ||||
|   uint32_t sve_size = svcntw(); | ||||
|   svint32_t posY_vec = svdup_s32(posY); | ||||
|   svint32_t posX_vec = svdup_s32(posX); | ||||
|   svint32_t lda_vec = svdup_s32(lda); | ||||
|   svint32_t one_vec = svdup_s32(1); | ||||
| 
 | ||||
|   int32_t N = n; | ||||
|   int32_t j = 0; | ||||
|   svbool_t pg = svwhilelt_b32(j, N); | ||||
|   int32_t active = svcntp_b32(svptrue_b32(), pg); | ||||
|   svint32_t index_neg = svindex_s32(0, -1); | ||||
|   svint32_t index = svindex_s32(0, 1); | ||||
|   do { | ||||
|     offset = posX - posY; | ||||
|     svint32_t vec_off = svdup_s32(offset); | ||||
|     svbool_t cmp = svcmpgt(pg, vec_off, index_neg); | ||||
| 
 | ||||
|     svint32_t temp = svadd_z(pg, posX_vec, index); | ||||
|     svint32_t temp1 = svmul_z(pg, temp, 2); | ||||
|     temp1 = svmla_z(pg, temp1, posY_vec, lda_vec); | ||||
|     svint32_t temp2 = svmul_z(pg, temp, lda_vec); | ||||
|     temp2 = svmla_z(pg, temp2, posY_vec, 2); | ||||
|     svint32_t gat_ind = svsel(cmp, temp1, temp2); | ||||
| 
 | ||||
|     i = m; | ||||
|     while (i>0) { | ||||
|         svfloat32_t data_vec_real = svld1_gather_index(pg, a, gat_ind); | ||||
|         svfloat32_t data_vec_imag = svld1_gather_index(pg, a+1, gat_ind); | ||||
| 
 | ||||
|         gat_ind = svadd_m(cmp, gat_ind, lda_vec); | ||||
|         gat_ind = svadd_m(svnot_z(pg, cmp) , gat_ind, 2); | ||||
| 
 | ||||
|         svst2(pg, b, svcreate2(data_vec_real, data_vec_imag)); | ||||
| 
 | ||||
|         b += active * 2; | ||||
|         offset --; | ||||
|         vec_off = svsub_z(pg, vec_off, one_vec); | ||||
|         cmp = svcmpgt(pg, vec_off, index_neg); | ||||
|          | ||||
|         i--; | ||||
|     } | ||||
| 
 | ||||
|     posX += sve_size; | ||||
|     posX_vec = svdup_s32(posX); | ||||
|     j += sve_size; | ||||
|     pg = svwhilelt_b32(j, N); | ||||
|     active = svcntp_b32(svptrue_b32(), pg); | ||||
|   } while (svptest_any(svptrue_b32(), pg)); | ||||
| 
 | ||||
| #endif | ||||
| 
 | ||||
|   return 0; | ||||
| } | ||||
|  | @ -0,0 +1,150 @@ | |||
| /*********************************************************************/ | ||||
| /* Copyright 2009, 2010 The University of Texas at Austin.           */ | ||||
| /* All rights reserved.                                              */ | ||||
| /*                                                                   */ | ||||
| /* Redistribution and use in source and binary forms, with or        */ | ||||
| /* without modification, are permitted provided that the following   */ | ||||
| /* conditions are met:                                               */ | ||||
| /*                                                                   */ | ||||
| /*   1. Redistributions of source code must retain the above         */ | ||||
| /*      copyright notice, this list of conditions and the following  */ | ||||
| /*      disclaimer.                                                  */ | ||||
| /*                                                                   */ | ||||
| /*   2. Redistributions in binary form must reproduce the above      */ | ||||
| /*      copyright notice, this list of conditions and the following  */ | ||||
| /*      disclaimer in the documentation and/or other materials       */ | ||||
| /*      provided with the distribution.                              */ | ||||
| /*                                                                   */ | ||||
| /*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */ | ||||
| /*    AUSTIN  ``AS IS''  AND ANY  EXPRESS OR  IMPLIED WARRANTIES,    */ | ||||
| /*    INCLUDING, BUT  NOT LIMITED  TO, THE IMPLIED  WARRANTIES OF    */ | ||||
| /*    MERCHANTABILITY  AND FITNESS FOR  A PARTICULAR  PURPOSE ARE    */ | ||||
| /*    DISCLAIMED.  IN  NO EVENT SHALL THE UNIVERSITY  OF TEXAS AT    */ | ||||
| /*    AUSTIN OR CONTRIBUTORS BE  LIABLE FOR ANY DIRECT, INDIRECT,    */ | ||||
| /*    INCIDENTAL,  SPECIAL, EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES    */ | ||||
| /*    (INCLUDING, BUT  NOT LIMITED TO,  PROCUREMENT OF SUBSTITUTE    */ | ||||
| /*    GOODS  OR  SERVICES; LOSS  OF  USE,  DATA,  OR PROFITS;  OR    */ | ||||
| /*    BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF    */ | ||||
| /*    LIABILITY, WHETHER  IN CONTRACT, STRICT  LIABILITY, OR TORT    */ | ||||
| /*    (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY OUT    */ | ||||
| /*    OF  THE  USE OF  THIS  SOFTWARE,  EVEN  IF ADVISED  OF  THE    */ | ||||
| /*    POSSIBILITY OF SUCH DAMAGE.                                    */ | ||||
| /*                                                                   */ | ||||
| /* The views and conclusions contained in the software and           */ | ||||
| /* documentation are those of the authors and should not be          */ | ||||
| /* interpreted as representing official policies, either expressed   */ | ||||
| /* or implied, of The University of Texas at Austin.                 */ | ||||
| /*********************************************************************/ | ||||
| 
 | ||||
| #include <stdio.h> | ||||
| #include "common.h" | ||||
| #include <arm_sve.h> | ||||
| 
 | ||||
| int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ | ||||
| 
 | ||||
|   BLASLONG i, offset; | ||||
|   lda *= 2; | ||||
| 
 | ||||
| #if defined(DOUBLE) | ||||
|   uint64_t sve_size = svcntd(); | ||||
|   svint64_t posY_vec = svdup_s64(posY); | ||||
|   svint64_t posX_vec = svdup_s64(posX); | ||||
|   svint64_t lda_vec = svdup_s64(lda); | ||||
|   svint64_t one_vec = svdup_s64(1LL); | ||||
| 
 | ||||
|   int64_t j = 0; | ||||
|   svbool_t pg = svwhilelt_b64(j, n); | ||||
|   int64_t active = svcntp_b64(svptrue_b64(), pg); | ||||
|   svint64_t index_neg = svindex_s64(0LL, -1LL); | ||||
|   svint64_t index = svindex_s64(0LL, 1LL); | ||||
|   do { | ||||
|     offset = posX - posY; | ||||
|     svint64_t vec_off = svdup_s64(offset); | ||||
|     svbool_t cmp = svcmpgt(pg, vec_off, index_neg); | ||||
| 
 | ||||
|     svint64_t temp = svadd_z(pg, posX_vec, index); | ||||
|     svint64_t temp1 = svmul_z(pg, temp, lda_vec); | ||||
|     temp1 = svmla_z(pg, temp1, posY_vec, 2); | ||||
|     svint64_t temp2 = svmul_z(pg, temp, 2); | ||||
|     temp2 = svmla_z(pg, temp2, posY_vec, lda); | ||||
|     svint64_t gat_ind = svsel(cmp, temp1, temp2); | ||||
| 
 | ||||
|     i = m; | ||||
|     while (i>0) { | ||||
|         svfloat64_t data_vec_real = svld1_gather_index(pg, a, gat_ind); | ||||
|         svfloat64_t data_vec_imag = svld1_gather_index(pg, a+1, gat_ind); | ||||
| 
 | ||||
|         gat_ind = svadd_m(cmp, gat_ind, 2); | ||||
|         gat_ind = svadd_m(svnot_z(pg, cmp) , gat_ind, lda_vec); | ||||
| 
 | ||||
|         svst2(pg, b, svcreate2(data_vec_real, data_vec_imag)); | ||||
| 
 | ||||
|         b += active * 2; | ||||
|         offset --; | ||||
|         vec_off = svsub_z(pg, vec_off, one_vec); | ||||
|         cmp = svcmpgt(pg, vec_off, index_neg); | ||||
|          | ||||
|         i--; | ||||
|     } | ||||
| 
 | ||||
|     posX += sve_size; | ||||
|     posX_vec = svdup_s64(posX); | ||||
|     j += sve_size; | ||||
|     pg = svwhilelt_b64(j, n); | ||||
|     active = svcntp_b64(svptrue_b64(), pg); | ||||
|   } while (svptest_any(svptrue_b64(), pg)); | ||||
| 
 | ||||
| #else | ||||
|   uint32_t sve_size = svcntw(); | ||||
|   svint32_t posY_vec = svdup_s32(posY); | ||||
|   svint32_t posX_vec = svdup_s32(posX); | ||||
|   svint32_t lda_vec = svdup_s32(lda); | ||||
|   svint32_t one_vec = svdup_s32(1); | ||||
| 
 | ||||
|   int32_t N = n; | ||||
|   int32_t j = 0; | ||||
|   svbool_t pg = svwhilelt_b32(j, N); | ||||
|   int32_t active = svcntp_b32(svptrue_b32(), pg); | ||||
|   svint32_t index_neg = svindex_s32(0, -1); | ||||
|   svint32_t index = svindex_s32(0, 1); | ||||
|   do { | ||||
|     offset = posX - posY; | ||||
|     svint32_t vec_off = svdup_s32(offset); | ||||
|     svbool_t cmp = svcmpgt(pg, vec_off, index_neg); | ||||
| 
 | ||||
|     svint32_t temp = svadd_z(pg, posX_vec, index); | ||||
|     svint32_t temp1 = svmul_z(pg, temp, lda_vec); | ||||
|     temp1 = svmla_z(pg, temp1, posY_vec, 2); | ||||
|     svint32_t temp2 = svmul_z(pg, temp, 2); | ||||
|     temp2 = svmla_z(pg, temp2, posY_vec, lda); | ||||
|     svint32_t gat_ind = svsel(cmp, temp1, temp2); | ||||
| 
 | ||||
|     i = m; | ||||
|     while (i>0) { | ||||
|         svfloat32_t data_vec_real = svld1_gather_index(pg, a, gat_ind); | ||||
|         svfloat32_t data_vec_imag = svld1_gather_index(pg, a+1, gat_ind); | ||||
| 
 | ||||
|         gat_ind = svadd_m(cmp, gat_ind, 2); | ||||
|         gat_ind = svadd_m(svnot_z(pg, cmp) , gat_ind, lda_vec); | ||||
| 
 | ||||
|         svst2(pg, b, svcreate2(data_vec_real, data_vec_imag)); | ||||
| 
 | ||||
|         b += active * 2; | ||||
|         offset --; | ||||
|         vec_off = svsub_z(pg, vec_off, one_vec); | ||||
|         cmp = svcmpgt(pg, vec_off, index_neg); | ||||
|          | ||||
|         i--; | ||||
|     } | ||||
| 
 | ||||
|     posX += sve_size; | ||||
|     posX_vec = svdup_s32(posX); | ||||
|     j += sve_size; | ||||
|     pg = svwhilelt_b32(j, N); | ||||
|     active = svcntp_b32(svptrue_b32(), pg); | ||||
|   } while (svptest_any(svptrue_b32(), pg)); | ||||
| 
 | ||||
| #endif | ||||
| 
 | ||||
|   return 0; | ||||
| } | ||||
										
											
												File diff suppressed because it is too large
												Load Diff
											
										
									
								
							|  | @ -0,0 +1,145 @@ | |||
| /*********************************************************************/ | ||||
| /* Copyright 2009, 2010 The University of Texas at Austin.           */ | ||||
| /* All rights reserved.                                              */ | ||||
| /*                                                                   */ | ||||
| /* Redistribution and use in source and binary forms, with or        */ | ||||
| /* without modification, are permitted provided that the following   */ | ||||
| /* conditions are met:                                               */ | ||||
| /*                                                                   */ | ||||
| /*   1. Redistributions of source code must retain the above         */ | ||||
| /*      copyright notice, this list of conditions and the following  */ | ||||
| /*      disclaimer.                                                  */ | ||||
| /*                                                                   */ | ||||
| /*   2. Redistributions in binary form must reproduce the above      */ | ||||
| /*      copyright notice, this list of conditions and the following  */ | ||||
| /*      disclaimer in the documentation and/or other materials       */ | ||||
| /*      provided with the distribution.                              */ | ||||
| /*                                                                   */ | ||||
| /*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */ | ||||
| /*    AUSTIN  ``AS IS''  AND ANY  EXPRESS OR  IMPLIED WARRANTIES,    */ | ||||
| /*    INCLUDING, BUT  NOT LIMITED  TO, THE IMPLIED  WARRANTIES OF    */ | ||||
| /*    MERCHANTABILITY  AND FITNESS FOR  A PARTICULAR  PURPOSE ARE    */ | ||||
| /*    DISCLAIMED.  IN  NO EVENT SHALL THE UNIVERSITY  OF TEXAS AT    */ | ||||
| /*    AUSTIN OR CONTRIBUTORS BE  LIABLE FOR ANY DIRECT, INDIRECT,    */ | ||||
| /*    INCIDENTAL,  SPECIAL, EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES    */ | ||||
| /*    (INCLUDING, BUT  NOT LIMITED TO,  PROCUREMENT OF SUBSTITUTE    */ | ||||
| /*    GOODS  OR  SERVICES; LOSS  OF  USE,  DATA,  OR PROFITS;  OR    */ | ||||
| /*    BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF    */ | ||||
| /*    LIABILITY, WHETHER  IN CONTRACT, STRICT  LIABILITY, OR TORT    */ | ||||
| /*    (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY OUT    */ | ||||
| /*    OF  THE  USE OF  THIS  SOFTWARE,  EVEN  IF ADVISED  OF  THE    */ | ||||
| /*    POSSIBILITY OF SUCH DAMAGE.                                    */ | ||||
| /*                                                                   */ | ||||
| /* The views and conclusions contained in the software and           */ | ||||
| /* documentation are those of the authors and should not be          */ | ||||
| /* interpreted as representing official policies, either expressed   */ | ||||
| /* or implied, of The University of Texas at Austin.                 */ | ||||
| /*********************************************************************/ | ||||
| 
 | ||||
| #include <stdio.h> | ||||
| #include "common.h" | ||||
| 
 | ||||
| #ifdef __ARM_FEATURE_SVE | ||||
| #include <arm_sve.h> | ||||
| #endif | ||||
| 
 | ||||
| int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ | ||||
| 
 | ||||
|     BLASLONG i, js; | ||||
|     BLASLONG X; | ||||
| 
 | ||||
|     lda += lda; | ||||
| 
 | ||||
|     js = 0; | ||||
|     FLOAT *ao; | ||||
| #ifdef DOUBLE | ||||
|     svint64_t index = svindex_s64(0LL, lda); | ||||
|     svbool_t pn = svwhilelt_b64(js, n); | ||||
|     int n_active = svcntp_b64(svptrue_b64(), pn); | ||||
| #else | ||||
|     svint32_t index = svindex_s32(0, lda); | ||||
|     svbool_t pn = svwhilelt_b32(js, n); | ||||
|     int n_active = svcntp_b32(svptrue_b32(), pn); | ||||
| #endif | ||||
|     do | ||||
|     { | ||||
|         X = posX; | ||||
| 
 | ||||
|         if (posX <= posY) { | ||||
|             ao = a + posY * 2 + posX * lda; | ||||
|         } else { | ||||
|             ao = a + posX * 2 + posY * lda; | ||||
|         } | ||||
| 
 | ||||
|         i = 0; | ||||
|         do  | ||||
|         { | ||||
|             if (X > posY) { | ||||
| #ifdef DOUBLE | ||||
|                 svfloat64_t aj_vec_real = svld1_gather_index(pn, ao, index); | ||||
|                 svfloat64_t aj_vec_imag = svld1_gather_index(pn, ao+1, index); | ||||
| #else | ||||
|                 svfloat32_t aj_vec_real = svld1_gather_index(pn, ao, index); | ||||
|                 svfloat32_t aj_vec_imag = svld1_gather_index(pn, ao+1, index); | ||||
| #endif | ||||
|                 svst2(pn, b, svcreate2(aj_vec_real, aj_vec_imag)); | ||||
|                 ao += 2; | ||||
|                 b += n_active * 2; | ||||
|                 X ++; | ||||
|                 i ++; | ||||
|             } else  | ||||
|                 if (X < posY) { | ||||
|                     ao += lda; | ||||
|                     b += n_active * 2; | ||||
|                     X ++; | ||||
|                     i ++; | ||||
|                 } else { | ||||
|                     /* I did not find a way to unroll this while preserving vector-length-agnostic code. */ | ||||
| #ifdef UNIT | ||||
|                     int temp = 0; | ||||
|                     for (int j = 0; j < n_active; j++) { | ||||
|                         for (int k = 0 ; k < j; k++) { | ||||
|                             b[temp++] = *(ao+k*lda+j*2); | ||||
|                             b[temp++] = *(ao+k*lda+j*2+1); | ||||
|                         } | ||||
|                         b[temp++] = ONE; | ||||
|                         b[temp++] = ZERO; | ||||
|                         for (int k = j+1; k < n_active; k++) { | ||||
|                             b[temp++] = ZERO; | ||||
|                             b[temp++] = ZERO; | ||||
|                         } | ||||
|                     } | ||||
| #else  | ||||
|                     int temp = 0; | ||||
|                     for (int j = 0; j < n_active; j++) { | ||||
|                         for (int k = 0 ; k <= j; k++) { | ||||
|                             b[temp++] = *(ao+k*lda+j*2); | ||||
|                             b[temp++] = *(ao+k*lda+j*2+1); | ||||
|                         } | ||||
|                         for (int k = j+1; k < n_active; k++) { | ||||
|                             b[temp++] = ZERO; | ||||
|                             b[temp++] = ZERO; | ||||
|                         } | ||||
|                     } | ||||
| #endif | ||||
|                     ao += n_active * 2; | ||||
|                     b += n_active*n_active * 2; | ||||
|                     X += n_active; | ||||
|                     i += n_active; | ||||
|                 } | ||||
|         } while (i < m); | ||||
| 
 | ||||
|         posY += n_active; | ||||
|         js += n_active; | ||||
| #ifdef DOUBLE | ||||
|         pn = svwhilelt_b64(js, n); | ||||
|         n_active = svcntp_b64(svptrue_b64(), pn); | ||||
|     } while (svptest_any(svptrue_b64(), pn)); | ||||
| #else | ||||
|         pn = svwhilelt_b32(js, n); | ||||
|         n_active = svcntp_b32(svptrue_b32(), pn); | ||||
|     } while (svptest_any(svptrue_b32(), pn)); | ||||
| #endif | ||||
| 
 | ||||
|     return 0; | ||||
| } | ||||
|  | @ -0,0 +1,143 @@ | |||
| /*********************************************************************/ | ||||
| /* Copyright 2009, 2010 The University of Texas at Austin.           */ | ||||
| /* All rights reserved.                                              */ | ||||
| /*                                                                   */ | ||||
| /* Redistribution and use in source and binary forms, with or        */ | ||||
| /* without modification, are permitted provided that the following   */ | ||||
| /* conditions are met:                                               */ | ||||
| /*                                                                   */ | ||||
| /*   1. Redistributions of source code must retain the above         */ | ||||
| /*      copyright notice, this list of conditions and the following  */ | ||||
| /*      disclaimer.                                                  */ | ||||
| /*                                                                   */ | ||||
| /*   2. Redistributions in binary form must reproduce the above      */ | ||||
| /*      copyright notice, this list of conditions and the following  */ | ||||
| /*      disclaimer in the documentation and/or other materials       */ | ||||
| /*      provided with the distribution.                              */ | ||||
| /*                                                                   */ | ||||
| /*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */ | ||||
| /*    AUSTIN  ``AS IS''  AND ANY  EXPRESS OR  IMPLIED WARRANTIES,    */ | ||||
| /*    INCLUDING, BUT  NOT LIMITED  TO, THE IMPLIED  WARRANTIES OF    */ | ||||
| /*    MERCHANTABILITY  AND FITNESS FOR  A PARTICULAR  PURPOSE ARE    */ | ||||
| /*    DISCLAIMED.  IN  NO EVENT SHALL THE UNIVERSITY  OF TEXAS AT    */ | ||||
| /*    AUSTIN OR CONTRIBUTORS BE  LIABLE FOR ANY DIRECT, INDIRECT,    */ | ||||
| /*    INCIDENTAL,  SPECIAL, EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES    */ | ||||
| /*    (INCLUDING, BUT  NOT LIMITED TO,  PROCUREMENT OF SUBSTITUTE    */ | ||||
| /*    GOODS  OR  SERVICES; LOSS  OF  USE,  DATA,  OR PROFITS;  OR    */ | ||||
| /*    BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF    */ | ||||
| /*    LIABILITY, WHETHER  IN CONTRACT, STRICT  LIABILITY, OR TORT    */ | ||||
| /*    (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY OUT    */ | ||||
| /*    OF  THE  USE OF  THIS  SOFTWARE,  EVEN  IF ADVISED  OF  THE    */ | ||||
| /*    POSSIBILITY OF SUCH DAMAGE.                                    */ | ||||
| /*                                                                   */ | ||||
| /* The views and conclusions contained in the software and           */ | ||||
| /* documentation are those of the authors and should not be          */ | ||||
| /* interpreted as representing official policies, either expressed   */ | ||||
| /* or implied, of The University of Texas at Austin.                 */ | ||||
| /*********************************************************************/ | ||||
| 
 | ||||
| #include <stdio.h> | ||||
| #include "common.h" | ||||
| 
 | ||||
| #ifdef __ARM_FEATURE_SVE | ||||
| #include <arm_sve.h> | ||||
| #endif | ||||
| 
 | ||||
| int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ | ||||
| 
 | ||||
|     BLASLONG i, js; | ||||
|     BLASLONG X; | ||||
| 
 | ||||
|     lda += lda; | ||||
| 
 | ||||
|     FLOAT *ao; | ||||
|     js = 0; | ||||
| #ifdef DOUBLE | ||||
|     svbool_t pn = svwhilelt_b64(js, n); | ||||
|     int n_active = svcntp_b64(svptrue_b64(), pn); | ||||
| #else | ||||
|     svbool_t pn = svwhilelt_b32(js, n); | ||||
|     int n_active = svcntp_b32(svptrue_b32(), pn); | ||||
| #endif | ||||
|     do | ||||
|     { | ||||
|         X = posX; | ||||
| 
 | ||||
|         if (posX <= posY) { | ||||
|             ao = a + posY * 2 + posX * lda; | ||||
|         } else { | ||||
|             ao = a + posX * 2 + posY * lda; | ||||
|         } | ||||
| 
 | ||||
|         i = 0; | ||||
|         do  | ||||
|         { | ||||
|             if (X > posY) { | ||||
|                 ao += 2; | ||||
|                 b += n_active * 2; | ||||
|                 X ++; | ||||
|                 i ++; | ||||
|             } else  | ||||
|                 if (X < posY) { | ||||
| #ifdef DOUBLE | ||||
|                     svfloat64x2_t aj_vec = svld2(pn, ao); | ||||
| #else | ||||
|                     svfloat32x2_t aj_vec = svld2(pn, ao); | ||||
| #endif | ||||
|                     svst2(pn, b, aj_vec); | ||||
|                     ao += lda; | ||||
|                     b += n_active * 2; | ||||
|                     X ++; | ||||
|                     i ++; | ||||
|                 } else { | ||||
|                     /* I did not find a way to unroll this while preserving vector-length-agnostic code. */ | ||||
| #ifdef UNIT | ||||
|                     int temp = 0; | ||||
|                     for (int j = 0; j < n_active; j++) { | ||||
|                         for (int k = 0 ; k < j; k++) { | ||||
|                             b[temp++] = ZERO; | ||||
|                             b[temp++] = ZERO; | ||||
|                         } | ||||
|                         b[temp++] = ONE; | ||||
|                         b[temp++] = ZERO; | ||||
|                         for (int k = j+1; k < n_active; k++) { | ||||
|                             b[temp++] = *(ao+j*lda+k*2); | ||||
|                             b[temp++] = *(ao+j*lda+k*2+1); | ||||
|                         } | ||||
|                     } | ||||
| #else  | ||||
|                     int temp = 0; | ||||
|                     for (int j = 0; j < n_active; j++) { | ||||
|                         for (int k = 0 ; k < j; k++) { | ||||
|                             b[temp++] = ZERO; | ||||
|                             b[temp++] = ZERO; | ||||
|                         } | ||||
|                         for (int k = j; k < n_active; k++) { | ||||
|                             b[temp++] = *(ao+j*lda+k*2); | ||||
|                             b[temp++] = *(ao+j*lda+k*2+1); | ||||
|                         } | ||||
|                     } | ||||
| #endif | ||||
|                     ao += n_active * lda; | ||||
|                     b += n_active*n_active * 2; | ||||
|                     X += n_active; | ||||
|                     i += n_active; | ||||
|                 } | ||||
|         } while (i < m); | ||||
| 
 | ||||
| 
 | ||||
|         posY += n_active; | ||||
|         js += n_active; | ||||
| #ifdef DOUBLE | ||||
|         pn = svwhilelt_b64(js, n); | ||||
|         n_active = svcntp_b64(svptrue_b64(), pn); | ||||
|     } while (svptest_any(svptrue_b64(), pn)); | ||||
| #else | ||||
|         pn = svwhilelt_b32(js, n); | ||||
|         n_active = svcntp_b32(svptrue_b32(), pn); | ||||
|     } while (svptest_any(svptrue_b32(), pn)); | ||||
| #endif | ||||
| 
 | ||||
| 
 | ||||
|     return 0; | ||||
| } | ||||
|  | @ -0,0 +1,145 @@ | |||
| /*********************************************************************/ | ||||
| /* Copyright 2009, 2010 The University of Texas at Austin.           */ | ||||
| /* All rights reserved.                                              */ | ||||
| /*                                                                   */ | ||||
| /* Redistribution and use in source and binary forms, with or        */ | ||||
| /* without modification, are permitted provided that the following   */ | ||||
| /* conditions are met:                                               */ | ||||
| /*                                                                   */ | ||||
| /*   1. Redistributions of source code must retain the above         */ | ||||
| /*      copyright notice, this list of conditions and the following  */ | ||||
| /*      disclaimer.                                                  */ | ||||
| /*                                                                   */ | ||||
| /*   2. Redistributions in binary form must reproduce the above      */ | ||||
| /*      copyright notice, this list of conditions and the following  */ | ||||
| /*      disclaimer in the documentation and/or other materials       */ | ||||
| /*      provided with the distribution.                              */ | ||||
| /*                                                                   */ | ||||
| /*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */ | ||||
| /*    AUSTIN  ``AS IS''  AND ANY  EXPRESS OR  IMPLIED WARRANTIES,    */ | ||||
| /*    INCLUDING, BUT  NOT LIMITED  TO, THE IMPLIED  WARRANTIES OF    */ | ||||
| /*    MERCHANTABILITY  AND FITNESS FOR  A PARTICULAR  PURPOSE ARE    */ | ||||
| /*    DISCLAIMED.  IN  NO EVENT SHALL THE UNIVERSITY  OF TEXAS AT    */ | ||||
| /*    AUSTIN OR CONTRIBUTORS BE  LIABLE FOR ANY DIRECT, INDIRECT,    */ | ||||
| /*    INCIDENTAL,  SPECIAL, EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES    */ | ||||
| /*    (INCLUDING, BUT  NOT LIMITED TO,  PROCUREMENT OF SUBSTITUTE    */ | ||||
| /*    GOODS  OR  SERVICES; LOSS  OF  USE,  DATA,  OR PROFITS;  OR    */ | ||||
| /*    BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF    */ | ||||
| /*    LIABILITY, WHETHER  IN CONTRACT, STRICT  LIABILITY, OR TORT    */ | ||||
| /*    (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY OUT    */ | ||||
| /*    OF  THE  USE OF  THIS  SOFTWARE,  EVEN  IF ADVISED  OF  THE    */ | ||||
| /*    POSSIBILITY OF SUCH DAMAGE.                                    */ | ||||
| /*                                                                   */ | ||||
| /* The views and conclusions contained in the software and           */ | ||||
| /* documentation are those of the authors and should not be          */ | ||||
| /* interpreted as representing official policies, either expressed   */ | ||||
| /* or implied, of The University of Texas at Austin.                 */ | ||||
| /*********************************************************************/ | ||||
| 
 | ||||
| #include <stdio.h> | ||||
| #include "common.h" | ||||
| 
 | ||||
| #ifdef __ARM_FEATURE_SVE | ||||
| #include <arm_sve.h> | ||||
| #endif | ||||
| 
 | ||||
| int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ | ||||
| 
 | ||||
|     BLASLONG i, js; | ||||
|     BLASLONG X; | ||||
| 
 | ||||
|     lda += lda; | ||||
| 
 | ||||
|     js = 0; | ||||
|     FLOAT *ao; | ||||
| #ifdef DOUBLE | ||||
|     svint64_t index = svindex_s64(0LL, lda); | ||||
|     svbool_t pn = svwhilelt_b64(js, n); | ||||
|     int n_active = svcntp_b64(svptrue_b64(), pn); | ||||
| #else | ||||
|     svint32_t index = svindex_s32(0, lda); | ||||
|     svbool_t pn = svwhilelt_b32(js, n); | ||||
|     int n_active = svcntp_b32(svptrue_b32(), pn); | ||||
| #endif | ||||
|     do | ||||
|     { | ||||
|         X = posX; | ||||
| 
 | ||||
|         if (posX <= posY) { | ||||
|             ao = a + posX * 2 + posY * lda; | ||||
|         } else { | ||||
|             ao = a + posY * 2 + posX * lda; | ||||
|         } | ||||
| 
 | ||||
|         i = 0; | ||||
|         do  | ||||
|         { | ||||
|             if (X < posY) { | ||||
| #ifdef DOUBLE | ||||
|                 svfloat64_t aj_vec_real = svld1_gather_index(pn, ao, index); | ||||
|                 svfloat64_t aj_vec_imag = svld1_gather_index(pn, ao+1, index); | ||||
| #else | ||||
|                 svfloat32_t aj_vec_real = svld1_gather_index(pn, ao, index); | ||||
|                 svfloat32_t aj_vec_imag = svld1_gather_index(pn, ao+1, index); | ||||
| #endif | ||||
|                 svst2(pn, b, svcreate2(aj_vec_real, aj_vec_imag)); | ||||
|                 ao += 2; | ||||
|                 b += n_active * 2; | ||||
|                 X ++; | ||||
|                 i ++; | ||||
|             } else  | ||||
|                 if (X > posY) { | ||||
|                     ao += lda; | ||||
|                     b += n_active * 2; | ||||
|                     X ++; | ||||
|                     i ++; | ||||
|                 } else { | ||||
|                     /* I did not find a way to unroll this while preserving vector-length-agnostic code. */ | ||||
| #ifdef UNIT | ||||
|                     int temp = 0; | ||||
|                     for (int j = 0; j < n_active; j++) { | ||||
|                         for (int k = 0 ; k < j; k++) { | ||||
|                             b[temp++] = ZERO; | ||||
|                             b[temp++] = ZERO; | ||||
|                         } | ||||
|                         b[temp++] = ONE; | ||||
|                         b[temp++] = ZERO; | ||||
|                         for (int k = j+1; k < n_active; k++) { | ||||
|                             b[temp++] = *(ao+k*lda+j*2); | ||||
|                             b[temp++] = *(ao+k*lda+j*2+1); | ||||
|                         } | ||||
|                     } | ||||
| #else  | ||||
|                     int temp = 0; | ||||
|                     for (int j = 0; j < n_active; j++) { | ||||
|                         for (int k = 0 ; k < j; k++) { | ||||
|                             b[temp++] = ZERO; | ||||
|                             b[temp++] = ZERO; | ||||
|                         } | ||||
|                         for (int k = j; k < n_active; k++) { | ||||
|                             b[temp++] = *(ao+k*lda+j*2); | ||||
|                             b[temp++] = *(ao+k*lda+j*2+1); | ||||
|                         } | ||||
|                     } | ||||
| #endif | ||||
|                     ao += n_active * 2; | ||||
|                     b += n_active*n_active * 2; | ||||
|                     X += n_active; | ||||
|                     i += n_active; | ||||
|                 } | ||||
|         } while (i < m); | ||||
| 
 | ||||
|         posY += n_active; | ||||
|         js += n_active; | ||||
| #ifdef DOUBLE | ||||
|         pn = svwhilelt_b64(js, n); | ||||
|         n_active = svcntp_b64(svptrue_b64(), pn); | ||||
|     } while (svptest_any(svptrue_b64(), pn)); | ||||
| #else | ||||
|         pn = svwhilelt_b32(js, n); | ||||
|         n_active = svcntp_b32(svptrue_b32(), pn); | ||||
|     } while (svptest_any(svptrue_b32(), pn)); | ||||
| #endif | ||||
| 
 | ||||
|     return 0; | ||||
| } | ||||
|  | @ -0,0 +1,141 @@ | |||
| /*********************************************************************/ | ||||
| /* Copyright 2009, 2010 The University of Texas at Austin.           */ | ||||
| /* All rights reserved.                                              */ | ||||
| /*                                                                   */ | ||||
| /* Redistribution and use in source and binary forms, with or        */ | ||||
| /* without modification, are permitted provided that the following   */ | ||||
| /* conditions are met:                                               */ | ||||
| /*                                                                   */ | ||||
| /*   1. Redistributions of source code must retain the above         */ | ||||
| /*      copyright notice, this list of conditions and the following  */ | ||||
| /*      disclaimer.                                                  */ | ||||
| /*                                                                   */ | ||||
| /*   2. Redistributions in binary form must reproduce the above      */ | ||||
| /*      copyright notice, this list of conditions and the following  */ | ||||
| /*      disclaimer in the documentation and/or other materials       */ | ||||
| /*      provided with the distribution.                              */ | ||||
| /*                                                                   */ | ||||
| /*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */ | ||||
| /*    AUSTIN  ``AS IS''  AND ANY  EXPRESS OR  IMPLIED WARRANTIES,    */ | ||||
| /*    INCLUDING, BUT  NOT LIMITED  TO, THE IMPLIED  WARRANTIES OF    */ | ||||
| /*    MERCHANTABILITY  AND FITNESS FOR  A PARTICULAR  PURPOSE ARE    */ | ||||
| /*    DISCLAIMED.  IN  NO EVENT SHALL THE UNIVERSITY  OF TEXAS AT    */ | ||||
| /*    AUSTIN OR CONTRIBUTORS BE  LIABLE FOR ANY DIRECT, INDIRECT,    */ | ||||
| /*    INCIDENTAL,  SPECIAL, EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES    */ | ||||
| /*    (INCLUDING, BUT  NOT LIMITED TO,  PROCUREMENT OF SUBSTITUTE    */ | ||||
| /*    GOODS  OR  SERVICES; LOSS  OF  USE,  DATA,  OR PROFITS;  OR    */ | ||||
| /*    BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF    */ | ||||
| /*    LIABILITY, WHETHER  IN CONTRACT, STRICT  LIABILITY, OR TORT    */ | ||||
| /*    (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY OUT    */ | ||||
| /*    OF  THE  USE OF  THIS  SOFTWARE,  EVEN  IF ADVISED  OF  THE    */ | ||||
| /*    POSSIBILITY OF SUCH DAMAGE.                                    */ | ||||
| /*                                                                   */ | ||||
| /* The views and conclusions contained in the software and           */ | ||||
| /* documentation are those of the authors and should not be          */ | ||||
| /* interpreted as representing official policies, either expressed   */ | ||||
| /* or implied, of The University of Texas at Austin.                 */ | ||||
| /*********************************************************************/ | ||||
| 
 | ||||
| #include <stdio.h> | ||||
| #include "common.h" | ||||
| 
 | ||||
| #ifdef __ARM_FEATURE_SVE | ||||
| #include <arm_sve.h> | ||||
| #endif | ||||
| 
 | ||||
| int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ | ||||
| 
 | ||||
|     BLASLONG i, js; | ||||
|     BLASLONG X; | ||||
| 
 | ||||
|     lda += lda; | ||||
| 
 | ||||
|     FLOAT *ao; | ||||
|     js = 0; | ||||
| #ifdef DOUBLE | ||||
|     svbool_t pn = svwhilelt_b64(js, n); | ||||
|     int n_active = svcntp_b64(svptrue_b64(), pn); | ||||
| #else | ||||
|     svbool_t pn = svwhilelt_b32(js, n); | ||||
|     int n_active = svcntp_b32(svptrue_b32(), pn); | ||||
| #endif | ||||
|     do | ||||
|     { | ||||
|         X = posX; | ||||
| 
 | ||||
|         if (posX <= posY) { | ||||
|             ao = a + posX * 2 + posY * lda; | ||||
|         } else { | ||||
|             ao = a + posY * 2 + posX * lda; | ||||
|         } | ||||
| 
 | ||||
|         i = 0; | ||||
|         do  | ||||
|         { | ||||
|             if (X < posY) { | ||||
|                 ao += 2; | ||||
|                 b += n_active * 2; | ||||
|                 X ++; | ||||
|                 i ++; | ||||
|             } else  | ||||
|                 if (X > posY) { | ||||
| #ifdef DOUBLE | ||||
|                     svfloat64x2_t aj_vec = svld2(pn, ao); | ||||
| #else | ||||
|                     svfloat32x2_t aj_vec = svld2(pn, ao); | ||||
| #endif | ||||
|                     svst2(pn, b, aj_vec); | ||||
|                     ao += lda; | ||||
|                     b += n_active * 2; | ||||
|                     X ++; | ||||
|                     i ++; | ||||
|                 } else {  | ||||
|                     /* I did not find a way to unroll this while preserving vector-length-agnostic code. */ | ||||
| #ifdef UNIT | ||||
|                     int temp = 0; | ||||
|                     for (int j = 0; j < n_active; j++) { | ||||
|                         for (int k = 0 ; k < j; k++) { | ||||
|                             b[temp++] = *(ao+j*lda+k*2); | ||||
|                             b[temp++] = *(ao+j*lda+k*2+1); | ||||
|                         } | ||||
|                         b[temp++] = ONE; | ||||
|                         b[temp++] = ZERO; | ||||
|                         for (int k = j+1; k < n_active; k++) { | ||||
|                             b[temp++] = ZERO; | ||||
|                             b[temp++] = ZERO; | ||||
|                         } | ||||
|                     } | ||||
| #else  | ||||
|                     int temp = 0; | ||||
|                     for (int j = 0; j < n_active; j++) { | ||||
|                         for (int k = 0 ; k <= j; k++) { | ||||
|                             b[temp++] = *(ao+j*lda+k*2); | ||||
|                             b[temp++] = *(ao+j*lda+k*2+1); | ||||
|                         } | ||||
|                         for (int k = j+1; k < n_active; k++) { | ||||
|                             b[temp++] = ZERO; | ||||
|                             b[temp++] = ZERO; | ||||
|                         } | ||||
|                     } | ||||
| #endif | ||||
|                     ao += n_active * lda; | ||||
|                     b += n_active*n_active * 2; | ||||
|                     X += n_active; | ||||
|                     i += n_active; | ||||
|                 } | ||||
|         } while (i < m); | ||||
| 
 | ||||
|         posY += n_active; | ||||
|         js += n_active; | ||||
| #ifdef DOUBLE | ||||
|         pn = svwhilelt_b64(js, n); | ||||
|         n_active = svcntp_b64(svptrue_b64(), pn); | ||||
|     } while (svptest_any(svptrue_b64(), pn)); | ||||
| #else | ||||
|         pn = svwhilelt_b32(js, n); | ||||
|         n_active = svcntp_b32(svptrue_b32(), pn); | ||||
|     } while (svptest_any(svptrue_b32(), pn)); | ||||
| #endif | ||||
| 
 | ||||
|     return 0; | ||||
| } | ||||
|  | @ -0,0 +1,119 @@ | |||
| /*********************************************************************/ | ||||
| /* Copyright 2009, 2010 The University of Texas at Austin.           */ | ||||
| /* All rights reserved.                                              */ | ||||
| /*                                                                   */ | ||||
| /* Redistribution and use in source and binary forms, with or        */ | ||||
| /* without modification, are permitted provided that the following   */ | ||||
| /* conditions are met:                                               */ | ||||
| /*                                                                   */ | ||||
| /*   1. Redistributions of source code must retain the above         */ | ||||
| /*      copyright notice, this list of conditions and the following  */ | ||||
| /*      disclaimer.                                                  */ | ||||
| /*                                                                   */ | ||||
| /*   2. Redistributions in binary form must reproduce the above      */ | ||||
| /*      copyright notice, this list of conditions and the following  */ | ||||
| /*      disclaimer in the documentation and/or other materials       */ | ||||
| /*      provided with the distribution.                              */ | ||||
| /*                                                                   */ | ||||
| /*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */ | ||||
| /*    AUSTIN  ``AS IS''  AND ANY  EXPRESS OR  IMPLIED WARRANTIES,    */ | ||||
| /*    INCLUDING, BUT  NOT LIMITED  TO, THE IMPLIED  WARRANTIES OF    */ | ||||
| /*    MERCHANTABILITY  AND FITNESS FOR  A PARTICULAR  PURPOSE ARE    */ | ||||
| /*    DISCLAIMED.  IN  NO EVENT SHALL THE UNIVERSITY  OF TEXAS AT    */ | ||||
| /*    AUSTIN OR CONTRIBUTORS BE  LIABLE FOR ANY DIRECT, INDIRECT,    */ | ||||
| /*    INCIDENTAL,  SPECIAL, EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES    */ | ||||
| /*    (INCLUDING, BUT  NOT LIMITED TO,  PROCUREMENT OF SUBSTITUTE    */ | ||||
| /*    GOODS  OR  SERVICES; LOSS  OF  USE,  DATA,  OR PROFITS;  OR    */ | ||||
| /*    BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF    */ | ||||
| /*    LIABILITY, WHETHER  IN CONTRACT, STRICT  LIABILITY, OR TORT    */ | ||||
| /*    (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY OUT    */ | ||||
| /*    OF  THE  USE OF  THIS  SOFTWARE,  EVEN  IF ADVISED  OF  THE    */ | ||||
| /*    POSSIBILITY OF SUCH DAMAGE.                                    */ | ||||
| /*                                                                   */ | ||||
| /* The views and conclusions contained in the software and           */ | ||||
| /* documentation are those of the authors and should not be          */ | ||||
| /* interpreted as representing official policies, either expressed   */ | ||||
| /* or implied, of The University of Texas at Austin.                 */ | ||||
| /*********************************************************************/ | ||||
| 
 | ||||
| #include <stdio.h> | ||||
| #include "common.h" | ||||
| #include "arm_sve.h" | ||||
| 
 | ||||
| int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ | ||||
| 
 | ||||
|   BLASLONG i, ii, jj; | ||||
| 
 | ||||
|   FLOAT *ao; | ||||
| 
 | ||||
|   lda *= 2; | ||||
| 
 | ||||
|   jj = offset; | ||||
| #ifdef DOUBLE | ||||
|   int64_t js = 0; | ||||
|   svint64_t index = svindex_s64(0LL, lda); | ||||
|   svbool_t pn = svwhilelt_b64(js, n); | ||||
|   int n_active = svcntp_b64(svptrue_b64(), pn); | ||||
| #else | ||||
|   int32_t N = n; | ||||
|   int32_t js = 0; | ||||
|   svint32_t index = svindex_s32(0, lda); | ||||
|   svbool_t pn = svwhilelt_b32(js, N); | ||||
|   int n_active = svcntp_b32(svptrue_b32(), pn); | ||||
| #endif | ||||
|   do { | ||||
| 
 | ||||
|     ao = a; | ||||
| 
 | ||||
|     i = 0; | ||||
|     ii = 0; | ||||
|     do { | ||||
| 
 | ||||
|       if (ii == jj) { | ||||
|         for (int j = 0; j < n_active; j++) { | ||||
|           for (int k = 0; k < j; k++) { | ||||
|             *(b + 2*j * n_active + 2*k) = *(ao + k * lda + 2*j); | ||||
|             *(b + 2*j * n_active + 2*k + 1) = *(ao + k * lda + 2*j + 1); | ||||
|           } | ||||
|           compinv(b + 2*j * n_active + 2*j, *(ao + j * lda + 2*j), *(ao + j * lda + 2*j+1)); | ||||
|           //*(b + j * n_active + j) = INV(*(ao + j * lda + j));
 | ||||
|         } | ||||
|         ao += n_active * 2; | ||||
|         b += n_active * n_active * 2; | ||||
|         i += n_active; | ||||
|         ii += n_active; | ||||
|       } else { | ||||
|         if (ii > jj) { | ||||
| #ifdef DOUBLE | ||||
|           svfloat64_t aj_vec_real = svld1_gather_index(pn, ao, index); | ||||
|           svfloat64_t aj_vec_imag = svld1_gather_index(pn, ao+1, index); | ||||
| #else | ||||
|           svfloat32_t aj_vec_real = svld1_gather_index(pn, ao, index); | ||||
|           svfloat32_t aj_vec_imag = svld1_gather_index(pn, ao+1, index); | ||||
| #endif | ||||
|           svst2(pn, b, svcreate2(aj_vec_real, aj_vec_imag)); | ||||
|         } | ||||
|         ao += 2; | ||||
|         b += n_active * 2; | ||||
|         i++; | ||||
|         ii++; | ||||
|       } | ||||
|     } while (i < m); | ||||
| 
 | ||||
| 
 | ||||
|     a += n_active * lda; | ||||
|     jj += n_active; | ||||
| 
 | ||||
|     js += n_active; | ||||
| #ifdef DOUBLE | ||||
|     pn = svwhilelt_b64(js, n); | ||||
|     n_active = svcntp_b64(svptrue_b64(), pn); | ||||
|   } while (svptest_any(svptrue_b64(), pn)); | ||||
| #else | ||||
|     pn = svwhilelt_b32(js, N); | ||||
|     n_active = svcntp_b32(svptrue_b32(), pn); | ||||
|   } while (svptest_any(svptrue_b32(), pn)); | ||||
| #endif | ||||
| 
 | ||||
| return 0; | ||||
| } | ||||
|  | @ -0,0 +1,115 @@ | |||
| /*********************************************************************/ | ||||
| /* Copyright 2009, 2010 The University of Texas at Austin.           */ | ||||
| /* All rights reserved.                                              */ | ||||
| /*                                                                   */ | ||||
| /* Redistribution and use in source and binary forms, with or        */ | ||||
| /* without modification, are permitted provided that the following   */ | ||||
| /* conditions are met:                                               */ | ||||
| /*                                                                   */ | ||||
| /*   1. Redistributions of source code must retain the above         */ | ||||
| /*      copyright notice, this list of conditions and the following  */ | ||||
| /*      disclaimer.                                                  */ | ||||
| /*                                                                   */ | ||||
| /*   2. Redistributions in binary form must reproduce the above      */ | ||||
| /*      copyright notice, this list of conditions and the following  */ | ||||
| /*      disclaimer in the documentation and/or other materials       */ | ||||
| /*      provided with the distribution.                              */ | ||||
| /*                                                                   */ | ||||
| /*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */ | ||||
| /*    AUSTIN  ``AS IS''  AND ANY  EXPRESS OR  IMPLIED WARRANTIES,    */ | ||||
| /*    INCLUDING, BUT  NOT LIMITED  TO, THE IMPLIED  WARRANTIES OF    */ | ||||
| /*    MERCHANTABILITY  AND FITNESS FOR  A PARTICULAR  PURPOSE ARE    */ | ||||
| /*    DISCLAIMED.  IN  NO EVENT SHALL THE UNIVERSITY  OF TEXAS AT    */ | ||||
| /*    AUSTIN OR CONTRIBUTORS BE  LIABLE FOR ANY DIRECT, INDIRECT,    */ | ||||
| /*    INCIDENTAL,  SPECIAL, EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES    */ | ||||
| /*    (INCLUDING, BUT  NOT LIMITED TO,  PROCUREMENT OF SUBSTITUTE    */ | ||||
| /*    GOODS  OR  SERVICES; LOSS  OF  USE,  DATA,  OR PROFITS;  OR    */ | ||||
| /*    BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF    */ | ||||
| /*    LIABILITY, WHETHER  IN CONTRACT, STRICT  LIABILITY, OR TORT    */ | ||||
| /*    (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY OUT    */ | ||||
| /*    OF  THE  USE OF  THIS  SOFTWARE,  EVEN  IF ADVISED  OF  THE    */ | ||||
| /*    POSSIBILITY OF SUCH DAMAGE.                                    */ | ||||
| /*                                                                   */ | ||||
| /* The views and conclusions contained in the software and           */ | ||||
| /* documentation are those of the authors and should not be          */ | ||||
| /* interpreted as representing official policies, either expressed   */ | ||||
| /* or implied, of The University of Texas at Austin.                 */ | ||||
| /*********************************************************************/ | ||||
| 
 | ||||
| #include <stdio.h> | ||||
| #include "common.h" | ||||
| #include "arm_sve.h" | ||||
| 
 | ||||
| int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ | ||||
| 
 | ||||
|   BLASLONG i, ii, jj; | ||||
| 
 | ||||
|   FLOAT *ao; | ||||
| 
 | ||||
|   lda *= 2; | ||||
| 
 | ||||
|   jj = offset; | ||||
| #ifdef DOUBLE | ||||
|   int64_t js = 0; | ||||
|   svbool_t pn = svwhilelt_b64(js, n); | ||||
|   int n_active = svcntp_b64(svptrue_b64(), pn); | ||||
| #else | ||||
|   int32_t N = n; | ||||
|   int32_t js = 0; | ||||
|   svbool_t pn = svwhilelt_b32(js, N); | ||||
|   int n_active = svcntp_b32(svptrue_b32(), pn); | ||||
| #endif | ||||
|   do { | ||||
| 
 | ||||
|     ao = a; | ||||
| 
 | ||||
|     i = 0; | ||||
|     ii = 0; | ||||
|     do { | ||||
| 
 | ||||
|       if (ii == jj) { | ||||
|         for (int j = 0; j < n_active; j++) { | ||||
|           compinv(b + 2*j * n_active + 2*j, *(ao + j * lda + 2*j), *(ao + j * lda + 2*j+1)); | ||||
|           //*(b + j * n_active + j) = INV(*(ao + j * lda + j));
 | ||||
|           for (int k = j+1; k < n_active; k++) { | ||||
|             *(b + 2*j * n_active + 2*k) = *(ao + j * lda + 2*k); | ||||
|             *(b + 2*j * n_active + 2*k + 1) = *(ao + j * lda + 2*k + 1); | ||||
|           } | ||||
|         } | ||||
|         b += n_active * n_active * 2; | ||||
|         ao += lda * n_active; | ||||
|         i += n_active; | ||||
|         ii += n_active; | ||||
|       } else { | ||||
|         if (ii < jj) { | ||||
| #ifdef DOUBLE | ||||
|           svfloat64x2_t aj_vec = svld2(pn, ao); | ||||
| #else | ||||
|           svfloat32x2_t aj_vec = svld2(pn, ao); | ||||
| #endif | ||||
|           svst2(pn, b, aj_vec); | ||||
|         } | ||||
|         ao += lda; | ||||
|         b += n_active * 2; | ||||
|         i ++; | ||||
|         ii ++; | ||||
|       } | ||||
|     } while (i < m); | ||||
| 
 | ||||
| 
 | ||||
|     a += n_active * 2; | ||||
|     jj += n_active; | ||||
| 
 | ||||
|     js += n_active; | ||||
| #ifdef DOUBLE | ||||
|     pn = svwhilelt_b64(js, n); | ||||
|     n_active = svcntp_b64(svptrue_b64(), pn); | ||||
|   } while (svptest_any(svptrue_b64(), pn)); | ||||
| #else | ||||
|     pn = svwhilelt_b32(js, N); | ||||
|     n_active = svcntp_b32(svptrue_b32(), pn); | ||||
|   } while (svptest_any(svptrue_b32(), pn)); | ||||
| #endif | ||||
| 
 | ||||
| return 0; | ||||
| } | ||||
|  | @ -0,0 +1,119 @@ | |||
| /*********************************************************************/ | ||||
| /* Copyright 2009, 2010 The University of Texas at Austin.           */ | ||||
| /* All rights reserved.                                              */ | ||||
| /*                                                                   */ | ||||
| /* Redistribution and use in source and binary forms, with or        */ | ||||
| /* without modification, are permitted provided that the following   */ | ||||
| /* conditions are met:                                               */ | ||||
| /*                                                                   */ | ||||
| /*   1. Redistributions of source code must retain the above         */ | ||||
| /*      copyright notice, this list of conditions and the following  */ | ||||
| /*      disclaimer.                                                  */ | ||||
| /*                                                                   */ | ||||
| /*   2. Redistributions in binary form must reproduce the above      */ | ||||
| /*      copyright notice, this list of conditions and the following  */ | ||||
| /*      disclaimer in the documentation and/or other materials       */ | ||||
| /*      provided with the distribution.                              */ | ||||
| /*                                                                   */ | ||||
| /*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */ | ||||
| /*    AUSTIN  ``AS IS''  AND ANY  EXPRESS OR  IMPLIED WARRANTIES,    */ | ||||
| /*    INCLUDING, BUT  NOT LIMITED  TO, THE IMPLIED  WARRANTIES OF    */ | ||||
| /*    MERCHANTABILITY  AND FITNESS FOR  A PARTICULAR  PURPOSE ARE    */ | ||||
| /*    DISCLAIMED.  IN  NO EVENT SHALL THE UNIVERSITY  OF TEXAS AT    */ | ||||
| /*    AUSTIN OR CONTRIBUTORS BE  LIABLE FOR ANY DIRECT, INDIRECT,    */ | ||||
| /*    INCIDENTAL,  SPECIAL, EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES    */ | ||||
| /*    (INCLUDING, BUT  NOT LIMITED TO,  PROCUREMENT OF SUBSTITUTE    */ | ||||
| /*    GOODS  OR  SERVICES; LOSS  OF  USE,  DATA,  OR PROFITS;  OR    */ | ||||
| /*    BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF    */ | ||||
| /*    LIABILITY, WHETHER  IN CONTRACT, STRICT  LIABILITY, OR TORT    */ | ||||
| /*    (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY OUT    */ | ||||
| /*    OF  THE  USE OF  THIS  SOFTWARE,  EVEN  IF ADVISED  OF  THE    */ | ||||
| /*    POSSIBILITY OF SUCH DAMAGE.                                    */ | ||||
| /*                                                                   */ | ||||
| /* The views and conclusions contained in the software and           */ | ||||
| /* documentation are those of the authors and should not be          */ | ||||
| /* interpreted as representing official policies, either expressed   */ | ||||
| /* or implied, of The University of Texas at Austin.                 */ | ||||
| /*********************************************************************/ | ||||
| 
 | ||||
| #include <stdio.h> | ||||
| #include "common.h" | ||||
| #include "arm_sve.h" | ||||
| 
 | ||||
| int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ | ||||
| 
 | ||||
|   BLASLONG i, ii, jj; | ||||
| 
 | ||||
|   FLOAT *ao; | ||||
| 
 | ||||
|   lda *= 2; | ||||
| 
 | ||||
|   jj = offset; | ||||
| #ifdef DOUBLE | ||||
|   int64_t js = 0; | ||||
|   svint64_t index = svindex_s64(0LL, lda); | ||||
|   svbool_t pn = svwhilelt_b64(js, n); | ||||
|   int n_active = svcntp_b64(svptrue_b64(), pn); | ||||
| #else | ||||
|   int32_t N = n; | ||||
|   int32_t js = 0; | ||||
|   svint32_t index = svindex_s32(0, lda); | ||||
|   svbool_t pn = svwhilelt_b32(js, N); | ||||
|   int n_active = svcntp_b32(svptrue_b32(), pn); | ||||
| #endif | ||||
|   do { | ||||
| 
 | ||||
|     ao = a; | ||||
| 
 | ||||
|     i = 0; | ||||
|     ii = 0; | ||||
|     do { | ||||
| 
 | ||||
|       if (ii == jj) { | ||||
|         for (int j = 0; j < n_active; j++) { | ||||
|           compinv(b + 2*j * n_active + 2*j, *(ao + j * lda + 2*j), *(ao + j * lda + 2*j+1)); | ||||
|           //*(b + j * n_active + j) = INV(*(ao + j * lda + j));
 | ||||
|           for (int k = j+1; k < n_active; k++) { | ||||
|             *(b + 2*j * n_active + 2*k) = *(ao + k * lda + 2*j); | ||||
|             *(b + 2*j * n_active + 2*k + 1) = *(ao + k * lda + 2*j + 1); | ||||
|           } | ||||
|         } | ||||
|         ao += n_active * 2; | ||||
|         b += n_active * n_active * 2; | ||||
|         i += n_active; | ||||
|         ii += n_active; | ||||
|       } else { | ||||
|         if (ii < jj) { | ||||
| #ifdef DOUBLE | ||||
|           svfloat64_t aj_vec_real = svld1_gather_index(pn, ao, index); | ||||
|           svfloat64_t aj_vec_imag = svld1_gather_index(pn, ao+1, index); | ||||
| #else | ||||
|           svfloat32_t aj_vec_real = svld1_gather_index(pn, ao, index); | ||||
|           svfloat32_t aj_vec_imag = svld1_gather_index(pn, ao+1, index); | ||||
| #endif | ||||
|           svst2(pn, b, svcreate2(aj_vec_real, aj_vec_imag)); | ||||
|         } | ||||
|         ao += 2; | ||||
|         b += n_active * 2; | ||||
|         i++; | ||||
|         ii++; | ||||
|       } | ||||
|     } while (i < m); | ||||
| 
 | ||||
| 
 | ||||
|     a += n_active * lda; | ||||
|     jj += n_active; | ||||
| 
 | ||||
|     js += n_active; | ||||
| #ifdef DOUBLE | ||||
|     pn = svwhilelt_b64(js, n); | ||||
|     n_active = svcntp_b64(svptrue_b64(), pn); | ||||
|   } while (svptest_any(svptrue_b64(), pn)); | ||||
| #else | ||||
|     pn = svwhilelt_b32(js, N); | ||||
|     n_active = svcntp_b32(svptrue_b32(), pn); | ||||
|   } while (svptest_any(svptrue_b32(), pn)); | ||||
| #endif | ||||
| 
 | ||||
| return 0; | ||||
| } | ||||
|  | @ -0,0 +1,115 @@ | |||
| /*********************************************************************/ | ||||
| /* Copyright 2009, 2010 The University of Texas at Austin.           */ | ||||
| /* All rights reserved.                                              */ | ||||
| /*                                                                   */ | ||||
| /* Redistribution and use in source and binary forms, with or        */ | ||||
| /* without modification, are permitted provided that the following   */ | ||||
| /* conditions are met:                                               */ | ||||
| /*                                                                   */ | ||||
| /*   1. Redistributions of source code must retain the above         */ | ||||
| /*      copyright notice, this list of conditions and the following  */ | ||||
| /*      disclaimer.                                                  */ | ||||
| /*                                                                   */ | ||||
| /*   2. Redistributions in binary form must reproduce the above      */ | ||||
| /*      copyright notice, this list of conditions and the following  */ | ||||
| /*      disclaimer in the documentation and/or other materials       */ | ||||
| /*      provided with the distribution.                              */ | ||||
| /*                                                                   */ | ||||
| /*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */ | ||||
| /*    AUSTIN  ``AS IS''  AND ANY  EXPRESS OR  IMPLIED WARRANTIES,    */ | ||||
| /*    INCLUDING, BUT  NOT LIMITED  TO, THE IMPLIED  WARRANTIES OF    */ | ||||
| /*    MERCHANTABILITY  AND FITNESS FOR  A PARTICULAR  PURPOSE ARE    */ | ||||
| /*    DISCLAIMED.  IN  NO EVENT SHALL THE UNIVERSITY  OF TEXAS AT    */ | ||||
| /*    AUSTIN OR CONTRIBUTORS BE  LIABLE FOR ANY DIRECT, INDIRECT,    */ | ||||
| /*    INCIDENTAL,  SPECIAL, EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES    */ | ||||
| /*    (INCLUDING, BUT  NOT LIMITED TO,  PROCUREMENT OF SUBSTITUTE    */ | ||||
| /*    GOODS  OR  SERVICES; LOSS  OF  USE,  DATA,  OR PROFITS;  OR    */ | ||||
| /*    BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF    */ | ||||
| /*    LIABILITY, WHETHER  IN CONTRACT, STRICT  LIABILITY, OR TORT    */ | ||||
| /*    (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY OUT    */ | ||||
| /*    OF  THE  USE OF  THIS  SOFTWARE,  EVEN  IF ADVISED  OF  THE    */ | ||||
| /*    POSSIBILITY OF SUCH DAMAGE.                                    */ | ||||
| /*                                                                   */ | ||||
| /* The views and conclusions contained in the software and           */ | ||||
| /* documentation are those of the authors and should not be          */ | ||||
| /* interpreted as representing official policies, either expressed   */ | ||||
| /* or implied, of The University of Texas at Austin.                 */ | ||||
| /*********************************************************************/ | ||||
| 
 | ||||
| #include <stdio.h> | ||||
| #include "common.h" | ||||
| #include "arm_sve.h" | ||||
| 
 | ||||
| int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ | ||||
| 
 | ||||
|   BLASLONG i, ii, jj; | ||||
| 
 | ||||
|   FLOAT *ao; | ||||
| 
 | ||||
|   lda *= 2; | ||||
| 
 | ||||
|   jj = offset; | ||||
| #ifdef DOUBLE | ||||
|   int64_t js = 0; | ||||
|   svbool_t pn = svwhilelt_b64(js, n); | ||||
|   int n_active = svcntp_b64(svptrue_b64(), pn); | ||||
| #else | ||||
|   int32_t N = n; | ||||
|   int32_t js = 0; | ||||
|   svbool_t pn = svwhilelt_b32(js, N); | ||||
|   int n_active = svcntp_b32(svptrue_b32(), pn); | ||||
| #endif | ||||
|   do { | ||||
| 
 | ||||
|     ao = a; | ||||
| 
 | ||||
|     i = 0; | ||||
|     ii = 0; | ||||
|     do { | ||||
| 
 | ||||
|       if (ii == jj) { | ||||
|         for (int j = 0; j < n_active; j++) { | ||||
|           for (int k = 0; k < j; k++) { | ||||
|             *(b + 2*j * n_active + 2*k) = *(ao + j * lda + 2*k); | ||||
|             *(b + 2*j * n_active + 2*k + 1) = *(ao + j * lda + 2*k + 1); | ||||
|           } | ||||
|           compinv(b + 2*j * n_active + 2*j, *(ao + j * lda + 2*j), *(ao + j * lda + 2*j+1)); | ||||
|           //*(b + j * n_active + j) = INV(*(ao + j * lda + j));
 | ||||
|         } | ||||
|         ao += lda * n_active; | ||||
|         b += n_active * n_active * 2; | ||||
|         i += n_active; | ||||
|         ii += n_active; | ||||
|       } else { | ||||
|         if (ii > jj) { | ||||
| #ifdef DOUBLE | ||||
|           svfloat64x2_t aj_vec = svld2(pn, ao); | ||||
| #else | ||||
|           svfloat32x2_t aj_vec = svld2(pn, ao); | ||||
| #endif | ||||
|           svst2(pn, b, aj_vec); | ||||
|         } | ||||
|         ao += lda; | ||||
|         b += n_active * 2; | ||||
|         i ++; | ||||
|         ii ++; | ||||
|       }  | ||||
|     } while (i < m); | ||||
| 
 | ||||
| 
 | ||||
|     a += n_active * 2; | ||||
|     jj += n_active; | ||||
| 
 | ||||
|     js += n_active; | ||||
| #ifdef DOUBLE | ||||
|     pn = svwhilelt_b64(js, n); | ||||
|     n_active = svcntp_b64(svptrue_b64(), pn); | ||||
|   } while (svptest_any(svptrue_b64(), pn)); | ||||
| #else | ||||
|     pn = svwhilelt_b32(js, N); | ||||
|     n_active = svcntp_b32(svptrue_b32(), pn); | ||||
|   } while (svptest_any(svptrue_b32(), pn)); | ||||
| #endif | ||||
| 
 | ||||
| return 0; | ||||
| } | ||||
|  | @ -0,0 +1,149 @@ | |||
# Kernel source selection.
# Every variable names the source file (relative to this directory) that is
# built for the corresponding kernel.  All entries here are plain C files
# taken from ../arm or ../generic; GEMM and TRMM use the generic 2x2 kernels.
# Comments are kept on their own lines so no value picks up trailing blanks.

# --- ?AMAX / ?AMIN ---
SAMAXKERNEL = ../arm/amax.c
DAMAXKERNEL = ../arm/amax.c
CAMAXKERNEL = ../arm/zamax.c
ZAMAXKERNEL = ../arm/zamax.c

SAMINKERNEL = ../arm/amin.c
DAMINKERNEL = ../arm/amin.c
CAMINKERNEL = ../arm/zamin.c
ZAMINKERNEL = ../arm/zamin.c

# --- ?MAX / ?MIN ---
SMAXKERNEL = ../arm/max.c
DMAXKERNEL = ../arm/max.c

SMINKERNEL = ../arm/min.c
DMINKERNEL = ../arm/min.c

# --- I?AMAX / I?AMIN ---
ISAMAXKERNEL = ../arm/iamax.c
IDAMAXKERNEL = ../arm/iamax.c
ICAMAXKERNEL = ../arm/izamax.c
IZAMAXKERNEL = ../arm/izamax.c

ISAMINKERNEL = ../arm/iamin.c
IDAMINKERNEL = ../arm/iamin.c
ICAMINKERNEL = ../arm/izamin.c
IZAMINKERNEL = ../arm/izamin.c

# --- I?MAX / I?MIN ---
ISMAXKERNEL = ../arm/imax.c
IDMAXKERNEL = ../arm/imax.c

ISMINKERNEL = ../arm/imin.c
IDMINKERNEL = ../arm/imin.c

# --- ?ASUM / ?SUM ---
SASUMKERNEL = ../arm/asum.c
DASUMKERNEL = ../arm/asum.c
CASUMKERNEL = ../arm/zasum.c
ZASUMKERNEL = ../arm/zasum.c

SSUMKERNEL = ../arm/sum.c
DSUMKERNEL = ../arm/sum.c
CSUMKERNEL = ../arm/zsum.c
ZSUMKERNEL = ../arm/zsum.c

# --- ?AXPY ---
SAXPYKERNEL = ../arm/axpy.c
DAXPYKERNEL = ../arm/axpy.c
CAXPYKERNEL = ../arm/zaxpy.c
ZAXPYKERNEL = ../arm/zaxpy.c

# --- ?COPY ---
SCOPYKERNEL = ../arm/copy.c
DCOPYKERNEL = ../arm/copy.c
CCOPYKERNEL = ../arm/zcopy.c
ZCOPYKERNEL = ../arm/zcopy.c

# --- ?DOT (DSDOT comes from ../generic) ---
SDOTKERNEL  = ../arm/dot.c
DDOTKERNEL  = ../arm/dot.c
CDOTKERNEL  = ../arm/zdot.c
ZDOTKERNEL  = ../arm/zdot.c
DSDOTKERNEL = ../generic/dot.c

# --- ?NRM2 ---
SNRM2KERNEL = ../arm/nrm2.c
DNRM2KERNEL = ../arm/nrm2.c
CNRM2KERNEL = ../arm/znrm2.c
ZNRM2KERNEL = ../arm/znrm2.c

# --- ?ROT ---
SROTKERNEL = ../arm/rot.c
DROTKERNEL = ../arm/rot.c
CROTKERNEL = ../arm/zrot.c
ZROTKERNEL = ../arm/zrot.c

# --- ?SCAL ---
SSCALKERNEL = ../arm/scal.c
DSCALKERNEL = ../arm/scal.c
CSCALKERNEL = ../arm/zscal.c
ZSCALKERNEL = ../arm/zscal.c

# --- ?SWAP ---
SSWAPKERNEL = ../arm/swap.c
DSWAPKERNEL = ../arm/swap.c
CSWAPKERNEL = ../arm/zswap.c
ZSWAPKERNEL = ../arm/zswap.c

# --- ?GEMV (N and T variants) ---
SGEMVNKERNEL = ../arm/gemv_n.c
DGEMVNKERNEL = ../arm/gemv_n.c
CGEMVNKERNEL = ../arm/zgemv_n.c
ZGEMVNKERNEL = ../arm/zgemv_n.c

SGEMVTKERNEL = ../arm/gemv_t.c
DGEMVTKERNEL = ../arm/gemv_t.c
CGEMVTKERNEL = ../arm/zgemv_t.c
ZGEMVTKERNEL = ../arm/zgemv_t.c

# --- ?TRMM ---
STRMMKERNEL = ../generic/trmmkernel_2x2.c
DTRMMKERNEL = ../generic/trmmkernel_2x2.c
CTRMMKERNEL = ../generic/ztrmmkernel_2x2.c
ZTRMMKERNEL = ../generic/ztrmmkernel_2x2.c

# --- ?GEMM: kernel, copy sources and the object names of the copy routines ---
SGEMMKERNEL    = ../generic/gemmkernel_2x2.c
SGEMMONCOPY    = ../generic/gemm_ncopy_2.c
SGEMMOTCOPY    = ../generic/gemm_tcopy_2.c
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)

DGEMMKERNEL    = ../generic/gemmkernel_2x2.c
DGEMMONCOPY    = ../generic/gemm_ncopy_2.c
DGEMMOTCOPY    = ../generic/gemm_tcopy_2.c
DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)

CGEMMKERNEL    = ../generic/zgemmkernel_2x2.c
CGEMMONCOPY    = ../generic/zgemm_ncopy_2.c
CGEMMOTCOPY    = ../generic/zgemm_tcopy_2.c
CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX)
CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX)

ZGEMMKERNEL    = ../generic/zgemmkernel_2x2.c
ZGEMMONCOPY    = ../generic/zgemm_ncopy_2.c
ZGEMMOTCOPY    = ../generic/zgemm_tcopy_2.c
ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX)
ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX)

# --- ?TRSM: the same four generic sources serve every precision ---
STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c

DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c

CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c

ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c

# --- Miscellaneous helpers ---
SCABS_KERNEL = ../generic/cabs.c
DCABS_KERNEL = ../generic/cabs.c
QCABS_KERNEL = ../generic/cabs.c
LSAME_KERNEL = ../generic/lsame.c

# --- ?GEMM beta ---
SGEMM_BETA = ../generic/gemm_beta.c
DGEMM_BETA = ../generic/gemm_beta.c
CGEMM_BETA = ../generic/zgemm_beta.c
ZGEMM_BETA = ../generic/zgemm_beta.c
| 
 | ||||
| 
 | ||||
|  | @ -0,0 +1 @@ | |||
# Double-colon rule with neither prerequisites nor recipe: it adds nothing of
# its own to `clean`, and other double-colon `clean` rules may coexist with it.
clean::
|  | @ -1 +1,14 @@ | |||
#TODO: Add loongarch64 SIMD optimizations

# DGEMM: 16x4 assembly kernel with its inner (16-wide) and outer (4-wide)
# copy routines.
DGEMMKERNEL    = dgemm_kernel_16x4.S
DGEMMINCOPY    = dgemm_ncopy_16.S
DGEMMITCOPY    = dgemm_tcopy_16.S
DGEMMONCOPY    = dgemm_ncopy_4.S
DGEMMOTCOPY    = dgemm_tcopy_4.S

# Object names follow $(TSUFFIX).$(SUFFIX) rather than a hard-coded ".o", so
# builds that set a target suffix or a different object suffix still get
# matching names.  With TSUFFIX empty and SUFFIX = o the names are unchanged.
DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX)
DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX)
DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)

# DTRSM uses the generic C kernels.
DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|  |  | |||
										
											
												File diff suppressed because it is too large
												Load Diff
											
										
									
								
							|  | @ -0,0 +1,691 @@ | |||
| /******************************************************************************* | ||||
| Copyright (c) 2021, The OpenBLAS Project | ||||
| All rights reserved. | ||||
| Redistribution and use in source and binary forms, with or without | ||||
| modification, are permitted provided that the following conditions are | ||||
| met: | ||||
| 1. Redistributions of source code must retain the above copyright | ||||
| notice, this list of conditions and the following disclaimer. | ||||
| 2. Redistributions in binary form must reproduce the above copyright | ||||
| notice, this list of conditions and the following disclaimer in | ||||
| the documentation and/or other materials provided with the | ||||
| distribution. | ||||
| 3. Neither the name of the OpenBLAS project nor the names of | ||||
| its contributors may be used to endorse or promote products | ||||
| derived from this software without specific prior written permission. | ||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | ||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | ||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | ||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | ||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | ||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | ||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 | ||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | ||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | ||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | ||||
| *******************************************************************************/ | ||||
| #define ASSEMBLER | ||||
| 
 | ||||
| #include "common.h" | ||||
| 
 | ||||
| /* Function parameters */ | ||||
| #define M      $r4    // param 1: m | ||||
| #define N      $r5    // param 2: n | ||||
| #define SRC    $r6    // param 3: src | ||||
| #define LDA    $r7    // param 4: lda | ||||
| #define DST    $r8    // param 5: dst | ||||
| 
 | ||||
| /* I, J: loop counters. S1..S16: one source pointer per LDA-strided stream. */ | ||||
| /* S10..S16, TD and TS live in $r23..$r31, which the prologue spills and */ | ||||
| /* the epilogue reloads. */ | ||||
| #define I      $r9 | ||||
| #define J      $r10 | ||||
| #define S1     $r12 | ||||
| #define S2     $r13 | ||||
| #define S3     $r14 | ||||
| #define S4     $r15 | ||||
| #define S5     $r16 | ||||
| #define S6     $r17 | ||||
| #define S7     $r18 | ||||
| #define S8     $r19 | ||||
| #define S9     $r20 | ||||
| #define S10    $r23 | ||||
| #define S11    $r24 | ||||
| #define S12    $r25 | ||||
| #define S13    $r26 | ||||
| #define S14    $r27 | ||||
| #define S15    $r28 | ||||
| #define S16    $r29 | ||||
| #define TD     $r30 | ||||
| #define TS     $r31 | ||||
| /* TL and T0 reuse the LDA and SRC argument registers: the setup code copies */ | ||||
| /* SRC into TS first, then computes TL = LDA << 3 and T0 = TL << 1. */ | ||||
| #define TL     $r7 | ||||
| #define T0     $r6 | ||||
| #define ZERO   $r0 | ||||
| 
 | ||||
| /* Scalar FP registers used by the row-remainder (fld.d/fst.d) loops. */ | ||||
| #define F0     $f0 | ||||
| #define F1     $f1 | ||||
| #define F2     $f2 | ||||
| #define F3     $f3 | ||||
| #define F4     $f4 | ||||
| #define F5     $f5 | ||||
| #define F6     $f6 | ||||
| #define F7     $f7 | ||||
| /* LASX vectors */ | ||||
| /* U0..U15 receive the xvld results (and temporary copies of D registers); */ | ||||
| /* D0..D15 receive the pack/permute results that are written with xvst. */ | ||||
| #define U0     $xr0 | ||||
| #define U1     $xr1 | ||||
| #define U2     $xr2 | ||||
| #define U3     $xr3 | ||||
| #define U4     $xr4 | ||||
| #define U5     $xr5 | ||||
| #define U6     $xr6 | ||||
| #define U7     $xr7 | ||||
| #define U8     $xr8 | ||||
| #define U9     $xr9 | ||||
| #define U10    $xr10 | ||||
| #define U11    $xr11 | ||||
| #define U12    $xr12 | ||||
| #define U13    $xr13 | ||||
| #define U14    $xr14 | ||||
| #define U15    $xr15 | ||||
| #define D0     $xr16 | ||||
| #define D1     $xr17 | ||||
| #define D2     $xr18 | ||||
| #define D3     $xr19 | ||||
| #define D4     $xr20 | ||||
| #define D5     $xr21 | ||||
| #define D6     $xr22 | ||||
| #define D7     $xr23 | ||||
| #define D8     $xr24 | ||||
| #define D9     $xr25 | ||||
| #define D10    $xr26 | ||||
| #define D11    $xr27 | ||||
| #define D12    $xr28 | ||||
| #define D13    $xr29 | ||||
| #define D14    $xr30 | ||||
| #define D15    $xr31 | ||||
| 
 | ||||
|     PROLOGUE | ||||
| 
 | ||||
|     /* Reserve a 0x90-byte frame and spill $r23..$r31 and $f23..$f31; */ | ||||
|     /* .L_N0 reloads them from the same offsets before returning. */ | ||||
|     addi.d     $sp,  $sp,  -0x90 | ||||
|     SDARG      $r23, $sp,  0x00 | ||||
|     SDARG      $r24, $sp,  0x08 | ||||
|     SDARG      $r25, $sp,  0x10 | ||||
|     SDARG      $r26, $sp,  0x18 | ||||
|     SDARG      $r27, $sp,  0x20 | ||||
|     SDARG      $r28, $sp,  0x28 | ||||
|     SDARG      $r29, $sp,  0x30 | ||||
|     SDARG      $r30, $sp,  0x38 | ||||
|     SDARG      $r31, $sp,  0x40 | ||||
|     ST         $f23, $sp,  0x48 | ||||
|     ST         $f24, $sp,  0x50 | ||||
|     ST         $f25, $sp,  0x58 | ||||
|     ST         $f26, $sp,  0x60 | ||||
|     ST         $f27, $sp,  0x68 | ||||
|     ST         $f28, $sp,  0x70 | ||||
|     ST         $f29, $sp,  0x78 | ||||
|     ST         $f30, $sp,  0x80 | ||||
|     ST         $f31, $sp,  0x88 | ||||
| 
 | ||||
|     /* TD / TS: running destination and source pointers. */ | ||||
|     move       TD,   DST | ||||
|     move       TS,   SRC | ||||
|     /* TL = LDA * 8: byte distance between adjacent streams (8-byte elements). */ | ||||
|     /* T0 = 2 * TL. Both overwrite argument registers that are no longer needed. */ | ||||
|     slli.d     TL,   LDA,  0x03 | ||||
|     slli.d     T0,   TL,   0x01 | ||||
|     /* J = N >> 4: number of full 16-stream groups; none -> go to the N & 8 tail. */ | ||||
|     srai.d     J,    N,    0x04 | ||||
|     beq        J,    ZERO, .L_N8 | ||||
| 
 | ||||
| /* Outer loop over groups of 16 streams. Builds S1..S16 so that S(k+1) = S(k) + TL */ | ||||
| /* (odd/even chains stepped by T0 = 2*TL), advances TS by 16*TL for the next */ | ||||
| /* group, decrements J, and sets I = M >> 3 for the 8-row vector loop. */ | ||||
| .L_J1: /* J-- */ | ||||
|     move       S1,   TS | ||||
|     add.d      S2,   TS,   TL | ||||
|     srai.d     I,    M,    0x03 | ||||
|     add.d      S3,   S2,   TL | ||||
|     addi.d     J,    J,    -1 | ||||
|     add.d      S4,   S3,   TL | ||||
|     add.d      S5,   S3,   T0 | ||||
|     add.d      S6,   S4,   T0 | ||||
|     add.d      S7,   S5,   T0 | ||||
|     add.d      S8,   S6,   T0 | ||||
|     add.d      S9,   S7,   T0 | ||||
|     add.d      S10,  S8,   T0 | ||||
|     add.d      S11,  S9,   T0 | ||||
|     add.d      S12,  S10,  T0 | ||||
|     add.d      S13,  S11,  T0 | ||||
|     add.d      S14,  S12,  T0 | ||||
|     add.d      S15,  S13,  T0 | ||||
|     add.d      S16,  S14,  T0 | ||||
|     add.d      TS,   S15,  T0 | ||||
|     /* Fewer than 8 rows: skip the vector loop and use the scalar remainder. */ | ||||
|     beq        I,    ZERO, .L_I7 | ||||
| 
 | ||||
| /* Vector loop: each iteration consumes 8 elements (0x40 bytes) from each of the */ | ||||
| /* 16 streams and emits 0x400 bytes to TD, in two identical halves that read */ | ||||
| /* offsets 0x00 and 0x20 of every stream. Within a half, the even/odd packs */ | ||||
| /* interleave stream pairs, and the xvpermi.q steps recombine 128-bit halves; */ | ||||
| /* the trailing "// n" comments give the 32-byte output slot each D register */ | ||||
| /* is stored to. xvand.v Ux, Dx, Dx is a register copy that preserves Dx */ | ||||
| /* before xvpermi.q overwrites it. */ | ||||
| .L_I1: /* I-- */ | ||||
|     xvld       U0,   S1,   0x00 | ||||
|     xvld       U1,   S2,   0x00 | ||||
|     xvld       U2,   S3,   0x00 | ||||
|     xvld       U3,   S4,   0x00 | ||||
|     xvld       U4,   S5,   0x00 | ||||
|     xvld       U5,   S6,   0x00 | ||||
|     xvld       U6,   S7,   0x00 | ||||
|     xvld       U7,   S8,   0x00 | ||||
|     xvld       U8,   S9,   0x00 | ||||
|     xvld       U9,   S10,  0x00 | ||||
|     xvld       U10,  S11,  0x00 | ||||
|     xvld       U11,  S12,  0x00 | ||||
|     xvld       U12,  S13,  0x00 | ||||
|     xvld       U13,  S14,  0x00 | ||||
|     xvld       U14,  S15,  0x00 | ||||
|     xvld       U15,  S16,  0x00 | ||||
| 
 | ||||
|     xvpackev.d D0,   U1,   U0 | ||||
|     xvpackod.d D1,   U1,   U0 | ||||
|     xvpackev.d D2,   U3,   U2 | ||||
|     xvpackod.d D3,   U3,   U2 | ||||
|     xvpackev.d D4,   U5,   U4 | ||||
|     xvpackod.d D5,   U5,   U4 | ||||
|     xvpackev.d D6,   U7,   U6 | ||||
|     xvpackod.d D7,   U7,   U6 | ||||
| 
 | ||||
|     xvpackev.d D8,   U9,   U8 | ||||
|     xvpackod.d D9,   U9,   U8 | ||||
|     xvpackev.d D10,  U11,  U10 | ||||
|     xvpackod.d D11,  U11,  U10 | ||||
|     xvpackev.d D12,  U13,  U12 | ||||
|     xvpackod.d D13,  U13,  U12 | ||||
|     xvpackev.d D14,  U15,  U14 | ||||
|     xvpackod.d D15,  U15,  U14 | ||||
| 
 | ||||
|     xvand.v    U0,   D0,   D0 | ||||
|     xvpermi.q  D0,   D2,   0x02  // 0 | ||||
|     xvand.v    U4,   D4,   D4 | ||||
|     xvpermi.q  D4,   D6,   0x02  // 1 | ||||
|     xvand.v    U1,   D1,   D1 | ||||
|     xvpermi.q  D1,   D3,   0x02  // 4 | ||||
|     xvand.v    U5,   D5,   D5 | ||||
|     xvpermi.q  D5,   D7,   0x02  // 5 | ||||
|     xvpermi.q  D2,   U0,   0x31  // 8 | ||||
|     xvpermi.q  D6,   U4,   0x31  // 9 | ||||
|     xvpermi.q  D3,   U1,   0x31  // 12 | ||||
|     xvpermi.q  D7,   U5,   0x31  // 13 | ||||
| 
 | ||||
|     xvand.v    U8,   D8,   D8 | ||||
|     xvpermi.q  D8,   D10,  0x02  // 2 | ||||
|     xvand.v    U12,  D12,  D12 | ||||
|     xvpermi.q  D12,  D14,  0x02  // 3 | ||||
|     xvand.v    U9,   D9,   D9 | ||||
|     xvpermi.q  D9,   D11,  0x02  // 6 | ||||
|     xvand.v    U13,  D13,  D13 | ||||
|     xvpermi.q  D13,  D15,  0x02  // 7 | ||||
|     xvpermi.q  D10,  U8,   0x31  // 10 | ||||
|     xvpermi.q  D14,  U12,  0x31  // 11 | ||||
|     xvpermi.q  D11,  U9,   0x31  // 14 | ||||
|     xvpermi.q  D15,  U13,  0x31  // 15 | ||||
| 
 | ||||
|     /* Slots 0-7 go to the first 0x100 bytes, slots 8-15 to the next 0x100. */ | ||||
|     xvst       D0,   TD,   0x00  // 0 | ||||
|     xvst       D4,   TD,   0x20  // 1 | ||||
|     xvst       D8,   TD,   0x40  // 2 | ||||
|     xvst       D12,  TD,   0x60  // 3 | ||||
|     xvst       D1,   TD,   0x80  // 4 | ||||
|     xvst       D5,   TD,   0xA0  // 5 | ||||
|     xvst       D9,   TD,   0xC0  // 6 | ||||
|     xvst       D13,  TD,   0xE0  // 7 | ||||
|     addi.d     TD,   TD,   0x100 | ||||
|     xvst       D2,   TD,   0x00  // 8 | ||||
|     xvst       D6,   TD,   0x20  // 9 | ||||
|     xvst       D10,  TD,   0x40  // 10 | ||||
|     xvst       D14,  TD,   0x60  // 11 | ||||
|     xvst       D3,   TD,   0x80  // 12 | ||||
|     xvst       D7,   TD,   0xA0  // 13 | ||||
|     xvst       D11,  TD,   0xC0  // 14 | ||||
|     xvst       D15,  TD,   0xE0  // 15 | ||||
|     addi.d     TD,   TD,   0x100 | ||||
| 
 | ||||
|     /* Second half: same sequence on the next 32 bytes of every stream. */ | ||||
|     xvld       U0,   S1,   0x20 | ||||
|     xvld       U1,   S2,   0x20 | ||||
|     xvld       U2,   S3,   0x20 | ||||
|     xvld       U3,   S4,   0x20 | ||||
|     xvld       U4,   S5,   0x20 | ||||
|     xvld       U5,   S6,   0x20 | ||||
|     xvld       U6,   S7,   0x20 | ||||
|     xvld       U7,   S8,   0x20 | ||||
|     xvld       U8,   S9,   0x20 | ||||
|     xvld       U9,   S10,  0x20 | ||||
|     xvld       U10,  S11,  0x20 | ||||
|     xvld       U11,  S12,  0x20 | ||||
|     xvld       U12,  S13,  0x20 | ||||
|     xvld       U13,  S14,  0x20 | ||||
|     xvld       U14,  S15,  0x20 | ||||
|     xvld       U15,  S16,  0x20 | ||||
| 
 | ||||
|     xvpackev.d D0,   U1,   U0 | ||||
|     xvpackod.d D1,   U1,   U0 | ||||
|     xvpackev.d D2,   U3,   U2 | ||||
|     xvpackod.d D3,   U3,   U2 | ||||
|     xvpackev.d D4,   U5,   U4 | ||||
|     xvpackod.d D5,   U5,   U4 | ||||
|     xvpackev.d D6,   U7,   U6 | ||||
|     xvpackod.d D7,   U7,   U6 | ||||
| 
 | ||||
|     xvpackev.d D8,   U9,   U8 | ||||
|     xvpackod.d D9,   U9,   U8 | ||||
|     xvpackev.d D10,  U11,  U10 | ||||
|     xvpackod.d D11,  U11,  U10 | ||||
|     xvpackev.d D12,  U13,  U12 | ||||
|     xvpackod.d D13,  U13,  U12 | ||||
|     xvpackev.d D14,  U15,  U14 | ||||
|     xvpackod.d D15,  U15,  U14 | ||||
| 
 | ||||
|     xvand.v    U0,   D0,   D0 | ||||
|     xvpermi.q  D0,   D2,   0x02  // 0 | ||||
|     xvand.v    U4,   D4,   D4 | ||||
|     xvpermi.q  D4,   D6,   0x02  // 1 | ||||
|     xvand.v    U1,   D1,   D1 | ||||
|     xvpermi.q  D1,   D3,   0x02  // 4 | ||||
|     xvand.v    U5,   D5,   D5 | ||||
|     xvpermi.q  D5,   D7,   0x02  // 5 | ||||
|     xvpermi.q  D2,   U0,   0x31  // 8 | ||||
|     xvpermi.q  D6,   U4,   0x31  // 9 | ||||
|     xvpermi.q  D3,   U1,   0x31  // 12 | ||||
|     xvpermi.q  D7,   U5,   0x31  // 13 | ||||
| 
 | ||||
|     xvand.v    U8,   D8,   D8 | ||||
|     xvpermi.q  D8,   D10,  0x02  // 2 | ||||
|     xvand.v    U12,  D12,  D12 | ||||
|     xvpermi.q  D12,  D14,  0x02  // 3 | ||||
|     xvand.v    U9,   D9,   D9 | ||||
|     xvpermi.q  D9,   D11,  0x02  // 6 | ||||
|     xvand.v    U13,  D13,  D13 | ||||
|     xvpermi.q  D13,  D15,  0x02  // 7 | ||||
|     xvpermi.q  D10,  U8,   0x31  // 10 | ||||
|     xvpermi.q  D14,  U12,  0x31  // 11 | ||||
|     xvpermi.q  D11,  U9,   0x31  // 14 | ||||
|     xvpermi.q  D15,  U13,  0x31  // 15 | ||||
| 
 | ||||
|     xvst       D0,   TD,   0x00  // 0 | ||||
|     xvst       D4,   TD,   0x20  // 1 | ||||
|     xvst       D8,   TD,   0x40  // 2 | ||||
|     xvst       D12,  TD,   0x60  // 3 | ||||
|     xvst       D1,   TD,   0x80  // 4 | ||||
|     xvst       D5,   TD,   0xA0  // 5 | ||||
|     xvst       D9,   TD,   0xC0  // 6 | ||||
|     xvst       D13,  TD,   0xE0  // 7 | ||||
|     addi.d     TD,   TD,   0x100 | ||||
|     xvst       D2,   TD,   0x00  // 8 | ||||
|     xvst       D6,   TD,   0x20  // 9 | ||||
|     xvst       D10,  TD,   0x40  // 10 | ||||
|     xvst       D14,  TD,   0x60  // 11 | ||||
|     xvst       D3,   TD,   0x80  // 12 | ||||
|     xvst       D7,   TD,   0xA0  // 13 | ||||
|     xvst       D11,  TD,   0xC0  // 14 | ||||
|     xvst       D15,  TD,   0xE0  // 15 | ||||
|     addi.d     TD,   TD,   0x100 | ||||
| 
 | ||||
| 
 | ||||
|     /* Advance every stream pointer by the 8 elements just consumed. */ | ||||
|     addi.d     S1,   S1,   0x40 | ||||
|     addi.d     S2,   S2,   0x40 | ||||
|     addi.d     S3,   S3,   0x40 | ||||
|     addi.d     S4,   S4,   0x40 | ||||
|     addi.d     S5,   S5,   0x40 | ||||
|     addi.d     S6,   S6,   0x40 | ||||
|     addi.d     S7,   S7,   0x40 | ||||
|     addi.d     S8,   S8,   0x40 | ||||
|     addi.d     S9,   S9,   0x40 | ||||
|     addi.d     S10,  S10,  0x40 | ||||
|     addi.d     S11,  S11,  0x40 | ||||
|     addi.d     S12,  S12,  0x40 | ||||
|     addi.d     S13,  S13,  0x40 | ||||
|     addi.d     S14,  S14,  0x40 | ||||
|     addi.d     S15,  S15,  0x40 | ||||
|     addi.d     S16,  S16,  0x40 | ||||
| 
 | ||||
|     addi.d     I,    I,    -1 | ||||
|     blt        ZERO, I,    .L_I1 | ||||
| 
 | ||||
| /* Row remainder for the 16-stream group: I = M & 7 leftover rows. */ | ||||
| .L_I7: | ||||
|     andi      I,     M,    0x07 | ||||
|     beq       I,     ZERO, .L_I0 | ||||
| 
 | ||||
| /* One row per iteration: copy one double from each of S1..S16 to 16 */ | ||||
| /* consecutive doubles at TD (streams 1-8 first, then 9-16), advancing each */ | ||||
| /* stream pointer by 8 bytes. */ | ||||
| .L_II1: /* I-- */ | ||||
|     fld.d     F0,    S1,  0x00 | ||||
|     fld.d     F1,    S2,  0x00 | ||||
|     fld.d     F2,    S3,  0x00 | ||||
|     fld.d     F3,    S4,  0x00 | ||||
|     fld.d     F4,    S5,  0x00 | ||||
|     fld.d     F5,    S6,  0x00 | ||||
|     fld.d     F6,    S7,  0x00 | ||||
|     fld.d     F7,    S8,  0x00 | ||||
| 
 | ||||
|     fst.d     F0,    TD,  0x00 | ||||
|     addi.d    S1,    S1,  0x08 | ||||
|     fst.d     F1,    TD,  0x08 | ||||
|     addi.d    S2,    S2,  0x08 | ||||
|     fst.d     F2,    TD,  0x10 | ||||
|     addi.d    S3,    S3,  0x08 | ||||
|     fst.d     F3,    TD,  0x18 | ||||
|     addi.d    S4,    S4,  0x08 | ||||
|     fst.d     F4,    TD,  0x20 | ||||
|     addi.d    S5,    S5,  0x08 | ||||
|     fst.d     F5,    TD,  0x28 | ||||
|     addi.d    S6,    S6,  0x08 | ||||
|     fst.d     F6,    TD,  0x30 | ||||
|     addi.d    S7,    S7,  0x08 | ||||
|     fst.d     F7,    TD,  0x38 | ||||
|     addi.d    S8,    S8,  0x08 | ||||
|     addi.d    TD,    TD,  0x40 | ||||
| 
 | ||||
|     fld.d     F0,    S9,  0x00 | ||||
|     fld.d     F1,    S10, 0x00 | ||||
|     fld.d     F2,    S11, 0x00 | ||||
|     fld.d     F3,    S12, 0x00 | ||||
|     fld.d     F4,    S13, 0x00 | ||||
|     fld.d     F5,    S14, 0x00 | ||||
|     fld.d     F6,    S15, 0x00 | ||||
|     fld.d     F7,    S16, 0x00 | ||||
| 
 | ||||
|     fst.d     F0,    TD,  0x00 | ||||
|     addi.d    S9,    S9,  0x08 | ||||
|     fst.d     F1,    TD,  0x08 | ||||
|     addi.d    S10,   S10, 0x08 | ||||
|     fst.d     F2,    TD,  0x10 | ||||
|     addi.d    S11,   S11, 0x08 | ||||
|     fst.d     F3,    TD,  0x18 | ||||
|     addi.d    S12,   S12, 0x08 | ||||
|     fst.d     F4,    TD,  0x20 | ||||
|     addi.d    S13,   S13, 0x08 | ||||
|     fst.d     F5,    TD,  0x28 | ||||
|     addi.d    S14,   S14, 0x08 | ||||
|     fst.d     F6,    TD,  0x30 | ||||
|     addi.d    S15,   S15, 0x08 | ||||
|     fst.d     F7,    TD,  0x38 | ||||
|     addi.d    S16,   S16, 0x08 | ||||
|     addi.d    TD,    TD,  0x40 | ||||
| 
 | ||||
|     addi.d    I,     I,   -1 | ||||
|     blt       ZERO,  I,   .L_II1 | ||||
| 
 | ||||
| /* Back-edge of the outer loop: J was already decremented in .L_J1. */ | ||||
| .L_I0: | ||||
|     blt       ZERO,  J,   .L_J1 | ||||
| 
 | ||||
| /* Tail for N & 8: one group of 8 streams. S1..S8 are TL apart; TS advances */ | ||||
| /* by 8*TL; I = M >> 3 iterations of the 8-row vector loop. */ | ||||
| .L_N8: | ||||
|     andi      J,     N,   0x08 | ||||
|     beq       ZERO,  J,   .L_N4 | ||||
| 
 | ||||
|     move       S1,   TS | ||||
|     add.d      S2,   TS,   TL | ||||
|     srai.d     I,    M,    0x03 | ||||
|     add.d      S3,   S2,   TL | ||||
|     add.d      S4,   S2,   T0 | ||||
|     add.d      S5,   S3,   T0 | ||||
|     add.d      S6,   S4,   T0 | ||||
|     add.d      S7,   S5,   T0 | ||||
|     add.d      S8,   S6,   T0 | ||||
|     add.d      TS,   S7,   T0 | ||||
|     beq        I,    ZERO, .L_8I3 | ||||
| 
 | ||||
| /* Vector loop for 8 streams: each iteration consumes 8 elements (0x40 bytes) */ | ||||
| /* per stream in two halves (offsets 0x00 and 0x20) and writes 0x100 bytes per */ | ||||
| /* half. The "// n" comments give the 32-byte output slot of each D register; */ | ||||
| /* the xvst sequence D0,D4,D1,D5,D2,D6,D3,D7 writes slots 0..7 in order. */ | ||||
| .L_8I1:  /* I-- */ | ||||
|     xvld       U0,   S1,   0x00 | ||||
|     xvld       U1,   S2,   0x00 | ||||
|     xvld       U2,   S3,   0x00 | ||||
|     xvld       U3,   S4,   0x00 | ||||
|     xvld       U4,   S5,   0x00 | ||||
|     xvld       U5,   S6,   0x00 | ||||
|     xvld       U6,   S7,   0x00 | ||||
|     xvld       U7,   S8,   0x00 | ||||
| 
 | ||||
|     xvpackev.d D0,   U1,   U0 | ||||
|     xvpackod.d D1,   U1,   U0 | ||||
|     xvpackev.d D2,   U3,   U2 | ||||
|     xvpackod.d D3,   U3,   U2 | ||||
|     xvpackev.d D4,   U5,   U4 | ||||
|     xvpackod.d D5,   U5,   U4 | ||||
|     xvpackev.d D6,   U7,   U6 | ||||
|     xvpackod.d D7,   U7,   U6 | ||||
| 
 | ||||
|     /* xvand.v Ux, Dx, Dx copies Dx so its upper half survives the 0x02 permute. */ | ||||
|     xvand.v    U0,   D0,   D0 | ||||
|     xvpermi.q  D0,   D2,   0x02  // 0 | ||||
|     xvand.v    U4,   D4,   D4 | ||||
|     xvpermi.q  D4,   D6,   0x02  // 1 | ||||
|     xvand.v    U1,   D1,   D1 | ||||
|     xvpermi.q  D1,   D3,   0x02  // 2 | ||||
|     xvand.v    U5,   D5,   D5 | ||||
|     xvpermi.q  D5,   D7,   0x02  // 3 | ||||
|     xvpermi.q  D2,   U0,   0x31  // 4 | ||||
|     xvpermi.q  D6,   U4,   0x31  // 5 | ||||
|     xvpermi.q  D3,   U1,   0x31  // 6 | ||||
|     xvpermi.q  D7,   U5,   0x31  // 7 | ||||
| 
 | ||||
|     xvst       D0,   TD,   0x00 | ||||
|     xvst       D4,   TD,   0x20 | ||||
|     xvst       D1,   TD,   0x40 | ||||
|     xvst       D5,   TD,   0x60 | ||||
|     xvst       D2,   TD,   0x80 | ||||
|     xvst       D6,   TD,   0xA0 | ||||
|     xvst       D3,   TD,   0xC0 | ||||
|     xvst       D7,   TD,   0xE0 | ||||
|     addi.d     TD,   TD,   0x100 | ||||
| 
 | ||||
|     /* Second half: same sequence on the next 32 bytes of every stream. */ | ||||
|     xvld       U0,   S1,   0x20 | ||||
|     xvld       U1,   S2,   0x20 | ||||
|     xvld       U2,   S3,   0x20 | ||||
|     xvld       U3,   S4,   0x20 | ||||
|     xvld       U4,   S5,   0x20 | ||||
|     xvld       U5,   S6,   0x20 | ||||
|     xvld       U6,   S7,   0x20 | ||||
|     xvld       U7,   S8,   0x20 | ||||
| 
 | ||||
|     xvpackev.d D0,   U1,   U0 | ||||
|     xvpackod.d D1,   U1,   U0 | ||||
|     xvpackev.d D2,   U3,   U2 | ||||
|     xvpackod.d D3,   U3,   U2 | ||||
|     xvpackev.d D4,   U5,   U4 | ||||
|     xvpackod.d D5,   U5,   U4 | ||||
|     xvpackev.d D6,   U7,   U6 | ||||
|     xvpackod.d D7,   U7,   U6 | ||||
| 
 | ||||
|     xvand.v    U0,   D0,   D0 | ||||
|     xvpermi.q  D0,   D2,   0x02  // 0 | ||||
|     xvand.v    U4,   D4,   D4 | ||||
|     xvpermi.q  D4,   D6,   0x02  // 1 | ||||
|     xvand.v    U1,   D1,   D1 | ||||
|     xvpermi.q  D1,   D3,   0x02  // 2 | ||||
|     xvand.v    U5,   D5,   D5 | ||||
|     xvpermi.q  D5,   D7,   0x02  // 3 | ||||
|     xvpermi.q  D2,   U0,   0x31  // 4 | ||||
|     xvpermi.q  D6,   U4,   0x31  // 5 | ||||
|     xvpermi.q  D3,   U1,   0x31  // 6 | ||||
|     xvpermi.q  D7,   U5,   0x31  // 7 | ||||
| 
 | ||||
|     xvst       D0,   TD,   0x00 | ||||
|     xvst       D4,   TD,   0x20 | ||||
|     xvst       D1,   TD,   0x40 | ||||
|     xvst       D5,   TD,   0x60 | ||||
|     xvst       D2,   TD,   0x80 | ||||
|     xvst       D6,   TD,   0xA0 | ||||
|     xvst       D3,   TD,   0xC0 | ||||
|     xvst       D7,   TD,   0xE0 | ||||
|     addi.d     TD,   TD,   0x100 | ||||
| 
 | ||||
|     /* Advance every stream pointer by the 8 elements just consumed. */ | ||||
|     addi.d     S1,   S1,   0x40 | ||||
|     addi.d     S2,   S2,   0x40 | ||||
|     addi.d     S3,   S3,   0x40 | ||||
|     addi.d     S4,   S4,   0x40 | ||||
|     addi.d     S5,   S5,   0x40 | ||||
|     addi.d     S6,   S6,   0x40 | ||||
|     addi.d     S7,   S7,   0x40 | ||||
|     addi.d     S8,   S8,   0x40 | ||||
| 
 | ||||
|     addi.d     I,    I,    -1 | ||||
|     blt        ZERO, I,    .L_8I1 | ||||
| 
 | ||||
| /* Row remainder for the 8-stream group: I = M & 7 leftover rows. */ | ||||
| .L_8I3: | ||||
|     andi      I,     M,    0x07 | ||||
|     beq       I,     ZERO, .L_N4 | ||||
| 
 | ||||
| /* One row per iteration: one double from each of S1..S8 -> 8 consecutive */ | ||||
| /* doubles at TD; every stream pointer advances by 8 bytes. */ | ||||
| .L_8I11: | ||||
|     fld.d     F0,    S1,  0x00 | ||||
|     fld.d     F1,    S2,  0x00 | ||||
|     fld.d     F2,    S3,  0x00 | ||||
|     fld.d     F3,    S4,  0x00 | ||||
|     fld.d     F4,    S5,  0x00 | ||||
|     fld.d     F5,    S6,  0x00 | ||||
|     fld.d     F6,    S7,  0x00 | ||||
|     fld.d     F7,    S8,  0x00 | ||||
| 
 | ||||
|     fst.d     F0,    TD,  0x00 | ||||
|     addi.d    S1,    S1,  0x08 | ||||
|     fst.d     F1,    TD,  0x08 | ||||
|     addi.d    S2,    S2,  0x08 | ||||
|     fst.d     F2,    TD,  0x10 | ||||
|     addi.d    S3,    S3,  0x08 | ||||
|     fst.d     F3,    TD,  0x18 | ||||
|     addi.d    S4,    S4,  0x08 | ||||
|     fst.d     F4,    TD,  0x20 | ||||
|     addi.d    S5,    S5,  0x08 | ||||
|     fst.d     F5,    TD,  0x28 | ||||
|     addi.d    S6,    S6,  0x08 | ||||
|     fst.d     F6,    TD,  0x30 | ||||
|     addi.d    S7,    S7,  0x08 | ||||
|     fst.d     F7,    TD,  0x38 | ||||
|     addi.d    S8,    S8,  0x08 | ||||
| 
 | ||||
|     addi.d    TD,    TD,  0x40 | ||||
|     addi.d    I,     I,   -1 | ||||
|     blt       ZERO,  I,   .L_8I11 | ||||
| 
 | ||||
| /* Tail for N & 4: one group of 4 streams. S1..S4 are TL apart; TS advances */ | ||||
| /* by 4*TL; I = M >> 2 iterations of the 4-row vector loop. */ | ||||
| .L_N4: | ||||
|     andi      J,     N,   0x04 | ||||
|     beq       ZERO,  J,   .L_N2 | ||||
| 
 | ||||
|     move       S1,   TS | ||||
|     add.d      S2,   TS,   TL | ||||
|     srai.d     I,    M,    0x02 | ||||
|     add.d      S3,   S2,   TL | ||||
|     add.d      S4,   S2,   T0 | ||||
|     add.d      TS,   S3,   T0 | ||||
|     beq        I,    ZERO, .L_I3 | ||||
| 
 | ||||
| /* 4x4 block: load 4 doubles from each stream, pack/permute so that D0..D3 */ | ||||
| /* are output slots 0..3 (see "// n" comments), store 0x80 bytes. */ | ||||
| .L_4I1: /* I-- */ | ||||
|     xvld       U0,   S1,   0x00 | ||||
|     xvld       U1,   S2,   0x00 | ||||
|     xvld       U2,   S3,   0x00 | ||||
|     xvld       U3,   S4,   0x00 | ||||
| 
 | ||||
|     xvpackev.d D0,   U1,   U0 | ||||
|     xvpackod.d D1,   U1,   U0 | ||||
|     xvpackev.d D2,   U3,   U2 | ||||
|     xvpackod.d D3,   U3,   U2 | ||||
| 
 | ||||
|     /* xvand.v copies D0/D1 into U0/U1 before the 0x02 permutes overwrite them. */ | ||||
|     xvand.v    U0,   D0,   D0 | ||||
|     xvpermi.q  D0,   D2,   0x02  // 0 | ||||
|     xvand.v    U1,   D1,   D1 | ||||
|     xvpermi.q  D1,   D3,   0x02  // 1 | ||||
|     xvpermi.q  D2,   U0,   0x31  // 2 | ||||
|     xvpermi.q  D3,   U1,   0x31  // 3 | ||||
| 
 | ||||
|     xvst       D0,   TD,   0x00 | ||||
|     xvst       D1,   TD,   0x20 | ||||
|     xvst       D2,   TD,   0x40 | ||||
|     xvst       D3,   TD,   0x60 | ||||
| 
 | ||||
|     addi.d     S1,   S1,   0x20 | ||||
|     addi.d     S2,   S2,   0x20 | ||||
|     addi.d     S3,   S3,   0x20 | ||||
|     addi.d     S4,   S4,   0x20 | ||||
|     addi.d     TD,   TD,   0x80 | ||||
| 
 | ||||
|     addi.d     I,    I,    -1 | ||||
|     blt        ZERO, I,    .L_4I1 | ||||
| 
 | ||||
| /* Row remainder for the 4-stream group: I = M & 3 leftover rows. */ | ||||
| .L_I3: | ||||
|     andi      I,     M,    0x03 | ||||
|     beq       I,     ZERO, .L_N2 | ||||
| 
 | ||||
| /* One row per iteration: one double from each of S1..S4 -> 4 consecutive */ | ||||
| /* doubles at TD. */ | ||||
| .L_4II1: | ||||
|     fld.d     F0,    S1,  0x00 | ||||
|     fld.d     F1,    S2,  0x00 | ||||
|     fld.d     F2,    S3,  0x00 | ||||
|     fld.d     F3,    S4,  0x00 | ||||
| 
 | ||||
|     fst.d     F0,    TD,  0x00 | ||||
|     addi.d    S1,    S1,  0x08 | ||||
|     fst.d     F1,    TD,  0x08 | ||||
|     addi.d    S2,    S2,  0x08 | ||||
|     fst.d     F2,    TD,  0x10 | ||||
|     addi.d    S3,    S3,  0x08 | ||||
|     fst.d     F3,    TD,  0x18 | ||||
|     addi.d    S4,    S4,  0x08 | ||||
| 
 | ||||
|     addi.d    TD,    TD,  0x20 | ||||
|     addi.d    I,     I,   -1 | ||||
|     blt       ZERO,  I,   .L_4II1 | ||||
| 
 | ||||
| /* Tail for N & 2: two streams S1 and S2 (TL apart); TS advances by 2*TL. */ | ||||
| /* Output per row is the pair [S1[row], S2[row]]. */ | ||||
| .L_N2: | ||||
|     andi      J,     N,   0x02 | ||||
|     beq       ZERO,  J,   .L_N1 | ||||
| 
 | ||||
|     move       S1,   TS | ||||
|     add.d      S2,   TS,   TL | ||||
|     srai.d     I,    M,    0x01 | ||||
|     add.d      TS,   S2,   TL | ||||
|     beq        I,    ZERO, .L_NI1 | ||||
| 
 | ||||
| /* Two rows per iteration. Exact-width scalar loads are used here: a 32-byte */ | ||||
| /* xvld would fetch four doubles per stream while only two are consumed, so */ | ||||
| /* the final iteration would read past row M of each stream. */ | ||||
| .L_2I1: /* I-- */ | ||||
|     fld.d      F0,   S1,   0x00 | ||||
|     fld.d      F1,   S2,   0x00 | ||||
|     fld.d      F2,   S1,   0x08 | ||||
|     fld.d      F3,   S2,   0x08 | ||||
| 
 | ||||
|     fst.d      F0,   TD,   0x00 | ||||
|     fst.d      F1,   TD,   0x08 | ||||
|     fst.d      F2,   TD,   0x10 | ||||
|     fst.d      F3,   TD,   0x18 | ||||
| 
 | ||||
|     addi.d     S1,   S1,   0x10 | ||||
|     addi.d     S2,   S2,   0x10 | ||||
|     addi.d     TD,   TD,   0x20 | ||||
| 
 | ||||
|     addi.d     I,    I,    -1 | ||||
|     blt        ZERO, I,    .L_2I1 | ||||
| 
 | ||||
| /* Odd M: copy the single leftover row of both streams. */ | ||||
| .L_NI1: | ||||
|     andi      I,     M,    0x01 | ||||
|     beq       I,     ZERO, .L_N1 | ||||
| 
 | ||||
|     fld.d     F0,    S1,  0x00 | ||||
|     fld.d     F1,    S2,  0x00 | ||||
| 
 | ||||
|     fst.d     F0,    TD,  0x00 | ||||
|     addi.d    S1,    S1,  0x08 | ||||
|     fst.d     F1,    TD,  0x08 | ||||
|     addi.d    S2,    S2,  0x08 | ||||
|     addi.d    TD,    TD,  0x10 | ||||
| 
 | ||||
| /* Tail for N & 1: a single remaining stream is copied element by element. */ | ||||
| /* The N & 1 test is required: every earlier tail jumps or falls through to */ | ||||
| /* this label, so without it an even N would still copy M doubles from */ | ||||
| /* beyond the last stream to beyond the packed output. */ | ||||
| .L_N1: | ||||
|     andi      J,     N,   0x01 | ||||
|     beq       ZERO,  J,   .L_N0 | ||||
|     move      S1,    TS | ||||
|     beq       ZERO,  M,   .L_N0 | ||||
| 
 | ||||
| /* M is used directly as the down-counter; it is not needed afterwards. */ | ||||
| .L_M1: | ||||
|     fld.d     F0,    S1,  0x00 | ||||
|     addi.d    S1,    S1,  0x08 | ||||
|     fst.d     F0,    TD,  0x00 | ||||
|     addi.d    TD,    TD,  0x08 | ||||
|     addi.d    M,     M,   -1 | ||||
|     blt       ZERO,  M,   .L_M1 | ||||
| 
 | ||||
| /* Common exit: reload $r23..$r31 and $f23..$f31 from the offsets used by the */ | ||||
| /* prologue, release the 0x90-byte frame and return through $r1. */ | ||||
| .L_N0: | ||||
|     LDARG      $r23, $sp,  0x00 | ||||
|     LDARG      $r24, $sp,  0x08 | ||||
|     LDARG      $r25, $sp,  0x10 | ||||
|     LDARG      $r26, $sp,  0x18 | ||||
|     LDARG      $r27, $sp,  0x20 | ||||
|     LDARG      $r28, $sp,  0x28 | ||||
|     LDARG      $r29, $sp,  0x30 | ||||
|     LDARG      $r30, $sp,  0x38 | ||||
|     LDARG      $r31, $sp,  0x40 | ||||
|     LD         $f23, $sp,  0x48 | ||||
|     LD         $f24, $sp,  0x50 | ||||
|     LD         $f25, $sp,  0x58 | ||||
|     LD         $f26, $sp,  0x60 | ||||
|     LD         $f27, $sp,  0x68 | ||||
|     LD         $f28, $sp,  0x70 | ||||
|     LD         $f29, $sp,  0x78 | ||||
|     LD         $f30, $sp,  0x80 | ||||
|     LD         $f31, $sp,  0x88 | ||||
|     addi.d     $sp,  $sp,  0x90 | ||||
|     jirl       $r0,  $r1,  0x00 | ||||
| 
 | ||||
|     EPILOGUE | ||||
|  | @ -0,0 +1,237 @@ | |||
| /******************************************************************************* | ||||
| Copyright (c) 2021, The OpenBLAS Project | ||||
| All rights reserved. | ||||
| Redistribution and use in source and binary forms, with or without | ||||
| modification, are permitted provided that the following conditions are | ||||
| met: | ||||
| 1. Redistributions of source code must retain the above copyright | ||||
| notice, this list of conditions and the following disclaimer. | ||||
| 2. Redistributions in binary form must reproduce the above copyright | ||||
| notice, this list of conditions and the following disclaimer in | ||||
| the documentation and/or other materials provided with the | ||||
| distribution. | ||||
| 3. Neither the name of the OpenBLAS project nor the names of | ||||
| its contributors may be used to endorse or promote products | ||||
| derived from this software without specific prior written permission. | ||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | ||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | ||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | ||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | ||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | ||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | ||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 | ||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | ||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | ||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | ||||
| *******************************************************************************/ | ||||
| #define ASSEMBLER | ||||
| 
 | ||||
| #include "common.h" | ||||
| 
 | ||||
| /* Function parameters */ | ||||
| #define M      $r4    // param 1: m | ||||
| #define N      $r5    // param 2: n | ||||
| #define SRC    $r6    // param 3: src | ||||
| #define LDA    $r7    // param 4: lda | ||||
| #define DST    $r8    // param 5: dst | ||||
| 
 | ||||
| /* I, J: loop counters. S1..S4 are the per-stream source pointers used by this */ | ||||
| /* kernel; S5..S8 are defined but not referenced by the code in this file. */ | ||||
| #define I      $r9 | ||||
| #define J      $r10 | ||||
| #define S1     $r12 | ||||
| #define S2     $r13 | ||||
| #define S3     $r14 | ||||
| #define S4     $r15 | ||||
| #define S5     $r16 | ||||
| #define S6     $r17 | ||||
| #define S7     $r18 | ||||
| #define S8     $r19 | ||||
| /* TD / TS: running destination / source pointers. TL reuses the LDA register */ | ||||
| /* (TL = LDA << 3). T0 = TL << 1 lives in $r23, the only register this kernel */ | ||||
| /* spills in its prologue. */ | ||||
| #define TD     $r20 | ||||
| #define TS     $r11 | ||||
| #define TL     $r7 | ||||
| #define T0     $r23 | ||||
| #define ZERO   $r0 | ||||
| 
 | ||||
| /* Scalar FP registers for the row-remainder loops (only F0..F3 are used). */ | ||||
| #define F0     $f0 | ||||
| #define F1     $f1 | ||||
| #define F2     $f2 | ||||
| #define F3     $f3 | ||||
| #define F4     $f4 | ||||
| #define F5     $f5 | ||||
| #define F6     $f6 | ||||
| #define F7     $f7 | ||||
| /* LASX vectors */ | ||||
| /* U*: xvld results and temporaries; D*: pack/permute results that are stored. */ | ||||
| /* Note the non-sequential mapping: D0 is $xr14 and D7 is $xr15. */ | ||||
| #define U0     $xr0 | ||||
| #define U1     $xr1 | ||||
| #define U2     $xr2 | ||||
| #define U3     $xr3 | ||||
| #define U4     $xr4 | ||||
| #define U5     $xr5 | ||||
| #define U6     $xr6 | ||||
| #define U7     $xr7 | ||||
| #define D0     $xr14 | ||||
| #define D1     $xr8 | ||||
| #define D2     $xr9 | ||||
| #define D3     $xr10 | ||||
| #define D4     $xr11 | ||||
| #define D5     $xr12 | ||||
| #define D6     $xr13 | ||||
| #define D7     $xr15 | ||||
| 
 | ||||
|     PROLOGUE | ||||
| 
 | ||||
|     /* Spill $r23 (used as T0); .L_N0 reloads it before returning. */ | ||||
|     /* NOTE(review): the 8-byte frame leaves $sp only 8-byte aligned inside this */ | ||||
|     /* routine; it makes no calls, but confirm this is acceptable for the ABI. */ | ||||
|     addi.d     $sp,  $sp,  -8 | ||||
|     SDARG      $r23, $sp,  0 | ||||
| 
 | ||||
|     /* TD / TS: running destination and source pointers. */ | ||||
|     move       TD,   DST | ||||
|     move       TS,   SRC | ||||
|     /* TL = LDA * 8 (byte distance between streams); T0 = 2 * TL. */ | ||||
|     slli.d     TL,   LDA,  0x03 | ||||
|     slli.d     T0,   TL,   0x01 | ||||
|     /* J = N >> 2: number of full 4-stream groups; none -> go to the N & 2 tail. */ | ||||
|     srai.d     J,    N,    0x02 | ||||
|     beq        J,    ZERO, .L_N2 | ||||
| 
 | ||||
| /* Outer loop over groups of 4 streams: S1..S4 are TL apart, TS advances by */ | ||||
| /* 4*TL, J is decremented, and I = M >> 2 counts 4-row vector iterations. */ | ||||
| .L_J1: /* J-- */ | ||||
|     move       S1,   TS | ||||
|     add.d      S2,   TS,   TL | ||||
|     srai.d     I,    M,    0x02 | ||||
|     add.d      S3,   S2,   TL | ||||
|     add.d      S4,   S2,   T0 | ||||
|     add.d      TS,   S3,   T0 | ||||
|     addi.d     J,    J,    -1 | ||||
|     beq        I,    ZERO, .L_I3 | ||||
| 
 | ||||
| /* 4x4 block: load 4 doubles from each stream, pack/permute so that D0..D3 */ | ||||
| /* are output slots 0..3 (see "// n" comments), store 0x80 bytes. */ | ||||
| .L_I1: /* I-- */ | ||||
|     xvld       U0,   S1,   0x00 | ||||
|     xvld       U1,   S2,   0x00 | ||||
|     xvld       U2,   S3,   0x00 | ||||
|     xvld       U3,   S4,   0x00 | ||||
| 
 | ||||
|     xvpackev.d D0,   U1,   U0 | ||||
|     xvpackod.d D1,   U1,   U0 | ||||
|     xvpackev.d D2,   U3,   U2 | ||||
|     xvpackod.d D3,   U3,   U2 | ||||
| 
 | ||||
|     /* xvand.v copies D0/D1 into U0/U1 before the 0x02 permutes overwrite them. */ | ||||
|     xvand.v    U0,   D0,   D0 | ||||
|     xvpermi.q  D0,   D2,   0x02  // 0 | ||||
|     xvand.v    U1,   D1,   D1 | ||||
|     xvpermi.q  D1,   D3,   0x02  // 1 | ||||
|     xvpermi.q  D2,   U0,   0x31  // 2 | ||||
|     xvpermi.q  D3,   U1,   0x31  // 3 | ||||
| 
 | ||||
|     xvst       D0,   TD,   0x00 | ||||
|     xvst       D1,   TD,   0x20 | ||||
|     xvst       D2,   TD,   0x40 | ||||
|     xvst       D3,   TD,   0x60 | ||||
| 
 | ||||
|     addi.d     S1,   S1,   0x20 | ||||
|     addi.d     S2,   S2,   0x20 | ||||
|     addi.d     S3,   S3,   0x20 | ||||
|     addi.d     S4,   S4,   0x20 | ||||
|     addi.d     TD,   TD,   0x80 | ||||
| 
 | ||||
|     addi.d     I,    I,    -1 | ||||
|     blt        ZERO, I,    .L_I1 | ||||
| 
 | ||||
| /* Row remainder for the 4-stream group: I = M & 3 leftover rows. */ | ||||
| .L_I3: | ||||
|     andi      I,     M,    0x03 | ||||
|     beq       I,     ZERO, .L_I0 | ||||
| 
 | ||||
| /* One row per iteration: one double from each of S1..S4 -> 4 consecutive */ | ||||
| /* doubles at TD. */ | ||||
| .L_II1: | ||||
|     fld.d     F0,    S1,  0x00 | ||||
|     fld.d     F1,    S2,  0x00 | ||||
|     fld.d     F2,    S3,  0x00 | ||||
|     fld.d     F3,    S4,  0x00 | ||||
| 
 | ||||
|     fst.d     F0,    TD,  0x00 | ||||
|     addi.d    S1,    S1,  0x08 | ||||
|     fst.d     F1,    TD,  0x08 | ||||
|     addi.d    S2,    S2,  0x08 | ||||
|     fst.d     F2,    TD,  0x10 | ||||
|     addi.d    S3,    S3,  0x08 | ||||
|     fst.d     F3,    TD,  0x18 | ||||
|     addi.d    S4,    S4,  0x08 | ||||
| 
 | ||||
|     addi.d    TD,    TD,  0x20 | ||||
|     addi.d    I,     I,   -1 | ||||
|     blt       ZERO,  I,   .L_II1 | ||||
| 
 | ||||
| /* Back-edge of the outer loop: J was already decremented in .L_J1. */ | ||||
| .L_I0: | ||||
|     blt       ZERO,  J,   .L_J1 | ||||
| 
 | ||||
| /* Tail for N & 2: two streams S1 and S2 (TL apart); TS advances by 2*TL. */ | ||||
| /* I = M >> 2 iterations of the 4-row vector loop. */ | ||||
| .L_N2: | ||||
|     andi      J,     N,   0x02 | ||||
|     beq       ZERO,  J,   .L_N1 | ||||
| 
 | ||||
|     move       S1,   TS | ||||
|     add.d      S2,   TS,   TL | ||||
|     srai.d     I,    M,    0x02 | ||||
|     add.d      TS,   S2,   TL | ||||
|     beq        I,    ZERO, .L_2I3 | ||||
| 
 | ||||
| /* 4 rows x 2 streams per iteration: all four loaded doubles of each stream */ | ||||
| /* are consumed (pointers advance by 0x20), and 0x40 bytes are stored. */ | ||||
| .L_2I1: /* I-- */ | ||||
|     xvld       U0,   S1,   0x00 | ||||
|     xvld       U1,   S2,   0x00 | ||||
| 
 | ||||
|     xvpackev.d D0,   U1,   U0 | ||||
|     xvpackod.d D1,   U1,   U0 | ||||
| 
 | ||||
|     /* U0 keeps a copy of D0 so its upper half is still available for slot 1. */ | ||||
|     xvand.v    U0,   D0,   D0 | ||||
|     xvpermi.q  D0,   D1,   0x02  // 0 | ||||
|     xvpermi.q  D1,   U0,   0x31  // 1 | ||||
| 
 | ||||
|     xvst       D0,   TD,   0x00 | ||||
|     xvst       D1,   TD,   0x20 | ||||
|     addi.d     S1,   S1,   0x20 | ||||
|     addi.d     S2,   S2,   0x20 | ||||
|     addi.d     TD,   TD,   0x40 | ||||
|     addi.d     I,    I,    -1 | ||||
|     blt        ZERO, I,    .L_2I1 | ||||
| 
 | ||||
| /* Row remainder: I = M & 3 leftover rows of the two streams. */ | ||||
| .L_2I3: | ||||
|     andi       I,    M,    0x03 | ||||
|     beq        ZERO, I,    .L_N1 | ||||
| 
 | ||||
| /* One row per iteration: [S1[row], S2[row]] -> 2 consecutive doubles at TD. */ | ||||
| .L_2II1: /* I-- */ | ||||
|     fld.d      F0,   S1,   0x00 | ||||
|     fld.d      F1,   S2,   0x00 | ||||
|     fst.d      F0,   TD,   0x00 | ||||
|     addi.d     I,    I,    -1 | ||||
|     fst.d      F1,   TD,   0x08 | ||||
|     addi.d     S1,   S1,   0x08 | ||||
|     addi.d     S2,   S2,   0x08 | ||||
|     addi.d     TD,   TD,   0x10 | ||||
|     blt        ZERO, I,    .L_2II1 | ||||
| 
 | ||||
| /* Tail for N & 1: a single remaining stream is copied unchanged to TD. */ | ||||
| /* Skipped entirely when N is even. */ | ||||
| .L_N1: | ||||
|     andi       J,    N,    0x01 | ||||
|     beq        ZERO, J,    .L_N0 | ||||
| 
 | ||||
|     move       S1,   TS | ||||
|     srai.d     I,    M,    0x02 | ||||
|     beq        ZERO, I,    .L_1I3 | ||||
| 
 | ||||
| /* Bulk copy, 4 doubles (32 bytes) per iteration. */ | ||||
| .L_1I1: | ||||
|     xvld       U0,   S1,   0x00 | ||||
|     addi.d     S1,   S1,   0x20 | ||||
|     xvst       U0,   TD,   0x00 | ||||
|     addi.d     I,    I,    -1 | ||||
|     addi.d     TD,   TD,   0x20 | ||||
|     blt        ZERO, I,    .L_1I1 | ||||
| 
 | ||||
| /* Remainder: I = M & 3 leftover doubles. */ | ||||
| .L_1I3: | ||||
|     andi       I,    M,    0x03 | ||||
|     beq        ZERO, I,    .L_N0 | ||||
| 
 | ||||
| /* Scalar copy, one double per iteration. */ | ||||
| .L_1II1: | ||||
|     fld.d      F0,   S1,   0x00 | ||||
|     addi.d     S1,   S1,   0x08 | ||||
|     fst.d      F0,   TD,   0x00 | ||||
|     addi.d     I,    I,    -1 | ||||
|     addi.d     TD,   TD,   0x08 | ||||
|     blt        ZERO, I,    .L_1II1 | ||||
| 
 | ||||
| .L_N0: | ||||
|     LDARG     $r23,  $sp, 0 | ||||
|     addi.d    $sp,   $sp, 8 | ||||
|     jirl      $r0,   $r1, 0x00 | ||||
| 
 | ||||
|     EPILOGUE | ||||
|  | @ -0,0 +1,710 @@ | |||
| /******************************************************************************* | ||||
| Copyright (c) 2021, The OpenBLAS Project | ||||
| All rights reserved. | ||||
| Redistribution and use in source and binary forms, with or without | ||||
| modification, are permitted provided that the following conditions are | ||||
| met: | ||||
| 1. Redistributions of source code must retain the above copyright | ||||
| notice, this list of conditions and the following disclaimer. | ||||
| 2. Redistributions in binary form must reproduce the above copyright | ||||
| notice, this list of conditions and the following disclaimer in | ||||
| the documentation and/or other materials provided with the | ||||
| distribution. | ||||
| 3. Neither the name of the OpenBLAS project nor the names of | ||||
| its contributors may be used to endorse or promote products | ||||
| derived from this software without specific prior written permission. | ||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | ||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | ||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | ||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | ||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | ||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | ||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | ||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | ||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | ||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | ||||
| *******************************************************************************/ | ||||
| #define ASSEMBLER | ||||
| 
 | ||||
| #include "common.h" | ||||
| /* Function parameters (argument registers on entry) */ | ||||
| #define M      $r4    // param 1: m, number of source lines (lines are LDA elements apart) | ||||
| #define N      $r5    // param 2: n, number of consecutive 8-byte elements per line | ||||
| #define SRC    $r6    // param 3: src, base address of the source | ||||
| #define LDA    $r7    // param 4: lda, distance between lines in elements | ||||
| #define DST    $r8    // param 5: dst, base address of the output buffer | ||||
| /* Working registers */
 | ||||
| #define I      $r9 /* counter / bit-test scratch along N */ | ||||
| #define J      $r10 /* counter / bit-test scratch along M */ | ||||
| #define S0     $r11 /* base address of the next group of source lines */ | ||||
| #define S1     $r12 /* S1..S8: read cursors, one per source line of the current group */ | ||||
| #define S2     $r13 | ||||
| #define S3     $r14 | ||||
| #define S4     $r15 | ||||
| #define S5     $r16 | ||||
| #define S6     $r17 | ||||
| #define S7     $r18 | ||||
| #define S8     $r19 | ||||
| #define P0     $r20 /* output base for the next group's 16-element chunks */ | ||||
| #define P1     $r23 /* write cursor for 16-element chunks */ | ||||
| #define P2     $r24 /* write cursor for the 8-element tail (N & 8) */ | ||||
| #define P3     $r25 /* write cursor for the 4-element tail (N & 4) */ | ||||
| #define P4     $r26 /* write cursor for the 2-element tail (N & 2) */ | ||||
| #define P5     $r27 /* write cursor for the 1-element tail (N & 1) */ | ||||
| #define T0     $r28 /* scratch during setup, then 2 * TL */ | ||||
| #define T1     $r29 /* scratch during setup, then M * 128 (added to P1 per 16-element chunk) */ | ||||
| #define TL     $r7 /* line stride in bytes (LDA << 3); shares $r7 with LDA */ | ||||
| #define ZERO   $r0 /* compared against in every branch of this file */ | ||||
| /* F0..F7: 8-byte scalars moved with fld.d / fst.d in the narrow tails */
 | ||||
| #define F0     $f0 | ||||
| #define F1     $f1 | ||||
| #define F2     $f2 | ||||
| #define F3     $f3 | ||||
| #define F4     $f4 | ||||
| #define F5     $f5 | ||||
| #define F6     $f6 | ||||
| #define F7     $f7 | ||||
| /* LASX vectors: 32 bytes each, moved with xvld / xvst */ | ||||
| #define U0     $xr0 | ||||
| #define U1     $xr1 | ||||
| #define U2     $xr2 | ||||
| #define U3     $xr3 | ||||
| #define U4     $xr4 | ||||
| #define U5     $xr5 | ||||
| #define U6     $xr6 | ||||
| #define U7     $xr7 | ||||
| 
 | ||||
|     PROLOGUE | ||||
| /* Entry: save the registers used as P1..P5/T0/T1, then derive the output cursors and strides. */
 | ||||
|     addi.d     $sp,    $sp,   -56 /* 7 slots of 8 bytes for $r23..$r29 */ | ||||
|     SDARG      $r23,   $sp,   0 /* SDARG/LDARG come from common.h; used here as the save/restore pair */ | ||||
|     SDARG      $r24,   $sp,   8 | ||||
|     SDARG      $r25,   $sp,   16 | ||||
|     SDARG      $r26,   $sp,   24 | ||||
|     SDARG      $r27,   $sp,   32 | ||||
|     SDARG      $r28,   $sp,   40 | ||||
|     SDARG      $r29,   $sp,   48 | ||||
| 
 | ||||
|     move       S0,     SRC | ||||
|     move       P0,     DST | ||||
| /* P2 = DST + M * (N rounded down to a multiple of 16) * 8; P3 likewise with a multiple of 8. */
 | ||||
|     srai.d     T0,     N,     0x04 | ||||
|     srai.d     T1,     N,     0x03 | ||||
|     slli.d     T0,     T0,    0x04 | ||||
|     slli.d     T1,     T1,    0x03 | ||||
|     mul.d      P2,     M,     T0 | ||||
|     mul.d      P3,     M,     T1 | ||||
|     slli.d     P2,     P2,    0x03 /* elements -> bytes */ | ||||
|     slli.d     P3,     P3,    0x03 | ||||
|     add.d      P2,     DST,   P2 | ||||
|     add.d      P3,     DST,   P3 | ||||
| /* P4 and P5: same formula with N rounded down to a multiple of 4 and of 2. */
 | ||||
|     srai.d     T0,     N,     0x02 | ||||
|     srai.d     T1,     N,     0x01 | ||||
|     slli.d     T0,     T0,    0x02 | ||||
|     slli.d     T1,     T1,    0x01 | ||||
|     mul.d      P4,     M,     T0 | ||||
|     mul.d      P5,     M,     T1 | ||||
|     slli.d     P4,     P4,    0x03 | ||||
|     slli.d     P5,     P5,    0x03 | ||||
|     add.d      P4,     DST,   P4 | ||||
|     add.d      P5,     DST,   P5 | ||||
| /* Strides and the group counter; LDA is overwritten because TL is the same register. */
 | ||||
|     slli.d     TL,     LDA,   0x03 /* TL = lda * 8: byte distance between source lines */ | ||||
|     srai.d     J,      M,     0x03 /* J = number of full groups of 8 lines */ | ||||
|     slli.d     T0,     TL,    0x01 /* T0 = 2 * TL */ | ||||
|     slli.d     T1,     M,     0x07 /* T1 = M * 128 = M * 16 elements * 8 bytes */ | ||||
|     beq        ZERO,   J,     .L_M7 /* fewer than 8 lines: go straight to the M & 4 phase */ | ||||
| 
 | ||||
| .L_J1: /* J--: one pass per group of 8 source lines */ | ||||
|     move       S1,     S0 /* S1..S8 = S0 + 0..7 * TL (T0 is 2 * TL) */ | ||||
|     add.d      S2,     S0,    TL | ||||
|     add.d      S3,     S1,    T0 | ||||
|     add.d      S4,     S2,    T0 | ||||
|     add.d      S5,     S3,    T0 | ||||
|     add.d      S6,     S4,    T0 | ||||
|     add.d      S7,     S5,    T0 | ||||
|     add.d      S8,     S6,    T0 | ||||
|     add.d      S0,     S7,    T0 /* S0 advances by 8 * TL to the next group */ | ||||
| 
 | ||||
|     move       P1,     P0 | ||||
|     addi.d     P0,     P0,    0x400 /* 0x400 = 8 lines * 16 elements * 8 bytes */ | ||||
| 
 | ||||
|     srai.d     I,      N,     0x04 /* I = number of full 16-element chunks per line */ | ||||
|     addi.d     J,      J,     -1 /* decremented here, tested at .L_N0 */ | ||||
|     beq        ZERO,   I,     .L_N15 | ||||
| 
 | ||||
| .L_I1: /* I--: copy 16 elements (0x80 bytes) from each of the 8 lines; line k lands at P1 + k * 0x80 */ | ||||
|     xvld       U0,     S1,    0x00 | ||||
|     xvld       U1,     S1,    0x20 | ||||
|     xvld       U2,     S1,    0x40 | ||||
|     xvld       U3,     S1,    0x60 | ||||
|     xvld       U4,     S2,    0x00 | ||||
|     xvld       U5,     S2,    0x20 | ||||
|     xvld       U6,     S2,    0x40 | ||||
|     xvld       U7,     S2,    0x60 | ||||
| 
 | ||||
|     xvst       U0,     P1,    0x00 | ||||
|     xvst       U1,     P1,    0x20 | ||||
|     xvst       U2,     P1,    0x40 | ||||
|     xvst       U3,     P1,    0x60 | ||||
|     xvst       U4,     P1,    0x80 | ||||
|     xvst       U5,     P1,    0xA0 | ||||
|     xvst       U6,     P1,    0xC0 | ||||
|     xvst       U7,     P1,    0xE0 | ||||
| 
 | ||||
|     xvld       U0,     S3,    0x00 | ||||
|     xvld       U1,     S3,    0x20 | ||||
|     xvld       U2,     S3,    0x40 | ||||
|     xvld       U3,     S3,    0x60 | ||||
|     xvld       U4,     S4,    0x00 | ||||
|     xvld       U5,     S4,    0x20 | ||||
|     xvld       U6,     S4,    0x40 | ||||
|     xvld       U7,     S4,    0x60 | ||||
| 
 | ||||
|     xvst       U0,     P1,    0x100 | ||||
|     xvst       U1,     P1,    0x120 | ||||
|     xvst       U2,     P1,    0x140 | ||||
|     xvst       U3,     P1,    0x160 | ||||
|     xvst       U4,     P1,    0x180 | ||||
|     xvst       U5,     P1,    0x1A0 | ||||
|     xvst       U6,     P1,    0x1C0 | ||||
|     xvst       U7,     P1,    0x1E0 | ||||
| 
 | ||||
|     xvld       U0,     S5,    0x00 | ||||
|     xvld       U1,     S5,    0x20 | ||||
|     xvld       U2,     S5,    0x40 | ||||
|     xvld       U3,     S5,    0x60 | ||||
|     xvld       U4,     S6,    0x00 | ||||
|     xvld       U5,     S6,    0x20 | ||||
|     xvld       U6,     S6,    0x40 | ||||
|     xvld       U7,     S6,    0x60 | ||||
| 
 | ||||
|     xvst       U0,     P1,    0x200 | ||||
|     xvst       U1,     P1,    0x220 | ||||
|     xvst       U2,     P1,    0x240 | ||||
|     xvst       U3,     P1,    0x260 | ||||
|     xvst       U4,     P1,    0x280 | ||||
|     xvst       U5,     P1,    0x2A0 | ||||
|     xvst       U6,     P1,    0x2C0 | ||||
|     xvst       U7,     P1,    0x2E0 | ||||
| 
 | ||||
|     xvld       U0,     S7,    0x00 | ||||
|     xvld       U1,     S7,    0x20 | ||||
|     xvld       U2,     S7,    0x40 | ||||
|     xvld       U3,     S7,    0x60 | ||||
|     xvld       U4,     S8,    0x00 | ||||
|     xvld       U5,     S8,    0x20 | ||||
|     xvld       U6,     S8,    0x40 | ||||
|     xvld       U7,     S8,    0x60 | ||||
| 
 | ||||
|     xvst       U0,     P1,    0x300 | ||||
|     xvst       U1,     P1,    0x320 | ||||
|     xvst       U2,     P1,    0x340 | ||||
|     xvst       U3,     P1,    0x360 | ||||
|     xvst       U4,     P1,    0x380 | ||||
|     xvst       U5,     P1,    0x3A0 | ||||
|     xvst       U6,     P1,    0x3C0 | ||||
|     xvst       U7,     P1,    0x3E0 | ||||
| /* Step every read cursor past the 16 elements just copied. */
 | ||||
|     addi.d     S1,     S1,    0x80 | ||||
|     addi.d     S2,     S2,    0x80 | ||||
|     addi.d     S3,     S3,    0x80 | ||||
|     addi.d     S4,     S4,    0x80 | ||||
|     addi.d     S5,     S5,    0x80 | ||||
|     addi.d     S6,     S6,    0x80 | ||||
|     addi.d     S7,     S7,    0x80 | ||||
|     addi.d     S8,     S8,    0x80 | ||||
|     addi.d     I,      I,     -1 | ||||
|     add.d      P1,     P1,    T1 /* next chunk of this group is M * 128 bytes further on */ | ||||
|     blt        ZERO,   I,     .L_I1 | ||||
| 
 | ||||
| .L_N15: /* N & 8: append 8 elements (0x40 bytes) from each of the 8 lines at P2 */ | ||||
|     andi       I,      N,     0x08 | ||||
|     beq        ZERO,   I,     .L_N7 | ||||
| 
 | ||||
|     xvld       U0,     S1,    0x00 | ||||
|     xvld       U1,     S1,    0x20 | ||||
|     xvld       U2,     S2,    0x00 | ||||
|     xvld       U3,     S2,    0x20 | ||||
|     xvld       U4,     S3,    0x00 | ||||
|     xvld       U5,     S3,    0x20 | ||||
|     xvld       U6,     S4,    0x00 | ||||
|     xvld       U7,     S4,    0x20 | ||||
| 
 | ||||
|     xvst       U0,     P2,    0x00 | ||||
|     xvst       U1,     P2,    0x20 | ||||
|     xvst       U2,     P2,    0x40 | ||||
|     xvst       U3,     P2,    0x60 | ||||
|     xvst       U4,     P2,    0x80 | ||||
|     xvst       U5,     P2,    0xA0 | ||||
|     xvst       U6,     P2,    0xC0 | ||||
|     xvst       U7,     P2,    0xE0 | ||||
| 
 | ||||
|     xvld       U0,     S5,    0x00 | ||||
|     xvld       U1,     S5,    0x20 | ||||
|     xvld       U2,     S6,    0x00 | ||||
|     xvld       U3,     S6,    0x20 | ||||
|     xvld       U4,     S7,    0x00 | ||||
|     xvld       U5,     S7,    0x20 | ||||
|     xvld       U6,     S8,    0x00 | ||||
|     xvld       U7,     S8,    0x20 | ||||
| 
 | ||||
|     xvst       U0,     P2,    0x100 | ||||
|     xvst       U1,     P2,    0x120 | ||||
|     xvst       U2,     P2,    0x140 | ||||
|     xvst       U3,     P2,    0x160 | ||||
|     xvst       U4,     P2,    0x180 | ||||
|     xvst       U5,     P2,    0x1A0 | ||||
|     xvst       U6,     P2,    0x1C0 | ||||
|     xvst       U7,     P2,    0x1E0 | ||||
| 
 | ||||
|     addi.d     S1,     S1,    0x40 | ||||
|     addi.d     S2,     S2,    0x40 | ||||
|     addi.d     S3,     S3,    0x40 | ||||
|     addi.d     S4,     S4,    0x40 | ||||
|     addi.d     S5,     S5,    0x40 | ||||
|     addi.d     S6,     S6,    0x40 | ||||
|     addi.d     S7,     S7,    0x40 | ||||
|     addi.d     S8,     S8,    0x40 | ||||
|     addi.d     P2,     P2,    0x200 /* 8 lines * 8 elements * 8 bytes */ | ||||
| 
 | ||||
| .L_N7: /* N & 4: append 4 elements (one 32-byte vector) from each of the 8 lines at P3 */ | ||||
|     andi       I,      N,     0x04 | ||||
|     beq        ZERO,   I,     .L_N3 | ||||
| 
 | ||||
|     xvld       U0,     S1,    0x00 | ||||
|     xvld       U1,     S2,    0x00 | ||||
|     xvld       U2,     S3,    0x00 | ||||
|     xvld       U3,     S4,    0x00 | ||||
|     xvld       U4,     S5,    0x00 | ||||
|     xvld       U5,     S6,    0x00 | ||||
|     xvld       U6,     S7,    0x00 | ||||
|     xvld       U7,     S8,    0x00 | ||||
| 
 | ||||
|     xvst       U0,     P3,    0x00 | ||||
|     xvst       U1,     P3,    0x20 | ||||
|     xvst       U2,     P3,    0x40 | ||||
|     xvst       U3,     P3,    0x60 | ||||
|     xvst       U4,     P3,    0x80 | ||||
|     xvst       U5,     P3,    0xA0 | ||||
|     xvst       U6,     P3,    0xC0 | ||||
|     xvst       U7,     P3,    0xE0 | ||||
| 
 | ||||
|     addi.d     S1,     S1,    0x20 | ||||
|     addi.d     S2,     S2,    0x20 | ||||
|     addi.d     S3,     S3,    0x20 | ||||
|     addi.d     S4,     S4,    0x20 | ||||
|     addi.d     S5,     S5,    0x20 | ||||
|     addi.d     S6,     S6,    0x20 | ||||
|     addi.d     S7,     S7,    0x20 | ||||
|     addi.d     S8,     S8,    0x20 | ||||
|     addi.d     P3,     P3,    0x100 /* 8 lines * 4 elements * 8 bytes */ | ||||
| 
 | ||||
| .L_N3: /* N & 2: append 2 elements (16 bytes) from each of the 8 lines at P4 */ | ||||
|     andi       I,      N,     0x02 | ||||
|     beq        ZERO,   I,     .L_N1 | ||||
| 
 | ||||
|     /* Only 16 bytes per line are consumed, so load with 128-bit LSX vld:   */ | ||||
|     /* $vrK is the low half of $xrK (UK). A 256-bit xvld here would read 16 */ | ||||
|     /* bytes beyond the chunk and could run past the end of the source.     */ | ||||
|     vld        $vr0,   S1,    0x00 | ||||
|     vld        $vr1,   S2,    0x00 | ||||
|     vld        $vr2,   S3,    0x00 | ||||
|     vld        $vr3,   S4,    0x00 | ||||
|     vld        $vr4,   S5,    0x00 | ||||
|     vld        $vr5,   S6,    0x00 | ||||
|     vld        $vr6,   S7,    0x00 | ||||
|     vld        $vr7,   S8,    0x00 | ||||
| 
 | ||||
|     /* imm 0x02: result.lo = xd.lo, result.hi = xj.lo -> pairs of lines share a vector */ | ||||
|     xvpermi.q  U0,     U1,    0x02 | ||||
|     xvpermi.q  U2,     U3,    0x02 | ||||
|     xvpermi.q  U4,     U5,    0x02 | ||||
|     xvpermi.q  U6,     U7,    0x02 | ||||
| 
 | ||||
|     xvst       U0,     P4,    0x00 | ||||
|     xvst       U2,     P4,    0x20 | ||||
|     xvst       U4,     P4,    0x40 | ||||
|     xvst       U6,     P4,    0x60 | ||||
| 
 | ||||
|     addi.d     S1,     S1,    0x10 | ||||
|     addi.d     S2,     S2,    0x10 | ||||
|     addi.d     S3,     S3,    0x10 | ||||
|     addi.d     S4,     S4,    0x10 | ||||
|     addi.d     S5,     S5,    0x10 | ||||
|     addi.d     S6,     S6,    0x10 | ||||
|     addi.d     S7,     S7,    0x10 | ||||
|     addi.d     S8,     S8,    0x10 | ||||
|     addi.d     P4,     P4,    0x80 /* 8 lines * 2 elements * 8 bytes */ | ||||
| 
 | ||||
| .L_N1: /* N & 1: append the last element of each of the 8 lines at P5 */ | ||||
|     andi       I,      N,     0x01 | ||||
|     beq        ZERO,   I,     .L_N0 | ||||
| 
 | ||||
|     fld.d      F0,     S1,    0x00 | ||||
|     fld.d      F1,     S2,    0x00 | ||||
|     fld.d      F2,     S3,    0x00 | ||||
|     fld.d      F3,     S4,    0x00 | ||||
|     fld.d      F4,     S5,    0x00 | ||||
|     fld.d      F5,     S6,    0x00 | ||||
|     fld.d      F6,     S7,    0x00 | ||||
|     fld.d      F7,     S8,    0x00 | ||||
| 
 | ||||
|     fst.d      F0,     P5,    0x00 | ||||
|     fst.d      F1,     P5,    0x08 | ||||
|     fst.d      F2,     P5,    0x10 | ||||
|     fst.d      F3,     P5,    0x18 | ||||
|     fst.d      F4,     P5,    0x20 | ||||
|     fst.d      F5,     P5,    0x28 | ||||
|     fst.d      F6,     P5,    0x30 | ||||
|     fst.d      F7,     P5,    0x38 | ||||
| 
 | ||||
|     addi.d     S1,     S1,    0x08 | ||||
|     addi.d     S2,     S2,    0x08 | ||||
|     addi.d     S3,     S3,    0x08 | ||||
|     addi.d     S4,     S4,    0x08 | ||||
|     addi.d     S5,     S5,    0x08 | ||||
|     addi.d     S6,     S6,    0x08 | ||||
|     addi.d     S7,     S7,    0x08 | ||||
|     addi.d     S8,     S8,    0x08 | ||||
|     addi.d     P5,     P5,    0x40 /* 8 lines * 1 element * 8 bytes */ | ||||
| 
 | ||||
| .L_N0: /* J was already decremented at the top of .L_J1; loop while groups remain */ | ||||
|     blt        ZERO,   J,     .L_J1 | ||||
| 
 | ||||
| .L_M7: /* M & 4: same copy as the main loop, for one group of 4 source lines */ | ||||
|     andi       J,      M,     0x04 | ||||
|     beq        ZERO,   J,     .L_M3 | ||||
| 
 | ||||
|     move       S1,     S0 /* S1..S4 = S0 + 0..3 * TL */ | ||||
|     add.d      S2,     S0,    TL | ||||
|     add.d      S3,     S1,    T0 | ||||
|     add.d      S4,     S2,    T0 | ||||
|     add.d      S0,     S3,    T0 /* S0 advances by 4 * TL */ | ||||
| 
 | ||||
|     move       P1,     P0 | ||||
|     addi.d     P0,     P0,    0x200 /* 0x200 = 4 lines * 16 elements * 8 bytes */ | ||||
| 
 | ||||
|     srai.d     I,      N,     0x04 | ||||
|     beq        ZERO,   I,     .L_4N15 | ||||
| 
 | ||||
| .L_4I1: /* I--: copy 16 elements from each of the 4 lines; line k lands at P1 + k * 0x80 */ | ||||
|     xvld       U0,     S1,    0x00 | ||||
|     xvld       U1,     S1,    0x20 | ||||
|     xvld       U2,     S1,    0x40 | ||||
|     xvld       U3,     S1,    0x60 | ||||
|     xvld       U4,     S2,    0x00 | ||||
|     xvld       U5,     S2,    0x20 | ||||
|     xvld       U6,     S2,    0x40 | ||||
|     xvld       U7,     S2,    0x60 | ||||
| 
 | ||||
|     xvst       U0,     P1,    0x00 | ||||
|     xvst       U1,     P1,    0x20 | ||||
|     xvst       U2,     P1,    0x40 | ||||
|     xvst       U3,     P1,    0x60 | ||||
|     xvst       U4,     P1,    0x80 | ||||
|     xvst       U5,     P1,    0xA0 | ||||
|     xvst       U6,     P1,    0xC0 | ||||
|     xvst       U7,     P1,    0xE0 | ||||
| 
 | ||||
|     xvld       U0,     S3,    0x00 | ||||
|     xvld       U1,     S3,    0x20 | ||||
|     xvld       U2,     S3,    0x40 | ||||
|     xvld       U3,     S3,    0x60 | ||||
|     xvld       U4,     S4,    0x00 | ||||
|     xvld       U5,     S4,    0x20 | ||||
|     xvld       U6,     S4,    0x40 | ||||
|     xvld       U7,     S4,    0x60 | ||||
| 
 | ||||
|     xvst       U0,     P1,    0x100 | ||||
|     xvst       U1,     P1,    0x120 | ||||
|     xvst       U2,     P1,    0x140 | ||||
|     xvst       U3,     P1,    0x160 | ||||
|     xvst       U4,     P1,    0x180 | ||||
|     xvst       U5,     P1,    0x1A0 | ||||
|     xvst       U6,     P1,    0x1C0 | ||||
|     xvst       U7,     P1,    0x1E0 | ||||
| 
 | ||||
|     addi.d     S1,     S1,    0x80 | ||||
|     addi.d     S2,     S2,    0x80 | ||||
|     addi.d     S3,     S3,    0x80 | ||||
|     addi.d     S4,     S4,    0x80 | ||||
|     addi.d     I,      I,     -1 | ||||
|     add.d      P1,     P1,    T1 /* same M * 128 byte step as the 8-line loop */ | ||||
|     blt        ZERO,   I,     .L_4I1 | ||||
| 
 | ||||
| .L_4N15: /* N & 8: append 8 elements from each of the 4 lines at P2 */ | ||||
|     andi       I,      N,     0x08 | ||||
|     beq        ZERO,   I,     .L_4N7 | ||||
| 
 | ||||
|     xvld       U0,     S1,    0x00 | ||||
|     xvld       U1,     S1,    0x20 | ||||
|     xvld       U2,     S2,    0x00 | ||||
|     xvld       U3,     S2,    0x20 | ||||
|     xvld       U4,     S3,    0x00 | ||||
|     xvld       U5,     S3,    0x20 | ||||
|     xvld       U6,     S4,    0x00 | ||||
|     xvld       U7,     S4,    0x20 | ||||
| 
 | ||||
|     xvst       U0,     P2,    0x00 | ||||
|     xvst       U1,     P2,    0x20 | ||||
|     xvst       U2,     P2,    0x40 | ||||
|     xvst       U3,     P2,    0x60 | ||||
|     xvst       U4,     P2,    0x80 | ||||
|     xvst       U5,     P2,    0xA0 | ||||
|     xvst       U6,     P2,    0xC0 | ||||
|     xvst       U7,     P2,    0xE0 | ||||
| 
 | ||||
|     addi.d     S1,     S1,    0x40 | ||||
|     addi.d     S2,     S2,    0x40 | ||||
|     addi.d     S3,     S3,    0x40 | ||||
|     addi.d     S4,     S4,    0x40 | ||||
|     addi.d     P2,     P2,    0x100 /* 4 lines * 8 elements * 8 bytes */ | ||||
| 
 | ||||
| .L_4N7: /* N & 4: append 4 elements (one vector) from each of the 4 lines at P3 */ | ||||
|     andi       I,      N,     0x04 | ||||
|     beq        ZERO,   I,     .L_4N3 | ||||
| 
 | ||||
|     xvld       U0,     S1,    0x00 | ||||
|     xvld       U1,     S2,    0x00 | ||||
|     xvld       U2,     S3,    0x00 | ||||
|     xvld       U3,     S4,    0x00 | ||||
| 
 | ||||
|     xvst       U0,     P3,    0x00 | ||||
|     xvst       U1,     P3,    0x20 | ||||
|     xvst       U2,     P3,    0x40 | ||||
|     xvst       U3,     P3,    0x60 | ||||
| 
 | ||||
|     addi.d     S1,     S1,    0x20 | ||||
|     addi.d     S2,     S2,    0x20 | ||||
|     addi.d     S3,     S3,    0x20 | ||||
|     addi.d     S4,     S4,    0x20 | ||||
|     addi.d     P3,     P3,    0x80 /* 4 lines * 4 elements * 8 bytes */ | ||||
| 
 | ||||
| .L_4N3: /* N & 2: append 2 elements (16 bytes) from each of the 4 lines at P4 */ | ||||
|     andi       I,      N,     0x02 | ||||
|     beq        ZERO,   I,     .L_4N1 | ||||
| 
 | ||||
|     /* Only 16 bytes per line are consumed, so load with 128-bit LSX vld:   */ | ||||
|     /* $vrK is the low half of $xrK (UK). A 256-bit xvld here would read 16 */ | ||||
|     /* bytes beyond the chunk and could run past the end of the source.     */ | ||||
|     vld        $vr0,   S1,    0x00 | ||||
|     vld        $vr1,   S2,    0x00 | ||||
|     vld        $vr2,   S3,    0x00 | ||||
|     vld        $vr3,   S4,    0x00 | ||||
| 
 | ||||
|     /* imm 0x02: result.lo = xd.lo, result.hi = xj.lo -> pairs of lines share a vector */ | ||||
|     xvpermi.q  U0,     U1,    0x02 | ||||
|     xvpermi.q  U2,     U3,    0x02 | ||||
| 
 | ||||
|     xvst       U0,     P4,    0x00 | ||||
|     xvst       U2,     P4,    0x20 | ||||
| 
 | ||||
|     addi.d     S1,     S1,    0x10 | ||||
|     addi.d     S2,     S2,    0x10 | ||||
|     addi.d     S3,     S3,    0x10 | ||||
|     addi.d     S4,     S4,    0x10 | ||||
|     addi.d     P4,     P4,    0x40 /* 4 lines * 2 elements * 8 bytes */ | ||||
| 
 | ||||
| .L_4N1: /* N & 1: append the last element of each of the 4 lines at P5 */ | ||||
|     andi        I,      N,     0x01 | ||||
|     beq         ZERO,   I,     .L_M3 | ||||
| 
 | ||||
|     fld.d      F0,     S1,    0x00 | ||||
|     fld.d      F1,     S2,    0x00 | ||||
|     fld.d      F2,     S3,    0x00 | ||||
|     fld.d      F3,     S4,    0x00 | ||||
| 
 | ||||
|     fst.d      F0,     P5,    0x00 | ||||
|     fst.d      F1,     P5,    0x08 | ||||
|     fst.d      F2,     P5,    0x10 | ||||
|     fst.d      F3,     P5,    0x18 | ||||
| 
 | ||||
|     addi.d     S1,     S1,    0x08 | ||||
|     addi.d     S2,     S2,    0x08 | ||||
|     addi.d     S3,     S3,    0x08 | ||||
|     addi.d     S4,     S4,    0x08 | ||||
|     addi.d     P5,     P5,    0x20 /* 4 lines * 1 element * 8 bytes */ | ||||
| 
 | ||||
| .L_M3: /* M & 2: same copy for one group of 2 source lines */ | ||||
|     andi       J,      M,     0x02 | ||||
|     beq        ZERO,   J,     .L_M1 | ||||
| 
 | ||||
|     move       S1,     S0 | ||||
|     add.d      S2,     S0,    TL | ||||
|     add.d      S0,     S0,    T0 /* S0 advances by 2 * TL */ | ||||
| 
 | ||||
|     move       P1,     P0 | ||||
|     addi.d     P0,     P0,    0x100 /* 0x100 = 2 lines * 16 elements * 8 bytes */ | ||||
| 
 | ||||
|     srai.d     I,      N,     0x04 | ||||
|     beq        ZERO,   I,     .L_2N15 | ||||
| 
 | ||||
| .L_2I1: /* I--: copy 16 elements from each of the 2 lines; S1's at P1, S2's at P1 + 0x80 */ | ||||
|     xvld       U0,     S1,    0x00 | ||||
|     xvld       U1,     S1,    0x20 | ||||
|     xvld       U2,     S1,    0x40 | ||||
|     xvld       U3,     S1,    0x60 | ||||
|     xvld       U4,     S2,    0x00 | ||||
|     xvld       U5,     S2,    0x20 | ||||
|     xvld       U6,     S2,    0x40 | ||||
|     xvld       U7,     S2,    0x60 | ||||
| 
 | ||||
|     xvst       U0,     P1,    0x00 | ||||
|     xvst       U1,     P1,    0x20 | ||||
|     xvst       U2,     P1,    0x40 | ||||
|     xvst       U3,     P1,    0x60 | ||||
|     xvst       U4,     P1,    0x80 | ||||
|     xvst       U5,     P1,    0xA0 | ||||
|     xvst       U6,     P1,    0xC0 | ||||
|     xvst       U7,     P1,    0xE0 | ||||
| 
 | ||||
|     addi.d     S1,     S1,    0x80 | ||||
|     addi.d     S2,     S2,    0x80 | ||||
|     addi.d     I,      I,     -1 | ||||
|     add.d      P1,     P1,    T1 /* same M * 128 byte step as the wider loops */ | ||||
|     blt        ZERO,   I,     .L_2I1 | ||||
| 
 | ||||
| .L_2N15: /* N & 8: append 8 elements from each of the 2 lines at P2 */ | ||||
|     andi       I,      N,     0x08 | ||||
|     beq        ZERO,   I,     .L_2N7 | ||||
| 
 | ||||
|     xvld       U0,     S1,    0x00 | ||||
|     xvld       U1,     S1,    0x20 | ||||
|     xvld       U2,     S2,    0x00 | ||||
|     xvld       U3,     S2,    0x20 | ||||
| 
 | ||||
|     xvst       U0,     P2,    0x00 | ||||
|     xvst       U1,     P2,    0x20 | ||||
|     xvst       U2,     P2,    0x40 | ||||
|     xvst       U3,     P2,    0x60 | ||||
| 
 | ||||
|     addi.d     S1,     S1,    0x40 | ||||
|     addi.d     S2,     S2,    0x40 | ||||
|     addi.d     P2,     P2,    0x80 /* 2 lines * 8 elements * 8 bytes */ | ||||
| 
 | ||||
| .L_2N7: /* N & 4: append 4 elements (one vector) from each of the 2 lines at P3 */ | ||||
|     andi       I,      N,     0x04 | ||||
|     beq        ZERO,   I,     .L_2N3 | ||||
| 
 | ||||
|     xvld       U0,     S1,    0x00 | ||||
|     xvld       U1,     S2,    0x00 | ||||
| 
 | ||||
|     xvst       U0,     P3,    0x00 | ||||
|     xvst       U1,     P3,    0x20 | ||||
| 
 | ||||
|     addi.d     S1,     S1,    0x20 | ||||
|     addi.d     S2,     S2,    0x20 | ||||
|     addi.d     P3,     P3,    0x40 /* 2 lines * 4 elements * 8 bytes */ | ||||
| 
 | ||||
| .L_2N3: /* N & 2: append 2 elements (16 bytes) from each of the 2 lines at P4 */ | ||||
|     andi       I,      N,     0x02 | ||||
|     beq        ZERO,   I,     .L_2N1 | ||||
| 
 | ||||
|     /* Only 16 bytes per line are consumed, so load with 128-bit LSX vld:   */ | ||||
|     /* $vrK is the low half of $xrK (UK). A 256-bit xvld here would read 16 */ | ||||
|     /* bytes beyond the chunk and could run past the end of the source.     */ | ||||
|     vld        $vr0,   S1,    0x00 | ||||
|     vld        $vr1,   S2,    0x00 | ||||
| 
 | ||||
|     /* imm 0x02: result.lo = xd.lo, result.hi = xj.lo -> both lines share one vector */ | ||||
|     xvpermi.q  U0,     U1,    0x02 | ||||
| 
 | ||||
|     xvst       U0,     P4,    0x00 | ||||
| 
 | ||||
|     addi.d     S1,     S1,    0x10 | ||||
|     addi.d     S2,     S2,    0x10 | ||||
|     addi.d     P4,     P4,    0x20 /* 2 lines * 2 elements * 8 bytes */ | ||||
| 
 | ||||
| .L_2N1: /* N & 1: append the last element of each of the 2 lines at P5 */ | ||||
|     andi       I,      N,     0x01 | ||||
|     beq        ZERO,   I,     .L_M1 | ||||
| 
 | ||||
|     fld.d      F0,     S1,    0x00 | ||||
|     fld.d      F1,     S2,    0x00 | ||||
| 
 | ||||
|     fst.d      F0,     P5,    0x00 | ||||
|     fst.d      F1,     P5,    0x08 | ||||
| 
 | ||||
|     addi.d     S1,     S1,    0x08 | ||||
|     addi.d     S2,     S2,    0x08 | ||||
|     addi.d     P5,     P5,    0x10 /* 2 lines * 1 element * 8 bytes */ | ||||
| 
 | ||||
| .L_M1: /* M & 1: copy the single remaining source line */ | ||||
|     andi       J,      M,     0x01 | ||||
|     beq        ZERO,   J,     .L_M0 | ||||
| 
 | ||||
|     move       S1,     S0 | ||||
|     add.d      S2,     S0,    TL /* NOTE(review): S2 is never read in this phase */ | ||||
| 
 | ||||
|     move       P1,     P0 | ||||
|     addi.d     P0,     P0,    0x80 /* 0x80 = 1 line * 16 elements * 8 bytes */ | ||||
| 
 | ||||
|     srai.d     I,      N,     0x04 | ||||
|     beq        ZERO,   I,     .L_1N15 | ||||
| 
 | ||||
| .L_1I1: /* I--: copy 16 elements (4 vectors) per iteration */ | ||||
|     xvld       U0,     S1,    0x00 | ||||
|     xvld       U1,     S1,    0x20 | ||||
|     xvld       U2,     S1,    0x40 | ||||
|     xvld       U3,     S1,    0x60 | ||||
| 
 | ||||
|     xvst       U0,     P1,    0x00 | ||||
|     xvst       U1,     P1,    0x20 | ||||
|     xvst       U2,     P1,    0x40 | ||||
|     xvst       U3,     P1,    0x60 | ||||
| 
 | ||||
|     addi.d     S1,     S1,    0x80 | ||||
|     addi.d     I,      I,     -1 | ||||
|     add.d      P1,     P1,    T1 /* same M * 128 byte step as the wider loops */ | ||||
|     blt        ZERO,   I,     .L_1I1 | ||||
| 
 | ||||
| .L_1N15: /* N & 8: append 8 elements at P2 */ | ||||
|     andi       I,      N,     0x08 | ||||
|     beq        ZERO,   I,     .L_1N7 | ||||
| 
 | ||||
|     xvld       U0,     S1,    0x00 | ||||
|     xvld       U1,     S1,    0x20 | ||||
| 
 | ||||
|     xvst       U0,     P2,    0x00 | ||||
|     xvst       U1,     P2,    0x20 | ||||
| 
 | ||||
|     addi.d     S1,     S1,    0x40 | ||||
|     addi.d     P2,     P2,    0x40 | ||||
| 
 | ||||
| .L_1N7: /* N & 4: append 4 elements (one vector) at P3 */ | ||||
|     andi       I,      N,     0x04 | ||||
|     beq        ZERO,   I,     .L_1N3 | ||||
| 
 | ||||
|     xvld       U0,     S1,    0x00 | ||||
| 
 | ||||
|     xvst       U0,     P3,    0x00 | ||||
| 
 | ||||
|     addi.d     S1,     S1,    0x20 | ||||
|     addi.d     P3,     P3,    0x20 | ||||
| 
 | ||||
| .L_1N3: /* N & 2: append 2 elements at P4 using two 8-byte scalar moves */ | ||||
|     andi       I,      N,     0x02 | ||||
|     beq        ZERO,   I,     .L_1N1 | ||||
| 
 | ||||
|     fld.d      F0,     S1,    0x00 | ||||
|     fld.d      F1,     S1,    0x08 | ||||
| 
 | ||||
|     fst.d      F0,     P4,    0x00 | ||||
|     fst.d      F1,     P4,    0x08 | ||||
| 
 | ||||
|     addi.d     S1,     S1,    0x10 | ||||
|     addi.d     P4,     P4,    0x10 | ||||
| 
 | ||||
| .L_1N1: /* N & 1: append the final element at P5 */ | ||||
|     andi       I,      N,     0x01 | ||||
|     beq        ZERO,   I,     .L_M0 | ||||
| 
 | ||||
|     fld.d      F0,     S1,    0x00 | ||||
| 
 | ||||
|     fst.d      F0,     P5,    0x00 | ||||
| 
 | ||||
|     addi.d     S1,     S1,    0x08 | ||||
|     addi.d     P5,     P5,    0x08 | ||||
| 
 | ||||
| .L_M0: /* Exit: reload $r23..$r29 from the slots filled at entry, release the 56-byte frame, return */ | ||||
|     LDARG      $r23,   $sp,   0 | ||||
|     LDARG      $r24,   $sp,   8 | ||||
|     LDARG      $r25,   $sp,   16 | ||||
|     LDARG      $r26,   $sp,   24 | ||||
|     LDARG      $r27,   $sp,   32 | ||||
|     LDARG      $r28,   $sp,   40 | ||||
|     LDARG      $r29,   $sp,   48 | ||||
|     addi.d     $sp,    $sp,   56 | ||||
|     jirl       $r0,    $r1,   0x00 /* jump to the address in $r1; link result goes to $r0 */ | ||||
| 
 | ||||
|     EPILOGUE | ||||
|  | @ -0,0 +1,270 @@ | |||
| /******************************************************************************* | ||||
| Copyright (c) 2021, The OpenBLAS Project | ||||
| All rights reserved. | ||||
| Redistribution and use in source and binary forms, with or without | ||||
| modification, are permitted provided that the following conditions are | ||||
| met: | ||||
| 1. Redistributions of source code must retain the above copyright | ||||
| notice, this list of conditions and the following disclaimer. | ||||
| 2. Redistributions in binary form must reproduce the above copyright | ||||
| notice, this list of conditions and the following disclaimer in | ||||
| the documentation and/or other materials provided with the | ||||
| distribution. | ||||
| 3. Neither the name of the OpenBLAS project nor the names of | ||||
| its contributors may be used to endorse or promote products | ||||
| derived from this software without specific prior written permission. | ||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | ||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | ||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | ||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | ||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | ||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | ||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | ||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | ||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | ||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | ||||
| *******************************************************************************/ | ||||
| #define ASSEMBLER | ||||
| 
 | ||||
| #include "common.h" | ||||
| /* Function parameters */ | ||||
| #define M      $r4    // param 1: m | ||||
| #define N      $r5    // param 2: n | ||||
| #define SRC    $r6    // param 3: src | ||||
| #define LDA    $r7    // param 4: lda | ||||
| #define DST    $r8    // param 5: dst | ||||
| 
 | ||||
| #define I      $r9 | ||||
| #define J      $r10 | ||||
| #define S0     $r11 | ||||
| #define S1     $r12 | ||||
| #define S2     $r13 | ||||
| #define S3     $r14 | ||||
| #define S4     $r15 | ||||
| #define P0     $r16 | ||||
| #define P1     $r17 | ||||
| #define P2     $r18 | ||||
| #define P3     $r19 | ||||
| #define T0     $r20 | ||||
| #define T1     $r23 | ||||
| #define TL     $r7 | ||||
| #define ZERO   $r0 | ||||
| 
 | ||||
| #define F0     $f0 | ||||
| #define F1     $f1 | ||||
| #define F2     $f2 | ||||
| #define F3     $f3 | ||||
| /* LASX vectors */ | ||||
| #define U0     $xr0 | ||||
| #define U1     $xr1 | ||||
| #define U2     $xr2 | ||||
| #define U3     $xr3 | ||||
| 
 | ||||
|     PROLOGUE | ||||
| 
 | ||||
|     addi.d     $sp,    $sp,   -8 | ||||
|     SDARG      $r23,   $sp,   0 | ||||
| 
 | ||||
|     move       S0,     SRC | ||||
|     move       P0,     DST | ||||
| 
 | ||||
|     srai.d     T0,     N,     0x02 | ||||
|     slli.d     T0,     T0,    0x02 | ||||
|     srai.d     T1,     N,     0x01 | ||||
|     slli.d     T1,     T1,    0x01 | ||||
|     mul.d      T0,     M,     T0 | ||||
|     mul.d      T1,     M,     T1 | ||||
|     slli.d     T0,     T0,    0x03 | ||||
|     slli.d     T1,     T1,    0x03 | ||||
|     add.d      P2,     DST,   T0 | ||||
|     add.d      P3,     DST,   T1 | ||||
| 
 | ||||
|     slli.d     TL,     LDA,   0x03 | ||||
|     srai.d     J,      M,     0x02 | ||||
|     slli.d     T0,     TL,    0x01 | ||||
|     slli.d     T1,     M,     0x05 | ||||
|     beq        ZERO,   J,     .L_M3 | ||||
| 
 | ||||
| .L_J1: /* J-- */ | ||||
|     /* Process four source lines per pass: S1..S4 = S0 + {0,1,2,3} * TL. */ | ||||
|     /* S0 is then advanced by 4 * TL for the next pass.                    */ | ||||
|     move       S1,     S0 | ||||
|     add.d      S2,     S0,    TL | ||||
|     add.d      S3,     S1,    T0 | ||||
|     add.d      S4,     S2,    T0 | ||||
|     add.d      S0,     S3,    T0 | ||||
| 
 | ||||
|     /* P1 is the write cursor for this pass; P0 moves on by 0x80 bytes. */ | ||||
|     move       P1,     P0 | ||||
|     addi.d     P0,     P0,    0x80 | ||||
| 
 | ||||
|     /* I = N / 4 full-vector steps; J is decremented here and tested at .L_N0. */ | ||||
|     srai.d     I,      N,     0x02 | ||||
|     addi.d     J,      J,     -1 | ||||
|     beq        ZERO,   I,     .L_N3 | ||||
| 
 | ||||
| .L_I1: /* I-- */ | ||||
|     /* Load one 32-byte vector from each of the four source lines ... */ | ||||
|     xvld       U0,     S1,    0x00 | ||||
|     xvld       U1,     S2,    0x00 | ||||
|     xvld       U2,     S3,    0x00 | ||||
|     xvld       U3,     S4,    0x00 | ||||
| 
 | ||||
|     /* ... and store them back to back as one 0x80-byte tile at P1. */ | ||||
|     xvst       U0,     P1,    0x00 | ||||
|     xvst       U1,     P1,    0x20 | ||||
|     xvst       U2,     P1,    0x40 | ||||
|     xvst       U3,     P1,    0x60 | ||||
| 
 | ||||
|     /* Sources advance 0x20 bytes; the destination jumps by T1 = M * 32 bytes. */ | ||||
|     addi.d     S1,     S1,    0x20 | ||||
|     addi.d     S2,     S2,    0x20 | ||||
|     addi.d     S3,     S3,    0x20 | ||||
|     addi.d     S4,     S4,    0x20 | ||||
|     add.d      P1,     P1,    T1 | ||||
| 
 | ||||
|     addi.d     I,      I,    -1 | ||||
|     blt        ZERO,   I,    .L_I1 | ||||
| 
 | ||||
| .L_N3: | ||||
|     /* Two-element tail, taken only when bit 1 of N is set. */ | ||||
|     andi       I,      N,    0x02 | ||||
|     beq        ZERO,   I,    .L_N1 | ||||
| 
 | ||||
|     /* Full 32-byte loads are issued even though the pointers only advance 0x10. */ | ||||
|     xvld       U0,     S1,    0x00 | ||||
|     xvld       U1,     S2,    0x00 | ||||
|     xvld       U2,     S3,    0x00 | ||||
|     xvld       U3,     S4,    0x00 | ||||
| 
 | ||||
|     /* NOTE(review): presumably packs the low 128-bit halves of each register */ | ||||
|     /* pair into one vector - confirm against the LASX xvpermi.q definition.   */ | ||||
|     xvpermi.q  U0,     U1,    0x02 | ||||
|     xvpermi.q  U2,     U3,    0x02 | ||||
| 
 | ||||
|     /* The two packed vectors (0x40 bytes) go to the P2 region. */ | ||||
|     xvst       U0,     P2,    0x00 | ||||
|     xvst       U2,     P2,    0x20 | ||||
| 
 | ||||
|     addi.d     S1,     S1,    0x10 | ||||
|     addi.d     S2,     S2,    0x10 | ||||
|     addi.d     S3,     S3,    0x10 | ||||
|     addi.d     S4,     S4,    0x10 | ||||
|     addi.d     P2,     P2,    0x40 | ||||
| 
 | ||||
| .L_N1: | ||||
|     /* One-element tail, taken only when bit 0 of N is set. */ | ||||
|     andi       I,      N,     0x01 | ||||
|     beq        ZERO,   I,     .L_N0 | ||||
| 
 | ||||
|     /* One 8-byte element from each of the four source lines ... */ | ||||
|     fld.d      F0,     S1,    0x00 | ||||
|     fld.d      F1,     S2,    0x00 | ||||
|     fld.d      F2,     S3,    0x00 | ||||
|     fld.d      F3,     S4,    0x00 | ||||
| 
 | ||||
|     /* ... stored consecutively (0x20 bytes) in the P3 region. */ | ||||
|     fst.d      F0,     P3,    0x00 | ||||
|     fst.d      F1,     P3,    0x08 | ||||
|     fst.d      F2,     P3,    0x10 | ||||
|     fst.d      F3,     P3,    0x18 | ||||
| 
 | ||||
|     addi.d     S1,     S1,    0x08 | ||||
|     addi.d     S2,     S2,    0x08 | ||||
|     addi.d     S3,     S3,    0x08 | ||||
|     addi.d     S4,     S4,    0x08 | ||||
|     addi.d     P3,     P3,    0x20 | ||||
| 
 | ||||
| .L_N0: | ||||
|     /* Repeat while groups of four source lines remain (J was decremented above). */ | ||||
|     blt        ZERO,   J,     .L_J1 | ||||
| 
 | ||||
| .L_M3: | ||||
|     /* Remainder of two source lines, taken only when bit 1 of M is set. */ | ||||
|     andi       J,      M,      0x02 | ||||
|     beq        ZERO,   J,      .L_M1 | ||||
| 
 | ||||
|     /* S1/S2 = the two lines at S0 and S0 + TL; S0 then skips both (T0 = 2 * TL). */ | ||||
|     move       S1,     S0 | ||||
|     add.d      S2,     S0,     TL | ||||
|     add.d      S0,     S0,     T0 | ||||
| 
 | ||||
|     /* P1 is the write cursor for this pass; P0 moves on by 0x40 bytes. */ | ||||
|     move       P1,     P0 | ||||
|     addi.d     P0,     P0,     0x40 | ||||
| 
 | ||||
|     /* I = N / 4 full-vector steps. */ | ||||
|     srai.d     I,      N,      0x02 | ||||
|     beq        ZERO,   I,      .L_2N3 | ||||
| 
 | ||||
| .L_2I1:   /* I-- */ | ||||
|     /* One 32-byte vector from each line, stored back to back at P1. */ | ||||
|     xvld       U0,     S1,     0x00 | ||||
|     xvld       U1,     S2,     0x00 | ||||
| 
 | ||||
|     xvst       U0,     P1,     0x00 | ||||
|     xvst       U1,     P1,     0x20 | ||||
| 
 | ||||
|     /* Sources advance 0x20 bytes; the destination jumps by T1 = M * 32 bytes. */ | ||||
|     addi.d     S1,     S1,     0x20 | ||||
|     addi.d     S2,     S2,     0x20 | ||||
|     addi.d     I,      I,      -1 | ||||
|     add.d      P1,     P1,     T1 | ||||
| 
 | ||||
|     blt        ZERO,   I,     .L_2I1 | ||||
| 
 | ||||
| .L_2N3: | ||||
|     /* Two-element tail, taken only when bit 1 of N is set. */ | ||||
|     andi       I,      N,     0x02 | ||||
|     beq        ZERO,   I,     .L_2N1 | ||||
| 
 | ||||
|     /* Full 32-byte loads are issued even though the pointers only advance 0x10. */ | ||||
|     xvld       U0,     S1,     0x00 | ||||
|     xvld       U1,     S2,     0x00 | ||||
| 
 | ||||
|     /* NOTE(review): presumably packs the low 128-bit halves of U0 and U1 into */ | ||||
|     /* U0 - confirm against the LASX xvpermi.q definition.                      */ | ||||
|     xvpermi.q  U0,     U1,     0x02 | ||||
| 
 | ||||
|     /* The packed vector (0x20 bytes) goes to the P2 region. */ | ||||
|     xvst       U0,     P2,     0x00 | ||||
| 
 | ||||
|     addi.d     S1,     S1,     0x10 | ||||
|     addi.d     S2,     S2,     0x10 | ||||
|     addi.d     P2,     P2,     0x20 | ||||
| 
 | ||||
| .L_2N1: | ||||
|     /* One-element tail of the two-line remainder: run only when bit 0 of N  */ | ||||
|     /* is set. The mask must be an AND (as in .L_N1 and .L_1N1); N + 1 is     */ | ||||
|     /* never zero for N >= 0, so an add here would make the tail run for      */ | ||||
|     /* even N as well and store 16 bytes beyond the odd-N output region.      */ | ||||
|     andi       I,      N,      0x01 | ||||
|     beq        ZERO,   I,      .L_M1 | ||||
| 
 | ||||
|     /* One 8-byte element from each of the two source lines ... */ | ||||
|     fld.d      F0,     S1,     0x00 | ||||
|     fld.d      F1,     S2,     0x00 | ||||
| 
 | ||||
|     /* ... stored consecutively (0x10 bytes) in the P3 region. */ | ||||
|     fst.d      F0,     P3,     0x00 | ||||
|     fst.d      F1,     P3,     0x08 | ||||
| 
 | ||||
|     addi.d     S1,     S1,     0x08 | ||||
|     addi.d     S2,     S2,     0x08 | ||||
|     addi.d     P3,     P3,     0x10 | ||||
| 
 | ||||
| .L_M1: | ||||
|     /* Remainder of a single source line, taken only when bit 0 of M is set. */ | ||||
|     andi       J,      M,      0x01 | ||||
|     beq        ZERO,   J,      .L_M0 | ||||
| 
 | ||||
|     /* S0 and P0 are not advanced here; nothing after this section reads them. */ | ||||
|     move       S1,     S0 | ||||
|     move       P1,     P0 | ||||
| 
 | ||||
|     /* I = N / 4 full-vector steps. */ | ||||
|     srai.d     I,      N,      0x02 | ||||
|     beq        ZERO,   I,      .L_1N3 | ||||
| 
 | ||||
| .L_1I1: | ||||
|     /* Copy one 32-byte vector from the source line to P1. */ | ||||
|     xvld       U0,    S1,      0x00 | ||||
| 
 | ||||
|     xvst       U0,    P1,      0x00 | ||||
| 
 | ||||
|     /* Source advances 0x20 bytes; the destination jumps by T1 = M * 32 bytes. */ | ||||
|     addi.d     S1,    S1,      0x20 | ||||
|     addi.d     I,     I,       -1 | ||||
|     add.d      P1,    P1,      T1 | ||||
| 
 | ||||
|     blt        ZERO,  I,       .L_1I1 | ||||
| 
 | ||||
| .L_1N3: | ||||
|     /* Two-element tail, taken only when bit 1 of N is set. */ | ||||
|     andi       I,     N,       0x02 | ||||
|     beq        I,     ZERO,    .L_1N1 | ||||
| 
 | ||||
|     /* Scalar loads are used here, so exactly 0x10 bytes are read. */ | ||||
|     fld.d      F0,    S1,      0x00 | ||||
|     fld.d      F1,    S1,      0x08 | ||||
| 
 | ||||
|     fst.d      F0,    P2,      0x00 | ||||
|     fst.d      F1,    P2,      0x08 | ||||
| 
 | ||||
|     addi.d     S1,    S1,      0x10 | ||||
|     addi.d     P2,    P2,      0x10 | ||||
| 
 | ||||
| .L_1N1: | ||||
|     /* One-element tail, taken only when bit 0 of N is set. */ | ||||
|     andi       I,     N,       0x01 | ||||
|     beq        I,     ZERO,    .L_M0 | ||||
| 
 | ||||
|     /* Final single 8-byte element goes to the P3 region. */ | ||||
|     fld.d      F0,    S1,      0x00 | ||||
| 
 | ||||
|     fst.d      F0,    P3,      0x00 | ||||
| 
 | ||||
| .L_M0: | ||||
|     /* Common exit: reload $r23 from the stack slot written at entry, */ | ||||
|     /* release the 8-byte frame and return through $r1.                */ | ||||
|     LDARG      $r23,   $sp,   0 | ||||
|     addi.d     $sp,    $sp,   8 | ||||
|     jirl       $r0,    $r1,   0x00 | ||||
| 
 | ||||
|     EPILOGUE | ||||
|  | @ -39,11 +39,19 @@ IZAMINKERNEL = izamax.S | |||
| endif | ||||
| 
 | ||||
| ifndef ISMINKERNEL | ||||
| ISMINKERNEL = iamax.S | ||||
| ISMINKERNEL = imax.S | ||||
| endif | ||||
| 
 | ||||
| ifndef IDMINKERNEL | ||||
| IDMINKERNEL = iamax.S | ||||
| IDMINKERNEL = imax.S | ||||
| endif | ||||
| 
 | ||||
| ifndef ISMAXKERNEL | ||||
| ISMAXKERNEL = imax.S | ||||
| endif | ||||
| 
 | ||||
| ifndef IDMAXKERNEL | ||||
| IDMAXKERNEL = imax.S | ||||
| endif | ||||
| 
 | ||||
| ifndef SNRM2KERNEL | ||||
|  |  | |||
|  | @ -130,7 +130,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
|         mode = BLAS_DOUBLE | BLAS_COMPLEX; | ||||
| #endif | ||||
|         blas_level1_thread_with_return_value(mode, n, 0, 0, dummy_alpha, x, inc_x,  | ||||
|                 NULL, 0, result, 0, (void *)asum_thread_function, nthreads); | ||||
|                 NULL, 0, result, 0, (int (*)(void))asum_thread_function, nthreads); | ||||
|         ptr = (FLOAT *)result; | ||||
|         for (i = 0; i < nthreads; i++) { | ||||
|             sumf += (*ptr); | ||||
|  |  | |||
|  | @ -114,7 +114,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| #else | ||||
|         mode = BLAS_DOUBLE | BLAS_REAL; | ||||
| #endif | ||||
|         blas_level1_thread_with_return_value(mode, n, 0, 0, &dummy_alpha, x, inc_x, NULL, 0, result, 0, (void *)asum_thread_function, nthreads); | ||||
|         blas_level1_thread_with_return_value(mode, n, 0, 0, &dummy_alpha, x, inc_x, NULL, 0, result, 0, (int (*)(void))asum_thread_function, nthreads); | ||||
|         ptr = (FLOAT *)result; | ||||
|         for (i = 0; i < nthreads; i++) { | ||||
|             sumf += (*ptr); | ||||
|  |  | |||
|  | @ -190,7 +190,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) | |||
| #endif | ||||
| 		blas_level1_thread_with_return_value(mode, n, 0, 0, &dummy_alpha, | ||||
| 				   x, inc_x, y, inc_y, result, 0, | ||||
| 				   ( void *)dot_thread_function, nthreads); | ||||
| 				    (int (*)(void)) dot_thread_function, nthreads); | ||||
| 
 | ||||
| 		ptr = (RETURN_TYPE *)result; | ||||
| 		for (i = 0; i < nthreads; i++) { | ||||
|  |  | |||
|  | @ -196,7 +196,7 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT | |||
| #else | ||||
| 	    int mode = BLAS_SINGLE | BLAS_REAL | BLAS_PTHREAD; | ||||
| #endif | ||||
| 	    blas_level1_thread(mode, n, 0, 0, alpha, x, inc_x, y, inc_y, &dummy_c, 0, (void *)rot_thread_function, nthreads); | ||||
| 	    blas_level1_thread(mode, n, 0, 0, alpha, x, inc_x, y, inc_y, &dummy_c, 0, (int (*)(void))rot_thread_function, nthreads); | ||||
|     } | ||||
| #else	 | ||||
|     rot_compute(n, x, inc_x, y, inc_y, c, s); | ||||
|  |  | |||
|  | @ -123,7 +123,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| #else | ||||
|         mode = BLAS_DOUBLE | BLAS_REAL; | ||||
| #endif | ||||
|         blas_level1_thread_with_return_value(mode, n, 0, 0, &dummy_alpha, x, inc_x, NULL, 0, result, 0, (void *)asum_thread_function, nthreads); | ||||
|         blas_level1_thread_with_return_value(mode, n, 0, 0, &dummy_alpha, x, inc_x, NULL, 0, result, 0, (int (*)(void))asum_thread_function, nthreads); | ||||
|         ptr = (FLOAT *)result; | ||||
|         for (i = 0; i < nthreads; i++) { | ||||
|             sumf += (*ptr); | ||||
|  |  | |||
|  | @ -198,7 +198,7 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT | |||
| #else | ||||
| 	    int mode = BLAS_SINGLE | BLAS_REAL | BLAS_PTHREAD; | ||||
| #endif | ||||
| 	    blas_level1_thread(mode, n, 0, 0, alpha, x, inc_x, y, inc_y, &dummy_c, 0, (void *)rot_thread_function, nthreads); | ||||
| 	    blas_level1_thread(mode, n, 0, 0, alpha, x, inc_x, y, inc_y, &dummy_c, 0, (int (*)(void))rot_thread_function, nthreads); | ||||
|     } | ||||
| #else	 | ||||
|     rot_compute(n, x, inc_x, y, inc_y, c, s); | ||||
|  |  | |||
|  | @ -130,7 +130,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
|         mode = BLAS_DOUBLE | BLAS_COMPLEX; | ||||
| #endif | ||||
|         blas_level1_thread_with_return_value(mode, n, 0, 0, dummy_alpha, x, inc_x,  | ||||
|                 NULL, 0, result, 0, (void *)asum_thread_function, nthreads); | ||||
|                 NULL, 0, result, 0, (int (*)(void))asum_thread_function, nthreads); | ||||
|         ptr = (FLOAT *)result; | ||||
|         for (i = 0; i < nthreads; i++) { | ||||
|             sumf += (*ptr); | ||||
|  |  | |||
|  | @ -215,7 +215,7 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLA | |||
| 
 | ||||
| 		blas_level1_thread_with_return_value(mode, n, 0, 0, &dummy_alpha, | ||||
| 				   x, inc_x, y, inc_y, result, 0, | ||||
| 				   ( void *)zdot_thread_function, nthreads); | ||||
| 				   (int (*)(void))zdot_thread_function, nthreads); | ||||
| 
 | ||||
| 		ptr = (OPENBLAS_COMPLEX_FLOAT *)result; | ||||
| 		for (i = 0; i < nthreads; i++) { | ||||
|  |  | |||
|  | @ -97,8 +97,6 @@ | |||
| *> \author Univ. of Colorado Denver | ||||
| *> \author NAG Ltd. | ||||
| * | ||||
| *> \date December 2016 | ||||
| * | ||||
| *> \ingroup complexGEcomputational | ||||
| * | ||||
| *> \par Further Details: | ||||
|  | @ -127,10 +125,9 @@ | |||
| *  ===================================================================== | ||||
|       SUBROUTINE CGEQRT2( M, N, A, LDA, T, LDT, INFO ) | ||||
| * | ||||
| *  -- LAPACK computational routine (version 3.7.0) -- | ||||
| *  -- LAPACK computational routine -- | ||||
| *  -- LAPACK is a software package provided by Univ. of Tennessee,    -- | ||||
| *  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- | ||||
| *     December 2016 | ||||
| * | ||||
| *     .. Scalar Arguments .. | ||||
|       INTEGER   INFO, LDA, LDT, M, N | ||||
|  | @ -157,10 +154,10 @@ | |||
| *     Test the input arguments | ||||
| * | ||||
|       INFO = 0 | ||||
|       IF( M.LT.0 ) THEN | ||||
|          INFO = -1 | ||||
|       ELSE IF( N.LT.0 ) THEN | ||||
|       IF( N.LT.0 ) THEN | ||||
|          INFO = -2 | ||||
|       ELSE IF( M.LT.N ) THEN | ||||
|          INFO = -1 | ||||
|       ELSE IF( LDA.LT.MAX( 1, M ) ) THEN | ||||
|          INFO = -4 | ||||
|       ELSE IF( LDT.LT.MAX( 1, N ) ) THEN | ||||
|  |  | |||
|  | @ -97,8 +97,6 @@ | |||
| *> \author Univ. of Colorado Denver | ||||
| *> \author NAG Ltd. | ||||
| * | ||||
| *> \date December 2016 | ||||
| * | ||||
| *> \ingroup doubleGEcomputational | ||||
| * | ||||
| *> \par Further Details: | ||||
|  | @ -127,10 +125,9 @@ | |||
| *  ===================================================================== | ||||
|       SUBROUTINE DGEQRT2( M, N, A, LDA, T, LDT, INFO ) | ||||
| * | ||||
| *  -- LAPACK computational routine (version 3.7.0) -- | ||||
| *  -- LAPACK computational routine -- | ||||
| *  -- LAPACK is a software package provided by Univ. of Tennessee,    -- | ||||
| *  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- | ||||
| *     December 2016 | ||||
| * | ||||
| *     .. Scalar Arguments .. | ||||
|       INTEGER   INFO, LDA, LDT, M, N | ||||
|  | @ -157,10 +154,10 @@ | |||
| *     Test the input arguments | ||||
| * | ||||
|       INFO = 0 | ||||
|       IF( M.LT.0 ) THEN | ||||
|          INFO = -1 | ||||
|       ELSE IF( N.LT.0 ) THEN | ||||
|       IF( N.LT.0 ) THEN | ||||
|          INFO = -2 | ||||
|       ELSE IF( M.LT.N ) THEN | ||||
|          INFO = -1 | ||||
|       ELSE IF( LDA.LT.MAX( 1, M ) ) THEN | ||||
|          INFO = -4 | ||||
|       ELSE IF( LDT.LT.MAX( 1, N ) ) THEN | ||||
|  |  | |||
|  | @ -97,8 +97,6 @@ | |||
| *> \author Univ. of Colorado Denver | ||||
| *> \author NAG Ltd. | ||||
| * | ||||
| *> \date December 2016 | ||||
| * | ||||
| *> \ingroup realGEcomputational | ||||
| * | ||||
| *> \par Further Details: | ||||
|  | @ -127,10 +125,9 @@ | |||
| *  ===================================================================== | ||||
|       SUBROUTINE SGEQRT2( M, N, A, LDA, T, LDT, INFO ) | ||||
| * | ||||
| *  -- LAPACK computational routine (version 3.7.0) -- | ||||
| *  -- LAPACK computational routine -- | ||||
| *  -- LAPACK is a software package provided by Univ. of Tennessee,    -- | ||||
| *  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- | ||||
| *     December 2016 | ||||
| * | ||||
| *     .. Scalar Arguments .. | ||||
|       INTEGER   INFO, LDA, LDT, M, N | ||||
|  | @ -157,10 +154,10 @@ | |||
| *     Test the input arguments | ||||
| * | ||||
|       INFO = 0 | ||||
|       IF( M.LT.0 ) THEN | ||||
|          INFO = -1 | ||||
|       ELSE IF( N.LT.0 ) THEN | ||||
|       IF( N.LT.0 ) THEN | ||||
|          INFO = -2 | ||||
|       ELSE IF( M.LT.N ) THEN | ||||
|          INFO = -1 | ||||
|       ELSE IF( LDA.LT.MAX( 1, M ) ) THEN | ||||
|          INFO = -4 | ||||
|       ELSE IF( LDT.LT.MAX( 1, N ) ) THEN | ||||
|  |  | |||
|  | @ -97,8 +97,6 @@ | |||
| *> \author Univ. of Colorado Denver | ||||
| *> \author NAG Ltd. | ||||
| * | ||||
| *> \date December 2016 | ||||
| * | ||||
| *> \ingroup complex16GEcomputational | ||||
| * | ||||
| *> \par Further Details: | ||||
|  | @ -127,10 +125,9 @@ | |||
| *  ===================================================================== | ||||
|       SUBROUTINE ZGEQRT2( M, N, A, LDA, T, LDT, INFO ) | ||||
| * | ||||
| *  -- LAPACK computational routine (version 3.7.0) -- | ||||
| *  -- LAPACK computational routine -- | ||||
| *  -- LAPACK is a software package provided by Univ. of Tennessee,    -- | ||||
| *  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- | ||||
| *     December 2016 | ||||
| * | ||||
| *     .. Scalar Arguments .. | ||||
|       INTEGER   INFO, LDA, LDT, M, N | ||||
|  | @ -157,10 +154,10 @@ | |||
| *     Test the input arguments | ||||
| * | ||||
|       INFO = 0 | ||||
|       IF( M.LT.0 ) THEN | ||||
|          INFO = -1 | ||||
|       ELSE IF( N.LT.0 ) THEN | ||||
|       IF( N.LT.0 ) THEN | ||||
|          INFO = -2 | ||||
|       ELSE IF( M.LT.N ) THEN | ||||
|          INFO = -1 | ||||
|       ELSE IF( LDA.LT.MAX( 1, M ) ) THEN | ||||
|          INFO = -4 | ||||
|       ELSE IF( LDT.LT.MAX( 1, N ) ) THEN | ||||
|  |  | |||
|  | @ -66,6 +66,7 @@ ZMATGEN = zlatms.o zlatme.o zlatmr.o zlatmt.o \ | |||
| endif | ||||
| 
 | ||||
| .PHONY: all | ||||
| .NOTPARALLEL: | ||||
| all: $(TMGLIB) | ||||
| 
 | ||||
| ALLOBJ = $(SMATGEN) $(CMATGEN) $(SCATGEN) $(DMATGEN) $(ZMATGEN) \
 | ||||
|  |  | |||
|  | @ -662,7 +662,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, | |||
| 
 | ||||
|     blas_level1_thread(mode, bk, is + bk + offset + 1, mn + offset, (void *)dummyalpha, | ||||
| 		       a + (- offset + is * lda) * COMPSIZE, lda, NULL, 0, | ||||
| 		       ipiv, 1, (void *)LASWP_PLUS, args -> nthreads); | ||||
| 		       ipiv, 1, (int (*)(void))LASWP_PLUS, args -> nthreads); | ||||
| 
 | ||||
|     is += bk; | ||||
|   } | ||||
|  |  | |||
|  | @ -57,10 +57,9 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG | |||
|   a--; | ||||
|   k1 --; | ||||
| 
 | ||||
| #ifndef MINUS | ||||
|  ipiv += k1; | ||||
| #else | ||||
|   ipiv -= (k2 - 1) * incx; | ||||
| #ifdef MINUS | ||||
|   ipiv -= (k2 - k1 - 1) * incx; | ||||
| #endif | ||||
| 
 | ||||
|   if (n  <= 0) return 0; | ||||
|  |  | |||
|  | @ -59,10 +59,9 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG | |||
|   a--; | ||||
|   k1 --; | ||||
| 
 | ||||
| #ifndef MINUS | ||||
|  ipiv += k1; | ||||
| #else | ||||
|   ipiv -= (k2 - 1) * incx; | ||||
| #ifdef MINUS | ||||
|   ipiv -= (k2 - k1 - 1) * incx; | ||||
| #endif | ||||
| 
 | ||||
|   if (n  <= 0) return 0; | ||||
|  |  | |||
|  | @ -65,10 +65,9 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG | |||
|   a--; | ||||
|   k1 --; | ||||
| 
 | ||||
| #ifndef MINUS | ||||
|  ipiv += k1; | ||||
| #else | ||||
|   ipiv -= (k2 - 1) * incx; | ||||
| #ifdef MINUS | ||||
|   ipiv -= (k2 - k1 - 1) * incx; | ||||
| #endif | ||||
| 
 | ||||
|   if (n  <= 0) return 0; | ||||
|  |  | |||
|  | @ -78,10 +78,9 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG | |||
|   a--; | ||||
|   k1 --; | ||||
| 
 | ||||
| #ifndef MINUS | ||||
|   ipiv += k1; | ||||
| #else | ||||
|   ipiv -= (k2 - 1) * incx; | ||||
| #ifdef MINUS | ||||
|   ipiv -= (k2 - k1 - 1) * incx; | ||||
| #endif | ||||
| 
 | ||||
|   if (n  <= 0) return 0; | ||||
|  |  | |||
|  | @ -59,10 +59,9 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4, | |||
|   lda *= 2; | ||||
|   k1 --; | ||||
| 
 | ||||
| #ifndef MINUS | ||||
|  ipiv += k1; | ||||
| #else | ||||
|   ipiv -= (k2 - 1) * incx; | ||||
| #ifdef MINUS | ||||
|   ipiv -= (k2 - k1 - 1) * incx; | ||||
| #endif | ||||
| 
 | ||||
|   if (n  <= 0) return 0; | ||||
|  |  | |||
|  | @ -60,10 +60,9 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4, | |||
|   lda *= 2; | ||||
|   k1 --; | ||||
| 
 | ||||
| #ifndef MINUS | ||||
|  ipiv += k1; | ||||
| #else | ||||
|   ipiv -= (k2 - 1) * incx; | ||||
| #ifdef MINUS | ||||
|   ipiv -= (k2 - k1 - 1) * incx; | ||||
| #endif | ||||
| 
 | ||||
|   if (n  <= 0) return 0; | ||||
|  |  | |||
Some files were not shown because too many files have changed in this diff Show More
		Loading…
	
		Reference in New Issue