Merge pull request #8 from xianyi/develop

rebase
This commit is contained in:
Martin Kroeker 2021-01-20 15:38:30 +01:00 committed by GitHub
commit 3612d9a57a
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
21 changed files with 310 additions and 21 deletions

View File

@ -44,6 +44,11 @@ jobs:
if: github.event_name != 'pull_request'
run: brew update || true
- name: unlink installed gcc to allow updating
run: |
brew unlink gcc@8
brew unlink gcc@9
- name: Install prerequisites
run: brew install --fetch-HEAD --HEAD --only-dependencies --keep-tmp openblas

View File

@ -1,4 +1,4 @@
ifneq ($(C_COMPILER), PGI)
ifeq ($(CORE), ARMV8)
CCOMMON_OPT += -march=armv8-a
FCOMMON_OPT += -march=armv8-a
@ -77,4 +77,4 @@ CCOMMON_OPT += -march=armv8.2-a -mtune=tsv110
FCOMMON_OPT += -march=armv8.2-a -mtune=tsv110
endif
endif
endif

View File

@ -1279,6 +1279,10 @@ CCOMMON_OPT += -DUSE_PAPI
EXTRALIB += -lpapi -lperfctr
endif
ifdef BUFFERSIZE
CCOMMON_OPT += -DBUFFERSIZE=$(BUFFERSIZE)
endif
ifdef DYNAMIC_THREADS
CCOMMON_OPT += -DDYNAMIC_THREADS
endif

View File

@ -125,9 +125,14 @@ void cblas_zswap(OPENBLAS_CONST blasint n, void *x, OPENBLAS_CONST blasint incx,
void cblas_srot(OPENBLAS_CONST blasint N, float *X, OPENBLAS_CONST blasint incX, float *Y, OPENBLAS_CONST blasint incY, OPENBLAS_CONST float c, OPENBLAS_CONST float s);
void cblas_drot(OPENBLAS_CONST blasint N, double *X, OPENBLAS_CONST blasint incX, double *Y, OPENBLAS_CONST blasint incY, OPENBLAS_CONST double c, OPENBLAS_CONST double s);
void cblas_csrot(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx, void *y, OPENBLAS_CONST blasint incY, OPENBLAS_CONST float c, OPENBLAS_CONST float s);
void cblas_zdrot(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx, void *y, OPENBLAS_CONST blasint incY, OPENBLAS_CONST double c, OPENBLAS_CONST double s);
void cblas_srotg(float *a, float *b, float *c, float *s);
void cblas_drotg(double *a, double *b, double *c, double *s);
void cblas_crotg(void *a, void *b, float *c, void *s);
void cblas_zrotg(void *a, void *b, double *c, void *s);
void cblas_srotm(OPENBLAS_CONST blasint N, float *X, OPENBLAS_CONST blasint incX, float *Y, OPENBLAS_CONST blasint incY, OPENBLAS_CONST float *P);
void cblas_drotm(OPENBLAS_CONST blasint N, double *X, OPENBLAS_CONST blasint incX, double *Y, OPENBLAS_CONST blasint incY, OPENBLAS_CONST double *P);

View File

@ -74,6 +74,9 @@ macro(ParseMakefileVars MAKEFILE_IN)
string(REGEX MATCH "ifneq \\(\\$\\(([_A-Z]+)\\),[ \t]*([0-9_A-Z]+)\\)" line_match "${makefile_line}")
if (NOT "${line_match}" STREQUAL "")
# message(STATUS "IFNEQ: ${line_match} first: ${CMAKE_MATCH_1} second: ${CMAKE_MATCH_2}")
if ( ${CMAKE_MATCH_1} STREQUAL C_COMPILER)
set (CMAKE_MATCH_1 CMAKE_C_COMPILER)
endif ()
if (NOT ( ${${CMAKE_MATCH_1}} STREQUAL ${CMAKE_MATCH_2}))
# message (STATUS "condition is true")
set (IfElse 1)

View File

@ -39,7 +39,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define INLINE inline
#ifdef F_INTERFACE_FLANG
#if defined( F_INTERFACE_FLANG) || defined(F_INTERFACE_PGI)
#define RETURN_BY_STACK
#else
#define RETURN_BY_COMPLEX

View File

@ -1436,6 +1436,15 @@ int get_cpuname(void){
return CPUTYPE_SANDYBRIDGE;
else
return CPUTYPE_NEHALEM;
case 7: // Rocket Lake
if(support_avx512())
return CPUTYPE_SKYLAKEX;
if(support_avx2())
return CPUTYPE_HASWELL;
if(support_avx())
return CPUTYPE_SANDYBRIDGE;
else
return CPUTYPE_NEHALEM;
}
break;
}
@ -2014,6 +2023,19 @@ int get_coretype(void){
#endif
else
return CORE_NEHALEM;
case 7:// Rocket Lake
#ifndef NO_AVX512
if(support_avx512())
return CORE_SKYLAKEX;
#endif
#ifndef NO_AVX2
if(support_avx2())
return CORE_HASWELL;
#endif
if(support_avx())
return CORE_SANDYBRIDGE;
else
return CORE_NEHALEM;
}
case 5:
switch (model) {

View File

@ -656,7 +656,7 @@ static gotoblas_t *get_coretype(void){
}
}
case 10:
if (model == 5 || model == 6) {
if (model == 5 || model == 6) {
if(support_avx2())
return &gotoblas_HASWELL;
if(support_avx()) {
@ -666,7 +666,20 @@ static gotoblas_t *get_coretype(void){
openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels.
}
}
}
if (model == 7) {
if (support_avx512())
return &gotoblas_SKYLAKEX;
if(support_avx2())
return &gotoblas_HASWELL;
if(support_avx()) {
openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK);
return &gotoblas_SANDYBRIDGE;
} else {
openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels.
}
}
return NULL;
}
case 0xf:

View File

@ -68,7 +68,7 @@ extern void openblas_warning(int verbose, const char * msg);
#endif
#define get_cpu_ftr(id, var) ({ \
__asm__("mrs %0, "#id : "=r" (var)); \
__asm__ __volatile__("mrs %0, "#id : "=r" (var)); \
})
static char *corename[] = {

View File

@ -316,7 +316,7 @@ CCBLAS1OBJS = \
cblas_cscal.$(SUFFIX) cblas_csscal.$(SUFFIX) \
cblas_cswap.$(SUFFIX) cblas_scnrm2.$(SUFFIX) \
cblas_caxpby.$(SUFFIX) \
cblas_icmin.$(SUFFIX) cblas_icmax.$(SUFFIX) cblas_scsum.$(SUFFIX)
cblas_icmin.$(SUFFIX) cblas_icmax.$(SUFFIX) cblas_scsum.$(SUFFIX) cblas_csrot.$(SUFFIX) cblas_crotg.$(SUFFIX)
CCBLAS2OBJS = \
cblas_cgemv.$(SUFFIX) cblas_cgerc.$(SUFFIX) cblas_cgeru.$(SUFFIX) \
@ -346,7 +346,7 @@ CZBLAS1OBJS = \
cblas_zscal.$(SUFFIX) cblas_zdscal.$(SUFFIX) \
cblas_zswap.$(SUFFIX) cblas_dznrm2.$(SUFFIX) \
cblas_zaxpby.$(SUFFIX) \
cblas_izmin.$(SUFFIX) cblas_izmax.$(SUFFIX) cblas_dzsum.$(SUFFIX)
cblas_izmin.$(SUFFIX) cblas_izmax.$(SUFFIX) cblas_dzsum.$(SUFFIX) cblas_zdrot.$(SUFFIX) cblas_zrotg.$(SUFFIX)
CZBLAS2OBJS = \
@ -1634,6 +1634,12 @@ cblas_srotg.$(SUFFIX) cblas_srotg.$(PSUFFIX): rotg.c
cblas_drotg.$(SUFFIX) cblas_drotg.$(PSUFFIX): rotg.c
$(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F)
# CBLAS wrappers for the complex rotation generators, built from zrotg.c with
# -DCBLAS. The profiled object must carry the same cblas_ prefix as the regular
# one (as the cblas_srotg/cblas_drotg rules do); naming it crotg.$(PSUFFIX) /
# zrotg.$(PSUFFIX) would leave cblas_*.$(PSUFFIX) without a rule and attach this
# -DCBLAS recipe to the unprefixed object name instead.
cblas_crotg.$(SUFFIX) cblas_crotg.$(PSUFFIX): zrotg.c
	$(CC) -c $(CFLAGS) -DCBLAS $< -o $(@F)
cblas_zrotg.$(SUFFIX) cblas_zrotg.$(PSUFFIX): zrotg.c
	$(CC) -c $(CFLAGS) -DCBLAS $< -o $(@F)
cblas_srotm.$(SUFFIX) cblas_srotm.$(PSUFFIX): rotm.c
$(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F)
@ -1664,6 +1670,12 @@ cblas_csscal.$(SUFFIX) cblas_csscal.$(PSUFFIX) : zscal.c
cblas_zdscal.$(SUFFIX) cblas_zdscal.$(PSUFFIX) : zscal.c
$(CC) $(CFLAGS) -DCBLAS -c -DSSCAL $< -o $(@F)
cblas_csrot.$(SUFFIX) cblas_csrot.$(PSUFFIX) : zrot.c
$(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F)
cblas_zdrot.$(SUFFIX) cblas_zdrot.$(PSUFFIX) : zrot.c
$(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F)
ifeq ($(BUILD_BFLOAT16),1)
cblas_sbgemv.$(SUFFIX) cblas_sbgemv.$(PSUFFIX) : sbgemv.c
$(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F)

View File

@ -187,10 +187,11 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS)
endif ()
# Makefile.L3
set(USE_TRMM false)
if (ARM OR ARM64 OR (TARGET_CORE MATCHES LONGSOON3B) OR (TARGET_CORE MATCHES GENERIC) OR (TARGET_CORE MATCHES HASWELL) OR (TARGET_CORE MATCHES ZEN) OR (TARGET_CORE MATCHES SKYLAKEX) OR (TARGET_CORE MATCHES COOPERLAKE))
string(TOUPPER ${TARGET_CORE} UC_TARGET_CORE)
if (ARM OR ARM64 OR (UC_TARGET_CORE MATCHES LONGSOON3B) OR (UC_TARGET_CORE MATCHES GENERIC) OR (UC_TARGET_CORE MATCHES HASWELL) OR (UC_TARGET_CORE MATCHES ZEN) OR (UC_TARGET_CORE MATCHES SKYLAKEX) OR (UC_TARGET_CORE MATCHES COOPERLAKE))
set(USE_TRMM true)
endif ()
if (ZARCH OR (TARGET_CORE MATCHES POWER8) OR (TARGET_CORE MATCHES POWER9) OR (TARGET_CORE MATCHES POWER10))
if (ZARCH OR (UC_TARGET_CORE MATCHES POWER8) OR (UC_TARGET_CORE MATCHES POWER9) OR (UC_TARGET_CORE MATCHES POWER10))
set(USE_TRMM true)
endif ()

View File

@ -48,7 +48,7 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLA
dot[0]=0.0;
dot[1]=0.0;
#if !defined(__PPC__) && !defined(__SunOS)
#if !defined(__PPC__) && !defined(__SunOS) && !defined(__PGI)
CREAL(result) = 0.0 ;
CIMAG(result) = 0.0 ;
#else
@ -73,7 +73,7 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLA
i++ ;
}
#if !defined(__PPC__) && !defined(__SunOS)
#if !defined(__PPC__) && !defined(__SunOS) && !defined(__PGI)
CREAL(result) = dot[0];
CIMAG(result) = dot[1];
#else

View File

@ -97,9 +97,18 @@ CNRM2KERNEL = znrm2.S
ZNRM2KERNEL = znrm2.S
DDOTKERNEL = dot.S
ifneq ($(C_COMPILER), PGI)
SDOTKERNEL = ../generic/dot.c
else
SDOTKERNEL = dot.S
endif
ifneq ($(C_COMPILER), PGI)
CDOTKERNEL = zdot.S
ZDOTKERNEL = zdot.S
else
CDOTKERNEL = ../arm/zdot.c
ZDOTKERNEL = ../arm/zdot.c
endif
DSDOTKERNEL = dot.S
DGEMM_BETA = dgemm_beta.S

View File

@ -96,11 +96,20 @@ DNRM2KERNEL = nrm2.S
CNRM2KERNEL = znrm2.S
ZNRM2KERNEL = znrm2.S
DDOTKERNEL = dot.S
SDOTKERNEL = ../generic/dot.c
CDOTKERNEL = zdot.S
ZDOTKERNEL = zdot.S
DSDOTKERNEL = dot.S
ifneq ($(C_COMPILER), PGI)
SDOTKERNEL = ../generic/dot.c
else
SDOTKERNEL = dot.S
endif
DDOTKERNEL = dot.S
ifneq ($(C_COMPILER), PGI)
CDOTKERNEL = zdot.S
ZDOTKERNEL = zdot.S
else
CDOTKERNEL = ../arm/zdot.c
ZDOTKERNEL = ../arm/zdot.c
endif
DSDOTKERNEL = dot.S
DGEMM_BETA = dgemm_beta.S
SGEMM_BETA = sgemm_beta.S

View File

@ -70,10 +70,19 @@ DCOPYKERNEL = copy.S
CCOPYKERNEL = copy.S
ZCOPYKERNEL = copy.S
ifneq ($(C_COMPILER), PGI)
SDOTKERNEL = ../generic/dot.c
else
SDOTKERNEL = dot.S
endif
DDOTKERNEL = dot.S
ifneq ($(C_COMPILER), PGI)
CDOTKERNEL = zdot.S
ZDOTKERNEL = zdot.S
else
CDOTKERNEL = ../arm/zdot.c
ZDOTKERNEL = ../arm/zdot.c
endif
DSDOTKERNEL = dot.S
SNRM2KERNEL = nrm2.S

View File

@ -47,8 +47,13 @@ ZCOPYKERNEL = copy.S
SDOTKERNEL = dot_thunderx.c
DDOTKERNEL = ddot_thunderx.c
ifneq ($(C_COMPILER), PGI)
CDOTKERNEL = zdot.S
ZDOTKERNEL = zdot.S
else
CDOTKERNEL = ../arm/zdot.c
ZDOTKERNEL = ../arm/zdot.c
endif
DSDOTKERNEL = dot.S
SNRM2KERNEL = nrm2.S

View File

@ -72,8 +72,13 @@ ZCOPYKERNEL = copy.S
SDOTKERNEL = dot.S
DDOTKERNEL = dot.S
ifneq ($(C_COMPILER), PGI)
CDOTKERNEL = zdot.S
ZDOTKERNEL = zdot.S
else
CDOTKERNEL = ../arm/zdot.c
ZDOTKERNEL = ../arm/zdot.c
endif
DSDOTKERNEL = dot.S
SNRM2KERNEL = nrm2.S

View File

@ -154,11 +154,7 @@ ZCOPYKERNEL = zcopy_power10.c
SDOTKERNEL = sdot_power10.c
DDOTKERNEL = ddot_power10.c
DSDOTKERNEL = sdot_power10.c
ifneq ($(GCCVERSIONGTEQ9),1)
CDOTKERNEL = cdot_power9.S
else
CDOTKERNEL = cdot.c
endif
ZDOTKERNEL = zdot.c
#
SNRM2KERNEL = ../arm/nrm2.c

View File

@ -28,6 +28,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#else
#include "common.h"
#if defined(POWER10)
#include "cdot_microk_power10.c"
#else
#ifndef HAVE_KERNEL_8
#include <altivec.h>
@ -99,6 +102,7 @@ static void cdot_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, float *dot)
}
#endif
#endif
OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) {
@ -116,7 +120,11 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLA
if ((inc_x == 1) && (inc_y == 1)) {
#if defined(POWER10)
BLASLONG n1 = n & -16;
#else
BLASLONG n1 = n & -8;
#endif
BLASLONG j=0;
if (n1){

View File

@ -0,0 +1,177 @@
/***************************************************************************
Copyright (c) 2021, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#define HAVE_KERNEL_8 1
/*
 * cdot_kernel_8 - inline-assembly inner kernel for the single-precision
 * complex dot product.
 *
 * Parameters:
 *   n   - number of complex elements to process. Each step consumes 16
 *         elements (128 bytes from x and 128 bytes from y), and the first
 *         128-byte block of each array is loaded before the counter is
 *         tested, so n is presumably required to be a positive multiple
 *         of 16 -- TODO confirm against the caller.
 *   x,y - input arrays of interleaved (real, imaginary) floats, per the
 *         inline comments on the multiply-add instructions.
 *   dot - output; one full 16-byte vector (four floats) of partial sums is
 *         stored here by the final stxv, so it must have room for four
 *         floats. The order of the four values is whatever the caller
 *         expects -- verify against the caller before relying on it.
 *
 * Register use (all listed in the clobber list):
 *   vs32-vs39 accumulators, vs40-vs47 x data, vs48-vs55 y data,
 *   vs56-vs63 permuted copies of the y data.
 *
 * NOTE(review): the "m" operands only name the first element of x, y and
 * dot, while the asm reads whole 128-byte blocks and writes 16 bytes at dot,
 * and there is no "memory" clobber -- confirm the compiler cannot reorder
 * surrounding accesses to these arrays across this statement.
 */
static void cdot_kernel_8 (long n, float *x, float *y, float *dot)
{
// Byte-selection control for the xxperm instructions below. Per the inline
// comments on the accumulate instructions, the permuted copy of y pairs each
// x component with the other (real <-> imaginary) component of the same y
// element, which yields the cross products.
__vector unsigned char mask = { 11,10,9,8,15,14,13,12,3,2,1,0,7,6,5,4};
__asm__
(
// Prefetch hints for the start of x and y.
"dcbt 0, %2 \n\t"
"dcbt 0, %3 \n\t"
// Zero the eight accumulators (register XOR with itself).
"xxlxor 32, 32, 32 \n\t"
"xxlxor 33, 33, 33 \n\t"
"xxlxor 34, 34, 34 \n\t"
"xxlxor 35, 35, 35 \n\t"
"xxlxor 36, 36, 36 \n\t"
"xxlxor 37, 37, 37 \n\t"
"xxlxor 38, 38, 38 \n\t"
"xxlxor 39, 39, 39 \n\t"
// Prologue: load the first 128 bytes of x into vs40-vs47 (register pairs).
"lxvp 40, 0(%2) \n\t"
"lxvp 42, 32(%2) \n\t"
"lxvp 44, 64(%2) \n\t"
"lxvp 46, 96(%2) \n\t"
// ... and the first 128 bytes of y into vs48-vs55.
"lxvp 48, 0(%3) \n\t"
"lxvp 50, 32(%3) \n\t"
"lxvp 52, 64(%3) \n\t"
"lxvp 54, 96(%3) \n\t"
// Build the permuted copies of the y registers in vs56-vs63.
"xxperm 56, 48, %x7 \n\t"
"xxperm 57, 49, %x7 \n\t"
"xxperm 58, 50, %x7 \n\t"
"xxperm 59, 51, %x7 \n\t"
"xxperm 60, 52, %x7 \n\t"
"xxperm 61, 53, %x7 \n\t"
"xxperm 62, 54, %x7 \n\t"
"xxperm 63, 55, %x7 \n\t"
// Advance both pointers past the loaded block and count off 16 elements;
// addic. sets cr0, and if nothing remains we skip straight to the tail.
"addi %2, %2, 128 \n\t"
"addi %3, %3, 128 \n\t"
"addic. %1, %1, -16 \n\t"
"ble two%= \n\t"
".align 5 \n"
// Main loop: accumulate the block already in registers while the loads for
// the next block are interleaved between the multiply-adds. A register is
// reloaded only after its last use in the current block.
"one%=: \n\t"
"xvmaddasp 32, 40, 48 \n\t" // x0_r * y0_r , x0_i * y0_i
"xvmaddasp 34, 41, 49 \n\t" // x1_r * y1_r , x1_i * y1_i
"lxvp 48, 0(%3) \n\t"
"xvmaddasp 36, 42, 50 \n\t" // x2_r * y2_r , x2_i * y2_i
"xvmaddasp 38, 43, 51 \n\t" // x3_r * y3_r , x3_i * y3_i
"lxvp 50, 32(%3) \n\t"
"xvmaddasp 33, 40, 56 \n\t" // x0_r * y0_i , x0_i * y0_r
"xvmaddasp 35, 41, 57 \n\t" // x1_r * y1_i , x1_i * y1_r
"lxvp 40, 0(%2) \n\t"
"xvmaddasp 37, 42, 58 \n\t" // x2_r * y2_i , x2_i * y2_r
"xvmaddasp 39, 43, 59 \n\t" // x3_r * y3_i , x3_i * y3_r
"lxvp 42, 32(%2) \n\t"
// Permuted copies for the freshly loaded first half of the next y block.
"xxperm 56, 48, %x7 \n\t"
"xxperm 57, 49, %x7 \n\t"
"xxperm 58, 50, %x7 \n\t"
"xxperm 59, 51, %x7 \n\t"
// Second half of the current block, same pattern.
"xvmaddasp 32, 44, 52 \n\t" // x0_r * y0_r , x0_i * y0_i
"xvmaddasp 34, 45, 53 \n\t" // x1_r * y1_r , x1_i * y1_i
"lxvp 52, 64(%3) \n\t"
"xvmaddasp 36, 46, 54 \n\t" // x2_r * y2_r , x2_i * y2_i
"xvmaddasp 38, 47, 55 \n\t" // x3_r * y3_r , x3_i * y3_i
"lxvp 54, 96(%3) \n\t"
"xvmaddasp 33, 44, 60 \n\t" // x0_r * y0_i , x0_i * y0_r
"xvmaddasp 35, 45, 61 \n\t" // x1_r * y1_i , x1_i * y1_r
"lxvp 44, 64(%2) \n\t"
"xvmaddasp 37, 46, 62 \n\t" // x2_r * y2_i , x2_i * y2_r
"xvmaddasp 39, 47, 63 \n\t" // x3_r * y3_i , x3_i * y3_r
"lxvp 46, 96(%2) \n\t"
"xxperm 60, 52, %x7 \n\t"
"xxperm 61, 53, %x7 \n\t"
"xxperm 62, 54, %x7 \n\t"
"xxperm 63, 55, %x7 \n\t"
// Step to the next block and loop while elements remain.
"addi %2, %2, 128 \n\t"
"addi %3, %3, 128 \n\t"
"addic. %1, %1, -16 \n\t"
"bgt one%= \n"
// Tail: accumulate the last block that is already in registers; no further
// loads are issued here.
"two%=: \n\t"
"xvmaddasp 32, 40, 48 \n\t" // x0_r * y0_r , x0_i * y0_i
"xvmaddasp 34, 41, 49 \n\t" // x1_r * y1_r , x1_i * y1_i
"xvmaddasp 36, 42, 50 \n\t" // x2_r * y2_r , x2_i * y2_i
"xvmaddasp 38, 43, 51 \n\t" // x3_r * y3_r , x3_i * y3_i
"xvmaddasp 33, 40, 56 \n\t" // x0_r * y0_i , x0_i * y0_r
"xvmaddasp 35, 41, 57 \n\t" // x1_r * y1_i , x1_i * y1_r
"xvmaddasp 37, 42, 58 \n\t" // x2_r * y2_i , x2_i * y2_r
"xvmaddasp 39, 43, 59 \n\t" // x3_r * y3_i , x3_i * y3_r
"xvmaddasp 32, 44, 52 \n\t" // x0_r * y0_r , x0_i * y0_i
"xvmaddasp 34, 45, 53 \n\t" // x1_r * y1_r , x1_i * y1_i
"xvmaddasp 36, 46, 54 \n\t" // x2_r * y2_r , x2_i * y2_i
"xvmaddasp 38, 47, 55 \n\t" // x3_r * y3_r , x3_i * y3_i
"xvmaddasp 33, 44, 60 \n\t" // x0_r * y0_i , x0_i * y0_r
"xvmaddasp 35, 45, 61 \n\t" // x1_r * y1_i , x1_i * y1_r
"xvmaddasp 37, 46, 62 \n\t" // x2_r * y2_i , x2_i * y2_r
"xvmaddasp 39, 47, 63 \n\t" // x3_r * y3_i , x3_i * y3_r
// Reduction: fold the four same-component accumulators (even registers) into
// vs35 and the four cross-component accumulators (odd registers) into vs34.
"xvaddsp 32, 32, 34 \n\t"
"xvaddsp 36, 36, 38 \n\t"
"xvaddsp 33, 33, 35 \n\t"
"xvaddsp 37, 37, 39 \n\t"
"xvaddsp 35, 32, 36 \n\t"
"xvaddsp 34, 33, 37 \n\t"
// Add each sum to its doubleword-swapped copy so both halves hold the total.
"xxswapd 32, 35 \n\t"
"xxswapd 33, 34 \n\t"
"xvaddsp 35, 35, 32 \n\t"
"xvaddsp 34, 34, 33 \n\t"
// Merge one doubleword from each of the two sums and store the 16-byte result.
"xxpermdi 34, 34, 35, 2 \n\t"
"stxv 34, 0(%6) \n\t"
// Assembler comment listing the operand mapping, visible in generated asm.
"#n=%1 x=%4=%2 y=%5=%3 dot=%0=%6"
:
"=m" (*dot), // 0
"+r" (n), // 1
"+b" (x), // 2
"+b" (y) // 3
:
"m" (*x), // 4
"m" (*y), // 5
"b" (dot), // 6
"wa" (mask) // 7
:
"cr0",
"vs32","vs33","vs34","vs35","vs36","vs37","vs38","vs39",
"vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47",
"vs48","vs49","vs50","vs51","vs52","vs53","vs54","vs55",
"vs56","vs57","vs58","vs59","vs60","vs61","vs62","vs63"
);
}

View File

@ -2399,6 +2399,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define GEMM_DEFAULT_OFFSET_B 65536
#define GEMM_DEFAULT_ALIGN 0x0ffffUL
#define SWITCH_RATIO 16
#define GEMM_PREFERED_SIZE 16
#define SGEMM_DEFAULT_UNROLL_M 16
#define SGEMM_DEFAULT_UNROLL_N 8
#define DGEMM_DEFAULT_UNROLL_M 16
@ -2435,6 +2438,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define GEMM_DEFAULT_OFFSET_B 65536
#define GEMM_DEFAULT_ALIGN 0x0ffffUL
#define SWITCH_RATIO 16
#define GEMM_PREFERED_SIZE 16
#define SGEMM_DEFAULT_UNROLL_M 16
#define SGEMM_DEFAULT_UNROLL_N 8
#define DGEMM_DEFAULT_UNROLL_M 8