Commit 51902231 authored by Victor Yu's avatar Victor Yu

Merge branch 'elpa_avx512_knl' into 'master'

Fix ELPA2 AVX512 kernels on KNL

See merge request elsi-devel/elsi-interface!235
parents 65a4ec4b ff805a53
......@@ -7,7 +7,7 @@ SET(elsi_URL "http://elsi-interchange.org")
SET(elsi_EMAIL "elsi-team@duke.edu")
SET(elsi_LICENSE "BSD 3")
SET(elsi_DESCRIPTION "Electronic Structure Infrastructure")
SET(elsi_DATESTAMP "20200615")
SET(elsi_DATESTAMP "20200617")
### CMake modules ###
LIST(APPEND CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake)
......
......@@ -192,8 +192,8 @@ static __forceinline void hh_trafo_complex_kernel_24_AVX512_1hv_double(double co
h1_real = _AVX512_SET1(hh_dbl[0]);
h1_imag = _AVX512_SET1(hh_dbl[1]);
h1_real = _AVX512_XOR(h1_real, sign);
h1_imag = _AVX512_XOR(h1_imag, sign);
h1_real = (__AVX512_DATATYPE) _AVX512_XOR_EPI((__m512i) h1_real, (__m512i) sign);
h1_imag = (__AVX512_DATATYPE) _AVX512_XOR_EPI((__m512i) h1_imag, (__m512i) sign);
tmp1 = _AVX512_MUL(h1_imag, x1);
......@@ -342,8 +342,8 @@ static __forceinline void hh_trafo_complex_kernel_20_AVX512_1hv_double(double co
h1_real = _AVX512_SET1(hh_dbl[0]);
h1_imag = _AVX512_SET1(hh_dbl[1]);
h1_real = _AVX512_XOR(h1_real, sign);
h1_imag = _AVX512_XOR(h1_imag, sign);
h1_real = (__AVX512_DATATYPE) _AVX512_XOR_EPI((__m512i) h1_real, (__m512i) sign);
h1_imag = (__AVX512_DATATYPE) _AVX512_XOR_EPI((__m512i) h1_imag, (__m512i) sign);
tmp1 = _AVX512_MUL(h1_imag, x1);
......@@ -471,8 +471,8 @@ static __forceinline void hh_trafo_complex_kernel_16_AVX512_1hv_double(double co
h1_real = _AVX512_SET1(hh_dbl[0]);
h1_imag = _AVX512_SET1(hh_dbl[1]);
h1_real = _AVX512_XOR(h1_real, sign);
h1_imag = _AVX512_XOR(h1_imag, sign);
h1_real = (__AVX512_DATATYPE) _AVX512_XOR_EPI((__m512i) h1_real, (__m512i) sign);
h1_imag = (__AVX512_DATATYPE) _AVX512_XOR_EPI((__m512i) h1_imag, (__m512i) sign);
tmp1 = _AVX512_MUL(h1_imag, x1);
......@@ -582,8 +582,8 @@ static __forceinline void hh_trafo_complex_kernel_12_AVX512_1hv_double(double co
h1_real = _AVX512_SET1(hh_dbl[0]);
h1_imag = _AVX512_SET1(hh_dbl[1]);
h1_real = _AVX512_XOR(h1_real, sign);
h1_imag = _AVX512_XOR(h1_imag, sign);
h1_real = (__AVX512_DATATYPE) _AVX512_XOR_EPI((__m512i) h1_real, (__m512i) sign);
h1_imag = (__AVX512_DATATYPE) _AVX512_XOR_EPI((__m512i) h1_imag, (__m512i) sign);
tmp1 = _AVX512_MUL(h1_imag, x1);
......@@ -669,8 +669,8 @@ static __forceinline void hh_trafo_complex_kernel_8_AVX512_1hv_double(double com
h1_real = _AVX512_SET1(hh_dbl[0]);
h1_imag = _AVX512_SET1(hh_dbl[1]);
h1_real = _AVX512_XOR(h1_real, sign);
h1_imag = _AVX512_XOR(h1_imag, sign);
h1_real = (__AVX512_DATATYPE) _AVX512_XOR_EPI((__m512i) h1_real, (__m512i) sign);
h1_imag = (__AVX512_DATATYPE) _AVX512_XOR_EPI((__m512i) h1_imag, (__m512i) sign);
tmp1 = _AVX512_MUL(h1_imag, x1);
x1 = _AVX512_FMADDSUB(h1_real, x1, _AVX512_SHUFFLE(tmp1, tmp1, _SHUFFLE));
......@@ -737,8 +737,8 @@ static __forceinline void hh_trafo_complex_kernel_4_AVX512_1hv_double(double com
h1_real = _AVX512_SET1(hh_dbl[0]);
h1_imag = _AVX512_SET1(hh_dbl[1]);
h1_real = _AVX512_XOR(h1_real, sign);
h1_imag = _AVX512_XOR(h1_imag, sign);
h1_real = (__AVX512_DATATYPE) _AVX512_XOR_EPI((__m512i) h1_real, (__m512i) sign);
h1_imag = (__AVX512_DATATYPE) _AVX512_XOR_EPI((__m512i) h1_imag, (__m512i) sign);
tmp1 = _AVX512_MUL(h1_imag, x1);
x1 = _AVX512_FMADDSUB(h1_real, x1, _AVX512_SHUFFLE(tmp1, tmp1, _SHUFFLE));
......
......@@ -191,8 +191,8 @@ static __forceinline void hh_trafo_complex_kernel_48_AVX512_1hv_single(float com
h1_real = _AVX512_SET1(hh_dbl[0]);
h1_imag = _AVX512_SET1(hh_dbl[1]);
h1_real = _AVX512_XOR(h1_real, sign);
h1_imag = _AVX512_XOR(h1_imag, sign);
h1_real = (__AVX512_DATATYPE) _AVX512_XOR_EPI((__m512i) h1_real, (__m512i) sign);
h1_imag = (__AVX512_DATATYPE) _AVX512_XOR_EPI((__m512i) h1_imag, (__m512i) sign);
tmp1 = _AVX512_MUL(h1_imag, x1);
......@@ -341,8 +341,8 @@ static __forceinline void hh_trafo_complex_kernel_40_AVX512_1hv_single(float com
h1_real = _AVX512_SET1(hh_dbl[0]);
h1_imag = _AVX512_SET1(hh_dbl[1]);
h1_real = _AVX512_XOR(h1_real, sign);
h1_imag = _AVX512_XOR(h1_imag, sign);
h1_real = (__AVX512_DATATYPE) _AVX512_XOR_EPI((__m512i) h1_real, (__m512i) sign);
h1_imag = (__AVX512_DATATYPE) _AVX512_XOR_EPI((__m512i) h1_imag, (__m512i) sign);
tmp1 = _AVX512_MUL(h1_imag, x1);
......@@ -470,8 +470,8 @@ static __forceinline void hh_trafo_complex_kernel_32_AVX512_1hv_single(float com
h1_real = _AVX512_SET1(hh_dbl[0]);
h1_imag = _AVX512_SET1(hh_dbl[1]);
h1_real = _AVX512_XOR(h1_real, sign);
h1_imag = _AVX512_XOR(h1_imag, sign);
h1_real = (__AVX512_DATATYPE) _AVX512_XOR_EPI((__m512i) h1_real, (__m512i) sign);
h1_imag = (__AVX512_DATATYPE) _AVX512_XOR_EPI((__m512i) h1_imag, (__m512i) sign);
tmp1 = _AVX512_MUL(h1_imag, x1);
......@@ -581,8 +581,8 @@ static __forceinline void hh_trafo_complex_kernel_24_AVX512_1hv_single(float com
h1_real = _AVX512_SET1(hh_dbl[0]);
h1_imag = _AVX512_SET1(hh_dbl[1]);
h1_real = _AVX512_XOR(h1_real, sign);
h1_imag = _AVX512_XOR(h1_imag, sign);
h1_real = (__AVX512_DATATYPE) _AVX512_XOR_EPI((__m512i) h1_real, (__m512i) sign);
h1_imag = (__AVX512_DATATYPE) _AVX512_XOR_EPI((__m512i) h1_imag, (__m512i) sign);
tmp1 = _AVX512_MUL(h1_imag, x1);
......@@ -668,8 +668,8 @@ static __forceinline void hh_trafo_complex_kernel_16_AVX512_1hv_single(float com
h1_real = _AVX512_SET1(hh_dbl[0]);
h1_imag = _AVX512_SET1(hh_dbl[1]);
h1_real = _AVX512_XOR(h1_real, sign);
h1_imag = _AVX512_XOR(h1_imag, sign);
h1_real = (__AVX512_DATATYPE) _AVX512_XOR_EPI((__m512i) h1_real, (__m512i) sign);
h1_imag = (__AVX512_DATATYPE) _AVX512_XOR_EPI((__m512i) h1_imag, (__m512i) sign);
tmp1 = _AVX512_MUL(h1_imag, x1);
x1 = _AVX512_FMADDSUB(h1_real, x1, _AVX512_SHUFFLE(tmp1, tmp1, _SHUFFLE));
......@@ -736,8 +736,8 @@ static __forceinline void hh_trafo_complex_kernel_8_AVX512_1hv_single(float comp
h1_real = _AVX512_SET1(hh_dbl[0]);
h1_imag = _AVX512_SET1(hh_dbl[1]);
h1_real = _AVX512_XOR(h1_real, sign);
h1_imag = _AVX512_XOR(h1_imag, sign);
h1_real = (__AVX512_DATATYPE) _AVX512_XOR_EPI((__m512i) h1_real, (__m512i) sign);
h1_imag = (__AVX512_DATATYPE) _AVX512_XOR_EPI((__m512i) h1_imag, (__m512i) sign);
tmp1 = _AVX512_MUL(h1_imag, x1);
x1 = _AVX512_FMADDSUB(h1_real, x1, _AVX512_SHUFFLE(tmp1, tmp1, _SHUFFLE));
......
......@@ -61,6 +61,7 @@
#define _AVX512_ADD _mm512_add_pd
#define _AVX512_MUL _mm512_mul_pd
#define _AVX512_XOR _mm512_xor_pd
#define _AVX512_XOR_EPI _mm512_xor_epi64
#define _mm512_FMA_pd(a,b,c) _mm512_fmadd_pd(a,b,c)
......@@ -194,13 +195,13 @@ void double_hh_trafo_real_avx512_2hv_double(double* q, double* hh, int* pnb, int
__AVX512_DATATYPE tau2 = _AVX512_SET1(hh[ldh]);
__AVX512_DATATYPE vs = _AVX512_SET1(s);
h1 = _AVX512_XOR(tau1, sign);
h1 = (__AVX512_DATATYPE) _AVX512_XOR_EPI((__m512i) tau1, (__m512i) sign);
x1 = _AVX512_MUL(x1, h1);
x2 = _AVX512_MUL(x2, h1);
x3 = _AVX512_MUL(x3, h1);
x4 = _AVX512_MUL(x4, h1);
h1 = _AVX512_XOR(tau2, sign);
h1 = (__AVX512_DATATYPE) _AVX512_XOR_EPI((__m512i) tau2, (__m512i) sign);
h2 = _AVX512_MUL(h1, vs);
y1 = _AVX512_FMA(y1, h1, _AVX512_MUL(x1,h2));
y2 = _AVX512_FMA(y2, h1, _AVX512_MUL(x2,h2));
......@@ -339,13 +340,13 @@ void double_hh_trafo_real_avx512_2hv_double(double* q, double* hh, int* pnb, int
__AVX512_DATATYPE tau2 = _AVX512_SET1(hh[ldh]);
__AVX512_DATATYPE vs = _AVX512_SET1(s);
h1 = _AVX512_XOR(tau1, sign);
h1 = (__AVX512_DATATYPE) _AVX512_XOR_EPI((__m512i) tau1, (__m512i) sign);
x1 = _AVX512_MUL(x1, h1);
x2 = _AVX512_MUL(x2, h1);
x3 = _AVX512_MUL(x3, h1);
h1 = _AVX512_XOR(tau2, sign);
h1 = (__AVX512_DATATYPE) _AVX512_XOR_EPI((__m512i) tau2, (__m512i) sign);
h2 = _AVX512_MUL(h1, vs);
y1 = _AVX512_FMA(y1, h1, _AVX512_MUL(x1,h2));
......@@ -461,10 +462,10 @@ void double_hh_trafo_real_avx512_2hv_double(double* q, double* hh, int* pnb, int
__AVX512_DATATYPE tau1 = _AVX512_SET1(hh[0]);
__AVX512_DATATYPE tau2 = _AVX512_SET1(hh[ldh]);
__AVX512_DATATYPE vs = _AVX512_SET1(s);
h1 = _AVX512_XOR(tau1, sign);
h1 = (__AVX512_DATATYPE) _AVX512_XOR_EPI((__m512i) tau1, (__m512i) sign);
x1 = _AVX512_MUL(x1, h1);
x2 = _AVX512_MUL(x2, h1);
h1 = _AVX512_XOR(tau2, sign);
h1 = (__AVX512_DATATYPE) _AVX512_XOR_EPI((__m512i) tau2, (__m512i) sign);
h2 = _AVX512_MUL(h1, vs);
......@@ -557,11 +558,11 @@ void double_hh_trafo_real_avx512_2hv_double(double* q, double* hh, int* pnb, int
__AVX512_DATATYPE tau1 = _AVX512_SET1(hh[0]);
__AVX512_DATATYPE tau2 = _AVX512_SET1(hh[ldh]);
__AVX512_DATATYPE vs = _AVX512_SET1(s);
h1 = _AVX512_XOR(tau1, sign);
h1 = (__AVX512_DATATYPE) _AVX512_XOR_EPI((__m512i) tau1, (__m512i) sign);
x1 = _AVX512_MUL(x1, h1);
h1 = _AVX512_XOR(tau2, sign);
h1 = (__AVX512_DATATYPE) _AVX512_XOR_EPI((__m512i) tau2, (__m512i) sign);
h2 = _AVX512_MUL(h1, vs);
......
......@@ -61,6 +61,7 @@
#define _AVX512_ADD _mm512_add_ps
#define _AVX512_MUL _mm512_mul_ps
#define _AVX512_XOR _mm512_xor_ps
#define _AVX512_XOR_EPI _mm512_xor_epi64
#define _mm512_FMA_ps(a,b,c) _mm512_fmadd_ps(a,b,c)
......@@ -194,13 +195,13 @@ void double_hh_trafo_real_avx512_2hv_single(float* q, float* hh, int* pnb, int*
__AVX512_DATATYPE tau2 = _AVX512_SET1(hh[ldh]);
__AVX512_DATATYPE vs = _AVX512_SET1(s);
h1 = _AVX512_XOR(tau1, sign);
h1 = (__AVX512_DATATYPE) _AVX512_XOR_EPI((__m512i) tau1, (__m512i) sign);
x1 = _AVX512_MUL(x1, h1);
x2 = _AVX512_MUL(x2, h1);
x3 = _AVX512_MUL(x3, h1);
x4 = _AVX512_MUL(x4, h1);
h1 = _AVX512_XOR(tau2, sign);
h1 = (__AVX512_DATATYPE) _AVX512_XOR_EPI((__m512i) tau2, (__m512i) sign);
h2 = _AVX512_MUL(h1, vs);
y1 = _AVX512_FMA(y1, h1, _AVX512_MUL(x1,h2));
y2 = _AVX512_FMA(y2, h1, _AVX512_MUL(x2,h2));
......@@ -339,13 +340,13 @@ void double_hh_trafo_real_avx512_2hv_single(float* q, float* hh, int* pnb, int*
__AVX512_DATATYPE tau2 = _AVX512_SET1(hh[ldh]);
__AVX512_DATATYPE vs = _AVX512_SET1(s);
h1 = _AVX512_XOR(tau1, sign);
h1 = (__AVX512_DATATYPE) _AVX512_XOR_EPI((__m512i) tau1, (__m512i) sign);
x1 = _AVX512_MUL(x1, h1);
x2 = _AVX512_MUL(x2, h1);
x3 = _AVX512_MUL(x3, h1);
h1 = _AVX512_XOR(tau2, sign);
h1 = (__AVX512_DATATYPE) _AVX512_XOR_EPI((__m512i) tau2, (__m512i) sign);
h2 = _AVX512_MUL(h1, vs);
y1 = _AVX512_FMA(y1, h1, _AVX512_MUL(x1,h2));
......@@ -461,10 +462,10 @@ void double_hh_trafo_real_avx512_2hv_single(float* q, float* hh, int* pnb, int*
__AVX512_DATATYPE tau1 = _AVX512_SET1(hh[0]);
__AVX512_DATATYPE tau2 = _AVX512_SET1(hh[ldh]);
__AVX512_DATATYPE vs = _AVX512_SET1(s);
h1 = _AVX512_XOR(tau1, sign);
h1 = (__AVX512_DATATYPE) _AVX512_XOR_EPI((__m512i) tau1, (__m512i) sign);
x1 = _AVX512_MUL(x1, h1);
x2 = _AVX512_MUL(x2, h1);
h1 = _AVX512_XOR(tau2, sign);
h1 = (__AVX512_DATATYPE) _AVX512_XOR_EPI((__m512i) tau2, (__m512i) sign);
h2 = _AVX512_MUL(h1, vs);
......@@ -557,11 +558,11 @@ void double_hh_trafo_real_avx512_2hv_single(float* q, float* hh, int* pnb, int*
__AVX512_DATATYPE tau1 = _AVX512_SET1(hh[0]);
__AVX512_DATATYPE tau2 = _AVX512_SET1(hh[ldh]);
__AVX512_DATATYPE vs = _AVX512_SET1(s);
h1 = _AVX512_XOR(tau1, sign);
h1 = (__AVX512_DATATYPE) _AVX512_XOR_EPI((__m512i) tau1, (__m512i) sign);
x1 = _AVX512_MUL(x1, h1);
h1 = _AVX512_XOR(tau2, sign);
h1 = (__AVX512_DATATYPE) _AVX512_XOR_EPI((__m512i) tau2, (__m512i) sign);
h2 = _AVX512_MUL(h1, vs);
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment