Minor optimization in SIMD dot product kernel

master
Xavier Arteaga 5 years ago committed by Xavier Arteaga
parent f4eb61a37c
commit 6248ab43e3

@@ -683,30 +683,35 @@ cf_t srslte_vec_dot_prod_ccc_simd(const cf_t* x, const cf_t* y, const int len)
cf_t result = 0;
#if SRSLTE_SIMD_CF_SIZE
__attribute__((aligned(64))) cf_t simd_dotProdVector[SRSLTE_SIMD_CF_SIZE];
if (len >= SRSLTE_SIMD_CF_SIZE) {
simd_cf_t avx_result = srslte_simd_cf_zero();
if (SRSLTE_IS_ALIGNED(x) && SRSLTE_IS_ALIGNED(y)) {
for (; i < len - SRSLTE_SIMD_CF_SIZE + 1; i += SRSLTE_SIMD_CF_SIZE) {
simd_cf_t xVal = srslte_simd_cfi_load(&x[i]);
simd_cf_t yVal = srslte_simd_cfi_load(&y[i]);
simd_cf_t avx_result = srslte_simd_cf_zero();
if (SRSLTE_IS_ALIGNED(x) && SRSLTE_IS_ALIGNED(y)) {
for (; i < len - SRSLTE_SIMD_CF_SIZE + 1; i += SRSLTE_SIMD_CF_SIZE) {
simd_cf_t xVal = srslte_simd_cfi_load(&x[i]);
simd_cf_t yVal = srslte_simd_cfi_load(&y[i]);
avx_result = srslte_simd_cf_add(srslte_simd_cf_prod(xVal, yVal), avx_result);
}
} else {
for (; i < len - SRSLTE_SIMD_CF_SIZE + 1; i += SRSLTE_SIMD_CF_SIZE) {
simd_cf_t xVal = srslte_simd_cfi_loadu(&x[i]);
simd_cf_t yVal = srslte_simd_cfi_loadu(&y[i]);
avx_result = srslte_simd_cf_add(srslte_simd_cf_prod(xVal, yVal), avx_result);
srslte_simd_cfi_store(simd_dotProdVector, avx_result);
avx_result = srslte_simd_cf_add(srslte_simd_cf_prod(xVal, yVal), avx_result);
}
}
} else {
for (; i < len - SRSLTE_SIMD_CF_SIZE + 1; i += SRSLTE_SIMD_CF_SIZE) {
simd_cf_t xVal = srslte_simd_cfi_loadu(&x[i]);
simd_cf_t yVal = srslte_simd_cfi_loadu(&y[i]);
avx_result = srslte_simd_cf_add(srslte_simd_cf_prod(xVal, yVal), avx_result);
srslte_simd_cfi_storeu(simd_dotProdVector, avx_result);
}
}
__attribute__((aligned(64))) float simd_dotProdVector[SRSLTE_SIMD_CF_SIZE];
simd_f_t acc_re = srslte_simd_cf_re(avx_result);
simd_f_t acc_im = srslte_simd_cf_im(avx_result);
srslte_simd_cfi_store(simd_dotProdVector, avx_result);
for (int k = 0; k < SRSLTE_SIMD_CF_SIZE; k++) {
result += simd_dotProdVector[k];
simd_f_t acc = srslte_simd_f_hadd(acc_re, acc_im);
for (int j = 2; j < SRSLTE_SIMD_F_SIZE; j *= 2) {
acc = srslte_simd_f_hadd(acc, acc);
}
srslte_simd_f_store(simd_dotProdVector, acc);
__real__ result = simd_dotProdVector[0];
__imag__ result = simd_dotProdVector[1];
}
#endif
@@ -754,28 +759,35 @@ cf_t srslte_vec_dot_prod_conj_ccc_simd(const cf_t* x, const cf_t* y, const int l
cf_t result = 0;
#if SRSLTE_SIMD_CF_SIZE
__attribute__((aligned(256))) cf_t simd_dotProdVector[SRSLTE_SIMD_CF_SIZE];
if (len >= SRSLTE_SIMD_CF_SIZE) {
simd_cf_t avx_result = srslte_simd_cf_zero();
if (SRSLTE_IS_ALIGNED(x) && SRSLTE_IS_ALIGNED(y)) {
for (; i < len - SRSLTE_SIMD_CF_SIZE + 1; i += SRSLTE_SIMD_CF_SIZE) {
simd_cf_t xVal = srslte_simd_cfi_load(&x[i]);
simd_cf_t yVal = srslte_simd_cfi_load(&y[i]);
simd_cf_t simd_result = srslte_simd_cf_zero();
if (SRSLTE_IS_ALIGNED(x) && SRSLTE_IS_ALIGNED(y)) {
for (; i < len - SRSLTE_SIMD_CF_SIZE + 1; i += SRSLTE_SIMD_CF_SIZE) {
simd_cf_t xVal = srslte_simd_cfi_load(&x[i]);
simd_cf_t yVal = srslte_simd_cfi_load(&y[i]);
avx_result = srslte_simd_cf_add(srslte_simd_cf_conjprod(xVal, yVal), avx_result);
}
} else {
for (; i < len - SRSLTE_SIMD_CF_SIZE + 1; i += SRSLTE_SIMD_CF_SIZE) {
simd_cf_t xVal = srslte_simd_cfi_loadu(&x[i]);
simd_cf_t yVal = srslte_simd_cfi_loadu(&y[i]);
simd_result = srslte_simd_cf_add(srslte_simd_cf_conjprod(xVal, yVal), simd_result);
avx_result = srslte_simd_cf_add(srslte_simd_cf_conjprod(xVal, yVal), avx_result);
}
}
} else {
for (; i < len - SRSLTE_SIMD_CF_SIZE + 1; i += SRSLTE_SIMD_CF_SIZE) {
simd_cf_t xVal = srslte_simd_cfi_loadu(&x[i]);
simd_cf_t yVal = srslte_simd_cfi_loadu(&y[i]);
simd_result = srslte_simd_cf_add(srslte_simd_cf_conjprod(xVal, yVal), simd_result);
}
}
__attribute__((aligned(64))) float simd_dotProdVector[SRSLTE_SIMD_CF_SIZE];
simd_f_t acc_re = srslte_simd_cf_re(avx_result);
simd_f_t acc_im = srslte_simd_cf_im(avx_result);
srslte_simd_cfi_store(simd_dotProdVector, simd_result);
for (int k = 0; k < SRSLTE_SIMD_CF_SIZE; k++) {
result += simd_dotProdVector[k];
simd_f_t acc = srslte_simd_f_hadd(acc_re, acc_im);
for (int j = 2; j < SRSLTE_SIMD_F_SIZE; j *= 2) {
acc = srslte_simd_f_hadd(acc, acc);
}
srslte_simd_f_store(simd_dotProdVector, acc);
__real__ result = simd_dotProdVector[0];
__imag__ result = simd_dotProdVector[1];
}
#endif

Loading…
Cancel
Save