diff --git a/lib/src/phy/utils/vector_simd.c b/lib/src/phy/utils/vector_simd.c index fca3c23c7..dcc0ce87e 100644 --- a/lib/src/phy/utils/vector_simd.c +++ b/lib/src/phy/utils/vector_simd.c @@ -683,30 +683,35 @@ cf_t srslte_vec_dot_prod_ccc_simd(const cf_t* x, const cf_t* y, const int len) cf_t result = 0; #if SRSLTE_SIMD_CF_SIZE - __attribute__((aligned(64))) cf_t simd_dotProdVector[SRSLTE_SIMD_CF_SIZE]; + if (len >= SRSLTE_SIMD_CF_SIZE) { + simd_cf_t avx_result = srslte_simd_cf_zero(); + if (SRSLTE_IS_ALIGNED(x) && SRSLTE_IS_ALIGNED(y)) { + for (; i < len - SRSLTE_SIMD_CF_SIZE + 1; i += SRSLTE_SIMD_CF_SIZE) { + simd_cf_t xVal = srslte_simd_cfi_load(&x[i]); + simd_cf_t yVal = srslte_simd_cfi_load(&y[i]); - simd_cf_t avx_result = srslte_simd_cf_zero(); - if (SRSLTE_IS_ALIGNED(x) && SRSLTE_IS_ALIGNED(y)) { - for (; i < len - SRSLTE_SIMD_CF_SIZE + 1; i += SRSLTE_SIMD_CF_SIZE) { - simd_cf_t xVal = srslte_simd_cfi_load(&x[i]); - simd_cf_t yVal = srslte_simd_cfi_load(&y[i]); + avx_result = srslte_simd_cf_add(srslte_simd_cf_prod(xVal, yVal), avx_result); + } + } else { + for (; i < len - SRSLTE_SIMD_CF_SIZE + 1; i += SRSLTE_SIMD_CF_SIZE) { + simd_cf_t xVal = srslte_simd_cfi_loadu(&x[i]); + simd_cf_t yVal = srslte_simd_cfi_loadu(&y[i]); - avx_result = srslte_simd_cf_add(srslte_simd_cf_prod(xVal, yVal), avx_result); - srslte_simd_cfi_store(simd_dotProdVector, avx_result); + avx_result = srslte_simd_cf_add(srslte_simd_cf_prod(xVal, yVal), avx_result); + } } - } else { - for (; i < len - SRSLTE_SIMD_CF_SIZE + 1; i += SRSLTE_SIMD_CF_SIZE) { - simd_cf_t xVal = srslte_simd_cfi_loadu(&x[i]); - simd_cf_t yVal = srslte_simd_cfi_loadu(&y[i]); - avx_result = srslte_simd_cf_add(srslte_simd_cf_prod(xVal, yVal), avx_result); - srslte_simd_cfi_storeu(simd_dotProdVector, avx_result); - } - } + __attribute__((aligned(64))) float simd_dotProdVector[SRSLTE_SIMD_CF_SIZE]; + simd_f_t acc_re = srslte_simd_cf_re(avx_result); + simd_f_t acc_im = srslte_simd_cf_im(avx_result); - srslte_simd_cfi_store(simd_dotProdVector, avx_result); - for (int k = 0; k < SRSLTE_SIMD_CF_SIZE; k++) { - result += simd_dotProdVector[k]; + simd_f_t acc = srslte_simd_f_hadd(acc_re, acc_im); + for (int j = 2; j < SRSLTE_SIMD_F_SIZE; j *= 2) { + acc = srslte_simd_f_hadd(acc, acc); + } + srslte_simd_f_store(simd_dotProdVector, acc); + __real__ result = simd_dotProdVector[0]; + __imag__ result = simd_dotProdVector[1]; } #endif @@ -754,28 +759,35 @@ cf_t srslte_vec_dot_prod_conj_ccc_simd(const cf_t* x, const cf_t* y, const int l cf_t result = 0; #if SRSLTE_SIMD_CF_SIZE - __attribute__((aligned(256))) cf_t simd_dotProdVector[SRSLTE_SIMD_CF_SIZE]; + if (len >= SRSLTE_SIMD_CF_SIZE) { + simd_cf_t avx_result = srslte_simd_cf_zero(); + if (SRSLTE_IS_ALIGNED(x) && SRSLTE_IS_ALIGNED(y)) { + for (; i < len - SRSLTE_SIMD_CF_SIZE + 1; i += SRSLTE_SIMD_CF_SIZE) { + simd_cf_t xVal = srslte_simd_cfi_load(&x[i]); + simd_cf_t yVal = srslte_simd_cfi_load(&y[i]); - simd_cf_t simd_result = srslte_simd_cf_zero(); - if (SRSLTE_IS_ALIGNED(x) && SRSLTE_IS_ALIGNED(y)) { - for (; i < len - SRSLTE_SIMD_CF_SIZE + 1; i += SRSLTE_SIMD_CF_SIZE) { - simd_cf_t xVal = srslte_simd_cfi_load(&x[i]); - simd_cf_t yVal = srslte_simd_cfi_load(&y[i]); + avx_result = srslte_simd_cf_add(srslte_simd_cf_conjprod(xVal, yVal), avx_result); + } + } else { + for (; i < len - SRSLTE_SIMD_CF_SIZE + 1; i += SRSLTE_SIMD_CF_SIZE) { + simd_cf_t xVal = srslte_simd_cfi_loadu(&x[i]); + simd_cf_t yVal = srslte_simd_cfi_loadu(&y[i]); - simd_result = srslte_simd_cf_add(srslte_simd_cf_conjprod(xVal, yVal), simd_result); + avx_result = srslte_simd_cf_add(srslte_simd_cf_conjprod(xVal, yVal), avx_result); + } } - } else { - for (; i < len - SRSLTE_SIMD_CF_SIZE + 1; i += SRSLTE_SIMD_CF_SIZE) { - simd_cf_t xVal = srslte_simd_cfi_loadu(&x[i]); - simd_cf_t yVal = srslte_simd_cfi_loadu(&y[i]); - simd_result = srslte_simd_cf_add(srslte_simd_cf_conjprod(xVal, yVal), simd_result); - } - } + __attribute__((aligned(64))) float simd_dotProdVector[SRSLTE_SIMD_CF_SIZE]; + simd_f_t acc_re = srslte_simd_cf_re(avx_result); + simd_f_t acc_im = srslte_simd_cf_im(avx_result); - srslte_simd_cfi_store(simd_dotProdVector, simd_result); - for (int k = 0; k < SRSLTE_SIMD_CF_SIZE; k++) { - result += simd_dotProdVector[k]; + simd_f_t acc = srslte_simd_f_hadd(acc_re, acc_im); + for (int j = 2; j < SRSLTE_SIMD_F_SIZE; j *= 2) { + acc = srslte_simd_f_hadd(acc, acc); + } + srslte_simd_f_store(simd_dotProdVector, acc); + __real__ result = simd_dotProdVector[0]; + __imag__ result = simd_dotProdVector[1]; } #endif