diff --git a/lib/include/srslte/phy/mimo/precoding.h b/lib/include/srslte/phy/mimo/precoding.h index d363231e2..5caf4f0ca 100644 --- a/lib/include/srslte/phy/mimo/precoding.h +++ b/lib/include/srslte/phy/mimo/precoding.h @@ -87,7 +87,7 @@ SRSLTE_API int srslte_predecoding_single(cf_t *y, SRSLTE_API int srslte_predecoding_single_multi(cf_t *y[SRSLTE_MAX_PORTS], cf_t *h[SRSLTE_MAX_PORTS], cf_t *x, - float *csi, + float *csi[SRSLTE_MAX_CODEWORDS], int nof_rxant, int nof_symbols, float scaling, @@ -102,7 +102,8 @@ SRSLTE_API int srslte_predecoding_diversity(cf_t *y, SRSLTE_API int srslte_predecoding_diversity_multi(cf_t *y[SRSLTE_MAX_PORTS], cf_t *h[SRSLTE_MAX_PORTS][SRSLTE_MAX_PORTS], - cf_t *x[SRSLTE_MAX_LAYERS], + cf_t *x[SRSLTE_MAX_LAYERS], + float *csi[SRSLTE_MAX_LAYERS], int nof_rxant, int nof_ports, int nof_symbols, @@ -113,7 +114,7 @@ SRSLTE_API void srslte_predecoding_set_mimo_decoder (srslte_mimo_decoder_t _mimo SRSLTE_API int srslte_predecoding_type(cf_t *y[SRSLTE_MAX_PORTS], cf_t *h[SRSLTE_MAX_PORTS][SRSLTE_MAX_PORTS], cf_t *x[SRSLTE_MAX_LAYERS], - float *csi, + float *csi[SRSLTE_MAX_CODEWORDS], int nof_rxant, int nof_ports, int nof_layers, diff --git a/lib/include/srslte/phy/utils/mat.h b/lib/include/srslte/phy/utils/mat.h index a0f11abe3..8db0205f9 100644 --- a/lib/include/srslte/phy/utils/mat.h +++ b/lib/include/srslte/phy/utils/mat.h @@ -53,12 +53,17 @@ SRSLTE_API void srslte_mat_2x2_mmse_gen(cf_t y0, cf_t y1, float noise_estimate, float norm); +SRSLTE_API void srslte_mat_2x2_mmse_csi_gen(cf_t y0, cf_t y1, + cf_t h00, cf_t h01, cf_t h10, cf_t h11, + cf_t *x0, cf_t *x1, float *csi0, float *csi1, + float noise_estimate, + float norm); + SRSLTE_API float srslte_mat_2x2_cn(cf_t h00, cf_t h01, cf_t h10, cf_t h11); - #ifdef LV_HAVE_SSE /* SSE implementation for complex reciprocal */ @@ -103,4 +108,114 @@ SRSLTE_API void srslte_mat_2x2_mmse_avx(__m256 y0, __m256 y1, #endif /* LV_HAVE_AVX */ -#endif // SRSLTE_MAT_H +#if SRSLTE_SIMD_CF_SIZE != 0 + +/* Generic SIMD implementation for 2x2 determinant */ +static inline simd_cf_t srslte_mat_2x2_det_simd(simd_cf_t a00, simd_cf_t a01, simd_cf_t a10, simd_cf_t a11) { + return srslte_simd_cf_sub(srslte_simd_cf_prod(a00, a11), srslte_simd_cf_prod(a01, a10)); +} + +/* Generic SIMD implementation for Zero Forcing (ZF) solver */ +static inline void srslte_mat_2x2_zf_csi_simd(simd_cf_t y0, + simd_cf_t y1, + simd_cf_t h00, + simd_cf_t h01, + simd_cf_t h10, + simd_cf_t h11, + simd_cf_t *x0, + simd_cf_t *x1, + simd_f_t *csi0, + simd_f_t *csi1, + float norm) { + simd_cf_t det = srslte_mat_2x2_det_simd(h00, h01, h10, h11); + simd_cf_t detrec = srslte_simd_cf_mul(srslte_simd_cf_rcp(det), srslte_simd_f_set1(norm)); + + *x0 = srslte_simd_cf_prod(srslte_simd_cf_sub(srslte_simd_cf_prod(h11, y0), srslte_simd_cf_prod(h01, y1)), detrec); + *x1 = srslte_simd_cf_prod(srslte_simd_cf_sub(srslte_simd_cf_prod(h00, y1), srslte_simd_cf_prod(h10, y0)), detrec); + + *csi0 = srslte_simd_f_set1(1.0f); + *csi1 = srslte_simd_f_set1(1.0f); +} + +static inline void srslte_mat_2x2_zf_simd(simd_cf_t y0, + simd_cf_t y1, + simd_cf_t h00, + simd_cf_t h01, + simd_cf_t h10, + simd_cf_t h11, + simd_cf_t *x0, + simd_cf_t *x1, + float norm) { + simd_f_t csi1, csi2; + srslte_mat_2x2_zf_csi_simd(y0, y1, h00, h01, h10, h11, x0, x1, &csi1, &csi2, norm); +} + +/* Generic SIMD implementation for Minimum Mean Squared Error (MMSE) solver */ +static inline void srslte_mat_2x2_mmse_csi_simd(simd_cf_t y0, + simd_cf_t y1, + simd_cf_t h00, + simd_cf_t h01, + simd_cf_t h10, + simd_cf_t h11, + simd_cf_t *x0, + simd_cf_t *x1, + simd_f_t *csi0, + simd_f_t *csi1, + float noise_estimate, + float norm) { + simd_cf_t _noise_estimate; + simd_f_t _norm = srslte_simd_f_set1(norm); + + _noise_estimate.re = srslte_simd_f_set1(noise_estimate); + _noise_estimate.im = srslte_simd_f_zero(); + + /* 1. A = H' x H + No*/ + simd_cf_t a00 = + srslte_simd_cf_add(srslte_simd_cf_add(srslte_simd_cf_conjprod(h00, h00), srslte_simd_cf_conjprod(h10, h10)), + _noise_estimate); + simd_cf_t a01 = srslte_simd_cf_add(srslte_simd_cf_conjprod(h01, h00), srslte_simd_cf_conjprod(h11, h10)); + simd_cf_t a10 = srslte_simd_cf_add(srslte_simd_cf_conjprod(h00, h01), srslte_simd_cf_conjprod(h10, h11)); + simd_cf_t a11 = + srslte_simd_cf_add(srslte_simd_cf_add(srslte_simd_cf_conjprod(h01, h01), srslte_simd_cf_conjprod(h11, h11)), + _noise_estimate); + simd_cf_t a_det_rcp = srslte_simd_cf_rcp(srslte_mat_2x2_det_simd(a00, a01, a10, a11)); + + /* 2. B = inv(H' x H + No) = inv(A) */ + simd_cf_t _norm2 = srslte_simd_cf_mul(a_det_rcp, _norm); + simd_cf_t b00 = srslte_simd_cf_prod(a11, _norm2); + simd_cf_t b01 = srslte_simd_cf_prod(srslte_simd_cf_neg(a01), _norm2); + simd_cf_t b10 = srslte_simd_cf_prod(srslte_simd_cf_neg(a10), _norm2); + simd_cf_t b11 = srslte_simd_cf_prod(a00, _norm2); + + + /* 3. W = inv(H' x H + No) x H' = B x H' */ + simd_cf_t w00 = srslte_simd_cf_add(srslte_simd_cf_conjprod(b00, h00), srslte_simd_cf_conjprod(b01, h01)); + simd_cf_t w01 = srslte_simd_cf_add(srslte_simd_cf_conjprod(b00, h10), srslte_simd_cf_conjprod(b01, h11)); + simd_cf_t w10 = srslte_simd_cf_add(srslte_simd_cf_conjprod(b10, h00), srslte_simd_cf_conjprod(b11, h01)); + simd_cf_t w11 = srslte_simd_cf_add(srslte_simd_cf_conjprod(b10, h10), srslte_simd_cf_conjprod(b11, h11)); + + /* 4. X = W x Y */ + *x0 = srslte_simd_cf_add(srslte_simd_cf_prod(y0, w00), srslte_simd_cf_prod(y1, w01)); + *x1 = srslte_simd_cf_add(srslte_simd_cf_prod(y0, w10), srslte_simd_cf_prod(y1, w11)); + + /* 5. Extract CSI */ + *csi0 = srslte_simd_f_rcp(srslte_simd_cf_re(b00)); + *csi1 = srslte_simd_f_rcp(srslte_simd_cf_re(b11)); +} + +static inline void srslte_mat_2x2_mmse_simd(simd_cf_t y0, + simd_cf_t y1, + simd_cf_t h00, + simd_cf_t h01, + simd_cf_t h10, + simd_cf_t h11, + simd_cf_t *x0, + simd_cf_t *x1, + float noise_estimate, + float norm) { + simd_f_t csi0, csi1; + srslte_mat_2x2_mmse_csi_simd(y0, y1, h00, h01, h10, h11, x0, x1, &csi0, &csi1, noise_estimate, norm); +} + +#endif /* SRSLTE_SIMD_CF_SIZE != 0 */ +#endif /* SRSLTE_MAT_H */ diff --git a/lib/include/srslte/phy/utils/simd.h b/lib/include/srslte/phy/utils/simd.h index f6d7a0a44..e9cb4da30 100644 --- a/lib/include/srslte/phy/utils/simd.h +++ b/lib/include/srslte/phy/utils/simd.h @@ -197,7 +197,7 @@ static inline simd_f_t srslte_simd_f_loadu(const float *ptr) { #ifdef LV_HAVE_AVX512 return _mm512_loadu_ps(ptr); #else /* LV_HAVE_AVX512 */ - #ifdef LV_HAVE_AVX2 +#ifdef LV_HAVE_AVX2 return _mm256_loadu_ps(ptr); #else /* LV_HAVE_AVX2 */ #ifdef LV_HAVE_SSE @@ -233,7 +233,7 @@ static inline void srslte_simd_f_storeu(float *ptr, simd_f_t simdreg) { #ifdef LV_HAVE_AVX512 _mm512_storeu_ps(ptr, simdreg); #else /* LV_HAVE_AVX512 */ - #ifdef LV_HAVE_AVX2 +#ifdef LV_HAVE_AVX2 _mm256_storeu_ps(ptr, simdreg); #else /* LV_HAVE_AVX2 */ #ifdef LV_HAVE_SSE @@ -360,7 +360,7 @@ static inline simd_f_t srslte_simd_f_add(simd_f_t a, simd_f_t b) { #ifdef LV_HAVE_AVX2 return _mm256_add_ps(a, b); #else /* LV_HAVE_AVX2 */ -#ifdef LV_HAVE_SSE +#ifdef LV_HAVE_SSE return _mm_add_ps(a, b); #else /* LV_HAVE_SSE */ #ifdef HAVE_NEON @@ -376,9 +376,9 @@ static inline simd_f_t srslte_simd_f_zero (void) { return _mm512_setzero_ps(); #else /* LV_HAVE_AVX512 */ #ifdef LV_HAVE_AVX2 - return _mm256_setzero_ps(); + return _mm256_setzero_ps(); #else /* LV_HAVE_AVX2 */ -#ifdef LV_HAVE_SSE +#ifdef LV_HAVE_SSE return _mm_setzero_ps(); #else /* LV_HAVE_SSE */ #ifdef HAVE_NEON @@ -401,7 +401,7 @@ static inline simd_f_t srslte_simd_f_swap(simd_f_t a) { #else /* LV_HAVE_SSE */ #ifdef HAVE_NEON return vcombine_f32(vrev64_f32(vget_low_f32(a)), vrev64_f32(vget_high_f32(a))); -#endif /* HAVE_NEON */ +#endif /* HAVE_NEON */ #endif /* LV_HAVE_SSE */ #endif /* LV_HAVE_AVX2 */ #endif /* LV_HAVE_AVX512 */ @@ -443,7 +443,7 @@ static inline simd_f_t srslte_simd_f_sqrt(simd_f_t a) { #ifdef LV_HAVE_AVX512 return _mm512_sqrt_ps(a); #else /* LV_HAVE_AVX512 */ - #ifdef LV_HAVE_AVX2 +#ifdef LV_HAVE_AVX2 return _mm256_sqrt_ps(a); #else /* LV_HAVE_AVX2 */ #ifdef LV_HAVE_SSE @@ -458,7 +458,43 @@ static inline simd_f_t srslte_simd_f_sqrt(simd_f_t a) { float32x4_t zeros = vmovq_n_f32(0); /* Zero vector */ uint32x4_t mask = vceqq_f32(a, zeros); /* Zero vector mask */ return vbslq_f32(mask, zeros, result); /* Force zero results and return */ -#endif /* HAVE_NEON */ +#endif /* HAVE_NEON */ +#endif /* LV_HAVE_SSE */ +#endif /* LV_HAVE_AVX2 */ +#endif /* LV_HAVE_AVX512 */ +} + +static inline simd_f_t srslte_simd_f_neg(simd_f_t a) { +#ifdef LV_HAVE_AVX512 + return _mm512_xor_ps(_mm512_set1_ps(-0.0f), a); +#else /* LV_HAVE_AVX512 */ +#ifdef LV_HAVE_AVX2 + return _mm256_xor_ps(_mm256_set1_ps(-0.0f), a); +#else /* LV_HAVE_AVX2 */ + #ifdef LV_HAVE_SSE + return _mm_xor_ps(_mm_set1_ps(-0.0f), a); +#else /* LV_HAVE_SSE */ +#ifdef HAVE_NEON + return vnegq_f32(a); +#endif /* HAVE_NEON */ +#endif /* LV_HAVE_SSE */ +#endif /* LV_HAVE_AVX2 */ +#endif /* LV_HAVE_AVX512 */ +} + +static inline simd_f_t srslte_simd_f_neg_mask(simd_f_t a, simd_f_t mask) { +#ifdef LV_HAVE_AVX512 + return _mm512_xor_ps(mask, a); +#else /* LV_HAVE_AVX512 */ +#ifdef LV_HAVE_AVX2 + return _mm256_xor_ps(mask, a); +#else /* LV_HAVE_AVX2 */ + #ifdef LV_HAVE_SSE + return _mm_xor_ps(mask, a); +#else /* LV_HAVE_SSE */ +#ifdef HAVE_NEON + return (float32x4_t) veorq_s32((int32x4_t) a, (int32x4_t) mask); +#endif /* HAVE_NEON */ #endif /* LV_HAVE_SSE */ #endif /* LV_HAVE_AVX2 */ #endif /* LV_HAVE_AVX512 */ @@ -470,12 +506,11 @@ static inline simd_f_t srslte_simd_f_sqrt(simd_f_t a) { #if SRSLTE_SIMD_CF_SIZE #ifdef HAVE_NEON - typedef float32x4x2_t simd_cf_t; +typedef float32x4x2_t simd_cf_t; #else typedef struct { simd_f_t re; simd_f_t im; - } simd_cf_t; #endif @@ -667,8 +702,8 @@ static inline void srslte_simd_cf_store(float *re, float *im, simd_cf_t simdreg) _mm512_store_ps(im, simdreg.im); #else /* LV_HAVE_AVX512 */ #ifdef LV_HAVE_AVX2 - _mm256_store_ps((float *) re, simdreg.re); - _mm256_store_ps((float *) im, simdreg.im); + _mm256_store_ps(re, simdreg.re); + _mm256_store_ps(im, simdreg.im); #else /* LV_HAVE_AVX512 */ #ifdef LV_HAVE_SSE _mm_store_ps((float *) re, simdreg.re); @@ -689,8 +724,8 @@ static inline void srslte_simd_cf_storeu(float *re, float *im, simd_cf_t simdreg _mm512_storeu_ps(im, simdreg.im); #else /* LV_HAVE_AVX512 */ #ifdef LV_HAVE_AVX2 - _mm256_storeu_ps((float *) re, simdreg.re); - _mm256_storeu_ps((float *) im, simdreg.im); + _mm256_storeu_ps(re, simdreg.re); + _mm256_storeu_ps(im, simdreg.im); #else /* LV_HAVE_AVX512 */ #ifdef LV_HAVE_SSE _mm_storeu_ps((float *) re, simdreg.re); @@ -833,8 +868,32 @@ static inline simd_cf_t srslte_simd_cf_add (simd_cf_t a, simd_cf_t b) { return ret; } +static inline simd_cf_t srslte_simd_cf_sub (simd_cf_t a, simd_cf_t b) { + simd_cf_t ret; +#ifdef LV_HAVE_AVX512 + ret.re = _mm512_sub_ps(a.re, b.re); + ret.im = _mm512_sub_ps(a.im, b.im); +#else /* LV_HAVE_AVX512 */ +#ifdef LV_HAVE_AVX2 + ret.re = _mm256_sub_ps(a.re, b.re); + ret.im = _mm256_sub_ps(a.im, b.im); +#else /* LV_HAVE_AVX2 */ + #ifdef LV_HAVE_SSE + ret.re = _mm_sub_ps(a.re, b.re); + ret.im = _mm_sub_ps(a.im, b.im); +#else /* LV_HAVE_SSE */ +#ifdef HAVE_NEON + ret.val[0] = vsubq_f32(a.val[0],b.val[0]); + ret.val[1] = vsubq_f32(a.val[1],b.val[1]); +#endif /* HAVE_NEON */ +#endif /* LV_HAVE_SSE */ +#endif /* LV_HAVE_AVX2 */ +#endif /* LV_HAVE_AVX512 */ + return ret; +} + static inline simd_cf_t srslte_simd_cf_mul (simd_cf_t a, simd_f_t b) { - simd_cf_t ret; + simd_cf_t ret; #ifdef LV_HAVE_AVX512 ret.re = _mm512_mul_ps(a.re, b); ret.im = _mm512_mul_ps(a.im, b); @@ -855,7 +914,7 @@ static inline simd_cf_t srslte_simd_cf_mul (simd_cf_t a, simd_f_t b) { #endif /* LV_HAVE_SSE */ #endif /* LV_HAVE_AVX2 */ #endif /* LV_HAVE_AVX512 */ - return ret; + return ret; } static inline simd_cf_t srslte_simd_cf_rcp (simd_cf_t a) { @@ -902,6 +961,59 @@ static inline simd_cf_t srslte_simd_cf_rcp (simd_cf_t a) { return ret; } +static inline simd_cf_t srslte_simd_cf_neg (simd_cf_t a) { + simd_cf_t ret; +#if LV_HAVE_NEON + ret.val[0] = srslte_simd_f_neg(a.val[0]); + ret.val[1] = srslte_simd_f_neg(a.val[1]); +#else /* LV_HAVE_NEON */ + ret.re = srslte_simd_f_neg(a.re); + ret.im = srslte_simd_f_neg(a.im); +#endif /* LV_HAVE_NEON */ + return ret; +} + +static inline simd_cf_t srslte_simd_cf_neg_mask (simd_cf_t a, simd_f_t mask) { + simd_cf_t ret; +#ifndef LV_HAVE_AVX512 +#ifdef LV_HAVE_AVX2 + mask = _mm256_permutevar8x32_ps(mask, _mm256_setr_epi32(0,4,1,5,2,6,3,7)); +#endif /* LV_HAVE_AVX2 */ +#endif /* LV_HAVE_AVX512 */ +#if LV_HAVE_NEON + ret.val[0] = srslte_simd_f_neg_mask(a.val[0], mask); + ret.val[1] = srslte_simd_f_neg_mask(a.val[1], mask); +#else /* LV_HAVE_NEON */ + ret.re = srslte_simd_f_neg_mask(a.re, mask); + ret.im = srslte_simd_f_neg_mask(a.im, mask); +#endif /* LV_HAVE_NEON */ + return ret; +} + +static inline simd_cf_t srslte_simd_cf_conj (simd_cf_t a) { + simd_cf_t ret; +#if LV_HAVE_NEON + ret.val[0] = a.val[0]; + ret.val[1] = srslte_simd_f_neg(a.val[1]); +#else /* LV_HAVE_NEON */ + ret.re = a.re; + ret.im = srslte_simd_f_neg(a.im); +#endif /* LV_HAVE_NEON */ + return ret; +} + +static inline simd_cf_t srslte_simd_cf_mulj (simd_cf_t a) { + simd_cf_t ret; +#if LV_HAVE_NEON + ret.val[0] = srslte_simd_f_neg(a.val[1]); + ret.val[1] = a.val[0]; +#else /* LV_HAVE_NEON */ + ret.re = srslte_simd_f_neg(a.im); + ret.im = a.re; +#endif /* LV_HAVE_NEON */ + return ret; +} + static inline simd_cf_t srslte_simd_cf_zero (void) { simd_cf_t ret; #ifdef LV_HAVE_AVX512 @@ -1057,7 +1169,7 @@ static inline simd_i_t srslte_simd_i_select(simd_i_t a, simd_i_t b, simd_sel_t s int* sel = (int*) &selector; int* c_ptr = (int*) &ret; for(int i = 0;i<4;i++) - { + { if(sel[i] == -1){ c_ptr[i] = b_ptr[i]; }else{ @@ -1115,7 +1227,7 @@ static inline simd_s_t srslte_simd_s_loadu(const int16_t *ptr) { #ifdef LV_HAVE_AVX512 return _mm512_loadu_si512(ptr); #else /* LV_HAVE_AVX512 */ - #ifdef LV_HAVE_AVX2 +#ifdef LV_HAVE_AVX2 return _mm256_loadu_si256((__m256i*) ptr); #else /* LV_HAVE_AVX2 */ #ifdef LV_HAVE_SSE diff --git a/lib/src/phy/mimo/layermap.c b/lib/src/phy/mimo/layermap.c index a5bfc8b92..6de30fca7 100644 --- a/lib/src/phy/mimo/layermap.c +++ b/lib/src/phy/mimo/layermap.c @@ -54,7 +54,7 @@ int srslte_layermap_multiplex(cf_t *d[SRSLTE_MAX_CODEWORDS], cf_t *x[SRSLTE_MAX_ int nof_symbols[SRSLTE_MAX_CODEWORDS]) { if (nof_cw == nof_layers) { for (int i = 0; i < nof_cw; i++) { - srs_vec_cf_cpy(x[i], d[i], (uint32_t) nof_symbols[0]); + srs_vec_cf_cpy(d[i], x[i], (uint32_t) nof_symbols[0]); } return nof_symbols[0]; } else if (nof_cw == 1) { diff --git a/lib/src/phy/mimo/precoding.c b/lib/src/phy/mimo/precoding.c index f8faeda75..2f987c898 100644 --- a/lib/src/phy/mimo/precoding.c +++ b/lib/src/phy/mimo/precoding.c @@ -283,13 +283,13 @@ int srslte_predecoding_single_csi(cf_t *y[SRSLTE_MAX_PORTS], cf_t *h[SRSLTE_MAX_ for (; i < nof_symbols; i++) { cf_t r = 0; float hh = 0; - float _scaling = 1.0f / scaling; + float norm = 1.0f / scaling; for (int p = 0; p < nof_rxant; p++) { r += y[p][i] * conj(h[p][i]); hh += (__real__ h[p][i] * __real__ h[p][i]) + (__imag__ h[p][i] * __imag__ h[p][i]); } csi[i] = hh + noise_estimate; - x[i] = r * _scaling / csi[i]; + x[i] = r * norm / csi[i]; } return nof_symbols; } @@ -327,10 +327,10 @@ int srslte_predecoding_single(cf_t *y_, cf_t *h_, cf_t *x, float *csi, int nof_s } /* ZF/MMSE SISO equalizer x=y(h'h+no)^(-1)h' (ZF if n0=0.0)*/ -int srslte_predecoding_single_multi(cf_t *y[SRSLTE_MAX_PORTS], cf_t *h[SRSLTE_MAX_PORTS], cf_t *x, float *csi, +int srslte_predecoding_single_multi(cf_t *y[SRSLTE_MAX_PORTS], cf_t *h[SRSLTE_MAX_PORTS], cf_t *x, float *csi[SRSLTE_MAX_CODEWORDS], int nof_rxant, int nof_symbols, float scaling, float noise_estimate) { - if (csi) { - return srslte_predecoding_single_csi(y, h, x, csi, nof_rxant, nof_symbols, scaling, noise_estimate); + if (csi && csi[0]) { + return srslte_predecoding_single_csi(y, h, x, csi[0], nof_rxant, nof_symbols, scaling, noise_estimate); } #ifdef LV_HAVE_AVX @@ -566,18 +566,123 @@ int srslte_predecoding_diversity(cf_t *y_, cf_t *h_[SRSLTE_MAX_PORTS], cf_t *x[S #endif } +int srslte_predecoding_diversity_csi(cf_t *y[SRSLTE_MAX_PORTS], cf_t *h[SRSLTE_MAX_PORTS][SRSLTE_MAX_PORTS], + cf_t *x[SRSLTE_MAX_LAYERS], float *csi[SRSLTE_MAX_CODEWORDS], + int nof_rxant, int nof_ports, int nof_symbols, float scaling) { + int i; + if (nof_ports == 2) { + cf_t h00, h01, h10, h11, r0, r1; + + for (i = 0; i < nof_symbols / 2; i++) { + float hh = 0; + cf_t x0 = 0; + cf_t x1 = 0; + for (int p=0;p 32 && nof_ports == 2) { - return srslte_predecoding_diversity2_sse(y, h, x, nof_rxant, nof_symbols, scaling); + if (csi && csi[0]) { + return srslte_predecoding_diversity_csi(y, h, x, csi, nof_rxant, nof_ports, nof_symbols, scaling); } else { +#ifdef LV_HAVE_SSE + if (nof_symbols > 32 && nof_ports == 2) { + return srslte_predecoding_diversity2_sse(y, h, x, nof_rxant, nof_symbols, scaling); + } else { + return srslte_predecoding_diversity_gen(y, h, x, nof_rxant, nof_ports, nof_symbols, scaling); + } +#else return srslte_predecoding_diversity_gen(y, h, x, nof_rxant, nof_ports, nof_symbols, scaling); +#endif } -#else - return srslte_predecoding_diversity_gen(y, h, x, nof_rxant, nof_ports, nof_symbols, scaling); -#endif } int srslte_precoding_mimo_2x2_gen(cf_t W[2][2], cf_t *y[SRSLTE_MAX_PORTS], cf_t *h[SRSLTE_MAX_PORTS][SRSLTE_MAX_PORTS], cf_t *x[SRSLTE_MAX_LAYERS], @@ -614,102 +719,164 @@ int srslte_precoding_mimo_2x2_gen(cf_t W[2][2], cf_t *y[SRSLTE_MAX_PORTS], cf_t return SRSLTE_SUCCESS; } -// AVX implementation of ZF 2x2 CCD equalizer -#ifdef LV_HAVE_AVX - -int srslte_predecoding_ccd_2x2_zf_avx(cf_t *y[SRSLTE_MAX_PORTS], - cf_t *h[SRSLTE_MAX_PORTS][SRSLTE_MAX_PORTS], - cf_t *x[SRSLTE_MAX_LAYERS], - uint32_t nof_symbols, - float scaling) { +static int srslte_predecoding_ccd_2x2_zf_csi(cf_t *y[SRSLTE_MAX_PORTS], + cf_t *h[SRSLTE_MAX_PORTS][SRSLTE_MAX_PORTS], + cf_t *x[SRSLTE_MAX_LAYERS], + float *csi[SRSLTE_MAX_CODEWORDS], + int nof_symbols, + float scaling) { uint32_t i = 0; + float norm = 2.0f / scaling; + +#if SRSLTE_SIMD_CF_SIZE != 0 + +#if SRSLTE_SIMD_CF_SIZE == 16 + float _mask1[SRSLTE_SIMD_CF_SIZE] = {+0.0f, +0.0f, -0.0f, -0.0f, +0.0f, +0.0f, -0.0f, -0.0f, + +0.0f, +0.0f, -0.0f, -0.0f, +0.0f, +0.0f, -0.0f, -0.0f}; + float _mask2[SRSLTE_SIMD_CF_SIZE] = {-0.0f, -0.0f, +0.0f, +0.0f, -0.0f, -0.0f, +0.0f, +0.0f + -0.0f, -0.0f, +0.0f, +0.0f, -0.0f, -0.0f, +0.0f, +0.0f}; +#elif SRSLTE_SIMD_CF_SIZE == 8 + float _mask1[SRSLTE_SIMD_CF_SIZE] = {+0.0f, +0.0f, -0.0f, -0.0f, +0.0f, +0.0f, -0.0f, -0.0f}; + float _mask2[SRSLTE_SIMD_CF_SIZE] = {-0.0f, -0.0f, +0.0f, +0.0f, -0.0f, -0.0f, +0.0f, +0.0f}; +#elif SRSLTE_SIMD_CF_SIZE == 4 + float _mask1[SRSLTE_SIMD_CF_SIZE] = {+0.0f, +0.0f, -0.0f, -0.0f}; + float _mask2[SRSLTE_SIMD_CF_SIZE] = {-0.0f, -0.0f, +0.0f, +0.0f}; +#endif - __m256 mask0 = _mm256_setr_ps(+0.0f, +0.0f, -0.0f, -0.0f, +0.0f, +0.0f, -0.0f, -0.0f); - __m256 mask1 = _mm256_setr_ps(-0.0f, -0.0f, +0.0f, +0.0f, -0.0f, -0.0f, +0.0f, +0.0f); + simd_f_t mask1 = srslte_simd_f_loadu(_mask1); + simd_f_t mask2 = srslte_simd_f_loadu(_mask2); - for (i = 0; i < nof_symbols - 3; i += 4) { + for (; i < nof_symbols - SRSLTE_SIMD_CF_SIZE + 1; i += SRSLTE_SIMD_CF_SIZE) { /* Load channel */ - __m256 h00i = _mm256_load_ps((float *) &h[0][0][i]); - __m256 h01i = _mm256_load_ps((float *) &h[0][1][i]); - __m256 h10i = _mm256_load_ps((float *) &h[1][0][i]); - __m256 h11i = _mm256_load_ps((float *) &h[1][1][i]); + simd_cf_t h00i = srslte_simd_cfi_load(&h[0][0][i]); + simd_cf_t h01i = srslte_simd_cfi_load(&h[0][1][i]); + simd_cf_t h10i = srslte_simd_cfi_load(&h[1][0][i]); + simd_cf_t h11i = srslte_simd_cfi_load(&h[1][1][i]); /* Apply precoding */ - __m256 h00 = _mm256_add_ps(h00i, _mm256_xor_ps(h10i, mask0)); - __m256 h10 = _mm256_add_ps(h01i, _mm256_xor_ps(h11i, mask0)); - __m256 h01 = _mm256_add_ps(h00i, _mm256_xor_ps(h10i, mask1)); - __m256 h11 = _mm256_add_ps(h01i, _mm256_xor_ps(h11i, mask1)); + simd_cf_t h00, h01, h10, h11; + h00 = srslte_simd_cf_add(h00i, srslte_simd_cf_neg_mask(h10i, mask1)); + h10 = srslte_simd_cf_add(h01i, srslte_simd_cf_neg_mask(h11i, mask1)); + h01 = srslte_simd_cf_add(h00i, srslte_simd_cf_neg_mask(h10i, mask2)); + h11 = srslte_simd_cf_add(h01i, srslte_simd_cf_neg_mask(h11i, mask2)); - __m256 y0 = _mm256_load_ps((float *) &y[0][i]); - __m256 y1 = _mm256_load_ps((float *) &y[1][i]); + simd_cf_t y0 = srslte_simd_cfi_load(&y[0][i]); + simd_cf_t y1 = srslte_simd_cfi_load(&y[1][i]); - __m256 x0, x1; + simd_cf_t x0, x1; + simd_f_t csi0, csi1; - srslte_mat_2x2_zf_avx(y0, y1, h00, h01, h10, h11, &x0, &x1, 2.0f / scaling); + srslte_mat_2x2_zf_csi_simd(y0, y1, h00, h01, h10, h11, &x0, &x1, &csi0, &csi1, norm); - _mm256_store_ps((float *) &x[0][i], x0); - _mm256_store_ps((float *) &x[1][i], x1); + srslte_simd_cfi_store(&x[0][i], x0); + srslte_simd_cfi_store(&x[1][i], x1); + + srslte_simd_f_store(&csi[0][i], csi0); + srslte_simd_f_store(&csi[1][i], csi1); } +#endif /* SRSLTE_SIMD_CF_SIZE != 0 */ - return nof_symbols; -} -#endif /* LV_HAVE_AVX */ + cf_t h00, h01, h10, h11, det; + for (; i < nof_symbols; i++) { -// SSE implementation of ZF 2x2 CCD equalizer -#ifdef LV_HAVE_SSE + // Even precoder + h00 = +h[0][0][i] + h[1][0][i]; + h10 = +h[0][1][i] + h[1][1][i]; + h01 = +h[0][0][i] - h[1][0][i]; + h11 = +h[0][1][i] - h[1][1][i]; + det = (h00 * h11 - h01 * h10); + det = conjf(det) * (norm / (crealf(det) * crealf(det) + cimagf(det) * cimagf(det))); + + x[0][i] = (+h11 * y[0][i] - h01 * y[1][i]) * det; + x[1][i] = (-h10 * y[0][i] + h00 * y[1][i]) * det; + + csi[0][i] = 1.0f; + csi[1][i] = 1.0f; + + i++; + + // Odd precoder + h00 = h[0][0][i] - h[1][0][i]; + h10 = h[0][1][i] - h[1][1][i]; + h01 = h[0][0][i] + h[1][0][i]; + h11 = h[0][1][i] + h[1][1][i]; + det = (h00 * h11 - h01 * h10); + det = conjf(det) * (norm / (crealf(det) * crealf(det) + cimagf(det) * cimagf(det))); + + x[0][i] = (+h11 * y[0][i] - h01 * y[1][i]) * det; + x[1][i] = (-h10 * y[0][i] + h00 * y[1][i]) * det; -int srslte_predecoding_ccd_2x2_zf_sse(cf_t *y[SRSLTE_MAX_PORTS], - cf_t *h[SRSLTE_MAX_PORTS][SRSLTE_MAX_PORTS], - cf_t *x[SRSLTE_MAX_LAYERS], - uint32_t nof_symbols, - float scaling) { + csi[0][i] = 1.0f; + csi[1][i] = 1.0f; + } + return SRSLTE_SUCCESS; +} + +static int srslte_predecoding_ccd_2x2_zf(cf_t *y[SRSLTE_MAX_PORTS], + cf_t *h[SRSLTE_MAX_PORTS][SRSLTE_MAX_PORTS], + cf_t *x[SRSLTE_MAX_LAYERS], + int nof_symbols, + float scaling) { uint32_t i = 0; + float norm = 2.0f / scaling; + +#if SRSLTE_SIMD_CF_SIZE != 0 + +#if SRSLTE_SIMD_CF_SIZE == 16 + float _mask1[SRSLTE_SIMD_CF_SIZE] = {+0.0f, +0.0f, -0.0f, -0.0f, +0.0f, +0.0f, -0.0f, -0.0f, + +0.0f, +0.0f, -0.0f, -0.0f, +0.0f, +0.0f, -0.0f, -0.0f}; + float _mask2[SRSLTE_SIMD_CF_SIZE] = {-0.0f, -0.0f, +0.0f, +0.0f, -0.0f, -0.0f, +0.0f, +0.0f + -0.0f, -0.0f, +0.0f, +0.0f, -0.0f, -0.0f, +0.0f, +0.0f}; +#elif SRSLTE_SIMD_CF_SIZE == 8 + float _mask1[SRSLTE_SIMD_CF_SIZE] = {+0.0f, +0.0f, -0.0f, -0.0f, +0.0f, +0.0f, -0.0f, -0.0f}; + float _mask2[SRSLTE_SIMD_CF_SIZE] = {-0.0f, -0.0f, +0.0f, +0.0f, -0.0f, -0.0f, +0.0f, +0.0f}; +#elif SRSLTE_SIMD_CF_SIZE == 4 + float _mask1[SRSLTE_SIMD_CF_SIZE] = {+0.0f, +0.0f, -0.0f, -0.0f}; + float _mask2[SRSLTE_SIMD_CF_SIZE] = {-0.0f, -0.0f, +0.0f, +0.0f}; +#endif + + simd_f_t mask1 = srslte_simd_f_loadu(_mask1); + simd_f_t mask2 = srslte_simd_f_loadu(_mask2); - for (i = 0; i < nof_symbols - 1; i += 2) { + for (; i < nof_symbols - SRSLTE_SIMD_CF_SIZE + 1; i += SRSLTE_SIMD_CF_SIZE) { /* Load channel */ - __m128 h00i = _mm_load_ps((float *) &h[0][0][i]); - __m128 h01i = _mm_load_ps((float *) &h[0][1][i]); - __m128 h10i = _mm_load_ps((float *) &h[1][0][i]); - __m128 h11i = _mm_load_ps((float *) &h[1][1][i]); + simd_cf_t h00i = srslte_simd_cfi_load(&h[0][0][i]); + simd_cf_t h01i = srslte_simd_cfi_load(&h[0][1][i]); + simd_cf_t h10i = srslte_simd_cfi_load(&h[1][0][i]); + simd_cf_t h11i = srslte_simd_cfi_load(&h[1][1][i]); /* Apply precoding */ - __m128 h00 = _mm_add_ps(h00i, _mm_xor_ps(h10i, _mm_setr_ps(+0.0f, +0.0f, -0.0f, -0.0f))); - __m128 h10 = _mm_add_ps(h01i, _mm_xor_ps(h11i, _mm_setr_ps(+0.0f, +0.0f, -0.0f, -0.0f))); - __m128 h01 = _mm_add_ps(h00i, _mm_xor_ps(h10i, _mm_setr_ps(-0.0f, -0.0f, +0.0f, +0.0f))); - __m128 h11 = _mm_add_ps(h01i, _mm_xor_ps(h11i, _mm_setr_ps(-0.0f, -0.0f, +0.0f, +0.0f))); + simd_cf_t h00, h01, h10, h11; + h00 = srslte_simd_cf_add(h00i, srslte_simd_cf_neg_mask(h10i, mask1)); + h10 = srslte_simd_cf_add(h01i, srslte_simd_cf_neg_mask(h11i, mask1)); + h01 = srslte_simd_cf_add(h00i, srslte_simd_cf_neg_mask(h10i, mask2)); + h11 = srslte_simd_cf_add(h01i, srslte_simd_cf_neg_mask(h11i, mask2)); - __m128 y0 = _mm_load_ps((float *) &y[0][i]); - __m128 y1 = _mm_load_ps((float *) &y[1][i]); + simd_cf_t y0 = srslte_simd_cfi_load(&y[0][i]); + simd_cf_t y1 = srslte_simd_cfi_load(&y[1][i]); - __m128 x0, x1; + simd_cf_t x0, x1; - srslte_mat_2x2_zf_sse(y0, y1, h00, h01, h10, h11, &x0, &x1, 2.0f / scaling); + srslte_mat_2x2_zf_simd(y0, y1, h00, h01, h10, h11, &x0, &x1, norm); - _mm_store_ps((float *) &x[0][i], x0); - _mm_store_ps((float *) &x[1][i], x1); + srslte_simd_cfi_store(&x[0][i], x0); + srslte_simd_cfi_store(&x[1][i], x1); } +#endif /* SRSLTE_SIMD_CF_SIZE != 0 */ - return nof_symbols; -} -#endif - -// Generic implementation of ZF 2x2 CCD equalizer -int srslte_predecoding_ccd_2x2_zf_gen(cf_t *y[SRSLTE_MAX_PORTS], - cf_t *h[SRSLTE_MAX_PORTS][SRSLTE_MAX_PORTS], - cf_t *x[SRSLTE_MAX_LAYERS], - int nof_symbols, - float scaling) { - cf_t h00, h01, h10, h11; - - for (int i = 0; i < nof_symbols; i++) { + cf_t h00, h01, h10, h11, det; + for (; i < nof_symbols; i++) { // Even precoder h00 = +h[0][0][i] + h[1][0][i]; h10 = +h[0][1][i] + h[1][1][i]; h01 = +h[0][0][i] - h[1][0][i]; h11 = +h[0][1][i] - h[1][1][i]; + det = (h00 * h11 - h01 * h10); + det = conjf(det) * (norm / (crealf(det) * crealf(det) + cimagf(det) * cimagf(det))); - srslte_mat_2x2_zf_gen(y[0][i], y[1][i], h00, h01, h10, h11, &x[0][i], &x[1][i], 2.0f / scaling); + x[0][i] = (+h11 * y[0][i] - h01 * y[1][i]) * det; + x[1][i] = (-h10 * y[0][i] + h00 * y[1][i]) * det; i++; @@ -718,30 +885,35 @@ int srslte_predecoding_ccd_2x2_zf_gen(cf_t *y[SRSLTE_MAX_PORTS], h10 = h[0][1][i] - h[1][1][i]; h01 = h[0][0][i] + h[1][0][i]; h11 = h[0][1][i] + h[1][1][i]; + det = (h00 * h11 - h01 * h10); + det = conjf(det) * (norm / (crealf(det) * crealf(det) + cimagf(det) * cimagf(det))); - srslte_mat_2x2_zf_gen(y[0][i], y[1][i], h00, h01, h10, h11, &x[0][i], &x[1][i], 2.0f / scaling); + x[0][i] = (+h11 * y[0][i] - h01 * y[1][i]) * det; + x[1][i] = (-h10 * y[0][i] + h00 * y[1][i]) * det; } return SRSLTE_SUCCESS; } -int srslte_predecoding_ccd_zf(cf_t *y[SRSLTE_MAX_PORTS], cf_t *h[SRSLTE_MAX_PORTS][SRSLTE_MAX_PORTS], cf_t *x[SRSLTE_MAX_LAYERS], - int nof_rxant, int nof_ports, int nof_layers, int nof_symbols, float scaling) -{ +static int srslte_predecoding_ccd_zf(cf_t *y[SRSLTE_MAX_PORTS], + cf_t *h[SRSLTE_MAX_PORTS][SRSLTE_MAX_PORTS], + cf_t *x[SRSLTE_MAX_LAYERS], + float *csi[SRSLTE_MAX_CODEWORDS], + int nof_rxant, + int nof_ports, + int nof_layers, + int nof_symbols, + float scaling) { if (nof_ports == 2 && nof_rxant == 2) { if (nof_layers == 2) { -#ifdef LV_HAVE_AVX - return srslte_predecoding_ccd_2x2_zf_avx(y, h, x, nof_symbols, scaling); -#else -#ifdef LV_HAVE_SSE - return srslte_predecoding_ccd_2x2_zf_sse(y, h, x, nof_symbols, scaling); -#else - return srslte_predecoding_ccd_2x2_zf_gen(y, h, x, nof_symbols, scaling); -#endif /* LV_HAVE_SSE */ -#endif /* LV_HAVE_AVX */ + if (csi && csi[0]) { + return srslte_predecoding_ccd_2x2_zf_csi(y, h, x, csi, nof_symbols, scaling); + } else { + return srslte_predecoding_ccd_2x2_zf(y, h, x, nof_symbols, scaling); + } } else { DEBUG("Error predecoding CCD: Invalid number of layers %d\n", nof_layers); - return -1; - } + return -1; + } } else if (nof_ports == 4) { DEBUG("Error predecoding CCD: Only 2 ports supported\n"); } else { @@ -750,86 +922,155 @@ int srslte_predecoding_ccd_zf(cf_t *y[SRSLTE_MAX_PORTS], cf_t *h[SRSLTE_MAX_PORT return SRSLTE_ERROR; } -// AVX implementation of MMSE 2x2 CCD equalizer -#ifdef LV_HAVE_AVX +static int srslte_predecoding_ccd_2x2_mmse_csi(cf_t *y[SRSLTE_MAX_PORTS], + cf_t *h[SRSLTE_MAX_PORTS][SRSLTE_MAX_PORTS], + cf_t *x[SRSLTE_MAX_LAYERS], + float *csi[SRSLTE_MAX_CODEWORDS], + int nof_symbols, float scaling, float noise_estimate) { + int i = 0; + float norm = 2.0f / scaling; + +#if SRSLTE_SIMD_CF_SIZE != 0 +#if SRSLTE_SIMD_CF_SIZE == 16 + float _mask1[SRSLTE_SIMD_CF_SIZE] = {+0.0f, -0.0f, +0.0f, -0.0f, +0.0f, -0.0f, +0.0f, -0.0f, + +0.0f, -0.0f, +0.0f, -0.0f, +0.0f, -0.0f, +0.0f, -0.0f}; + float _mask2[SRSLTE_SIMD_CF_SIZE] = {-0.0f, +0.0f, -0.0f, +0.0f, -0.0f, +0.0f, -0.0f, +0.0f, + -0.0f, +0.0f, -0.0f, +0.0f, -0.0f, +0.0f, -0.0f, +0.0f}; +#elif SRSLTE_SIMD_CF_SIZE == 8 + float _mask1[SRSLTE_SIMD_CF_SIZE] = {+0.0f, -0.0f, +0.0f, -0.0f, +0.0f, -0.0f, +0.0f, -0.0f}; + float _mask2[SRSLTE_SIMD_CF_SIZE] = {-0.0f, +0.0f, -0.0f, +0.0f, -0.0f, +0.0f, -0.0f, +0.0f}; +#elif SRSLTE_SIMD_CF_SIZE == 4 + float _mask1[SRSLTE_SIMD_CF_SIZE] = {+0.0f, -0.0f, +0.0f, -0.0f}; + float _mask2[SRSLTE_SIMD_CF_SIZE] = {-0.0f, +0.0f, -0.0f, +0.0f}; +#endif -int srslte_predecoding_ccd_2x2_mmse_avx(cf_t *y[SRSLTE_MAX_PORTS], - cf_t *h[SRSLTE_MAX_PORTS][SRSLTE_MAX_PORTS], - cf_t *x[SRSLTE_MAX_LAYERS], - uint32_t nof_symbols, float scaling, float noise_estimate) { - uint32_t i = 0; + simd_f_t mask1 = srslte_simd_f_loadu(_mask1); + simd_f_t mask2 = srslte_simd_f_loadu(_mask2); - for (i = 0; i < nof_symbols - 3; i += 4) { + for (; i < nof_symbols - SRSLTE_SIMD_CF_SIZE + 1; i += SRSLTE_SIMD_CF_SIZE) { /* Load channel */ - __m256 h00i = _mm256_load_ps((float *) &h[0][0][i]); - __m256 h01i = _mm256_load_ps((float *) &h[0][1][i]); - __m256 h10i = _mm256_load_ps((float *) &h[1][0][i]); - __m256 h11i = _mm256_load_ps((float *) &h[1][1][i]); + simd_cf_t h00i = srslte_simd_cfi_load(&h[0][0][i]); + simd_cf_t h01i = srslte_simd_cfi_load(&h[0][1][i]); + simd_cf_t h10i = srslte_simd_cfi_load(&h[1][0][i]); + simd_cf_t h11i = srslte_simd_cfi_load(&h[1][1][i]); /* Apply precoding */ - __m256 h00 = _mm256_add_ps(h00i, _mm256_xor_ps(h10i, _mm256_setr_ps(+0.0f, +0.0f, -0.0f, -0.0f, +0.0f, +0.0f, -0.0f, -0.0f))); - __m256 h10 = _mm256_add_ps(h01i, _mm256_xor_ps(h11i, _mm256_setr_ps(+0.0f, +0.0f, -0.0f, -0.0f, +0.0f, +0.0f, -0.0f, -0.0f))); - __m256 h01 = _mm256_add_ps(h00i, _mm256_xor_ps(h10i, _mm256_setr_ps(-0.0f, -0.0f, +0.0f, +0.0f, -0.0f, -0.0f, +0.0f, +0.0f))); - __m256 h11 = _mm256_add_ps(h01i, _mm256_xor_ps(h11i, _mm256_setr_ps(-0.0f, -0.0f, +0.0f, +0.0f, -0.0f, -0.0f, +0.0f, +0.0f))); + simd_cf_t h00, h01, h10, h11; + h00 = srslte_simd_cf_add(h00i, srslte_simd_cf_neg_mask(h10i, mask1)); + h10 = srslte_simd_cf_add(h01i, srslte_simd_cf_neg_mask(h11i, mask1)); + h01 = srslte_simd_cf_add(h00i, srslte_simd_cf_neg_mask(h10i, mask2)); + h11 = srslte_simd_cf_add(h01i, srslte_simd_cf_neg_mask(h11i, mask2)); - __m256 y0 = _mm256_load_ps((float *) &y[0][i]); - __m256 y1 = _mm256_load_ps((float *) &y[1][i]); + simd_cf_t y0 = srslte_simd_cfi_load(&y[0][i]); + simd_cf_t y1 = srslte_simd_cfi_load(&y[1][i]); - __m256 x0, x1; + simd_cf_t x0, x1; + simd_f_t csi0, csi1; - srslte_mat_2x2_mmse_avx(y0, y1, h00, h01, h10, h11, &x0, &x1, noise_estimate, 2.0f / scaling); + srslte_mat_2x2_mmse_csi_simd(y0, y1, h00, h01, h10, h11, &x0, &x1, &csi0, &csi1, noise_estimate, norm); - _mm256_store_ps((float *) &x[0][i], x0); - _mm256_store_ps((float *) &x[1][i], x1); + srslte_simd_cfi_store(&x[0][i], x0); + srslte_simd_cfi_store(&x[1][i], x1); + + srslte_simd_f_store(&csi[0][i], csi0); + srslte_simd_f_store(&csi[1][i], csi1); } +#endif /* SRSLTE_SIMD_CF_SIZE != 0 */ - return nof_symbols; + cf_t h00, h01, h10, h11; + for (; i < nof_symbols; i++) { + + // Even precoder + h00 = +h[0][0][i] + h[1][0][i]; + h10 = +h[0][1][i] + h[1][1][i]; + h01 = +h[0][0][i] - h[1][0][i]; + h11 = +h[0][1][i] - h[1][1][i]; + srslte_mat_2x2_mmse_csi_gen(y[0][i], + y[1][i], + h00, + h01, + h10, + h11, + &x[0][i], + &x[1][i], + &csi[0][i], + &csi[1][i], + noise_estimate, + norm); + i++; + + // Odd precoder + h00 = h[0][0][i] - h[1][0][i]; + h10 = h[0][1][i] - h[1][1][i]; + h01 = h[0][0][i] + h[1][0][i]; + h11 = h[0][1][i] + h[1][1][i]; + srslte_mat_2x2_mmse_csi_gen(y[0][i], + y[1][i], + h00, + h01, + h10, + h11, + &x[0][i], + &x[1][i], + &csi[0][i], + &csi[1][i], + noise_estimate, + norm); + } + return SRSLTE_SUCCESS; } -#endif /* LV_HAVE_AVX */ -// SSE implementation of ZF 2x2 CCD equalizer -#ifdef LV_HAVE_SSE +static int srslte_predecoding_ccd_2x2_mmse(cf_t *y[SRSLTE_MAX_PORTS], + cf_t *h[SRSLTE_MAX_PORTS][SRSLTE_MAX_PORTS], + cf_t *x[SRSLTE_MAX_LAYERS], + int nof_symbols, float scaling, float noise_estimate) { + int i = 0; + float norm = 2.0f / scaling; + +#if SRSLTE_SIMD_CF_SIZE != 0 +#if SRSLTE_SIMD_CF_SIZE == 16 + float _mask1[SRSLTE_SIMD_CF_SIZE] = {+0.0f, -0.0f, +0.0f, -0.0f, +0.0f, -0.0f, +0.0f, -0.0f, + +0.0f, -0.0f, +0.0f, -0.0f, +0.0f, -0.0f, +0.0f, -0.0f}; + float _mask2[SRSLTE_SIMD_CF_SIZE] = {-0.0f, +0.0f, -0.0f, +0.0f, -0.0f, +0.0f, -0.0f, +0.0f, + -0.0f, +0.0f, -0.0f, +0.0f, -0.0f, +0.0f, -0.0f, +0.0f}; +#elif SRSLTE_SIMD_CF_SIZE == 8 + float _mask1[SRSLTE_SIMD_CF_SIZE] = {+0.0f, -0.0f, +0.0f, -0.0f, +0.0f, -0.0f, +0.0f, -0.0f}; + float _mask2[SRSLTE_SIMD_CF_SIZE] = {-0.0f, +0.0f, -0.0f, +0.0f, -0.0f, +0.0f, -0.0f, +0.0f}; +#elif SRSLTE_SIMD_CF_SIZE == 4 + float _mask1[SRSLTE_SIMD_CF_SIZE] = {+0.0f, -0.0f, +0.0f, -0.0f}; + float _mask2[SRSLTE_SIMD_CF_SIZE] = {-0.0f, +0.0f, -0.0f, +0.0f}; +#endif -int srslte_predecoding_ccd_2x2_mmse_sse(cf_t *y[SRSLTE_MAX_PORTS], - cf_t *h[SRSLTE_MAX_PORTS][SRSLTE_MAX_PORTS], - cf_t *x[SRSLTE_MAX_LAYERS], - uint32_t nof_symbols, float scaling, float noise_estimate) { - uint32_t i = 0; + simd_f_t mask1 = srslte_simd_f_loadu(_mask1); + simd_f_t mask2 = srslte_simd_f_loadu(_mask2); - for (i = 0; i < nof_symbols - 1; i += 2) { + for (; i < nof_symbols - SRSLTE_SIMD_CF_SIZE + 1; i += SRSLTE_SIMD_CF_SIZE) { /* Load channel */ - __m128 h00i = _mm_load_ps((float *) &h[0][0][i]); - __m128 h01i = _mm_load_ps((float *) &h[0][1][i]); - __m128 h10i = _mm_load_ps((float *) &h[1][0][i]); - __m128 h11i = _mm_load_ps((float *) &h[1][1][i]); + simd_cf_t h00i = srslte_simd_cfi_load(&h[0][0][i]); + simd_cf_t h01i = srslte_simd_cfi_load(&h[0][1][i]); + simd_cf_t h10i = srslte_simd_cfi_load(&h[1][0][i]); + simd_cf_t h11i = srslte_simd_cfi_load(&h[1][1][i]); /* Apply precoding */ - __m128 h00 = _mm_add_ps(h00i, _mm_xor_ps(h10i, _mm_setr_ps(+0.0f, +0.0f, -0.0f, -0.0f))); - __m128 h10 = _mm_add_ps(h01i, _mm_xor_ps(h11i, _mm_setr_ps(+0.0f, +0.0f, -0.0f, -0.0f))); - __m128 h01 = _mm_add_ps(h00i, _mm_xor_ps(h10i, _mm_setr_ps(-0.0f, -0.0f, +0.0f, +0.0f))); - __m128 h11 = _mm_add_ps(h01i, _mm_xor_ps(h11i, _mm_setr_ps(-0.0f, -0.0f, +0.0f, +0.0f))); + simd_cf_t h00, h01, h10, h11; + h00 = srslte_simd_cf_add(h00i, srslte_simd_cf_neg_mask(h10i, mask1)); + h10 = srslte_simd_cf_add(h01i, srslte_simd_cf_neg_mask(h11i, mask1)); + h01 = srslte_simd_cf_add(h00i, srslte_simd_cf_neg_mask(h10i, mask2)); + h11 = srslte_simd_cf_add(h01i, srslte_simd_cf_neg_mask(h11i, mask2)); - __m128 y0 = _mm_load_ps((float *) &y[0][i]); - __m128 y1 = _mm_load_ps((float *) &y[1][i]); + simd_cf_t y0 = srslte_simd_cfi_load(&y[0][i]); + simd_cf_t y1 = srslte_simd_cfi_load(&y[1][i]); - __m128 x0, x1; + simd_cf_t x0, x1; + srslte_mat_2x2_mmse_simd(y0, y1, h00, h01, h10, h11, &x0, &x1, noise_estimate, norm); - srslte_mat_2x2_mmse_sse(y0, y1, h00, h01, h10, h11, &x0, &x1, noise_estimate, 2.0f / scaling); - - _mm_store_ps((float *) &x[0][i], x0); - _mm_store_ps((float *) &x[1][i], x1); + srslte_simd_cfi_store(&x[0][i], x0); + srslte_simd_cfi_store(&x[1][i], x1); } +#endif /* SRSLTE_SIMD_CF_SIZE != 0 */ - return nof_symbols; -} -#endif /* LV_HAVE_SSE */ - -// Generic implementation of ZF 2x2 CCD equalizer -int srslte_predecoding_ccd_2x2_mmse_gen(cf_t *y[SRSLTE_MAX_PORTS], cf_t *h[SRSLTE_MAX_PORTS][SRSLTE_MAX_PORTS], cf_t *x[SRSLTE_MAX_LAYERS], - int nof_symbols, float scaling, float noise_estimate) { cf_t h00, h01, h10, h11; - - for (int i = 0; i < nof_symbols; i++) { + for (i = 0; i < nof_symbols; i++) { // Even precoder h00 = +h[0][0][i] + h[1][0][i]; @@ -850,21 +1091,23 @@ int srslte_predecoding_ccd_2x2_mmse_gen(cf_t *y[SRSLTE_MAX_PORTS], cf_t *h[SRSLT return SRSLTE_SUCCESS; } - -int srslte_predecoding_ccd_mmse(cf_t *y[SRSLTE_MAX_PORTS], cf_t *h[SRSLTE_MAX_PORTS][SRSLTE_MAX_PORTS], cf_t *x[SRSLTE_MAX_LAYERS], - int nof_rxant, int nof_ports, int nof_layers, int nof_symbols, float scaling, float noise_estimate) -{ +int srslte_predecoding_ccd_mmse(cf_t *y[SRSLTE_MAX_PORTS], + cf_t *h[SRSLTE_MAX_PORTS][SRSLTE_MAX_PORTS], + cf_t *x[SRSLTE_MAX_LAYERS], + float *csi[SRSLTE_MAX_CODEWORDS], + int nof_rxant, + int nof_ports, + int nof_layers, + int nof_symbols, + float scaling, + float noise_estimate) { if (nof_ports == 2 && nof_rxant == 2) { if (nof_layers == 2) { -#ifdef LV_HAVE_AVX - return srslte_predecoding_ccd_2x2_mmse_avx(y, h, x, nof_symbols, scaling, noise_estimate); -#else -#ifdef LV_HAVE_SSE - return srslte_predecoding_ccd_2x2_mmse_sse(y, h, x, nof_symbols, scaling, noise_estimate); -#else - return srslte_predecoding_ccd_2x2_mmse_gen(y, h, x, nof_symbols, scaling, noise_estimate); -#endif /* LV_HAVE_SSE */ -#endif /* LV_HAVE_AVX */ + if (csi && csi[0]) + return srslte_predecoding_ccd_2x2_mmse_csi(y, h, x, csi, nof_symbols, scaling, noise_estimate); + else { + return srslte_predecoding_ccd_2x2_mmse(y, h, x, nof_symbols, scaling, noise_estimate); + } } else { DEBUG("Error predecoding CCD: Invalid number of layers %d\n", nof_layers); return -1; @@ -1468,7 +1711,7 @@ void srslte_predecoding_set_mimo_decoder (srslte_mimo_decoder_t _mimo_decoder) { /* 36.211 v10.3.0 Section 6.3.4 */ int srslte_predecoding_type(cf_t *y[SRSLTE_MAX_PORTS], cf_t *h[SRSLTE_MAX_PORTS][SRSLTE_MAX_PORTS], - cf_t *x[SRSLTE_MAX_LAYERS], float *csi, int nof_rxant, int nof_ports, int nof_layers, + cf_t *x[SRSLTE_MAX_LAYERS], float *csi[SRSLTE_MAX_CODEWORDS], int nof_rxant, int nof_ports, int nof_layers, int codebook_idx, int nof_symbols, srslte_mimo_type_t type, float scaling, float noise_estimate) { @@ -1488,10 +1731,10 @@ int srslte_predecoding_type(cf_t *y[SRSLTE_MAX_PORTS], cf_t *h[SRSLTE_MAX_PORTS] if (nof_layers >= 2 && nof_layers <= 4) { switch (mimo_decoder) { case SRSLTE_MIMO_DECODER_ZF: - return srslte_predecoding_ccd_zf(y, h, x, nof_rxant, nof_ports, nof_layers, nof_symbols, scaling); + return srslte_predecoding_ccd_zf(y, h, x, csi, nof_rxant, nof_ports, nof_layers, nof_symbols, scaling); break; case SRSLTE_MIMO_DECODER_MMSE: - return srslte_predecoding_ccd_mmse(y, h, x, nof_rxant, nof_ports, nof_layers, nof_symbols, scaling, noise_estimate); + return srslte_predecoding_ccd_mmse(y, h, x, csi, nof_rxant, nof_ports, nof_layers, nof_symbols, scaling, noise_estimate); break; } } else { @@ -1510,7 +1753,7 @@ int srslte_predecoding_type(cf_t *y[SRSLTE_MAX_PORTS], cf_t *h[SRSLTE_MAX_PORTS] break; case SRSLTE_MIMO_TYPE_TX_DIVERSITY: if (nof_ports == nof_layers) { - return srslte_predecoding_diversity_multi(y, h, x, nof_rxant, nof_ports, nof_symbols, scaling); + return srslte_predecoding_diversity_multi(y, h, x, csi, nof_rxant, nof_ports, nof_symbols, scaling); } else { fprintf(stderr, "Error number of layers must equal number of ports in transmit diversity\n"); diff --git a/lib/src/phy/phch/pcfich.c b/lib/src/phy/phch/pcfich.c index 6b00e768a..c6f1fe127 100644 --- a/lib/src/phy/phch/pcfich.c +++ b/lib/src/phy/phch/pcfich.c @@ -221,7 +221,7 @@ int srslte_pcfich_decode_multi(srslte_pcfich_t *q, cf_t *sf_symbols[SRSLTE_MAX_P /* no need for layer demapping */ srslte_predecoding_single_multi(q_symbols, q_ce[0], q->d, NULL, q->nof_rx_antennas, q->nof_symbols, 1.0f, noise_estimate); } else { - srslte_predecoding_diversity_multi(q_symbols, q_ce, x, q->nof_rx_antennas, q->cell.nof_ports, q->nof_symbols, 1.0f); + srslte_predecoding_diversity_multi(q_symbols, q_ce, x, NULL, q->nof_rx_antennas, q->cell.nof_ports, q->nof_symbols, 1.0f); srslte_layerdemap_diversity(x, q->d, q->cell.nof_ports, q->nof_symbols / q->cell.nof_ports); } diff --git a/lib/src/phy/phch/pdcch.c b/lib/src/phy/phch/pdcch.c index 83f681f8f..1c40333e8 100644 --- a/lib/src/phy/phch/pdcch.c +++ b/lib/src/phy/phch/pdcch.c @@ -492,7 +492,7 @@ int srslte_pdcch_extract_llr_multi(srslte_pdcch_t *q, cf_t *sf_symbols[SRSLTE_MA /* no need for layer demapping */ srslte_predecoding_single_multi(q->symbols, q->ce[0], q->d, NULL, q->nof_rx_antennas, nof_symbols, 1.0f, noise_estimate/2); } else { - srslte_predecoding_diversity_multi(q->symbols, q->ce, x, q->nof_rx_antennas, q->cell.nof_ports, nof_symbols, 1.0f); + srslte_predecoding_diversity_multi(q->symbols, q->ce, x, NULL, q->nof_rx_antennas, q->cell.nof_ports, nof_symbols, 1.0f); srslte_layerdemap_diversity(x, q->d, q->cell.nof_ports, nof_symbols / q->cell.nof_ports); } diff --git a/lib/src/phy/phch/pdsch.c b/lib/src/phy/phch/pdsch.c index c59956a71..6e7a3b33c 100644 --- a/lib/src/phy/phch/pdsch.c +++ b/lib/src/phy/phch/pdsch.c @@ -402,7 +402,7 @@ int srslte_pdsch_enable_csi(srslte_pdsch_t *q, bool enable) { if (enable) { for (int i = 0; i < SRSLTE_MAX_CODEWORDS; i++) { if (!q->csi[i]) { - q->csi[i] = srslte_vec_malloc(sizeof(float) * q->max_re); + q->csi[i] = srslte_vec_malloc(sizeof(float) * q->max_re * 2); if (!q->csi[i]) { return SRSLTE_ERROR; } @@ -757,7 +757,7 @@ int srslte_pdsch_decode(srslte_pdsch_t *q, } // Pre-decoder - if (srslte_predecoding_type(q->symbols, q->ce, x, q->csi[0], q->nof_rx_antennas, q->cell.nof_ports, cfg->nof_layers, + if (srslte_predecoding_type(q->symbols, q->ce, x, q->csi, q->nof_rx_antennas, q->cell.nof_ports, cfg->nof_layers, cfg->codebook_idx, cfg->nbits[0].nof_re, cfg->mimo_type, pdsch_scaling, noise_estimate)<0) { DEBUG("Error predecoding\n"); return SRSLTE_ERROR; diff --git a/lib/src/phy/phch/phich.c b/lib/src/phy/phch/phich.c index 15aa4db88..21bfb040b 100644 --- a/lib/src/phy/phch/phich.c +++ b/lib/src/phy/phch/phich.c @@ -241,7 +241,7 @@ int srslte_phich_decode(srslte_phich_t *q, cf_t *sf_symbols[SRSLTE_MAX_PORTS], /* no need for layer demapping */ srslte_predecoding_single_multi(q_sf_symbols, q_ce[0], q->d0, NULL, q->nof_rx_antennas, SRSLTE_PHICH_MAX_NSYMB, 1.0f, noise_estimate); } else { - srslte_predecoding_diversity_multi(q_sf_symbols, q_ce, x, q->nof_rx_antennas, q->cell.nof_ports, SRSLTE_PHICH_MAX_NSYMB, 1.0f); + srslte_predecoding_diversity_multi(q_sf_symbols, q_ce, x, NULL, q->nof_rx_antennas, q->cell.nof_ports, SRSLTE_PHICH_MAX_NSYMB, 1.0f); srslte_layerdemap_diversity(x, q->d0, q->cell.nof_ports, SRSLTE_PHICH_MAX_NSYMB / q->cell.nof_ports); } DEBUG("Recv!!: \n"); diff --git a/lib/src/phy/utils/mat.c b/lib/src/phy/utils/mat.c index bbfc38135..55b1dc177 100644 --- a/lib/src/phy/utils/mat.c +++ b/lib/src/phy/utils/mat.c @@ -60,8 +60,8 @@ inline void srslte_mat_2x2_zf_gen(cf_t y0, cf_t y1, cf_t h00, cf_t h01, cf_t h10 } /* Generic implementation for Minimum Mean Squared Error (MMSE) solver */ -inline void srslte_mat_2x2_mmse_gen(cf_t y0, cf_t y1, cf_t h00, cf_t h01, cf_t h10, cf_t h11, - cf_t *x0, cf_t *x1, float noise_estimate, float norm) { +inline void srslte_mat_2x2_mmse_csi_gen(cf_t y0, cf_t y1, cf_t h00, cf_t h01, cf_t h10, cf_t h11, + cf_t *x0, cf_t *x1, float *csi0, float *csi1, float noise_estimate, float norm) { /* Create conjugated matrix */ cf_t _h00 = conjf(h00); cf_t _h01 = conjf(h01); @@ -73,14 +73,14 @@ inline void srslte_mat_2x2_mmse_gen(cf_t y0, cf_t y1, cf_t h00, cf_t h01, cf_t h cf_t a01 = _h00 * h01 + _h10 * h11; cf_t a10 = _h01 * h00 + _h11 * h10; cf_t a11 = _h01 * h01 + _h11 * h11 + noise_estimate; + cf_t a_det_rcp = srslte_mat_cf_recip_gen(srslte_mat_2x2_det_gen(a00, a01, a10, a11)); /* 2. B = inv(H' x H + No) = inv(A) */ - cf_t b00 = a11; - cf_t b01 = -a01; - cf_t b10 = -a10; - cf_t b11 = a00; - cf_t _norm = norm * srslte_mat_cf_recip_gen(srslte_mat_2x2_det_gen(a00, a01, a10, a11)); - + cf_t _norm = norm * a_det_rcp; + cf_t b00 = a11 * _norm; + cf_t b01 = -a01 * _norm; + cf_t b10 = -a10 * _norm; + cf_t b11 = a00 * _norm; /* 3. W = inv(H' x H + No) x H' = B x H' */ cf_t w00 = b00 * _h00 + b01 * _h01; @@ -89,8 +89,19 @@ inline void srslte_mat_2x2_mmse_gen(cf_t y0, cf_t y1, cf_t h00, cf_t h01, cf_t h cf_t w11 = b10 * _h10 + b11 * _h11; /* 4. X = W x Y */ - *x0 = (y0 * w00 + y1 * w01) * _norm; - *x1 = (y0 * w10 + y1 * w11) * _norm; + *x0 = (y0 * w00 + y1 * w01); + *x1 = (y0 * w10 + y1 * w11); + + /* 5. Set CSI */ + *csi0 = 1.0f / crealf(b00); + *csi1 = 1.0f / crealf(b11); +} + +/* Generic implementation for Minimum Mean Squared Error (MMSE) solver */ +void srslte_mat_2x2_mmse_gen(cf_t y0, cf_t y1, cf_t h00, cf_t h01, cf_t h10, cf_t h11, + cf_t *x0, cf_t *x1, float noise_estimate, float norm) { + float csi0, csi1; + srslte_mat_2x2_mmse_csi_gen(y0, y1, h00, h01, h10, h11, x0, x1, &csi0, &csi1, noise_estimate, norm); } inline float srslte_mat_2x2_cn(cf_t h00, cf_t h01, cf_t h10, cf_t h11) { diff --git a/lib/src/phy/utils/test/mat_test.c b/lib/src/phy/utils/test/mat_test.c index 0bfb482a9..6ee1c451a 100644 --- a/lib/src/phy/utils/test/mat_test.c +++ b/lib/src/phy/utils/test/mat_test.c @@ -32,8 +32,8 @@ #include #include "srslte/phy/utils/mat.h" -#include "srslte/phy/utils/simd.h" #include "srslte/phy/utils/vector.h" +#include "srslte/phy/utils/vector_simd.h" bool zf_solver = false; @@ -378,6 +378,98 @@ bool test_mmse_solver_avx(void) { #endif /* LV_HAVE_AVX */ +#if SRSLTE_SIMD_CF_SIZE != 0 + +bool test_zf_solver_simd(void) { + cf_t cf_error0, cf_error1; + float error = 0.0f; + + cf_t x0_gold_1 = RANDOM_CF(); + cf_t x1_gold_1 = RANDOM_CF(); + cf_t h00_1 = RANDOM_CF(); + cf_t h01_1 = RANDOM_CF(); + cf_t h10_1 = RANDOM_CF(); + cf_t h11_1 = (1 - h01_1 * h10_1) / h00_1; + cf_t y0_1 = x0_gold_1 * h00_1 + x1_gold_1 * h01_1; + cf_t y1_1 = x0_gold_1 * h10_1 + x1_gold_1 * h11_1; + + simd_cf_t _y0 = srslte_simd_cf_set1(y0_1); + simd_cf_t _y1 = srslte_simd_cf_set1(y1_1); + + simd_cf_t _h00 = srslte_simd_cf_set1(h00_1); + simd_cf_t _h01 = srslte_simd_cf_set1(h01_1); + simd_cf_t _h10 = srslte_simd_cf_set1(h10_1); + simd_cf_t _h11 = srslte_simd_cf_set1(h11_1); + + simd_cf_t _x0, _x1; + + srslte_mat_2x2_zf_simd(_y0, _y1, _h00, _h01, _h10, _h11, &_x0, &_x1, 1.0f); + + __attribute__((aligned(SRSLTE_SIMD_BIT_ALIGN))) cf_t x0[SRSLTE_SIMD_CF_SIZE]; + __attribute__((aligned(SRSLTE_SIMD_BIT_ALIGN))) cf_t x1[SRSLTE_SIMD_CF_SIZE]; + + srslte_simd_cfi_store(x0, _x0); + srslte_simd_cfi_store(x1, _x1); + + cf_error0 = x0[1] - x0_gold_1; + cf_error1 = x1[1] - x1_gold_1; + error += crealf(cf_error0) * crealf(cf_error0) + cimagf(cf_error0) * cimagf(cf_error0) + + crealf(cf_error1) * crealf(cf_error1) + cimagf(cf_error1) * cimagf(cf_error1); + + return (error < 1e-3); +} + +bool test_mmse_solver_simd(void) { + cf_t cf_error0, cf_error1; + float error = 0.0f; + + cf_t x0_gold[SRSLTE_SIMD_CF_SIZE]; + cf_t x1_gold[SRSLTE_SIMD_CF_SIZE]; + cf_t h00[SRSLTE_SIMD_CF_SIZE]; + cf_t h01[SRSLTE_SIMD_CF_SIZE]; + cf_t h10[SRSLTE_SIMD_CF_SIZE]; + cf_t h11[SRSLTE_SIMD_CF_SIZE]; + cf_t y0[SRSLTE_SIMD_CF_SIZE]; + cf_t y1[SRSLTE_SIMD_CF_SIZE]; + for (int i = 0; i < SRSLTE_SIMD_CF_SIZE; i++) { + x0_gold[i] = RANDOM_CF(); + x1_gold[i] = RANDOM_CF(); + h00[i] = RANDOM_CF(); + h01[i] = RANDOM_CF(); + h10[i] = RANDOM_CF(); + h11[i] = (1 - h01[i] * h10[i]) / h00[i]; + y0[i] = x0_gold[i] * h00[i]+ x1_gold[i] * h01[i]; + y1[i] = x0_gold[i] * h10[i] + x1_gold[i] * h11[i]; + } + + simd_cf_t _y0 = srslte_simd_cfi_loadu(y0); + simd_cf_t _y1 = srslte_simd_cfi_loadu(y1); + + simd_cf_t _h00 = srslte_simd_cfi_loadu(h00); + simd_cf_t _h01 = srslte_simd_cfi_loadu(h01); + simd_cf_t _h10 = srslte_simd_cfi_loadu(h10); + simd_cf_t _h11 = srslte_simd_cfi_loadu(h11); + + simd_cf_t _x0, _x1; + + srslte_mat_2x2_mmse_simd(_y0, _y1, _h00, _h01, _h10, _h11, &_x0, &_x1, 0.0f, 1.0f); + + __attribute__((aligned(SRSLTE_SIMD_BIT_ALIGN))) cf_t x0[SRSLTE_SIMD_CF_SIZE]; + __attribute__((aligned(SRSLTE_SIMD_BIT_ALIGN))) cf_t x1[SRSLTE_SIMD_CF_SIZE]; + + srslte_simd_cfi_store(x0, _x0); + srslte_simd_cfi_store(x1, _x1); + + cf_error0 = x0[1] - x0_gold[1]; + cf_error1 = x1[1] - x1_gold[1]; + error += crealf(cf_error0) * crealf(cf_error0) + cimagf(cf_error0) * cimagf(cf_error0) + + crealf(cf_error1) * crealf(cf_error1) + cimagf(cf_error1) * cimagf(cf_error1); + + return (error < 1e-3); +} + +#endif /* SRSLTE_SIMD_CF_SIZE != 0 */ + bool test_vec_dot_prod_ccc(void) { __attribute__((aligned(256))) cf_t a[14]; __attribute__((aligned(256))) cf_t b[14]; @@ -413,6 +505,10 @@ int main(int argc, char **argv) { #ifdef LV_HAVE_AVX RUN_TEST(test_zf_solver_avx); #endif /* LV_HAVE_AVX */ + +#if SRSLTE_SIMD_CF_SIZE != 0 + RUN_TEST(test_zf_solver_simd); +#endif /* SRSLTE_SIMD_CF_SIZE != 0*/ } if (mmse_solver) { @@ -426,6 +522,10 @@ int main(int argc, char **argv) { #ifdef LV_HAVE_AVX RUN_TEST(test_mmse_solver_avx); #endif /* LV_HAVE_AVX */ + +#if SRSLTE_SIMD_CF_SIZE != 0 + RUN_TEST(test_mmse_solver_simd); +#endif /* SRSLTE_SIMD_CF_SIZE != 0*/ } RUN_TEST(test_vec_dot_prod_ccc); diff --git a/lib/src/phy/utils/vector.c b/lib/src/phy/utils/vector.c index 56010d2c4..d3d836b21 100644 --- a/lib/src/phy/utils/vector.c +++ b/lib/src/phy/utils/vector.c @@ -427,8 +427,8 @@ void srslte_vec_quant_sus(const int16_t *in, uint16_t *out, const float gain, co } } -void srs_vec_cf_cpy(const cf_t *dst, cf_t *src, int len) { - srslte_vec_cp_simd(dst, src, len); +void srs_vec_cf_cpy(const cf_t *src, cf_t *dst, int len) { + srslte_vec_cp_simd(src, dst, len); } void srslte_vec_interleave(const cf_t *x, const cf_t *y, cf_t *z, const int len) {