Added DL CSI decoding to TM2 and TM3

master
Xavier Arteaga 7 years ago
parent 5ad6ef4d1d
commit 0bc3be7abb

@ -87,7 +87,7 @@ SRSLTE_API int srslte_predecoding_single(cf_t *y,
SRSLTE_API int srslte_predecoding_single_multi(cf_t *y[SRSLTE_MAX_PORTS],
cf_t *h[SRSLTE_MAX_PORTS],
cf_t *x,
float *csi,
float *csi[SRSLTE_MAX_CODEWORDS],
int nof_rxant,
int nof_symbols,
float scaling,
@ -103,6 +103,7 @@ SRSLTE_API int srslte_predecoding_diversity(cf_t *y,
SRSLTE_API int srslte_predecoding_diversity_multi(cf_t *y[SRSLTE_MAX_PORTS],
cf_t *h[SRSLTE_MAX_PORTS][SRSLTE_MAX_PORTS],
cf_t *x[SRSLTE_MAX_LAYERS],
float *csi[SRSLTE_MAX_LAYERS],
int nof_rxant,
int nof_ports,
int nof_symbols,
@ -113,7 +114,7 @@ SRSLTE_API void srslte_predecoding_set_mimo_decoder (srslte_mimo_decoder_t _mimo
SRSLTE_API int srslte_predecoding_type(cf_t *y[SRSLTE_MAX_PORTS],
cf_t *h[SRSLTE_MAX_PORTS][SRSLTE_MAX_PORTS],
cf_t *x[SRSLTE_MAX_LAYERS],
float *csi,
float *csi[SRSLTE_MAX_CODEWORDS],
int nof_rxant,
int nof_ports,
int nof_layers,

@ -53,12 +53,17 @@ SRSLTE_API void srslte_mat_2x2_mmse_gen(cf_t y0, cf_t y1,
float noise_estimate,
float norm);
SRSLTE_API void srslte_mat_2x2_mmse_csi_gen(cf_t y0, cf_t y1,
cf_t h00, cf_t h01, cf_t h10, cf_t h11,
cf_t *x0, cf_t *x1, float *csi0, float *csi1,
float noise_estimate,
float norm);
SRSLTE_API float srslte_mat_2x2_cn(cf_t h00,
cf_t h01,
cf_t h10,
cf_t h11);
#ifdef LV_HAVE_SSE
/* SSE implementation for complex reciprocal */
@ -103,4 +108,114 @@ SRSLTE_API void srslte_mat_2x2_mmse_avx(__m256 y0, __m256 y1,
#endif /* LV_HAVE_AVX */
#endif // SRSLTE_MAT_H
#if SRSLTE_SIMD_CF_SIZE != 0
/* Generic SIMD implementation for 2x2 determinant */
static inline simd_cf_t srslte_mat_2x2_det_simd(simd_cf_t a00, simd_cf_t a01, simd_cf_t a10, simd_cf_t a11) {
return srslte_simd_cf_sub(srslte_simd_cf_prod(a00, a11), srslte_simd_cf_prod(a01, a10));
}
/* Generic SIMD implementation for Zero Forcing (ZF) solver */
static inline void srslte_mat_2x2_zf_csi_simd(simd_cf_t y0,
simd_cf_t y1,
simd_cf_t h00,
simd_cf_t h01,
simd_cf_t h10,
simd_cf_t h11,
simd_cf_t *x0,
simd_cf_t *x1,
simd_f_t *csi0,
simd_f_t *csi1,
float norm) {
simd_cf_t det = srslte_mat_2x2_det_simd(h00, h01, h10, h11);
simd_cf_t detrec = srslte_simd_cf_mul(srslte_simd_cf_rcp(det), srslte_simd_f_set1(norm));
*x0 = srslte_simd_cf_prod(srslte_simd_cf_sub(srslte_simd_cf_prod(h11, y0), srslte_simd_cf_prod(h01, y1)), detrec);
*x1 = srslte_simd_cf_prod(srslte_simd_cf_sub(srslte_simd_cf_prod(h00, y1), srslte_simd_cf_prod(h10, y0)), detrec);
*csi0 = srslte_simd_f_set1(1.0f);
*csi1 = srslte_simd_f_set1(1.0f);
}
static inline void srslte_mat_2x2_zf_simd(simd_cf_t y0,
simd_cf_t y1,
simd_cf_t h00,
simd_cf_t h01,
simd_cf_t h10,
simd_cf_t h11,
simd_cf_t *x0,
simd_cf_t *x1,
float norm) {
simd_f_t csi1, csi2;
srslte_mat_2x2_zf_csi_simd(y0, y1, h00, h01, h10, h11, x0, x1, &csi1, &csi2, norm);
}
/* Generic SIMD implementation for Minimum Mean Squared Error (MMSE) solver */
static inline void srslte_mat_2x2_mmse_csi_simd(simd_cf_t y0,
simd_cf_t y1,
simd_cf_t h00,
simd_cf_t h01,
simd_cf_t h10,
simd_cf_t h11,
simd_cf_t *x0,
simd_cf_t *x1,
simd_f_t *csi0,
simd_f_t *csi1,
float noise_estimate,
float norm) {
simd_cf_t _noise_estimate;
simd_f_t _norm = srslte_simd_f_set1(norm);
_noise_estimate.re = srslte_simd_f_set1(noise_estimate);
_noise_estimate.im = srslte_simd_f_zero();
/* 1. A = H' x H + No*/
simd_cf_t a00 =
srslte_simd_cf_add(srslte_simd_cf_add(srslte_simd_cf_conjprod(h00, h00), srslte_simd_cf_conjprod(h10, h10)),
_noise_estimate);
simd_cf_t a01 = srslte_simd_cf_add(srslte_simd_cf_conjprod(h01, h00), srslte_simd_cf_conjprod(h11, h10));
simd_cf_t a10 = srslte_simd_cf_add(srslte_simd_cf_conjprod(h00, h01), srslte_simd_cf_conjprod(h10, h11));
simd_cf_t a11 =
srslte_simd_cf_add(srslte_simd_cf_add(srslte_simd_cf_conjprod(h01, h01), srslte_simd_cf_conjprod(h11, h11)),
_noise_estimate);
simd_cf_t a_det_rcp = srslte_simd_cf_rcp(srslte_mat_2x2_det_simd(a00, a01, a10, a11));
/* 2. B = inv(H' x H + No) = inv(A) */
simd_cf_t _norm2 = srslte_simd_cf_mul(a_det_rcp, _norm);
simd_cf_t b00 = srslte_simd_cf_prod(a11, _norm2);
simd_cf_t b01 = srslte_simd_cf_prod(srslte_simd_cf_neg(a01), _norm2);
simd_cf_t b10 = srslte_simd_cf_prod(srslte_simd_cf_neg(a10), _norm2);
simd_cf_t b11 = srslte_simd_cf_prod(a00, _norm2);
/* 3. W = inv(H' x H + No) x H' = B x H' */
simd_cf_t w00 = srslte_simd_cf_add(srslte_simd_cf_conjprod(b00, h00), srslte_simd_cf_conjprod(b01, h01));
simd_cf_t w01 = srslte_simd_cf_add(srslte_simd_cf_conjprod(b00, h10), srslte_simd_cf_conjprod(b01, h11));
simd_cf_t w10 = srslte_simd_cf_add(srslte_simd_cf_conjprod(b10, h00), srslte_simd_cf_conjprod(b11, h01));
simd_cf_t w11 = srslte_simd_cf_add(srslte_simd_cf_conjprod(b10, h10), srslte_simd_cf_conjprod(b11, h11));
/* 4. X = W x Y */
*x0 = srslte_simd_cf_add(srslte_simd_cf_prod(y0, w00), srslte_simd_cf_prod(y1, w01));
*x1 = srslte_simd_cf_add(srslte_simd_cf_prod(y0, w10), srslte_simd_cf_prod(y1, w11));
/* 5. Extract CSI */
*csi0 = srslte_simd_f_rcp(srslte_simd_cf_re(b00));
*csi1 = srslte_simd_f_rcp(srslte_simd_cf_re(b11));
}
static inline void srslte_mat_2x2_mmse_simd(simd_cf_t y0,
simd_cf_t y1,
simd_cf_t h00,
simd_cf_t h01,
simd_cf_t h10,
simd_cf_t h11,
simd_cf_t *x0,
simd_cf_t *x1,
float noise_estimate,
float norm) {
simd_f_t csi0, csi1;
srslte_mat_2x2_mmse_csi_simd(y0, y1, h00, h01, h10, h11, x0, x1, &csi0, &csi1, noise_estimate, norm);
}
#endif /* SRSLTE_SIMD_CF_SIZE != 0 */
#endif /* SRSLTE_MAT_H */

@ -464,6 +464,42 @@ static inline simd_f_t srslte_simd_f_sqrt(simd_f_t a) {
#endif /* LV_HAVE_AVX512 */
}
static inline simd_f_t srslte_simd_f_neg(simd_f_t a) {
#ifdef LV_HAVE_AVX512
return _mm512_xor_ps(_mm512_set1_ps(-0.0f), a);
#else /* LV_HAVE_AVX512 */
#ifdef LV_HAVE_AVX2
return _mm256_xor_ps(_mm256_set1_ps(-0.0f), a);
#else /* LV_HAVE_AVX2 */
#ifdef LV_HAVE_SSE
return _mm_xor_ps(_mm_set1_ps(-0.0f), a);
#else /* LV_HAVE_SSE */
#ifdef HAVE_NEON
return vnegq_f32(a);
#endif /* HAVE_NEON */
#endif /* LV_HAVE_SSE */
#endif /* LV_HAVE_AVX2 */
#endif /* LV_HAVE_AVX512 */
}
static inline simd_f_t srslte_simd_f_neg_mask(simd_f_t a, simd_f_t mask) {
#ifdef LV_HAVE_AVX512
return _mm512_xor_ps(mask, a);
#else /* LV_HAVE_AVX512 */
#ifdef LV_HAVE_AVX2
return _mm256_xor_ps(mask, a);
#else /* LV_HAVE_AVX2 */
#ifdef LV_HAVE_SSE
return _mm_xor_ps(mask, a);
#else /* LV_HAVE_SSE */
#ifdef HAVE_NEON
return (float32x4_t) veorq_s32((int32x4_t) a, (int32x4_t) mask);
#endif /* HAVE_NEON */
#endif /* LV_HAVE_SSE */
#endif /* LV_HAVE_AVX2 */
#endif /* LV_HAVE_AVX512 */
}
#endif /* SRSLTE_SIMD_F_SIZE */
@ -475,7 +511,6 @@ static inline simd_f_t srslte_simd_f_sqrt(simd_f_t a) {
typedef struct {
simd_f_t re;
simd_f_t im;
} simd_cf_t;
#endif
@ -667,8 +702,8 @@ static inline void srslte_simd_cf_store(float *re, float *im, simd_cf_t simdreg)
_mm512_store_ps(im, simdreg.im);
#else /* LV_HAVE_AVX512 */
#ifdef LV_HAVE_AVX2
_mm256_store_ps((float *) re, simdreg.re);
_mm256_store_ps((float *) im, simdreg.im);
_mm256_store_ps(re, simdreg.re);
_mm256_store_ps(im, simdreg.im);
#else /* LV_HAVE_AVX512 */
#ifdef LV_HAVE_SSE
_mm_store_ps((float *) re, simdreg.re);
@ -689,8 +724,8 @@ static inline void srslte_simd_cf_storeu(float *re, float *im, simd_cf_t simdreg
_mm512_storeu_ps(im, simdreg.im);
#else /* LV_HAVE_AVX512 */
#ifdef LV_HAVE_AVX2
_mm256_storeu_ps((float *) re, simdreg.re);
_mm256_storeu_ps((float *) im, simdreg.im);
_mm256_storeu_ps(re, simdreg.re);
_mm256_storeu_ps(im, simdreg.im);
#else /* LV_HAVE_AVX512 */
#ifdef LV_HAVE_SSE
_mm_storeu_ps((float *) re, simdreg.re);
@ -833,6 +868,30 @@ static inline simd_cf_t srslte_simd_cf_add (simd_cf_t a, simd_cf_t b) {
return ret;
}
static inline simd_cf_t srslte_simd_cf_sub (simd_cf_t a, simd_cf_t b) {
simd_cf_t ret;
#ifdef LV_HAVE_AVX512
ret.re = _mm512_sub_ps(a.re, b.re);
ret.im = _mm512_sub_ps(a.im, b.im);
#else /* LV_HAVE_AVX512 */
#ifdef LV_HAVE_AVX2
ret.re = _mm256_sub_ps(a.re, b.re);
ret.im = _mm256_sub_ps(a.im, b.im);
#else /* LV_HAVE_AVX2 */
#ifdef LV_HAVE_SSE
ret.re = _mm_sub_ps(a.re, b.re);
ret.im = _mm_sub_ps(a.im, b.im);
#else /* LV_HAVE_SSE */
#ifdef HAVE_NEON
ret.val[0] = vsubq_f32(a.val[0],b.val[0]);
ret.val[1] = vsubq_f32(a.val[1],b.val[1]);
#endif /* HAVE_NEON */
#endif /* LV_HAVE_SSE */
#endif /* LV_HAVE_AVX2 */
#endif /* LV_HAVE_AVX512 */
return ret;
}
static inline simd_cf_t srslte_simd_cf_mul (simd_cf_t a, simd_f_t b) {
simd_cf_t ret;
#ifdef LV_HAVE_AVX512
@ -902,6 +961,59 @@ static inline simd_cf_t srslte_simd_cf_rcp (simd_cf_t a) {
return ret;
}
static inline simd_cf_t srslte_simd_cf_neg (simd_cf_t a) {
simd_cf_t ret;
#if LV_HAVE_NEON
ret.val[0] = srslte_simd_f_neg(a.val[0]);
ret.val[1] = srslte_simd_f_neg(a.val[1]);
#else /* LV_HAVE_NEON */
ret.re = srslte_simd_f_neg(a.re);
ret.im = srslte_simd_f_neg(a.im);
#endif /* LV_HAVE_NEON */
return ret;
}
static inline simd_cf_t srslte_simd_cf_neg_mask (simd_cf_t a, simd_f_t mask) {
simd_cf_t ret;
#ifndef LV_HAVE_AVX512
#ifdef LV_HAVE_AVX2
mask = _mm256_permutevar8x32_ps(mask, _mm256_setr_epi32(0,4,1,5,2,6,3,7));
#endif /* LV_HAVE_AVX2 */
#endif /* LV_HAVE_AVX512 */
#if LV_HAVE_NEON
ret.val[0] = srslte_simd_f_neg_mask(a.val[0], mask);
ret.val[1] = srslte_simd_f_neg_mask(a.val[1], mask);
#else /* LV_HAVE_NEON */
ret.re = srslte_simd_f_neg_mask(a.re, mask);
ret.im = srslte_simd_f_neg_mask(a.im, mask);
#endif /* LV_HAVE_NEON */
return ret;
}
static inline simd_cf_t srslte_simd_cf_conj (simd_cf_t a) {
simd_cf_t ret;
#if LV_HAVE_NEON
ret.val[0] = a.val[0];
ret.val[1] = srslte_simd_f_neg(a.val[1]);
#else /* LV_HAVE_NEON */
ret.re = a.re;
ret.im = srslte_simd_f_neg(a.im);
#endif /* LV_HAVE_NEON */
return ret;
}
static inline simd_cf_t srslte_simd_cf_mulj (simd_cf_t a) {
simd_cf_t ret;
#if LV_HAVE_NEON
ret.val[0] = srslte_simd_f_neg(a.val[1]);
ret.val[1] = a.val[0];
#else /* LV_HAVE_NEON */
ret.re = srslte_simd_f_neg(a.im);
ret.im = a.re;
#endif /* LV_HAVE_NEON */
return ret;
}
static inline simd_cf_t srslte_simd_cf_zero (void) {
simd_cf_t ret;
#ifdef LV_HAVE_AVX512

@ -54,7 +54,7 @@ int srslte_layermap_multiplex(cf_t *d[SRSLTE_MAX_CODEWORDS], cf_t *x[SRSLTE_MAX_
int nof_symbols[SRSLTE_MAX_CODEWORDS]) {
if (nof_cw == nof_layers) {
for (int i = 0; i < nof_cw; i++) {
srs_vec_cf_cpy(x[i], d[i], (uint32_t) nof_symbols[0]);
srs_vec_cf_cpy(d[i], x[i], (uint32_t) nof_symbols[0]);
}
return nof_symbols[0];
} else if (nof_cw == 1) {

@ -283,13 +283,13 @@ int srslte_predecoding_single_csi(cf_t *y[SRSLTE_MAX_PORTS], cf_t *h[SRSLTE_MAX_
for (; i < nof_symbols; i++) {
cf_t r = 0;
float hh = 0;
float _scaling = 1.0f / scaling;
float norm = 1.0f / scaling;
for (int p = 0; p < nof_rxant; p++) {
r += y[p][i] * conj(h[p][i]);
hh += (__real__ h[p][i] * __real__ h[p][i]) + (__imag__ h[p][i] * __imag__ h[p][i]);
}
csi[i] = hh + noise_estimate;
x[i] = r * _scaling / csi[i];
x[i] = r * norm / csi[i];
}
return nof_symbols;
}
@ -327,10 +327,10 @@ int srslte_predecoding_single(cf_t *y_, cf_t *h_, cf_t *x, float *csi, int nof_s
}
/* ZF/MMSE SISO equalizer x=y(h'h+no)^(-1)h' (ZF if n0=0.0)*/
int srslte_predecoding_single_multi(cf_t *y[SRSLTE_MAX_PORTS], cf_t *h[SRSLTE_MAX_PORTS], cf_t *x, float *csi,
int srslte_predecoding_single_multi(cf_t *y[SRSLTE_MAX_PORTS], cf_t *h[SRSLTE_MAX_PORTS], cf_t *x, float *csi[SRSLTE_MAX_CODEWORDS],
int nof_rxant, int nof_symbols, float scaling, float noise_estimate) {
if (csi) {
return srslte_predecoding_single_csi(y, h, x, csi, nof_rxant, nof_symbols, scaling, noise_estimate);
if (csi && csi[0]) {
return srslte_predecoding_single_csi(y, h, x, csi[0], nof_rxant, nof_symbols, scaling, noise_estimate);
}
#ifdef LV_HAVE_AVX
@ -566,9 +566,113 @@ int srslte_predecoding_diversity(cf_t *y_, cf_t *h_[SRSLTE_MAX_PORTS], cf_t *x[S
#endif
}
int srslte_predecoding_diversity_csi(cf_t *y[SRSLTE_MAX_PORTS], cf_t *h[SRSLTE_MAX_PORTS][SRSLTE_MAX_PORTS],
cf_t *x[SRSLTE_MAX_LAYERS], float *csi[SRSLTE_MAX_CODEWORDS],
int nof_rxant, int nof_ports, int nof_symbols, float scaling) {
int i;
if (nof_ports == 2) {
cf_t h00, h01, h10, h11, r0, r1;
for (i = 0; i < nof_symbols / 2; i++) {
float hh = 0;
cf_t x0 = 0;
cf_t x1 = 0;
for (int p=0;p<nof_rxant;p++) {
h00 = h[0][p][2 * i];
h01 = h[0][p][2 * i+1];
h10 = h[1][p][2 * i];
h11 = h[1][p][2 * i+1];
hh += crealf(h00) * crealf(h00) + cimagf(h00) * cimagf(h00)
+ crealf(h11) * crealf(h11) + cimagf(h11) * cimagf(h11);
r0 = y[p][2 * i];
r1 = y[p][2 * i + 1];
if (hh == 0) {
hh = 1e-4;
}
x0 += (conjf(h00) * r0 + h11 * conjf(r1));
x1 += (-h10 * conj(r0) + conj(h01) * r1);
}
csi[0][2*i + 0] = hh;
csi[0][2*i + 1] = hh;
hh *= scaling;
x[0][i] = x0 / hh * sqrt(2);
x[1][i] = x1 / hh * sqrt(2);
}
return i;
} else if (nof_ports == 4) {
int m_ap = (nof_symbols % 4) ? ((nof_symbols - 2) / 4) : nof_symbols / 4;
for (i = 0; i < m_ap; i++) {
cf_t x0 = 0, x1 = 0, x2 = 0, x3 = 0;
float a0 = 0, a1 = 0, a2 = 0, a3 = 0;
cf_t r0, r1, r2, r3;
cf_t h00, h01, h10, h11;
for (int p=0;p<nof_rxant;p++) {
h00 = h[0][p][4 * i + 0];
h01 = h[2][p][4 * i + 0];
h10 = h[0][p][4 * i + 1];
h11 = h[2][p][4 * i + 1];
a0 += __real__ h00 * __real__ h00 + __imag__ h00 * __imag__ h00
+ __real__ h11 * __real__ h11 + __imag__ h11 * __imag__ h11;
a1 += __real__ h10 * __real__ h10 + __imag__ h10 * __imag__ h10
+ __real__ h01 * __real__ h01 + __imag__ h01 * __imag__ h01;
r0 = y[p][4 * i];
r1 = y[p][4 * i + 1];
x0 += (conjf(h00) * r0 + h11 * conjf(r1));
x1 += (-h01 * conjf(r0) + conjf(h10) * r1);
h00 = h[1][p][4 * i + 2];
h01 = h[3][p][4 * i + 2];
h10 = h[1][p][4 * i + 3];
h11 = h[3][p][4 * i + 3];
a2 += __real__ h00 * __real__ h00 + __imag__ h00 * __imag__ h00
+ __real__ h11 * __real__ h11 + __imag__ h11 * __imag__ h11;
a3 += __real__ h10 * __real__ h10 + __imag__ h10 * __imag__ h10
+ __real__ h01 * __real__ h01 + __imag__ h01 * __imag__ h01;
r2 = y[p][4 * i + 2];
r3 = y[p][4 * i + 3];
x2 += (conjf(h00) * r2 + h11 * conjf(r3));
x3 += (-h01 * conjf(r2) + conjf(h10) * r3);
}
a0 *= scaling;
a1 *= scaling;
a2 *= scaling;
a3 *= scaling;
csi[0][4 * i + 0] = a0 / nof_rxant;
csi[0][4 * i + 1] = a1 / nof_rxant;
csi[0][4 * i + 2] = a2 / nof_rxant;
csi[0][4 * i + 3] = a3 / nof_rxant;
x[0][i] = x0 / a0 * sqrtf(2.0f);
x[1][i] = x1 / a1 * sqrtf(2.0f);
x[2][i] = x2 / a2 * sqrtf(2.0f);
x[3][i] = x3 / a3 * sqrtf(2.0f);
}
return i;
} else {
fprintf(stderr, "Number of ports must be 2 or 4 for transmit diversity (nof_ports=%d)\n", nof_ports);
return -1;
}
}
int srslte_predecoding_diversity_multi(cf_t *y[SRSLTE_MAX_PORTS], cf_t *h[SRSLTE_MAX_PORTS][SRSLTE_MAX_PORTS], cf_t *x[SRSLTE_MAX_LAYERS],
int nof_rxant, int nof_ports, int nof_symbols, float scaling)
float *csi[SRSLTE_MAX_CODEWORDS], int nof_rxant, int nof_ports, int nof_symbols, float scaling)
{
if (csi && csi[0]) {
return srslte_predecoding_diversity_csi(y, h, x, csi, nof_rxant, nof_ports, nof_symbols, scaling);
} else {
#ifdef LV_HAVE_SSE
if (nof_symbols > 32 && nof_ports == 2) {
return srslte_predecoding_diversity2_sse(y, h, x, nof_rxant, nof_symbols, scaling);
@ -579,6 +683,7 @@ int srslte_predecoding_diversity_multi(cf_t *y[SRSLTE_MAX_PORTS], cf_t *h[SRSLTE
return srslte_predecoding_diversity_gen(y, h, x, nof_rxant, nof_ports, nof_symbols, scaling);
#endif
}
}
int srslte_precoding_mimo_2x2_gen(cf_t W[2][2], cf_t *y[SRSLTE_MAX_PORTS], cf_t *h[SRSLTE_MAX_PORTS][SRSLTE_MAX_PORTS], cf_t *x[SRSLTE_MAX_LAYERS],
int nof_symbols, float scaling, float noise_estimate)
@ -614,102 +719,164 @@ int srslte_precoding_mimo_2x2_gen(cf_t W[2][2], cf_t *y[SRSLTE_MAX_PORTS], cf_t
return SRSLTE_SUCCESS;
}
// AVX implementation of ZF 2x2 CCD equalizer
#ifdef LV_HAVE_AVX
int srslte_predecoding_ccd_2x2_zf_avx(cf_t *y[SRSLTE_MAX_PORTS],
static int srslte_predecoding_ccd_2x2_zf_csi(cf_t *y[SRSLTE_MAX_PORTS],
cf_t *h[SRSLTE_MAX_PORTS][SRSLTE_MAX_PORTS],
cf_t *x[SRSLTE_MAX_LAYERS],
uint32_t nof_symbols,
float *csi[SRSLTE_MAX_CODEWORDS],
int nof_symbols,
float scaling) {
uint32_t i = 0;
float norm = 2.0f / scaling;
#if SRSLTE_SIMD_CF_SIZE != 0
#if SRSLTE_SIMD_CF_SIZE == 16
float _mask1[SRSLTE_SIMD_CF_SIZE] = {+0.0f, +0.0f, -0.0f, -0.0f, +0.0f, +0.0f, -0.0f, -0.0f,
+0.0f, +0.0f, -0.0f, -0.0f, +0.0f, +0.0f, -0.0f, -0.0f};
float _mask2[SRSLTE_SIMD_CF_SIZE] = {-0.0f, -0.0f, +0.0f, +0.0f, -0.0f, -0.0f, +0.0f, +0.0f
-0.0f, -0.0f, +0.0f, +0.0f, -0.0f, -0.0f, +0.0f, +0.0f};
#elif SRSLTE_SIMD_CF_SIZE == 8
float _mask1[SRSLTE_SIMD_CF_SIZE] = {+0.0f, +0.0f, -0.0f, -0.0f, +0.0f, +0.0f, -0.0f, -0.0f};
float _mask2[SRSLTE_SIMD_CF_SIZE] = {-0.0f, -0.0f, +0.0f, +0.0f, -0.0f, -0.0f, +0.0f, +0.0f};
#elif SRSLTE_SIMD_CF_SIZE == 4
float _mask1[SRSLTE_SIMD_CF_SIZE] = {+0.0f, +0.0f, -0.0f, -0.0f};
float _mask2[SRSLTE_SIMD_CF_SIZE] = {-0.0f, -0.0f, +0.0f, +0.0f};
#endif
__m256 mask0 = _mm256_setr_ps(+0.0f, +0.0f, -0.0f, -0.0f, +0.0f, +0.0f, -0.0f, -0.0f);
__m256 mask1 = _mm256_setr_ps(-0.0f, -0.0f, +0.0f, +0.0f, -0.0f, -0.0f, +0.0f, +0.0f);
simd_f_t mask1 = srslte_simd_f_loadu(_mask1);
simd_f_t mask2 = srslte_simd_f_loadu(_mask2);
for (i = 0; i < nof_symbols - 3; i += 4) {
for (; i < nof_symbols - SRSLTE_SIMD_CF_SIZE + 1; i += SRSLTE_SIMD_CF_SIZE) {
/* Load channel */
__m256 h00i = _mm256_load_ps((float *) &h[0][0][i]);
__m256 h01i = _mm256_load_ps((float *) &h[0][1][i]);
__m256 h10i = _mm256_load_ps((float *) &h[1][0][i]);
__m256 h11i = _mm256_load_ps((float *) &h[1][1][i]);
simd_cf_t h00i = srslte_simd_cfi_load(&h[0][0][i]);
simd_cf_t h01i = srslte_simd_cfi_load(&h[0][1][i]);
simd_cf_t h10i = srslte_simd_cfi_load(&h[1][0][i]);
simd_cf_t h11i = srslte_simd_cfi_load(&h[1][1][i]);
/* Apply precoding */
__m256 h00 = _mm256_add_ps(h00i, _mm256_xor_ps(h10i, mask0));
__m256 h10 = _mm256_add_ps(h01i, _mm256_xor_ps(h11i, mask0));
__m256 h01 = _mm256_add_ps(h00i, _mm256_xor_ps(h10i, mask1));
__m256 h11 = _mm256_add_ps(h01i, _mm256_xor_ps(h11i, mask1));
simd_cf_t h00, h01, h10, h11;
h00 = srslte_simd_cf_add(h00i, srslte_simd_cf_neg_mask(h10i, mask1));
h10 = srslte_simd_cf_add(h01i, srslte_simd_cf_neg_mask(h11i, mask1));
h01 = srslte_simd_cf_add(h00i, srslte_simd_cf_neg_mask(h10i, mask2));
h11 = srslte_simd_cf_add(h01i, srslte_simd_cf_neg_mask(h11i, mask2));
__m256 y0 = _mm256_load_ps((float *) &y[0][i]);
__m256 y1 = _mm256_load_ps((float *) &y[1][i]);
simd_cf_t y0 = srslte_simd_cfi_load(&y[0][i]);
simd_cf_t y1 = srslte_simd_cfi_load(&y[1][i]);
__m256 x0, x1;
simd_cf_t x0, x1;
simd_f_t csi0, csi1;
srslte_mat_2x2_zf_avx(y0, y1, h00, h01, h10, h11, &x0, &x1, 2.0f / scaling);
srslte_mat_2x2_zf_csi_simd(y0, y1, h00, h01, h10, h11, &x0, &x1, &csi0, &csi1, norm);
_mm256_store_ps((float *) &x[0][i], x0);
_mm256_store_ps((float *) &x[1][i], x1);
}
srslte_simd_cfi_store(&x[0][i], x0);
srslte_simd_cfi_store(&x[1][i], x1);
return nof_symbols;
srslte_simd_f_store(&csi[0][i], csi0);
srslte_simd_f_store(&csi[1][i], csi1);
}
#endif /* LV_HAVE_AVX */
#endif /* SRSLTE_SIMD_CF_SIZE != 0 */
// SSE implementation of ZF 2x2 CCD equalizer
#ifdef LV_HAVE_SSE
cf_t h00, h01, h10, h11, det;
for (; i < nof_symbols; i++) {
int srslte_predecoding_ccd_2x2_zf_sse(cf_t *y[SRSLTE_MAX_PORTS],
cf_t *h[SRSLTE_MAX_PORTS][SRSLTE_MAX_PORTS],
cf_t *x[SRSLTE_MAX_LAYERS],
uint32_t nof_symbols,
float scaling) {
uint32_t i = 0;
// Even precoder
h00 = +h[0][0][i] + h[1][0][i];
h10 = +h[0][1][i] + h[1][1][i];
h01 = +h[0][0][i] - h[1][0][i];
h11 = +h[0][1][i] - h[1][1][i];
det = (h00 * h11 - h01 * h10);
det = conjf(det) * (norm / (crealf(det) * crealf(det) + cimagf(det) * cimagf(det)));
for (i = 0; i < nof_symbols - 1; i += 2) {
/* Load channel */
__m128 h00i = _mm_load_ps((float *) &h[0][0][i]);
__m128 h01i = _mm_load_ps((float *) &h[0][1][i]);
__m128 h10i = _mm_load_ps((float *) &h[1][0][i]);
__m128 h11i = _mm_load_ps((float *) &h[1][1][i]);
x[0][i] = (+h11 * y[0][i] - h01 * y[1][i]) * det;
x[1][i] = (-h10 * y[0][i] + h00 * y[1][i]) * det;
/* Apply precoding */
__m128 h00 = _mm_add_ps(h00i, _mm_xor_ps(h10i, _mm_setr_ps(+0.0f, +0.0f, -0.0f, -0.0f)));
__m128 h10 = _mm_add_ps(h01i, _mm_xor_ps(h11i, _mm_setr_ps(+0.0f, +0.0f, -0.0f, -0.0f)));
__m128 h01 = _mm_add_ps(h00i, _mm_xor_ps(h10i, _mm_setr_ps(-0.0f, -0.0f, +0.0f, +0.0f)));
__m128 h11 = _mm_add_ps(h01i, _mm_xor_ps(h11i, _mm_setr_ps(-0.0f, -0.0f, +0.0f, +0.0f)));
csi[0][i] = 1.0f;
csi[1][i] = 1.0f;
__m128 y0 = _mm_load_ps((float *) &y[0][i]);
__m128 y1 = _mm_load_ps((float *) &y[1][i]);
i++;
__m128 x0, x1;
// Odd precoder
h00 = h[0][0][i] - h[1][0][i];
h10 = h[0][1][i] - h[1][1][i];
h01 = h[0][0][i] + h[1][0][i];
h11 = h[0][1][i] + h[1][1][i];
det = (h00 * h11 - h01 * h10);
det = conjf(det) * (norm / (crealf(det) * crealf(det) + cimagf(det) * cimagf(det)));
srslte_mat_2x2_zf_sse(y0, y1, h00, h01, h10, h11, &x0, &x1, 2.0f / scaling);
x[0][i] = (+h11 * y[0][i] - h01 * y[1][i]) * det;
x[1][i] = (-h10 * y[0][i] + h00 * y[1][i]) * det;
_mm_store_ps((float *) &x[0][i], x0);
_mm_store_ps((float *) &x[1][i], x1);
csi[0][i] = 1.0f;
csi[1][i] = 1.0f;
}
return nof_symbols;
return SRSLTE_SUCCESS;
}
#endif
// Generic implementation of ZF 2x2 CCD equalizer
int srslte_predecoding_ccd_2x2_zf_gen(cf_t *y[SRSLTE_MAX_PORTS],
static int srslte_predecoding_ccd_2x2_zf(cf_t *y[SRSLTE_MAX_PORTS],
cf_t *h[SRSLTE_MAX_PORTS][SRSLTE_MAX_PORTS],
cf_t *x[SRSLTE_MAX_LAYERS],
int nof_symbols,
float scaling) {
cf_t h00, h01, h10, h11;
uint32_t i = 0;
float norm = 2.0f / scaling;
#if SRSLTE_SIMD_CF_SIZE != 0
#if SRSLTE_SIMD_CF_SIZE == 16
float _mask1[SRSLTE_SIMD_CF_SIZE] = {+0.0f, +0.0f, -0.0f, -0.0f, +0.0f, +0.0f, -0.0f, -0.0f,
+0.0f, +0.0f, -0.0f, -0.0f, +0.0f, +0.0f, -0.0f, -0.0f};
float _mask2[SRSLTE_SIMD_CF_SIZE] = {-0.0f, -0.0f, +0.0f, +0.0f, -0.0f, -0.0f, +0.0f, +0.0f
-0.0f, -0.0f, +0.0f, +0.0f, -0.0f, -0.0f, +0.0f, +0.0f};
#elif SRSLTE_SIMD_CF_SIZE == 8
float _mask1[SRSLTE_SIMD_CF_SIZE] = {+0.0f, +0.0f, -0.0f, -0.0f, +0.0f, +0.0f, -0.0f, -0.0f};
float _mask2[SRSLTE_SIMD_CF_SIZE] = {-0.0f, -0.0f, +0.0f, +0.0f, -0.0f, -0.0f, +0.0f, +0.0f};
#elif SRSLTE_SIMD_CF_SIZE == 4
float _mask1[SRSLTE_SIMD_CF_SIZE] = {+0.0f, +0.0f, -0.0f, -0.0f};
float _mask2[SRSLTE_SIMD_CF_SIZE] = {-0.0f, -0.0f, +0.0f, +0.0f};
#endif
for (int i = 0; i < nof_symbols; i++) {
simd_f_t mask1 = srslte_simd_f_loadu(_mask1);
simd_f_t mask2 = srslte_simd_f_loadu(_mask2);
for (; i < nof_symbols - SRSLTE_SIMD_CF_SIZE + 1; i += SRSLTE_SIMD_CF_SIZE) {
/* Load channel */
simd_cf_t h00i = srslte_simd_cfi_load(&h[0][0][i]);
simd_cf_t h01i = srslte_simd_cfi_load(&h[0][1][i]);
simd_cf_t h10i = srslte_simd_cfi_load(&h[1][0][i]);
simd_cf_t h11i = srslte_simd_cfi_load(&h[1][1][i]);
/* Apply precoding */
simd_cf_t h00, h01, h10, h11;
h00 = srslte_simd_cf_add(h00i, srslte_simd_cf_neg_mask(h10i, mask1));
h10 = srslte_simd_cf_add(h01i, srslte_simd_cf_neg_mask(h11i, mask1));
h01 = srslte_simd_cf_add(h00i, srslte_simd_cf_neg_mask(h10i, mask2));
h11 = srslte_simd_cf_add(h01i, srslte_simd_cf_neg_mask(h11i, mask2));
simd_cf_t y0 = srslte_simd_cfi_load(&y[0][i]);
simd_cf_t y1 = srslte_simd_cfi_load(&y[1][i]);
simd_cf_t x0, x1;
srslte_mat_2x2_zf_simd(y0, y1, h00, h01, h10, h11, &x0, &x1, norm);
srslte_simd_cfi_store(&x[0][i], x0);
srslte_simd_cfi_store(&x[1][i], x1);
}
#endif /* SRSLTE_SIMD_CF_SIZE != 0 */
cf_t h00, h01, h10, h11, det;
for (; i < nof_symbols; i++) {
// Even precoder
h00 = +h[0][0][i] + h[1][0][i];
h10 = +h[0][1][i] + h[1][1][i];
h01 = +h[0][0][i] - h[1][0][i];
h11 = +h[0][1][i] - h[1][1][i];
det = (h00 * h11 - h01 * h10);
det = conjf(det) * (norm / (crealf(det) * crealf(det) + cimagf(det) * cimagf(det)));
srslte_mat_2x2_zf_gen(y[0][i], y[1][i], h00, h01, h10, h11, &x[0][i], &x[1][i], 2.0f / scaling);
x[0][i] = (+h11 * y[0][i] - h01 * y[1][i]) * det;
x[1][i] = (-h10 * y[0][i] + h00 * y[1][i]) * det;
i++;
@ -718,26 +885,31 @@ int srslte_predecoding_ccd_2x2_zf_gen(cf_t *y[SRSLTE_MAX_PORTS],
h10 = h[0][1][i] - h[1][1][i];
h01 = h[0][0][i] + h[1][0][i];
h11 = h[0][1][i] + h[1][1][i];
det = (h00 * h11 - h01 * h10);
det = conjf(det) * (norm / (crealf(det) * crealf(det) + cimagf(det) * cimagf(det)));
srslte_mat_2x2_zf_gen(y[0][i], y[1][i], h00, h01, h10, h11, &x[0][i], &x[1][i], 2.0f / scaling);
x[0][i] = (+h11 * y[0][i] - h01 * y[1][i]) * det;
x[1][i] = (-h10 * y[0][i] + h00 * y[1][i]) * det;
}
return SRSLTE_SUCCESS;
}
int srslte_predecoding_ccd_zf(cf_t *y[SRSLTE_MAX_PORTS], cf_t *h[SRSLTE_MAX_PORTS][SRSLTE_MAX_PORTS], cf_t *x[SRSLTE_MAX_LAYERS],
int nof_rxant, int nof_ports, int nof_layers, int nof_symbols, float scaling)
{
static int srslte_predecoding_ccd_zf(cf_t *y[SRSLTE_MAX_PORTS],
cf_t *h[SRSLTE_MAX_PORTS][SRSLTE_MAX_PORTS],
cf_t *x[SRSLTE_MAX_LAYERS],
float *csi[SRSLTE_MAX_CODEWORDS],
int nof_rxant,
int nof_ports,
int nof_layers,
int nof_symbols,
float scaling) {
if (nof_ports == 2 && nof_rxant == 2) {
if (nof_layers == 2) {
#ifdef LV_HAVE_AVX
return srslte_predecoding_ccd_2x2_zf_avx(y, h, x, nof_symbols, scaling);
#else
#ifdef LV_HAVE_SSE
return srslte_predecoding_ccd_2x2_zf_sse(y, h, x, nof_symbols, scaling);
#else
return srslte_predecoding_ccd_2x2_zf_gen(y, h, x, nof_symbols, scaling);
#endif /* LV_HAVE_SSE */
#endif /* LV_HAVE_AVX */
if (csi && csi[0]) {
return srslte_predecoding_ccd_2x2_zf_csi(y, h, x, csi, nof_symbols, scaling);
} else {
return srslte_predecoding_ccd_2x2_zf(y, h, x, nof_symbols, scaling);
}
} else {
DEBUG("Error predecoding CCD: Invalid number of layers %d\n", nof_layers);
return -1;
@ -750,86 +922,155 @@ int srslte_predecoding_ccd_zf(cf_t *y[SRSLTE_MAX_PORTS], cf_t *h[SRSLTE_MAX_PORT
return SRSLTE_ERROR;
}
// AVX implementation of MMSE 2x2 CCD equalizer
#ifdef LV_HAVE_AVX
int srslte_predecoding_ccd_2x2_mmse_avx(cf_t *y[SRSLTE_MAX_PORTS],
static int srslte_predecoding_ccd_2x2_mmse_csi(cf_t *y[SRSLTE_MAX_PORTS],
cf_t *h[SRSLTE_MAX_PORTS][SRSLTE_MAX_PORTS],
cf_t *x[SRSLTE_MAX_LAYERS],
uint32_t nof_symbols, float scaling, float noise_estimate) {
uint32_t i = 0;
float *csi[SRSLTE_MAX_CODEWORDS],
int nof_symbols, float scaling, float noise_estimate) {
int i = 0;
float norm = 2.0f / scaling;
#if SRSLTE_SIMD_CF_SIZE != 0
#if SRSLTE_SIMD_CF_SIZE == 16
float _mask1[SRSLTE_SIMD_CF_SIZE] = {+0.0f, -0.0f, +0.0f, -0.0f, +0.0f, -0.0f, +0.0f, -0.0f,
+0.0f, -0.0f, +0.0f, -0.0f, +0.0f, -0.0f, +0.0f, -0.0f};
float _mask2[SRSLTE_SIMD_CF_SIZE] = {-0.0f, +0.0f, -0.0f, +0.0f, -0.0f, +0.0f, -0.0f, +0.0f,
-0.0f, +0.0f, -0.0f, +0.0f, -0.0f, +0.0f, -0.0f, +0.0f};
#elif SRSLTE_SIMD_CF_SIZE == 8
float _mask1[SRSLTE_SIMD_CF_SIZE] = {+0.0f, -0.0f, +0.0f, -0.0f, +0.0f, -0.0f, +0.0f, -0.0f};
float _mask2[SRSLTE_SIMD_CF_SIZE] = {-0.0f, +0.0f, -0.0f, +0.0f, -0.0f, +0.0f, -0.0f, +0.0f};
#elif SRSLTE_SIMD_CF_SIZE == 4
float _mask1[SRSLTE_SIMD_CF_SIZE] = {+0.0f, -0.0f, +0.0f, -0.0f};
float _mask2[SRSLTE_SIMD_CF_SIZE] = {-0.0f, +0.0f, -0.0f, +0.0f};
#endif
for (i = 0; i < nof_symbols - 3; i += 4) {
simd_f_t mask1 = srslte_simd_f_loadu(_mask1);
simd_f_t mask2 = srslte_simd_f_loadu(_mask2);
for (; i < nof_symbols - SRSLTE_SIMD_CF_SIZE + 1; i += SRSLTE_SIMD_CF_SIZE) {
/* Load channel */
__m256 h00i = _mm256_load_ps((float *) &h[0][0][i]);
__m256 h01i = _mm256_load_ps((float *) &h[0][1][i]);
__m256 h10i = _mm256_load_ps((float *) &h[1][0][i]);
__m256 h11i = _mm256_load_ps((float *) &h[1][1][i]);
simd_cf_t h00i = srslte_simd_cfi_load(&h[0][0][i]);
simd_cf_t h01i = srslte_simd_cfi_load(&h[0][1][i]);
simd_cf_t h10i = srslte_simd_cfi_load(&h[1][0][i]);
simd_cf_t h11i = srslte_simd_cfi_load(&h[1][1][i]);
/* Apply precoding */
__m256 h00 = _mm256_add_ps(h00i, _mm256_xor_ps(h10i, _mm256_setr_ps(+0.0f, +0.0f, -0.0f, -0.0f, +0.0f, +0.0f, -0.0f, -0.0f)));
__m256 h10 = _mm256_add_ps(h01i, _mm256_xor_ps(h11i, _mm256_setr_ps(+0.0f, +0.0f, -0.0f, -0.0f, +0.0f, +0.0f, -0.0f, -0.0f)));
__m256 h01 = _mm256_add_ps(h00i, _mm256_xor_ps(h10i, _mm256_setr_ps(-0.0f, -0.0f, +0.0f, +0.0f, -0.0f, -0.0f, +0.0f, +0.0f)));
__m256 h11 = _mm256_add_ps(h01i, _mm256_xor_ps(h11i, _mm256_setr_ps(-0.0f, -0.0f, +0.0f, +0.0f, -0.0f, -0.0f, +0.0f, +0.0f)));
simd_cf_t h00, h01, h10, h11;
h00 = srslte_simd_cf_add(h00i, srslte_simd_cf_neg_mask(h10i, mask1));
h10 = srslte_simd_cf_add(h01i, srslte_simd_cf_neg_mask(h11i, mask1));
h01 = srslte_simd_cf_add(h00i, srslte_simd_cf_neg_mask(h10i, mask2));
h11 = srslte_simd_cf_add(h01i, srslte_simd_cf_neg_mask(h11i, mask2));
__m256 y0 = _mm256_load_ps((float *) &y[0][i]);
__m256 y1 = _mm256_load_ps((float *) &y[1][i]);
simd_cf_t y0 = srslte_simd_cfi_load(&y[0][i]);
simd_cf_t y1 = srslte_simd_cfi_load(&y[1][i]);
__m256 x0, x1;
simd_cf_t x0, x1;
simd_f_t csi0, csi1;
srslte_mat_2x2_mmse_avx(y0, y1, h00, h01, h10, h11, &x0, &x1, noise_estimate, 2.0f / scaling);
srslte_mat_2x2_mmse_csi_simd(y0, y1, h00, h01, h10, h11, &x0, &x1, &csi0, &csi1, noise_estimate, norm);
_mm256_store_ps((float *) &x[0][i], x0);
_mm256_store_ps((float *) &x[1][i], x1);
}
srslte_simd_cfi_store(&x[0][i], x0);
srslte_simd_cfi_store(&x[1][i], x1);
return nof_symbols;
srslte_simd_f_store(&csi[0][i], csi0);
srslte_simd_f_store(&csi[1][i], csi1);
}
#endif /* LV_HAVE_AVX */
#endif /* SRSLTE_SIMD_CF_SIZE != 0 */
// SSE implementation of ZF 2x2 CCD equalizer
#ifdef LV_HAVE_SSE
cf_t h00, h01, h10, h11;
for (; i < nof_symbols; i++) {
// Even precoder
h00 = +h[0][0][i] + h[1][0][i];
h10 = +h[0][1][i] + h[1][1][i];
h01 = +h[0][0][i] - h[1][0][i];
h11 = +h[0][1][i] - h[1][1][i];
srslte_mat_2x2_mmse_csi_gen(y[0][i],
y[1][i],
h00,
h01,
h10,
h11,
&x[0][i],
&x[1][i],
&csi[0][i],
&csi[1][i],
noise_estimate,
norm);
i++;
// Odd precoder
h00 = h[0][0][i] - h[1][0][i];
h10 = h[0][1][i] - h[1][1][i];
h01 = h[0][0][i] + h[1][0][i];
h11 = h[0][1][i] + h[1][1][i];
srslte_mat_2x2_mmse_csi_gen(y[0][i],
y[1][i],
h00,
h01,
h10,
h11,
&x[0][i],
&x[1][i],
&csi[0][i],
&csi[1][i],
noise_estimate,
norm);
}
return SRSLTE_SUCCESS;
}
int srslte_predecoding_ccd_2x2_mmse_sse(cf_t *y[SRSLTE_MAX_PORTS],
static int srslte_predecoding_ccd_2x2_mmse(cf_t *y[SRSLTE_MAX_PORTS],
cf_t *h[SRSLTE_MAX_PORTS][SRSLTE_MAX_PORTS],
cf_t *x[SRSLTE_MAX_LAYERS],
uint32_t nof_symbols, float scaling, float noise_estimate) {
uint32_t i = 0;
int nof_symbols, float scaling, float noise_estimate) {
int i = 0;
float norm = 2.0f / scaling;
#if SRSLTE_SIMD_CF_SIZE != 0
#if SRSLTE_SIMD_CF_SIZE == 16
float _mask1[SRSLTE_SIMD_CF_SIZE] = {+0.0f, -0.0f, +0.0f, -0.0f, +0.0f, -0.0f, +0.0f, -0.0f,
+0.0f, -0.0f, +0.0f, -0.0f, +0.0f, -0.0f, +0.0f, -0.0f};
float _mask2[SRSLTE_SIMD_CF_SIZE] = {-0.0f, +0.0f, -0.0f, +0.0f, -0.0f, +0.0f, -0.0f, +0.0f,
-0.0f, +0.0f, -0.0f, +0.0f, -0.0f, +0.0f, -0.0f, +0.0f};
#elif SRSLTE_SIMD_CF_SIZE == 8
float _mask1[SRSLTE_SIMD_CF_SIZE] = {+0.0f, -0.0f, +0.0f, -0.0f, +0.0f, -0.0f, +0.0f, -0.0f};
float _mask2[SRSLTE_SIMD_CF_SIZE] = {-0.0f, +0.0f, -0.0f, +0.0f, -0.0f, +0.0f, -0.0f, +0.0f};
#elif SRSLTE_SIMD_CF_SIZE == 4
float _mask1[SRSLTE_SIMD_CF_SIZE] = {+0.0f, -0.0f, +0.0f, -0.0f};
float _mask2[SRSLTE_SIMD_CF_SIZE] = {-0.0f, +0.0f, -0.0f, +0.0f};
#endif
simd_f_t mask1 = srslte_simd_f_loadu(_mask1);
simd_f_t mask2 = srslte_simd_f_loadu(_mask2);
for (i = 0; i < nof_symbols - 1; i += 2) {
for (; i < nof_symbols - SRSLTE_SIMD_CF_SIZE + 1; i += SRSLTE_SIMD_CF_SIZE) {
/* Load channel */
__m128 h00i = _mm_load_ps((float *) &h[0][0][i]);
__m128 h01i = _mm_load_ps((float *) &h[0][1][i]);
__m128 h10i = _mm_load_ps((float *) &h[1][0][i]);
__m128 h11i = _mm_load_ps((float *) &h[1][1][i]);
simd_cf_t h00i = srslte_simd_cfi_load(&h[0][0][i]);
simd_cf_t h01i = srslte_simd_cfi_load(&h[0][1][i]);
simd_cf_t h10i = srslte_simd_cfi_load(&h[1][0][i]);
simd_cf_t h11i = srslte_simd_cfi_load(&h[1][1][i]);
/* Apply precoding */
__m128 h00 = _mm_add_ps(h00i, _mm_xor_ps(h10i, _mm_setr_ps(+0.0f, +0.0f, -0.0f, -0.0f)));
__m128 h10 = _mm_add_ps(h01i, _mm_xor_ps(h11i, _mm_setr_ps(+0.0f, +0.0f, -0.0f, -0.0f)));
__m128 h01 = _mm_add_ps(h00i, _mm_xor_ps(h10i, _mm_setr_ps(-0.0f, -0.0f, +0.0f, +0.0f)));
__m128 h11 = _mm_add_ps(h01i, _mm_xor_ps(h11i, _mm_setr_ps(-0.0f, -0.0f, +0.0f, +0.0f)));
simd_cf_t h00, h01, h10, h11;
h00 = srslte_simd_cf_add(h00i, srslte_simd_cf_neg_mask(h10i, mask1));
h10 = srslte_simd_cf_add(h01i, srslte_simd_cf_neg_mask(h11i, mask1));
h01 = srslte_simd_cf_add(h00i, srslte_simd_cf_neg_mask(h10i, mask2));
h11 = srslte_simd_cf_add(h01i, srslte_simd_cf_neg_mask(h11i, mask2));
__m128 y0 = _mm_load_ps((float *) &y[0][i]);
__m128 y1 = _mm_load_ps((float *) &y[1][i]);
__m128 x0, x1;
simd_cf_t y0 = srslte_simd_cfi_load(&y[0][i]);
simd_cf_t y1 = srslte_simd_cfi_load(&y[1][i]);
srslte_mat_2x2_mmse_sse(y0, y1, h00, h01, h10, h11, &x0, &x1, noise_estimate, 2.0f / scaling);
_mm_store_ps((float *) &x[0][i], x0);
_mm_store_ps((float *) &x[1][i], x1);
}
simd_cf_t x0, x1;
srslte_mat_2x2_mmse_simd(y0, y1, h00, h01, h10, h11, &x0, &x1, noise_estimate, norm);
return nof_symbols;
srslte_simd_cfi_store(&x[0][i], x0);
srslte_simd_cfi_store(&x[1][i], x1);
}
#endif /* LV_HAVE_SSE */
#endif /* SRSLTE_SIMD_CF_SIZE != 0 */
// Generic implementation of ZF 2x2 CCD equalizer
int srslte_predecoding_ccd_2x2_mmse_gen(cf_t *y[SRSLTE_MAX_PORTS], cf_t *h[SRSLTE_MAX_PORTS][SRSLTE_MAX_PORTS], cf_t *x[SRSLTE_MAX_LAYERS],
int nof_symbols, float scaling, float noise_estimate) {
cf_t h00, h01, h10, h11;
for (int i = 0; i < nof_symbols; i++) {
for (i = 0; i < nof_symbols; i++) {
// Even precoder
h00 = +h[0][0][i] + h[1][0][i];
@ -850,21 +1091,23 @@ int srslte_predecoding_ccd_2x2_mmse_gen(cf_t *y[SRSLTE_MAX_PORTS], cf_t *h[SRSLT
return SRSLTE_SUCCESS;
}
int srslte_predecoding_ccd_mmse(cf_t *y[SRSLTE_MAX_PORTS], cf_t *h[SRSLTE_MAX_PORTS][SRSLTE_MAX_PORTS], cf_t *x[SRSLTE_MAX_LAYERS],
int nof_rxant, int nof_ports, int nof_layers, int nof_symbols, float scaling, float noise_estimate)
{
int srslte_predecoding_ccd_mmse(cf_t *y[SRSLTE_MAX_PORTS],
cf_t *h[SRSLTE_MAX_PORTS][SRSLTE_MAX_PORTS],
cf_t *x[SRSLTE_MAX_LAYERS],
float *csi[SRSLTE_MAX_CODEWORDS],
int nof_rxant,
int nof_ports,
int nof_layers,
int nof_symbols,
float scaling,
float noise_estimate) {
if (nof_ports == 2 && nof_rxant == 2) {
if (nof_layers == 2) {
#ifdef LV_HAVE_AVX
return srslte_predecoding_ccd_2x2_mmse_avx(y, h, x, nof_symbols, scaling, noise_estimate);
#else
#ifdef LV_HAVE_SSE
return srslte_predecoding_ccd_2x2_mmse_sse(y, h, x, nof_symbols, scaling, noise_estimate);
#else
return srslte_predecoding_ccd_2x2_mmse_gen(y, h, x, nof_symbols, scaling, noise_estimate);
#endif /* LV_HAVE_SSE */
#endif /* LV_HAVE_AVX */
if (csi && csi[0])
return srslte_predecoding_ccd_2x2_mmse_csi(y, h, x, csi, nof_symbols, scaling, noise_estimate);
else {
return srslte_predecoding_ccd_2x2_mmse(y, h, x, nof_symbols, scaling, noise_estimate);
}
} else {
DEBUG("Error predecoding CCD: Invalid number of layers %d\n", nof_layers);
return -1;
@ -1468,7 +1711,7 @@ void srslte_predecoding_set_mimo_decoder (srslte_mimo_decoder_t _mimo_decoder) {
/* 36.211 v10.3.0 Section 6.3.4 */
int srslte_predecoding_type(cf_t *y[SRSLTE_MAX_PORTS], cf_t *h[SRSLTE_MAX_PORTS][SRSLTE_MAX_PORTS],
cf_t *x[SRSLTE_MAX_LAYERS], float *csi, int nof_rxant, int nof_ports, int nof_layers,
cf_t *x[SRSLTE_MAX_LAYERS], float *csi[SRSLTE_MAX_CODEWORDS], int nof_rxant, int nof_ports, int nof_layers,
int codebook_idx, int nof_symbols, srslte_mimo_type_t type, float scaling,
float noise_estimate) {
@ -1488,10 +1731,10 @@ int srslte_predecoding_type(cf_t *y[SRSLTE_MAX_PORTS], cf_t *h[SRSLTE_MAX_PORTS]
if (nof_layers >= 2 && nof_layers <= 4) {
switch (mimo_decoder) {
case SRSLTE_MIMO_DECODER_ZF:
return srslte_predecoding_ccd_zf(y, h, x, nof_rxant, nof_ports, nof_layers, nof_symbols, scaling);
return srslte_predecoding_ccd_zf(y, h, x, csi, nof_rxant, nof_ports, nof_layers, nof_symbols, scaling);
break;
case SRSLTE_MIMO_DECODER_MMSE:
return srslte_predecoding_ccd_mmse(y, h, x, nof_rxant, nof_ports, nof_layers, nof_symbols, scaling, noise_estimate);
return srslte_predecoding_ccd_mmse(y, h, x, csi, nof_rxant, nof_ports, nof_layers, nof_symbols, scaling, noise_estimate);
break;
}
} else {
@ -1510,7 +1753,7 @@ int srslte_predecoding_type(cf_t *y[SRSLTE_MAX_PORTS], cf_t *h[SRSLTE_MAX_PORTS]
break;
case SRSLTE_MIMO_TYPE_TX_DIVERSITY:
if (nof_ports == nof_layers) {
return srslte_predecoding_diversity_multi(y, h, x, nof_rxant, nof_ports, nof_symbols, scaling);
return srslte_predecoding_diversity_multi(y, h, x, csi, nof_rxant, nof_ports, nof_symbols, scaling);
} else {
fprintf(stderr,
"Error number of layers must equal number of ports in transmit diversity\n");

@ -221,7 +221,7 @@ int srslte_pcfich_decode_multi(srslte_pcfich_t *q, cf_t *sf_symbols[SRSLTE_MAX_P
/* no need for layer demapping */
srslte_predecoding_single_multi(q_symbols, q_ce[0], q->d, NULL, q->nof_rx_antennas, q->nof_symbols, 1.0f, noise_estimate);
} else {
srslte_predecoding_diversity_multi(q_symbols, q_ce, x, q->nof_rx_antennas, q->cell.nof_ports, q->nof_symbols, 1.0f);
srslte_predecoding_diversity_multi(q_symbols, q_ce, x, NULL, q->nof_rx_antennas, q->cell.nof_ports, q->nof_symbols, 1.0f);
srslte_layerdemap_diversity(x, q->d, q->cell.nof_ports, q->nof_symbols / q->cell.nof_ports);
}

@ -492,7 +492,7 @@ int srslte_pdcch_extract_llr_multi(srslte_pdcch_t *q, cf_t *sf_symbols[SRSLTE_MA
/* no need for layer demapping */
srslte_predecoding_single_multi(q->symbols, q->ce[0], q->d, NULL, q->nof_rx_antennas, nof_symbols, 1.0f, noise_estimate/2);
} else {
srslte_predecoding_diversity_multi(q->symbols, q->ce, x, q->nof_rx_antennas, q->cell.nof_ports, nof_symbols, 1.0f);
srslte_predecoding_diversity_multi(q->symbols, q->ce, x, NULL, q->nof_rx_antennas, q->cell.nof_ports, nof_symbols, 1.0f);
srslte_layerdemap_diversity(x, q->d, q->cell.nof_ports, nof_symbols / q->cell.nof_ports);
}

@ -402,7 +402,7 @@ int srslte_pdsch_enable_csi(srslte_pdsch_t *q, bool enable) {
if (enable) {
for (int i = 0; i < SRSLTE_MAX_CODEWORDS; i++) {
if (!q->csi[i]) {
q->csi[i] = srslte_vec_malloc(sizeof(float) * q->max_re);
q->csi[i] = srslte_vec_malloc(sizeof(float) * q->max_re * 2);
if (!q->csi[i]) {
return SRSLTE_ERROR;
}
@ -757,7 +757,7 @@ int srslte_pdsch_decode(srslte_pdsch_t *q,
}
// Pre-decoder
if (srslte_predecoding_type(q->symbols, q->ce, x, q->csi[0], q->nof_rx_antennas, q->cell.nof_ports, cfg->nof_layers,
if (srslte_predecoding_type(q->symbols, q->ce, x, q->csi, q->nof_rx_antennas, q->cell.nof_ports, cfg->nof_layers,
cfg->codebook_idx, cfg->nbits[0].nof_re, cfg->mimo_type, pdsch_scaling, noise_estimate)<0) {
DEBUG("Error predecoding\n");
return SRSLTE_ERROR;

@ -241,7 +241,7 @@ int srslte_phich_decode(srslte_phich_t *q, cf_t *sf_symbols[SRSLTE_MAX_PORTS],
/* no need for layer demapping */
srslte_predecoding_single_multi(q_sf_symbols, q_ce[0], q->d0, NULL, q->nof_rx_antennas, SRSLTE_PHICH_MAX_NSYMB, 1.0f, noise_estimate);
} else {
srslte_predecoding_diversity_multi(q_sf_symbols, q_ce, x, q->nof_rx_antennas, q->cell.nof_ports, SRSLTE_PHICH_MAX_NSYMB, 1.0f);
srslte_predecoding_diversity_multi(q_sf_symbols, q_ce, x, NULL, q->nof_rx_antennas, q->cell.nof_ports, SRSLTE_PHICH_MAX_NSYMB, 1.0f);
srslte_layerdemap_diversity(x, q->d0, q->cell.nof_ports, SRSLTE_PHICH_MAX_NSYMB / q->cell.nof_ports);
}
DEBUG("Recv!!: \n");

@ -60,8 +60,8 @@ inline void srslte_mat_2x2_zf_gen(cf_t y0, cf_t y1, cf_t h00, cf_t h01, cf_t h10
}
/* Generic implementation for Minimum Mean Squared Error (MMSE) solver */
inline void srslte_mat_2x2_mmse_gen(cf_t y0, cf_t y1, cf_t h00, cf_t h01, cf_t h10, cf_t h11,
cf_t *x0, cf_t *x1, float noise_estimate, float norm) {
inline void srslte_mat_2x2_mmse_csi_gen(cf_t y0, cf_t y1, cf_t h00, cf_t h01, cf_t h10, cf_t h11,
cf_t *x0, cf_t *x1, float *csi0, float *csi1, float noise_estimate, float norm) {
/* Create conjugated matrix */
cf_t _h00 = conjf(h00);
cf_t _h01 = conjf(h01);
@ -73,14 +73,14 @@ inline void srslte_mat_2x2_mmse_gen(cf_t y0, cf_t y1, cf_t h00, cf_t h01, cf_t h
cf_t a01 = _h00 * h01 + _h10 * h11;
cf_t a10 = _h01 * h00 + _h11 * h10;
cf_t a11 = _h01 * h01 + _h11 * h11 + noise_estimate;
cf_t a_det_rcp = srslte_mat_cf_recip_gen(srslte_mat_2x2_det_gen(a00, a01, a10, a11));
/* 2. B = inv(H' x H + No) = inv(A) */
cf_t b00 = a11;
cf_t b01 = -a01;
cf_t b10 = -a10;
cf_t b11 = a00;
cf_t _norm = norm * srslte_mat_cf_recip_gen(srslte_mat_2x2_det_gen(a00, a01, a10, a11));
cf_t _norm = norm * a_det_rcp;
cf_t b00 = a11 * _norm;
cf_t b01 = -a01 * _norm;
cf_t b10 = -a10 * _norm;
cf_t b11 = a00 * _norm;
/* 3. W = inv(H' x H + No) x H' = B x H' */
cf_t w00 = b00 * _h00 + b01 * _h01;
@ -89,8 +89,19 @@ inline void srslte_mat_2x2_mmse_gen(cf_t y0, cf_t y1, cf_t h00, cf_t h01, cf_t h
cf_t w11 = b10 * _h10 + b11 * _h11;
/* 4. X = W x Y */
*x0 = (y0 * w00 + y1 * w01) * _norm;
*x1 = (y0 * w10 + y1 * w11) * _norm;
*x0 = (y0 * w00 + y1 * w01);
*x1 = (y0 * w10 + y1 * w11);
/* 5. Set CSI */
*csi0 = 1.0f / crealf(b00);
*csi1 = 1.0f / crealf(b11);
}
/* Generic implementation for Minimum Mean Squared Error (MMSE) solver */
void srslte_mat_2x2_mmse_gen(cf_t y0, cf_t y1, cf_t h00, cf_t h01, cf_t h10, cf_t h11,
cf_t *x0, cf_t *x1, float noise_estimate, float norm) {
float csi0, csi1;
srslte_mat_2x2_mmse_csi_gen(y0, y1, h00, h01, h10, h11, x0, x1, &csi0, &csi1, noise_estimate, norm);
}
inline float srslte_mat_2x2_cn(cf_t h00, cf_t h01, cf_t h10, cf_t h11) {

@ -32,8 +32,8 @@
#include <sys/time.h>
#include "srslte/phy/utils/mat.h"
#include "srslte/phy/utils/simd.h"
#include "srslte/phy/utils/vector.h"
#include "srslte/phy/utils/vector_simd.h"
bool zf_solver = false;
@ -378,6 +378,98 @@ bool test_mmse_solver_avx(void) {
#endif /* LV_HAVE_AVX */
#if SRSLTE_SIMD_CF_SIZE != 0
bool test_zf_solver_simd(void) {
cf_t cf_error0, cf_error1;
float error = 0.0f;
cf_t x0_gold_1 = RANDOM_CF();
cf_t x1_gold_1 = RANDOM_CF();
cf_t h00_1 = RANDOM_CF();
cf_t h01_1 = RANDOM_CF();
cf_t h10_1 = RANDOM_CF();
cf_t h11_1 = (1 - h01_1 * h10_1) / h00_1;
cf_t y0_1 = x0_gold_1 * h00_1 + x1_gold_1 * h01_1;
cf_t y1_1 = x0_gold_1 * h10_1 + x1_gold_1 * h11_1;
simd_cf_t _y0 = srslte_simd_cf_set1(y0_1);
simd_cf_t _y1 = srslte_simd_cf_set1(y1_1);
simd_cf_t _h00 = srslte_simd_cf_set1(h00_1);
simd_cf_t _h01 = srslte_simd_cf_set1(h01_1);
simd_cf_t _h10 = srslte_simd_cf_set1(h10_1);
simd_cf_t _h11 = srslte_simd_cf_set1(h11_1);
simd_cf_t _x0, _x1;
srslte_mat_2x2_zf_simd(_y0, _y1, _h00, _h01, _h10, _h11, &_x0, &_x1, 1.0f);
__attribute__((aligned(SRSLTE_SIMD_BIT_ALIGN))) cf_t x0[SRSLTE_SIMD_CF_SIZE];
__attribute__((aligned(SRSLTE_SIMD_BIT_ALIGN))) cf_t x1[SRSLTE_SIMD_CF_SIZE];
srslte_simd_cfi_store(x0, _x0);
srslte_simd_cfi_store(x1, _x1);
cf_error0 = x0[1] - x0_gold_1;
cf_error1 = x1[1] - x1_gold_1;
error += crealf(cf_error0) * crealf(cf_error0) + cimagf(cf_error0) * cimagf(cf_error0) +
crealf(cf_error1) * crealf(cf_error1) + cimagf(cf_error1) * cimagf(cf_error1);
return (error < 1e-3);
}
bool test_mmse_solver_simd(void) {
cf_t cf_error0, cf_error1;
float error = 0.0f;
cf_t x0_gold[SRSLTE_SIMD_CF_SIZE];
cf_t x1_gold[SRSLTE_SIMD_CF_SIZE];
cf_t h00[SRSLTE_SIMD_CF_SIZE];
cf_t h01[SRSLTE_SIMD_CF_SIZE];
cf_t h10[SRSLTE_SIMD_CF_SIZE];
cf_t h11[SRSLTE_SIMD_CF_SIZE];
cf_t y0[SRSLTE_SIMD_CF_SIZE];
cf_t y1[SRSLTE_SIMD_CF_SIZE];
for (int i = 0; i < SRSLTE_SIMD_CF_SIZE; i++) {
x0_gold[i] = RANDOM_CF();
x1_gold[i] = RANDOM_CF();
h00[i] = RANDOM_CF();
h01[i] = RANDOM_CF();
h10[i] = RANDOM_CF();
h11[i] = (1 - h01[i] * h10[i]) / h00[i];
y0[i] = x0_gold[i] * h00[i]+ x1_gold[i] * h01[i];
y1[i] = x0_gold[i] * h10[i] + x1_gold[i] * h11[i];
}
simd_cf_t _y0 = srslte_simd_cfi_loadu(y0);
simd_cf_t _y1 = srslte_simd_cfi_loadu(y1);
simd_cf_t _h00 = srslte_simd_cfi_loadu(h00);
simd_cf_t _h01 = srslte_simd_cfi_loadu(h01);
simd_cf_t _h10 = srslte_simd_cfi_loadu(h10);
simd_cf_t _h11 = srslte_simd_cfi_loadu(h11);
simd_cf_t _x0, _x1;
srslte_mat_2x2_mmse_simd(_y0, _y1, _h00, _h01, _h10, _h11, &_x0, &_x1, 0.0f, 1.0f);
__attribute__((aligned(SRSLTE_SIMD_BIT_ALIGN))) cf_t x0[SRSLTE_SIMD_CF_SIZE];
__attribute__((aligned(SRSLTE_SIMD_BIT_ALIGN))) cf_t x1[SRSLTE_SIMD_CF_SIZE];
srslte_simd_cfi_store(x0, _x0);
srslte_simd_cfi_store(x1, _x1);
cf_error0 = x0[1] - x0_gold[1];
cf_error1 = x1[1] - x1_gold[1];
error += crealf(cf_error0) * crealf(cf_error0) + cimagf(cf_error0) * cimagf(cf_error0) +
crealf(cf_error1) * crealf(cf_error1) + cimagf(cf_error1) * cimagf(cf_error1);
return (error < 1e-3);
}
#endif /* SRSLTE_SIMD_CF_SIZE != 0 */
bool test_vec_dot_prod_ccc(void) {
__attribute__((aligned(256))) cf_t a[14];
__attribute__((aligned(256))) cf_t b[14];
@ -413,6 +505,10 @@ int main(int argc, char **argv) {
#ifdef LV_HAVE_AVX
RUN_TEST(test_zf_solver_avx);
#endif /* LV_HAVE_AVX */
#if SRSLTE_SIMD_CF_SIZE != 0
RUN_TEST(test_zf_solver_simd);
#endif /* SRSLTE_SIMD_CF_SIZE != 0*/
}
if (mmse_solver) {
@ -426,6 +522,10 @@ int main(int argc, char **argv) {
#ifdef LV_HAVE_AVX
RUN_TEST(test_mmse_solver_avx);
#endif /* LV_HAVE_AVX */
#if SRSLTE_SIMD_CF_SIZE != 0
RUN_TEST(test_mmse_solver_simd);
#endif /* SRSLTE_SIMD_CF_SIZE != 0*/
}
RUN_TEST(test_vec_dot_prod_ccc);

@ -427,8 +427,8 @@ void srslte_vec_quant_sus(const int16_t *in, uint16_t *out, const float gain, co
}
}
void srs_vec_cf_cpy(const cf_t *dst, cf_t *src, int len) {
srslte_vec_cp_simd(dst, src, len);
void srs_vec_cf_cpy(const cf_t *src, cf_t *dst, int len) {
srslte_vec_cp_simd(src, dst, len);
}
void srslte_vec_interleave(const cf_t *x, const cf_t *y, cf_t *z, const int len) {

Loading…
Cancel
Save