Added CSI Predecoding for TM4 with SIMD Support

master
Xavier Arteaga 7 years ago
parent bad1291843
commit 7146819fcd

@ -1118,14 +1118,17 @@ int srslte_predecoding_ccd_mmse(cf_t *y[SRSLTE_MAX_PORTS],
return SRSLTE_ERROR; return SRSLTE_ERROR;
} }
#ifdef LV_HAVE_AVX static int srslte_predecoding_multiplex_2x2_zf_csi(cf_t *y[SRSLTE_MAX_PORTS],
cf_t *h[SRSLTE_MAX_PORTS][SRSLTE_MAX_PORTS],
// Generic implementation of ZF 2x2 Spatial Multiplexity equalizer cf_t *x[SRSLTE_MAX_LAYERS],
int srslte_predecoding_multiplex_2x2_zf_avx(cf_t *y[SRSLTE_MAX_PORTS], cf_t *h[SRSLTE_MAX_PORTS][SRSLTE_MAX_PORTS], float *csi,
cf_t *x[SRSLTE_MAX_LAYERS], int codebook_idx, int nof_symbols, float scaling) { int codebook_idx,
float norm = 1.0; int nof_symbols,
float scaling) {
float norm = 1.0f;
int i = 0;
switch(codebook_idx) { switch (codebook_idx) {
case 0: case 0:
norm = (float) M_SQRT2 / scaling; norm = (float) M_SQRT2 / scaling;
break; break;
@ -1138,62 +1141,103 @@ int srslte_predecoding_multiplex_2x2_zf_avx(cf_t *y[SRSLTE_MAX_PORTS], cf_t *h[S
return SRSLTE_ERROR; return SRSLTE_ERROR;
} }
for (int i = 0; i < nof_symbols - 3; i += 4) { #if SRSLTE_SIMD_CF_SIZE != 0
__m256 _h00 = _mm256_load_ps((float*)&(h[0][0][i])); for (; i < nof_symbols - SRSLTE_SIMD_CF_SIZE + 1; i += SRSLTE_SIMD_CF_SIZE) {
__m256 _h01 = _mm256_load_ps((float*)&(h[0][1][i])); simd_cf_t h00i = srslte_simd_cfi_load(&h[0][0][i]);
__m256 _h10 = _mm256_load_ps((float*)&(h[1][0][i])); simd_cf_t h01i = srslte_simd_cfi_load(&h[0][1][i]);
__m256 _h11 = _mm256_load_ps((float*)&(h[1][1][i])); simd_cf_t h10i = srslte_simd_cfi_load(&h[1][0][i]);
simd_cf_t h11i = srslte_simd_cfi_load(&h[1][1][i]);
__m256 h00, h01, h10, h11; simd_cf_t h00, h01, h10, h11;
switch (codebook_idx) { switch (codebook_idx) {
case 0: case 0:
h00 = _h00; h00 = h00i;
h01 = _h10; h01 = h10i;
h10 = _h01; h10 = h01i;
h11 = _h11; h11 = h11i;
break; break;
case 1: case 1:
h00 = _mm256_add_ps(_h00, _h10); h00 = srslte_simd_cf_add(h00i, h10i);
h01 = _mm256_sub_ps(_h00, _h10); h01 = srslte_simd_cf_sub(h00i, h10i);
h10 = _mm256_add_ps(_h01, _h11); h10 = srslte_simd_cf_add(h01i, h11i);
h11 = _mm256_sub_ps(_h01, _h11); h11 = srslte_simd_cf_sub(h01i, h11i);
break; break;
case 2: case 2:
h00 = _mm256_add_ps(_h00, _MM256_MULJ_PS(_h10)); h00 = srslte_simd_cf_add(h00i, srslte_simd_cf_mulj(h10i));
h01 = _mm256_sub_ps(_h00, _MM256_MULJ_PS(_h10)); h01 = srslte_simd_cf_sub(h00i, srslte_simd_cf_mulj(h10i));
h10 = _mm256_add_ps(_h01, _MM256_MULJ_PS(_h11)); h10 = srslte_simd_cf_add(h01i, srslte_simd_cf_mulj(h11i));
h11 = _mm256_sub_ps(_h01, _MM256_MULJ_PS(_h11)); h11 = srslte_simd_cf_sub(h01i, srslte_simd_cf_mulj(h11i));
break; break;
default: default:
DEBUG("Wrong codebook_idx=%d\n", codebook_idx); fprintf(stderr, "Wrong codebook_idx=%d\n", codebook_idx);
return SRSLTE_ERROR; return SRSLTE_ERROR;
} }
__m256 y0 = _mm256_load_ps((float *) &y[0][i]); simd_cf_t y0 = srslte_simd_cfi_load(&y[0][i]);
__m256 y1 = _mm256_load_ps((float *) &y[1][i]); simd_cf_t y1 = srslte_simd_cfi_load(&y[1][i]);
__m256 x0, x1;
srslte_mat_2x2_zf_avx(y0, y1, h00, h01, h10, h11, &x0, &x1, norm); simd_cf_t x0, x1;
simd_f_t csi0, csi1;
srslte_mat_2x2_zf_csi_simd(y0, y1, h00, h01, h10, h11, &x0, &x1, &csi0, &csi1, norm);
_mm256_store_ps((float *) &x[0][i], x0); srslte_simd_cfi_store(&x[0][i], x0);
_mm256_store_ps((float *) &x[1][i], x1); srslte_simd_cfi_store(&x[1][i], x1);
srslte_simd_f_store(&csi[i], csi0);
srslte_simd_f_store(&csi[i], csi1);
} }
#endif /* SRSLTE_SIMD_CF_SIZE */
return SRSLTE_SUCCESS; for (; i < nof_symbols; i++) {
} cf_t h00, h01, h10, h11;
#endif /* LV_HAVE_AVX */ switch (codebook_idx) {
case 0:
h00 = h[0][0][i];
h01 = h[1][0][i];
h10 = h[0][1][i];
h11 = h[1][1][i];
break;
case 1:
h00 = h[0][0][i] + h[1][0][i];
h01 = h[0][0][i] - h[1][0][i];
h10 = h[0][1][i] + h[1][1][i];
h11 = h[0][1][i] - h[1][1][i];
break;
case 2:
h00 = h[0][0][i] + _Complex_I * h[1][0][i];
h01 = h[0][0][i] - _Complex_I * h[1][0][i];
h10 = h[0][1][i] + _Complex_I * h[1][1][i];
h11 = h[0][1][i] - _Complex_I * h[1][1][i];
break;
default:
fprintf(stderr, "Wrong codebook_idx=%d\n", codebook_idx);
return SRSLTE_ERROR;
}
#ifdef LV_HAVE_SSE cf_t det = (h00 * h11 - h01 * h10);
det = conjf(det) * (norm / (crealf(det) * crealf(det) + cimagf(det) * cimagf(det)));
// SSE implementation of ZF 2x2 Spatial Multiplexity equalizer x[0][i] = (+h11 * y[0][i] - h01 * y[1][i]) * det;
int srslte_predecoding_multiplex_2x2_zf_sse(cf_t *y[SRSLTE_MAX_PORTS], cf_t *h[SRSLTE_MAX_PORTS][SRSLTE_MAX_PORTS], x[1][i] = (-h10 * y[0][i] + h00 * y[1][i]) * det;
cf_t *x[SRSLTE_MAX_LAYERS], int codebook_idx, int nof_symbols, float scaling) {
float norm = 1.0;
switch(codebook_idx) { csi[i] = 1.0f;
csi[i] = 1.0f;
}
return SRSLTE_SUCCESS;
}
// Generic implementation of ZF 2x2 Spatial Multiplexity equalizer
static int srslte_predecoding_multiplex_2x2_zf(cf_t *y[SRSLTE_MAX_PORTS],
cf_t *h[SRSLTE_MAX_PORTS][SRSLTE_MAX_PORTS],
cf_t *x[SRSLTE_MAX_LAYERS],
int codebook_idx,
int nof_symbols,
float scaling) {
float norm = 1.0f;
int i = 0;
switch (codebook_idx) {
case 0: case 0:
norm = (float) M_SQRT2 / scaling; norm = (float) M_SQRT2 / scaling;
break; break;
@ -1206,77 +1250,55 @@ int srslte_predecoding_multiplex_2x2_zf_sse(cf_t *y[SRSLTE_MAX_PORTS], cf_t *h[S
return SRSLTE_ERROR; return SRSLTE_ERROR;
} }
for (int i = 0; i < nof_symbols - 1; i += 2) { #if SRSLTE_SIMD_CF_SIZE != 0
__m128 _h00 = _mm_load_ps((float*)&(h[0][0][i])); for (; i < nof_symbols - SRSLTE_SIMD_CF_SIZE + 1; i += SRSLTE_SIMD_CF_SIZE) {
__m128 _h01 = _mm_load_ps((float*)&(h[0][1][i])); simd_cf_t h00i = srslte_simd_cfi_load(&h[0][0][i]);
__m128 _h10 = _mm_load_ps((float*)&(h[1][0][i])); simd_cf_t h01i = srslte_simd_cfi_load(&h[0][1][i]);
__m128 _h11 = _mm_load_ps((float*)&(h[1][1][i])); simd_cf_t h10i = srslte_simd_cfi_load(&h[1][0][i]);
simd_cf_t h11i = srslte_simd_cfi_load(&h[1][1][i]);
__m128 h00, h01, h10, h11; simd_cf_t h00, h01, h10, h11;
switch (codebook_idx) { switch (codebook_idx) {
case 0: case 0:
h00 = _h00; h00 = h00i;
h01 = _h10; h01 = h10i;
h10 = _h01; h10 = h01i;
h11 = _h11; h11 = h11i;
break; break;
case 1: case 1:
h00 = _mm_add_ps(_h00, _h10); h00 = srslte_simd_cf_add(h00i, h10i);
h01 = _mm_sub_ps(_h00, _h10); h01 = srslte_simd_cf_sub(h00i, h10i);
h10 = _mm_add_ps(_h01, _h11); h10 = srslte_simd_cf_add(h01i, h11i);
h11 = _mm_sub_ps(_h01, _h11); h11 = srslte_simd_cf_sub(h01i, h11i);
break; break;
case 2: case 2:
h00 = _mm_add_ps(_h00, _MM_MULJ_PS(_h10)); h00 = srslte_simd_cf_add(h00i, srslte_simd_cf_mulj(h10i));
h01 = _mm_sub_ps(_h00, _MM_MULJ_PS(_h10)); h01 = srslte_simd_cf_sub(h00i, srslte_simd_cf_mulj(h10i));
h10 = _mm_add_ps(_h01, _MM_MULJ_PS(_h11)); h10 = srslte_simd_cf_add(h01i, srslte_simd_cf_mulj(h11i));
h11 = _mm_sub_ps(_h01, _MM_MULJ_PS(_h11)); h11 = srslte_simd_cf_sub(h01i, srslte_simd_cf_mulj(h11i));
break; break;
default: default:
fprintf(stderr, "Wrong codebook_idx=%d\n", codebook_idx); fprintf(stderr, "Wrong codebook_idx=%d\n", codebook_idx);
return SRSLTE_ERROR; return SRSLTE_ERROR;
} }
__m128 y0 = _mm_load_ps((float *) &y[0][i]); simd_cf_t y0 = srslte_simd_cfi_load(&y[0][i]);
__m128 y1 = _mm_load_ps((float *) &y[1][i]); simd_cf_t y1 = srslte_simd_cfi_load(&y[1][i]);
__m128 x0, x1;
srslte_mat_2x2_zf_sse(y0, y1, h00, h01, h10, h11, &x0, &x1, norm);
_mm_store_ps((float *) &x[0][i], x0);
_mm_store_ps((float *) &x[1][i], x1);
}
return SRSLTE_SUCCESS;
}
#endif /* LV_HAVE_SSE */
simd_cf_t x0, x1;
simd_f_t csi0, csi1;
srslte_mat_2x2_zf_csi_simd(y0, y1, h00, h01, h10, h11, &x0, &x1, &csi0, &csi1, norm);
// Generic implementation of ZF 2x2 Spatial Multiplexity equalizer srslte_simd_cfi_store(&x[0][i], x0);
int srslte_predecoding_multiplex_2x2_zf_gen(cf_t *y[SRSLTE_MAX_PORTS], cf_t *h[SRSLTE_MAX_PORTS][SRSLTE_MAX_PORTS], srslte_simd_cfi_store(&x[1][i], x1);
cf_t *x[SRSLTE_MAX_LAYERS], int codebook_idx, int nof_symbols, float scaling) {
float norm = 1.0;
switch(codebook_idx) {
case 0:
norm = (float) M_SQRT2 / scaling;
break;
case 1:
case 2:
norm = 2.0f / scaling;
break;
default:
fprintf(stderr, "Wrong codebook_idx=%d\n", codebook_idx);
return SRSLTE_ERROR;
} }
#endif /* SRSLTE_SIMD_CF_SIZE */
for (int i = 0; i < nof_symbols; i++) { for (; i < nof_symbols; i++) {
cf_t h00, h01, h10, h11, det; cf_t h00, h01, h10, h11;
switch(codebook_idx) { switch (codebook_idx) {
case 0: case 0:
h00 = h[0][0][i]; h00 = h[0][0][i];
h01 = h[1][0][i]; h01 = h[1][0][i];
@ -1290,34 +1312,34 @@ int srslte_predecoding_multiplex_2x2_zf_gen(cf_t *y[SRSLTE_MAX_PORTS], cf_t *h[S
h11 = h[0][1][i] - h[1][1][i]; h11 = h[0][1][i] - h[1][1][i];
break; break;
case 2: case 2:
h00 = h[0][0][i] + _Complex_I*h[1][0][i]; h00 = h[0][0][i] + _Complex_I * h[1][0][i];
h01 = h[0][0][i] - _Complex_I*h[1][0][i]; h01 = h[0][0][i] - _Complex_I * h[1][0][i];
h10 = h[0][1][i] + _Complex_I*h[1][1][i]; h10 = h[0][1][i] + _Complex_I * h[1][1][i];
h11 = h[0][1][i] - _Complex_I*h[1][1][i]; h11 = h[0][1][i] - _Complex_I * h[1][1][i];
break; break;
default: default:
fprintf(stderr, "Wrong codebook_idx=%d\n", codebook_idx); fprintf(stderr, "Wrong codebook_idx=%d\n", codebook_idx);
return SRSLTE_ERROR; return SRSLTE_ERROR;
} }
det = (h00 * h11 - h01 * h10); srslte_mat_2x2_zf_gen(y[0][i], y[1][i], h00, h01, h10, h11, &x[0][i], &x[1][i], norm);
det = conjf(det) * (norm / (crealf(det) * crealf(det) + cimagf(det) * cimagf(det)));
x[0][i] = (+h11 * y[0][i] - h01 * y[1][i]) * det;
x[1][i] = (-h10 * y[0][i] + h00 * y[1][i]) * det;
} }
return SRSLTE_SUCCESS; return SRSLTE_SUCCESS;
} }
#ifdef LV_HAVE_AVX // Generic implementation of ZF 2x2 Spatial Multiplexity equalizer
static int srslte_predecoding_multiplex_2x2_mmse_csi(cf_t *y[SRSLTE_MAX_PORTS],
// AVX implementation of ZF 2x2 Spatial Multiplexity equalizer cf_t *h[SRSLTE_MAX_PORTS][SRSLTE_MAX_PORTS],
int srslte_predecoding_multiplex_2x2_mmse_avx(cf_t *y[SRSLTE_MAX_PORTS], cf_t *h[SRSLTE_MAX_PORTS][SRSLTE_MAX_PORTS], cf_t *x[SRSLTE_MAX_LAYERS],
cf_t *x[SRSLTE_MAX_LAYERS], int codebook_idx, int nof_symbols, float *csi[SRSLTE_MAX_CODEWORDS],
float scaling, float noise_estimate) { int codebook_idx,
float norm = 1.0; int nof_symbols,
float scaling,
float noise_estimate) {
float norm = 1.0f;
int i = 0;
switch(codebook_idx) { switch (codebook_idx) {
case 0: case 0:
norm = (float) M_SQRT2 / scaling; norm = (float) M_SQRT2 / scaling;
break; break;
@ -1326,132 +1348,109 @@ int srslte_predecoding_multiplex_2x2_mmse_avx(cf_t *y[SRSLTE_MAX_PORTS], cf_t *h
norm = 2.0f / scaling; norm = 2.0f / scaling;
break; break;
default: default:
fprintf(stderr, "Wrong codebook_idx=%d\n", codebook_idx); ERROR("Wrong codebook_idx=%d", codebook_idx);
return SRSLTE_ERROR; return SRSLTE_ERROR;
} }
for (int i = 0; i < nof_symbols; i += 4) { #if SRSLTE_SIMD_CF_SIZE != 0
__m256 _h00 = _mm256_load_ps((float*)&(h[0][0][i])); for (; i < nof_symbols - SRSLTE_SIMD_CF_SIZE + 1; i += SRSLTE_SIMD_CF_SIZE) {
__m256 _h01 = _mm256_load_ps((float*)&(h[0][1][i])); simd_cf_t h00i = srslte_simd_cfi_load(&h[0][0][i]);
__m256 _h10 = _mm256_load_ps((float*)&(h[1][0][i])); simd_cf_t h01i = srslte_simd_cfi_load(&h[0][1][i]);
__m256 _h11 = _mm256_load_ps((float*)&(h[1][1][i])); simd_cf_t h10i = srslte_simd_cfi_load(&h[1][0][i]);
simd_cf_t h11i = srslte_simd_cfi_load(&h[1][1][i]);
__m256 h00, h01, h10, h11; simd_cf_t h00, h01, h10, h11;
switch (codebook_idx) { switch (codebook_idx) {
case 0: case 0:
h00 = _h00; h00 = h00i;
h01 = _h10; h01 = h10i;
h10 = _h01; h10 = h01i;
h11 = _h11; h11 = h11i;
break; break;
case 1: case 1:
h00 = _mm256_add_ps(_h00, _h10); h00 = srslte_simd_cf_add(h00i, h10i);
h01 = _mm256_sub_ps(_h00, _h10); h01 = srslte_simd_cf_sub(h00i, h10i);
h10 = _mm256_add_ps(_h01, _h11); h10 = srslte_simd_cf_add(h01i, h11i);
h11 = _mm256_sub_ps(_h01, _h11); h11 = srslte_simd_cf_sub(h01i, h11i);
break; break;
case 2: case 2:
h00 = _mm256_add_ps(_h00, _MM256_MULJ_PS(_h10)); h00 = srslte_simd_cf_add(h00i, srslte_simd_cf_mulj(h10i));
h01 = _mm256_sub_ps(_h00, _MM256_MULJ_PS(_h10)); h01 = srslte_simd_cf_sub(h00i, srslte_simd_cf_mulj(h10i));
h10 = _mm256_add_ps(_h01, _MM256_MULJ_PS(_h11)); h10 = srslte_simd_cf_add(h01i, srslte_simd_cf_mulj(h11i));
h11 = _mm256_sub_ps(_h01, _MM256_MULJ_PS(_h11)); h11 = srslte_simd_cf_sub(h01i, srslte_simd_cf_mulj(h11i));
break; break;
default: default:
fprintf(stderr, "Wrong codebook_idx=%d\n", codebook_idx); fprintf(stderr, "Wrong codebook_idx=%d\n", codebook_idx);
return SRSLTE_ERROR; return SRSLTE_ERROR;
} }
__m256 y0 = _mm256_load_ps((float *) &y[0][i]); simd_cf_t y0 = srslte_simd_cfi_load(&y[0][i]);
__m256 y1 = _mm256_load_ps((float *) &y[1][i]); simd_cf_t y1 = srslte_simd_cfi_load(&y[1][i]);
__m256 x0, x1;
srslte_mat_2x2_mmse_avx(y0, y1, h00, h01, h10, h11, &x0, &x1, noise_estimate, norm);
_mm256_store_ps((float *) &x[0][i], x0);
_mm256_store_ps((float *) &x[1][i], x1);
}
return SRSLTE_SUCCESS;
}
#endif /* LV_HAVE_AVX */
#ifdef LV_HAVE_SSE simd_cf_t x0, x1;
simd_f_t csi0, csi1;
srslte_mat_2x2_mmse_csi_simd(y0, y1, h00, h01, h10, h11, &x0, &x1, &csi0, &csi1, noise_estimate, norm);
// SSE implementation of ZF 2x2 Spatial Multiplexity equalizer srslte_simd_cfi_store(&x[0][i], x0);
int srslte_predecoding_multiplex_2x2_mmse_sse(cf_t *y[SRSLTE_MAX_PORTS], cf_t *h[SRSLTE_MAX_PORTS][SRSLTE_MAX_PORTS], srslte_simd_cfi_store(&x[1][i], x1);
cf_t *x[SRSLTE_MAX_LAYERS], int codebook_idx, int nof_symbols,
float scaling, float noise_estimate) {
float norm;
switch(codebook_idx) { srslte_simd_f_store(&csi[0][i], csi0);
case 0: srslte_simd_f_store(&csi[1][i], csi1);
norm = (float) M_SQRT2 / scaling;
break;
case 1:
case 2:
norm = 2.0f / scaling;
break;
default:
fprintf(stderr, "Wrong codebook_idx=%d\n", codebook_idx);
return SRSLTE_ERROR;
} }
#endif /* SRSLTE_SIMD_CF_SIZE */
for (int i = 0; i < nof_symbols - 1; i += 2) { for (; i < nof_symbols; i++) {
__m128 _h00 = _mm_load_ps((float*)&(h[0][0][i])); cf_t h00, h01, h10, h11;
__m128 _h01 = _mm_load_ps((float*)&(h[0][1][i]));
__m128 _h10 = _mm_load_ps((float*)&(h[1][0][i]));
__m128 _h11 = _mm_load_ps((float*)&(h[1][1][i]));
__m128 h00, h01, h10, h11;
switch (codebook_idx) { switch (codebook_idx) {
case 0: case 0:
h00 = _h00; h00 = h[0][0][i];
h01 = _h10; h01 = h[1][0][i];
h10 = _h01; h10 = h[0][1][i];
h11 = _h11; h11 = h[1][1][i];
break; break;
case 1: case 1:
h00 = _mm_add_ps(_h00, _h10); h00 = h[0][0][i] + h[1][0][i];
h01 = _mm_sub_ps(_h00, _h10); h01 = h[0][0][i] - h[1][0][i];
h10 = _mm_add_ps(_h01, _h11); h10 = h[0][1][i] + h[1][1][i];
h11 = _mm_sub_ps(_h01, _h11); h11 = h[0][1][i] - h[1][1][i];
break; break;
case 2: case 2:
h00 = _mm_add_ps(_h00, _MM_MULJ_PS(_h10)); h00 = h[0][0][i] + _Complex_I * h[1][0][i];
h01 = _mm_sub_ps(_h00, _MM_MULJ_PS(_h10)); h01 = h[0][0][i] - _Complex_I * h[1][0][i];
h10 = _mm_add_ps(_h01, _MM_MULJ_PS(_h11)); h10 = h[0][1][i] + _Complex_I * h[1][1][i];
h11 = _mm_sub_ps(_h01, _MM_MULJ_PS(_h11)); h11 = h[0][1][i] - _Complex_I * h[1][1][i];
break; break;
default: default:
fprintf(stderr, "Wrong codebook_idx=%d\n", codebook_idx); fprintf(stderr, "Wrong codebook_idx=%d\n", codebook_idx);
return SRSLTE_ERROR; return SRSLTE_ERROR;
} }
__m128 y0 = _mm_load_ps((float *) &y[0][i]); srslte_mat_2x2_mmse_csi_gen(y[0][i],
__m128 y1 = _mm_load_ps((float *) &y[1][i]); y[1][i],
h00,
__m128 x0, x1; h01,
h10,
srslte_mat_2x2_mmse_sse(y0, y1, h00, h01, h10, h11, &x0, &x1, noise_estimate, norm); h11,
&x[0][i],
_mm_store_ps((float *) &x[0][i], x0); &x[1][i],
_mm_store_ps((float *) &x[1][i], x1); &csi[0][i],
&csi[1][i],
noise_estimate,
norm);
} }
return SRSLTE_SUCCESS; return SRSLTE_SUCCESS;
} }
#endif /* LV_HAVE_SSE */
// Generic implementation of ZF 2x2 Spatial Multiplexity equalizer static int srslte_predecoding_multiplex_2x2_mmse(cf_t *y[SRSLTE_MAX_PORTS],
int srslte_predecoding_multiplex_2x2_mmse_gen(cf_t *y[SRSLTE_MAX_PORTS], cf_t *h[SRSLTE_MAX_PORTS][SRSLTE_MAX_PORTS], cf_t *h[SRSLTE_MAX_PORTS][SRSLTE_MAX_PORTS],
cf_t *x[SRSLTE_MAX_LAYERS], int codebook_idx, int nof_symbols, cf_t *x[SRSLTE_MAX_LAYERS],
float scaling, float noise_estimate) { int codebook_idx,
int nof_symbols,
float scaling,
float noise_estimate) {
float norm = 1.0; float norm = 1.0;
int i = 0;
switch(codebook_idx) { switch(codebook_idx) {
case 0: case 0:
@ -1466,7 +1465,51 @@ int srslte_predecoding_multiplex_2x2_mmse_gen(cf_t *y[SRSLTE_MAX_PORTS], cf_t *h
return SRSLTE_ERROR; return SRSLTE_ERROR;
} }
for (int i = 0; i < nof_symbols; i++) { #if SRSLTE_SIMD_CF_SIZE != 0
for (; i < nof_symbols - SRSLTE_SIMD_CF_SIZE + 1; i += SRSLTE_SIMD_CF_SIZE) {
simd_cf_t h00i = srslte_simd_cfi_load(&h[0][0][i]);
simd_cf_t h01i = srslte_simd_cfi_load(&h[0][1][i]);
simd_cf_t h10i = srslte_simd_cfi_load(&h[1][0][i]);
simd_cf_t h11i = srslte_simd_cfi_load(&h[1][1][i]);
simd_cf_t h00, h01, h10, h11;
switch(codebook_idx) {
case 0:
h00 = h00i;
h01 = h10i;
h10 = h01i;
h11 = h11i;
break;
case 1:
h00 = srslte_simd_cf_add(h00i, h10i);
h01 = srslte_simd_cf_sub(h00i, h10i);
h10 = srslte_simd_cf_add(h01i, h11i);
h11 = srslte_simd_cf_sub(h01i, h11i);
break;
case 2:
h00 = srslte_simd_cf_add(h00i, srslte_simd_cf_mulj(h10i));
h01 = srslte_simd_cf_sub(h00i, srslte_simd_cf_mulj(h10i));
h10 = srslte_simd_cf_add(h01i, srslte_simd_cf_mulj(h11i));
h11 = srslte_simd_cf_sub(h01i, srslte_simd_cf_mulj(h11i));
break;
default:
fprintf(stderr, "Wrong codebook_idx=%d\n", codebook_idx);
return SRSLTE_ERROR;
}
simd_cf_t y0 = srslte_simd_cfi_load(&y[0][i]);
simd_cf_t y1 = srslte_simd_cfi_load(&y[1][i]);
simd_cf_t x0, x1;
simd_f_t csi0, csi1;
srslte_mat_2x2_mmse_csi_simd(y0, y1, h00, h01, h10, h11, &x0, &x1, &csi0, &csi1, noise_estimate, norm);
srslte_simd_cfi_store(&x[0][i], x0);
srslte_simd_cfi_store(&x[1][i], x1);
}
#endif /* SRSLTE_SIMD_CF_SIZE */
for (; i < nof_symbols; i++) {
cf_t h00, h01, h10, h11; cf_t h00, h01, h10, h11;
switch(codebook_idx) { switch(codebook_idx) {
@ -1498,134 +1541,147 @@ int srslte_predecoding_multiplex_2x2_mmse_gen(cf_t *y[SRSLTE_MAX_PORTS], cf_t *h
return SRSLTE_SUCCESS; return SRSLTE_SUCCESS;
} }
#ifdef LV_HAVE_AVX
// Generic implementation of MRC 2x1 (two antennas into one layer) Spatial Multiplexing equalizer
int srslte_predecoding_multiplex_2x1_mrc_avx(cf_t *y[SRSLTE_MAX_PORTS], cf_t *h[SRSLTE_MAX_PORTS][SRSLTE_MAX_PORTS],
cf_t *x[SRSLTE_MAX_LAYERS], int codebook_idx, int nof_symbols, float scaling) {
for (int i = 0; i < nof_symbols - 3; i += 4) { // Implementation of MRC 2x1 (two antennas into one layer) Spatial Multiplexing equalizer
__m256 _h00 = _mm256_load_ps((float*)&(h[0][0][i])); static int srslte_predecoding_multiplex_2x1_mrc(cf_t *y[SRSLTE_MAX_PORTS],
__m256 _h01 = _mm256_load_ps((float*)&(h[0][1][i])); cf_t *h[SRSLTE_MAX_PORTS][SRSLTE_MAX_PORTS],
__m256 _h10 = _mm256_load_ps((float*)&(h[1][0][i])); cf_t *x[SRSLTE_MAX_LAYERS],
__m256 _h11 = _mm256_load_ps((float*)&(h[1][1][i])); int codebook_idx,
int nof_symbols,
float scaling) {
float norm = (float) M_SQRT2 / scaling;
int i = 0;
__m256 h0, h1; #if SRSLTE_SIMD_CF_SIZE != 0
switch (codebook_idx) { simd_f_t _norm = srslte_simd_f_set1(norm);
case 0:
h0 = _mm256_add_ps(_h00, _h10);
h1 = _mm256_add_ps(_h01, _h11);
break;
case 1:
h0 = _mm256_sub_ps(_h00, _h10);
h1 = _mm256_sub_ps(_h01, _h11);
break;
case 2:
h0 = _mm256_add_ps(_h00, _MM256_MULJ_PS(_h10));
h1 = _mm256_add_ps(_h01, _MM256_MULJ_PS(_h11));
break;
case 3:
h0 = _mm256_sub_ps(_h00, _MM256_MULJ_PS(_h10));
h1 = _mm256_sub_ps(_h01, _MM256_MULJ_PS(_h11));
break;
default:
fprintf(stderr, "Wrong codebook_idx=%d\n", codebook_idx);
return SRSLTE_ERROR;
}
__m256 h0_2 = _mm256_mul_ps(h0, h0); for (; i < nof_symbols - SRSLTE_SIMD_CF_SIZE + 1; i += SRSLTE_SIMD_CF_SIZE) {
__m256 h1_2 = _mm256_mul_ps(h1, h1); simd_cf_t x0 = srslte_simd_cf_set1(0.0f);
__m256 hh0 = _mm256_add_ps(_mm256_movehdup_ps(h0_2), _mm256_moveldup_ps(h0_2)); simd_f_t hh = srslte_simd_f_set1(0.0f);
__m256 hh1 = _mm256_add_ps(_mm256_movehdup_ps(h1_2), _mm256_moveldup_ps(h1_2));
__m256 hh = _mm256_add_ps(hh0, hh1);
__m256 hhrec = _mm256_rcp_ps(hh);
hhrec = _mm256_mul_ps(hhrec, _mm256_set1_ps((float) M_SQRT2 / scaling)); for (int k = 0; k < 2; k++) {
__m256 y0 = _mm256_load_ps((float*)&y[0][i]); simd_cf_t h0xi = srslte_simd_cfi_load(&h[0][k][i]);
__m256 y1 = _mm256_load_ps((float*)&y[1][i]); simd_cf_t h1xi = srslte_simd_cfi_load(&h[1][k][i]);
simd_cf_t yx = srslte_simd_cfi_load(&y[k][i]);
__m256 x0 = _mm256_add_ps(_MM256_PROD_PS(_MM256_CONJ_PS(h0), y0), _MM256_PROD_PS(_MM256_CONJ_PS(h1), y1)); simd_cf_t hx;
x0 = _mm256_mul_ps(hhrec, x0); switch (codebook_idx) {
case 0:
hx = srslte_simd_cf_add(h0xi, h1xi);
break;
case 1:
hx = srslte_simd_cf_sub(h0xi, h1xi);
break;
case 2:
hx = srslte_simd_cf_add(h0xi, srslte_simd_cf_mulj(h1xi));
break;
case 3:
hx = srslte_simd_cf_sub(h0xi, srslte_simd_cf_mulj(h1xi));
break;
default:
fprintf(stderr, "Wrong codebook_idx=%d\n", codebook_idx);
return SRSLTE_ERROR;
}
_mm256_store_ps((float*)&x[0][i], x0); hh = srslte_simd_f_add(srslte_simd_cf_re(srslte_simd_cf_conjprod(hx, hx)), hh);
x0 = srslte_simd_cf_add(srslte_simd_cf_conjprod(yx, hx), x0);
}
hh = srslte_simd_f_mul(_norm, srslte_simd_f_rcp(hh));
srslte_simd_cfi_store(&x[0][i], srslte_simd_cf_mul(x0, hh));
} }
#endif /* SRSLTE_SIMD_CF_SIZE */
return SRSLTE_SUCCESS; for (; i < nof_symbols; i += 1) {
} cf_t h0, h1;
float hh;
#endif /* LV_HAVE_AVX */
// SSE implementation of MRC 2x1 (two antennas into one layer) Spatial Multiplexing equalizer
#ifdef LV_HAVE_SSE
int srslte_predecoding_multiplex_2x1_mrc_sse(cf_t *y[SRSLTE_MAX_PORTS], cf_t *h[SRSLTE_MAX_PORTS][SRSLTE_MAX_PORTS],
cf_t *x[SRSLTE_MAX_LAYERS], int codebook_idx, int nof_symbols, float scaling) {
for (int i = 0; i < nof_symbols - 1; i += 2) {
__m128 _h00 = _mm_load_ps((float*)&(h[0][0][i]));
__m128 _h01 = _mm_load_ps((float*)&(h[0][1][i]));
__m128 _h10 = _mm_load_ps((float*)&(h[1][0][i]));
__m128 _h11 = _mm_load_ps((float*)&(h[1][1][i]));
__m128 h0, h1;
switch (codebook_idx) { switch (codebook_idx) {
case 0: case 0:
h0 = _mm_add_ps(_h00, _h10); h0 = h[0][0][i] + h[1][0][i];
h1 = _mm_add_ps(_h01, _h11); h1 = h[0][1][i] + h[1][1][i];
break; break;
case 1: case 1:
h0 = _mm_sub_ps(_h00, _h10); h0 = h[0][0][i] - h[1][0][i];
h1 = _mm_sub_ps(_h01, _h11); h1 = h[0][1][i] - h[1][1][i];
break; break;
case 2: case 2:
h0 = _mm_add_ps(_h00, _MM_MULJ_PS(_h10)); h0 = h[0][0][i] + _Complex_I * h[1][0][i];
h1 = _mm_add_ps(_h01, _MM_MULJ_PS(_h11)); h1 = h[0][1][i] + _Complex_I * h[1][1][i];
break; break;
case 3: case 3:
h0 = _mm_sub_ps(_h00, _MM_MULJ_PS(_h10)); h0 = h[0][0][i] - _Complex_I * h[1][0][i];
h1 = _mm_sub_ps(_h01, _MM_MULJ_PS(_h11)); h1 = h[0][1][i] - _Complex_I * h[1][1][i];
break; break;
default: default:
fprintf(stderr, "Wrong codebook_idx=%d\n", codebook_idx); fprintf(stderr, "Wrong codebook_idx=%d\n", codebook_idx);
return SRSLTE_ERROR; return SRSLTE_ERROR;
} }
__m128 h0_2 = _mm_mul_ps(h0, h0); hh = norm / (crealf(h0) * crealf(h0) + cimagf(h0) * cimagf(h0) + crealf(h1) * crealf(h1) + cimagf(h1) * cimagf(h1));
__m128 h1_2 = _mm_mul_ps(h1, h1);
__m128 hh0 = _mm_add_ps(_mm_movehdup_ps(h0_2), _mm_moveldup_ps(h0_2));
__m128 hh1 = _mm_add_ps(_mm_movehdup_ps(h1_2), _mm_moveldup_ps(h1_2));
__m128 hh = _mm_add_ps(hh0, hh1);
__m128 hhrec = _mm_rcp_ps(hh);
hhrec = _mm_mul_ps(hhrec, _mm_set1_ps((float) M_SQRT2 / scaling)); x[0][i] = (conjf(h0) * y[0][i] + conjf(h1) * y[1][i]) * hh;
}
return SRSLTE_SUCCESS;
}
__m128 y0 = _mm_load_ps((float*)&y[0][i]); // Generic implementation of MRC 2x1 (two antennas into one layer) Spatial Multiplexing equalizer
__m128 y1 = _mm_load_ps((float*)&y[1][i]); static int srslte_predecoding_multiplex_2x1_mrc_csi(cf_t *y[SRSLTE_MAX_PORTS],
cf_t *h[SRSLTE_MAX_PORTS][SRSLTE_MAX_PORTS],
cf_t *x[SRSLTE_MAX_LAYERS],
float *csi,
int codebook_idx,
int nof_symbols,
float scaling) {
float norm = (float) M_SQRT2 / scaling;
int i = 0;
__m128 x0 = _mm_add_ps(_MM_PROD_PS(_MM_CONJ_PS(h0), y0), _MM_PROD_PS(_MM_CONJ_PS(h1), y1)); #if SRSLTE_SIMD_CF_SIZE != 0
x0 = _mm_mul_ps(hhrec, x0); simd_f_t _norm = srslte_simd_f_set1(norm);
_mm_store_ps((float*)&x[0][i], x0); for (; i < nof_symbols - SRSLTE_SIMD_CF_SIZE + 1; i += SRSLTE_SIMD_CF_SIZE) {
simd_cf_t x0 = srslte_simd_cf_set1(0.0f);
simd_f_t hh = srslte_simd_f_set1(0.0f);
} for (int k = 0; k < 2; k++) {
simd_cf_t h0xi = srslte_simd_cfi_load(&h[0][k][i]);
simd_cf_t h1xi = srslte_simd_cfi_load(&h[1][k][i]);
simd_cf_t yx = srslte_simd_cfi_load(&y[k][i]);
return SRSLTE_SUCCESS; simd_cf_t hx;
} switch (codebook_idx) {
case 0:
hx = srslte_simd_cf_add(h0xi, h1xi);
break;
case 1:
hx = srslte_simd_cf_sub(h0xi, h1xi);
break;
case 2:
hx = srslte_simd_cf_add(h0xi, srslte_simd_cf_mulj(h1xi));
break;
case 3:
hx = srslte_simd_cf_sub(h0xi, srslte_simd_cf_mulj(h1xi));
break;
default:
fprintf(stderr, "Wrong codebook_idx=%d\n", codebook_idx);
return SRSLTE_ERROR;
}
#endif /* LV_HAVE_SSE */ hh = srslte_simd_f_add(srslte_simd_cf_re(srslte_simd_cf_conjprod(hx, hx)), hh);
x0 = srslte_simd_cf_add(srslte_simd_cf_conjprod(yx, hx), x0);
}
// Generic implementation of MRC 2x1 (two antennas into one layer) Spatial Multiplexing equalizer hh = srslte_simd_f_mul(_norm, srslte_simd_f_rcp(hh));
int srslte_predecoding_multiplex_2x1_mrc_gen(cf_t *y[SRSLTE_MAX_PORTS], cf_t *h[SRSLTE_MAX_PORTS][SRSLTE_MAX_PORTS], srslte_simd_cfi_store(&x[0][i], srslte_simd_cf_mul(x0, hh));
cf_t *x[SRSLTE_MAX_LAYERS], int codebook_idx, int nof_symbols, float scaling) { srslte_simd_f_store(&csi[i], srslte_simd_f_mul(srslte_simd_f_rcp(hh), srslte_simd_f_set1((float) M_SQRT1_2)));
float norm = (float) M_SQRT2 / scaling; }
#endif /* SRSLTE_SIMD_CF_SIZE */
for (int i = 0; i < nof_symbols; i += 1) { for (; i < nof_symbols; i += 1) {
cf_t h0, h1; cf_t h0, h1;
float hh; float hh, _csi;
switch(codebook_idx) { switch (codebook_idx) {
case 0: case 0:
h0 = h[0][0][i] + h[1][0][i]; h0 = h[0][0][i] + h[1][0][i];
h1 = h[0][1][i] + h[1][1][i]; h1 = h[0][1][i] + h[1][1][i];
@ -1647,53 +1703,57 @@ int srslte_predecoding_multiplex_2x1_mrc_gen(cf_t *y[SRSLTE_MAX_PORTS], cf_t *h[
return SRSLTE_ERROR; return SRSLTE_ERROR;
} }
hh = norm / (crealf(h0) * crealf(h0) + cimagf(h0) * cimagf(h0) + crealf(h1) * crealf(h1) + cimagf(h1) * cimagf(h1)); _csi = crealf(h0) * crealf(h0) + cimagf(h0) * cimagf(h0) + crealf(h1) * crealf(h1) + cimagf(h1) * cimagf(h1);
hh = norm / _csi;
x[0][i] = (conjf(h0) * y[0][i] + conjf(h1) * y[1][i]) * hh; x[0][i] = (conjf(h0) * y[0][i] + conjf(h1) * y[1][i]) * hh;
csi[i] = _csi / norm * (float) M_SQRT1_2;
} }
return SRSLTE_SUCCESS; return SRSLTE_SUCCESS;
} }
int srslte_predecoding_multiplex(cf_t *y[SRSLTE_MAX_PORTS], cf_t *h[SRSLTE_MAX_PORTS][SRSLTE_MAX_PORTS], cf_t *x[SRSLTE_MAX_LAYERS], static int srslte_predecoding_multiplex(cf_t *y[SRSLTE_MAX_PORTS],
int nof_rxant, int nof_ports, int nof_layers, int codebook_idx, int nof_symbols, cf_t *h[SRSLTE_MAX_PORTS][SRSLTE_MAX_PORTS],
float scaling, float noise_estimate) cf_t *x[SRSLTE_MAX_LAYERS],
{ float *csi[SRSLTE_MAX_CODEWORDS],
int nof_rxant,
int nof_ports,
int nof_layers,
int codebook_idx,
int nof_symbols,
float scaling,
float noise_estimate) {
if (nof_ports == 2 && nof_rxant <= 2) { if (nof_ports == 2 && nof_rxant <= 2) {
if (nof_layers == 2) { if (nof_layers == 2) {
switch (mimo_decoder) { switch (mimo_decoder) {
case SRSLTE_MIMO_DECODER_ZF: case SRSLTE_MIMO_DECODER_ZF:
#ifdef LV_HAVE_AVX if (csi && csi[0]) {
return srslte_predecoding_multiplex_2x2_zf_avx(y, h, x, codebook_idx, nof_symbols, scaling); return srslte_predecoding_multiplex_2x2_zf_csi(y, h, x, csi[0], codebook_idx, nof_symbols, scaling);
#else } else {
#ifdef LV_HAVE_SSE return srslte_predecoding_multiplex_2x2_zf(y, h, x, codebook_idx, nof_symbols, scaling);
return srslte_predecoding_multiplex_2x2_zf_sse(y, h, x, codebook_idx, nof_symbols, scaling); }
#else
return srslte_predecoding_multiplex_2x2_zf_gen(y, h, x, codebook_idx, nof_symbols, scaling);
#endif /* LV_HAVE_SSE */
#endif /* LV_HAVE_AVX */
break; break;
case SRSLTE_MIMO_DECODER_MMSE: case SRSLTE_MIMO_DECODER_MMSE:
#ifdef LV_HAVE_AVX if (csi && csi[0]) {
return srslte_predecoding_multiplex_2x2_mmse_avx(y, h, x, codebook_idx, nof_symbols, scaling, noise_estimate); return srslte_predecoding_multiplex_2x2_mmse_csi(y,
#else h,
#ifdef LV_HAVE_SSE x,
return srslte_predecoding_multiplex_2x2_mmse_sse(y, h, x, codebook_idx, nof_symbols, scaling, noise_estimate); csi,
#else codebook_idx,
return srslte_predecoding_multiplex_2x2_mmse_gen(y, h, x, codebook_idx, nof_symbols, scaling, noise_estimate); nof_symbols,
#endif /* LV_HAVE_SSE */ scaling,
#endif /* LV_HAVE_AVX */ noise_estimate);
} else {
return srslte_predecoding_multiplex_2x2_mmse(y, h, x, codebook_idx, nof_symbols, scaling, noise_estimate);
}
break; break;
} }
} else { } else {
#ifdef LV_HAVE_AVX if (csi && csi[0]) {
return srslte_predecoding_multiplex_2x1_mrc_avx(y, h, x, codebook_idx, nof_symbols, scaling); return srslte_predecoding_multiplex_2x1_mrc_csi(y, h, x, csi[0], codebook_idx, nof_symbols, scaling);
#else } else {
#ifdef LV_HAVE_SSE return srslte_predecoding_multiplex_2x1_mrc(y, h, x, codebook_idx, nof_symbols, scaling);
return srslte_predecoding_multiplex_2x1_mrc_sse(y, h, x, codebook_idx, nof_symbols, scaling); }
#else
return srslte_predecoding_multiplex_2x1_mrc_gen(y, h, x, codebook_idx, nof_symbols, scaling);
#endif /* LV_HAVE_SSE */
#endif /* LV_HAVE_AVX */
} }
} else if (nof_ports == 4) { } else if (nof_ports == 4) {
DEBUG("Error predecoding multiplex: not implemented for %d Tx ports", nof_ports); DEBUG("Error predecoding multiplex: not implemented for %d Tx ports", nof_ports);
@ -1759,7 +1819,7 @@ int srslte_predecoding_type(cf_t *y[SRSLTE_MAX_PORTS], cf_t *h[SRSLTE_MAX_PORTS]
} }
break; break;
case SRSLTE_MIMO_TYPE_SPATIAL_MULTIPLEX: case SRSLTE_MIMO_TYPE_SPATIAL_MULTIPLEX:
return srslte_predecoding_multiplex(y, h, x, nof_rxant, nof_ports, nof_layers, codebook_idx, nof_symbols, return srslte_predecoding_multiplex(y, h, x, csi, nof_rxant, nof_ports, nof_layers, codebook_idx, nof_symbols,
scaling, noise_estimate); scaling, noise_estimate);
default: default:
return SRSLTE_ERROR; return SRSLTE_ERROR;

Loading…
Cancel
Save