From 6c194dc078991be9547da5a547f4742b1fb5aa18 Mon Sep 17 00:00:00 2001 From: ismagom Date: Fri, 16 Oct 2015 11:05:13 +0200 Subject: [PATCH] Conditional SSE compilation --- CMakeLists.txt | 12 +- cmake/modules/FindVolk.cmake | 2 + srslte/include/srslte/mimo/precoding.h | 3 +- srslte/include/srslte/phch/sch.h | 7 +- srslte/include/srslte/srslte.h | 3 +- srslte/include/srslte/utils/vector.h | 2 +- srslte/lib/ch_estimation/test/chest_test_dl.c | 20 +-- .../ch_estimation/test/chest_test_dl_mex.c | 2 +- srslte/lib/dft/src/ofdm.c | 2 +- srslte/lib/fec/src/rm_turbo.c | 160 +++++++++++++++--- srslte/lib/fec/src/turbodecoder_gen.c | 2 +- srslte/lib/fec/src/turbodecoder_sse.c | 17 +- srslte/lib/fec/test/turbodecoder_test.c | 43 +---- srslte/lib/mimo/src/precoding.c | 157 +++++++++++++++-- srslte/lib/mimo/test/precoding_test.c | 3 +- srslte/lib/modem/src/demod_soft.c | 98 +++++------ srslte/lib/phch/src/pbch.c | 3 +- srslte/lib/phch/src/pcfich.c | 3 +- srslte/lib/phch/src/pdcch.c | 2 +- srslte/lib/phch/src/pdsch.c | 3 +- srslte/lib/phch/src/phich.c | 3 +- srslte/lib/phch/src/pusch.c | 3 +- srslte/lib/phch/src/sch.c | 16 +- srslte/lib/phch/test/pdsch_test.c | 10 +- srslte/lib/utils/src/vector.c | 18 +- srslte/lib/utils/src/vector_simd.c | 30 ++-- 26 files changed, 434 insertions(+), 190 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index e053d6a0c..85ea56d5c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -83,8 +83,18 @@ IF(CMAKE_COMPILER_IS_GNUCXX) #Any additional flags for CXX ENDIF(CMAKE_COMPILER_IS_GNUCXX) +FIND_PACKAGE(SSE) + IF(CMAKE_COMPILER_IS_GNUCC) - set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Wall -Wno-write-strings -Wno-format-extra-args -Winline -Wno-unused-result -Wno-format -std=c99 -D_GNU_SOURCE -g -mfpmath=sse -mavx -O3") + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Wall -Wno-write-strings -Wno-format-extra-args -Winline -Wno-unused-result -Wno-format -std=c99 -D_GNU_SOURCE -g -march=native -O3") + IF(AVX_FOUND) + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mfpmath=sse -mavx -DLV_HAVE_AVX -DLV_HAVE_SSE") + ELSEIF(SSE4_2_FOUND) + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mfpmath=sse -msse_4.2 -DLV_HAVE_SSE") + ELSEIF(SSE4_1_FOUND) + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mfpmath=sse -msse_4.1 -DLV_HAVE_SSE") + ENDIF(AVX_FOUND) + # IF(${CMAKE_BUILD_TYPE} STREQUAL "Debug") # set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Werror -Wno-error=implicit-function-declaration -Wno-error=unused-but-set-variable") # ENDIF(${CMAKE_BUILD_TYPE} STREQUAL "Debug") diff --git a/cmake/modules/FindVolk.cmake b/cmake/modules/FindVolk.cmake index 3c70329e6..07fb660cd 100644 --- a/cmake/modules/FindVolk.cmake +++ b/cmake/modules/FindVolk.cmake @@ -47,6 +47,7 @@ CHECK_FUNCTION_EXISTS_MATH(volk_32f_x2_subtract_32f HAVE_VOLK_SUB_FLOAT_FUNCTION CHECK_FUNCTION_EXISTS_MATH(volk_32fc_x2_square_dist_32f HAVE_VOLK_SQUARE_DIST_FUNCTION) CHECK_FUNCTION_EXISTS_MATH(volk_32fc_deinterleave_real_32f HAVE_VOLK_DEINTERLEAVE_FUNCTION) CHECK_FUNCTION_EXISTS_MATH(volk_32fc_index_max_16u HAVE_VOLK_MAX_ABS_FUNCTION) +CHECK_FUNCTION_EXISTS_MATH(volk_16i_s32f_convert_32f HAVE_VOLK_CONVERT_IF_FUNCTION) INCLUDE(FindPackageHandleStandardArgs) FIND_PACKAGE_HANDLE_STANDARD_ARGS(VOLK DEFAULT_MSG VOLK_LIBRARIES VOLK_INCLUDE_DIRS) @@ -54,6 +55,7 @@ MARK_AS_ADVANCED(VOLK_LIBRARIES VOLK_INCLUDE_DIRS VOLK_DEFINITIONS) IF(VOLK_FOUND) SET(CMAKE_REQUIRED_LIBRARIES ${VOLK_LIBRARIES} m) + CHECK_FUNCTION_EXISTS_MATH(volk_16i_s32f_convert_32f HAVE_VOLK_CONVERT_IF_FUNCTION) CHECK_FUNCTION_EXISTS_MATH(volk_32f_index_max_16u HAVE_VOLK_MAX_FUNCTION) CHECK_FUNCTION_EXISTS_MATH(volk_32f_x2_max_32f HAVE_VOLK_MAX_VEC_FUNCTION) CHECK_FUNCTION_EXISTS_MATH(volk_32f_accumulator_s32f HAVE_VOLK_ACC_FUNCTION) diff --git a/srslte/include/srslte/mimo/precoding.h b/srslte/include/srslte/mimo/precoding.h index f49899511..cd9e7c48b 100644 --- a/srslte/include/srslte/mimo/precoding.h +++ b/srslte/include/srslte/mimo/precoding.h @@ -85,8 +85,7 @@ SRSLTE_API int srslte_precoding_type(srslte_precoding_t *q, /* Estimates the vector "x" based on the received signal "y" and the channel estimates "h" */ -SRSLTE_API int srslte_predecoding_single(srslte_precoding_t *q, - cf_t *y, +SRSLTE_API int srslte_predecoding_single(cf_t *y, cf_t *h, cf_t *x, int nof_symbols, diff --git a/srslte/include/srslte/phch/sch.h b/srslte/include/srslte/phch/sch.h index 340aeb3be..d31e63a60 100644 --- a/srslte/include/srslte/phch/sch.h +++ b/srslte/include/srslte/phch/sch.h @@ -40,8 +40,7 @@ #include "srslte/common/phy_common.h" #include "srslte/fec/rm_turbo.h" #include "srslte/fec/turbocoder.h" -#include "srslte/fec/turbodecoder_gen.h" -#include "srslte/fec/turbodecoder_sse.h" +#include "srslte/fec/turbodecoder.h" #include "srslte/fec/crc.h" #include "srslte/phch/pdsch_cfg.h" #include "srslte/phch/pusch_cfg.h" @@ -66,12 +65,12 @@ typedef struct SRSLTE_API { uint8_t *parity_bits; void *e; uint8_t *temp_g_bits; - uint32_t *ul_interleaver; + uint16_t *ul_interleaver; srslte_uci_bit_t ack_ri_bits[12*288]; uint32_t nof_ri_ack_bits; srslte_tcod_t encoder; - srslte_tdec_sse_t decoder; + srslte_tdec_t decoder; srslte_crc_t crc_tb; srslte_crc_t crc_cb; diff --git a/srslte/include/srslte/srslte.h b/srslte/include/srslte/srslte.h index 44b57eea7..fcef7490b 100644 --- a/srslte/include/srslte/srslte.h +++ b/srslte/include/srslte/srslte.h @@ -63,8 +63,7 @@ #include "srslte/fec/crc.h" #include "srslte/fec/tc_interl.h" #include "srslte/fec/turbocoder.h" -#include "srslte/fec/turbodecoder_sse.h" -#include "srslte/fec/turbodecoder_gen.h" +#include "srslte/fec/turbodecoder.h" #include "srslte/fec/cbsegm.h" #include "srslte/fec/rm_conv.h" #include "srslte/fec/rm_turbo.h" diff --git a/srslte/include/srslte/utils/vector.h b/srslte/include/srslte/utils/vector.h index 537733961..4ada5fa7f 100644 --- a/srslte/include/srslte/utils/vector.h +++ b/srslte/include/srslte/utils/vector.h @@ -109,7 +109,7 @@ SRSLTE_API void srslte_vec_sc_div2_sss(short *x, int pow2_div, short *z, uint32_ SRSLTE_API void srslte_vec_norm_cfc(cf_t *x, float amplitude, cf_t *y, uint32_t len); SRSLTE_API void srslte_vec_convert_fi(float *x, int16_t *z, float scale, uint32_t len); - +SRSLTE_API void srslte_vec_convert_if(int16_t *x, float *z, float scale, uint32_t len); SRSLTE_API void srslte_vec_lut_fuf(float *x, uint32_t *lut, float *y, uint32_t len); SRSLTE_API void srslte_vec_lut_sss(short *x, unsigned short *lut, short *y, uint32_t len); diff --git a/srslte/lib/ch_estimation/test/chest_test_dl.c b/srslte/lib/ch_estimation/test/chest_test_dl.c index 71d10b739..cd900d91e 100644 --- a/srslte/lib/ch_estimation/test/chest_test_dl.c +++ b/srslte/lib/ch_estimation/test/chest_test_dl.c @@ -102,24 +102,24 @@ int main(int argc, char **argv) { num_re = 2 * cell.nof_prb * SRSLTE_NRE * SRSLTE_CP_NSYMB(cell.cp); - input = malloc(num_re * sizeof(cf_t)); + input = srslte_vec_malloc(num_re * sizeof(cf_t)); if (!input) { - perror("malloc"); + perror("srslte_vec_malloc"); goto do_exit; } - output = malloc(num_re * sizeof(cf_t)); + output = srslte_vec_malloc(num_re * sizeof(cf_t)); if (!output) { - perror("malloc"); + perror("srslte_vec_malloc"); goto do_exit; } - h = malloc(num_re * sizeof(cf_t)); + h = srslte_vec_malloc(num_re * sizeof(cf_t)); if (!h) { - perror("malloc"); + perror("srslte_vec_malloc"); goto do_exit; } - ce = malloc(num_re * sizeof(cf_t)); + ce = srslte_vec_malloc(num_re * sizeof(cf_t)); if (!ce) { - perror("malloc"); + perror("srslte_vec_malloc"); goto do_exit; } @@ -173,7 +173,7 @@ int main(int argc, char **argv) { gettimeofday(&t[1], NULL); for (int j=0;j<100;j++) { - srslte_predecoding_single(&cheq, input, ce, output, num_re, 0); + srslte_predecoding_single(input, ce, output, num_re, 0); } gettimeofday(&t[2], NULL); get_time_interval(t); @@ -188,7 +188,7 @@ int main(int argc, char **argv) { gettimeofday(&t[1], NULL); for (int j=0;j<100;j++) { - srslte_predecoding_single(&cheq, input, ce, output, num_re, srslte_chest_dl_get_noise_estimate(&est)); + srslte_predecoding_single(input, ce, output, num_re, srslte_chest_dl_get_noise_estimate(&est)); } gettimeofday(&t[2], NULL); get_time_interval(t); diff --git a/srslte/lib/ch_estimation/test/chest_test_dl_mex.c b/srslte/lib/ch_estimation/test/chest_test_dl_mex.c index c8713c47e..251c65f83 100644 --- a/srslte/lib/ch_estimation/test/chest_test_dl_mex.c +++ b/srslte/lib/ch_estimation/test/chest_test_dl_mex.c @@ -189,7 +189,7 @@ void mexFunction(int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[]) } if (cell.nof_ports == 1) { - srslte_predecoding_single(&cheq, input_signal, ce[0], output_signal2, nof_re, srslte_chest_dl_get_noise_estimate(&chest)); + srslte_predecoding_single(input_signal, ce[0], output_signal2, nof_re, srslte_chest_dl_get_noise_estimate(&chest)); } else { srslte_predecoding_diversity(&cheq, input_signal, ce, output_signal, cell.nof_ports, nof_re, srslte_chest_dl_get_noise_estimate(&chest)); srslte_layerdemap_diversity(output_signal, output_signal2, cell.nof_ports, nof_re/cell.nof_ports); diff --git a/srslte/lib/dft/src/ofdm.c b/srslte/lib/dft/src/ofdm.c index 8f66e7db3..9a0a4e9cb 100644 --- a/srslte/lib/dft/src/ofdm.c +++ b/srslte/lib/dft/src/ofdm.c @@ -181,7 +181,7 @@ void srslte_ofdm_rx_sf(srslte_ofdm_t *q, cf_t *input, cf_t *output) { srslte_vec_prod_ccc(input, q->shift_buffer, input, 2*q->slot_sz); } for (n=0;n<2;n++) { - srslte_ofdm_rx_slot_zerocopy(q, &input[n*q->slot_sz], &output[n*q->nof_re*q->nof_symbols]); + srslte_ofdm_rx_slot(q, &input[n*q->slot_sz], &output[n*q->nof_re*q->nof_symbols]); } } diff --git a/srslte/lib/fec/src/rm_turbo.c b/srslte/lib/fec/src/rm_turbo.c index 5dc4d7086..a108d942c 100644 --- a/srslte/lib/fec/src/rm_turbo.c +++ b/srslte/lib/fec/src/rm_turbo.c @@ -37,14 +37,16 @@ #include "srslte/utils/vector.h" #include "srslte/fec/cbsegm.h" -#define HAVE_SIMD -#ifdef HAVE_SIMD +#ifdef LV_HAVE_SSE #include -#include - -int srslte_rm_turbo_rx_lut_simd(int16_t *input, int16_t *output, uint32_t in_len, uint32_t cb_idx, uint32_t rv_idx); +#include +int srslte_rm_turbo_rx_lut_sse(int16_t *input, int16_t *output, uint32_t in_len, uint32_t cb_idx, uint32_t rv_idx); +#endif +#ifdef LV_HAVE_AVX +#include +int srslte_rm_turbo_rx_lut_avx(int16_t *input, int16_t *output, uint32_t in_len, uint32_t cb_idx, uint32_t rv_idx); #endif #define NCOLS 32 @@ -286,29 +288,32 @@ int srslte_rm_turbo_tx_lut(uint8_t *w_buff, uint8_t *systematic, uint8_t *parity int srslte_rm_turbo_rx_lut(int16_t *input, int16_t *output, uint32_t in_len, uint32_t cb_idx, uint32_t rv_idx) { -#ifndef HAVE_SIMD - if (rv_idx < 4 && cb_idx < SRSLTE_NOF_TC_CB_SIZES) { - uint32_t out_len = 3*srslte_cbsegm_cbsize(cb_idx)+12; - uint16_t *deinter = deinterleaver[cb_idx][rv_idx]; - - for (int i=0;i= out_len && inputCnt < in_len - 16) { + /* Copy last elements */ + if ((out_len%16) == 12) { + for (int j=(nwrapps+1)*out_len-12;j<(nwrapps+1)*out_len;j++) { + output[deinter[j%out_len]] += input[j]; + inputCnt++; + } + } else { + for (int j=(nwrapps+1)*out_len-4;j<(nwrapps+1)*out_len;j++) { + output[deinter[j%out_len]] += input[j]; + inputCnt++; + } + } + /* And wrap pointers */ + nwrapps++; + intCnt = 16; + xPtr = (const __m256i*) &input[nwrapps*out_len]; + lutPtr = (const __m256i*) deinter; + } + } + for (int i=inputCnt;i +#ifdef LV_HAVE_SSE #include -#include +#include +#endif + #define NUMSTATES 8 #define NINPUTS 2 @@ -55,11 +58,13 @@ * ************************************************/ +#ifdef LV_HAVE_SSE + static inline int16_t hMax(__m128i buffer) { - __m128i tmp1 = _mm_sub_epi8(_mm_set1_epi16(0x7FFF), buffer); - __m128i tmp3 = _mm_minpos_epu16(tmp1); - return (int16_t)(_mm_cvtsi128_si32(tmp3)); + __m128i tmp1 = _mm_sub_epi8(_mm_set1_epi16(0x7FFF), buffer); + __m128i tmp3 = _mm_minpos_epu16(tmp1); + return (int16_t)(_mm_cvtsi128_si32(tmp3)); } void srslte_map_gen_beta(srslte_map_gen_t * s, int16_t * output, uint32_t long_cb) @@ -626,3 +631,7 @@ int srslte_tdec_sse_run_all(srslte_tdec_sse_t * h, int16_t * input, uint8_t *out return SRSLTE_SUCCESS; } + +#endif + + diff --git a/srslte/lib/fec/test/turbodecoder_test.c b/srslte/lib/fec/test/turbodecoder_test.c index 428f37301..c92d4cb06 100644 --- a/srslte/lib/fec/test/turbodecoder_test.c +++ b/srslte/lib/fec/test/turbodecoder_test.c @@ -118,12 +118,10 @@ int main(int argc, char **argv) { float var[SNR_POINTS]; uint32_t snr_points; uint32_t errors; - uint32_t errors_gen; uint32_t coded_length; struct timeval tdata[3]; - float mean_usec, mean_usec_gen; - srslte_tdec_sse_t tdec; - srslte_tdec_gen_t tdec_gen; + float mean_usec; + srslte_tdec_t tdec; srslte_tcod_t tcod; parse_args(argc, argv); @@ -189,12 +187,7 @@ int main(int argc, char **argv) { exit(-1); } - if (srslte_tdec_sse_init(&tdec, frame_length)) { - fprintf(stderr, "Error initiating Turbo decoder\n"); - exit(-1); - } - - if (srslte_tdec_gen_init(&tdec_gen, frame_length)) { + if (srslte_tdec_init(&tdec, frame_length)) { fprintf(stderr, "Error initiating Turbo decoder\n"); exit(-1); } @@ -216,9 +209,7 @@ int main(int argc, char **argv) { for (i = 0; i < snr_points; i++) { mean_usec = 0; - mean_usec_gen = 0; errors = 0; - errors_gen = 0; frame_cnt = 0; while (frame_cnt < nof_frames) { /* generate data_tx */ @@ -249,8 +240,7 @@ int main(int argc, char **argv) { llr_s[j] = (int16_t) (100*llr[j]); } /* decoder */ - srslte_tdec_sse_reset(&tdec, frame_length); - srslte_tdec_gen_reset(&tdec_gen, frame_length); + srslte_tdec_reset(&tdec, frame_length); uint32_t t; if (nof_iterations == -1) { @@ -261,7 +251,7 @@ int main(int argc, char **argv) { gettimeofday(&tdata[1], NULL); for (int k=0;k +#include +int srslte_predecoding_single_sse(cf_t *y, cf_t *h, cf_t *x, int nof_symbols, float noise_estimate); +#endif + +#ifdef LV_HAVE_AVX +#include +int srslte_predecoding_single_avx(cf_t *y, cf_t *h, cf_t *x, int nof_symbols, float noise_estimate); +#endif + + /************************************************ * @@ -117,23 +129,138 @@ void srslte_precoding_free(srslte_precoding_t *q) { bzero(q, sizeof(srslte_precoding_t)); } -/* ZF/MMSE SISO equalizer x=y(h'h+no)^(-1)h' (ZF if n0=0.0)*/ -int srslte_predecoding_single(srslte_precoding_t *q, cf_t *y, cf_t *h, cf_t *x, int nof_symbols, float noise_estimate) { - if (nof_symbols <= q->max_frame_len) { - // h'h - srslte_vec_abs_square_cf(h, q->y_mod, nof_symbols); - if (noise_estimate > 0.0) { - // (h'h + n0) - srslte_vec_sc_add_fff(q->y_mod, noise_estimate, q->y_mod, nof_symbols); +#ifdef LV_HAVE_SSE + +#define PROD(a,b) _mm_addsub_ps(_mm_mul_ps(a,_mm_moveldup_ps(b)),_mm_mul_ps(_mm_shuffle_ps(a,a,0xB1),_mm_movehdup_ps(b))) + +int srslte_predecoding_single_sse(cf_t *y, cf_t *h, cf_t *x, int nof_symbols, float noise_estimate) { + + float *xPtr = (float*) x; + const float *hPtr = (const float*) h; + const float *yPtr = (const float*) y; + + __m128 conjugator = _mm_setr_ps(0, -0.f, 0, -0.f); + + __m128 noise = _mm_set1_ps(noise_estimate); + __m128 h1Val, h2Val, y1Val, y2Val, h12square, h1square, h2square, h1conj, h2conj, x1Val, x2Val; + for (int i=0;i 0) { + h12square = _mm_add_ps(h12square, noise); + } + + h1square = _mm_shuffle_ps(h12square, h12square, _MM_SHUFFLE(1, 1, 0, 0)); + h2square = _mm_shuffle_ps(h12square, h12square, _MM_SHUFFLE(3, 3, 2, 2)); + + /* Conjugate channel */ + h1conj = _mm_xor_ps(h1Val, conjugator); + h2conj = _mm_xor_ps(h2Val, conjugator); + + /* Complex product */ + x1Val = PROD(y1Val, h1conj); + x2Val = PROD(y2Val, h2conj); + + x1Val = _mm_div_ps(x1Val, h1square); + x2Val = _mm_div_ps(x2Val, h2square); + + _mm_store_ps(xPtr, x1Val); xPtr+=4; + _mm_store_ps(xPtr, x2Val); xPtr+=4; + } + for (int i=8*(nof_symbols/8);i 0) { + h12square = _mm256_add_ps(h12square, noise); } - // y*h' - srslte_vec_prod_conj_ccc(y, h, x, nof_symbols); - // divide by (h'h+no) - srslte_vec_div_cfc(x,q->y_mod,x,q->z_real,q->z_imag, nof_symbols); - return nof_symbols; + h1_p = _mm256_permute_ps(h12square, _MM_SHUFFLE(1, 1, 0, 0)); + h2_p = _mm256_permute_ps(h12square, _MM_SHUFFLE(3, 3, 2, 2)); + h1square = _mm256_permute2f128_ps(h1_p, h2_p, 2<<4); + h2square = _mm256_permute2f128_ps(h1_p, h2_p, 3<<4 | 1); + + /* Conjugate channel */ + h1conj = _mm256_xor_ps(h1Val, conjugator); + h2conj = _mm256_xor_ps(h2Val, conjugator); + + /* Complex product */ + x1Val = PROD_AVX(y1Val, h1conj); + x2Val = PROD_AVX(y2Val, h2conj); + + x1Val = _mm256_div_ps(x1Val, h1square); + x2Val = _mm256_div_ps(x2Val, h2square); + + _mm256_store_ps(xPtr, x1Val); xPtr+=8; + _mm256_store_ps(xPtr, x2Val); xPtr+=8; + } + for (int i=16*(nof_symbols/16);i 32) { + return srslte_predecoding_single_avx(y, h, x, nof_symbols, noise_estimate); } else { - return SRSLTE_ERROR; + return srslte_predecoding_single_gen(y, h, x, nof_symbols, noise_estimate); } +#else + #ifdef LV_HAVE_SSE + if (nof_symbols > 32) { + return srslte_predecoding_single_sse(y, h, x, nof_symbols, noise_estimate); + } else { + return srslte_predecoding_single_gen(y, h, x, nof_symbols, noise_estimate); + } + #else + return srslte_predecoding_single_gen(y, h, x, nof_symbols, noise_estimate); + #endif +#endif } /* ZF/MMSE STBC equalizer x=y(H'H+n0·I)^(-1)H' (ZF is n0=0.0) @@ -257,7 +384,7 @@ int srslte_predecoding_type(srslte_precoding_t *q, cf_t *y, cf_t *h[SRSLTE_MAX_P switch (type) { case SRSLTE_MIMO_TYPE_SINGLE_ANTENNA: if (nof_ports == 1 && nof_layers == 1) { - return srslte_predecoding_single(q, y, h[0], x[0], nof_symbols, noise_estimate); + return srslte_predecoding_single(y, h[0], x[0], nof_symbols, noise_estimate); } else { fprintf(stderr, "Number of ports and layers must be 1 for transmission on single antenna ports\n"); diff --git a/srslte/lib/mimo/test/precoding_test.c b/srslte/lib/mimo/test/precoding_test.c index a5a27093f..852c747a5 100644 --- a/srslte/lib/mimo/test/precoding_test.c +++ b/srslte/lib/mimo/test/precoding_test.c @@ -102,7 +102,7 @@ int main(int argc, char **argv) { perror("srslte_vec_malloc"); exit(-1); } - xr[i] = calloc(1,sizeof(cf_t) * nof_symbols); + xr[i] = srslte_vec_malloc(sizeof(cf_t) * nof_symbols); if (!xr[i]) { perror("srslte_vec_malloc"); exit(-1); @@ -186,7 +186,6 @@ int main(int argc, char **argv) { mse = 0; for (i = 0; i < nof_layers; i++) { for (j = 0; j < nof_symbols; j++) { - printf("%f - %f\n", crealf(xr[i][j]), crealf(x[i][j])); mse += cabsf(xr[i][j] - x[i][j]); } } diff --git a/srslte/lib/modem/src/demod_soft.c b/srslte/lib/modem/src/demod_soft.c index d299eb802..cec1cf5cd 100644 --- a/srslte/lib/modem/src/demod_soft.c +++ b/srslte/lib/modem/src/demod_soft.c @@ -33,16 +33,16 @@ #include "srslte/utils/bit.h" #include "srslte/modem/demod_soft.h" -#define HAVE_SIMD +// AVX implementation not useful for integers. Wait for AVX2 -#ifdef HAVE_SIMD +#ifdef LV_HAVE_SSE #include +#include #include +void demod_16qam_lte_s_sse(const cf_t *symbols, short *llr, int nsymbols); #endif -//#define SCALE_DEMOD16QAM - #define SCALE_SHORT_CONV_QPSK 100 #define SCALE_SHORT_CONV_QAM16 400 #define SCALE_SHORT_CONV_QAM64 700 @@ -72,48 +72,17 @@ void demod_16qam_lte(const cf_t *symbols, float *llr, int nsymbols) { float yre = crealf(symbols[i]); float yim = cimagf(symbols[i]); -#ifdef SCALE_DEMOD16QAM - - llr[4*i+2] = (fabsf(yre)-2/sqrt(10))*sqrt(10); - llr[4*i+3] = (fabsf(yim)-2/sqrt(10))*sqrt(10); - - if (llr[4*i+2] > 0) { - llr[4*i+0] = -yre/(3/sqrt(10)); - } else { - llr[4*i+0] = -yre/(1/sqrt(10)); - } - if (llr[4*i+3] > 0) { - llr[4*i+1] = -yim/(3/sqrt(10)); - } else { - llr[4*i+1] = -yim/(1/sqrt(10)); - } - -#else - llr[4*i+0] = -yre; llr[4*i+1] = -yim; llr[4*i+2] = fabsf(yre)-2/sqrt(10); llr[4*i+3] = fabsf(yim)-2/sqrt(10); - -#endif - } } -void demod_16qam_lte_s(const cf_t *symbols, short *llr, int nsymbols) { -#ifndef HAVE_SIMD - for (int i=0;iprecoding, q->symbols[0], q->ce[0], q->d, - q->nof_symbols, noise_estimate); + srslte_predecoding_single(q->symbols[0], q->ce[0], q->d, q->nof_symbols, noise_estimate); } else { srslte_predecoding_diversity(&q->precoding, q->symbols[0], q->ce, x, nant, q->nof_symbols, noise_estimate); diff --git a/srslte/lib/phch/src/pcfich.c b/srslte/lib/phch/src/pcfich.c index 317fc6602..d1a27ad92 100644 --- a/srslte/lib/phch/src/pcfich.c +++ b/srslte/lib/phch/src/pcfich.c @@ -193,8 +193,7 @@ int srslte_pcfich_decode(srslte_pcfich_t *q, cf_t *slot_symbols, cf_t *ce[SRSLTE /* in control channels, only diversity is supported */ if (q->cell.nof_ports == 1) { /* no need for layer demapping */ - srslte_predecoding_single(&q->precoding, q->symbols[0], q->ce[0], q->d, - q->nof_symbols, noise_estimate); + srslte_predecoding_single(q->symbols[0], q->ce[0], q->d, q->nof_symbols, noise_estimate); } else { srslte_predecoding_diversity(&q->precoding, q->symbols[0], ce_precoding, x, q->cell.nof_ports, q->nof_symbols, noise_estimate); diff --git a/srslte/lib/phch/src/pdcch.c b/srslte/lib/phch/src/pdcch.c index 7b5755a80..9f97a5013 100644 --- a/srslte/lib/phch/src/pdcch.c +++ b/srslte/lib/phch/src/pdcch.c @@ -408,7 +408,7 @@ int srslte_pdcch_extract_llr(srslte_pdcch_t *q, cf_t *sf_symbols, cf_t *ce[SRSLT /* in control channels, only diversity is supported */ if (q->cell.nof_ports == 1) { /* no need for layer demapping */ - srslte_predecoding_single(&q->precoding, q->symbols[0], q->ce[0], q->d, nof_symbols, noise_estimate); + srslte_predecoding_single(q->symbols[0], q->ce[0], q->d, nof_symbols, noise_estimate); } else { srslte_predecoding_diversity(&q->precoding, q->symbols[0], q->ce, x, q->cell.nof_ports, nof_symbols, noise_estimate); srslte_layerdemap_diversity(x, q->d, q->cell.nof_ports, nof_symbols / q->cell.nof_ports); diff --git a/srslte/lib/phch/src/pdsch.c b/srslte/lib/phch/src/pdsch.c index 17ed53c4f..30b8a9ea6 100644 --- a/srslte/lib/phch/src/pdsch.c +++ b/srslte/lib/phch/src/pdsch.c @@ -404,8 +404,7 @@ int srslte_pdsch_decode_rnti(srslte_pdsch_t *q, /* TODO: only diversity is supported */ if (q->cell.nof_ports == 1) { /* no need for layer demapping */ - srslte_predecoding_single(&q->precoding, q->symbols[0], q->ce[0], q->d, - cfg->nbits.nof_re, noise_estimate); + srslte_predecoding_single(q->symbols[0], q->ce[0], q->d, cfg->nbits.nof_re, noise_estimate); } else { srslte_predecoding_diversity(&q->precoding, q->symbols[0], q->ce, x, q->cell.nof_ports, cfg->nbits.nof_re, noise_estimate); diff --git a/srslte/lib/phch/src/phich.c b/srslte/lib/phch/src/phich.c index 7807f947a..dbc6e8250 100644 --- a/srslte/lib/phch/src/phich.c +++ b/srslte/lib/phch/src/phich.c @@ -216,8 +216,7 @@ int srslte_phich_decode(srslte_phich_t *q, cf_t *slot_symbols, cf_t *ce[SRSLTE_M /* in control channels, only diversity is supported */ if (q->cell.nof_ports == 1) { /* no need for layer demapping */ - srslte_predecoding_single(&q->precoding, q->symbols[0], q->ce[0], q->d0, - SRSLTE_PHICH_MAX_NSYMB, noise_estimate); + srslte_predecoding_single(q->symbols[0], q->ce[0], q->d0, SRSLTE_PHICH_MAX_NSYMB, noise_estimate); } else { srslte_predecoding_diversity(&q->precoding, q->symbols[0], ce_precoding, x, q->cell.nof_ports, SRSLTE_PHICH_MAX_NSYMB, noise_estimate); diff --git a/srslte/lib/phch/src/pusch.c b/srslte/lib/phch/src/pusch.c index 1416cfb78..8e165c490 100644 --- a/srslte/lib/phch/src/pusch.c +++ b/srslte/lib/phch/src/pusch.c @@ -438,8 +438,7 @@ int srslte_pusch_decode(srslte_pusch_t *q, return SRSLTE_ERROR; } - srslte_predecoding_single(&q->equalizer, q->d, q->ce, q->z, - cfg->nbits.nof_re, noise_estimate); + srslte_predecoding_single(q->d, q->ce, q->z, cfg->nbits.nof_re, noise_estimate); srslte_dft_predecoding(&q->dft_precoding, q->z, q->d, cfg->grant.L_prb, cfg->nbits.nof_symb); diff --git a/srslte/lib/phch/src/sch.c b/srslte/lib/phch/src/sch.c index 8a87b36d1..a0a0e44e5 100644 --- a/srslte/lib/phch/src/sch.c +++ b/srslte/lib/phch/src/sch.c @@ -111,7 +111,7 @@ int srslte_sch_init(srslte_sch_t *q) { fprintf(stderr, "Error initiating Turbo Coder\n"); goto clean; } - if (srslte_tdec_sse_init(&q->decoder, SRSLTE_TCOD_MAX_LEN_CB)) { + if (srslte_tdec_init(&q->decoder, SRSLTE_TCOD_MAX_LEN_CB)) { fprintf(stderr, "Error initiating Turbo Decoder\n"); goto clean; } @@ -133,7 +133,7 @@ int srslte_sch_init(srslte_sch_t *q) { goto clean; } bzero(q->temp_g_bits, SRSLTE_MAX_PRB*12*12*12); - q->ul_interleaver = srslte_vec_malloc(sizeof(uint32_t)*SRSLTE_MAX_PRB*12*12*12); + q->ul_interleaver = srslte_vec_malloc(sizeof(uint16_t)*SRSLTE_MAX_PRB*12*12*12); if (!q->ul_interleaver) { goto clean; } @@ -163,7 +163,7 @@ void srslte_sch_free(srslte_sch_t *q) { if (q->ul_interleaver) { free(q->ul_interleaver); } - srslte_tdec_sse_free(&q->decoder); + srslte_tdec_free(&q->decoder); srslte_tcod_free(&q->encoder); srslte_uci_cqi_free(&q->uci_cqi); bzero(q, sizeof(srslte_sch_t)); @@ -413,10 +413,10 @@ static int decode_tb(srslte_sch_t *q, srslte_crc_t *crc_ptr; early_stop = false; - srslte_tdec_sse_reset(&q->decoder, cb_len); + srslte_tdec_reset(&q->decoder, cb_len); do { - srslte_tdec_sse_iteration(&q->decoder, softbuffer->buffer_f[i], cb_len); + srslte_tdec_iteration(&q->decoder, softbuffer->buffer_f[i], cb_len); q->nof_iterations++; if (cb_segm->C > 1) { @@ -427,10 +427,10 @@ static int decode_tb(srslte_sch_t *q, crc_ptr = &q->crc_tb; } - srslte_tdec_sse_decision_byte(&q->decoder, q->cb_in, cb_len); + srslte_tdec_decision_byte(&q->decoder, q->cb_in, cb_len); if (i == 9) { - srslte_tdec_sse_decision(&q->decoder, q->temp_data, cb_len); + srslte_tdec_decision(&q->decoder, q->temp_data, cb_len); } /* Check Codeblock CRC and stop early if incorrect */ if (!srslte_crc_checksum_byte(crc_ptr, q->cb_in, len_crc)) { @@ -525,7 +525,7 @@ int srslte_ulsch_decode(srslte_sch_t *q, srslte_pusch_cfg_t *cfg, srslte_softbuf /* UL-SCH channel interleaver according to 5.2.2.8 of 36.212 */ void ulsch_interleave(uint8_t *g_bits, uint32_t Qm, uint32_t H_prime_total, uint32_t N_pusch_symbs, uint8_t *q_bits, srslte_uci_bit_t *ri_bits, uint32_t nof_ri_bits, - uint32_t *interleaver_buffer, uint8_t *temp_buffer, uint32_t buffer_sz) + uint16_t *interleaver_buffer, uint8_t *temp_buffer, uint32_t buffer_sz) { uint32_t rows = H_prime_total/N_pusch_symbs; diff --git a/srslte/lib/phch/test/pdsch_test.c b/srslte/lib/phch/test/pdsch_test.c index bb943ed04..e929143a7 100644 --- a/srslte/lib/phch/test/pdsch_test.c +++ b/srslte/lib/phch/test/pdsch_test.c @@ -147,24 +147,24 @@ int main(int argc, char **argv) { /* init memory */ for (i=0;i #include -#include +#ifdef LV_HAVE_SSE +#include +#include +#endif -void print128_num(__m128i var) -{ - int16_t *val = (int16_t*) &var;//can also use uint32_t instead of 16_t - printf("Numerical: %d %d %d %d %d %d %d %d \n", - val[0], val[1], val[2], val[3], val[4], val[5], - val[6], val[7]); -} void srslte_vec_sum_sss_simd(short *x, short *y, short *z, uint32_t len) { +#ifdef LV_HAVE_SSE unsigned int number = 0; const unsigned int points = len / 8; @@ -75,10 +72,13 @@ void srslte_vec_sum_sss_simd(short *x, short *y, short *z, uint32_t len) for(;number < len; number++){ z[number] = x[number] + y[number]; } +#endif + } void srslte_vec_sub_sss_simd(short *x, short *y, short *z, uint32_t len) { +#ifdef LV_HAVE_SSE unsigned int number = 0; const unsigned int points = len / 8; @@ -105,10 +105,12 @@ void srslte_vec_sub_sss_simd(short *x, short *y, short *z, uint32_t len) for(;number < len; number++){ z[number] = x[number] - y[number]; } +#endif } void srslte_vec_prod_sss_simd(short *x, short *y, short *z, uint32_t len) { +#ifdef LV_HAVE_SSE unsigned int number = 0; const unsigned int points = len / 8; @@ -135,10 +137,12 @@ void srslte_vec_prod_sss_simd(short *x, short *y, short *z, uint32_t len) for(;number < len; number++){ z[number] = x[number] * y[number]; } +#endif } void srslte_vec_sc_div2_sss_simd(short *x, int k, short *z, uint32_t len) { +#ifdef LV_HAVE_SSE unsigned int number = 0; const unsigned int points = len / 8; @@ -163,10 +167,13 @@ void srslte_vec_sc_div2_sss_simd(short *x, int k, short *z, uint32_t len) for(;number < len; number++){ z[number] = x[number] / divn; } +#endif } +/* No improvement with AVX */ void srslte_vec_lut_sss_simd(short *x, unsigned short *lut, short *y, uint32_t len) { +#ifdef LV_HAVE_SSE unsigned int number = 0; const unsigned int points = len / 8; @@ -192,12 +199,13 @@ void srslte_vec_lut_sss_simd(short *x, unsigned short *lut, short *y, uint32_t l for(;number < len; number++){ y[lut[number]] = x[number]; } - +#endif } -/* Modified from volk_32f_s32f_convert_16i_a_sse2. Removed clipping */ +/* Modified from volk_32f_s32f_convert_16i_a_simd2. Removed clipping */ void srslte_vec_convert_fi_simd(float *x, int16_t *z, float scale, uint32_t len) { +#ifdef LV_HAVE_SSE unsigned int number = 0; const unsigned int eighthPoints = len / 8; @@ -230,5 +238,5 @@ void srslte_vec_convert_fi_simd(float *x, int16_t *z, float scale, uint32_t len) for(; number < len; number++){ z[number] = (int16_t) (x[number] * scale); } - +#endif } \ No newline at end of file