From ad06998d91fafa4057770d642e6c01bd20a19644 Mon Sep 17 00:00:00 2001 From: ismagom Date: Sun, 11 Oct 2015 21:37:12 +0200 Subject: [PATCH] Turbo decoder working OK @ 100 Mbps --- CMakeLists.txt | 2 +- srslte/include/srslte/fec/tc_interl.h | 4 +- srslte/include/srslte/fec/turbodecoder.h | 24 +- srslte/include/srslte/fec/turbodecoder_vl.h | 100 ++++ srslte/include/srslte/utils/vector.h | 9 + srslte/include/srslte/utils/vector_simd.h | 50 ++ srslte/lib/fec/src/tc_interl_umts.c | 2 +- srslte/lib/fec/src/turbodecoder.c | 627 ++++++++++++++------ srslte/lib/fec/src/turbodecoder_vl.c | 393 ++++++++++++ srslte/lib/fec/test/turbodecoder_test.c | 157 ++--- srslte/lib/phch/src/sch.c | 1 + srslte/lib/phch/test/pdsch_test.c | 8 +- srslte/lib/utils/src/vector.c | 65 ++ srslte/lib/utils/src/vector_simd.c | 137 +++++ 14 files changed, 1286 insertions(+), 293 deletions(-) create mode 100644 srslte/include/srslte/fec/turbodecoder_vl.h create mode 100644 srslte/include/srslte/utils/vector_simd.h create mode 100644 srslte/lib/fec/src/turbodecoder_vl.c create mode 100644 srslte/lib/utils/src/vector_simd.c diff --git a/CMakeLists.txt b/CMakeLists.txt index fbc2d3727..e053d6a0c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -84,7 +84,7 @@ IF(CMAKE_COMPILER_IS_GNUCXX) ENDIF(CMAKE_COMPILER_IS_GNUCXX) IF(CMAKE_COMPILER_IS_GNUCC) - set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Wall -Wno-write-strings -Wno-format-extra-args -Winline -Wno-unused-result -Wno-format -std=c99 -D_GNU_SOURCE -g") + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Wall -Wno-write-strings -Wno-format-extra-args -Winline -Wno-unused-result -Wno-format -std=c99 -D_GNU_SOURCE -g -mfpmath=sse -mavx -O3") # IF(${CMAKE_BUILD_TYPE} STREQUAL "Debug") # set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Werror -Wno-error=implicit-function-declaration -Wno-error=unused-but-set-variable") # ENDIF(${CMAKE_BUILD_TYPE} STREQUAL "Debug") diff --git a/srslte/include/srslte/fec/tc_interl.h b/srslte/include/srslte/fec/tc_interl.h index 6d38523d6..022e14796 100644 
--- a/srslte/include/srslte/fec/tc_interl.h +++ b/srslte/include/srslte/fec/tc_interl.h @@ -40,8 +40,8 @@ #include typedef struct SRSLTE_API { - uint32_t *forward; - uint32_t *reverse; + uint16_t *forward; + uint16_t *reverse; uint32_t max_long_cb; } srslte_tc_interl_t; diff --git a/srslte/include/srslte/fec/turbodecoder.h b/srslte/include/srslte/fec/turbodecoder.h index 14ec74915..6a91b6178 100644 --- a/srslte/include/srslte/fec/turbodecoder.h +++ b/srslte/include/srslte/fec/turbodecoder.h @@ -50,11 +50,12 @@ #define SRSLTE_TCOD_MAX_LEN_CB 6144 #define SRSLTE_TCOD_MAX_LEN_CODED (SRSLTE_TCOD_RATE*SRSLTE_TCOD_MAX_LEN_CB+SRSLTE_TCOD_TOTALTAIL) -typedef float srslte_llr_t; +typedef short llr_t; typedef struct SRSLTE_API { int max_long_cb; - srslte_llr_t *beta; + llr_t *alpha; + llr_t *branch; } srslte_map_gen_t; typedef struct SRSLTE_API { @@ -62,14 +63,17 @@ typedef struct SRSLTE_API { srslte_map_gen_t dec; - srslte_llr_t *llr1; - srslte_llr_t *llr2; - srslte_llr_t *w; - srslte_llr_t *syst; - srslte_llr_t *parity; - + llr_t *app1; + llr_t *app2; + llr_t *ext1; + llr_t *ext2; + llr_t *syst; + llr_t *parity0; + llr_t *parity1; + int current_cbidx; srslte_tc_interl_t interleaver[SRSLTE_NOF_TC_CB_SIZES]; + int n_iter; } srslte_tdec_t; SRSLTE_API int srslte_tdec_init(srslte_tdec_t * h, @@ -80,7 +84,7 @@ SRSLTE_API void srslte_tdec_free(srslte_tdec_t * h); SRSLTE_API int srslte_tdec_reset(srslte_tdec_t * h, uint32_t long_cb); SRSLTE_API void srslte_tdec_iteration(srslte_tdec_t * h, - srslte_llr_t * input, + float * input, uint32_t long_cb); SRSLTE_API void srslte_tdec_decision(srslte_tdec_t * h, @@ -92,7 +96,7 @@ SRSLTE_API void srslte_tdec_decision_byte(srslte_tdec_t * h, uint32_t long_cb); SRSLTE_API int srslte_tdec_run_all(srslte_tdec_t * h, - srslte_llr_t * input, + float * input, uint8_t *output, uint32_t nof_iterations, uint32_t long_cb); diff --git a/srslte/include/srslte/fec/turbodecoder_vl.h b/srslte/include/srslte/fec/turbodecoder_vl.h new file mode 100644 index 
000000000..47043d9a0 --- /dev/null +++ b/srslte/include/srslte/fec/turbodecoder_vl.h @@ -0,0 +1,100 @@ +/** + * + * \section COPYRIGHT + * + * Copyright 2013-2015 The srsLTE Developers. See the + * COPYRIGHT file at the top-level directory of this distribution. + * + * \section LICENSE + * + * This file is part of the srsLTE library. + * + * srsLTE is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License as + * published by the Free Software Foundation, either version 3 of + * the License, or (at your option) any later version. + * + * srsLTE is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * A copy of the GNU Affero General Public License can be found in + * the LICENSE file in the top-level directory of this distribution + * and at http://www.gnu.org/licenses/. + * + */ + +/********************************************************************************************** + * File: turbodecoder_vl.h + * + * Description: Turbo Decoder. + * Parallel Concatenated Convolutional Code (PCCC) with two 8-state constituent + * encoders and one turbo code internal interleaver. The coding rate of turbo + * encoder is 1/3. + * MAP_GEN is the MAX-LOG-MAP generic implementation of the decoder. + * + * Reference: 3GPP TS 36.212 version 10.0.0 Release 10 Sec. 
5.1.3.2 + *********************************************************************************************/ + +#ifndef TURBODECODER_VL_ +#define TURBODECODER_VL_ + +#include "srslte/config.h" +#include "srslte/fec/tc_interl.h" +#include "srslte/fec/cbsegm.h" + +#define SRSLTE_TCOD_RATE 3 +#define SRSLTE_TCOD_TOTALTAIL 12 + +#define SRSLTE_TCOD_MAX_LEN_CB 6144 +#define SRSLTE_TCOD_MAX_LEN_CODED (SRSLTE_TCOD_RATE*SRSLTE_TCOD_MAX_LEN_CB+SRSLTE_TCOD_TOTALTAIL) + +typedef float srslte_llr_t; + +typedef struct SRSLTE_API { + int max_long_cb; + srslte_llr_t *beta; +} srslte_map_gen_vl_t; + +typedef struct SRSLTE_API { + int max_long_cb; + + srslte_map_gen_vl_t dec; + + srslte_llr_t *llr1; + srslte_llr_t *llr2; + srslte_llr_t *w; + srslte_llr_t *syst; + srslte_llr_t *parity; + + int current_cbidx; + srslte_tc_interl_t interleaver[SRSLTE_NOF_TC_CB_SIZES]; +} srslte_tdec_vl_t; + +SRSLTE_API int srslte_tdec_vl_init(srslte_tdec_vl_t * h, + uint32_t max_long_cb); + +SRSLTE_API void srslte_tdec_vl_free(srslte_tdec_vl_t * h); + +SRSLTE_API int srslte_tdec_vl_reset(srslte_tdec_vl_t * h, uint32_t long_cb); + +SRSLTE_API void srslte_tdec_vl_iteration(srslte_tdec_vl_t * h, + srslte_llr_t * input, + uint32_t long_cb); + +SRSLTE_API void srslte_tdec_vl_decision(srslte_tdec_vl_t * h, + uint8_t *output, + uint32_t long_cb); + +SRSLTE_API void srslte_tdec_vl_decision_byte(srslte_tdec_vl_t * h, + uint8_t *output, + uint32_t long_cb); + +SRSLTE_API int srslte_tdec_vl_run_all(srslte_tdec_vl_t * h, + srslte_llr_t * input, + uint8_t *output, + uint32_t nof_iterations, + uint32_t long_cb); + +#endif diff --git a/srslte/include/srslte/utils/vector.h b/srslte/include/srslte/utils/vector.h index 81cb66bb6..1cde74cd6 100644 --- a/srslte/include/srslte/utils/vector.h +++ b/srslte/include/srslte/utils/vector.h @@ -69,6 +69,7 @@ SRSLTE_API void srslte_vec_fprint_f(FILE *stream, float *x, uint32_t len); SRSLTE_API void srslte_vec_fprint_b(FILE *stream, uint8_t *x, uint32_t len); SRSLTE_API void 
srslte_vec_fprint_byte(FILE *stream, uint8_t *x, uint32_t len); SRSLTE_API void srslte_vec_fprint_i(FILE *stream, int *x, uint32_t len); +SRSLTE_API void srslte_vec_fprint_s(FILE *stream, short *x, uint32_t len); SRSLTE_API void srslte_vec_fprint_hex(FILE *stream, uint8_t *x, uint32_t len); /* Saves/loads a vector to a file */ @@ -79,6 +80,8 @@ SRSLTE_API void srslte_vec_load_file(char *filename, void *buffer, uint32_t len) SRSLTE_API void srslte_vec_sum_ch(uint8_t *x, uint8_t *y, char *z, uint32_t len); SRSLTE_API void srslte_vec_sum_fff(float *x, float *y, float *z, uint32_t len); SRSLTE_API void srslte_vec_sum_ccc(cf_t *x, cf_t *y, cf_t *z, uint32_t len); +SRSLTE_API void srslte_vec_sub_sss(short *x, short *y, short *z, uint32_t len); +SRSLTE_API void srslte_vec_sum_sss(short *x, short *y, short *z, uint32_t len); /* substract two vectors z=x-y */ SRSLTE_API void srslte_vec_sub_fff(float *x, float *y, float *z, uint32_t len); @@ -99,12 +102,18 @@ SRSLTE_API void srslte_vec_sc_add_ccc(cf_t *x, cf_t h, cf_t *z, uint32_t len); SRSLTE_API void srslte_vec_sc_prod_cfc(cf_t *x, float h, cf_t *z, uint32_t len); SRSLTE_API void srslte_vec_sc_prod_ccc(cf_t *x, cf_t h, cf_t *z, uint32_t len); SRSLTE_API void srslte_vec_sc_prod_fff(float *x, float h, float *z, uint32_t len); +SRSLTE_API void srslte_vec_sc_prod_sfs(short *x, float h, short *z, uint32_t len); +SRSLTE_API void srslte_vec_sc_div2_sss(short *x, int pow2_div, short *z, uint32_t len); /* Normalization */ SRSLTE_API void srslte_vec_norm_cfc(cf_t *x, float amplitude, cf_t *y, uint32_t len); SRSLTE_API void srslte_vec_convert_fi(float *x, int16_t *z, float scale, uint32_t len); + +SRSLTE_API void srslte_vec_lut_fuf(float *x, uint32_t *lut, float *y, uint32_t len); +SRSLTE_API void srslte_vec_lut_sss(short *x, unsigned short *lut, short *y, uint32_t len); + SRSLTE_API void srslte_vec_deinterleave_cf(cf_t *x, float *real, float *imag, uint32_t len); SRSLTE_API void srslte_vec_deinterleave_real_cf(cf_t *x, float *real, 
uint32_t len); diff --git a/srslte/include/srslte/utils/vector_simd.h b/srslte/include/srslte/utils/vector_simd.h new file mode 100644 index 000000000..609ff3413 --- /dev/null +++ b/srslte/include/srslte/utils/vector_simd.h @@ -0,0 +1,50 @@ +/** + * + * \section COPYRIGHT + * + * Copyright 2013-2015 The srsLTE Developers. See the + * COPYRIGHT file at the top-level directory of this distribution. + * + * \section LICENSE + * + * This file is part of the srsLTE library. + * + * srsLTE is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License as + * published by the Free Software Foundation, either version 3 of + * the License, or (at your option) any later version. + * + * srsLTE is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * A copy of the GNU Affero General Public License can be found in + * the LICENSE file in the top-level directory of this distribution + * and at http://www.gnu.org/licenses/. 
+ * + */ + +#ifndef VECTORSIMD_ +#define VECTORSIMD_ + +#ifdef __cplusplus +extern "C" { +#endif + +#include +#include +#include "srslte/config.h" + + +SRSLTE_API void srslte_vec_sum_sss_simd(short *x, short *y, short *z, uint32_t len); + +SRSLTE_API void srslte_vec_sub_sss_simd(short *x, short *y, short *z, uint32_t len); + +SRSLTE_API void srslte_vec_sc_div2_sss_simd(short *x, int n_rightshift, short *z, uint32_t len); + +#ifdef __cplusplus +} +#endif + +#endif \ No newline at end of file diff --git a/srslte/lib/fec/src/tc_interl_umts.c b/srslte/lib/fec/src/tc_interl_umts.c index c84b5c7e5..6e47b1114 100644 --- a/srslte/lib/fec/src/tc_interl_umts.c +++ b/srslte/lib/fec/src/tc_interl_umts.c @@ -89,7 +89,7 @@ int srslte_tc_interl_UMTS_gen(srslte_tc_interl_t *h, uint32_t long_cb) { uint32_t i, j; uint32_t res, prim, aux; uint32_t kp, k; - uint32_t *per, *desper; + uint16_t *per, *desper; uint8_t v; uint16_t p; uint16_t s[MAX_COLS], q[MAX_ROWS], r[MAX_ROWS], T[MAX_ROWS]; diff --git a/srslte/lib/fec/src/turbodecoder.c b/srslte/lib/fec/src/turbodecoder.c index c702df6e9..6e3bd9bf7 100644 --- a/srslte/lib/fec/src/turbodecoder.c +++ b/srslte/lib/fec/src/turbodecoder.c @@ -35,139 +35,272 @@ #include "srslte/fec/turbodecoder.h" #include "srslte/utils/vector.h" +#include + +#include +#include + #define NUMSTATES 8 #define NINPUTS 2 #define TAIL 3 #define TOTALTAIL 12 -#define INF 9e4 -#define ZERO 9e-4 +#define INF 10000 +#define ZERO 0 +#define SCALE 100 + +static void print128_num(__m128i var) +{ + int16_t *val = (int16_t*) &var;//can also use uint32_t instead of 16_t + printf("[%d %d %d %d %d %d %d %d]\n", + val[0], val[1], val[2], val[3], val[4], val[5], + val[6], val[7]); +} + +void print128f_num(__m128 var) +{ + float *val = (float*) &var; + printf("[%f %f %f %f]\n", + val[0], val[1], val[2], val[3]); +} + /************************************************ * - * MAP_GEN is the MAX-LOG-MAP generic implementation of the - * Decoder + * MAP_GEN is the MAX-LOG-MAP generic 
implementation * ************************************************/ -void srslte_map_gen_beta(srslte_map_gen_t * s, srslte_llr_t * input, srslte_llr_t * parity, - uint32_t long_cb) + +static inline int16_t hMax(__m128i buffer) { - srslte_llr_t m_b[8], new[8], old[8]; - srslte_llr_t x, y, xy; - int k; - uint32_t end = long_cb + SRSLTE_TCOD_RATE; - srslte_llr_t *beta = s->beta; - uint32_t i; + __m128i tmp1 = _mm_sub_epi8(_mm_set1_epi16(0x7FFF), buffer); + __m128i tmp3 = _mm_minpos_epu16(tmp1); + return (int16_t)(_mm_cvtsi128_si32(tmp3)); +} - for (i = 0; i < 8; i++) { - old[i] = beta[8 * (end) + i]; - } - - for (k = end - 1; k >= 0; k--) { - x = input[k]; - y = parity[k]; - - xy = x + y; - - m_b[0] = old[4] + xy; - m_b[1] = old[4]; - m_b[2] = old[5] + y; - m_b[3] = old[5] + x; - m_b[4] = old[6] + x; - m_b[5] = old[6] + y; - m_b[6] = old[7]; - m_b[7] = old[7] + xy; - - new[0] = old[0]; - new[1] = old[0] + xy; - new[2] = old[1] + x; - new[3] = old[1] + y; - new[4] = old[2] + y; - new[5] = old[2] + x; - new[6] = old[3] + xy; - new[7] = old[3]; - - for (i = 0; i < 8; i++) { - if (m_b[i] > new[i]) - new[i] = m_b[i]; - beta[8 * k + i] = new[i]; - old[i] = new[i]; - } - } +void srslte_map_gen_beta(srslte_map_gen_t * s, llr_t * output, uint32_t long_cb) +{ + int k; + uint32_t end = long_cb + 3; + const __m128i *alphaPtr = (const __m128i*) s->alpha; + + __m128i beta_k = _mm_set_epi16(-INF, -INF, -INF, -INF, -INF, -INF, -INF, 0); + __m128i g, bp, bn, alpha_k; + + __m128i shuf_bp = _mm_set_epi8( + 15, 14, // 7 + 7, 6, // 3 + 5, 4, // 2 + 13, 12, // 6 + 11, 10, // 5 + 3, 2, // 1 + 1, 0, // 0 + 9, 8 // 4 + ); + + __m128i shuf_bn = _mm_set_epi8( + 7, 6, // 3 + 15, 14, // 7 + 13, 12, // 6 + 5, 4, // 2 + 3, 2, // 1 + 11, 10, // 5 + 9, 8, // 4 + 1, 0 // 0 + ); + + alphaPtr += long_cb-1; + + __m128i shuf_g[4]; + shuf_g[3] = _mm_set_epi8(3,2,1,0,1,0,3,2,3,2,1,0,1,0,3,2); + shuf_g[2] = _mm_set_epi8(7,6,5,4,5,4,7,6,7,6,5,4,5,4,7,6); + shuf_g[1] = 
_mm_set_epi8(11,10,9,8,9,8,11,10,11,10,9,8,9,8,11,10); + shuf_g[0] = _mm_set_epi8(15,14,13,12,13,12,15,14,15,14,13,12,13,12,15,14); + __m128i gv; + llr_t *b = &s->branch[2*long_cb-8]; + __m128i *gPtr = (__m128i*) b; + __m128i shuf_norm = _mm_set_epi8(1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0); + +#define BETA_STEP(g) bp = _mm_add_epi16(beta_k, g);\ + bn = _mm_sub_epi16(beta_k, g);\ + bp = _mm_shuffle_epi8(bp, shuf_bp);\ + bn = _mm_shuffle_epi8(bn, shuf_bn);\ + beta_k = _mm_max_epi16(bp, bn); + +#define BETA_STEP_CNT(c,d) g = _mm_shuffle_epi8(gv, shuf_g[c]);\ + BETA_STEP(g)\ + alpha_k = _mm_load_si128(alphaPtr);\ + alphaPtr--;\ + bp = _mm_add_epi16(bp, alpha_k);\ + bn = _mm_add_epi16(bn, alpha_k); output[k-d] = hMax(bn) - hMax(bp); + + for (k=end-1; k>=long_cb; k--) { + llr_t g0 = s->branch[2*k]; + llr_t g1 = s->branch[2*k+1]; + g = _mm_set_epi16(g1, g0, g0, g1, g1, g0, g0, g1); + + BETA_STEP(g); + } + + for (; k >= 0; k-=8) { + gv = _mm_load_si128(gPtr); + gPtr--; + BETA_STEP_CNT(0,0); + BETA_STEP_CNT(1,1); + BETA_STEP_CNT(2,2); + BETA_STEP_CNT(3,3); + gv = _mm_load_si128(gPtr); + gPtr--; + BETA_STEP_CNT(0,4); + BETA_STEP_CNT(1,5); + BETA_STEP_CNT(2,6); + BETA_STEP_CNT(3,7); + __m128i norm = _mm_shuffle_epi8(beta_k, shuf_norm); + beta_k = _mm_sub_epi16(beta_k, norm); + } } -void srslte_map_gen_alpha(srslte_map_gen_t * s, srslte_llr_t * input, srslte_llr_t * parity, srslte_llr_t * output, - uint32_t long_cb) +void srslte_map_gen_alpha(srslte_map_gen_t * s, uint32_t long_cb) { - srslte_llr_t m_b[8], new[8], old[8], max1[8], max0[8]; - srslte_llr_t m1, m0; - srslte_llr_t x, y, xy; - srslte_llr_t out; uint32_t k; - uint32_t end = long_cb; - srslte_llr_t *beta = s->beta; + llr_t *alpha = s->alpha; uint32_t i; - old[0] = 0; + alpha[0] = 0; for (i = 1; i < 8; i++) { - old[i] = -INF; - } - - for (k = 1; k < end + 1; k++) { - x = input[k - 1]; - y = parity[k - 1]; - - xy = x + y; - - m_b[0] = old[0]; - m_b[1] = old[3] + y; - m_b[2] = old[4] + y; - m_b[3] = old[7]; - m_b[4] = old[1]; 
- m_b[5] = old[2] + y; - m_b[6] = old[5] + y; - m_b[7] = old[6]; - - new[0] = old[1] + xy; - new[1] = old[2] + x; - new[2] = old[5] + x; - new[3] = old[6] + xy; - new[4] = old[0] + xy; - new[5] = old[3] + x; - new[6] = old[4] + x; - new[7] = old[7] + xy; - - for (i = 0; i < 8; i++) { - max0[i] = m_b[i] + beta[8 * k + i]; - max1[i] = new[i] + beta[8 * k + i]; - } + alpha[i] = -INF; + } + + __m128i shuf_ap = _mm_set_epi8( + 15, 14, // 7 + 9, 8, // 4 + 7, 6, // 3 + 1, 0, // 0 + 13, 12, // 6 + 11, 10, // 5 + 5, 4, // 2 + 3, 2 // 1 + ); + + __m128i shuf_an = _mm_set_epi8( + 13, 12, // 6 + 11, 10, // 5 + 5, 4, // 2 + 3, 2, // 1 + 15, 14, // 7 + 9, 8, // 4 + 7, 6, // 3 + 1, 0 // 0 + ); + + __m128i shuf_g[4]; + shuf_g[0] = _mm_set_epi8(3,2,3,2,1,0,1,0,1,0,1,0,3,2,3,2); + shuf_g[1] = _mm_set_epi8(7,6,7,6,5,4,5,4,5,4,5,4,7,6,7,6); + shuf_g[2] = _mm_set_epi8(11,10,11,10,9,8,9,8,9,8,9,8,11,10,11,10); + shuf_g[3] = _mm_set_epi8(15,14,15,14,13,12,13,12,13,12,13,12,15,14,15,14); - m1 = max1[0]; - m0 = max0[0]; + __m128i shuf_norm = _mm_set_epi8(1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0); + + __m128i* alphaPtr = (__m128i*) alpha; + alphaPtr++; - for (i = 1; i < 8; i++) { - if (max1[i] > m1) - m1 = max1[i]; - if (max0[i] > m0) - m0 = max0[i]; - } + __m128i gv; + __m128i *gPtr = (__m128i*) s->branch; + __m128i g, ap, an; + + __m128i alpha_k = _mm_set_epi16(-INF, -INF, -INF, -INF, -INF, -INF, -INF, 0); + +#define ALPHA_STEP(c) g = _mm_shuffle_epi8(gv, shuf_g[c]); \ + ap = _mm_add_epi16(alpha_k, g);\ + an = _mm_sub_epi16(alpha_k, g);\ + ap = _mm_shuffle_epi8(ap, shuf_ap);\ + an = _mm_shuffle_epi8(an, shuf_an);\ + alpha_k = _mm_max_epi16(ap, an);\ + _mm_store_si128(alphaPtr, alpha_k);\ + alphaPtr++; \ + + for (k = 0; k < long_cb/8; k++) { + gv = _mm_load_si128(gPtr); + gPtr++; + ALPHA_STEP(0); + ALPHA_STEP(1); + ALPHA_STEP(2); + ALPHA_STEP(3); + gv = _mm_load_si128(gPtr); + gPtr++; + ALPHA_STEP(0); + ALPHA_STEP(1); + ALPHA_STEP(2); + ALPHA_STEP(3); + __m128i norm = _mm_shuffle_epi8(alpha_k, 
shuf_norm); + alpha_k = _mm_sub_epi16(alpha_k, norm); + } +} + +void srslte_map_gen_gamma(srslte_map_gen_t * h, llr_t *input, llr_t *app, llr_t *parity, uint32_t long_cb) +{ + __m128i res10, res20, res11, res21, res1, res2; + __m128i in, ap, pa, g1, g0; - for (i = 0; i < 8; i++) { - if (m_b[i] > new[i]) - new[i] = m_b[i]; - old[i] = new[i]; + __m128i *inPtr = (__m128i*) input; + __m128i *appPtr = (__m128i*) app; + __m128i *paPtr = (__m128i*) parity; + __m128i *resPtr = (__m128i*) h->branch; + + __m128i res10_mask = _mm_set_epi8(0xff,0xff,7,6,0xff,0xff,5,4,0xff,0xff,3,2,0xff,0xff,1,0); + __m128i res20_mask = _mm_set_epi8(0xff,0xff,15,14,0xff,0xff,13,12,0xff,0xff,11,10,0xff,0xff,9,8); + __m128i res11_mask = _mm_set_epi8(7,6,0xff,0xff,5,4,0xff,0xff,3,2,0xff,0xff,1,0,0xff,0xff); + __m128i res21_mask = _mm_set_epi8(15,14,0xff,0xff,13,12,0xff,0xff,11,10,0xff,0xff,9,8,0xff,0xff); + + for (int i=0;ibranch[2*i] = (input[i] - parity[i])/2; + h->branch[2*i+1] = (input[i] + parity[i])/2; } } + int srslte_map_gen_init(srslte_map_gen_t * h, int max_long_cb) { bzero(h, sizeof(srslte_map_gen_t)); - h->beta = srslte_vec_malloc(sizeof(srslte_llr_t) * (max_long_cb + SRSLTE_TCOD_TOTALTAIL + 1) * NUMSTATES); - if (!h->beta) { + h->alpha = srslte_vec_malloc(sizeof(llr_t) * (max_long_cb + SRSLTE_TCOD_TOTALTAIL + 1) * NUMSTATES); + if (!h->alpha) { + perror("srslte_vec_malloc"); + return -1; + } + h->branch = srslte_vec_malloc(sizeof(llr_t) * (max_long_cb + SRSLTE_TCOD_TOTALTAIL + 1) * NUMSTATES); + if (!h->branch) { perror("srslte_vec_malloc"); return -1; } @@ -177,23 +310,28 @@ int srslte_map_gen_init(srslte_map_gen_t * h, int max_long_cb) void srslte_map_gen_free(srslte_map_gen_t * h) { - if (h->beta) { - free(h->beta); + if (h->alpha) { + free(h->alpha); + } + if (h->branch) { + free(h->branch); } bzero(h, sizeof(srslte_map_gen_t)); } -void srslte_map_gen_dec(srslte_map_gen_t * h, srslte_llr_t * input, srslte_llr_t * parity, srslte_llr_t * output, +void 
srslte_map_gen_dec(srslte_map_gen_t * h, llr_t * input, llr_t *app, llr_t * parity, llr_t * output, uint32_t long_cb) { - uint32_t k; + + // Compute branch metrics + srslte_map_gen_gamma(h, input, app, parity, long_cb); - h->beta[(long_cb + TAIL) * NUMSTATES] = 0; - for (k = 1; k < NUMSTATES; k++) - h->beta[(long_cb + TAIL) * NUMSTATES + k] = -INF; + // Forward recursion + srslte_map_gen_alpha(h, long_cb); - srslte_map_gen_beta(h, input, parity, long_cb); - srslte_map_gen_alpha(h, input, parity, output, long_cb); + // Backwards recursion + LLR computation + srslte_map_gen_beta(h, output, long_cb); + } /************************************************ @@ -209,28 +347,38 @@ int srslte_tdec_init(srslte_tdec_t * h, uint32_t max_long_cb) h->max_long_cb = max_long_cb; - h->llr1 = srslte_vec_malloc(sizeof(srslte_llr_t) * len); - if (!h->llr1) { + h->app1 = srslte_vec_malloc(sizeof(llr_t) * len); + if (!h->app1) { perror("srslte_vec_malloc"); goto clean_and_exit; } - h->llr2 = srslte_vec_malloc(sizeof(srslte_llr_t) * len); - if (!h->llr2) { + h->app2 = srslte_vec_malloc(sizeof(llr_t) * len); + if (!h->app2) { perror("srslte_vec_malloc"); goto clean_and_exit; } - h->w = srslte_vec_malloc(sizeof(srslte_llr_t) * len); - if (!h->w) { + h->ext1 = srslte_vec_malloc(sizeof(llr_t) * len); + if (!h->ext1) { perror("srslte_vec_malloc"); goto clean_and_exit; } - h->syst = srslte_vec_malloc(sizeof(srslte_llr_t) * len); + h->ext2 = srslte_vec_malloc(sizeof(llr_t) * len); + if (!h->ext2) { + perror("srslte_vec_malloc"); + goto clean_and_exit; + } + h->syst = srslte_vec_malloc(sizeof(llr_t) * len); if (!h->syst) { perror("srslte_vec_malloc"); goto clean_and_exit; } - h->parity = srslte_vec_malloc(sizeof(srslte_llr_t) * len); - if (!h->parity) { + h->parity0 = srslte_vec_malloc(sizeof(llr_t) * len); + if (!h->parity0) { + perror("srslte_vec_malloc"); + goto clean_and_exit; + } + h->parity1 = srslte_vec_malloc(sizeof(llr_t) * len); + if (!h->parity1) { perror("srslte_vec_malloc"); goto 
clean_and_exit; } @@ -255,20 +403,26 @@ clean_and_exit:if (ret == -1) { void srslte_tdec_free(srslte_tdec_t * h) { - if (h->llr1) { - free(h->llr1); + if (h->app1) { + free(h->app1); } - if (h->llr2) { - free(h->llr2); + if (h->app2) { + free(h->app2); } - if (h->w) { - free(h->w); + if (h->ext1) { + free(h->ext1); + } + if (h->ext2) { + free(h->ext2); } if (h->syst) { free(h->syst); } - if (h->parity) { - free(h->parity); + if (h->parity0) { + free(h->parity0); + } + if (h->parity1) { + free(h->parity1); } srslte_map_gen_free(&h->dec); @@ -280,48 +434,145 @@ void srslte_tdec_free(srslte_tdec_t * h) bzero(h, sizeof(srslte_tdec_t)); } -void srslte_tdec_iteration(srslte_tdec_t * h, srslte_llr_t * input, uint32_t long_cb) -{ +void deinterleave_input(srslte_tdec_t *h, float *input, uint32_t long_cb) { uint32_t i; + + float *inputPtr = input; + __m128 inf0, inf1, inf2, inf3, inf4, inf5; + __m128i in0, in1, in2; + __m128i s0, s1, s2, s; + __m128i p00, p01, p02, p0; + __m128i p10, p11, p12, p1; + + __m128i *sysPtr = (__m128i*) h->syst; + __m128i *pa0Ptr = (__m128i*) h->parity0; + __m128i *pa1Ptr = (__m128i*) h->parity1; + + // pick bits 0, 3, 6 from 1st word + __m128i s0_mask = _mm_set_epi8(0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,13,12,7,6,1,0); + // pick bits 1, 4, 7 from 2st word + __m128i s1_mask = _mm_set_epi8(0xff,0xff,0xff,0xff,15,14,9,8,3,2,0xff,0xff,0xff,0xff,0xff,0xff); + // pick bits 2, 5 from 3rd word + __m128i s2_mask = _mm_set_epi8(11,10,5,4,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff); + + // pick bits 1, 4, 7 from 1st word + __m128i p00_mask = _mm_set_epi8(0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,15,14,9,8,3,2); + // pick bits 2, 5, from 2st word + __m128i p01_mask = _mm_set_epi8(0xff,0xff,0xff,0xff,0xff,0xff,11,10,5,4,0xff,0xff,0xff,0xff,0xff,0xff); + // pick bits 0, 3, 6 from 3rd word + __m128i p02_mask = _mm_set_epi8(13,12,7,6,1,0,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff); + + // pick bits 2, 5 from 1st word + 
__m128i p10_mask = _mm_set_epi8(0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,11,10,5,4); + // pick bits 0, 3, 6, from 2st word + __m128i p11_mask = _mm_set_epi8(0xff,0xff,0xff,0xff,0xff,0xff,13,12,7,6,1,0,0xff,0xff,0xff,0xff); + // pick bits 1, 4, 7 from 3rd word + __m128i p12_mask = _mm_set_epi8(15,14,9,8,3,2,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff); + + __m128 vScalar = _mm_set1_ps(SCALE); + + // Split systematic and parity bits + for (i = 0; i < long_cb/8; i++) { + + inf0 = _mm_load_ps(inputPtr); inputPtr+=4; + inf1 = _mm_load_ps(inputPtr); inputPtr+=4; + inf2 = _mm_load_ps(inputPtr); inputPtr+=4; + inf3 = _mm_load_ps(inputPtr); inputPtr+=4; + inf4 = _mm_load_ps(inputPtr); inputPtr+=4; + inf5 = _mm_load_ps(inputPtr); inputPtr+=4; + + inf0 = _mm_mul_ps(inf0, vScalar); + inf1 = _mm_mul_ps(inf1, vScalar); + inf2 = _mm_mul_ps(inf2, vScalar); + inf3 = _mm_mul_ps(inf3, vScalar); + inf4 = _mm_mul_ps(inf4, vScalar); + inf5 = _mm_mul_ps(inf5, vScalar); + + in0 = _mm_packs_epi32(_mm_cvtps_epi32(inf0), _mm_cvtps_epi32(inf1)); + in1 = _mm_packs_epi32(_mm_cvtps_epi32(inf2), _mm_cvtps_epi32(inf3)); + in2 = _mm_packs_epi32(_mm_cvtps_epi32(inf4), _mm_cvtps_epi32(inf5)); + + /* Deinterleave Systematic bits */ + s0 = _mm_shuffle_epi8(in0, s0_mask); + s1 = _mm_shuffle_epi8(in1, s1_mask); + s2 = _mm_shuffle_epi8(in2, s2_mask); + s = _mm_or_si128(s0, s1); + s = _mm_or_si128(s, s2); + + _mm_store_si128(sysPtr, s); + sysPtr++; + + /* Deinterleave parity 0 bits */ + p00 = _mm_shuffle_epi8(in0, p00_mask); + p01 = _mm_shuffle_epi8(in1, p01_mask); + p02 = _mm_shuffle_epi8(in2, p02_mask); + p0 = _mm_or_si128(p00, p01); + p0 = _mm_or_si128(p0, p02); + + _mm_store_si128(pa0Ptr, p0); + pa0Ptr++; - if (h->current_cbidx >= 0) { + /* Deinterleave parity 1 bits */ + p10 = _mm_shuffle_epi8(in0, p10_mask); + p11 = _mm_shuffle_epi8(in1, p11_mask); + p12 = _mm_shuffle_epi8(in2, p12_mask); + p1 = _mm_or_si128(p10, p11); + p1 = _mm_or_si128(p1, p12); + + _mm_store_si128(pa1Ptr, 
p1); + pa1Ptr++; + + } + + for (i = 0; i < 3; i++) { + h->syst[i+long_cb] = (llr_t) SCALE*input[3*long_cb + 2*i]; + h->parity0[i+long_cb] = (llr_t) SCALE*input[3*long_cb + 2*i + 1]; + } + for (i = 0; i < 3; i++) { + h->app2[i+long_cb] = (llr_t) SCALE*input[3*long_cb + 6 + 2*i]; + h->parity1[i+long_cb] = (llr_t) SCALE*input[3*long_cb + 6 + 2*i + 1]; + } - uint32_t *inter = h->interleaver[h->current_cbidx].forward; - uint32_t *deinter = h->interleaver[h->current_cbidx].reverse; +} + +void srslte_tdec_iteration(srslte_tdec_t * h, float * input, uint32_t long_cb) +{ + + if (h->current_cbidx >= 0) { + uint16_t *inter = h->interleaver[h->current_cbidx].forward; + uint16_t *deinter = h->interleaver[h->current_cbidx].reverse; - // Prepare systematic and parity bits for MAP DEC #1 - for (i = 0; i < long_cb; i++) { - h->syst[i] = input[SRSLTE_TCOD_RATE * i] + h->w[i]; - h->parity[i] = input[SRSLTE_TCOD_RATE * i + 1]; + if (h->n_iter == 0) { + deinterleave_input(h, input, long_cb); } - for (i = long_cb; i < long_cb + SRSLTE_TCOD_RATE; i++) { - h->syst[i] = input[SRSLTE_TCOD_RATE * long_cb + NINPUTS * (i - long_cb)]; - h->parity[i] = input[SRSLTE_TCOD_RATE * long_cb + NINPUTS * (i - long_cb) + 1]; + + // Add apriori information to decoder 1 + if (h->n_iter > 0) { + srslte_vec_sub_sss(h->app1, h->ext1, h->app1, long_cb); } - + // Run MAP DEC #1 - srslte_map_gen_dec(&h->dec, h->syst, h->parity, h->llr1, long_cb); - - // Prepare systematic and parity bits for MAP DEC #1 - for (i = 0; i < long_cb; i++) { - h->syst[i] = h->llr1[inter[i]] - - h->w[inter[i]]; - h->parity[i] = input[SRSLTE_TCOD_RATE * i + 2]; - } - for (i = long_cb; i < long_cb + SRSLTE_TCOD_RATE; i++) { - h->syst[i] = - input[SRSLTE_TCOD_RATE * long_cb + NINPUTS * SRSLTE_TCOD_RATE + NINPUTS * (i - long_cb)]; - h->parity[i] = input[SRSLTE_TCOD_RATE * long_cb + NINPUTS * SRSLTE_TCOD_RATE - + NINPUTS * (i - long_cb) + 1]; + if (h->n_iter == 0) { + srslte_map_gen_dec(&h->dec, h->syst, NULL, h->parity0, h->ext1, long_cb); 
+ } else { + srslte_map_gen_dec(&h->dec, h->syst, h->app1, h->parity0, h->ext1, long_cb); } - // Run MAP DEC #1 - srslte_map_gen_dec(&h->dec, h->syst, h->parity, h->llr2, long_cb); - - // Update a-priori LLR from the last iteration - for (i = 0; i < long_cb; i++) { - h->w[i] += h->llr2[deinter[i]] - h->llr1[i]; + // Convert aposteriori information into extrinsic information + if (h->n_iter > 0) { + srslte_vec_sub_sss(h->ext1, h->app1, h->ext1, long_cb); } + + // Interleave extrinsic output of DEC1 to form apriori info for decoder 2 + srslte_vec_lut_sss(h->ext1, inter, h->app2, long_cb); + + // Run MAP DEC #2. 2nd decoder uses apriori information as systematic bits + srslte_map_gen_dec(&h->dec, h->app2, NULL, h->parity1, h->ext2, long_cb); + + // Deinterleaved extrinsic bits become apriori info for decoder 1 + srslte_vec_lut_sss(h->ext2, deinter, h->app1, long_cb); + + h->n_iter++; } else { fprintf(stderr, "Error CB index not set (call srslte_tdec_reset() first\n"); } @@ -334,7 +585,7 @@ int srslte_tdec_reset(srslte_tdec_t * h, uint32_t long_cb) h->max_long_cb); return -1; } - memset(h->w, 0, sizeof(srslte_llr_t) * long_cb); + h->n_iter = 0; h->current_cbidx = srslte_cbsegm_cbindex(long_cb); if (h->current_cbidx < 0) { fprintf(stderr, "Invalid CB length %d\n", long_cb); @@ -345,47 +596,59 @@ int srslte_tdec_reset(srslte_tdec_t * h, uint32_t long_cb) void srslte_tdec_decision(srslte_tdec_t * h, uint8_t *output, uint32_t long_cb) { - uint32_t *deinter = h->interleaver[h->current_cbidx].reverse; - uint32_t i; - for (i = 0; i < long_cb; i++) { - output[i] = (h->llr2[deinter[i]] > 0) ? 
1 : 0; + __m128i zero = _mm_set1_epi16(0); + __m128i lsb_mask = _mm_set1_epi16(1); + + __m128i *appPtr = (__m128i*) h->app1; + __m128i *outPtr = (__m128i*) output; + __m128i ap, out, out0, out1; + + for (uint32_t i = 0; i < long_cb/16; i++) { + ap = _mm_load_si128(appPtr); appPtr++; + out0 = _mm_and_si128(_mm_cmpgt_epi16(ap, zero), lsb_mask); + ap = _mm_load_si128(appPtr); appPtr++; + out1 = _mm_and_si128(_mm_cmpgt_epi16(ap, zero), lsb_mask); + + out = _mm_packs_epi16(out0, out1); + _mm_store_si128(outPtr, out); + outPtr++; + } + if (long_cb%16) { + for (int i=0;i<8;i++) { + output[long_cb-8+i] = h->app1[long_cb-8+i]>0?1:0; + } } } void srslte_tdec_decision_byte(srslte_tdec_t * h, uint8_t *output, uint32_t long_cb) { - uint32_t i; uint8_t mask[8] = {0x80, 0x40, 0x20, 0x10, 0x8, 0x4, 0x2, 0x1}; - uint32_t *deinter = h->interleaver[h->current_cbidx].reverse; // long_cb is always byte aligned - for (i = 0; i < long_cb/8; i++) { - uint8_t out0 = h->llr2[deinter[8*i+0]]>0?mask[0]:0; - uint8_t out1 = h->llr2[deinter[8*i+1]]>0?mask[1]:0; - uint8_t out2 = h->llr2[deinter[8*i+2]]>0?mask[2]:0; - uint8_t out3 = h->llr2[deinter[8*i+3]]>0?mask[3]:0; - uint8_t out4 = h->llr2[deinter[8*i+4]]>0?mask[4]:0; - uint8_t out5 = h->llr2[deinter[8*i+5]]>0?mask[5]:0; - uint8_t out6 = h->llr2[deinter[8*i+6]]>0?mask[6]:0; - uint8_t out7 = h->llr2[deinter[8*i+7]]>0?mask[7]:0; + for (uint32_t i = 0; i < long_cb/8; i++) { + uint8_t out0 = h->app1[i+0]>0?mask[0]:0; + uint8_t out1 = h->app1[i+1]>0?mask[1]:0; + uint8_t out2 = h->app1[i+2]>0?mask[2]:0; + uint8_t out3 = h->app1[i+3]>0?mask[3]:0; + uint8_t out4 = h->app1[i+4]>0?mask[4]:0; + uint8_t out5 = h->app1[i+5]>0?mask[5]:0; + uint8_t out6 = h->app1[i+6]>0?mask[6]:0; + uint8_t out7 = h->app1[i+7]>0?mask[7]:0; output[i] = out0 | out1 | out2 | out3 | out4 | out5 | out6 | out7; } } -int srslte_tdec_run_all(srslte_tdec_t * h, srslte_llr_t * input, uint8_t *output, +int srslte_tdec_run_all(srslte_tdec_t * h, float * input, uint8_t *output, uint32_t 
nof_iterations, uint32_t long_cb) { - uint32_t iter = 0; - if (srslte_tdec_reset(h, long_cb)) { return SRSLTE_ERROR; } do { srslte_tdec_iteration(h, input, long_cb); - iter++; - } while (iter < nof_iterations); + } while (h->n_iter < nof_iterations); srslte_tdec_decision(h, output, long_cb); diff --git a/srslte/lib/fec/src/turbodecoder_vl.c b/srslte/lib/fec/src/turbodecoder_vl.c new file mode 100644 index 000000000..fe3817080 --- /dev/null +++ b/srslte/lib/fec/src/turbodecoder_vl.c @@ -0,0 +1,393 @@ +/** + * + * \section COPYRIGHT + * + * Copyright 2013-2015 The srsLTE Developers. See the + * COPYRIGHT file at the top-level directory of this distribution. + * + * \section LICENSE + * + * This file is part of the srsLTE library. + * + * srsLTE is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License as + * published by the Free Software Foundation, either version 3 of + * the License, or (at your option) any later version. + * + * srsLTE is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * A copy of the GNU Affero General Public License can be found in + * the LICENSE file in the top-level directory of this distribution + * and at http://www.gnu.org/licenses/. 
+ * + */ + +#include +#include +#include +#include +#include +#include + +#include "srslte/fec/turbodecoder_vl.h" +#include "srslte/utils/vector.h" + +#define NUMSTATES 8 +#define NINPUTS 2 +#define TAIL 3 +#define TOTALTAIL 12 + +#define INF 9e4 +#define ZERO 9e-4 + +/************************************************ + * + * MAP_GEN is the MAX-LOG-MAP generic implementation of the + * Decoder + * + ************************************************/ +static void map_gen_beta(srslte_map_gen_vl_t * s, srslte_llr_t * input, srslte_llr_t * parity, + uint32_t long_cb) +{ + srslte_llr_t m_b[8], new[8], old[8]; + srslte_llr_t x, y, xy; + int k; + uint32_t end = long_cb + SRSLTE_TCOD_RATE; + srslte_llr_t *beta = s->beta; + uint32_t i; + + for (i = 0; i < 8; i++) { + old[i] = beta[8 * (end) + i]; + } + + for (k = end - 1; k >= 0; k--) { + x = input[k]; + y = parity[k]; + + xy = x + y; + + m_b[0] = old[4] + xy; + m_b[1] = old[4]; + m_b[2] = old[5] + y; + m_b[3] = old[5] + x; + m_b[4] = old[6] + x; + m_b[5] = old[6] + y; + m_b[6] = old[7]; + m_b[7] = old[7] + xy; + + new[0] = old[0]; + new[1] = old[0] + xy; + new[2] = old[1] + x; + new[3] = old[1] + y; + new[4] = old[2] + y; + new[5] = old[2] + x; + new[6] = old[3] + xy; + new[7] = old[3]; + + for (i = 0; i < 8; i++) { + if (m_b[i] > new[i]) + new[i] = m_b[i]; + old[i] = new[i]; + beta[8 * k + i] = old[i]; + } + } +} + +static void map_gen_alpha(srslte_map_gen_vl_t * s, srslte_llr_t * input, srslte_llr_t * parity, srslte_llr_t * output, + uint32_t long_cb) +{ + srslte_llr_t m_b[8], new[8], old[8], max1[8], max0[8]; + srslte_llr_t m1, m0; + srslte_llr_t x, y, xy; + srslte_llr_t out; + uint32_t k; + uint32_t end = long_cb; + srslte_llr_t *beta = s->beta; + uint32_t i; + + old[0] = 0; + for (i = 1; i < 8; i++) { + old[i] = -INF; + } + + for (k = 1; k < end + 1; k++) { + x = input[k - 1]; + y = parity[k - 1]; + + xy = x + y; + + m_b[0] = old[0]; + m_b[1] = old[3] + y; + m_b[2] = old[4] + y; + m_b[3] = old[7]; + m_b[4] = old[1]; + 
m_b[5] = old[2] + y; + m_b[6] = old[5] + y; + m_b[7] = old[6]; + + new[0] = old[1] + xy; + new[1] = old[2] + x; + new[2] = old[5] + x; + new[3] = old[6] + xy; + new[4] = old[0] + xy; + new[5] = old[3] + x; + new[6] = old[4] + x; + new[7] = old[7] + xy; + + for (i = 0; i < 8; i++) { + max0[i] = m_b[i] + beta[8 * k + i]; + max1[i] = new[i] + beta[8 * k + i]; + } + + m1 = max1[0]; + m0 = max0[0]; + + for (i = 1; i < 8; i++) { + if (max1[i] > m1) + m1 = max1[i]; + if (max0[i] > m0) + m0 = max0[i]; + } + + for (i = 0; i < 8; i++) { + if (m_b[i] > new[i]) + new[i] = m_b[i]; + old[i] = new[i]; + } + + out = m1 - m0; + output[k - 1] = out; + } +} + +static int map_gen_init(srslte_map_gen_vl_t * h, int max_long_cb) +{ + bzero(h, sizeof(srslte_map_gen_vl_t)); + h->beta = srslte_vec_malloc(sizeof(srslte_llr_t) * (max_long_cb + SRSLTE_TCOD_TOTALTAIL + 1) * NUMSTATES); + if (!h->beta) { + perror("srslte_vec_malloc"); + return -1; + } + h->max_long_cb = max_long_cb; + return 0; +} + +static void map_gen_free(srslte_map_gen_vl_t * h) +{ + if (h->beta) { + free(h->beta); + } + bzero(h, sizeof(srslte_map_gen_vl_t)); +} + +static void map_gen_dec(srslte_map_gen_vl_t * h, srslte_llr_t * input, srslte_llr_t * parity, srslte_llr_t * output, + uint32_t long_cb) +{ + uint32_t k; + + h->beta[(long_cb + TAIL) * NUMSTATES] = 0; + for (k = 1; k < NUMSTATES; k++) + h->beta[(long_cb + TAIL) * NUMSTATES + k] = -INF; + + map_gen_beta(h, input, parity, long_cb); + map_gen_alpha(h, input, parity, output, long_cb); +} + +/************************************************ + * + * TURBO DECODER INTERFACE + * + ************************************************/ +int srslte_tdec_vl_init(srslte_tdec_vl_t * h, uint32_t max_long_cb) +{ + int ret = -1; + bzero(h, sizeof(srslte_tdec_vl_t)); + uint32_t len = max_long_cb + SRSLTE_TCOD_TOTALTAIL; + + h->max_long_cb = max_long_cb; + + h->llr1 = srslte_vec_malloc(sizeof(srslte_llr_t) * len); + if (!h->llr1) { + perror("srslte_vec_malloc"); + goto clean_and_exit; + 
} + h->llr2 = srslte_vec_malloc(sizeof(srslte_llr_t) * len); + if (!h->llr2) { + perror("srslte_vec_malloc"); + goto clean_and_exit; + } + h->w = srslte_vec_malloc(sizeof(srslte_llr_t) * len); + if (!h->w) { + perror("srslte_vec_malloc"); + goto clean_and_exit; + } + h->syst = srslte_vec_malloc(sizeof(srslte_llr_t) * len); + if (!h->syst) { + perror("srslte_vec_malloc"); + goto clean_and_exit; + } + h->parity = srslte_vec_malloc(sizeof(srslte_llr_t) * len); + if (!h->parity) { + perror("srslte_vec_malloc"); + goto clean_and_exit; + } + + if (map_gen_init(&h->dec, h->max_long_cb)) { + goto clean_and_exit; + } + + for (int i=0;iinterleaver[i], srslte_cbsegm_cbsize(i)) < 0) { + goto clean_and_exit; + } + srslte_tc_interl_LTE_gen(&h->interleaver[i], srslte_cbsegm_cbsize(i)); + } + h->current_cbidx = -1; + ret = 0; +clean_and_exit:if (ret == -1) { + srslte_tdec_vl_free(h); + } + return ret; +} + +void srslte_tdec_vl_free(srslte_tdec_vl_t * h) +{ + if (h->llr1) { + free(h->llr1); + } + if (h->llr2) { + free(h->llr2); + } + if (h->w) { + free(h->w); + } + if (h->syst) { + free(h->syst); + } + if (h->parity) { + free(h->parity); + } + + map_gen_free(&h->dec); + + for (int i=0;iinterleaver[i]); + } + + bzero(h, sizeof(srslte_tdec_vl_t)); +} + +void srslte_tdec_vl_iteration(srslte_tdec_vl_t * h, srslte_llr_t * input, uint32_t long_cb) +{ + uint32_t i; + + if (h->current_cbidx >= 0) { + + uint16_t *inter = h->interleaver[h->current_cbidx].forward; + uint16_t *deinter = h->interleaver[h->current_cbidx].reverse; + + // Prepare systematic and parity bits for MAP DEC #1 + for (i = 0; i < long_cb; i++) { + h->syst[i] = input[SRSLTE_TCOD_RATE * i] + h->w[i]; + h->parity[i] = input[SRSLTE_TCOD_RATE * i + 1]; + } + for (i = long_cb; i < long_cb + SRSLTE_TCOD_RATE; i++) { + h->syst[i] = input[SRSLTE_TCOD_RATE * long_cb + NINPUTS * (i - long_cb)]; + h->parity[i] = input[SRSLTE_TCOD_RATE * long_cb + NINPUTS * (i - long_cb) + 1]; + } + + // Run MAP DEC #1 + map_gen_dec(&h->dec, h->syst, 
h->parity, h->llr1, long_cb); + + // Prepare systematic and parity bits for MAP DEC #2 + for (i = 0; i < long_cb; i++) { + h->syst[i] = h->llr1[inter[i]] + - h->w[inter[i]]; + h->parity[i] = input[SRSLTE_TCOD_RATE * i + 2]; + } + for (i = long_cb; i < long_cb + SRSLTE_TCOD_RATE; i++) { + h->syst[i] = + input[SRSLTE_TCOD_RATE * long_cb + NINPUTS * SRSLTE_TCOD_RATE + NINPUTS * (i - long_cb)]; + h->parity[i] = input[SRSLTE_TCOD_RATE * long_cb + NINPUTS * SRSLTE_TCOD_RATE + + NINPUTS * (i - long_cb) + 1]; + } + + // Run MAP DEC #2 + map_gen_dec(&h->dec, h->syst, h->parity, h->llr2, long_cb); + + // Update a-priori LLR from the last iteration + for (i = 0; i < long_cb; i++) { + h->w[i] += h->llr2[deinter[i]] - h->llr1[i]; + } + } else { + fprintf(stderr, "Error CB index not set (call srslte_tdec_vl_reset() first)\n"); + } +} + +int srslte_tdec_vl_reset(srslte_tdec_vl_t * h, uint32_t long_cb) +{ + if (long_cb > h->max_long_cb) { + fprintf(stderr, "TDEC was initialized for max_long_cb=%d\n", + h->max_long_cb); + return -1; + } + memset(h->w, 0, sizeof(srslte_llr_t) * long_cb); + h->current_cbidx = srslte_cbsegm_cbindex(long_cb); + if (h->current_cbidx < 0) { + fprintf(stderr, "Invalid CB length %u\n", long_cb); + return -1; + } + return 0; +} + +void srslte_tdec_vl_decision(srslte_tdec_vl_t * h, uint8_t *output, uint32_t long_cb) +{ + uint16_t *deinter = h->interleaver[h->current_cbidx].reverse; + uint32_t i; + for (i = 0; i < long_cb; i++) { + output[i] = (h->llr2[deinter[i]] > 0) ?
1 : 0; + } +} + +void srslte_tdec_vl_decision_byte(srslte_tdec_vl_t * h, uint8_t *output, uint32_t long_cb) +{ + uint32_t i; + uint8_t mask[8] = {0x80, 0x40, 0x20, 0x10, 0x8, 0x4, 0x2, 0x1}; + uint16_t *deinter = h->interleaver[h->current_cbidx].reverse; + + // long_cb is always byte aligned + for (i = 0; i < long_cb/8; i++) { + uint8_t out0 = h->llr2[deinter[8*i+0]]>0?mask[0]:0; + uint8_t out1 = h->llr2[deinter[8*i+1]]>0?mask[1]:0; + uint8_t out2 = h->llr2[deinter[8*i+2]]>0?mask[2]:0; + uint8_t out3 = h->llr2[deinter[8*i+3]]>0?mask[3]:0; + uint8_t out4 = h->llr2[deinter[8*i+4]]>0?mask[4]:0; + uint8_t out5 = h->llr2[deinter[8*i+5]]>0?mask[5]:0; + uint8_t out6 = h->llr2[deinter[8*i+6]]>0?mask[6]:0; + uint8_t out7 = h->llr2[deinter[8*i+7]]>0?mask[7]:0; + + output[i] = out0 | out1 | out2 | out3 | out4 | out5 | out6 | out7; + } +} + +int srslte_tdec_vl_run_all(srslte_tdec_vl_t * h, srslte_llr_t * input, uint8_t *output, + uint32_t nof_iterations, uint32_t long_cb) +{ + uint32_t iter = 0; + + if (srslte_tdec_vl_reset(h, long_cb)) { + return SRSLTE_ERROR; + } + + do { + srslte_tdec_vl_iteration(h, input, long_cb); + iter++; + } while (iter < nof_iterations); + + srslte_tdec_vl_decision(h, output, long_cb); + + return SRSLTE_SUCCESS; +} diff --git a/srslte/lib/fec/test/turbodecoder_test.c b/srslte/lib/fec/test/turbodecoder_test.c index a482e1e94..dff4d729e 100644 --- a/srslte/lib/fec/test/turbodecoder_test.c +++ b/srslte/lib/fec/test/turbodecoder_test.c @@ -36,6 +36,7 @@ #include #include #include "srslte/srslte.h" +#include "srslte/fec/turbodecoder_vl.h" #include "turbodecoder_test.h" @@ -46,14 +47,15 @@ float ebno_db = 100.0; uint32_t seed = 0; int K = -1; -#define MAX_ITERATIONS 4 +#define MAX_ITERATIONS 10 int nof_iterations = MAX_ITERATIONS; int test_known_data = 0; int test_errors = 0; +int nof_repetitions = 1; -#define SNR_POINTS 8 -#define SNR_MIN 0.0 -#define SNR_MAX 4.0 +#define SNR_POINTS 4 +#define SNR_MIN 1.0 +#define SNR_MAX 8.0 void usage(char *prog) { 
printf("Usage: %s [nlesv]\n", prog); @@ -61,6 +63,7 @@ void usage(char *prog) { "\t-k Test with known data (ignores frame_length) [Default disabled]\n"); printf("\t-i nof_iterations [Default %d]\n", nof_iterations); printf("\t-n nof_frames [Default %d]\n", nof_frames); + printf("\t-N nof_repetitions [Default %d]\n", nof_repetitions); printf("\t-l frame_length [Default %d]\n", frame_length); printf("\t-e ebno in dB [Default scan]\n"); printf("\t-t test: check errors on exit [Default disabled]\n"); @@ -69,11 +72,14 @@ void usage(char *prog) { void parse_args(int argc, char **argv) { int opt; - while ((opt = getopt(argc, argv, "inlstvekt")) != -1) { + while ((opt = getopt(argc, argv, "inNlstvekt")) != -1) { switch (opt) { case 'n': nof_frames = atoi(argv[optind]); break; + case 'N': + nof_repetitions = atoi(argv[optind]); + break; case 'k': test_known_data = 1; break; @@ -102,29 +108,6 @@ void parse_args(int argc, char **argv) { } } -void output_matlab(float ber[MAX_ITERATIONS][SNR_POINTS], int snr_points) { - int i, j; - FILE *f = fopen("turbocoder_snr.m", "w"); - if (!f) { - perror("fopen"); - exit(-1); - } - fprintf(f, "ber=["); - for (j = 0; j < MAX_ITERATIONS; j++) { - for (i = 0; i < snr_points; i++) { - fprintf(f, "%g ", ber[j][i]); - } - fprintf(f, ";\n"); - } - fprintf(f, "];\n"); - fprintf(f, "snr=linspace(%g,%g-%g/%d,%d);\n", SNR_MIN, SNR_MAX, SNR_MAX, - snr_points, snr_points); - fprintf(f, "semilogy(snr,ber,snr,0.5*erfc(sqrt(10.^(snr/10))));\n"); - fprintf(f, - "legend('1 iter','2 iter', '3 iter', '4 iter', 'theory-uncoded');"); - fprintf(f, "grid on;\n"); - fclose(f); -} int main(int argc, char **argv) { uint32_t frame_cnt; @@ -134,14 +117,15 @@ int main(int argc, char **argv) { uint32_t i, j; float var[SNR_POINTS]; uint32_t snr_points; - float ber[MAX_ITERATIONS][SNR_POINTS]; - uint32_t errors[100]; + uint32_t errors; + uint32_t errors_vl; uint32_t coded_length; struct timeval tdata[3]; - float mean_usec; + float mean_usec, mean_usec_vl; srslte_tdec_t 
tdec; + srslte_tdec_vl_t tdec_vl; srslte_tcod_t tcod; - + parse_args(argc, argv); if (!seed) { @@ -200,6 +184,11 @@ int main(int argc, char **argv) { exit(-1); } + if (srslte_tdec_vl_init(&tdec_vl, frame_length)) { + fprintf(stderr, "Error initiating Turbo decoder\n"); + exit(-1); + } + float ebno_inc, esno_db; ebno_inc = (SNR_MAX - SNR_MIN) / SNR_POINTS; if (ebno_db == 100.0) { @@ -215,11 +204,13 @@ int main(int argc, char **argv) { snr_points = 1; } for (i = 0; i < snr_points; i++) { + mean_usec = 0; + mean_usec_vl = 0; + errors = 0; + errors_vl = 0; frame_cnt = 0; - bzero(errors, sizeof(int) * MAX_ITERATIONS); while (frame_cnt < nof_frames) { - /* generate data_tx */ for (j = 0; j < frame_length; j++) { if (test_known_data) { @@ -239,13 +230,14 @@ int main(int argc, char **argv) { } for (j = 0; j < coded_length; j++) { - llr[j] = symbols[j] ? sqrt(2) : -sqrt(2); + llr[j] = symbols[j] ? 1 : -1; } srslte_ch_awgn_f(llr, llr, var[i], coded_length); - + /* decoder */ srslte_tdec_reset(&tdec, frame_length); + srslte_tdec_vl_reset(&tdec_vl, frame_length); uint32_t t; if (nof_iterations == -1) { @@ -253,68 +245,50 @@ int main(int argc, char **argv) { } else { t = nof_iterations; } - for (j = 0; j < t; j++) { - - if (!j) - gettimeofday(&tdata[1], NULL); // Only measure 1 iteration - srslte_tdec_iteration(&tdec, llr, frame_length); - srslte_tdec_decision(&tdec, data_rx, frame_length); - if (!j) - gettimeofday(&tdata[2], NULL); - if (!j) - get_time_interval(tdata); - if (!j) - mean_usec = (float) mean_usec * 0.9 + (float) tdata[0].tv_usec * 0.1; - - /* check errors */ - errors[j] += srslte_bit_diff(data_tx, data_rx, frame_length); - if (j < MAX_ITERATIONS) { - ber[j][i] = (float) errors[j] / (frame_cnt * frame_length); - } + + gettimeofday(&tdata[1], NULL); + for (int k=0;k known_data_errors[j]) { - fprintf(stderr, "Expected %d errors but got %d\n", - known_data_errors[j], errors[j]); - exit(-1); - } else { - printf("Iter %d ok\n", j + 1); - } - } - } else { - for (j = 0; 
j < MAX_ITERATIONS; j++) { - printf("BER: %g\t%u errors\n", - (float) errors[j] / (frame_cnt * frame_length), errors[j]); - if (test_errors) { - if (errors[j] - > get_expected_errors(frame_cnt, seed, j + 1, frame_length, - ebno_db)) { - fprintf(stderr, "Expected %d errors but got %d\n", - get_expected_errors(frame_cnt, seed, j + 1, frame_length, - ebno_db), errors[j]); - exit(-1); - } else { - printf("Iter %d ok\n", j + 1); - } - } - } - } + printf("\n"); + if (snr_points == 1) { + if (errors) { + printf("%u Errors\n", errors); } - } + if (errors_vl) { + printf("%u Errors in VL\n", errors_vl); + } + } + free(data_tx); free(symbols); @@ -326,7 +300,6 @@ int main(int argc, char **argv) { srslte_tcod_free(&tcod); printf("\n"); - output_matlab(ber, snr_points); printf("Done\n"); exit(0); } diff --git a/srslte/lib/phch/src/sch.c b/srslte/lib/phch/src/sch.c index 5d9bda3a7..6fa6a566b 100644 --- a/srslte/lib/phch/src/sch.c +++ b/srslte/lib/phch/src/sch.c @@ -441,6 +441,7 @@ static int decode_tb(srslte_sch_t *q, early_stop = true; } + } while (q->nof_iterations < SRSLTE_PDSCH_MAX_TDEC_ITERS && !early_stop); q->average_nof_iterations = SRSLTE_VEC_EMA((float) q->nof_iterations, q->average_nof_iterations, 0.2); diff --git a/srslte/lib/phch/test/pdsch_test.c b/srslte/lib/phch/test/pdsch_test.c index 6c4a90c94..017a43861 100644 --- a/srslte/lib/phch/test/pdsch_test.c +++ b/srslte/lib/phch/test/pdsch_test.c @@ -228,14 +228,12 @@ int main(int argc, char **argv) { int r = srslte_pdsch_decode(&pdsch, &pdsch_cfg, &softbuffer_rx, slot_symbols[0], ce, 0, data); gettimeofday(&t[2], NULL); get_time_interval(t); + printf("DECODED %s in %d:%d (%.2f Mbps)\n", r?"Error":"OK", + (int) t[0].tv_sec, (int) t[0].tv_usec, (float) grant.mcs.tbs/t[0].tv_usec); if (r) { - printf("Error decoding TBS: %d\n", grant.mcs.tbs); ret = -1; goto quit; - } else { - printf("DECODED OK in %d:%d (%.2f Mbps)\n", - (int) t[0].tv_sec, (int) t[0].tv_usec, (float) grant.mcs.tbs/t[0].tv_usec); - } + } } ret = 0; quit:
diff --git a/srslte/lib/utils/src/vector.c b/srslte/lib/utils/src/vector.c index 834d1ca40..bb87c8df9 100644 --- a/srslte/lib/utils/src/vector.c +++ b/srslte/lib/utils/src/vector.c @@ -33,8 +33,11 @@ #include #include "srslte/utils/vector.h" +#include "srslte/utils/vector_simd.h" #include "srslte/utils/bit.h" +#define HAVE_VECTOR_SIMD + #ifdef HAVE_VOLK #include "volk/volk.h" #endif @@ -102,6 +105,17 @@ void srslte_vec_sub_fff(float *x, float *y, float *z, uint32_t len) { #endif } +void srslte_vec_sub_sss(short *x, short *y, short *z, uint32_t len) { +#ifndef HAVE_VECTOR_SIMD + int i; + for (i=0;i +#include +#include +#include +#include + +#include "srslte/utils/vector_simd.h" + +#include +#include + +#include + +void print128_num(__m128i var) +{ + int16_t *val = (int16_t*) &var;//can also use uint32_t instead of 16_t + printf("Numerical: %d %d %d %d %d %d %d %d \n", + val[0], val[1], val[2], val[3], val[4], val[5], + val[6], val[7]); +} + +void srslte_vec_sum_sss_simd(short *x, short *y, short *z, uint32_t len) +{ + unsigned int number = 0; + const unsigned int points = len / 8; + + const __m128i* xPtr = (const __m128i*) x; + const __m128i* yPtr = (const __m128i*) y; + __m128i* zPtr = (__m128i*) z; + + __m128i xVal, yVal, zVal; + for(;number < points; number++){ + + xVal = _mm_load_si128(xPtr); + yVal = _mm_load_si128(yPtr); + + zVal = _mm_add_epi16(xVal, yVal); + + _mm_store_si128(zPtr, zVal); + + xPtr ++; + yPtr ++; + zPtr ++; + } + + number = points * 8; + for(;number < len; number++){ + z[number] = x[number] + y[number]; + } +} + +void srslte_vec_sub_sss_simd(short *x, short *y, short *z, uint32_t len) +{ + unsigned int number = 0; + const unsigned int points = len / 8; + + const __m128i* xPtr = (const __m128i*) x; + const __m128i* yPtr = (const __m128i*) y; + __m128i* zPtr = (__m128i*) z; + + __m128i xVal, yVal, zVal; + for(;number < points; number++){ + + xVal = _mm_load_si128(xPtr); + yVal = _mm_load_si128(yPtr); + + zVal = _mm_sub_epi16(xVal, yVal); + + 
_mm_store_si128(zPtr, zVal); + + xPtr ++; + yPtr ++; + zPtr ++; + } + + number = points * 8; + for(;number < len; number++){ + z[number] = x[number] - y[number]; + } +} + +void srslte_vec_sc_div2_sss_simd(short *x, int k, short *z, uint32_t len) +{ + unsigned int number = 0; + const unsigned int points = len / 8; + + const __m128i* xPtr = (const __m128i*) x; + __m128i* zPtr = (__m128i*) z; + + __m128i xVal, zVal; + for(;number < points; number++){ + + xVal = _mm_load_si128(xPtr); + + zVal = _mm_srai_epi16(xVal, k); + + _mm_store_si128(zPtr, zVal); + + xPtr ++; + zPtr ++; + } + + number = points * 8; + short divn = (1<