diff --git a/lib/include/srslte/phy/common/phy_common.h b/lib/include/srslte/phy/common/phy_common.h index bfb2a993a..efd498c86 100644 --- a/lib/include/srslte/phy/common/phy_common.h +++ b/lib/include/srslte/phy/common/phy_common.h @@ -53,6 +53,8 @@ #define SRSLTE_MAX_LAYERS 4 #define SRSLTE_MAX_CODEWORDS 2 +#define SRSLTE_MAX_CODEBLOCKS 32 + #define SRSLTE_LTE_CRC24A 0x1864CFB #define SRSLTE_LTE_CRC24B 0X1800063 #define SRSLTE_LTE_CRC16 0x11021 diff --git a/lib/include/srslte/phy/fec/turbodecoder.h b/lib/include/srslte/phy/fec/turbodecoder.h index 149841fad..24e38d09e 100644 --- a/lib/include/srslte/phy/fec/turbodecoder.h +++ b/lib/include/srslte/phy/fec/turbodecoder.h @@ -52,12 +52,14 @@ #include "srslte/phy/fec/turbodecoder_gen.h" #ifdef LV_HAVE_SSE -#include "srslte/phy/fec/turbodecoder_sse.h" +#include "srslte/phy/fec/turbodecoder_simd.h" +#else +#define SRSLTE_TDEC_NPAR 1 #endif typedef struct SRSLTE_API { #ifdef LV_HAVE_SSE - srslte_tdec_sse_t tdec_sse; + srslte_tdec_simd_t tdec_simd; #else float *input_conv; srslte_tdec_gen_t tdec_gen; @@ -69,7 +71,16 @@ SRSLTE_API int srslte_tdec_init(srslte_tdec_t * h, SRSLTE_API void srslte_tdec_free(srslte_tdec_t * h); -SRSLTE_API int srslte_tdec_reset(srslte_tdec_t * h, uint32_t long_cb); +SRSLTE_API int srslte_tdec_reset(srslte_tdec_t * h, + uint32_t long_cb); + +SRSLTE_API int srslte_tdec_reset_cb(srslte_tdec_t * h, + uint32_t cb_idx); + +SRSLTE_API int srslte_tdec_get_nof_iterations_cb(srslte_tdec_t * h, + uint32_t cb_idx); + +SRSLTE_API int srslte_tdec_get_nof_parallel(srslte_tdec_t * h); SRSLTE_API void srslte_tdec_iteration(srslte_tdec_t * h, int16_t* input, @@ -89,4 +100,31 @@ SRSLTE_API int srslte_tdec_run_all(srslte_tdec_t * h, uint32_t nof_iterations, uint32_t long_cb); +SRSLTE_API void srslte_tdec_iteration_par(srslte_tdec_t * h, + int16_t* input[SRSLTE_TDEC_NPAR], + uint32_t nof_cb, + uint32_t long_cb); + +SRSLTE_API void srslte_tdec_decision_par(srslte_tdec_t * h, + uint8_t *output[SRSLTE_TDEC_NPAR], + uint32_t nof_cb, + uint32_t long_cb); + +SRSLTE_API void srslte_tdec_decision_byte_par(srslte_tdec_t * h, + uint8_t *output[SRSLTE_TDEC_NPAR], + uint32_t nof_cb, + uint32_t long_cb); + +SRSLTE_API void srslte_tdec_decision_byte_par_cb(srslte_tdec_t * h, + uint8_t *output, + uint32_t cb_idx, + uint32_t long_cb); + +SRSLTE_API int srslte_tdec_run_all_par(srslte_tdec_t * h, + int16_t * input[SRSLTE_TDEC_NPAR], + uint8_t *output[SRSLTE_TDEC_NPAR], + uint32_t nof_iterations, + uint32_t nof_cb, + uint32_t long_cb); + #endif diff --git a/lib/include/srslte/phy/fec/turbodecoder_gen.h b/lib/include/srslte/phy/fec/turbodecoder_gen.h index 7f219201a..4a023396c 100644 --- a/lib/include/srslte/phy/fec/turbodecoder_gen.h +++ b/lib/include/srslte/phy/fec/turbodecoder_gen.h @@ -66,6 +66,8 @@ typedef struct SRSLTE_API { float *parity; int current_cbidx; + uint32_t current_cb_len; + uint32_t n_iter; srslte_tc_interl_t interleaver[SRSLTE_NOF_TC_CB_SIZES]; } srslte_tdec_gen_t; diff --git a/lib/include/srslte/phy/fec/turbodecoder_simd_inter.h b/lib/include/srslte/phy/fec/turbodecoder_simd_inter.h new file mode 100644 index 000000000..402faa314 --- /dev/null +++ b/lib/include/srslte/phy/fec/turbodecoder_simd_inter.h @@ -0,0 +1,119 @@ +/** + * + * \section COPYRIGHT + * + * Copyright 2013-2015 Software Radio Systems Limited + * + * \section LICENSE + * + * This file is part of the srsLTE library. + * + * srsLTE is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License as + * published by the Free Software Foundation, either version 3 of + * the License, or (at your option) any later version. + * + * srsLTE is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * A copy of the GNU Affero General Public License can be found in + * the LICENSE file in the top-level directory of this distribution + * and at http://www.gnu.org/licenses/. + * + */ + +/********************************************************************************************** + * File: turbodecoder.h + * + * Description: Turbo Decoder. + * Parallel Concatenated Convolutional Code (PCCC) with two 8-state constituent + * encoders and one turbo code internal interleaver. The coding rate of turbo + * encoder is 1/3. + * MAP_GEN is the MAX-LOG-MAP generic implementation of the decoder. + * + * Reference: 3GPP TS 36.212 version 10.0.0 Release 10 Sec. 5.1.3.2 + *********************************************************************************************/ + +#ifndef TURBODECODER_SSE_INTER_ +#define TURBODECODER_SSE_INTER_ + + +/** This is an simd inter-frame parallel turbo decoder. Parallizes 8 code-blocks using SSE + * This implementation is currently not functional and not used by the rest of the code + */ + +#include "srslte/config.h" +#include "srslte/phy/fec/tc_interl.h" +#include "srslte/phy/fec/cbsegm.h" + +#if LV_HAVE_AVX2 + #define SRSLTE_TDEC_NPAR 16 +#else + #define SRSLTE_TDEC_NPAR 8 +#endif + + +typedef struct SRSLTE_API { + int max_long_cb; + + int16_t *syst0; + int16_t *parity0; + int16_t *syst1; + int16_t *parity1; + int16_t *llr1; + int16_t *llr2; + int16_t *w; + int16_t *alpha; + + uint32_t max_par_cb; + int current_cbidx; + uint32_t current_long_cb; + srslte_tc_interl_t interleaver[SRSLTE_NOF_TC_CB_SIZES]; + int n_iter[SRSLTE_TDEC_NPAR]; +} srslte_tdec_simd_inter_t; + +SRSLTE_API int srslte_tdec_simd_inter_init(srslte_tdec_simd_inter_t * h, + uint32_t max_par_cb, + uint32_t max_long_cb); + +SRSLTE_API void srslte_tdec_simd_inter_free(srslte_tdec_simd_inter_t * h); + +SRSLTE_API int srslte_tdec_simd_inter_reset(srslte_tdec_simd_inter_t * h, + uint32_t long_cb); + +SRSLTE_API int srslte_tdec_simd_inter_get_nof_iterations_cb(srslte_tdec_simd_inter_t * h, + uint32_t cb_idx); + +SRSLTE_API int srslte_tdec_simd_inter_reset_cb(srslte_tdec_simd_inter_t * h, + uint32_t cb_idx); + +SRSLTE_API void srslte_tdec_simd_inter_iteration(srslte_tdec_simd_inter_t * h, + int16_t * input[SRSLTE_TDEC_NPAR], + uint32_t nof_cb, + uint32_t long_cb); + +SRSLTE_API void srslte_tdec_simd_inter_decision(srslte_tdec_simd_inter_t * h, + uint8_t *output[SRSLTE_TDEC_NPAR], + uint32_t nof_cb, + uint32_t long_cb); + +SRSLTE_API void srslte_tdec_simd_inter_decision_byte(srslte_tdec_simd_inter_t * h, + uint8_t *output[SRSLTE_TDEC_NPAR], + uint32_t nof_cb, + uint32_t long_cb); + +SRSLTE_API void srslte_tdec_simd_inter_decision_byte_cb(srslte_tdec_simd_inter_t * h, + uint8_t *output, + uint32_t cbidx, + uint32_t long_cb); + +SRSLTE_API int srslte_tdec_simd_inter_run_all(srslte_tdec_simd_inter_t * h, + int16_t *input[SRSLTE_TDEC_NPAR], + uint8_t *output[SRSLTE_TDEC_NPAR], + uint32_t nof_iterations, + uint32_t nof_cb, + uint32_t long_cb); + +#endif diff --git a/lib/src/phy/fec/test/turbodecoder_test.c b/lib/src/phy/fec/test/turbodecoder_test.c index dad9a07c8..839bc4202 100644 --- a/lib/src/phy/fec/test/turbodecoder_test.c +++ b/lib/src/phy/fec/test/turbodecoder_test.c @@ -257,13 +257,10 @@ int main(int argc, char **argv) { int16_t *input[SRSLTE_TDEC_NPAR]; uint8_t *output[SRSLTE_TDEC_NPAR]; - input[0] = llr_s; - if (SRSLTE_TDEC_NPAR == 2) - input[1] = llr_s; - - output[0] = data_rx_bytes[0]; - if (SRSLTE_TDEC_NPAR == 2) - output[1] = data_rx_bytes[1]; + for (int n=0;ntdec_simd, max_long_cb); + return srslte_tdec_simd_init(&h->tdec_simd, SRSLTE_TDEC_NPAR, max_long_cb); #else h->input_conv = srslte_vec_malloc(sizeof(float) * (3*max_long_cb+12)); if (!h->input_conv) { @@ -56,7 +56,7 @@ int srslte_tdec_init(srslte_tdec_t * h, uint32_t max_long_cb) { void srslte_tdec_free(srslte_tdec_t * h) { #ifdef LV_HAVE_SSE - srslte_tdec_simd_free(&h->tdec_simd); + srslte_tdec_simd_free(&h->tdec_simd); #else if (h->input_conv) { free(h->input_conv); @@ -74,9 +74,26 @@ int srslte_tdec_reset(srslte_tdec_t * h, uint32_t long_cb) { #endif } +int srslte_tdec_reset_cb(srslte_tdec_t * h, uint32_t cb_idx) { +#ifdef LV_HAVE_SSE + return srslte_tdec_simd_reset_cb(&h->tdec_simd, cb_idx); +#else + return srslte_tdec_gen_reset(&h->tdec_gen, h->tdec_gen.current_cb_len); +#endif +} + +int srslte_tdec_get_nof_iterations_cb(srslte_tdec_t * h, uint32_t cb_idx) +{ +#ifdef LV_HAVE_SSE + return srslte_tdec_simd_get_nof_iterations_cb(&h->tdec_simd, cb_idx); +#else + return h->tdec_gen.n_iter; +#endif +} + void srslte_tdec_iteration_par(srslte_tdec_t * h, int16_t* input[SRSLTE_TDEC_NPAR], uint32_t nof_cb, uint32_t long_cb) { #ifdef LV_HAVE_SSE - srslte_tdec_simd_iteration(&h->tdec_simd, input, nof_cb, long_cb); + srslte_tdec_simd_iteration(&h->tdec_simd, input, nof_cb, long_cb); #else srslte_vec_convert_if(input[0], h->input_conv, 0.01, 3*long_cb+12); srslte_tdec_gen_iteration(&h->tdec_gen, h->input_conv, long_cb); @@ -105,12 +122,20 @@ void srslte_tdec_decision(srslte_tdec_t * h, uint8_t *output, uint32_t long_cb) void srslte_tdec_decision_byte_par(srslte_tdec_t * h, uint8_t *output[SRSLTE_TDEC_NPAR], uint32_t nof_cb, uint32_t long_cb) { #ifdef LV_HAVE_SSE - srslte_tdec_simd_decision_byte(&h->tdec_simd, output, nof_cb, long_cb); + srslte_tdec_simd_decision_byte(&h->tdec_simd, output, nof_cb, long_cb); #else srslte_tdec_gen_decision_byte(&h->tdec_gen, output[0], long_cb); #endif } +void srslte_tdec_decision_byte_par_cb(srslte_tdec_t * h, uint8_t *output, uint32_t cb_idx, uint32_t long_cb) { +#ifdef LV_HAVE_SSE + srslte_tdec_simd_decision_byte_cb(&h->tdec_simd, output, cb_idx, long_cb); +#else + srslte_tdec_gen_decision_byte(&h->tdec_gen, output, long_cb); +#endif +} + void srslte_tdec_decision_byte(srslte_tdec_t * h, uint8_t *output, uint32_t long_cb) { uint8_t *output_par[SRSLTE_TDEC_NPAR]; output_par[0] = output; @@ -121,7 +146,7 @@ int srslte_tdec_run_all_par(srslte_tdec_t * h, int16_t * input[SRSLTE_TDEC_NPAR] uint8_t *output[SRSLTE_TDEC_NPAR], uint32_t nof_iterations, uint32_t nof_cb, uint32_t long_cb) { #ifdef LV_HAVE_SSE - return srslte_tdec_simd_run_all(&h->tdec_simd, input, output, nof_iterations, nof_cb, long_cb); + return srslte_tdec_simd_run_all(&h->tdec_simd, input, output, nof_iterations, nof_cb, long_cb); #else srslte_vec_convert_if(input[0], h->input_conv, 0.01, 3*long_cb+12); return srslte_tdec_gen_run_all(&h->tdec_gen, h->input_conv, output[0], nof_iterations, long_cb); diff --git a/lib/src/phy/fec/turbodecoder_avx.c b/lib/src/phy/fec/turbodecoder_avx.c index 82cceb5d2..a98279d25 100644 --- a/lib/src/phy/fec/turbodecoder_avx.c +++ b/lib/src/phy/fec/turbodecoder_avx.c @@ -153,8 +153,6 @@ void map_avx_beta(map_gen_t * s, int16_t * output[SRSLTE_TDEC_NPAR], uint32_t lo __m256i gv; int16_t *b = &s->branch[2*NCB*long_cb-16]; __m256i *gPtr = (__m256i*) b; - - __m256i bn2, bp2; /* This defines a beta computation step: * Adds and substracts the branch metrics to the previous beta step, @@ -175,10 +173,10 @@ void map_avx_beta(map_gen_t * s, int16_t * output[SRSLTE_TDEC_NPAR], uint32_t lo alphaPtr--;\ bp = _mm256_add_epi16(bp, alpha_k);\ bn = _mm256_add_epi16(bn, alpha_k);\ - bn2 = _mm256_sub_epi8(_mm256_set1_epi16(0x7FFF), bn);\ - bp2 = _mm256_sub_epi8(_mm256_set1_epi16(0x7FFF), bp);\ - output[0][k-d] = hMax0(bn2) - hMax0(bp2);\ - output[1][k-d] = hMax1(bn2) - hMax1(bp2); + bn = _mm256_sub_epi16(_mm256_set1_epi16(0x7FFF), bn);\ + bp = _mm256_sub_epi16(_mm256_set1_epi16(0x7FFF), bp);\ + output[0][k-d] = hMax0(bn) - hMax0(bp);\ + output[1][k-d] = hMax1(bn) - hMax1(bp); /* The tail does not require to load alpha or produce outputs. Only update * beta metrics accordingly */ @@ -309,7 +307,7 @@ void map_avx_alpha(map_gen_t * s, uint32_t long_cb) an = _mm256_shuffle_epi8(an, shuf_an);\ alpha_k = _mm256_max_epi16(ap, an);\ _mm256_store_si256(alphaPtr, alpha_k);\ - alphaPtr++; \ + alphaPtr++;\ /* In this loop, we compute 8 steps and normalize twice for each branch metrics memory load */ @@ -335,15 +333,62 @@ void map_avx_alpha(map_gen_t * s, uint32_t long_cb) } } -/* Compute branch metrics (gamma) */ -void map_avx_gamma(map_gen_t * h, int16_t *input, int16_t *app, int16_t *parity, uint32_t cbidx, uint32_t long_cb) +void map_sse_gamma_single(int16_t *output, int16_t *input, int16_t *app, int16_t *parity) { - __m128i res10, res20, res11, res21, res1, res2; + __m128i res00, res10, res01, res11, res0, res1; __m128i in, ap, pa, g1, g0; __m128i *inPtr = (__m128i*) input; __m128i *appPtr = (__m128i*) app; __m128i *paPtr = (__m128i*) parity; + __m128i *resPtr = (__m128i*) output; + + __m128i res00_mask = _mm_set_epi8(0xff,0xff,7,6,0xff,0xff,5,4,0xff,0xff,3,2,0xff,0xff,1,0); + __m128i res10_mask = _mm_set_epi8(0xff,0xff,15,14,0xff,0xff,13,12,0xff,0xff,11,10,0xff,0xff,9,8); + __m128i res01_mask = _mm_set_epi8(7,6,0xff,0xff,5,4,0xff,0xff,3,2,0xff,0xff,1,0,0xff,0xff); + __m128i res11_mask = _mm_set_epi8(15,14,0xff,0xff,13,12,0xff,0xff,11,10,0xff,0xff,9,8,0xff,0xff); + + in = _mm_load_si128(inPtr); + inPtr++; + pa = _mm_load_si128(paPtr); + paPtr++; + + if (appPtr) { + ap = _mm_load_si128(appPtr); + appPtr++; + in = _mm_add_epi16(ap, in); + } + + g1 = _mm_add_epi16(in, pa); + g0 = _mm_sub_epi16(in, pa); + + g1 = _mm_srai_epi16(g1, 1); + g0 = _mm_srai_epi16(g0, 1); + + res00 = _mm_shuffle_epi8(g0, res00_mask); + res10 = _mm_shuffle_epi8(g0, res10_mask); + res01 = _mm_shuffle_epi8(g1, res01_mask); + res11 = _mm_shuffle_epi8(g1, res11_mask); + + res0 = _mm_or_si128(res00, res01); + res1 = _mm_or_si128(res10, res11); + + _mm_store_si128(resPtr, res0); + resPtr++; + _mm_store_si128(resPtr, res1); + resPtr++; +} + + +/* Compute branch metrics (gamma) */ +void map_avx_gamma(map_gen_t * h, int16_t *input, int16_t *app, int16_t *parity, uint32_t cbidx, uint32_t long_cb) +{ + __m128i res10, res20, res11, res21, res1, res2; + __m256i in, ap, pa, g1, g0; + + __m256i *inPtr = (__m256i*) input; + __m256i *appPtr = (__m256i*) app; + __m256i *paPtr = (__m256i*) parity; __m128i *resPtr = (__m128i*) h->branch; if (cbidx) { @@ -351,32 +396,56 @@ void map_avx_gamma(map_gen_t * h, int16_t *input, int16_t *app, int16_t *parity, } __m128i res10_mask = _mm_set_epi8(0xff,0xff,7,6,0xff,0xff,5,4,0xff,0xff,3,2,0xff,0xff,1,0); - __m128i res20_mask = _mm_set_epi8(0xff,0xff,15,14,0xff,0xff,13,12,0xff,0xff,11,10,0xff,0xff,9,8); __m128i res11_mask = _mm_set_epi8(7,6,0xff,0xff,5,4,0xff,0xff,3,2,0xff,0xff,1,0,0xff,0xff); + + __m128i res20_mask = _mm_set_epi8(0xff,0xff,15,14,0xff,0xff,13,12,0xff,0xff,11,10,0xff,0xff,9,8); __m128i res21_mask = _mm_set_epi8(15,14,0xff,0xff,13,12,0xff,0xff,11,10,0xff,0xff,9,8,0xff,0xff); - for (int i=0;ialpha = srslte_vec_malloc(sizeof(int16_t) * (max_long_cb + SRSLTE_TCOD_TOTALTAIL + 1) * NUMSTATES * SRSLTE_TDEC_NPAR); + + h->max_par_cb = max_par_cb; + h->max_long_cb = max_long_cb; + + h->alpha = srslte_vec_malloc(sizeof(int16_t) * (max_long_cb + SRSLTE_TCOD_TOTALTAIL + 1) * NUMSTATES * h->max_par_cb); if (!h->alpha) { perror("srslte_vec_malloc"); return -1; } - h->branch = srslte_vec_malloc(sizeof(int16_t) * (max_long_cb + SRSLTE_TCOD_TOTALTAIL + 1) * NUMSTATES * SRSLTE_TDEC_NPAR); + h->branch = srslte_vec_malloc(sizeof(int16_t) * (max_long_cb + SRSLTE_TCOD_TOTALTAIL + 1) * NUMSTATES * h->max_par_cb); if (!h->branch) { perror("srslte_vec_malloc"); return -1; } - h->max_long_cb = max_long_cb; return 0; } @@ -142,15 +145,16 @@ void map_simd_dec(map_gen_t * h, int16_t * input[SRSLTE_TDEC_NPAR], int16_t *app } /* Initializes the turbo decoder object */ -int srslte_tdec_simd_init(srslte_tdec_simd_t * h, uint32_t max_long_cb) +int srslte_tdec_simd_init(srslte_tdec_simd_t * h, uint32_t max_par_cb, uint32_t max_long_cb) { int ret = -1; bzero(h, sizeof(srslte_tdec_simd_t)); uint32_t len = max_long_cb + SRSLTE_TCOD_TOTALTAIL; h->max_long_cb = max_long_cb; - - for (int i=0;imax_par_cb = max_par_cb; + + for (int i=0;imax_par_cb;i++) { h->app1[i] = srslte_vec_malloc(sizeof(int16_t) * len); if (!h->app1[i]) { perror("srslte_vec_malloc"); @@ -189,7 +193,7 @@ int srslte_tdec_simd_init(srslte_tdec_simd_t * h, uint32_t max_long_cb) } - if (map_simd_init(&h->dec, h->max_long_cb)) { + if (map_simd_init(&h->dec, h->max_par_cb, h->max_long_cb)) { goto clean_and_exit; } @@ -209,7 +213,7 @@ clean_and_exit:if (ret == -1) { void srslte_tdec_simd_free(srslte_tdec_simd_t * h) { - for (int i=0;imax_par_cb;i++) { if (h->app1[i]) { free(h->app1[i]); } @@ -333,33 +337,34 @@ void deinterleave_input_simd(srslte_tdec_simd_t *h, int16_t *input, uint32_t cbi void srslte_tdec_simd_iteration(srslte_tdec_simd_t * h, int16_t * input[SRSLTE_TDEC_NPAR], uint32_t nof_cb, uint32_t long_cb) { + int16_t *tmp_app[SRSLTE_TDEC_NPAR]; + if (h->current_cbidx >= 0) { uint16_t *inter = h->interleaver[h->current_cbidx].forward; uint16_t *deinter = h->interleaver[h->current_cbidx].reverse; - if (h->n_iter == 0) { - for (int i=0;in_iter[i] == 0) { deinterleave_input_simd(h, input[i], i, long_cb); } } // Add apriori information to decoder 1 - if (h->n_iter > 0) { - for (int i=0;in_iter[i] > 0) { srslte_vec_sub_sss(h->app1[i], h->ext1[i], h->app1[i], long_cb); } } // Run MAP DEC #1 - if (h->n_iter == 0) { - map_simd_dec(&h->dec, h->syst, NULL, h->parity0, h->ext1, nof_cb, long_cb); - } else { - map_simd_dec(&h->dec, h->syst, h->app1, h->parity0, h->ext1, nof_cb, long_cb); + for (int i=0;imax_par_cb;i++) { + tmp_app[i] = h->n_iter[i]?h->app1[i]:NULL; } + map_simd_dec(&h->dec, h->syst, tmp_app, h->parity0, h->ext1, nof_cb, long_cb); // Convert aposteriori information into extrinsic information - if (h->n_iter > 0) { - for (int i=0;in_iter[i] > 0) { srslte_vec_sub_sss(h->ext1[i], h->app1[i], h->ext1[i], long_cb); } } @@ -377,7 +382,9 @@ void srslte_tdec_simd_iteration(srslte_tdec_simd_t * h, int16_t * input[SRSLTE_T srslte_vec_lut_sss(h->ext2[i], inter, h->app1[i], long_cb); } - h->n_iter++; + for (int i=0;imax_par_cb;i++) { + h->n_iter[i]++; + } } else { fprintf(stderr, "Error CB index not set (call srslte_tdec_simd_reset() first\n"); } @@ -391,7 +398,9 @@ int srslte_tdec_simd_reset(srslte_tdec_simd_t * h, uint32_t long_cb) h->max_long_cb); return -1; } - h->n_iter = 0; + for (int i=0;imax_par_cb;i++) { + h->n_iter[i] = 0; + } h->current_cbidx = srslte_cbsegm_cbindex(long_cb); if (h->current_cbidx < 0) { fprintf(stderr, "Invalid CB length %d\n", long_cb); @@ -400,6 +409,17 @@ int srslte_tdec_simd_reset(srslte_tdec_simd_t * h, uint32_t long_cb) return 0; } +int srslte_tdec_simd_reset_cb(srslte_tdec_simd_t * h, uint32_t cb_idx) +{ + h->n_iter[cb_idx] = 0; + return 0; +} + +int srslte_tdec_simd_get_nof_iterations_cb(srslte_tdec_simd_t * h, uint32_t cb_idx) +{ + return h->n_iter[cb_idx]; +} + void tdec_simd_decision(srslte_tdec_simd_t * h, uint8_t *output, uint32_t cbidx, uint32_t long_cb) { __m128i zero = _mm_set1_epi16(0); @@ -433,7 +453,7 @@ void srslte_tdec_simd_decision(srslte_tdec_simd_t * h, uint8_t *output[SRSLTE_TD } } -void tdec_simd_decision_byte(srslte_tdec_simd_t * h, uint8_t *output, uint32_t cbidx, uint32_t long_cb) +void srslte_tdec_simd_decision_byte_cb(srslte_tdec_simd_t * h, uint8_t *output, uint32_t cbidx, uint32_t long_cb) { uint8_t mask[8] = {0x80, 0x40, 0x20, 0x10, 0x8, 0x4, 0x2, 0x1}; @@ -449,17 +469,13 @@ void tdec_simd_decision_byte(srslte_tdec_simd_t * h, uint8_t *output, uint32_t c uint8_t out7 = h->app1[cbidx][8*i+7]>0?mask[7]:0; output[i] = out0 | out1 | out2 | out3 | out4 | out5 | out6 | out7; - - //if (i<10) { - // printf("output[%d]=%d\n",i,output[i]); - //} } } void srslte_tdec_simd_decision_byte(srslte_tdec_simd_t * h, uint8_t *output[SRSLTE_TDEC_NPAR], uint32_t nof_cb, uint32_t long_cb) { for (int i=0;in_iter < nof_iterations); + } while (h->n_iter[0] < nof_iterations); srslte_tdec_simd_decision_byte(h, output, nof_cb, long_cb); diff --git a/lib/src/phy/fec/turbodecoder_simd_inter.c b/lib/src/phy/fec/turbodecoder_simd_inter.c new file mode 100644 index 000000000..05d8b2cf5 --- /dev/null +++ b/lib/src/phy/fec/turbodecoder_simd_inter.c @@ -0,0 +1,299 @@ +/** + * + * \section COPYRIGHT + * + * Copyright 2013-2015 Software Radio Systems Limited + * + * \section LICENSE + * + * This file is part of the srsLTE library. + * + * srsLTE is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License as + * published by the Free Software Foundation, either version 3 of + * the License, or (at your option) any later version. + * + * srsLTE is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * A copy of the GNU Affero General Public License can be found in + * the LICENSE file in the top-level directory of this distribution + * and at http://www.gnu.org/licenses/. + * + */ + +#include +#include +#include +#include +#include +#include + +#include "srslte/phy/fec/turbodecoder_simd_inter.h" +#include "srslte/phy/utils/vector.h" + +#define TOTALTAIL 12 + +#ifdef LV_HAVE_SSE +#include + +void map_see_inter_alpha(srslte_tdec_simd_inter_t * s, int16_t *input, int16_t *parity, uint32_t long_cb); +void map_sse_inter_beta(srslte_tdec_simd_inter_t * s, int16_t *input, int16_t *parity, int16_t * output, uint32_t long_cb); +void sse_inter_update_w(srslte_tdec_simd_inter_t *h, uint16_t *deinter, uint32_t long_cb); +void sse_inter_extract_syst1(srslte_tdec_simd_inter_t *h, uint16_t *inter, uint32_t long_cb); + + +static void map_sse_inter_dec(srslte_tdec_simd_inter_t * h, int16_t * input, int16_t * parity, int16_t * output, + uint32_t long_cb) +{ + map_see_inter_alpha(h, input, parity, long_cb); + map_sse_inter_beta(h, input, parity, output, long_cb); +} + +/************************************************ + * + * TURBO DECODER INTERFACE + * + ************************************************/ +int srslte_tdec_simd_inter_init(srslte_tdec_simd_inter_t * h, uint32_t max_par_cb, uint32_t max_long_cb) +{ + int ret = -1; + bzero(h, sizeof(srslte_tdec_simd_inter_t)); + uint32_t len = max_long_cb + 12; + + h->max_long_cb = max_long_cb; + h->max_par_cb = max_par_cb; + + h->llr1 = srslte_vec_malloc(sizeof(int16_t) * len * h->max_par_cb); + if (!h->llr1) { + perror("srslte_vec_malloc"); + goto clean_and_exit; + } + h->llr2 = srslte_vec_malloc(sizeof(int16_t) * len * h->max_par_cb); + if (!h->llr2) { + perror("srslte_vec_malloc"); + goto clean_and_exit; + } + h->w = srslte_vec_malloc(sizeof(int16_t) * len * h->max_par_cb); + if (!h->w) { + perror("srslte_vec_malloc"); + goto clean_and_exit; + } + h->syst0 = srslte_vec_malloc(sizeof(int16_t) * len * h->max_par_cb); + if (!h->syst0) { + perror("srslte_vec_malloc"); + goto clean_and_exit; + } + h->syst1 = srslte_vec_malloc(sizeof(int16_t) * len * h->max_par_cb); + if (!h->syst1) { + perror("srslte_vec_malloc"); + goto clean_and_exit; + } + h->parity0 = srslte_vec_malloc(sizeof(int16_t) * len * h->max_par_cb); + if (!h->parity0) { + perror("srslte_vec_malloc"); + goto clean_and_exit; + } + h->parity1 = srslte_vec_malloc(sizeof(int16_t) * len * h->max_par_cb); + if (!h->parity1) { + perror("srslte_vec_malloc"); + goto clean_and_exit; + } + h->alpha = srslte_vec_malloc(sizeof(int16_t) * 8*(len+12) * h->max_par_cb); + if (!h->alpha) { + perror("srslte_vec_malloc"); + goto clean_and_exit; + } + + for (int i=0;iinterleaver[i], srslte_cbsegm_cbsize(i)) < 0) { + goto clean_and_exit; + } + srslte_tc_interl_LTE_gen(&h->interleaver[i], srslte_cbsegm_cbsize(i)); + } + h->current_cbidx = -1; + ret = 0; +clean_and_exit:if (ret == -1) { + srslte_tdec_simd_inter_free(h); + } + return ret; +} + +void srslte_tdec_simd_inter_free(srslte_tdec_simd_inter_t * h) +{ + if (h->llr1) { + free(h->llr1); + } + if (h->llr2) { + free(h->llr2); + } + if (h->w) { + free(h->w); + } + if (h->syst0) { + free(h->syst0); + } + if (h->syst1) { + free(h->syst1); + } + if (h->parity0) { + free(h->parity0); + } + if (h->parity1) { + free(h->parity1); + } + if (h->alpha) { + free(h->alpha); + } + + for (int i=0;iinterleaver[i]); + } + + bzero(h, sizeof(srslte_tdec_simd_inter_t)); +} + + +/* Deinterleave for inter-frame parallelization */ +void extract_input(srslte_tdec_simd_inter_t *h, int16_t *input, uint32_t cbidx, uint32_t long_cb) +{ + for (int i=0;isyst0[h->max_par_cb*i+cbidx] = input[3*i+0]; + h->parity0[h->max_par_cb*i+cbidx] = input[3*i+1]; + h->parity1[h->max_par_cb*i+cbidx] = input[3*i+2]; + } + for (int i = long_cb; i < long_cb + 3; i++) { + h->syst0[h->max_par_cb*i+cbidx] = input[3*long_cb + 2*(i - long_cb)]; + h->syst1[h->max_par_cb*i+cbidx] = input[3*long_cb + 2*(i - long_cb)]; + h->parity0[h->max_par_cb*i+cbidx] = input[3*long_cb + 2*(i - long_cb) + 1]; + h->parity0[h->max_par_cb*i+cbidx] = input[3*long_cb + 2*(i - long_cb) + 2]; + } +} + +void srslte_tdec_simd_inter_iteration(srslte_tdec_simd_inter_t * h, int16_t *input[SRSLTE_TDEC_NPAR], uint32_t nof_cb, uint32_t long_cb) +{ + + if (h->current_cbidx >= 0) { + + uint16_t *inter = h->interleaver[h->current_cbidx].forward; + uint16_t *deinter = h->interleaver[h->current_cbidx].reverse; + + // Prepare systematic and parity bits for MAP DEC #1 + for (int i=0;in_iter[i] == 0) { + extract_input(h, input[i], i, long_cb); + } + srslte_vec_sum_sss(h->syst0, h->w, h->syst0, long_cb*h->max_par_cb); + } + + // Run MAP DEC #1 + map_sse_inter_dec(h, h->syst0, h->parity0, h->llr1, long_cb); + + // Prepare systematic and parity bits for MAP DEC #1 + sse_inter_extract_syst1(h, inter, long_cb); + + // Run MAP DEC #2 + map_sse_inter_dec(h, h->syst1, h->parity1, h->llr2, long_cb); + + // Update a-priori LLR from the last iteration + sse_inter_update_w(h, deinter, long_cb); + + } else { + fprintf(stderr, "Error CB index not set (call srslte_tdec_simd_inter_reset() first\n"); + } +} + +int srslte_tdec_simd_inter_reset_cb(srslte_tdec_simd_inter_t * h, uint32_t cb_idx) +{ + for (int i=0;icurrent_long_cb;i++) { + h->w[h->max_par_cb*i+cb_idx] = 0; + } + return 0; +} + +int srslte_tdec_simd_inter_reset(srslte_tdec_simd_inter_t * h, uint32_t long_cb) +{ + if (long_cb > h->max_long_cb) { + fprintf(stderr, "TDEC was initialized for max_long_cb=%d\n", + h->max_long_cb); + return -1; + } + h->current_long_cb = long_cb; + h->current_cbidx = srslte_cbsegm_cbindex(long_cb); + if (h->current_cbidx < 0) { + fprintf(stderr, "Invalid CB length %d\n", long_cb); + return -1; + } + memset(h->w, 0, sizeof(int16_t) * long_cb * h->max_par_cb); + return 0; +} + +void srslte_tdec_simd_inter_decision_cb(srslte_tdec_simd_inter_t * h, uint8_t *output, uint32_t cb_idx, uint32_t long_cb) +{ + uint16_t *deinter = h->interleaver[h->current_cbidx].reverse; + uint32_t i; + for (i = 0; i < long_cb; i++) { + output[i] = (h->llr2[h->max_par_cb*deinter[i]+cb_idx] > 0) ? 1 : 0; + } +} + +void srslte_tdec_simd_inter_decision(srslte_tdec_simd_inter_t * h, uint8_t *output[SRSLTE_TDEC_NPAR], uint32_t nof_cb, uint32_t long_cb) +{ + for (int i=0;iinterleaver[h->current_cbidx].reverse; + +#define indexOf_cb(idx, cb) (h->max_par_cb*(deinter[8*i+idx])+cb) + + // long_cb is always byte aligned + for (i = 0; i < long_cb/8; i++) { + uint8_t out0 = h->llr2[indexOf_cb(0, cb_idx)]>0?mask[0]:0; + uint8_t out1 = h->llr2[indexOf_cb(1, cb_idx)]>0?mask[1]:0; + uint8_t out2 = h->llr2[indexOf_cb(2, cb_idx)]>0?mask[2]:0; + uint8_t out3 = h->llr2[indexOf_cb(3, cb_idx)]>0?mask[3]:0; + uint8_t out4 = h->llr2[indexOf_cb(4, cb_idx)]>0?mask[4]:0; + uint8_t out5 = h->llr2[indexOf_cb(5, cb_idx)]>0?mask[5]:0; + uint8_t out6 = h->llr2[indexOf_cb(6, cb_idx)]>0?mask[6]:0; + uint8_t out7 = h->llr2[indexOf_cb(7, cb_idx)]>0?mask[7]:0; + + output[i] = out0 | out1 | out2 | out3 | out4 | out5 | out6 | out7; + } +} + +void srslte_tdec_simd_inter_decision_byte(srslte_tdec_simd_inter_t * h, uint8_t *output[SRSLTE_TDEC_NPAR], uint32_t nof_cb, uint32_t long_cb) +{ + for (int i=0;i #include -#include "srslte/phy/fec/turbodecoder_sse.h" +#include "srslte/phy/fec/turbodecoder_simd.h" #include "srslte/phy/utils/vector.h" #include @@ -62,17 +62,20 @@ static void print_128i(__m128i x) { printf("]\n"); } */ +//#define use_beta_transposed_max + +#ifndef use_beta_transposed_max /* Computes the horizontal MAX from 8 16-bit integers using the minpos_epu16 SSE4.1 instruction */ static inline int16_t hMax(__m128i buffer) { - __m128i tmp1 = _mm_sub_epi8(_mm_set1_epi16(0x7FFF), buffer); + __m128i tmp1 = _mm_sub_epi16(_mm_set1_epi16(0x7FFF), buffer); __m128i tmp3 = _mm_minpos_epu16(tmp1); return (int16_t)(_mm_cvtsi128_si32(tmp3)); } /* Computes beta values */ -void map_gen_beta(map_gen_t * s, int16_t * output, uint32_t long_cb) +void map_sse_beta(map_gen_t * s, int16_t * output, uint32_t long_cb) { int k; uint32_t end = long_cb + 3; @@ -138,8 +141,8 @@ void map_gen_beta(map_gen_t * s, int16_t * output, uint32_t long_cb) alphaPtr--;\ bp = _mm_add_epi16(bp, alpha_k);\ bn = _mm_add_epi16(bn, alpha_k);\ - output[k-d] = hMax(bn) - hMax(bp); - + output[k-d] = hMax(bn)-hMax(bp); + /* The tail does not require to load alpha or produce outputs. Only update * beta metrics accordingly */ for (k=end-1; k>=long_cb; k--) { @@ -154,6 +157,7 @@ void map_gen_beta(map_gen_t * s, int16_t * output, uint32_t long_cb) for (; k >= 0; k-=8) { gv = _mm_load_si128(gPtr); gPtr--; + BETA_STEP_CNT(0,0); BETA_STEP_CNT(1,1); BETA_STEP_CNT(2,2); @@ -165,14 +169,17 @@ void map_gen_beta(map_gen_t * s, int16_t * output, uint32_t long_cb) BETA_STEP_CNT(0,4); BETA_STEP_CNT(1,5); BETA_STEP_CNT(2,6); - BETA_STEP_CNT(3,7); + BETA_STEP_CNT(3,7); + norm = _mm_shuffle_epi8(beta_k, shuf_norm); beta_k = _mm_sub_epi16(beta_k, norm); } } +#endif + /* Computes alpha metrics */ -void map_gen_alpha(map_gen_t * s, uint32_t long_cb) +void map_sse_alpha(map_gen_t * s, uint32_t long_cb) { uint32_t k; int16_t *alpha = s->alpha; @@ -261,9 +268,9 @@ void map_gen_alpha(map_gen_t * s, uint32_t long_cb) } /* Compute branch metrics (gamma) */ -void map_gen_gamma(map_gen_t * h, int16_t *input, int16_t *app, int16_t *parity, uint32_t long_cb) +void map_sse_gamma(map_gen_t * h, int16_t *input, int16_t *app, int16_t *parity, uint32_t long_cb) { - __m128i res10, res20, res11, res21, res1, res2; + __m128i res00, res10, res01, res11, res0, res1; __m128i in, ap, pa, g1, g0; __m128i *inPtr = (__m128i*) input; @@ -271,10 +278,10 @@ void map_gen_gamma(map_gen_t * h, int16_t *input, int16_t *app, int16_t *parity, __m128i *paPtr = (__m128i*) parity; __m128i *resPtr = (__m128i*) h->branch; - __m128i res10_mask = _mm_set_epi8(0xff,0xff,7,6,0xff,0xff,5,4,0xff,0xff,3,2,0xff,0xff,1,0); - __m128i res20_mask = _mm_set_epi8(0xff,0xff,15,14,0xff,0xff,13,12,0xff,0xff,11,10,0xff,0xff,9,8); - __m128i res11_mask = _mm_set_epi8(7,6,0xff,0xff,5,4,0xff,0xff,3,2,0xff,0xff,1,0,0xff,0xff); - __m128i res21_mask = _mm_set_epi8(15,14,0xff,0xff,13,12,0xff,0xff,11,10,0xff,0xff,9,8,0xff,0xff); + __m128i res00_mask = _mm_set_epi8(0xff,0xff,7,6,0xff,0xff,5,4,0xff,0xff,3,2,0xff,0xff,1,0); + __m128i res10_mask = _mm_set_epi8(0xff,0xff,15,14,0xff,0xff,13,12,0xff,0xff,11,10,0xff,0xff,9,8); + __m128i res01_mask = _mm_set_epi8(7,6,0xff,0xff,5,4,0xff,0xff,3,2,0xff,0xff,1,0,0xff,0xff); + __m128i res11_mask = _mm_set_epi8(15,14,0xff,0xff,13,12,0xff,0xff,11,10,0xff,0xff,9,8,0xff,0xff); for (int i=0;ialpha = srslte_vec_malloc(sizeof(int16_t) * (max_long_cb + SRSLTE_TCOD_TOTALTAIL + 1) * NUMSTATES); - if (!h->alpha) { - perror("srslte_vec_malloc"); - return -1; - } - h->branch = srslte_vec_malloc(sizeof(int16_t) * (max_long_cb + SRSLTE_TCOD_TOTALTAIL + 1) * NUMSTATES); - if (!h->branch) { - perror("srslte_vec_malloc"); - return -1; - } - h->max_long_cb = max_long_cb; - return 0; -} -void map_gen_free(map_gen_t * h) -{ - if (h->alpha) { - free(h->alpha); - } - if (h->branch) { - free(h->branch); - } - bzero(h, sizeof(map_gen_t)); -} -/* Runs one instance of a decoder */ -void map_gen_dec(map_gen_t * h, int16_t * input, int16_t *app, int16_t * parity, int16_t * output, - uint32_t long_cb) -{ - - // Compute branch metrics - map_gen_gamma(h, input, app, parity, long_cb); - // Forward recursion - map_gen_alpha(h, long_cb); - // Backwards recursion + LLR computation - map_gen_beta(h, output, long_cb); - -} - -/* Initializes the turbo decoder object */ -int srslte_tdec_sse_init(srslte_tdec_sse_t * h, uint32_t max_long_cb) -{ - int ret = -1; - bzero(h, sizeof(srslte_tdec_sse_t)); - uint32_t len = max_long_cb + SRSLTE_TCOD_TOTALTAIL; - h->max_long_cb = max_long_cb; - - h->app1 = srslte_vec_malloc(sizeof(int16_t) * len); - if (!h->app1) { - perror("srslte_vec_malloc"); - goto clean_and_exit; - } - h->app2 = srslte_vec_malloc(sizeof(int16_t) * len); - if (!h->app2) { - perror("srslte_vec_malloc"); - goto clean_and_exit; - } - h->ext1 = srslte_vec_malloc(sizeof(int16_t) * len); - if (!h->ext1) { - perror("srslte_vec_malloc"); - goto clean_and_exit; - } - h->ext2 = srslte_vec_malloc(sizeof(int16_t) * len); - if (!h->ext2) { - perror("srslte_vec_malloc"); - goto clean_and_exit; - } - h->syst = srslte_vec_malloc(sizeof(int16_t) * len); - if (!h->syst) { - perror("srslte_vec_malloc"); - goto clean_and_exit; - } - h->parity0 = srslte_vec_malloc(sizeof(int16_t) * len); - if (!h->parity0) { - perror("srslte_vec_malloc"); - goto clean_and_exit; - } - h->parity1 = srslte_vec_malloc(sizeof(int16_t) * len); - if (!h->parity1) { - perror("srslte_vec_malloc"); - goto clean_and_exit; - } +/*********************** + * + * This is an attempt to parallelize the horizontal max + * by doing a 8x8 tranpose of the vectors and computing max + * in cascade. However since we need to store 16 registers + * for the positive and negative values the performance is not very good + */ - if (map_gen_init(&h->dec, h->max_long_cb)) { - goto clean_and_exit; - } - for (int i=0;iinterleaver[i], srslte_cbsegm_cbsize(i)) < 0) { - goto clean_and_exit; - } - srslte_tc_interl_LTE_gen(&h->interleaver[i], srslte_cbsegm_cbsize(i)); - } - h->current_cbidx = -1; - ret = 0; -clean_and_exit:if (ret == -1) { - srslte_tdec_sse_free(h); - } - return ret; -} +#ifdef use_beta_transposed_max -void srslte_tdec_sse_free(srslte_tdec_sse_t * h) +static inline __m128i transposed_max(__m128i a, __m128i b, __m128i c, __m128i d, + __m128i e, __m128i f, __m128i g, __m128i h) { - if (h->app1) { - free(h->app1); - } - if (h->app2) { - free(h->app2); - } - if (h->ext1) { - free(h->ext1); - } - if (h->ext2) { - free(h->ext2); - } - if (h->syst) { - free(h->syst); - } - if (h->parity0) { - free(h->parity0); - } - if (h->parity1) { - free(h->parity1); - } - - map_gen_free(&h->dec); - - for (int i=0;iinterleaver[i]); - } - - bzero(h, sizeof(srslte_tdec_sse_t)); + // Transpose 8 vectors + __m128i t0 = _mm_unpacklo_epi16(a, b); + __m128i t1 = _mm_unpacklo_epi16(c, d); + __m128i t2 = _mm_unpacklo_epi16(e, f); + __m128i t3 = _mm_unpacklo_epi16(g, h); + __m128i t4 = _mm_unpackhi_epi16(a, b); + __m128i t5 = _mm_unpackhi_epi16(c, d); + __m128i t6 = _mm_unpackhi_epi16(e, f); + __m128i t7 = _mm_unpackhi_epi16(g, h); + + __m128i s0 = _mm_unpacklo_epi32(t0, t1); + __m128i s1 = _mm_unpackhi_epi32(t0, t1); + __m128i s2 = _mm_unpacklo_epi32(t2, t3); + __m128i s3 = _mm_unpackhi_epi32(t2, t3); + __m128i s4 = _mm_unpacklo_epi32(t4, t5); + __m128i s5 = _mm_unpackhi_epi32(t4, t5); + __m128i s6 = _mm_unpacklo_epi32(t6, t7); + __m128i s7 = _mm_unpackhi_epi32(t6, t7); + + __m128i x0 = _mm_unpacklo_epi64(s0, s2); + __m128i x1 = _mm_unpackhi_epi64(s0, s2); + __m128i x2 = _mm_unpacklo_epi64(s1, s3); + __m128i x3 = _mm_unpackhi_epi64(s1, s3); + __m128i x4 = _mm_unpacklo_epi64(s4, s6); + __m128i x5 = _mm_unpackhi_epi64(s4, s6); + __m128i x6 = _mm_unpacklo_epi64(s5, s7); + __m128i x7 = _mm_unpackhi_epi64(s5, s7); + + // Cascade max on the transposed vector + __m128i res = _mm_max_epi16(x0, + _mm_max_epi16(x1, + _mm_max_epi16(x2, + _mm_max_epi16(x3, + _mm_max_epi16(x4, + _mm_max_epi16(x5, + _mm_max_epi16(x6, + x7))))))); + + return res; } -/* Deinterleaves the 3 streams from the input (systematic and 2 parity bits) into - * 3 buffers ready to be used by compute_gamma() - */ -void deinterleave_input(srslte_tdec_sse_t *h, int16_t *input, uint32_t long_cb) { - uint32_t i; +void map_sse_beta(map_gen_t * s, int16_t * output, uint32_t long_cb) +{ + int k; + uint32_t end = long_cb + 3; + const __m128i *alphaPtr = (const __m128i*) s->alpha; - __m128i *inputPtr = (__m128i*) input; - __m128i in0, in1, in2; - __m128i s0, s1, s2, s; - __m128i p00, p01, p02, p0; - __m128i p10, p11, p12, p1; - - __m128i *sysPtr = (__m128i*) h->syst; - __m128i *pa0Ptr = (__m128i*) h->parity0; - __m128i *pa1Ptr = (__m128i*) h->parity1; - - // pick bits 0, 3, 6 from 1st word - __m128i s0_mask = _mm_set_epi8(0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,13,12,7,6,1,0); - // pick bits 1, 4, 7 from 2st word - __m128i s1_mask = _mm_set_epi8(0xff,0xff,0xff,0xff,15,14,9,8,3,2,0xff,0xff,0xff,0xff,0xff,0xff); - // pick bits 2, 5 from 3rd word - __m128i s2_mask = _mm_set_epi8(11,10,5,4,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff); - - // pick bits 1, 4, 7 from 1st word - __m128i p00_mask = _mm_set_epi8(0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,15,14,9,8,3,2); - // pick bits 2, 5, from 2st word - __m128i p01_mask = _mm_set_epi8(0xff,0xff,0xff,0xff,0xff,0xff,11,10,5,4,0xff,0xff,0xff,0xff,0xff,0xff); - // pick bits 0, 3, 6 from 3rd word - __m128i p02_mask = _mm_set_epi8(13,12,7,6,1,0,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff); + __m128i beta_k = _mm_set_epi16(-INF, -INF, -INF, -INF, -INF, -INF, -INF, 0); + __m128i g, alpha_k; + __m128i bn, bn_0, bn_1, bn_2, bn_3, bn_4, bn_5, bn_6, bn_7; + __m128i bp, bp_0, bp_1, bp_2, bp_3, bp_4, bp_5, bp_6, bp_7; - // pick bits 2, 5 from 1st word - __m128i p10_mask = _mm_set_epi8(0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,11,10,5,4); - // pick bits 0, 3, 6, from 2st word - __m128i p11_mask = _mm_set_epi8(0xff,0xff,0xff,0xff,0xff,0xff,13,12,7,6,1,0,0xff,0xff,0xff,0xff); - // pick bits 1, 4, 7 from 3rd word - __m128i p12_mask = _mm_set_epi8(15,14,9,8,3,2,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff); - - // Split systematic and parity bits - for (i = 0; i < long_cb/8; i++) { - - in0 = _mm_load_si128(inputPtr); inputPtr++; - in1 = _mm_load_si128(inputPtr); inputPtr++; - in2 = _mm_load_si128(inputPtr); inputPtr++; - - /* Deinterleave Systematic bits */ - s0 = _mm_shuffle_epi8(in0, s0_mask); - s1 = _mm_shuffle_epi8(in1, s1_mask); - s2 = _mm_shuffle_epi8(in2, s2_mask); - s = _mm_or_si128(s0, s1); - s = _mm_or_si128(s, s2); - - _mm_store_si128(sysPtr, s); - sysPtr++; - - /* Deinterleave parity 0 bits */ - p00 = _mm_shuffle_epi8(in0, p00_mask); - p01 = _mm_shuffle_epi8(in1, p01_mask); - p02 = _mm_shuffle_epi8(in2, p02_mask); - p0 = _mm_or_si128(p00, p01); - p0 = _mm_or_si128(p0, p02); - - _mm_store_si128(pa0Ptr, p0); - pa0Ptr++; - - /* Deinterleave parity 1 bits */ - p10 = _mm_shuffle_epi8(in0, p10_mask); - p11 = _mm_shuffle_epi8(in1, p11_mask); - p12 = _mm_shuffle_epi8(in2, p12_mask); - p1 = _mm_or_si128(p10, p11); - p1 = _mm_or_si128(p1, p12); + /* Define the shuffle constant for the positive beta */ + __m128i shuf_bp = _mm_set_epi8( + 15, 14, // 7 + 7, 6, // 3 + 5, 4, // 2 + 13, 12, // 6 + 11, 10, // 5 + 3, 2, // 1 + 1, 0, // 0 + 9, 8 // 4 + ); - _mm_store_si128(pa1Ptr, p1); - pa1Ptr++; + /* Define the shuffle constant for the negative beta */ + __m128i shuf_bn = _mm_set_epi8( + 7, 6, // 3 + 15, 14, // 7 + 13, 12, // 6 + 5, 4, // 2 + 3, 2, // 1 + 11, 10, // 5 + 9, 8, // 4 + 1, 0 // 0 + ); + + alphaPtr += long_cb-1; - } + /* Define shuffle for branch costs */ + __m128i shuf_g[4]; + shuf_g[3] = _mm_set_epi8(3,2,1,0,1,0,3,2,3,2,1,0,1,0,3,2); + shuf_g[2] = _mm_set_epi8(7,6,5,4,5,4,7,6,7,6,5,4,5,4,7,6); + shuf_g[1] = _mm_set_epi8(11,10,9,8,9,8,11,10,11,10,9,8,9,8,11,10); + shuf_g[0] = _mm_set_epi8(15,14,13,12,13,12,15,14,15,14,13,12,13,12,15,14); + __m128i gv; + int16_t *b = &s->branch[2*long_cb-8]; + __m128i *gPtr = (__m128i*) b; + /* Define shuffle for beta normalization */ + __m128i shuf_norm = _mm_set_epi8(1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0); - for (i = 0; i < 3; i++) { - h->syst[i+long_cb] = input[3*long_cb + 2*i]; - h->parity0[i+long_cb] = input[3*long_cb + 2*i + 1]; - } - for (i = 0; i < 3; i++) { - h->app2[i+long_cb] = input[3*long_cb + 6 + 2*i]; - h->parity1[i+long_cb] = input[3*long_cb + 6 + 2*i + 1]; - } - -} - -/* Runs 1 turbo decoder iteration */ -void srslte_tdec_sse_iteration(srslte_tdec_sse_t * h, int16_t * input, uint32_t long_cb) -{ - - if (h->current_cbidx >= 0) { - uint16_t *inter = h->interleaver[h->current_cbidx].forward; - uint16_t *deinter = h->interleaver[h->current_cbidx].reverse; - - if (h->n_iter == 0) { - deinterleave_input(h, input, long_cb); - } - - // Add apriori information to decoder 1 - if (h->n_iter > 0) { - srslte_vec_sub_sss(h->app1, h->ext1, h->app1, long_cb); - } - - // Run MAP DEC #1 - if (h->n_iter == 0) { - map_gen_dec(&h->dec, h->syst, NULL, h->parity0, h->ext1, long_cb); - } else { - map_gen_dec(&h->dec, h->syst, h->app1, h->parity0, h->ext1, long_cb); - } - - // Convert aposteriori information into extrinsic information - if (h->n_iter > 0) { - srslte_vec_sub_sss(h->ext1, h->app1, h->ext1, long_cb); - } - - // Interleave extrinsic output of DEC1 to form apriori info for decoder 2 - srslte_vec_lut_sss(h->ext1, deinter, h->app2, long_cb); - - // Run MAP DEC #2. 2nd decoder uses apriori information as systematic bits - map_gen_dec(&h->dec, h->app2, NULL, h->parity1, h->ext2, long_cb); - - // Deinterleaved extrinsic bits become apriori info for decoder 1 - srslte_vec_lut_sss(h->ext2, inter, h->app1, long_cb); - - h->n_iter++; - } else { - fprintf(stderr, "Error CB index not set (call srslte_tdec_sse_reset() first\n"); - } -} + /* This defines a beta computation step: + * Adds and substracts the branch metrics to the previous beta step, + * shuffles the states according to the trellis path and selects maximum state + */ +#define BETA_STEP(g) bp = _mm_add_epi16(beta_k, g);\ + bn = _mm_sub_epi16(beta_k, g);\ + bp = _mm_shuffle_epi8(bp, shuf_bp);\ + bn = _mm_shuffle_epi8(bn, shuf_bn);\ + beta_k = _mm_max_epi16(bp, bn); -/* Resets the decoder and sets the codeblock length */ -int srslte_tdec_sse_reset(srslte_tdec_sse_t * h, uint32_t long_cb) -{ - if (long_cb > h->max_long_cb) { - fprintf(stderr, "TDEC was initialized for max_long_cb=%d\n", - h->max_long_cb); - return -1; - } - h->n_iter = 0; - h->current_cbidx = srslte_cbsegm_cbindex(long_cb); - if (h->current_cbidx < 0) { - fprintf(stderr, "Invalid CB length %d\n", long_cb); - return -1; - } - return 0; -} + /* Loads the alpha metrics from memory and adds them to the temporal bn and bp + * metrics. + */ +#define BETA_STEP_CNT(c,d) g = _mm_shuffle_epi8(gv, shuf_g[c]);\ + BETA_STEP(g)\ + alpha_k = _mm_load_si128(alphaPtr);\ + alphaPtr--;\ + bp_##d = _mm_add_epi16(bp, alpha_k);\ + bn_##d = _mm_add_epi16(bn, alpha_k);\ -void srslte_tdec_sse_decision(srslte_tdec_sse_t * h, uint8_t *output, uint32_t long_cb) -{ - __m128i zero = _mm_set1_epi16(0); - __m128i lsb_mask = _mm_set1_epi16(1); + /* The tail does not require to load alpha or produce outputs. Only update + * beta metrics accordingly */ + for (k=end-1; k>=long_cb; k--) { + int16_t g0 = s->branch[2*k]; + int16_t g1 = s->branch[2*k+1]; + g = _mm_set_epi16(g1, g0, g0, g1, g1, g0, g0, g1); + BETA_STEP(g); + } - __m128i *appPtr = (__m128i*) h->app1; - __m128i *outPtr = (__m128i*) output; - __m128i ap, out, out0, out1; - - for (uint32_t i = 0; i < long_cb/16; i++) { - ap = _mm_load_si128(appPtr); appPtr++; - out0 = _mm_and_si128(_mm_cmpgt_epi16(ap, zero), lsb_mask); - ap = _mm_load_si128(appPtr); appPtr++; - out1 = _mm_and_si128(_mm_cmpgt_epi16(ap, zero), lsb_mask); + /* We inline 2 trelis steps for each normalization */ + __m128i norm; + __m128i *outPtr = (__m128i*) &output[long_cb-8]; + for (; k >= 0; k-=8) { + gv = _mm_load_si128(gPtr); + gPtr--; - out = _mm_packs_epi16(out0, out1); - _mm_store_si128(outPtr, out); - outPtr++; - } - if (long_cb%16) { - for (int i=0;i<8;i++) { - output[long_cb-8+i] = h->app1[long_cb-8+i]>0?1:0; - } - } -} - -void srslte_tdec_sse_decision_byte(srslte_tdec_sse_t * h, uint8_t *output, uint32_t long_cb) -{ - uint8_t mask[8] = {0x80, 0x40, 0x20, 0x10, 0x8, 0x4, 0x2, 0x1}; - - // long_cb is always byte aligned - for (uint32_t i = 0; i < long_cb/8; i++) { - uint8_t out0 = h->app1[8*i+0]>0?mask[0]:0; - uint8_t out1 = h->app1[8*i+1]>0?mask[1]:0; - uint8_t out2 = h->app1[8*i+2]>0?mask[2]:0; - uint8_t out3 = h->app1[8*i+3]>0?mask[3]:0; - uint8_t out4 = h->app1[8*i+4]>0?mask[4]:0; - uint8_t out5 = h->app1[8*i+5]>0?mask[5]:0; - uint8_t out6 = h->app1[8*i+6]>0?mask[6]:0; - uint8_t out7 = h->app1[8*i+7]>0?mask[7]:0; + BETA_STEP_CNT(0,0); + BETA_STEP_CNT(1,1); + BETA_STEP_CNT(2,2); + BETA_STEP_CNT(3,3); + norm = _mm_shuffle_epi8(beta_k, shuf_norm); + beta_k = _mm_sub_epi16(beta_k, norm); + gv = _mm_load_si128(gPtr); + gPtr--; + BETA_STEP_CNT(0,4); + BETA_STEP_CNT(1,5); + BETA_STEP_CNT(2,6); + BETA_STEP_CNT(3,7); + norm = _mm_shuffle_epi8(beta_k, shuf_norm); + beta_k = _mm_sub_epi16(beta_k, norm); - output[i] = out0 | out1 | out2 | out3 | out4 | out5 | out6 | out7; - } + __m128i bn_transp = transposed_max(bn_7, bn_6, bn_5, bn_4, bn_3, bn_2, bn_1, bn_0); + __m128i bp_transp = transposed_max(bp_7, bp_6, bp_5, bp_4, bp_3, bp_2, bp_1, bp_0); + __m128i outval = _mm_sub_epi16(bp_transp,bn_transp); + _mm_store_si128(outPtr, outval); + outPtr--; + } } +#endif -/* Runs nof_iterations iterations and decides the output bits */ -int srslte_tdec_sse_run_all(srslte_tdec_sse_t * h, int16_t * input, uint8_t *output, - uint32_t nof_iterations, uint32_t long_cb) -{ - if (srslte_tdec_sse_reset(h, long_cb)) { - return SRSLTE_ERROR; - } - do { - srslte_tdec_sse_iteration(h, input, long_cb); - } while (h->n_iter < nof_iterations); - srslte_tdec_sse_decision_byte(h, output, long_cb); - - return SRSLTE_SUCCESS; -} #endif diff --git a/lib/src/phy/fec/turbodecoder_sse_inter.c b/lib/src/phy/fec/turbodecoder_sse_inter.c new file mode 100644 index 000000000..bb1168368 --- /dev/null +++ b/lib/src/phy/fec/turbodecoder_sse_inter.c @@ -0,0 +1,198 @@ +/** + * + * \section COPYRIGHT + * + * Copyright 2013-2015 Software Radio Systems Limited + * + * \section LICENSE + * + * This file is part of the srsLTE library. + * + * srsLTE is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License as + * published by the Free Software Foundation, either version 3 of + * the License, or (at your option) any later version. + * + * srsLTE is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * A copy of the GNU Affero General Public License can be found in + * the LICENSE file in the top-level directory of this distribution + * and at http://www.gnu.org/licenses/. + * + */ + +#include +#include +#include +#include +#include +#include + +#include "srslte/phy/fec/turbodecoder_simd_inter.h" +#include "srslte/phy/utils/vector.h" + + +#define NCB 8 + +#define INF 10000 + +#ifdef LV_HAVE_SSE +#include + +void sse_inter_extract_syst1(srslte_tdec_simd_inter_t *h, uint16_t *inter, uint32_t long_cb) +{ + __m128i *llr1Ptr = (__m128i*) h->llr1; + __m128i *wPtr = (__m128i*) h->w; + __m128i *syst1Ptr = (__m128i*) h->syst1; + + for (int i = 0; i < long_cb; i++) { + __m128i llr1 = _mm_load_si128(&llr1Ptr[inter[i]]); + __m128i w = _mm_load_si128(&wPtr[inter[i]]); + _mm_store_si128(syst1Ptr++, _mm_sub_epi16(llr1, w)); + } +} + +void sse_inter_update_w(srslte_tdec_simd_inter_t *h, uint16_t *deinter, uint32_t long_cb) +{ + __m128i *llr1Ptr = (__m128i*) h->llr1; + __m128i *llr2Ptr = (__m128i*) h->llr2; + __m128i *wPtr = (__m128i*) h->w; + __m128i *syst1Ptr = (__m128i*) h->syst1; + + for (int i = 0; i < long_cb; i++) { + __m128i llr1 = _mm_load_si128(llr1Ptr++); + __m128i w = _mm_load_si128(wPtr++); + __m128i llr2 = _mm_load_si128(&llr2Ptr[deinter[i]]); + + _mm_store_si128(syst1Ptr++, _mm_add_epi16(w, _mm_sub_epi16(llr2, llr1))); + } +} + +/* Computes beta values */ +void map_sse_inter_beta(srslte_tdec_simd_inter_t * s, int16_t *input, int16_t *parity, int16_t * output, uint32_t long_cb) +{ + __m128i m_b[8], new[8], old[8], max1[8], max0[8]; + __m128i x, y, xy; + __m128i m1, m0; + uint32_t end = long_cb + 3; + uint32_t i; + + __m128i *inputPtr = (__m128i*) input; + __m128i *parityPtr = (__m128i*) parity; + __m128i *outputPtr = (__m128i*) output; + __m128i *alphaPtr = (__m128i*) s->alpha; + + for (int k = end - 1; k >= 0; k--) { + x = _mm_load_si128(inputPtr++); + y = _mm_load_si128(parityPtr++); + + xy = _mm_add_epi16(x,y); + + m_b[0] = _mm_add_epi16(old[4], xy); + m_b[1] = old[4]; + m_b[2] = _mm_add_epi16(old[5], y); + m_b[3] = _mm_add_epi16(old[5], x); + m_b[4] = _mm_add_epi16(old[6], x); + m_b[5] = _mm_add_epi16(old[6], y); + m_b[6] = old[7]; + m_b[7] = _mm_add_epi16(old[7], xy); + + new[0] = old[0]; + new[1] = _mm_add_epi16(old[0], xy); + new[2] = _mm_add_epi16(old[1], x); + new[3] = _mm_add_epi16(old[1], y); + new[4] = _mm_add_epi16(old[2], y); + new[5] = _mm_add_epi16(old[2], x); + new[6] = _mm_add_epi16(old[3], xy); + new[7] = old[3]; + + for (i = 0; i < 8; i++) { + __m128i alpha = _mm_load_si128(alphaPtr++); + max0[i] = _mm_add_epi16(alpha, m_b[i]); + max1[i] = _mm_add_epi16(alpha, new[i]); + } + + m1 = _mm_max_epi16(max1[0], max1[1]); + m0 = _mm_max_epi16(max0[0], max0[1]); + + for (i = 2; i < 8; i++) { + m1 = _mm_max_epi16(m1, max1[i]); + m0 = _mm_max_epi16(m0, max0[i]); + } + + for (i = 0; i < 8; i++) { + new[i] = _mm_max_epi16(m_b[i], new[i]); + old[i] = new[i]; + } + + __m128i out = _mm_sub_epi16(m1, m0); + _mm_store_si128(outputPtr++, out); + + // normalize + if ((k%4)==0) { + for (int i=1;i<8;i++) { + _mm_sub_epi16(old[i], old[0]); + } + } + } +} + +/* Computes alpha metrics */ +void map_see_inter_alpha(srslte_tdec_simd_inter_t * s, int16_t *input, int16_t *parity, uint32_t long_cb) +{ + __m128i m_b[8], new[8], old[8]; + __m128i x, y, xy; + uint32_t k; + + __m128i *inputPtr = (__m128i*) input; + __m128i *parityPtr = (__m128i*) parity; + __m128i *alphaPtr = (__m128i*) s->alpha; + + old[0] = _mm_set1_epi16(0); + for (int i = 1; i < 8; i++) { + old[i] = _mm_set1_epi16(-INF); + } + + for (k = 0; k < long_cb; k++) { + x = _mm_load_si128(inputPtr++); + y = _mm_load_si128(parityPtr++); + + xy = _mm_add_epi16(x,y); + + m_b[0] = old[0]; + m_b[1] = _mm_add_epi16(old[3], y); + m_b[2] = _mm_add_epi16(old[4], y); + m_b[3] = old[7]; + m_b[4] = old[1]; + m_b[5] = _mm_add_epi16(old[2], y); + m_b[6] = _mm_add_epi16(old[5], y); + m_b[7] = old[6]; + + new[0] = _mm_add_epi16(old[1], xy); + new[1] = _mm_add_epi16(old[2], x); + new[2] = _mm_add_epi16(old[5], x); + new[3] = _mm_add_epi16(old[6], xy); + new[4] = _mm_add_epi16(old[0], xy); + new[5] = _mm_add_epi16(old[3], x); + new[6] = _mm_add_epi16(old[4], x); + new[7] = _mm_add_epi16(old[7], xy); + + for (int i = 0; i < 8; i++) { + new[i] = _mm_max_epi16(m_b[i], new[i]); + old[i] = new[i]; + _mm_store_si128(alphaPtr++, old[i]); + } + + // normalize + if ((k%4)==0) { + for (int i=1;i<8;i++) { + _mm_sub_epi16(old[i], old[0]); + } + } + } +} + +#endif diff --git a/lib/src/phy/phch/sch.c b/lib/src/phy/phch/sch.c index ba4c8951b..5ae539a22 100644 --- a/lib/src/phy/phch/sch.c +++ b/lib/src/phy/phch/sch.c @@ -311,8 +311,124 @@ static int encode_tb(srslte_sch_t *q, return encode_tb_off(q, soft_buffer, cb_segm, Qm, rv, nof_e_bits, data, e_bits, 0); } +bool decode_tb_cb(srslte_sch_t *q, + srslte_softbuffer_rx_t *softbuffer, srslte_cbsegm_t *cb_segm, + uint32_t Qm, uint32_t rv, uint32_t nof_e_bits, + int16_t *e_bits, uint8_t *data, + uint32_t cb_size_group, uint8_t parity[3]) +{ + + bool cb_map[SRSLTE_MAX_CODEBLOCKS]; + + bzero(cb_map, sizeof(bool)*SRSLTE_MAX_CODEBLOCKS); + + uint32_t cb_idx[SRSLTE_TDEC_NPAR]; + int16_t *decoder_input[SRSLTE_TDEC_NPAR]; + + uint32_t nof_cb = cb_size_group?cb_segm->C2:cb_segm->C1; + uint32_t first_cb = cb_size_group?cb_segm->C1:0; + uint32_t cb_len = cb_size_group?cb_segm->K2:cb_segm->K1; + uint32_t cb_len_idx = cb_size_group?cb_segm->K2_idx:cb_segm->K1_idx; + + uint32_t rlen = cb_segm->C==1?cb_len:(cb_len-24); + uint32_t Gp = nof_e_bits / Qm; + uint32_t gamma = cb_segm->C>0?Gp%cb_segm->C:Gp; + uint32_t n_e = Qm * (Gp/cb_segm->C); + if (nof_cb > SRSLTE_MAX_CODEBLOCKS) { + fprintf(stderr, "Error SRSLTE_MAX_CODEBLOCKS=%d\n", SRSLTE_MAX_CODEBLOCKS); + return false; + } + + for (int i=0;idecoder, cb_len); + + uint32_t remaining_cb = nof_cb; + + while(remaining_cb>0) { + uint32_t npar = SRSLTE_MIN(remaining_cb, SRSLTE_TDEC_NPAR); + + // Unratematch the codeblocks left to decode + for (int i=0;i cb_segm->C - gamma) { + n_e2 = n_e+Qm; + rp = (cb_segm->C - gamma)*n_e + (cb_idx[i]-(cb_segm->C - gamma))*n_e2; + } + + INFO("CB %d: rp=%d, n_e=%d, i=%d\n", cb_idx[i], rp, n_e2, i); + if (srslte_rm_turbo_rx_lut(&e_bits[rp], softbuffer->buffer_f[cb_idx[i]], n_e2, cb_len_idx, rv)) { + fprintf(stderr, "Error in rate matching\n"); + return SRSLTE_ERROR; + } + + decoder_input[i] = softbuffer->buffer_f[cb_idx[i]]; + } + + // Run 1 iteration for up to TDEC_NPAR codeblocks + if (SRSLTE_TDEC_NPAR > 1) { + INFO("Processing %d CBs, index %d,%d\n", npar, cb_idx[0], cb_idx[1]); + } + srslte_tdec_iteration_par(&q->decoder, decoder_input, npar, cb_len); + + // Decide output bits and compute CRC + for (int i=0;idecoder, q->cb_in, i, cb_len); + uint32_t len_crc; + srslte_crc_t *crc_ptr; + + if (cb_segm->C > 1) { + len_crc = cb_len; + crc_ptr = &q->crc_cb; + } else { + len_crc = cb_segm->tbs+24; + crc_ptr = &q->crc_tb; + } + + // CRC is OK + if (!srslte_crc_checksum_byte(crc_ptr, q->cb_in, len_crc)) { + + uint32_t wp = cb_idx[i]*rlen; + + // If it's not the last CB, copy data to another buffer and remove CRC */ + if (cb_idx[i] < cb_segm->C - 1) { + memcpy(&data[wp/8], q->cb_in, rlen/8 * sizeof(uint8_t)); + // If it's the last CB Append Transport Block parity bits to the last CB + } else { + memcpy(&data[wp/8], q->cb_in, (rlen - 24)/8 * sizeof(uint8_t)); + memcpy(parity, &q->cb_in[(rlen - 24)/8], 3 * sizeof(uint8_t)); + } + + // Reset number of iterations for that CB in the decoder + srslte_tdec_reset_cb(&q->decoder, i); + remaining_cb--; + + // CRC is error and exceeded maximum iterations for this CB. + // Early stop the whole transport block. + } else if (srslte_tdec_get_nof_iterations_cb(&q->decoder, i) >= q->max_iterations) { + INFO("CB %d: Error. TB is erroneous.\n", cb_idx[i]); + return false; + } + } + } + + return true; +} /** * Decode a transport block according to 36.212 5.3.2 @@ -332,10 +448,6 @@ static int decode_tb(srslte_sch_t *q, uint32_t Qm, uint32_t rv, uint32_t nof_e_bits, int16_t *e_bits, uint8_t *data) { - uint8_t parity[3] = {0, 0, 0}; - uint32_t par_rx, par_tx; - uint32_t i; - uint32_t cb_len, rp, wp, rlen, n_e; if (q != NULL && data != NULL && @@ -343,17 +455,11 @@ static int decode_tb(srslte_sch_t *q, e_bits != NULL && cb_segm != NULL) { - + if (cb_segm->tbs == 0 || cb_segm->C == 0) { return SRSLTE_SUCCESS; } - rp = 0; - rp = 0; - wp = 0; - uint32_t Gp = nof_e_bits / Qm; - uint32_t gamma=Gp; - if (cb_segm->F) { fprintf(stderr, "Error filler bits are not supported. Use standard TBS\n"); return SRSLTE_ERROR; @@ -363,128 +469,41 @@ static int decode_tb(srslte_sch_t *q, fprintf(stderr, "Error number of CB (%d) exceeds soft buffer size (%d CBs)\n", cb_segm->C, softbuffer->max_cb); return SRSLTE_ERROR; } + + uint8_t parity[3] = {0, 0, 0}; + bool crc_ok = true; - if (cb_segm->C>0) { - gamma = Gp%cb_segm->C; - } + uint32_t nof_cb_groups = cb_segm->C2>0?2:1; - bool early_stop = true; - for (i = 0; i < cb_segm->C && early_stop; i++) { - - /* Get read/write lengths */ - uint32_t cblen_idx; - if (i < cb_segm->C2) { - cb_len = cb_segm->K2; - cblen_idx = cb_segm->K2_idx; - } else { - cb_len = cb_segm->K1; - cblen_idx = cb_segm->K1_idx; - } - - if (cb_segm->C == 1) { - rlen = cb_len; - } else { - rlen = cb_len - 24; - } - - if (i <= cb_segm->C - gamma - 1) { - n_e = Qm * (Gp/cb_segm->C); - } else { - n_e = Qm * ((uint32_t) ceilf((float) Gp/cb_segm->C)); - } - - /* Rate Unmatching */ - if (srslte_rm_turbo_rx_lut(&e_bits[rp], softbuffer->buffer_f[i], n_e, cblen_idx, rv)) { - fprintf(stderr, "Error in rate matching\n"); - return SRSLTE_ERROR; - } - - if (SRSLTE_VERBOSE_ISDEBUG()) { - char tmpstr[64]; - snprintf(tmpstr,64,"rmout_%d.dat",i); - DEBUG("SAVED FILE %s: Encoded turbo code block %d\n", tmpstr, i); - srslte_vec_save_file(tmpstr, softbuffer->buffer_f[i], (3*cb_len+12)*sizeof(int16_t)); - } - - /* Turbo Decoding with CRC-based early stopping */ - q->nof_iterations = 0; - uint32_t len_crc; - srslte_crc_t *crc_ptr; - early_stop = false; - - srslte_tdec_reset(&q->decoder, cb_len); - - do { - srslte_tdec_iteration(&q->decoder, softbuffer->buffer_f[i], cb_len); - q->nof_iterations++; - - if (cb_segm->C > 1) { - len_crc = cb_len; - crc_ptr = &q->crc_cb; - } else { - len_crc = cb_segm->tbs+24; - crc_ptr = &q->crc_tb; - } - - srslte_tdec_decision_byte(&q->decoder, q->cb_in, cb_len); - - /* Check Codeblock CRC and stop early if correct */ - if (!srslte_crc_checksum_byte(crc_ptr, q->cb_in, len_crc)) { - early_stop = true; - } - - } while (q->nof_iterations < q->max_iterations && !early_stop); - q->average_nof_iterations = SRSLTE_VEC_EMA((float) q->nof_iterations, q->average_nof_iterations, 0.2); - - INFO("CB#%d: cb_len: %d, rlen: %d, wp: %d, rp: %d, E: %d, n_iters=%d\n", i, - cb_len, rlen, wp, rp, n_e, q->nof_iterations); - - - // If CB CRC is not correct, early_stop will be false and wont continue with rest of CBs - - /* Copy data to another buffer, removing the Codeblock CRC */ - if (i < cb_segm->C - 1) { - memcpy(&data[wp/8], q->cb_in, rlen/8 * sizeof(uint8_t)); - } else { - /* Append Transport Block parity bits to the last CB */ - memcpy(&data[wp/8], q->cb_in, (rlen - 24)/8 * sizeof(uint8_t)); - memcpy(parity, &q->cb_in[(rlen - 24)/8], 3 * sizeof(uint8_t)); - } - - if (SRSLTE_VERBOSE_ISDEBUG()) { - early_stop = true; - } - - /* Set read/write pointers */ - wp += rlen; - rp += n_e; + // Process Codeblocks in groups of equal CB size to parallelize according to SRSLTE_TDEC_NPAR + for (uint32_t i=0;icrc_tb, data, cb_segm->tbs); // check parity bits par_tx = ((uint32_t) parity[0])<<16 | ((uint32_t) parity[1])<<8 | ((uint32_t) parity[2]); - + if (!par_rx) { - INFO("Warning: Received all-zero transport block\n\n", 0); + INFO("Warning: Received all-zero transport block\n\n",0); } if (par_rx == par_tx) { - INFO("TB decoded OK\n",i); + INFO("TB decoded OK\n",0); return SRSLTE_SUCCESS; } else { INFO("Error in TB parity: par_tx=0x%x, par_rx=0x%x\n", par_tx, par_rx); return SRSLTE_ERROR; } - - } + } else { + return SRSLTE_ERROR; + } } else { return SRSLTE_ERROR_INVALID_INPUTS; }