From c1ef9da32a17ee5e4a45fd79c84170fefe676950 Mon Sep 17 00:00:00 2001 From: Ismael Gomez Date: Sun, 11 Jun 2017 16:59:31 -0500 Subject: [PATCH] avx turbo decoder working in tests --- .../srslte/phy/fec/turbodecoder_simd.h | 135 +++++ lib/src/phy/fec/test/turbodecoder_test.c | 55 +- lib/src/phy/fec/turbodecoder.c | 67 ++- lib/src/phy/fec/turbodecoder_avx.c | 401 +++++++++++++++ lib/src/phy/fec/turbodecoder_simd.c | 486 ++++++++++++++++++ lib/src/phy/fec/turbodecoder_sse.c | 15 +- 6 files changed, 1121 insertions(+), 38 deletions(-) create mode 100644 lib/include/srslte/phy/fec/turbodecoder_simd.h create mode 100644 lib/src/phy/fec/turbodecoder_avx.c create mode 100644 lib/src/phy/fec/turbodecoder_simd.c diff --git a/lib/include/srslte/phy/fec/turbodecoder_simd.h b/lib/include/srslte/phy/fec/turbodecoder_simd.h new file mode 100644 index 000000000..7ca3914ca --- /dev/null +++ b/lib/include/srslte/phy/fec/turbodecoder_simd.h @@ -0,0 +1,135 @@ +/** + * + * \section COPYRIGHT + * + * Copyright 2013-2015 Software Radio Systems Limited + * + * \section LICENSE + * + * This file is part of the srsLTE library. + * + * srsLTE is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License as + * published by the Free Software Foundation, either version 3 of + * the License, or (at your option) any later version. + * + * srsLTE is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * A copy of the GNU Affero General Public License can be found in + * the LICENSE file in the top-level directory of this distribution + * and at http://www.gnu.org/licenses/. + * + */ + +/********************************************************************************************** + * File: turbodecoder.h + * + * Description: Turbo Decoder. + * Parallel Concatenated Convolutional Code (PCCC) with two 8-state constituent + * encoders and one turbo code internal interleaver. The coding rate of turbo + * encoder is 1/3. + * MAP_GEN is the MAX-LOG-MAP generic implementation of the decoder. + * + * Reference: 3GPP TS 36.212 version 10.0.0 Release 10 Sec. 5.1.3.2 + *********************************************************************************************/ + +#ifndef TURBODECODER_SSE_ +#define TURBODECODER_SSE_ + +#include "srslte/config.h" +#include "srslte/phy/fec/tc_interl.h" +#include "srslte/phy/fec/cbsegm.h" + +//#define ENABLE_SIMD_INTER + +// The constant SRSLTE_TDEC_NPAR defines the maximum number of parallel CB supported by all SIMD decoders +#ifdef ENABLE_SIMD_INTER + #include "srslte/phy/fec/turbodecoder_simd_inter.h" + #if LV_HAVE_AVX2 + #define SRSLTE_TDEC_NPAR_INTRA 2 + #else + #define SRSLTE_TDEC_NPAR_INTRA 1 + #endif +#else + #if LV_HAVE_AVX2 + #define SRSLTE_TDEC_NPAR 2 + #else + #define SRSLTE_TDEC_NPAR 1 + #endif +#endif + +#define SRSLTE_TCOD_RATE 3 +#define SRSLTE_TCOD_TOTALTAIL 12 + +#define SRSLTE_TCOD_MAX_LEN_CB 6144 +#define SRSLTE_TCOD_MAX_LEN_CODED (SRSLTE_TCOD_RATE*SRSLTE_TCOD_MAX_LEN_CB+SRSLTE_TCOD_TOTALTAIL) + +typedef struct SRSLTE_API { + uint32_t max_long_cb; + uint32_t max_par_cb; + int16_t *alpha; + int16_t *branch; +} map_gen_t; + +typedef struct SRSLTE_API { + uint32_t max_long_cb; + uint32_t max_par_cb; + + map_gen_t dec; + + int16_t *app1[SRSLTE_TDEC_NPAR]; + int16_t *app2[SRSLTE_TDEC_NPAR]; + int16_t *ext1[SRSLTE_TDEC_NPAR]; + int16_t *ext2[SRSLTE_TDEC_NPAR]; + int16_t *syst[SRSLTE_TDEC_NPAR]; + int16_t *parity0[SRSLTE_TDEC_NPAR]; + int16_t *parity1[SRSLTE_TDEC_NPAR]; + + int cb_mask; + int current_cbidx; + srslte_tc_interl_t interleaver[SRSLTE_NOF_TC_CB_SIZES]; + int n_iter[SRSLTE_TDEC_NPAR]; +} srslte_tdec_simd_t; + +SRSLTE_API int srslte_tdec_simd_init(srslte_tdec_simd_t * h, + uint32_t max_par_cb, + uint32_t max_long_cb); + +SRSLTE_API void srslte_tdec_simd_free(srslte_tdec_simd_t * h); + +SRSLTE_API int srslte_tdec_simd_reset(srslte_tdec_simd_t * h, + uint32_t long_cb); + +SRSLTE_API int srslte_tdec_simd_get_nof_iterations_cb(srslte_tdec_simd_t * h, + uint32_t cb_idx); + +SRSLTE_API int srslte_tdec_simd_reset_cb(srslte_tdec_simd_t * h, + uint32_t cb_idx); + +SRSLTE_API void srslte_tdec_simd_iteration(srslte_tdec_simd_t * h, + int16_t * input[SRSLTE_TDEC_NPAR], + uint32_t long_cb); + +SRSLTE_API void srslte_tdec_simd_decision(srslte_tdec_simd_t * h, + uint8_t *output[SRSLTE_TDEC_NPAR], + uint32_t long_cb); + +SRSLTE_API void srslte_tdec_simd_decision_byte(srslte_tdec_simd_t * h, + uint8_t *output[SRSLTE_TDEC_NPAR], + uint32_t long_cb); + +SRSLTE_API void srslte_tdec_simd_decision_byte_cb(srslte_tdec_simd_t * h, + uint8_t *output, + uint32_t cbidx, + uint32_t long_cb); + +SRSLTE_API int srslte_tdec_simd_run_all(srslte_tdec_simd_t * h, + int16_t * input[SRSLTE_TDEC_NPAR], + uint8_t *output[SRSLTE_TDEC_NPAR], + uint32_t nof_iterations, + uint32_t long_cb); + +#endif diff --git a/lib/src/phy/fec/test/turbodecoder_test.c b/lib/src/phy/fec/test/turbodecoder_test.c index 1d4fd7024..dad9a07c8 100644 --- a/lib/src/phy/fec/test/turbodecoder_test.c +++ b/lib/src/phy/fec/test/turbodecoder_test.c @@ -46,6 +46,7 @@ uint32_t seed = 0; int K = -1; #define MAX_ITERATIONS 10 +int nof_cb = 1; int nof_iterations = MAX_ITERATIONS; int test_known_data = 0; int test_errors = 0; @@ -59,6 +60,7 @@ void usage(char *prog) { printf("Usage: %s [nlesv]\n", prog); printf( "\t-k Test with known data (ignores frame_length) [Default disabled]\n"); + printf("\t-c nof_cb in parallel [Default %d]\n", nof_cb); printf("\t-i nof_iterations [Default %d]\n", nof_iterations); printf("\t-n nof_frames [Default %d]\n", nof_frames); printf("\t-N nof_repetitions [Default %d]\n", nof_repetitions); @@ -70,8 +72,11 @@ void usage(char *prog) { void parse_args(int argc, char **argv) { int opt; - while ((opt = getopt(argc, argv, "inNlstvekt")) != -1) { + while ((opt = getopt(argc, argv, "cinNlstvekt")) != -1) { switch (opt) { + case 'c': + nof_cb = atoi(argv[optind]); + break; case 'n': nof_frames = atoi(argv[optind]); break; @@ -112,7 +117,7 @@ int main(int argc, char **argv) { float *llr; short *llr_s; uint8_t *llr_c; - uint8_t *data_tx, *data_rx, *data_rx_bytes, *symbols; + uint8_t *data_tx, *data_rx, *data_rx_bytes[SRSLTE_TDEC_NPAR], *symbols; uint32_t i, j; float var[SNR_POINTS]; uint32_t snr_points; @@ -154,10 +159,12 @@ int main(int argc, char **argv) { perror("malloc"); exit(-1); } - data_rx_bytes = srslte_vec_malloc(frame_length * sizeof(uint8_t)); - if (!data_rx_bytes) { - perror("malloc"); - exit(-1); + for (int cb=0;cbtdec_sse, max_long_cb); + return srslte_tdec_simd_init(&h->tdec_simd, max_long_cb); #else h->input_conv = srslte_vec_malloc(sizeof(float) * (3*max_long_cb+12)); if (!h->input_conv) { @@ -56,7 +56,7 @@ int srslte_tdec_init(srslte_tdec_t * h, uint32_t max_long_cb) { void srslte_tdec_free(srslte_tdec_t * h) { #ifdef LV_HAVE_SSE - srslte_tdec_sse_free(&h->tdec_sse); + srslte_tdec_simd_free(&h->tdec_simd); #else if (h->input_conv) { free(h->input_conv); @@ -68,45 +68,74 @@ void srslte_tdec_free(srslte_tdec_t * h) { int srslte_tdec_reset(srslte_tdec_t * h, uint32_t long_cb) { #ifdef LV_HAVE_SSE - return srslte_tdec_sse_reset(&h->tdec_sse, long_cb); + return srslte_tdec_simd_reset(&h->tdec_simd, long_cb); #else return srslte_tdec_gen_reset(&h->tdec_gen, long_cb); #endif } -void srslte_tdec_iteration(srslte_tdec_t * h, int16_t* input, uint32_t long_cb) { +void srslte_tdec_iteration_par(srslte_tdec_t * h, int16_t* input[SRSLTE_TDEC_NPAR], uint32_t nof_cb, uint32_t long_cb) { #ifdef LV_HAVE_SSE - srslte_tdec_sse_iteration(&h->tdec_sse, input, long_cb); + srslte_tdec_simd_iteration(&h->tdec_simd, input, nof_cb, long_cb); #else - srslte_vec_convert_if(input, h->input_conv, 0.01, 3*long_cb+12); + srslte_vec_convert_if(input[0], h->input_conv, 0.01, 3*long_cb+12); srslte_tdec_gen_iteration(&h->tdec_gen, h->input_conv, long_cb); #endif } -void srslte_tdec_decision(srslte_tdec_t * h, uint8_t *output, uint32_t long_cb) { +void srslte_tdec_iteration(srslte_tdec_t * h, int16_t* input, uint32_t long_cb) { + int16_t *input_par[SRSLTE_TDEC_NPAR]; + input_par[0] = input; + return srslte_tdec_iteration_par(h, input_par, 1, long_cb); +} + +void srslte_tdec_decision_par(srslte_tdec_t * h, uint8_t *output[SRSLTE_TDEC_NPAR], uint32_t nof_cb, uint32_t long_cb) { #ifdef LV_HAVE_SSE - return srslte_tdec_sse_decision(&h->tdec_sse, output, long_cb); + return srslte_tdec_simd_decision(&h->tdec_simd, output, nof_cb, long_cb); #else - return srslte_tdec_gen_decision(&h->tdec_gen, output, long_cb); + return srslte_tdec_gen_decision(&h->tdec_gen, output[0], long_cb); #endif +} + +void srslte_tdec_decision(srslte_tdec_t * h, uint8_t *output, uint32_t long_cb) { + uint8_t *output_par[SRSLTE_TDEC_NPAR]; + output_par[0] = output; + srslte_tdec_decision_par(h, output_par, 1, long_cb); +} +void srslte_tdec_decision_byte_par(srslte_tdec_t * h, uint8_t *output[SRSLTE_TDEC_NPAR], uint32_t nof_cb, uint32_t long_cb) { +#ifdef LV_HAVE_SSE + srslte_tdec_simd_decision_byte(&h->tdec_simd, output, nof_cb, long_cb); +#else + srslte_tdec_gen_decision_byte(&h->tdec_gen, output[0], long_cb); +#endif } void srslte_tdec_decision_byte(srslte_tdec_t * h, uint8_t *output, uint32_t long_cb) { + uint8_t *output_par[SRSLTE_TDEC_NPAR]; + output_par[0] = output; + srslte_tdec_decision_byte_par(h, output_par, 1, long_cb); +} + +int srslte_tdec_run_all_par(srslte_tdec_t * h, int16_t * input[SRSLTE_TDEC_NPAR], + uint8_t *output[SRSLTE_TDEC_NPAR], + uint32_t nof_iterations, uint32_t nof_cb, uint32_t long_cb) { #ifdef LV_HAVE_SSE - return srslte_tdec_sse_decision_byte(&h->tdec_sse, output, long_cb); + return srslte_tdec_simd_run_all(&h->tdec_simd, input, output, nof_iterations, nof_cb, long_cb); #else - return srslte_tdec_gen_decision_byte(&h->tdec_gen, output, long_cb); + srslte_vec_convert_if(input[0], h->input_conv, 0.01, 3*long_cb+12); + return srslte_tdec_gen_run_all(&h->tdec_gen, h->input_conv, output[0], nof_iterations, long_cb); #endif - } int srslte_tdec_run_all(srslte_tdec_t * h, int16_t * input, uint8_t *output, uint32_t nof_iterations, uint32_t long_cb) { -#ifdef LV_HAVE_SSE - return srslte_tdec_sse_run_all(&h->tdec_sse, input, output, nof_iterations, long_cb); -#else - srslte_vec_convert_if(input, h->input_conv, 0.01, 3*long_cb+12); - return srslte_tdec_gen_run_all(&h->tdec_gen, h->input_conv, output, nof_iterations, long_cb); -#endif + uint8_t *output_par[SRSLTE_TDEC_NPAR]; + output_par[0] = output; + int16_t *input_par[SRSLTE_TDEC_NPAR]; + input_par[0] = input; + + return srslte_tdec_run_all_par(h, input_par, output_par, nof_iterations, 1, long_cb); } + + diff --git a/lib/src/phy/fec/turbodecoder_avx.c b/lib/src/phy/fec/turbodecoder_avx.c new file mode 100644 index 000000000..82cceb5d2 --- /dev/null +++ b/lib/src/phy/fec/turbodecoder_avx.c @@ -0,0 +1,401 @@ +/** + * + * \section COPYRIGHT + * + * Copyright 2013-2015 Software Radio Systems Limited + * + * \section LICENSE + * + * This file is part of the srsLTE library. + * + * srsLTE is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License as + * published by the Free Software Foundation, either version 3 of + * the License, or (at your option) any later version. + * + * srsLTE is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * A copy of the GNU Affero General Public License can be found in + * the LICENSE file in the top-level directory of this distribution + * and at http://www.gnu.org/licenses/. + * + */ + +#include +#include +#include +#include +#include +#include + +#include "srslte/phy/fec/turbodecoder_simd.h" +#include "srslte/phy/utils/vector.h" + +#include + +#define NUMSTATES 8 +#define NINPUTS 2 +#define TAIL 3 +#define TOTALTAIL 12 + +#define INF 10000 +#define ZERO 0 + + +#ifdef LV_HAVE_AVX2 + +#include +#include + + +// Number of CB processed in parllel in AVX +#define NCB 2 + +/* +static void print_256i(__m256i x) { + int16_t *s = (int16_t*) &x; + printf("[%d", s[0]); + for (int i=1;i<16;i++) { + printf(",%d", s[i]); + } + printf("]\n"); +} +*/ + +/* Computes the horizontal MAX from 8 16-bit integers using the minpos_epu16 SSE4.1 instruction */ +static inline int16_t hMax0(__m256i masked_value) +{ + __m128i tmp1 = _mm256_extractf128_si256(masked_value, 0); + __m128i tmp3 = _mm_minpos_epu16(tmp1); + return (int16_t)(_mm_cvtsi128_si32(tmp3)); +} + +static inline int16_t hMax1(__m256i masked_value) +{ + __m128i tmp1 = _mm256_extractf128_si256(masked_value, 1); + __m128i tmp3 = _mm_minpos_epu16(tmp1); + return (int16_t)(_mm_cvtsi128_si32(tmp3)); +} + +/* Computes beta values */ +void map_avx_beta(map_gen_t * s, int16_t * output[SRSLTE_TDEC_NPAR], uint32_t long_cb) +{ + int k; + uint32_t end = long_cb + 3; + const __m256i *alphaPtr = (const __m256i*) s->alpha; + + __m256i beta_k = _mm256_set_epi16(-INF, -INF, -INF, -INF, -INF, -INF, -INF, 0, -INF, -INF, -INF, -INF, -INF, -INF, -INF, 0); + __m256i g, bp, bn, alpha_k; + + /* Define the shuffle constant for the positive beta */ + __m256i shuf_bp = _mm256_set_epi8( + // 1st CB + 15+16, 14+16, // 7 + 7+16, 6+16, // 3 + 5+16, 4+16, // 2 + 13+16, 12+16, // 6 + 11+16, 10+16, // 5 + 3+16, 2+16, // 1 + 1+16, 0+16, // 0 + 9+16, 8+16, // 4 + + // 2nd CB + 15, 14, // 7 + 7, 6, // 3 + 5, 4, // 2 + 13, 12, // 6 + 11, 10, // 5 + 3, 2, // 1 + 1, 0, // 0 + 9, 8 // 4 + ); + + /* Define the shuffle constant for the negative beta */ + __m256i shuf_bn = _mm256_set_epi8( + 7+16, 6+16, // 3 + 15+16, 14+16, // 7 + 13+16, 12+16, // 6 + 5+16, 4+16, // 2 + 3+16, 2+16, // 1 + 11+16, 10+16, // 5 + 9+16, 8+16, // 4 + 1+16, 0+16, // 0 + + 7, 6, // 3 + 15, 14, // 7 + 13, 12, // 6 + 5, 4, // 2 + 3, 2, // 1 + 11, 10, // 5 + 9, 8, // 4 + 1, 0 // 0 + ); + + alphaPtr += long_cb-1; + + /* Define shuffle for branch costs */ + __m256i shuf_g[4]; + shuf_g[3] = _mm256_set_epi8(3+16,2+16,1+16,0+16,1+16,0+16,3+16,2+16,3+16,2+16,1+16,0+16,1+16,0+16,3+16,2+16, + 3,2,1,0,1,0,3,2,3,2,1,0,1,0,3,2); + shuf_g[2] = _mm256_set_epi8(7+16,6+16,5+16,4+16,5+16,4+16,7+16,6+16,7+16,6+16,5+16,4+16,5+16,4+16,7+16,6+16, + 7,6,5,4,5,4,7,6,7,6,5,4,5,4,7,6); + shuf_g[1] = _mm256_set_epi8(11+16,10+16,9+16,8+16,9+16,8+16,11+16,10+16,11+16,10+16,9+16,8+16,9+16,8+16,11+16,10+16, + 11,10,9,8,9,8,11,10,11,10,9,8,9,8,11,10); + shuf_g[0] = _mm256_set_epi8(15+16,14+16,13+16,12+16,13+16,12+16,15+16,14+16,15+16,14+16,13+16,12+16,13+16,12+16,15+16,14+16, + 15,14,13,12,13,12,15,14,15,14,13,12,13,12,15,14); + + /* Define shuffle for beta normalization */ + __m256i shuf_norm = _mm256_set_epi8(17,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0); + + __m256i gv; + int16_t *b = &s->branch[2*NCB*long_cb-16]; + __m256i *gPtr = (__m256i*) b; + + __m256i bn2, bp2; + + /* This defines a beta computation step: + * Adds and substracts the branch metrics to the previous beta step, + * shuffles the states according to the trellis path and selects maximum state + */ +#define BETA_STEP(g) bp = _mm256_add_epi16(beta_k, g);\ + bn = _mm256_sub_epi16(beta_k, g);\ + bp = _mm256_shuffle_epi8(bp, shuf_bp);\ + bn = _mm256_shuffle_epi8(bn, shuf_bn);\ + beta_k = _mm256_max_epi16(bp, bn); + + /* Loads the alpha metrics from memory and adds them to the temporal bn and bp + * metrics. Then computes horizontal maximum of both metrics and computes difference + */ +#define BETA_STEP_CNT(c,d) g = _mm256_shuffle_epi8(gv, shuf_g[c]);\ + BETA_STEP(g)\ + alpha_k = _mm256_load_si256(alphaPtr);\ + alphaPtr--;\ + bp = _mm256_add_epi16(bp, alpha_k);\ + bn = _mm256_add_epi16(bn, alpha_k);\ + bn2 = _mm256_sub_epi8(_mm256_set1_epi16(0x7FFF), bn);\ + bp2 = _mm256_sub_epi8(_mm256_set1_epi16(0x7FFF), bp);\ + output[0][k-d] = hMax0(bn2) - hMax0(bp2);\ + output[1][k-d] = hMax1(bn2) - hMax1(bp2); + + /* The tail does not require to load alpha or produce outputs. Only update + * beta metrics accordingly */ + for (k=end-1; k>=long_cb; k--) { + int16_t g0_1 = s->branch[2*NCB*k]; + int16_t g1_1 = s->branch[2*NCB*k+1]; + int16_t g0_2 = s->branch[2*NCB*k+6]; + int16_t g1_2 = s->branch[2*NCB*k+6+1]; + g = _mm256_set_epi16(g1_2, g0_2, g0_2, g1_2, g1_2, g0_2, g0_2, g1_2, g1_1, g0_1, g0_1, g1_1, g1_1, g0_1, g0_1, g1_1); + BETA_STEP(g); + } + + /* We inline 2 trelis steps for each normalization */ + __m256i norm; + for (; k >= 0; k-=8) { + gv = _mm256_load_si256(gPtr); + gPtr--; + BETA_STEP_CNT(0,0); + BETA_STEP_CNT(1,1); + BETA_STEP_CNT(2,2); + BETA_STEP_CNT(3,3); + norm = _mm256_shuffle_epi8(beta_k, shuf_norm); + beta_k = _mm256_sub_epi16(beta_k, norm); + gv = _mm256_load_si256(gPtr); + gPtr--; + BETA_STEP_CNT(0,4); + BETA_STEP_CNT(1,5); + BETA_STEP_CNT(2,6); + BETA_STEP_CNT(3,7); + norm = _mm256_shuffle_epi8(beta_k, shuf_norm); + beta_k = _mm256_sub_epi16(beta_k, norm); + } +} + +/* Computes alpha metrics */ +void map_avx_alpha(map_gen_t * s, uint32_t long_cb) +{ + uint32_t k; + int16_t *alpha1 = s->alpha; + int16_t *alpha2 = &s->alpha[8]; + uint32_t i; + + alpha1[0] = 0; + alpha2[0] = 0; + for (i = 1; i < 8; i++) { + alpha1[i] = -INF; + alpha2[i] = -INF; + } + + /* Define the shuffle constant for the positive alpha */ + __m256i shuf_ap = _mm256_set_epi8( + + // 1st CB + 31, 30, // 7 + 25, 24, // 4 + 23, 22, // 3 + 17, 16, // 0 + 29, 28, // 6 + 27, 26, // 5 + 21, 20, // 2 + 19, 18, // 1 + + // 2nd CB + 15, 14, // 7 + 9, 8, // 4 + 7, 6, // 3 + 1, 0, // 0 + 13, 12, // 6 + 11, 10, // 5 + 5, 4, // 2 + 3, 2 // 1 + ); + + /* Define the shuffle constant for the negative alpha */ + __m256i shuf_an = _mm256_set_epi8( + + // 1nd CB + 29, 28, // 6 + 27, 26, // 5 + 21, 20, // 2 + 19, 18, // 1 + 31, 30, // 7 + 25, 24, // 4 + 23, 22, // 3 + 17, 16, // 0 + + // 2nd CB + 13, 12, // 6 + 11, 10, // 5 + 5, 4, // 2 + 3, 2, // 1 + 15, 14, // 7 + 9, 8, // 4 + 7, 6, // 3 + 1, 0 // 0 + ); + + /* Define shuffle for branch costs */ + __m256i shuf_g[4]; + shuf_g[0] = _mm256_set_epi8(3+16,2+16,3+16,2+16,1+16,0+16,1+16,0+16,1+16,0+16,1+16,0+16,3+16,2+16,3+16,2+16, + 3,2,3,2,1,0,1,0,1,0,1,0,3,2,3,2); + shuf_g[1] = _mm256_set_epi8(7+16,6+16,7+16,6+16,5+16,4+16,5+16,4+16,5+16,4+16,5+16,4+16,7+16,6+16,7+16,6+16, + 7,6,7,6,5,4,5,4,5,4,5,4,7,6,7,6); + shuf_g[2] = _mm256_set_epi8(11+16,10+16,11+16,10+16,9+16,8+16,9+16,8+16,9+16,8+16,9+16,8+16,11+16,10+16,11+16,10+16, + 11,10,11,10,9,8,9,8,9,8,9,8,11,10,11,10); + shuf_g[3] = _mm256_set_epi8(15+16,14+16,15+16,14+16,13+16,12+16,13+16,12+16,13+16,12+16,13+16,12+16,15+16,14+16,15+16,14+16, + 15,14,15,14,13,12,13,12,13,12,13,12,15,14,15,14); + + __m256i shuf_norm = _mm256_set_epi8(17,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0); + + __m256i* alphaPtr = (__m256i*) s->alpha; + alphaPtr++; + + __m256i gv; + __m256i *gPtr = (__m256i*) s->branch; + __m256i g, ap, an; + + __m256i alpha_k = _mm256_set_epi16(-INF, -INF, -INF, -INF, -INF, -INF, -INF, 0, -INF, -INF, -INF, -INF, -INF, -INF, -INF, 0); + + /* This defines a alpha computation step: + * Adds and substracts the branch metrics to the previous alpha step, + * shuffles the states according to the trellis path and selects maximum state + */ +#define ALPHA_STEP(c) g = _mm256_shuffle_epi8(gv, shuf_g[c]); \ + ap = _mm256_add_epi16(alpha_k, g);\ + an = _mm256_sub_epi16(alpha_k, g);\ + ap = _mm256_shuffle_epi8(ap, shuf_ap);\ + an = _mm256_shuffle_epi8(an, shuf_an);\ + alpha_k = _mm256_max_epi16(ap, an);\ + _mm256_store_si256(alphaPtr, alpha_k);\ + alphaPtr++; \ + + + /* In this loop, we compute 8 steps and normalize twice for each branch metrics memory load */ + __m256i norm; + for (k = 0; k < long_cb/8; k++) { + gv = _mm256_load_si256(gPtr); + + gPtr++; + ALPHA_STEP(0); + ALPHA_STEP(1); + ALPHA_STEP(2); + ALPHA_STEP(3); + norm = _mm256_shuffle_epi8(alpha_k, shuf_norm); + alpha_k = _mm256_sub_epi16(alpha_k, norm); + gv = _mm256_load_si256(gPtr); + gPtr++; + ALPHA_STEP(0); + ALPHA_STEP(1); + ALPHA_STEP(2); + ALPHA_STEP(3); + norm = _mm256_shuffle_epi8(alpha_k, shuf_norm); + alpha_k = _mm256_sub_epi16(alpha_k, norm); + } +} + +/* Compute branch metrics (gamma) */ +void map_avx_gamma(map_gen_t * h, int16_t *input, int16_t *app, int16_t *parity, uint32_t cbidx, uint32_t long_cb) +{ + __m128i res10, res20, res11, res21, res1, res2; + __m128i in, ap, pa, g1, g0; + + __m128i *inPtr = (__m128i*) input; + __m128i *appPtr = (__m128i*) app; + __m128i *paPtr = (__m128i*) parity; + __m128i *resPtr = (__m128i*) h->branch; + + if (cbidx) { + resPtr++; + } + + __m128i res10_mask = _mm_set_epi8(0xff,0xff,7,6,0xff,0xff,5,4,0xff,0xff,3,2,0xff,0xff,1,0); + __m128i res20_mask = _mm_set_epi8(0xff,0xff,15,14,0xff,0xff,13,12,0xff,0xff,11,10,0xff,0xff,9,8); + __m128i res11_mask = _mm_set_epi8(7,6,0xff,0xff,5,4,0xff,0xff,3,2,0xff,0xff,1,0,0xff,0xff); + __m128i res21_mask = _mm_set_epi8(15,14,0xff,0xff,13,12,0xff,0xff,11,10,0xff,0xff,9,8,0xff,0xff); + + for (int i=0;ibranch[2*i*NCB+cbidx*6] = (input[i] - parity[i])/2; + h->branch[2*i*NCB+cbidx*6+1] = (input[i] + parity[i])/2; + } +} + + +#endif + + diff --git a/lib/src/phy/fec/turbodecoder_simd.c b/lib/src/phy/fec/turbodecoder_simd.c new file mode 100644 index 000000000..a2cddd582 --- /dev/null +++ b/lib/src/phy/fec/turbodecoder_simd.c @@ -0,0 +1,486 @@ +/** + * + * \section COPYRIGHT + * + * Copyright 2013-2015 Software Radio Systems Limited + * + * \section LICENSE + * + * This file is part of the srsLTE library. + * + * srsLTE is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License as + * published by the Free Software Foundation, either version 3 of + * the License, or (at your option) any later version. + * + * srsLTE is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * A copy of the GNU Affero General Public License can be found in + * the LICENSE file in the top-level directory of this distribution + * and at http://www.gnu.org/licenses/. + * + */ + +#include +#include +#include +#include +#include +#include + +#include "srslte/phy/fec/turbodecoder_simd.h" +#include "srslte/phy/utils/vector.h" + +#include + +#define NUMSTATES 8 +#define NINPUTS 2 +#define TAIL 3 +#define TOTALTAIL 12 + +#define INF 10000 +#define ZERO 0 + + +#ifdef LV_HAVE_SSE +#include + +// Define SSE/AVX implementations +void map_sse_beta(map_gen_t * s, int16_t * output, uint32_t long_cb); +void map_sse_alpha(map_gen_t * s, uint32_t long_cb); +void map_sse_gamma(map_gen_t * h, int16_t *input, int16_t *app, int16_t *parity, uint32_t long_cb); + +#ifdef LV_HAVE_AVX2 +void map_avx_beta(map_gen_t * s, int16_t * output[SRSLTE_TDEC_NPAR], uint32_t long_cb); +void map_avx_alpha(map_gen_t * s, uint32_t long_cb); +void map_avx_gamma(map_gen_t * h, int16_t *input, int16_t *app, int16_t *parity, uint32_t cbidx, uint32_t long_cb); +#endif + + +void map_simd_beta(map_gen_t * s, int16_t * output[SRSLTE_TDEC_NPAR], uint32_t nof_cb, uint32_t long_cb) +{ + if (nof_cb == 1) { + map_sse_beta(s, output[0], long_cb); + } +#ifdef LV_HAVE_AVX2 + else if (nof_cb == 2) { + map_avx_beta(s, output, long_cb); + } +#endif +} + +void map_simd_alpha(map_gen_t * s, uint32_t nof_cb, uint32_t long_cb) +{ + if (nof_cb == 1) { + map_sse_alpha(s, long_cb); + } +#ifdef LV_HAVE_AVX2 + else if (nof_cb == 2) { + map_avx_alpha(s, long_cb); + } +#endif +} +void map_simd_gamma(map_gen_t * s, int16_t *input, int16_t *app, int16_t *parity, uint32_t cbidx, uint32_t nof_cb, uint32_t long_cb) +{ + if (nof_cb == 1) { + map_sse_gamma(s, input, app, parity, long_cb); + } +#ifdef LV_HAVE_AVX2 + else if (nof_cb == 2) { + map_avx_gamma(s, input, app, parity, cbidx, long_cb); + } +#endif +} + +/* Inititalizes constituent decoder object */ +int map_simd_init(map_gen_t * h, int max_long_cb) +{ + bzero(h, sizeof(map_gen_t)); + h->alpha = srslte_vec_malloc(sizeof(int16_t) * (max_long_cb + SRSLTE_TCOD_TOTALTAIL + 1) * NUMSTATES * SRSLTE_TDEC_NPAR); + if (!h->alpha) { + perror("srslte_vec_malloc"); + return -1; + } + h->branch = srslte_vec_malloc(sizeof(int16_t) * (max_long_cb + SRSLTE_TCOD_TOTALTAIL + 1) * NUMSTATES * SRSLTE_TDEC_NPAR); + if (!h->branch) { + perror("srslte_vec_malloc"); + return -1; + } + h->max_long_cb = max_long_cb; + return 0; +} + +void map_simd_free(map_gen_t * h) +{ + if (h->alpha) { + free(h->alpha); + } + if (h->branch) { + free(h->branch); + } + bzero(h, sizeof(map_gen_t)); +} + +/* Runs one instance of a decoder */ +void map_simd_dec(map_gen_t * h, int16_t * input[SRSLTE_TDEC_NPAR], int16_t *app[SRSLTE_TDEC_NPAR], int16_t * parity[SRSLTE_TDEC_NPAR], + int16_t *output[SRSLTE_TDEC_NPAR], uint32_t nof_cb, uint32_t long_cb) +{ + + // Compute branch metrics + for (int i=0;imax_long_cb = max_long_cb; + + for (int i=0;iapp1[i] = srslte_vec_malloc(sizeof(int16_t) * len); + if (!h->app1[i]) { + perror("srslte_vec_malloc"); + goto clean_and_exit; + } + h->app2[i] = srslte_vec_malloc(sizeof(int16_t) * len); + if (!h->app2[i]) { + perror("srslte_vec_malloc"); + goto clean_and_exit; + } + h->ext1[i] = srslte_vec_malloc(sizeof(int16_t) * len); + if (!h->ext1[i]) { + perror("srslte_vec_malloc"); + goto clean_and_exit; + } + h->ext2[i] = srslte_vec_malloc(sizeof(int16_t) * len); + if (!h->ext2[i]) { + perror("srslte_vec_malloc"); + goto clean_and_exit; + } + h->syst[i] = srslte_vec_malloc(sizeof(int16_t) * len); + if (!h->syst[i]) { + perror("srslte_vec_malloc"); + goto clean_and_exit; + } + h->parity0[i] = srslte_vec_malloc(sizeof(int16_t) * len); + if (!h->parity0[i]) { + perror("srslte_vec_malloc"); + goto clean_and_exit; + } + h->parity1[i] = srslte_vec_malloc(sizeof(int16_t) * len); + if (!h->parity1[i]) { + perror("srslte_vec_malloc"); + goto clean_and_exit; + } + + } + + if (map_simd_init(&h->dec, h->max_long_cb)) { + goto clean_and_exit; + } + + for (int i=0;iinterleaver[i], srslte_cbsegm_cbsize(i)) < 0) { + goto clean_and_exit; + } + srslte_tc_interl_LTE_gen(&h->interleaver[i], srslte_cbsegm_cbsize(i)); + } + h->current_cbidx = -1; + ret = 0; +clean_and_exit:if (ret == -1) { + srslte_tdec_simd_free(h); + } + return ret; +} + +void srslte_tdec_simd_free(srslte_tdec_simd_t * h) +{ + for (int i=0;iapp1[i]) { + free(h->app1[i]); + } + if (h->app2[i]) { + free(h->app2[i]); + } + if (h->ext1[i]) { + free(h->ext1[i]); + } + if (h->ext2[i]) { + free(h->ext2[i]); + } + if (h->syst[i]) { + free(h->syst[i]); + } + if (h->parity0[i]) { + free(h->parity0[i]); + } + if (h->parity1[i]) { + free(h->parity1[i]); + } + } + + map_simd_free(&h->dec); + + for (int i=0;iinterleaver[i]); + } + + bzero(h, sizeof(srslte_tdec_simd_t)); +} + +/* Deinterleaves the 3 streams from the input (systematic and 2 parity bits) into + * 3 buffers ready to be used by compute_gamma() + */ +void deinterleave_input_simd(srslte_tdec_simd_t *h, int16_t *input, uint32_t cbidx, uint32_t long_cb) { + uint32_t i; + + __m128i *inputPtr = (__m128i*) input; + __m128i in0, in1, in2; + __m128i s0, s1, s2, s; + __m128i p00, p01, p02, p0; + __m128i p10, p11, p12, p1; + + __m128i *sysPtr = (__m128i*) h->syst[cbidx]; + __m128i *pa0Ptr = (__m128i*) h->parity0[cbidx]; + __m128i *pa1Ptr = (__m128i*) h->parity1[cbidx]; + + // pick bits 0, 3, 6 from 1st word + __m128i s0_mask = _mm_set_epi8(0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,13,12,7,6,1,0); + // pick bits 1, 4, 7 from 2st word + __m128i s1_mask = _mm_set_epi8(0xff,0xff,0xff,0xff,15,14,9,8,3,2,0xff,0xff,0xff,0xff,0xff,0xff); + // pick bits 2, 5 from 3rd word + __m128i s2_mask = _mm_set_epi8(11,10,5,4,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff); + + // pick bits 1, 4, 7 from 1st word + __m128i p00_mask = _mm_set_epi8(0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,15,14,9,8,3,2); + // pick bits 2, 5, from 2st word + __m128i p01_mask = _mm_set_epi8(0xff,0xff,0xff,0xff,0xff,0xff,11,10,5,4,0xff,0xff,0xff,0xff,0xff,0xff); + // pick bits 0, 3, 6 from 3rd word + __m128i p02_mask = _mm_set_epi8(13,12,7,6,1,0,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff); + + // pick bits 2, 5 from 1st word + __m128i p10_mask = _mm_set_epi8(0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,11,10,5,4); + // pick bits 0, 3, 6, from 2st word + __m128i p11_mask = _mm_set_epi8(0xff,0xff,0xff,0xff,0xff,0xff,13,12,7,6,1,0,0xff,0xff,0xff,0xff); + // pick bits 1, 4, 7 from 3rd word + __m128i p12_mask = _mm_set_epi8(15,14,9,8,3,2,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff); + + // Split systematic and parity bits + for (i = 0; i < long_cb/8; i++) { + + in0 = _mm_load_si128(inputPtr); inputPtr++; + in1 = _mm_load_si128(inputPtr); inputPtr++; + in2 = _mm_load_si128(inputPtr); inputPtr++; + + /* Deinterleave Systematic bits */ + s0 = _mm_shuffle_epi8(in0, s0_mask); + s1 = _mm_shuffle_epi8(in1, s1_mask); + s2 = _mm_shuffle_epi8(in2, s2_mask); + s = _mm_or_si128(s0, s1); + s = _mm_or_si128(s, s2); + + _mm_store_si128(sysPtr, s); + sysPtr++; + + /* Deinterleave parity 0 bits */ + p00 = _mm_shuffle_epi8(in0, p00_mask); + p01 = _mm_shuffle_epi8(in1, p01_mask); + p02 = _mm_shuffle_epi8(in2, p02_mask); + p0 = _mm_or_si128(p00, p01); + p0 = _mm_or_si128(p0, p02); + + _mm_store_si128(pa0Ptr, p0); + pa0Ptr++; + + /* Deinterleave parity 1 bits */ + p10 = _mm_shuffle_epi8(in0, p10_mask); + p11 = _mm_shuffle_epi8(in1, p11_mask); + p12 = _mm_shuffle_epi8(in2, p12_mask); + p1 = _mm_or_si128(p10, p11); + p1 = _mm_or_si128(p1, p12); + + _mm_store_si128(pa1Ptr, p1); + pa1Ptr++; + + } + + for (i = 0; i < 3; i++) { + h->syst[cbidx][i+long_cb] = input[3*long_cb + 2*i]; + h->parity0[cbidx][i+long_cb] = input[3*long_cb + 2*i + 1]; + } + for (i = 0; i < 3; i++) { + h->app2[cbidx][i+long_cb] = input[3*long_cb + 6 + 2*i]; + h->parity1[cbidx][i+long_cb] = input[3*long_cb + 6 + 2*i + 1]; + } + +} + +/* Runs 1 turbo decoder iteration */ +void srslte_tdec_simd_iteration(srslte_tdec_simd_t * h, int16_t * input[SRSLTE_TDEC_NPAR], uint32_t nof_cb, uint32_t long_cb) +{ + + if (h->current_cbidx >= 0) { + uint16_t *inter = h->interleaver[h->current_cbidx].forward; + uint16_t *deinter = h->interleaver[h->current_cbidx].reverse; + + if (h->n_iter == 0) { + for (int i=0;in_iter > 0) { + for (int i=0;iapp1[i], h->ext1[i], h->app1[i], long_cb); + } + } + + // Run MAP DEC #1 + if (h->n_iter == 0) { + map_simd_dec(&h->dec, h->syst, NULL, h->parity0, h->ext1, nof_cb, long_cb); + } else { + map_simd_dec(&h->dec, h->syst, h->app1, h->parity0, h->ext1, nof_cb, long_cb); + } + + // Convert aposteriori information into extrinsic information + if (h->n_iter > 0) { + for (int i=0;iext1[i], h->app1[i], h->ext1[i], long_cb); + } + } + + // Interleave extrinsic output of DEC1 to form apriori info for decoder 2 + for (int i=0;iext1[i], deinter, h->app2[i], long_cb); + } + + // Run MAP DEC #2. 2nd decoder uses apriori information as systematic bits + map_simd_dec(&h->dec, h->app2, NULL, h->parity1, h->ext2, nof_cb, long_cb); + + // Deinterleaved extrinsic bits become apriori info for decoder 1 + for (int i=0;iext2[i], inter, h->app1[i], long_cb); + } + + h->n_iter++; + } else { + fprintf(stderr, "Error CB index not set (call srslte_tdec_simd_reset() first\n"); + } +} + +/* Resets the decoder and sets the codeblock length */ +int srslte_tdec_simd_reset(srslte_tdec_simd_t * h, uint32_t long_cb) +{ + if (long_cb > h->max_long_cb) { + fprintf(stderr, "TDEC was initialized for max_long_cb=%d\n", + h->max_long_cb); + return -1; + } + h->n_iter = 0; + h->current_cbidx = srslte_cbsegm_cbindex(long_cb); + if (h->current_cbidx < 0) { + fprintf(stderr, "Invalid CB length %d\n", long_cb); + return -1; + } + return 0; +} + +void tdec_simd_decision(srslte_tdec_simd_t * h, uint8_t *output, uint32_t cbidx, uint32_t long_cb) +{ + __m128i zero = _mm_set1_epi16(0); + __m128i lsb_mask = _mm_set1_epi16(1); + + __m128i *appPtr = (__m128i*) h->app1[cbidx]; + __m128i *outPtr = (__m128i*) output; + __m128i ap, out, out0, out1; + + for (uint32_t i = 0; i < long_cb/16; i++) { + ap = _mm_load_si128(appPtr); appPtr++; + out0 = _mm_and_si128(_mm_cmpgt_epi16(ap, zero), lsb_mask); + ap = _mm_load_si128(appPtr); appPtr++; + out1 = _mm_and_si128(_mm_cmpgt_epi16(ap, zero), lsb_mask); + + out = _mm_packs_epi16(out0, out1); + _mm_store_si128(outPtr, out); + outPtr++; + } + if (long_cb%16) { + for (int i=0;i<8;i++) { + output[long_cb-8+i] = h->app1[cbidx][long_cb-8+i]>0?1:0; + } + } +} + +void srslte_tdec_simd_decision(srslte_tdec_simd_t * h, uint8_t *output[SRSLTE_TDEC_NPAR], uint32_t nof_cb, uint32_t long_cb) +{ + for (int i=0;iapp1[cbidx][8*i+0]>0?mask[0]:0; + uint8_t out1 = h->app1[cbidx][8*i+1]>0?mask[1]:0; + uint8_t out2 = h->app1[cbidx][8*i+2]>0?mask[2]:0; + uint8_t out3 = h->app1[cbidx][8*i+3]>0?mask[3]:0; + uint8_t out4 = h->app1[cbidx][8*i+4]>0?mask[4]:0; + uint8_t out5 = h->app1[cbidx][8*i+5]>0?mask[5]:0; + uint8_t out6 = h->app1[cbidx][8*i+6]>0?mask[6]:0; + uint8_t out7 = h->app1[cbidx][8*i+7]>0?mask[7]:0; + + output[i] = out0 | out1 | out2 | out3 | out4 | out5 | out6 | out7; + + //if (i<10) { + // printf("output[%d]=%d\n",i,output[i]); + //} + } +} + +void srslte_tdec_simd_decision_byte(srslte_tdec_simd_t * h, uint8_t *output[SRSLTE_TDEC_NPAR], uint32_t nof_cb, uint32_t long_cb) +{ + for (int i=0;in_iter < nof_iterations); + + srslte_tdec_simd_decision_byte(h, output, nof_cb, long_cb); + + return SRSLTE_SUCCESS; +} + +#endif + + diff --git a/lib/src/phy/fec/turbodecoder_sse.c b/lib/src/phy/fec/turbodecoder_sse.c index 91d96c287..4e92efcc0 100644 --- a/lib/src/phy/fec/turbodecoder_sse.c +++ b/lib/src/phy/fec/turbodecoder_sse.c @@ -52,6 +52,17 @@ #ifdef LV_HAVE_SSE +/* +static void print_128i(__m128i x) { + int16_t *s = (int16_t*) &x; + printf("[%d", s[0]); + for (int i=1;i<8;i++) { + printf(",%d", s[i]); + } + printf("]\n"); +} +*/ + /* Computes the horizontal MAX from 8 16-bit integers using the minpos_epu16 SSE4.1 instruction */ static inline int16_t hMax(__m128i buffer) { @@ -126,7 +137,8 @@ void map_gen_beta(map_gen_t * s, int16_t * output, uint32_t long_cb) alpha_k = _mm_load_si128(alphaPtr);\ alphaPtr--;\ bp = _mm_add_epi16(bp, alpha_k);\ - bn = _mm_add_epi16(bn, alpha_k); output[k-d] = hMax(bn) - hMax(bp); + bn = _mm_add_epi16(bn, alpha_k);\ + output[k-d] = hMax(bn) - hMax(bp); /* The tail does not require to load alpha or produce outputs. Only update * beta metrics accordingly */ @@ -134,7 +146,6 @@ void map_gen_beta(map_gen_t * s, int16_t * output, uint32_t long_cb) int16_t g0 = s->branch[2*k]; int16_t g1 = s->branch[2*k+1]; g = _mm_set_epi16(g1, g0, g0, g1, g1, g0, g0, g1); - BETA_STEP(g); }