diff --git a/CMakeLists.txt b/CMakeLists.txt index 0788ec335..42d8996a7 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -279,8 +279,8 @@ if(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang") set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -ggdb -DBUILD_TYPE_RELWITHDEBINFO") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -ggdb -DBUILD_TYPE_RELWITHDEBINFO") else(${CMAKE_BUILD_TYPE} STREQUAL "RelWithDebInfo") - set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -O3 -DBUILD_TYPE_RELEASE") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3 -DBUILD_TYPE_RELEASE") + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -O3 -fno-trapping-math -fno-math-errno -DBUILD_TYPE_RELEASE") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3 -fno-trapping-math -fno-math-errno -DBUILD_TYPE_RELEASE") endif(${CMAKE_BUILD_TYPE} STREQUAL "RelWithDebInfo") endif(${CMAKE_BUILD_TYPE} STREQUAL "Debug") @@ -325,9 +325,9 @@ if(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang") endif(${CMAKE_SYSTEM_PROCESSOR} MATCHES "arm") set(CMAKE_REQUIRED_FLAGS ${CMAKE_C_FLAGS}) - if(NOT HAVE_SSE AND NOT HAVE_NEON AND NOT DISABLE_SIMD) - message(FATAL_ERROR "no SIMD instructions found") - endif(NOT HAVE_SSE AND NOT HAVE_NEON AND NOT DISABLE_SIMD) + if(NOT HAVE_SSE AND NOT HAVE_NEON AND NOT DISABLE_SIMD) + message(FATAL_ERROR "no SIMD instructions found") + endif(NOT HAVE_SSE AND NOT HAVE_NEON AND NOT DISABLE_SIMD) if(NOT WIN32) ADD_CXX_COMPILER_FLAG_IF_AVAILABLE(-fvisibility=hidden HAVE_VISIBILITY_HIDDEN) diff --git a/lib/include/srslte/config.h b/lib/include/srslte/config.h index 02c43777f..8ba918d20 100644 --- a/lib/include/srslte/config.h +++ b/lib/include/srslte/config.h @@ -52,6 +52,14 @@ #endif +// Useful macros for templates +#define CONCAT(a, b) a##b +#define CONCAT2(a, b) CONCAT(a,b) + +#define STRING2(x) #x +#define STRING(x) STRING2(x) + + // Common error codes #define SRSLTE_SUCCESS 0 #define SRSLTE_ERROR -1 diff --git a/lib/include/srslte/interfaces/ue_interfaces.h b/lib/include/srslte/interfaces/ue_interfaces.h index 959635afc..55ed29e07 100644 --- a/lib/include/srslte/interfaces/ue_interfaces.h +++ b/lib/include/srslte/interfaces/ue_interfaces.h @@ -545,6 +545,7 @@ typedef struct { bool sic_pss_enabled; float rx_gain_offset; bool pdsch_csi_enabled; + bool pdsch_8bit_decoder; uint32_t intra_freq_meas_len_ms; uint32_t intra_freq_meas_period_ms; } phy_args_t; diff --git a/lib/include/srslte/phy/common/sequence.h b/lib/include/srslte/phy/common/sequence.h index 346be1cda..96d18c008 100644 --- a/lib/include/srslte/phy/common/sequence.h +++ b/lib/include/srslte/phy/common/sequence.h @@ -44,6 +44,7 @@ typedef struct SRSLTE_API { uint8_t *c_bytes; float *c_float; short *c_short; + int8_t *c_char; uint32_t cur_len; uint32_t max_len; } srslte_sequence_t; diff --git a/lib/include/srslte/phy/fec/rm_turbo.h b/lib/include/srslte/phy/fec/rm_turbo.h index 26f98944c..51519ae72 100644 --- a/lib/include/srslte/phy/fec/rm_turbo.h +++ b/lib/include/srslte/phy/fec/rm_turbo.h @@ -36,6 +36,7 @@ #define SRSLTE_RM_TURBO_H #include "srslte/config.h" +#include "srslte/phy/fec/turbodecoder.h" #ifndef SRSLTE_RX_NULL #define SRSLTE_RX_NULL 10000 @@ -47,7 +48,6 @@ #include "srslte/config.h" - SRSLTE_API int srslte_rm_turbo_tx(uint8_t *w_buff, uint32_t buff_len, uint8_t *input, @@ -82,7 +82,19 @@ SRSLTE_API int srslte_rm_turbo_rx_lut(int16_t *input, int16_t *output, uint32_t in_len, uint32_t cb_idx, - uint32_t rv_idx); + uint32_t rv_idx); + +SRSLTE_API int srslte_rm_turbo_rx_lut_(int16_t *input, + int16_t *output, + uint32_t in_len, + uint32_t cb_idx, + uint32_t rv_idx, + bool enable_input_tdec); +SRSLTE_API int srslte_rm_turbo_rx_lut_8bit(int8_t *input, + int8_t *output, + uint32_t in_len, + uint32_t cb_idx, + uint32_t rv_idx); #endif // SRSLTE_RM_TURBO_H diff --git a/lib/include/srslte/phy/fec/tc_interl.h b/lib/include/srslte/phy/fec/tc_interl.h index c0ffaae58..6cb1ad4c9 100644 --- a/lib/include/srslte/phy/fec/tc_interl.h +++ b/lib/include/srslte/phy/fec/tc_interl.h @@ -1,58 +1,59 @@ -/** - * - * \section COPYRIGHT - * - * Copyright 2013-2015 Software Radio Systems Limited - * - * \section LICENSE - * - * This file is part of the srsLTE library. - * - * srsLTE is free software: you can redistribute it and/or modify - * it under the terms of the GNU Affero General Public License as - * published by the Free Software Foundation, either version 3 of - * the License, or (at your option) any later version. - * - * srsLTE is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Affero General Public License for more details. - * - * A copy of the GNU Affero General Public License can be found in - * the LICENSE file in the top-level directory of this distribution - * and at http://www.gnu.org/licenses/. - * - */ - -/********************************************************************************************** - * File: tc_interl.h - * - * Description: Turbo code interleaver. - * - * Reference: 3GPP TS 36.212 version 10.0.0 Release 10 Sec. 5.1.3.2.3 - *********************************************************************************************/ - -#ifndef SRSLTE_TC_INTERL_H -#define SRSLTE_TC_INTERL_H - -#include "srslte/config.h" -#include - -typedef struct SRSLTE_API { - uint16_t *forward; - uint16_t *reverse; - uint32_t max_long_cb; -} srslte_tc_interl_t; - -SRSLTE_API int srslte_tc_interl_LTE_gen(srslte_tc_interl_t *h, - uint32_t long_cb); - -SRSLTE_API int srslte_tc_interl_UMTS_gen(srslte_tc_interl_t *h, - uint32_t long_cb); - -SRSLTE_API int srslte_tc_interl_init(srslte_tc_interl_t *h, - uint32_t max_long_cb); - -SRSLTE_API void srslte_tc_interl_free(srslte_tc_interl_t *h); - -#endif // SRSLTE_TC_INTERL_H +/** + * + * \section COPYRIGHT + * + * Copyright 2013-2015 Software Radio Systems Limited + * + * \section LICENSE + * + * This file is part of the srsLTE library. + * + * srsLTE is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License as + * published by the Free Software Foundation, either version 3 of + * the License, or (at your option) any later version. + * + * srsLTE is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * A copy of the GNU Affero General Public License can be found in + * the LICENSE file in the top-level directory of this distribution + * and at http://www.gnu.org/licenses/. + * + */ + +/********************************************************************************************** + * File: tc_interl.h + * + * Description: Turbo code interleaver. + * + * Reference: 3GPP TS 36.212 version 10.0.0 Release 10 Sec. 5.1.3.2.3 + *********************************************************************************************/ + +#ifndef SRSLTE_TC_INTERL_H +#define SRSLTE_TC_INTERL_H + +#include "srslte/config.h" +#include + +typedef struct SRSLTE_API { + uint16_t *forward; + uint16_t *reverse; + uint32_t max_long_cb; +} srslte_tc_interl_t; + +SRSLTE_API int srslte_tc_interl_LTE_gen(srslte_tc_interl_t *h, + uint32_t long_cb); + +SRSLTE_API int srslte_tc_interl_LTE_gen_interl(srslte_tc_interl_t *h, + uint32_t long_cb, + uint32_t interl_win); + +SRSLTE_API int srslte_tc_interl_init(srslte_tc_interl_t *h, + uint32_t max_long_cb); + +SRSLTE_API void srslte_tc_interl_free(srslte_tc_interl_t *h); + +#endif // SRSLTE_TC_INTERL_H diff --git a/lib/include/srslte/phy/fec/turbocoder.h b/lib/include/srslte/phy/fec/turbocoder.h index 4da697461..84abf5b28 100644 --- a/lib/include/srslte/phy/fec/turbocoder.h +++ b/lib/include/srslte/phy/fec/turbocoder.h @@ -70,10 +70,12 @@ SRSLTE_API int srslte_tcod_encode(srslte_tcod_t *h, uint32_t long_cb); SRSLTE_API int srslte_tcod_encode_lut(srslte_tcod_t *h, - srslte_crc_t *crc, - uint8_t *input, + srslte_crc_t *crc_tb, + srslte_crc_t *crc_cb, + uint8_t *input, uint8_t *parity, - uint32_t cblen_idx); + uint32_t cblen_idx, + bool last_cb); SRSLTE_API void srslte_tcod_gentable(); diff --git a/lib/include/srslte/phy/fec/turbodecoder.h b/lib/include/srslte/phy/fec/turbodecoder.h index d882118aa..4c4de0082 100644 --- a/lib/include/srslte/phy/fec/turbodecoder.h +++ b/lib/include/srslte/phy/fec/turbodecoder.h @@ -1,120 +1,143 @@ -/** - * - * \section COPYRIGHT - * - * Copyright 2013-2015 Software Radio Systems Limited - * - * \section LICENSE - * - * This file is part of the srsLTE library. - * - * srsLTE is free software: you can redistribute it and/or modify - * it under the terms of the GNU Affero General Public License as - * published by the Free Software Foundation, either version 3 of - * the License, or (at your option) any later version. - * - * srsLTE is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Affero General Public License for more details. - * - * A copy of the GNU Affero General Public License can be found in - * the LICENSE file in the top-level directory of this distribution - * and at http://www.gnu.org/licenses/. - * - */ - -/********************************************************************************************** - * File: turbodecoder.h - * - * Description: Turbo Decoder. - * Parallel Concatenated Convolutional Code (PCCC) with two 8-state constituent - * encoders and one turbo code internal interleaver. The coding rate of turbo - * encoder is 1/3. - * MAP_GEN is the MAX-LOG-MAP generic implementation of the decoder. - * - * Reference: 3GPP TS 36.212 version 10.0.0 Release 10 Sec. 5.1.3.2 - *********************************************************************************************/ - -#ifndef SRSLTE_TURBODECODER_H -#define SRSLTE_TURBODECODER_H - -#include "srslte/config.h" -#include "srslte/phy/fec/tc_interl.h" -#include "srslte/phy/fec/cbsegm.h" - -#define SRSLTE_TCOD_RATE 3 -#define SRSLTE_TCOD_TOTALTAIL 12 - -#define SRSLTE_TCOD_MAX_LEN_CB 6144 -#define SRSLTE_TCOD_MAX_LEN_CODED (SRSLTE_TCOD_RATE*SRSLTE_TCOD_MAX_LEN_CB+SRSLTE_TCOD_TOTALTAIL) - -#include "srslte/phy/fec/turbodecoder_gen.h" -#include "srslte/phy/fec/turbodecoder_simd.h" - -typedef struct SRSLTE_API { - float *input_conv; - union { - srslte_tdec_simd_t tdec_simd; - srslte_tdec_gen_t tdec_gen; - }; -} srslte_tdec_t; - -SRSLTE_API int srslte_tdec_init(srslte_tdec_t * h, - uint32_t max_long_cb); - -SRSLTE_API void srslte_tdec_free(srslte_tdec_t * h); - -SRSLTE_API int srslte_tdec_reset(srslte_tdec_t * h, - uint32_t long_cb); - -SRSLTE_API int srslte_tdec_reset_cb(srslte_tdec_t * h, - uint32_t cb_idx); - -SRSLTE_API int srslte_tdec_get_nof_iterations_cb(srslte_tdec_t * h, - uint32_t cb_idx); - -SRSLTE_API uint32_t srslte_tdec_get_nof_parallel(srslte_tdec_t * h); - -SRSLTE_API void srslte_tdec_iteration(srslte_tdec_t * h, - int16_t* input, - uint32_t long_cb); - -SRSLTE_API void srslte_tdec_decision(srslte_tdec_t * h, - uint8_t *output, - uint32_t long_cb); - -SRSLTE_API void srslte_tdec_decision_byte(srslte_tdec_t * h, - uint8_t *output, - uint32_t long_cb); - -SRSLTE_API int srslte_tdec_run_all(srslte_tdec_t * h, - int16_t * input, - uint8_t *output, - uint32_t nof_iterations, - uint32_t long_cb); - -SRSLTE_API void srslte_tdec_iteration_par(srslte_tdec_t * h, - int16_t* input[SRSLTE_TDEC_MAX_NPAR], - uint32_t long_cb); - -SRSLTE_API void srslte_tdec_decision_par(srslte_tdec_t * h, - uint8_t *output[SRSLTE_TDEC_MAX_NPAR], - uint32_t long_cb); - -SRSLTE_API void srslte_tdec_decision_byte_par(srslte_tdec_t * h, - uint8_t *output[SRSLTE_TDEC_MAX_NPAR], - uint32_t long_cb); - -SRSLTE_API void srslte_tdec_decision_byte_par_cb(srslte_tdec_t * h, - uint8_t *output, - uint32_t cb_idx, - uint32_t long_cb); - -SRSLTE_API int srslte_tdec_run_all_par(srslte_tdec_t * h, - int16_t * input[SRSLTE_TDEC_MAX_NPAR], - uint8_t *output[SRSLTE_TDEC_MAX_NPAR], - uint32_t nof_iterations, - uint32_t long_cb); - -#endif // SRSLTE_TURBODECODER_H +/** + * + * \section COPYRIGHT + * + * Copyright 2013-2015 Software Radio Systems Limited + * + * \section LICENSE + * + * This file is part of the srsLTE library. + * + * srsLTE is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License as + * published by the Free Software Foundation, either version 3 of + * the License, or (at your option) any later version. + * + * srsLTE is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * A copy of the GNU Affero General Public License can be found in + * the LICENSE file in the top-level directory of this distribution + * and at http://www.gnu.org/licenses/. + * + */ + +/********************************************************************************************** + * File: turbodecoder.h + * + * Description: Turbo Decoder. + * Parallel Concatenated Convolutional Code (PCCC) with two 8-state constituent + * encoders and one turbo code internal interleaver. The coding rate of turbo + * encoder is 1/3. + * MAP_GEN is the MAX-LOG-MAP generic implementation of the decoder. + * + * Reference: 3GPP TS 36.212 version 10.0.0 Release 10 Sec. 5.1.3.2 + *********************************************************************************************/ + +#ifndef SRSLTE_TURBODECODER_H +#define SRSLTE_TURBODECODER_H + +#include "srslte/config.h" +#include "srslte/phy/fec/tc_interl.h" +#include "srslte/phy/fec/cbsegm.h" + +#define SRSLTE_TCOD_RATE 3 +#define SRSLTE_TCOD_TOTALTAIL 12 + +#define SRSLTE_TCOD_MAX_LEN_CB 6144 + +// Expect the input to be aligned for sub-block window processing. +#define SRSLTE_TDEC_EXPECT_INPUT_SB 1 + +// Include interfaces for 8 and 16 bit decoder implementations +#define LLR_IS_8BIT +#include "srslte/phy/fec/turbodecoder_impl.h" +#undef LLR_IS_8BIT + +#define LLR_IS_16BIT +#include "srslte/phy/fec/turbodecoder_impl.h" +#undef LLR_IS_16BIT + +#define SRSLTE_TDEC_NOF_AUTO_MODES_8 2 +#define SRSLTE_TDEC_NOF_AUTO_MODES_16 3 + +typedef enum {SRSLTE_TDEC_8, SRSLTE_TDEC_16} srslte_tdec_llr_type_t; + +typedef struct SRSLTE_API { + uint32_t max_long_cb; + + void *dec8_hdlr[SRSLTE_TDEC_NOF_AUTO_MODES_8]; + void *dec16_hdlr[SRSLTE_TDEC_NOF_AUTO_MODES_16]; + srslte_tdec_8bit_impl_t *dec8[SRSLTE_TDEC_NOF_AUTO_MODES_8]; + srslte_tdec_16bit_impl_t *dec16[SRSLTE_TDEC_NOF_AUTO_MODES_16]; + int nof_blocks8[SRSLTE_TDEC_NOF_AUTO_MODES_8]; + int nof_blocks16[SRSLTE_TDEC_NOF_AUTO_MODES_16]; + + // Declare as void types as can be int8 or int16 + void *app1; + void *app2; + void *ext1; + void *ext2; + void *syst0; + void *parity0; + void *parity1; + + void *input_conv; + + bool force_not_sb; + + srslte_tdec_impl_type_t dec_type; + + srslte_tdec_llr_type_t current_llr_type; + uint32_t current_dec; + uint32_t current_long_cb; + uint32_t current_inter_idx; + int current_cbidx; + srslte_tc_interl_t interleaver[4][SRSLTE_NOF_TC_CB_SIZES]; + int n_iter; +} srslte_tdec_t; + +SRSLTE_API int srslte_tdec_init(srslte_tdec_t * h, + uint32_t max_long_cb); + +SRSLTE_API int srslte_tdec_init_manual(srslte_tdec_t * h, + uint32_t max_long_cb, + srslte_tdec_impl_type_t dec_type); + +SRSLTE_API void srslte_tdec_free(srslte_tdec_t * h); + +SRSLTE_API void srslte_tdec_force_not_sb(srslte_tdec_t *h); + +SRSLTE_API int srslte_tdec_new_cb(srslte_tdec_t * h, + uint32_t long_cb); + +SRSLTE_API int srslte_tdec_get_nof_iterations(srslte_tdec_t * h); + +SRSLTE_API uint32_t srslte_tdec_autoimp_get_subblocks(uint32_t long_cb); + +SRSLTE_API uint32_t srslte_tdec_autoimp_get_subblocks_8bit(uint32_t long_cb); + +SRSLTE_API void srslte_tdec_iteration(srslte_tdec_t * h, + int16_t* input, + uint8_t *output); + +SRSLTE_API int srslte_tdec_run_all(srslte_tdec_t * h, + int16_t * input, + uint8_t *output, + uint32_t nof_iterations, + uint32_t long_cb); + +SRSLTE_API void srslte_tdec_iteration_8bit(srslte_tdec_t * h, + int8_t* input, + uint8_t *output); + +SRSLTE_API int srslte_tdec_run_all_8bit(srslte_tdec_t * h, + int8_t * input, + uint8_t *output, + uint32_t nof_iterations, + uint32_t long_cb); + + +#endif // SRSLTE_TURBODECODER_H diff --git a/lib/include/srslte/phy/fec/turbodecoder_gen.h b/lib/include/srslte/phy/fec/turbodecoder_gen.h index 2fefc22cc..4d61c5e23 100644 --- a/lib/include/srslte/phy/fec/turbodecoder_gen.h +++ b/lib/include/srslte/phy/fec/turbodecoder_gen.h @@ -1,99 +1,62 @@ -/** - * - * \section COPYRIGHT - * - * Copyright 2013-2015 Software Radio Systems Limited - * - * \section LICENSE - * - * This file is part of the srsLTE library. - * - * srsLTE is free software: you can redistribute it and/or modify - * it under the terms of the GNU Affero General Public License as - * published by the Free Software Foundation, either version 3 of - * the License, or (at your option) any later version. - * - * srsLTE is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Affero General Public License for more details. - * - * A copy of the GNU Affero General Public License can be found in - * the LICENSE file in the top-level directory of this distribution - * and at http://www.gnu.org/licenses/. - * - */ - -/********************************************************************************************** - * File: turbodecoder.h - * - * Description: Turbo Decoder. - * Parallel Concatenated Convolutional Code (PCCC) with two 8-state constituent - * encoders and one turbo code internal interleaver. The coding rate of turbo - * encoder is 1/3. - * MAP_GEN is the MAX-LOG-MAP generic implementation of the decoder. - * - * Reference: 3GPP TS 36.212 version 10.0.0 Release 10 Sec. 5.1.3.2 - *********************************************************************************************/ - -#ifndef SRSLTE_TURBODECODER_GEN_H -#define SRSLTE_TURBODECODER_GEN_H - -#include "srslte/config.h" -#include "srslte/phy/fec/tc_interl.h" -#include "srslte/phy/fec/cbsegm.h" - -#define SRSLTE_TCOD_RATE 3 -#define SRSLTE_TCOD_TOTALTAIL 12 - -#define SRSLTE_TCOD_MAX_LEN_CB 6144 -#define SRSLTE_TCOD_MAX_LEN_CODED (SRSLTE_TCOD_RATE*SRSLTE_TCOD_MAX_LEN_CB+SRSLTE_TCOD_TOTALTAIL) - -typedef struct SRSLTE_API { - int max_long_cb; - float *beta; -} srslte_map_gen_vl_t; - -typedef struct SRSLTE_API { - int max_long_cb; - - srslte_map_gen_vl_t dec; - - float *llr1; - float *llr2; - float *w; - float *syst; - float *parity; - - int current_cbidx; - uint32_t current_cb_len; - uint32_t n_iter; - srslte_tc_interl_t interleaver[SRSLTE_NOF_TC_CB_SIZES]; -} srslte_tdec_gen_t; - -SRSLTE_API int srslte_tdec_gen_init(srslte_tdec_gen_t * h, - uint32_t max_long_cb); - -SRSLTE_API void srslte_tdec_gen_free(srslte_tdec_gen_t * h); - -SRSLTE_API int srslte_tdec_gen_reset(srslte_tdec_gen_t * h, uint32_t long_cb); - -SRSLTE_API void srslte_tdec_gen_iteration(srslte_tdec_gen_t * h, - float * input, - uint32_t long_cb); - -SRSLTE_API void srslte_tdec_gen_decision(srslte_tdec_gen_t * h, - uint8_t *output, - uint32_t long_cb); - -SRSLTE_API void srslte_tdec_gen_decision_byte(srslte_tdec_gen_t * h, - uint8_t *output, - uint32_t long_cb); - -SRSLTE_API int srslte_tdec_gen_run_all(srslte_tdec_gen_t * h, - float * input, - uint8_t *output, - uint32_t nof_iterations, - uint32_t long_cb); - -#endif // SRSLTE_TURBODECODER_GEN_H +/** + * + * \section COPYRIGHT + * + * Copyright 2013-2015 Software Radio Systems Limited + * + * \section LICENSE + * + * This file is part of the srsLTE library. + * + * srsLTE is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License as + * published by the Free Software Foundation, either version 3 of + * the License, or (at your option) any later version. + * + * srsLTE is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * A copy of the GNU Affero General Public License can be found in + * the LICENSE file in the top-level directory of this distribution + * and at http://www.gnu.org/licenses/. + * + */ + +/********************************************************************************************** + * File: turbodecoder.h + * + * Description: Turbo Decoder. + * Parallel Concatenated Convolutional Code (PCCC) with two 8-state constituent + * encoders and one turbo code internal interleaver. The coding rate of turbo + * encoder is 1/3. + * MAP_GEN is the MAX-LOG-MAP generic implementation of the decoder. + * + * Reference: 3GPP TS 36.212 version 10.0.0 Release 10 Sec. 5.1.3.2 + *********************************************************************************************/ + +#ifndef SRSLTE_TURBODECODER_GEN_H +#define SRSLTE_TURBODECODER_GEN_H + +#include "srslte/config.h" +#include "srslte/phy/fec/tc_interl.h" +#include "srslte/phy/fec/cbsegm.h" + +#define SRSLTE_TCOD_RATE 3 +#define SRSLTE_TCOD_TOTALTAIL 12 + +#define SRSLTE_TCOD_MAX_LEN_CB 6144 + +typedef struct SRSLTE_API { + uint32_t max_long_cb; + int16_t *beta; +} tdec_gen_t; + +int tdec_gen_init(void **h, uint32_t max_long_cb); +void tdec_gen_free(void *h); +void tdec_gen_dec(void *h, int16_t * input, int16_t *app, int16_t * parity, int16_t *output, uint32_t long_cb); +void tdec_gen_extract_input(int16_t *input, int16_t *syst, int16_t *parity0, int16_t *parity1, int16_t *app2, uint32_t long_cb); +void tdec_gen_decision_byte(int16_t *app1, uint8_t *output, uint32_t long_cb); + +#endif // SRSLTE_TURBODECODER_GEN_H diff --git a/lib/include/srslte/phy/fec/turbodecoder_impl.h b/lib/include/srslte/phy/fec/turbodecoder_impl.h new file mode 100644 index 000000000..1fe0a5321 --- /dev/null +++ b/lib/include/srslte/phy/fec/turbodecoder_impl.h @@ -0,0 +1,68 @@ +/** + * + * \section COPYRIGHT + * + * Copyright 2013-2015 Software Radio Systems Limited + * + * \section LICENSE + * + * This file is part of the srsLTE library. + * + * srsLTE is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License as + * published by the Free Software Foundation, either version 3 of + * the License, or (at your option) any later version. + * + * srsLTE is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * A copy of the GNU Affero General Public License can be found in + * the LICENSE file in the top-level directory of this distribution + * and at http://www.gnu.org/licenses/. + * + */ + +#ifndef SRSLTE_TURBODECODER_IMPL_H +#define SRSLTE_TURBODECODER_IMPL_H + +#include "srslte/config.h" + +/* Interface for internal decoder implementation */ +typedef enum SRSLTE_API { + SRSLTE_TDEC_AUTO = 0, + SRSLTE_TDEC_GENERIC, + SRSLTE_TDEC_SSE, + SRSLTE_TDEC_SSE_WINDOW, + SRSLTE_TDEC_AVX_WINDOW, + SRSLTE_TDEC_SSE8_WINDOW, + SRSLTE_TDEC_AVX8_WINDOW, + SRSLTE_TDEC_NOF_IMP +} srslte_tdec_impl_type_t; + +#endif + +#ifdef LLR_IS_8BIT +#define llr_t int8_t +#define type_name srslte_tdec_8bit_impl_t +#else + #ifdef LLR_IS_16BIT + #define llr_t int16_t +#define type_name srslte_tdec_16bit_impl_t + #else + #error "Unsupported LLR mode" + #endif +#endif + + +typedef struct SRSLTE_API { + int (*tdec_init)(void **h, uint32_t max_long_cb); + void (*tdec_free)(void *h); + void (*tdec_dec)(void *h, llr_t * input, llr_t *app, llr_t * parity, llr_t *output, uint32_t long_cb); + void (*tdec_extract_input)(llr_t *input, llr_t *syst, llr_t *parity0, llr_t *parity1, llr_t *app2, uint32_t long_cb); + void (*tdec_decision_byte)(llr_t *app1, uint8_t *output, uint32_t long_cb); +} type_name; + +#undef llr_t +#undef type_name \ No newline at end of file diff --git a/lib/include/srslte/phy/fec/turbodecoder_iter.h b/lib/include/srslte/phy/fec/turbodecoder_iter.h new file mode 100644 index 000000000..7864829b5 --- /dev/null +++ b/lib/include/srslte/phy/fec/turbodecoder_iter.h @@ -0,0 +1,158 @@ +/** + * + * \section COPYRIGHT + * + * Copyright 2013-2015 Software Radio Systems Limited + * + * \section LICENSE + * + * This file is part of the srsLTE library. + * + * srsLTE is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License as + * published by the Free Software Foundation, either version 3 of + * the License, or (at your option) any later version. + * + * srsLTE is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * A copy of the GNU Affero General Public License can be found in + * the LICENSE file in the top-level directory of this distribution + * and at http://www.gnu.org/licenses/. + * + */ + +#include "srslte/config.h" + +#define MAKE_CALL(a) CONCAT2(a,type_name) +#define MAKE_VEC(a) CONCAT2(a,vec_suffix) +#define PRINT CONCAT2(srslte_vec_fprint,print_suffix) + +#ifdef LLR_IS_8BIT +#define llr_t int8_t +#define type_name _8bit +#define vec_suffix _bbb +#define print_suffix _bs +#define decptr h->dec8[h->current_dec] +#define dechdlr h->dec8_hdlr[h->current_dec] +#define input_is_interleaved 1 +#else +#ifdef LLR_IS_16BIT + #define llr_t int16_t + #define vec_suffix _sss + #define print_suffix _s + #define decptr h->dec16[h->current_dec] + #define dechdlr h->dec16_hdlr[h->current_dec] + #define input_is_interleaved (h->current_dec > 0) +#define type_name _16bit + #else + #warning "Unsupported LLR mode" + #endif +#endif + +#define debug_enabled_iter 0 +#define debug_len 20 + +#define debug_vec(a) if (debug_enabled_iter) {printf("%s it=%d: ", STRING(a), n_iter);PRINT(stdout, a, debug_len);} + + +static void MAKE_CALL(extract_input_tail_sb)(llr_t *input, llr_t *syst, llr_t *app2, llr_t *parity0, llr_t *parity1, uint32_t long_cb) +{ + for (int i = long_cb; i < long_cb + 3; i++) { + syst[i] = input[3*(long_cb+32) + 2*(i - long_cb)]; + parity0[i] = input[3*(long_cb+32)+ 2*(i - long_cb) + 1]; + + app2[i] = input[3*(long_cb+32) + 6 + 2*(i - long_cb)]; + parity1[i] = input[3*(long_cb+32) + 6 + 2*(i - long_cb) + 1]; + } +} + +/* Runs 1 turbo decoder iteration */ +void MAKE_CALL(run_tdec_iteration)(srslte_tdec_t * h, llr_t * input) +{ + + if (h->current_cbidx >= 0) { + uint16_t *inter = h->interleaver[h->current_inter_idx][h->current_cbidx].forward; + uint16_t *deinter = h->interleaver[h->current_inter_idx][h->current_cbidx].reverse; + llr_t *syst = (llr_t*) h->syst0; + llr_t *parity0 = (llr_t*) h->parity0; + llr_t *parity1 = (llr_t*) h->parity1; + + llr_t *app1 = (llr_t*) h->app1; + llr_t *app2 = (llr_t*) h->app2; + llr_t *ext1 = (llr_t*) h->ext1; + llr_t *ext2 = (llr_t*) h->ext2; + + uint32_t long_cb = h->current_long_cb; + uint32_t n_iter = h->n_iter; + + if (SRSLTE_TDEC_EXPECT_INPUT_SB && !h->force_not_sb && input_is_interleaved) { + syst = input; + // align to 32 bytes (warning: must be same alignment as in rm_turbo.c) + parity0 = &input[long_cb+32]; + parity1 = &input[2*(long_cb+32)]; + if (n_iter == 0) { + MAKE_CALL(extract_input_tail_sb)(input, syst, app2, parity0, parity1, long_cb); + } + } else { + if (n_iter == 0) { + decptr->tdec_extract_input(input, syst, app2, parity0, parity1, long_cb); + } + } + + if ((n_iter%2) == 0) { + + // Add apriori information to decoder 1 + if (n_iter) { + MAKE_VEC(srslte_vec_sub)(app1, ext1, app1, long_cb); + } + + // Run MAP DEC #1 + decptr->tdec_dec(dechdlr, syst, n_iter ? app1 : NULL, parity0, ext1, long_cb); + + } + // Interleave extrinsic output of DEC1 to form apriori info for decoder 2 + if (n_iter%2) { + // Convert aposteriori information into extrinsic information + if (n_iter > 1) { + MAKE_VEC(srslte_vec_sub)(ext1, app1, ext1, long_cb); + } + + MAKE_VEC(srslte_vec_lut)(ext1, deinter, app2, long_cb); + + // Run MAP DEC #2. 2nd decoder uses apriori information as systematic bits + decptr->tdec_dec(dechdlr, app2, NULL, parity1, ext2, long_cb); + + // Deinterleaved extrinsic bits become apriori info for decoder 1 + MAKE_VEC(srslte_vec_lut)(ext2, inter, app1, long_cb); + + } + + if (h->n_iter == 0) { + debug_vec(syst); + debug_vec(parity0); + debug_vec(parity1); + } + debug_vec(ext1); + debug_vec(ext2); + debug_vec(app1); + debug_vec(app2); + + h->n_iter++; + } else { + fprintf(stderr, "Error CB index not set (call srslte_tdec_new_cb() first\n"); + } +} + +#undef debug_enabled +#undef debug_len +#undef debug_vec +#undef llr_t +#undef vec_suffix +#undef print_suffix +#undef decptr +#undef dechdlr +#undef type_name +#undef input_is_interleaved \ No newline at end of file diff --git a/lib/include/srslte/phy/fec/turbodecoder_simd.h b/lib/include/srslte/phy/fec/turbodecoder_simd.h deleted file mode 100644 index 35c8beef9..000000000 --- a/lib/include/srslte/phy/fec/turbodecoder_simd.h +++ /dev/null @@ -1,122 +0,0 @@ -/** - * - * \section COPYRIGHT - * - * Copyright 2013-2015 Software Radio Systems Limited - * - * \section LICENSE - * - * This file is part of the srsLTE library. - * - * srsLTE is free software: you can redistribute it and/or modify - * it under the terms of the GNU Affero General Public License as - * published by the Free Software Foundation, either version 3 of - * the License, or (at your option) any later version. - * - * srsLTE is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Affero General Public License for more details. - * - * A copy of the GNU Affero General Public License can be found in - * the LICENSE file in the top-level directory of this distribution - * and at http://www.gnu.org/licenses/. - * - */ - -/********************************************************************************************** - * File: turbodecoder.h - * - * Description: Turbo Decoder. - * Parallel Concatenated Convolutional Code (PCCC) with two 8-state constituent - * encoders and one turbo code internal interleaver. The coding rate of turbo - * encoder is 1/3. - * MAP_GEN is the MAX-LOG-MAP generic implementation of the decoder. - * - * Reference: 3GPP TS 36.212 version 10.0.0 Release 10 Sec. 5.1.3.2 - *********************************************************************************************/ - -#ifndef SRSLTE_TURBODECODER_SIMD_H -#define SRSLTE_TURBODECODER_SIMD_H - -#include "srslte/config.h" -#include "srslte/phy/fec/tc_interl.h" -#include "srslte/phy/fec/cbsegm.h" - -// Define maximum number of CB decoded in parallel (2 for AVX2) -#define SRSLTE_TDEC_MAX_NPAR 2 - -#define SRSLTE_TCOD_RATE 3 -#define SRSLTE_TCOD_TOTALTAIL 12 - -#define SRSLTE_TCOD_MAX_LEN_CB 6144 -#define SRSLTE_TCOD_MAX_LEN_CODED (SRSLTE_TCOD_RATE*SRSLTE_TCOD_MAX_LEN_CB+SRSLTE_TCOD_TOTALTAIL) - -typedef struct SRSLTE_API { - uint32_t max_long_cb; - uint32_t max_par_cb; - int16_t *alpha; - int16_t *branch; -} map_gen_t; - -typedef struct SRSLTE_API { - uint32_t max_long_cb; - uint32_t max_par_cb; - - map_gen_t dec; - - int16_t *app1[SRSLTE_TDEC_MAX_NPAR]; - int16_t *app2[SRSLTE_TDEC_MAX_NPAR]; - int16_t *ext1[SRSLTE_TDEC_MAX_NPAR]; - int16_t *ext2[SRSLTE_TDEC_MAX_NPAR]; - int16_t *syst[SRSLTE_TDEC_MAX_NPAR]; - int16_t *parity0[SRSLTE_TDEC_MAX_NPAR]; - int16_t *parity1[SRSLTE_TDEC_MAX_NPAR]; - - int cb_mask; - int current_cbidx; - srslte_tc_interl_t interleaver[SRSLTE_NOF_TC_CB_SIZES]; - int n_iter[SRSLTE_TDEC_MAX_NPAR]; -} srslte_tdec_simd_t; - -SRSLTE_API int srslte_tdec_simd_init(srslte_tdec_simd_t * h, - uint32_t max_par_cb, - uint32_t max_long_cb); - -SRSLTE_API void srslte_tdec_simd_free(srslte_tdec_simd_t * h); - -SRSLTE_API int srslte_tdec_simd_reset(srslte_tdec_simd_t * h, - uint32_t long_cb); - -SRSLTE_API - -SRSLTE_API int srslte_tdec_simd_get_nof_iterations_cb(srslte_tdec_simd_t * h, - uint32_t cb_idx); - -SRSLTE_API int srslte_tdec_simd_reset_cb(srslte_tdec_simd_t * h, - uint32_t cb_idx); - -SRSLTE_API void srslte_tdec_simd_iteration(srslte_tdec_simd_t * h, - int16_t * input[SRSLTE_TDEC_MAX_NPAR], - uint32_t long_cb); - -SRSLTE_API void srslte_tdec_simd_decision(srslte_tdec_simd_t * h, - uint8_t *output[SRSLTE_TDEC_MAX_NPAR], - uint32_t long_cb); - -SRSLTE_API void srslte_tdec_simd_decision_byte(srslte_tdec_simd_t * h, - uint8_t *output[SRSLTE_TDEC_MAX_NPAR], - uint32_t long_cb); - -SRSLTE_API void srslte_tdec_simd_decision_byte_cb(srslte_tdec_simd_t * h, - uint8_t *output, - uint32_t cbidx, - uint32_t long_cb); - -SRSLTE_API int srslte_tdec_simd_run_all(srslte_tdec_simd_t * h, - int16_t * input[SRSLTE_TDEC_MAX_NPAR], - uint8_t *output[SRSLTE_TDEC_MAX_NPAR], - uint32_t nof_iterations, - uint32_t long_cb); - -#endif // SRSLTE_TURBODECODER_SIMD_H diff --git a/lib/include/srslte/phy/fec/turbodecoder_simd_inter.h b/lib/include/srslte/phy/fec/turbodecoder_simd_inter.h deleted file mode 100644 index 054fbc3cb..000000000 --- a/lib/include/srslte/phy/fec/turbodecoder_simd_inter.h +++ /dev/null @@ -1,119 +0,0 @@ -/** - * - * \section COPYRIGHT - * - * Copyright 2013-2015 Software Radio Systems Limited - * - * \section LICENSE - * - * This file is part of the srsLTE library. - * - * srsLTE is free software: you can redistribute it and/or modify - * it under the terms of the GNU Affero General Public License as - * published by the Free Software Foundation, either version 3 of - * the License, or (at your option) any later version. - * - * srsLTE is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Affero General Public License for more details. - * - * A copy of the GNU Affero General Public License can be found in - * the LICENSE file in the top-level directory of this distribution - * and at http://www.gnu.org/licenses/. - * - */ - -/********************************************************************************************** - * File: turbodecoder.h - * - * Description: Turbo Decoder. - * Parallel Concatenated Convolutional Code (PCCC) with two 8-state constituent - * encoders and one turbo code internal interleaver. The coding rate of turbo - * encoder is 1/3. - * MAP_GEN is the MAX-LOG-MAP generic implementation of the decoder. - * - * Reference: 3GPP TS 36.212 version 10.0.0 Release 10 Sec. 5.1.3.2 - *********************************************************************************************/ - -#ifndef SRSLTE_TURBODECODER_SIMD_INTER_H -#define SRSLTE_TURBODECODER_SIMD_INTER_H - - -/** This is an simd inter-frame parallel turbo decoder. Parallizes 8 code-blocks using SSE - * This implementation is currently not functional and not used by the rest of the code - */ - -#include "srslte/config.h" -#include "srslte/phy/fec/tc_interl.h" -#include "srslte/phy/fec/cbsegm.h" - -#if LV_HAVE_AVX2 - #define SRSLTE_TDEC_MAX_NPAR 16 -#else - #define SRSLTE_TDEC_MAX_NPAR 8 -#endif - - -typedef struct SRSLTE_API { - int max_long_cb; - - int16_t *syst0; - int16_t *parity0; - int16_t *syst1; - int16_t *parity1; - int16_t *llr1; - int16_t *llr2; - int16_t *w; - int16_t *alpha; - - uint32_t max_par_cb; - int current_cbidx; - uint32_t current_long_cb; - srslte_tc_interl_t interleaver[SRSLTE_NOF_TC_CB_SIZES]; - int n_iter[SRSLTE_TDEC_MAX_NPAR]; -} srslte_tdec_simd_inter_t; - -SRSLTE_API int srslte_tdec_simd_inter_init(srslte_tdec_simd_inter_t * h, - uint32_t max_par_cb, - uint32_t max_long_cb); - -SRSLTE_API void srslte_tdec_simd_inter_free(srslte_tdec_simd_inter_t * h); - -SRSLTE_API int srslte_tdec_simd_inter_reset(srslte_tdec_simd_inter_t * h, - uint32_t long_cb); - -SRSLTE_API int srslte_tdec_simd_inter_get_nof_iterations_cb(srslte_tdec_simd_inter_t * h, - uint32_t cb_idx); - -SRSLTE_API int srslte_tdec_simd_inter_reset_cb(srslte_tdec_simd_inter_t * h, - uint32_t cb_idx); - -SRSLTE_API void srslte_tdec_simd_inter_iteration(srslte_tdec_simd_inter_t * h, - int16_t * input[SRSLTE_TDEC_MAX_NPAR], - uint32_t nof_cb, - uint32_t long_cb); - -SRSLTE_API void srslte_tdec_simd_inter_decision(srslte_tdec_simd_inter_t * h, - uint8_t *output[SRSLTE_TDEC_MAX_NPAR], - uint32_t nof_cb, - uint32_t long_cb); - -SRSLTE_API void srslte_tdec_simd_inter_decision_byte(srslte_tdec_simd_inter_t * h, - uint8_t *output[SRSLTE_TDEC_MAX_NPAR], - uint32_t nof_cb, - uint32_t long_cb); - -SRSLTE_API void srslte_tdec_simd_inter_decision_byte_cb(srslte_tdec_simd_inter_t * h, - uint8_t *output, - uint32_t cbidx, - uint32_t long_cb); - -SRSLTE_API int srslte_tdec_simd_inter_run_all(srslte_tdec_simd_inter_t * h, - int16_t *input[SRSLTE_TDEC_MAX_NPAR], - uint8_t *output[SRSLTE_TDEC_MAX_NPAR], - uint32_t nof_iterations, - uint32_t nof_cb, - uint32_t long_cb); - -#endif // SRSLTE_TURBODECODER_SIMD_INTER_H diff --git a/lib/include/srslte/phy/fec/turbodecoder_sse.h b/lib/include/srslte/phy/fec/turbodecoder_sse.h index 9678fba9a..a02654238 100644 --- a/lib/include/srslte/phy/fec/turbodecoder_sse.h +++ b/lib/include/srslte/phy/fec/turbodecoder_sse.h @@ -1,101 +1,45 @@ -/** - * - * \section COPYRIGHT - * - * Copyright 2013-2015 Software Radio Systems Limited - * - * \section LICENSE - * - * This file is part of the srsLTE library. - * - * srsLTE is free software: you can redistribute it and/or modify - * it under the terms of the GNU Affero General Public License as - * published by the Free Software Foundation, either version 3 of - * the License, or (at your option) any later version. - * - * srsLTE is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Affero General Public License for more details. - * - * A copy of the GNU Affero General Public License can be found in - * the LICENSE file in the top-level directory of this distribution - * and at http://www.gnu.org/licenses/. - * - */ - -/********************************************************************************************** - * File: turbodecoder.h - * - * Description: Turbo Decoder. - * Parallel Concatenated Convolutional Code (PCCC) with two 8-state constituent - * encoders and one turbo code internal interleaver. The coding rate of turbo - * encoder is 1/3. - * MAP_GEN is the MAX-LOG-MAP generic implementation of the decoder. - * - * Reference: 3GPP TS 36.212 version 10.0.0 Release 10 Sec. 5.1.3.2 - *********************************************************************************************/ - -#ifndef SRSLTE_TURBODECODER_SSE_ -#define SRSLTE_TURBODECODER_SSE_ - -#include "srslte/config.h" -#include "srslte/phy/fec/tc_interl.h" -#include "srslte/phy/fec/cbsegm.h" - -#define SRSLTE_TCOD_RATE 3 -#define SRSLTE_TCOD_TOTALTAIL 12 - -#define SRSLTE_TCOD_MAX_LEN_CB 6144 -#define SRSLTE_TCOD_MAX_LEN_CODED (SRSLTE_TCOD_RATE*SRSLTE_TCOD_MAX_LEN_CB+SRSLTE_TCOD_TOTALTAIL) - -typedef struct SRSLTE_API { - int max_long_cb; - int16_t *alpha; - int16_t *branch; -} map_gen_t; - -typedef struct SRSLTE_API { - int max_long_cb; - - map_gen_t dec; - - int16_t *app1; - int16_t *app2; - int16_t *ext1; - int16_t *ext2; - int16_t *syst; - int16_t *parity0; - int16_t *parity1; - - int current_cbidx; - srslte_tc_interl_t interleaver[SRSLTE_NOF_TC_CB_SIZES]; - int n_iter; -} srslte_tdec_sse_t; - -SRSLTE_API int srslte_tdec_sse_init(srslte_tdec_sse_t * h, - uint32_t max_long_cb); - -SRSLTE_API void srslte_tdec_sse_free(srslte_tdec_sse_t * h); - -SRSLTE_API int srslte_tdec_sse_reset(srslte_tdec_sse_t * h, uint32_t long_cb); - -SRSLTE_API void srslte_tdec_sse_iteration(srslte_tdec_sse_t * h, - int16_t * input, - uint32_t long_cb); - -SRSLTE_API void srslte_tdec_sse_decision(srslte_tdec_sse_t * h, - uint8_t *output, - uint32_t long_cb); - -SRSLTE_API void srslte_tdec_sse_decision_byte(srslte_tdec_sse_t * h, - uint8_t *output, - uint32_t long_cb); - -SRSLTE_API int srslte_tdec_sse_run_all(srslte_tdec_sse_t * h, - int16_t * input, - uint8_t *output, - uint32_t nof_iterations, - uint32_t long_cb); - -#endif // SRSLTE_TURBODECODER_SSE_ +/** + * + * \section COPYRIGHT + * + * Copyright 2013-2015 Software Radio Systems Limited + * + * \section LICENSE + * + * This file is part of the srsLTE library. + * + * srsLTE is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License as + * published by the Free Software Foundation, either version 3 of + * the License, or (at your option) any later version. + * + * srsLTE is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * A copy of the GNU Affero General Public License can be found in + * the LICENSE file in the top-level directory of this distribution + * and at http://www.gnu.org/licenses/. + * + */ + +#ifndef SRSLTE_TURBODECODER_SSE_H +#define SRSLTE_TURBODECODER_SSE_H + +#include "srslte/config.h" + +typedef struct SRSLTE_API { + uint32_t max_long_cb; + int16_t *alpha; + int16_t *branch; +} tdec_sse_t; + +int tdec_sse_init(void **h, uint32_t max_long_cb); +void tdec_sse_free(void *h); +void tdec_sse_dec(void *h, int16_t * input, int16_t *app, int16_t * parity, + int16_t *output, uint32_t long_cb); +void tdec_sse_extract_input(int16_t *input, int16_t *syst, int16_t *parity0, int16_t *parity1, int16_t *app2, uint32_t long_cb); +void tdec_sse_decision_byte(int16_t *app1, uint8_t *output, uint32_t long_cb); + +#endif // SRSLTE_TURBODECODER_SSE_H diff --git a/lib/include/srslte/phy/fec/turbodecoder_win.h b/lib/include/srslte/phy/fec/turbodecoder_win.h new file mode 100644 index 000000000..c4941278d --- /dev/null +++ b/lib/include/srslte/phy/fec/turbodecoder_win.h @@ -0,0 +1,750 @@ +/** + * + * \section COPYRIGHT + * + * Copyright 2013-2015 Software Radio Systems Limited + * + * \section LICENSE + * + * This file is part of the srsLTE library. + * + * srsLTE is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License as + * published by the Free Software Foundation, either version 3 of + * the License, or (at your option) any later version. + * + * srsLTE is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * A copy of the GNU Affero General Public License can be found in + * the LICENSE file in the top-level directory of this distribution + * and at http://www.gnu.org/licenses/. + * + */ + +#include "srslte/config.h" + +#define MAKE_FUNC(a) CONCAT2(CONCAT2(tdec_win,WINIMP),CONCAT2(_,a)) +#define MAKE_TYPE CONCAT2(CONCAT2(tdec_win_,WINIMP),_t) + +#ifdef WINIMP_IS_SSE16 + + #ifndef LV_HAVE_SSE + #error "Selected SSE window decoder but instruction set not supported" + #endif + + #include + + #define WINIMP sse16 + #define nof_blocks 8 + + #define llr_t int16_t + + #define simd_type_t __m128i + #define simd_load _mm_load_si128 + #define simd_store _mm_store_si128 + #define simd_add _mm_adds_epi16 + #define simd_sub _mm_subs_epi16 + #define simd_max _mm_max_epi16 + #define simd_set1 _mm_set1_epi16 + #define simd_insert _mm_insert_epi16 + #define simd_shuffle _mm_shuffle_epi8 + #define move_right _mm_set_epi8(15,14,15,14,13,12,11,10,9,8,7,6,5,4,3,2) + #define move_left _mm_set_epi8(13,12,11,10,9,8,7,6,5,4,3,2,1,0,1,0) + #define simd_rb_shift _mm_srai_epi16 + + #define normalize_period 2 + #define win_overlap_len 40 + +#define divide_output 1 + +#define INF 10000 + +#else +#ifdef WINIMP_IS_AVX16 + + #ifndef LV_HAVE_AVX + #error "Selected AVX window decoder but instruction set not supported" + #endif + + #include + + #define WINIMP avx16 + #define nof_blocks 16 + + #define llr_t int16_t + + #define simd_type_t __m256i + #define simd_load _mm256_load_si256 + #define simd_store _mm256_store_si256 + #define simd_add _mm256_adds_epi16 + #define simd_sub _mm256_subs_epi16 + #define simd_max _mm256_max_epi16 + #define simd_set1 _mm256_set1_epi16 + #define simd_insert _mm256_insert_epi16 + #define simd_shuffle _mm256_shuffle_epi8 + #define move_right _mm256_set_epi8(31,30,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2) + #define move_left _mm256_set_epi8(29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,1,0) + + #define normalize_period 2 + #define win_overlap_len 40 + + #define INF 10000 +#else + +#ifdef WINIMP_IS_SSE8 + + #ifndef LV_HAVE_SSE + #error "Selected SSE window decoder but instruction set not supported" + #endif + + #include + + #define WINIMP sse8 + #define nof_blocks 16 + + #define llr_t int8_t + + #define simd_type_t __m128i + #define simd_load _mm_load_si128 + #define simd_store _mm_store_si128 + #define simd_add _mm_adds_epi8 + #define simd_sub _mm_subs_epi8 + #define simd_max _mm_max_epi8 + #define simd_set1 _mm_set1_epi8 + #define simd_insert _mm_insert_epi8 + #define simd_shuffle _mm_shuffle_epi8 + #define move_right _mm_set_epi8(15,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1) + #define move_left _mm_set_epi8(14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,0) + #define simd_rb_shift simd_rb_shift_128 + + #define normalize_max + #define normalize_period 1 + #define win_overlap_len 40 + #define use_saturated_add + #define divide_output 1 + + #define INF 0 + + inline static simd_type_t simd_rb_shift_128(simd_type_t v, const int l) { + __m128i low = _mm_srai_epi16(_mm_slli_epi16(v,8), l+8); + __m128i hi = _mm_srai_epi16(v,l); + return _mm_blendv_epi8(hi, low, _mm_set1_epi32(0x00FF00FF)); + } + + +#else + +#ifdef WINIMP_IS_AVX8 + + #ifndef LV_HAVE_AVX + #error "Selected AVX window decoder but instruction set not supported" + #endif + + #include + + #define WINIMP avx8 + #define nof_blocks 32 + + #define llr_t int8_t + + #define simd_type_t __m256i + #define simd_load _mm256_load_si256 + #define simd_store _mm256_store_si256 + #define simd_add _mm256_adds_epi8 + #define simd_sub _mm256_subs_epi8 + #define simd_max _mm256_max_epi8 + #define simd_set1 _mm256_set1_epi8 + #define simd_insert _mm256_insert_epi8 + #define simd_shuffle _mm256_shuffle_epi8 + #define move_right _mm256_set_epi8(31,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1) + #define move_left _mm256_set_epi8(30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,0) + #define simd_rb_shift simd_rb_shift_256 + + #define INF 0 + + #define normalize_max + #define normalize_period 1 + #define win_overlap_len 40 + #define use_saturated_add + #define divide_output 1 + + inline static simd_type_t simd_rb_shift_256(simd_type_t v, const int l) { + __m256i low = _mm256_srai_epi16(_mm256_slli_epi16(v,8), l+8); + __m256i hi = _mm256_srai_epi16(v,l); + return _mm256_blendv_epi8(hi, low, _mm256_set1_epi32(0x00FF00FF)); + } + + +#else + #error "Unknown WINIMP value" +#endif +#endif +#endif +#endif + +typedef struct SRSLTE_API { + uint32_t max_long_cb; + llr_t *beta; +} MAKE_TYPE; + + +#define long_sb (long_cb/nof_blocks) + + + +#define debug_enabled_win 0 + +#if debug_enabled_win +#define debug_state(d) printf("k=%5d, in=%5d, pa=%3d, out=%5d, alpha=[", d*long_sb+k+1, MAKE_FUNC(get_simd)(x,d), MAKE_FUNC(get_simd)(y,d), MAKE_FUNC(get_simd)(out,d)); \ + for (int j=0;j<8;j++) printf("%5d, ", MAKE_FUNC(get_simd)(old[j],d)); \ + printf("], beta=["); \ + for (int j=0;j<8;j++) printf("%5d, ", MAKE_FUNC(get_simd)(beta_save[j], d));printf("\n"); + +#define debug_state_pre(d) printf("pre-window k=%5d, in=%5d, pa=%3d, alpha=[", (d+1)*long_sb-loop_len+k+1, MAKE_FUNC(get_simd)(x,d), MAKE_FUNC(get_simd)(y,d)); \ + for (int j=0;j<8;j++) printf("%5d, ", MAKE_FUNC(get_simd)(old[j],d)); \ + printf("]\n"); + +#define debug_state_beta(d) printf("k=%5d, in=%5d, pa=%3d, beta=[", d*long_sb+k, MAKE_FUNC(get_simd)(x,d), MAKE_FUNC(get_simd)(y,d)); \ + for (int j=0;j<8;j++) printf("%5d, ", MAKE_FUNC(get_simd)(old[j],d));\ + printf("\n"); + +static llr_t MAKE_FUNC(get_simd)(simd_type_t x, uint32_t pos) { + llr_t *s = (llr_t*) &x; + return s[pos]; +} + + +#else +#define debug_state(a) +#define debug_state_pre(a) +#define debug_state_beta(a) +#endif +/* +static void MAKE_FUNC(print_simd)(simd_type_t x) { + llr_t *s = (llr_t*) &x; + printf("["); + for (int i=0;i127?127:(int8_t) z; +#endif +} + +inline static void MAKE_FUNC(normalize)(uint32_t k, simd_type_t old[8]) { + if ((k % normalize_period) == 0 && k != 0) { +#ifdef normalize_max + simd_type_t m = simd_max(old[0],old[1]); + for (int i=2;i<8;i++) { + m = simd_max(m,old[i]); + } + for (int i=0;i<8;i++) { + old[i] = simd_sub(old[i], m); + } +#else + for (int i = 1; i < 8; i++) { + old[i] = simd_sub(old[i], old[0]); + } + old[0] = simd_set1(0); +#endif + } +} + +static void MAKE_FUNC(beta_trellis)(llr_t *input, llr_t *parity, uint32_t long_cb, llr_t old[8]) +{ + llr_t m_b[8], new[8]; + llr_t x, y, xy; + + /* Calculate last state using Tail. No need to use SIMD here */ + old[0] = 0; + for (int i = 1; i < 8; i++) { + old[i] = -INF; + } + for (int k=long_cb+2;k >= long_cb; k--) { + x = input[k]; + y = parity[k]; + + xy = MAKE_FUNC(sadd)(x, y); + + m_b[0] = MAKE_FUNC(sadd)(old[4],xy); + m_b[1] = old[4]; + m_b[2] = MAKE_FUNC(sadd)(old[5], y); + m_b[3] = MAKE_FUNC(sadd)(old[5], x); + m_b[4] = MAKE_FUNC(sadd)(old[6], x); + m_b[5] = MAKE_FUNC(sadd)(old[6], y); + m_b[6] = old[7]; + m_b[7] = MAKE_FUNC(sadd)(old[7], xy); + + new[0] = old[0]; + new[1] = MAKE_FUNC(sadd)(old[0], xy); + new[2] = MAKE_FUNC(sadd)(old[1], x); + new[3] = MAKE_FUNC(sadd)(old[1], y); + new[4] = MAKE_FUNC(sadd)(old[2], y); + new[5] = MAKE_FUNC(sadd)(old[2], x); + new[6] = MAKE_FUNC(sadd)(old[3], xy); + new[7] = old[3]; + +#if debug_enabled_win + printf("trellis: k=%d, in=%d, pa=%d, beta: ", k, x, y); for (int i=0;i<8;i++) {printf("%d,", old[i]);} printf("\n"); +#endif + + for (int i = 0; i < 8; i++) { + if (m_b[i] > new[i]) + new[i] = m_b[i]; + old[i] = new[i]; + } + } +} + +/* Computes beta values */ +static void MAKE_FUNC(beta)(MAKE_TYPE * s, llr_t *input, llr_t *app, llr_t *parity, uint32_t long_cb) +{ + simd_type_t m_b[8], new[8], old[8]; + simd_type_t x, y, xy, ap; + + simd_type_t *inputPtr; + simd_type_t *appPtr; + simd_type_t *parityPtr; + simd_type_t *betaPtr = (simd_type_t*) s->beta; + + uint32_t loop_len; + for (int j=0;j<2;j++) { + + // First run L states to find initial state for all sub-blocks after first + if (j==0) { + loop_len = win_overlap_len; + } else { + loop_len = long_sb; + } + + // When passing through all window pick estimated initial states (known state for sb=0) + if (loop_len == long_sb) { + + // shuffle across 128-bit boundary manually +#ifdef WINIMP_IS_AVX16 + llr_t tmp[8]; + for (int i = 0; i < 8; i++) { + tmp[i] = _mm256_extract_epi16(old[i], 8); + } +#endif +#ifdef WINIMP_IS_AVX8 + llr_t tmp[8]; + for (int i = 0; i < 8; i++) { + tmp[i] = _mm256_extract_epi8(old[i], 16); + } +#endif + + for (int i = 0; i < 8; i++) { + old[i] = simd_shuffle(old[i], move_right); + } + // last sub-block state is calculated from the trellis + llr_t trellis_old[8]; + MAKE_FUNC(beta_trellis)(input, parity, long_cb, trellis_old); + for (int i = 0; i < 8; i++) { + old[i] = simd_insert(old[i], trellis_old[i], nof_blocks-1); + } + +#ifdef WINIMP_IS_AVX16 + for (int i = 0; i < 8; i++) { + old[i] = _mm256_insert_epi16(old[i], tmp[i], 7); + } +#endif +#ifdef WINIMP_IS_AVX8 + for (int i = 0; i < 8; i++) { + old[i] = _mm256_insert_epi8(old[i], tmp[i], 15); + } +#endif + + inputPtr = (simd_type_t*) &input[long_cb-nof_blocks]; + appPtr = (simd_type_t*) &app[long_cb-nof_blocks]; + parityPtr = (simd_type_t*) &parity[long_cb-nof_blocks]; + + for (int i = 0; i < 8; i++) { + simd_store(&betaPtr[8*long_sb + i], old[i]); + } + + } else { + // when estimating states, just set all to unknown + for (int i = 0; i < 8; i++) { + old[i] = simd_set1(-INF); + } + inputPtr = (simd_type_t*) &input[nof_blocks*(loop_len-1)]; + appPtr = (simd_type_t*) &app[nof_blocks*(loop_len-1)]; + parityPtr = (simd_type_t*) &parity[nof_blocks*(loop_len-1)]; + } + + for (int k = loop_len - 1; k >= 0; k--) { + x = simd_load(inputPtr--); + y = simd_load(parityPtr--); + + if (app) { + ap = simd_load(appPtr--); + x = simd_add(ap, x); + } + + xy = simd_add(x, y); + + m_b[0] = simd_add(old[4], xy); + m_b[1] = old[4]; + m_b[2] = simd_add(old[5], y); + m_b[3] = simd_add(old[5], x); + m_b[4] = simd_add(old[6], x); + m_b[5] = simd_add(old[6], y); + m_b[6] = old[7]; + m_b[7] = simd_add(old[7], xy); + + new[0] = old[0]; + new[1] = simd_add(old[0], xy); + new[2] = simd_add(old[1], x); + new[3] = simd_add(old[1], y); + new[4] = simd_add(old[2], y); + new[5] = simd_add(old[2], x); + new[6] = simd_add(old[3], xy); + new[7] = old[3]; + + // Calculate maximum metric + for (int i = 0; i < 8; i++) { + old[i] = simd_max(m_b[i], new[i]); + } + // Store metric only when doing the final pass + if (loop_len == long_sb) { + for (int i = 0; i < 8; i++) { + simd_store(&betaPtr[8*k + i], old[i]); + } + } + if (loop_len!=long_sb) { + debug_state_beta(0); + } else { + debug_state_beta(0); + } + + // normalize + MAKE_FUNC(normalize)(k, old); + } + } +} + +/* Computes alpha metrics */ +static void MAKE_FUNC(alpha)(MAKE_TYPE * s, llr_t *input, llr_t *app, llr_t *parity, llr_t * output, uint32_t long_cb) +{ + simd_type_t m_b[8], new[8], old[8], max1[8], max0[8]; + simd_type_t x, y, xy, ap; + simd_type_t m1, m0; + + simd_type_t *inputPtr; + simd_type_t *appPtr; + simd_type_t *parityPtr; + simd_type_t *betaPtr = (simd_type_t*) s->beta; + simd_type_t *outputPtr = (simd_type_t*) output; + +#if debug_enabled_win + simd_type_t beta_save[8]; +#endif + + // Skip state 0 + betaPtr+=8; + + uint32_t loop_len; + + for (int j=0;j<2;j++) { + + // First run L states to find initial state for all sub-blocks after first + if (j==0) { + loop_len = win_overlap_len; + } else { + loop_len = long_sb; + } + + // When passing through all window pick estimated initial states (known state for sb=0) + if (loop_len == long_sb) { + +#ifdef WINIMP_IS_AVX16 + llr_t tmp[8]; + for (int i=0;i<8;i++) { + tmp[i] = _mm256_extract_epi16(old[i], 7); + } +#endif +#ifdef WINIMP_IS_AVX8 + llr_t tmp[8]; + for (int i=0;i<8;i++) { + tmp[i] = _mm256_extract_epi8(old[i], 15); + } +#endif + for (int i = 0; i < 8; i++) { + old[i] = simd_shuffle(old[i], move_left); + } +#ifdef WINIMP_IS_AVX16 + for (int i=0;i<8;i++) { + old[i] = _mm256_insert_epi16(old[i], tmp[i], 8); + } +#endif +#ifdef WINIMP_IS_AVX8 + for (int i=0;i<8;i++) { + old[i] = _mm256_insert_epi8(old[i], tmp[i], 16); + } +#endif + // 1st sub-block state is known + old[0] = simd_insert(old[0], 0, 0); + for (int i = 1; i < 8; i++) { + old[i] = simd_insert(old[i], -INF, 0); + } + } else { + // when estimating states, just set all to unknown + for (int i = 0; i < 8; i++) { + old[i] = simd_set1(-INF); + } + } + + inputPtr = (simd_type_t*) &input[nof_blocks*(long_sb-loop_len)]; + appPtr = (simd_type_t*) &app[nof_blocks*(long_sb-loop_len)]; + parityPtr = (simd_type_t*) &parity[nof_blocks*(long_sb-loop_len)]; + + for (int k = 0; k < loop_len; k++) { + x = simd_load(inputPtr++); + y = simd_load(parityPtr++); + + if (app) { + ap = simd_load(appPtr++); + x = simd_add(ap, x); + } + + xy = simd_add(x,y); + + m_b[0] = old[0]; + m_b[1] = simd_add(old[3], y); + m_b[2] = simd_add(old[4], y); + m_b[3] = old[7]; + m_b[4] = old[1]; + m_b[5] = simd_add(old[2], y); + m_b[6] = simd_add(old[5], y); + m_b[7] = old[6]; + + new[0] = simd_add(old[1], xy); + new[1] = simd_add(old[2], x); + new[2] = simd_add(old[5], x); + new[3] = simd_add(old[6], xy); + new[4] = simd_add(old[0], xy); + new[5] = simd_add(old[3], x); + new[6] = simd_add(old[4], x); + new[7] = simd_add(old[7], xy); + + // Load beta and compute output only when passing through all window + if (loop_len == long_sb) { + simd_type_t beta; + for (int i = 0; i < 8; i++) { + beta = simd_load(betaPtr++); + max0[i] = simd_add(beta, m_b[i]); + max1[i] = simd_add(beta, new[i]); + +#if debug_enabled_win + beta_save[i] = beta; +#endif + } + + m1 = simd_max(max1[0], max1[1]); + m0 = simd_max(max0[0], max0[1]); + + for (int i = 2; i < 8; i++) { + m1 = simd_max(m1, max1[i]); + m0 = simd_max(m0, max0[i]); + } + + simd_type_t out = simd_sub(m1, m0); + + // Divide output when using 8-bit arithmetic +#ifdef divide_output + out = simd_rb_shift(out, divide_output); +#endif + + simd_store(outputPtr++, out); + + debug_state(0); + } + + for (int i = 0; i < 8; i++) { + old[i] = simd_max(m_b[i], new[i]); + } + + // normalize + MAKE_FUNC(normalize)(k, old); + + if (loop_len != long_sb) { + debug_state_pre(0); + } + } + } +} + +int MAKE_FUNC(init)(void **hh, uint32_t max_long_cb) +{ + *hh = calloc(1, sizeof(MAKE_TYPE)); + + MAKE_TYPE *h = (MAKE_TYPE*) *hh; + + h->beta = srslte_vec_malloc(sizeof(llr_t) * 8 * max_long_cb * nof_blocks); + if (!h->beta) { + perror("srslte_vec_malloc"); + return -1; + } + h->max_long_cb = max_long_cb; + return nof_blocks; +} + +void MAKE_FUNC(free)(void *hh) +{ + MAKE_TYPE *h = (MAKE_TYPE*) hh; + if (h->beta) { + free(h->beta); + } + bzero(h, sizeof(MAKE_TYPE)); +} + +void MAKE_FUNC(dec)(void *hh, llr_t *input, llr_t *app, llr_t *parity, llr_t *output, uint32_t long_cb) +{ + MAKE_TYPE *h = (MAKE_TYPE*) hh; + MAKE_FUNC(beta)(h, input, app, parity, long_cb); + MAKE_FUNC(alpha)(h, input, app, parity, output, long_cb); +#if debug_enabled_win + printf("running win decoder: %s\n", STRING(WINIMP)); +#endif +} + +#define INSERT8_INPUT(reg, st, off) reg = simd_insert(reg, input[3*(i+(st+0)*long_sb)+off], st+0);\ + reg = simd_insert(reg, input[3*(i+(st+1)*long_sb)+off], st+1);\ + reg = simd_insert(reg, input[3*(i+(st+2)*long_sb)+off], st+2);\ + reg = simd_insert(reg, input[3*(i+(st+3)*long_sb)+off], st+3);\ + reg = simd_insert(reg, input[3*(i+(st+4)*long_sb)+off], st+4);\ + reg = simd_insert(reg, input[3*(i+(st+5)*long_sb)+off], st+5);\ + reg = simd_insert(reg, input[3*(i+(st+6)*long_sb)+off], st+6);\ + reg = simd_insert(reg, input[3*(i+(st+7)*long_sb)+off], st+7); + + +void MAKE_FUNC(extract_input)(llr_t *input, llr_t *systematic, llr_t *app2, llr_t *parity_0, llr_t *parity_1, uint32_t long_cb) +{ + simd_type_t *systPtr = (simd_type_t*) systematic; + simd_type_t *parity0Ptr = (simd_type_t*) parity_0; + simd_type_t *parity1Ptr = (simd_type_t*) parity_1; + + simd_type_t syst, parity0, parity1; + + for (int i=0;i= 16 + INSERT8_INPUT(syst, 8, 0); + INSERT8_INPUT(parity0, 8, 1); + INSERT8_INPUT(parity1, 8, 2); +#endif + +#if nof_blocks >= 32 + INSERT8_INPUT(syst, 16, 0); + INSERT8_INPUT(parity0, 16, 1); + INSERT8_INPUT(parity1, 16, 2); + INSERT8_INPUT(syst, 24, 0); + INSERT8_INPUT(parity0, 24, 1); + INSERT8_INPUT(parity1, 24, 2); +#endif + + simd_store(systPtr++, syst); + simd_store(parity0Ptr++, parity0); + simd_store(parity1Ptr++, parity1); + } + + for (int i = long_cb; i < long_cb + 3; i++) { + systematic[i] = input[3*long_cb + 2*(i - long_cb)]; + parity_0[i] = input[3*long_cb + 2*(i - long_cb) + 1]; + + app2[i] = input[3*long_cb + 6 + 2*(i - long_cb)]; + parity_1[i] = input[3*long_cb + 6 + 2*(i - long_cb) + 1]; + } +} + +#define deinter(x,win) ((x%(long_cb/win))*(win)+x/(long_cb/win)) + +#define reset_cnt(a,b) if(!((a+1)%b)) { \ + k+=b*nof_blocks; \ + if (k >= long_cb) { \ + k -= (long_cb-1);\ + }\ + } +#define insert_bit(a,b) ap = _mm_insert_epi16(ap, app1[k+(a%b)*nof_blocks], 7-a); \ + reset_cnt(a,b); \ + + +#define decide_for(b) for (uint32_t i = 0; i < long_cb/8; i++) { \ + insert_bit(0,b);\ + insert_bit(1,b);\ + insert_bit(2,b);\ + insert_bit(3,b);\ + insert_bit(4,b);\ + insert_bit(5,b);\ + insert_bit(6,b);\ + insert_bit(7,b);\ + output[i] = (uint8_t) _mm_movemask_epi8(_mm_cmpgt_epi8(_mm_packs_epi16(ap,zeros),zeros));\ + } + +/* No improvement to use AVX here */ +void MAKE_FUNC(decision_byte)(llr_t *app1, uint8_t *output, uint32_t long_cb) +{ + uint32_t k=0; + __m128i zeros = _mm_setzero_si128(); + __m128i ap; + + if ((long_cb%(nof_blocks*8)) == 0) { + decide_for(8); + } else if ((long_cb%(nof_blocks*4)) == 0) { + decide_for(4); + } else if ((long_cb%(nof_blocks*2)) == 0) { + decide_for(2); + } else { + decide_for(1); + } +} + + +#undef WINIMP +#undef nof_blocks +#undef llr_t +#undef normalize_period +#undef INF +#undef win_overlap_len +#undef simd_type_t +#undef simd_load +#undef simd_store +#undef simd_add +#undef simd_sub +#undef simd_max +#undef simd_set1 +#undef simd_insert +#undef simd_shuffle +#undef move_right +#undef move_left +#undef debug_enabled_win + +#ifdef normalize_max +#undef normalize_max +#endif + +#ifdef use_saturated_add +#undef use_saturated_add +#endif + +#ifdef simd_rb_shift +#undef simd_rb_shift +#endif + +#ifdef divide_output +#undef divide_output +#endif \ No newline at end of file diff --git a/lib/include/srslte/phy/modem/demod_soft.h b/lib/include/srslte/phy/modem/demod_soft.h index 9ad065edc..eb1924cae 100644 --- a/lib/include/srslte/phy/modem/demod_soft.h +++ b/lib/include/srslte/phy/modem/demod_soft.h @@ -53,4 +53,9 @@ SRSLTE_API int srslte_demod_soft_demodulate_s(srslte_mod_t modulation, short* llr, int nsymbols); +SRSLTE_API int srslte_demod_soft_demodulate_b(srslte_mod_t modulation, + const cf_t* symbols, + int8_t* llr, + int nsymbols); + #endif // SRSLTE_DEMOD_SOFT_H diff --git a/lib/include/srslte/phy/phch/pdsch.h b/lib/include/srslte/phy/phch/pdsch.h index f1ecbc348..787853d0a 100644 --- a/lib/include/srslte/phy/phch/pdsch.h +++ b/lib/include/srslte/phy/phch/pdsch.h @@ -65,6 +65,8 @@ typedef struct SRSLTE_API { uint16_t ue_rnti; bool is_ue; + bool llr_is_8bit; + /* Power allocation parameter 3GPP 36.213 Clause 5.2 Rho_b */ float rho_a; diff --git a/lib/include/srslte/phy/phch/pusch.h b/lib/include/srslte/phy/phch/pusch.h index a5b8f04ae..027fc02b2 100644 --- a/lib/include/srslte/phy/phch/pusch.h +++ b/lib/include/srslte/phy/phch/pusch.h @@ -74,6 +74,8 @@ typedef struct SRSLTE_API { uint16_t ue_rnti; uint32_t max_re; + bool llr_is_8bit; + srslte_dft_precoding_t dft_precoding; /* buffers */ diff --git a/lib/include/srslte/phy/phch/sch.h b/lib/include/srslte/phy/phch/sch.h index 003e9b7df..50077f417 100644 --- a/lib/include/srslte/phy/phch/sch.h +++ b/lib/include/srslte/phy/phch/sch.h @@ -59,6 +59,8 @@ typedef struct SRSLTE_API { uint32_t max_iterations; uint32_t nof_iterations; + bool llr_is_8bit; + /* buffers */ uint8_t *cb_in; uint8_t *parity_bits; diff --git a/lib/include/srslte/phy/scrambling/scrambling.h b/lib/include/srslte/phy/scrambling/scrambling.h index c38bbe8c9..66b4b1431 100644 --- a/lib/include/srslte/phy/scrambling/scrambling.h +++ b/lib/include/srslte/phy/scrambling/scrambling.h @@ -68,6 +68,11 @@ SRSLTE_API void srslte_scrambling_s_offset(srslte_sequence_t *s, int offset, int len); +SRSLTE_API void srslte_scrambling_sb_offset(srslte_sequence_t *s, + int8_t *data, + int offset, + int len); + SRSLTE_API void srslte_scrambling_c(srslte_sequence_t *s, cf_t *data); diff --git a/lib/include/srslte/phy/utils/simd.h b/lib/include/srslte/phy/utils/simd.h index 2a7566e18..6f2e18e46 100644 --- a/lib/include/srslte/phy/utils/simd.h +++ b/lib/include/srslte/phy/utils/simd.h @@ -100,6 +100,7 @@ #define SRSLTE_SIMD_I_SIZE 16 +#define SRSLTE_SIMD_B_SIZE 64 #define SRSLTE_SIMD_S_SIZE 32 #define SRSLTE_SIMD_C16_SIZE 0 @@ -111,6 +112,7 @@ #define SRSLTE_SIMD_I_SIZE 8 +#define SRSLTE_SIMD_B_SIZE 32 #define SRSLTE_SIMD_S_SIZE 16 #define SRSLTE_SIMD_C16_SIZE 16 @@ -122,6 +124,7 @@ #define SRSLTE_SIMD_I_SIZE 4 +#define SRSLTE_SIMD_B_SIZE 16 #define SRSLTE_SIMD_S_SIZE 8 #define SRSLTE_SIMD_C16_SIZE 8 @@ -132,7 +135,7 @@ #define SRSLTE_SIMD_CF_SIZE 4 #define SRSLTE_SIMD_I_SIZE 4 - +#define SRSLTE_SIMD_B_SIZE 16 #define SRSLTE_SIMD_S_SIZE 8 #define SRSLTE_SIMD_C16_SIZE 8 @@ -141,7 +144,7 @@ #define SRSLTE_SIMD_CF_SIZE 0 #define SRSLTE_SIMD_I_SIZE 0 - +#define SRSLTE_SIMD_B_SIZE 0 #define SRSLTE_SIMD_S_SIZE 0 #define SRSLTE_SIMD_C16_SIZE 0 @@ -1336,6 +1339,24 @@ static inline simd_s_t srslte_simd_s_mul(simd_s_t a, simd_s_t b) { #endif /* LV_HAVE_AVX512 */ } +static inline simd_s_t srslte_simd_s_neg(simd_s_t a, simd_s_t b) { +#ifdef LV_HAVE_AVX512 +#error sign instruction not available in avx512 +#else /* LV_HAVE_AVX512 */ +#ifdef LV_HAVE_AVX2 + return _mm256_sign_epi16(a, b); +#else /* LV_HAVE_AVX2 */ + #ifdef LV_HAVE_SSE + return _mm_sign_epi16(a, b); +#else /* LV_HAVE_SSE */ +#ifdef HAVE_NEON + #error sign instruction not available in Neon +#endif /* HAVE_NEON */ +#endif /* LV_HAVE_SSE */ +#endif /* LV_HAVE_AVX2 */ +#endif /* LV_HAVE_AVX512 */ +} + static inline simd_s_t srslte_simd_s_add(simd_s_t a, simd_s_t b) { #ifdef LV_HAVE_AVX512 return _mm512_add_epi16(a, b); @@ -1681,7 +1702,7 @@ typedef int8x16_t simd_b_t; -static inline simd_b_t srslte_simd_b_load(int8_t *ptr){ +static inline simd_b_t srslte_simd_b_load(const int8_t *ptr){ #ifdef LV_HAVE_AVX512 return _mm512_load_si512(ptr); #else /* LV_HAVE_AVX512 */ @@ -1699,7 +1720,7 @@ static inline simd_b_t srslte_simd_b_load(int8_t *ptr){ #endif /* LV_HAVE_AVX512 */ } -static inline simd_b_t srslte_simd_b_loadu(int8_t *ptr){ +static inline simd_b_t srslte_simd_b_loadu(const int8_t *ptr){ #ifdef LV_HAVE_AVX512 return _mm512_loadu_si512(ptr); #else /* LV_HAVE_AVX512 */ @@ -1773,6 +1794,44 @@ static inline simd_b_t srslte_simd_b_xor(simd_b_t a, simd_b_t b) { #endif /* LV_HAVE_AVX512 */ } +static inline simd_s_t srslte_simd_b_sub(simd_s_t a, simd_s_t b) { +#ifdef LV_HAVE_AVX512 + return _mm512_subs_epi8(a, b); +#else /* LV_HAVE_AVX512 */ +#ifdef LV_HAVE_AVX2 + return _mm256_subs_epi8(a, b); +#else /* LV_HAVE_AVX2 */ + #ifdef LV_HAVE_SSE + return _mm_subs_epi8(a, b); +#else /* LV_HAVE_SSE */ +#ifdef HAVE_NEON + return vsubqs_s8(a, b); +#endif /* HAVE_NEON */ +#endif /* LV_HAVE_SSE */ +#endif /* LV_HAVE_AVX2 */ +#endif /* LV_HAVE_AVX512 */ +} + +static inline simd_s_t srslte_simd_b_neg(simd_b_t a, simd_b_t b) { +#ifdef LV_HAVE_AVX512 +#error sign instruction not available in avx512 +#else /* LV_HAVE_AVX512 */ +#ifdef LV_HAVE_AVX2 + return _mm256_sign_epi8(a, b); +#else /* LV_HAVE_AVX2 */ + #ifdef LV_HAVE_SSE + return _mm_sign_epi8(a, b); +#else /* LV_HAVE_SSE */ +#ifdef HAVE_NEON + #error sign instruction not available in Neon +#endif /* HAVE_NEON */ +#endif /* LV_HAVE_SSE */ +#endif /* LV_HAVE_AVX2 */ +#endif /* LV_HAVE_AVX512 */ +} + + + #endif /*SRSLTE_SIMD_B_SIZE */ diff --git a/lib/include/srslte/phy/utils/vector.h b/lib/include/srslte/phy/utils/vector.h index 32629da3e..c9e491027 100644 --- a/lib/include/srslte/phy/utils/vector.h +++ b/lib/include/srslte/phy/utils/vector.h @@ -69,6 +69,7 @@ SRSLTE_API void *srslte_vec_realloc(void *ptr, uint32_t old_size, uint32_t new_s SRSLTE_API void srslte_vec_fprint_c(FILE *stream, cf_t *x, const uint32_t len); SRSLTE_API void srslte_vec_fprint_f(FILE *stream, float *x, const uint32_t len); SRSLTE_API void srslte_vec_fprint_b(FILE *stream, uint8_t *x, const uint32_t len); +SRSLTE_API void srslte_vec_fprint_bs(FILE *stream, int8_t *x, const uint32_t len); SRSLTE_API void srslte_vec_fprint_byte(FILE *stream, uint8_t *x, const uint32_t len); SRSLTE_API void srslte_vec_fprint_i(FILE *stream, int *x, const uint32_t len); SRSLTE_API void srslte_vec_fprint_s(FILE *stream, short *x, const uint32_t len); @@ -82,12 +83,13 @@ SRSLTE_API void srslte_vec_load_file(char *filename, void *buffer, const uint32_ /* sum two vectors */ SRSLTE_API void srslte_vec_sum_fff(const float *x, const float *y, float *z, const uint32_t len); SRSLTE_API void srslte_vec_sum_ccc(const cf_t *x, const cf_t *y, cf_t *z, const uint32_t len); -SRSLTE_API void srslte_vec_sub_sss(const int16_t *x, const int16_t *y, int16_t *z, const uint32_t len); SRSLTE_API void srslte_vec_sum_sss(const int16_t *x, const int16_t *y, int16_t *z, const uint32_t len); /* substract two vectors z=x-y */ SRSLTE_API void srslte_vec_sub_fff(const float *x, const float *y, float *z, const uint32_t len); SRSLTE_API void srslte_vec_sub_ccc(const cf_t *x, const cf_t *y, cf_t *z, const uint32_t len); +SRSLTE_API void srslte_vec_sub_sss(const int16_t *x, const int16_t *y, int16_t *z, const uint32_t len); +SRSLTE_API void srslte_vec_sub_bbb(const int8_t *x, const int8_t *y, int8_t *z, const uint32_t len); /* scalar product */ SRSLTE_API void srslte_vec_sc_prod_cfc(const cf_t *x, const float h, cf_t *z, const uint32_t len); @@ -97,8 +99,10 @@ SRSLTE_API void srslte_vec_sc_prod_fff(const float *x, const float h, float *z, SRSLTE_API void srslte_vec_convert_fi(const float *x, const float scale, int16_t *z, const uint32_t len); SRSLTE_API void srslte_vec_convert_if(const int16_t *x, const float scale, float *z, const uint32_t len); +SRSLTE_API void srslte_vec_convert_fb(const float *x, const float scale, int8_t *z, const uint32_t len); SRSLTE_API void srslte_vec_lut_sss(const short *x, const unsigned short *lut, short *y, const uint32_t len); +SRSLTE_API void srslte_vec_lut_bbb(const int8_t *x, const unsigned short *lut, int8_t *y, const uint32_t len); SRSLTE_API void srslte_vec_lut_sis(const short *x, const unsigned int *lut, short *y, const uint32_t len); /* vector product (element-wise) */ @@ -115,6 +119,10 @@ SRSLTE_API void srslte_vec_prod_conj_ccc(const cf_t *x, const cf_t *y, cf_t *z, SRSLTE_API void srslte_vec_prod_fff(const float *x, const float *y, float *z, const uint32_t len); SRSLTE_API void srslte_vec_prod_sss(const int16_t *x, const int16_t *y, int16_t *z, const uint32_t len); +// Negate sign (scrambling) +SRSLTE_API void srslte_vec_neg_sss(const int16_t *x, const int16_t *y, int16_t *z, const uint32_t len); +SRSLTE_API void srslte_vec_neg_bbb(const int8_t *x, const int8_t *y, int8_t *z, const uint32_t len); + /* Dot-product */ SRSLTE_API cf_t srslte_vec_dot_prod_cfc(const cf_t *x, const float *y, const uint32_t len); SRSLTE_API cf_t srslte_vec_dot_prod_ccc(const cf_t *x, const cf_t *y, const uint32_t len); diff --git a/lib/include/srslte/phy/utils/vector_simd.h b/lib/include/srslte/phy/utils/vector_simd.h index 68ddbdee0..cd198df8f 100644 --- a/lib/include/srslte/phy/utils/vector_simd.h +++ b/lib/include/srslte/phy/utils/vector_simd.h @@ -62,6 +62,8 @@ SRSLTE_API void srslte_vec_sum_sss_simd(const int16_t *x, const int16_t *y, int1 SRSLTE_API void srslte_vec_sub_sss_simd(const int16_t *x, const int16_t *y, int16_t *z, int len); +SRSLTE_API void srslte_vec_sub_bbb_simd(const int8_t *x, const int8_t *y, int8_t *z, int len); + SRSLTE_API float srslte_vec_acc_ff_simd(const float *x, int len); SRSLTE_API cf_t srslte_vec_acc_cc_simd(const cf_t *x, int len); @@ -86,6 +88,10 @@ SRSLTE_API void srslte_vec_prod_ccc_c16_simd(const int16_t *a_re, const int16_t SRSLTE_API void srslte_vec_prod_sss_simd(const int16_t *x, const int16_t *y, int16_t *z, const int len); +SRSLTE_API void srslte_vec_neg_sss_simd(const int16_t *x, const int16_t *y, int16_t *z, const int len); + +SRSLTE_API void srslte_vec_neg_bbb_simd(const int8_t *x, const int8_t *y, int8_t *z, const int len); + SRSLTE_API void srslte_vec_prod_cfc_simd(const cf_t *x, const float *y, cf_t *z, const int len); SRSLTE_API void srslte_vec_prod_fff_simd(const float *x, const float *y, float *z, const int len); @@ -120,10 +126,14 @@ SRSLTE_API void srslte_vec_abs_square_cf_simd(const cf_t *x, float *z, const int /* Other Functions */ SRSLTE_API void srslte_vec_lut_sss_simd(const short *x, const unsigned short *lut, short *y, const int len); +SRSLTE_API void srslte_vec_lut_bbb_simd(const int8_t *x, const unsigned short *lut, int8_t *y, const int len); + SRSLTE_API void srslte_vec_convert_if_simd(const int16_t *x, float *z, const float scale, const int len); SRSLTE_API void srslte_vec_convert_fi_simd(const float *x, int16_t *z, const float scale, const int len); +SRSLTE_API void srslte_vec_convert_fb_simd(const float *x, int8_t *z, const float scale, const int len); + SRSLTE_API void srslte_vec_cp_simd(const cf_t *src, cf_t *dst, int len); SRSLTE_API void srslte_vec_interleave_simd(const cf_t *x, const cf_t *y, cf_t *z, const int len); diff --git a/lib/src/common/threads.c b/lib/src/common/threads.c index b6fa31778..024faa64a 100644 --- a/lib/src/common/threads.c +++ b/lib/src/common/threads.c @@ -53,8 +53,9 @@ bool threads_new_rt_cpu(pthread_t *thread, void *(*start_routine) (void*), void cpu_set_t cpuset; bool attr_enable = false; +#ifdef PER_THREAD_PRIO if (prio_offset >= 0) { - param.sched_priority = sched_get_priority_max(SCHED_FIFO) - prio_offset; + param.sched_priority = sched_get_priority_max(SCHED_FIFO) - prio_offset; pthread_attr_init(&attr); if (pthread_attr_setinheritsched(&attr, PTHREAD_EXPLICIT_SCHED)) { perror("pthread_attr_setinheritsched"); @@ -82,6 +83,25 @@ bool threads_new_rt_cpu(pthread_t *thread, void *(*start_routine) (void*), void } attr_enable = true; } else if (prio_offset == -2) { +#else + // All threads have normal priority except prio_offset=0,1,2,3,4 + if (prio_offset >= 0 && prio_offset < 5) { + param.sched_priority = 50; + pthread_attr_init(&attr); + if (pthread_attr_setinheritsched(&attr, PTHREAD_EXPLICIT_SCHED)) { + perror("pthread_attr_setinheritsched"); + } + if (pthread_attr_setschedpolicy(&attr, SCHED_FIFO)) { + perror("pthread_attr_setschedpolicy"); + } + if (pthread_attr_setschedparam(&attr, ¶m)) { + perror("pthread_attr_setschedparam"); + fprintf(stderr, "Error not enough privileges to set Scheduling priority\n"); + } + attr_enable = true; + + } else { +#endif param.sched_priority = 0; pthread_attr_init(&attr); if (pthread_attr_setinheritsched(&attr, PTHREAD_EXPLICIT_SCHED)) { diff --git a/lib/src/phy/common/sequence.c b/lib/src/phy/common/sequence.c index 0ecf9f159..52497f5de 100644 --- a/lib/src/phy/common/sequence.c +++ b/lib/src/phy/common/sequence.c @@ -139,6 +139,7 @@ int srslte_sequence_LTE_pr(srslte_sequence_t *q, uint32_t len, uint32_t seed) { for (int i=0;ic_float[i] = (1-2*q->c[i]); q->c_short[i] = (int16_t) q->c_float[i]; + q->c_char[i] = (int8_t) q->c_float[i];; } return SRSLTE_SUCCESS; } @@ -164,6 +165,10 @@ int srslte_sequence_init(srslte_sequence_t *q, uint32_t len) { if (!q->c_short) { return SRSLTE_ERROR; } + q->c_char = srslte_vec_malloc(len * sizeof(int8_t)); + if (!q->c_char) { + return SRSLTE_ERROR; + } q->max_len = len; } return SRSLTE_SUCCESS; @@ -182,6 +187,9 @@ void srslte_sequence_free(srslte_sequence_t *q) { if (q->c_short) { free(q->c_short); } + if (q->c_char) { + free(q->c_char); + } bzero(q, sizeof(srslte_sequence_t)); } diff --git a/lib/src/phy/fec/rm_turbo.c b/lib/src/phy/fec/rm_turbo.c index 3e06a9de2..0e83cda5b 100644 --- a/lib/src/phy/fec/rm_turbo.c +++ b/lib/src/phy/fec/rm_turbo.c @@ -43,13 +43,15 @@ #endif #ifdef LV_HAVE_SSE -#include -int srslte_rm_turbo_rx_lut_sse(int16_t *input, int16_t *output, uint32_t in_len, uint32_t cb_idx, uint32_t rv_idx); +#include +int srslte_rm_turbo_rx_lut_sse(int16_t *input, int16_t *output, uint16_t *deinter, uint32_t in_len, uint32_t cb_idx, uint32_t rv_idx); +int srslte_rm_turbo_rx_lut_sse_8bit(int8_t *input, int8_t *output, uint16_t *deinter, uint32_t in_len, uint32_t cb_idx, uint32_t rv_idx); #endif #ifdef LV_HAVE_AVX -#include -int srslte_rm_turbo_rx_lut_avx(int16_t *input, int16_t *output, uint32_t in_len, uint32_t cb_idx, uint32_t rv_idx); +#include +int srslte_rm_turbo_rx_lut_avx(int16_t *input, int16_t *output, uint16_t *deinter, uint32_t in_len, uint32_t cb_idx, uint32_t rv_idx); +int srslte_rm_turbo_rx_lut_avx_8bit(int8_t *input, int8_t *output, uint16_t *deinter, uint32_t in_len, uint32_t cb_idx, uint32_t rv_idx); #endif #define NCOLS 32 @@ -66,12 +68,31 @@ static uint16_t interleaver_parity_bits[192][2*6160]; static srslte_bit_interleaver_t bit_interleavers_parity_bits[192]; static uint16_t deinterleaver[192][4][18448]; static int k0_vec[SRSLTE_NOF_TC_CB_SIZES][4][2]; -static bool rm_turbo_tables_generated = false; - +static bool rm_turbo_tables_generated = false; + + +// Store deinterleaver version for sub-block turbo decoder +#if SRSLTE_TDEC_EXPECT_INPUT_SB == 1 +// Prepare bit for sub-block decoder processing. These are the nof subblock sizes +#define NOF_DEINTER_TABLE_SB_IDX 3 +const static int deinter_table_sb_idx[NOF_DEINTER_TABLE_SB_IDX] = {8, 16, 32}; +int deinter_table_idx_from_sb_len(uint32_t nof_subblocks) { + for (int i=0;i= out_len && inputCnt < in_len - 16) { + /* Copy last elements */ + if ((out_len%16) == 12) { + for (int j=(nwrapps+1)*out_len-12;j<(nwrapps+1)*out_len;j++) { + output[deinter[j%out_len]] += input[j]; + inputCnt++; + } + } else { + for (int j=(nwrapps+1)*out_len-4;j<(nwrapps+1)*out_len;j++) { + output[deinter[j%out_len]] += input[j]; + inputCnt++; + } + } + /* And wrap pointers */ + nwrapps++; + intCnt = 16; + xPtr = (const __m128i*) &input[nwrapps*out_len]; + lutPtr = (const __m128i*) deinter; + } + } + for (int i=inputCnt;i= out_len && inputCnt < in_len - 16) { /* Copy last elements */ if ((out_len%16) == 12) { - for (int j=(nwrapps+1)*out_len-12;j<(nwrapps+1)*out_len;j++) { + for (int j=(nwrapps+1)*out_len-12;j<(nwrapps+1)*out_len;j++) { output[deinter[j%out_len]] += input[j]; inputCnt++; } } else { - for (int j=(nwrapps+1)*out_len-4;j<(nwrapps+1)*out_len;j++) { + for (int j=(nwrapps+1)*out_len-4;j<(nwrapps+1)*out_len;j++) { output[deinter[j%out_len]] += input[j]; inputCnt++; } } /* And wrap pointers */ nwrapps++; - intCnt = 16; + intCnt = 16; xPtr = (const __m256i*) &input[nwrapps*out_len]; lutPtr = (const __m256i*) deinter; } - } - for (int i=inputCnt;i= out_len && inputCnt < in_len - 32) { + printf("warning rate matching wrapping remainder %d\n", out_len%32); + /* Copy last elements */ + for (int j=(nwrapps+1)*out_len-(out_len%32) ;j<(nwrapps+1)*out_len;j++) { + output[deinter[j%out_len]] += input[j]; + inputCnt++; + } + /* And wrap pointers */ + nwrapps++; + intCnt = 32; + xPtr = (const __m256i*) &input[nwrapps*out_len]; + lutPtr = (const __m256i*) deinter; + } + } + for (int i=inputCnt;iforward[i] = (uint32_t) j; h->reverse[j] = (uint32_t) i; } + if (interl_win != 1) { + uint16_t *f = malloc(long_cb*sizeof(uint16_t)); + uint16_t *r = malloc(long_cb*sizeof(uint16_t)); + memcpy(f, h->forward, long_cb*sizeof(uint16_t)); + memcpy(r, h->reverse, long_cb*sizeof(uint16_t)); + for (i = 0; i < long_cb; i++) { + h->forward[i] = deinter(f[inter(i,interl_win)],interl_win); + h->reverse[i] = deinter(r[inter(i,interl_win)],interl_win); + } + free(f); + free(r); + } + return 0; } diff --git a/lib/src/phy/fec/tc_interl_umts.c b/lib/src/phy/fec/tc_interl_umts.c index d7f9ecdb7..261b4d7d3 100644 --- a/lib/src/phy/fec/tc_interl_umts.c +++ b/lib/src/phy/fec/tc_interl_umts.c @@ -30,7 +30,6 @@ #include #include "srslte/phy/fec/tc_interl.h" -#include "srslte/phy/fec/turbocoder.h" #define TURBO_SRSLTE_TCOD_RATE 3 diff --git a/lib/src/phy/fec/test/rm_turbo_test.c b/lib/src/phy/fec/test/rm_turbo_test.c index c8a0a95b5..f911da1d2 100644 --- a/lib/src/phy/fec/test/rm_turbo_test.c +++ b/lib/src/phy/fec/test/rm_turbo_test.c @@ -183,7 +183,7 @@ int main(int argc, char **argv) { srslte_rm_turbo_rx(buff_f, BUFFSZ, rm_bits_f, nof_e_bits, bits_f, long_cb_enc, rv_idx, 0); bzero(bits2_s, long_cb_enc*sizeof(short)); - srslte_rm_turbo_rx_lut(rm_bits_s, bits2_s, nof_e_bits, cb_idx, rv_idx); + srslte_rm_turbo_rx_lut_(rm_bits_s, bits2_s, nof_e_bits, cb_idx, rv_idx, false); for (int i=0;iorder - ((last_cb) ? crc_tb->order : 0)) / 8; /* if CRC pointer is given */ - for (int i = 0; i < (long_cb - crc->order) / 8; i++) { + for (int i = 0; i < block_size_nocrc; i++) { uint8_t in = input[i]; - /* Put byte in CRC and save latest checksum */ - srslte_crc_checksum_put_byte(crc, in); + /* Put byte in TB CRC and save latest checksum */ + srslte_crc_checksum_put_byte(crc_tb, in); + + /* Put byte in CB CRC and save latest checksum */ + srslte_crc_checksum_put_byte(crc_cb, in); /* Run actual encoder */ tcod_lut_t l = tcod_lut[state0][in]; @@ -225,10 +235,27 @@ int srslte_tcod_encode_lut(srslte_tcod_t *h, srslte_crc_t *crc, uint8_t *input, state0 = l.next_state; } - uint32_t checksum = (uint32_t) srslte_crc_checksum_get(crc); - for (int i = 0; i < crc->order / 8; i++) { - int mask_shift = 8 * (crc->order / 8 - i - 1); - int idx = (long_cb - crc->order) / 8 + i; + if (last_cb) { + uint32_t checksum = (uint32_t) srslte_crc_checksum_get(crc_tb); + for (int i = 0; i < crc_tb->order / 8; i++) { + int mask_shift = 8 * (crc_tb->order / 8 - i - 1); + int idx = block_size_nocrc + i; + uint8_t in = (uint8_t) ((checksum >> mask_shift) & 0xff); + + /* Put byte in CB CRC and save latest checksum */ + srslte_crc_checksum_put_byte(crc_cb, in); + + input[idx] = in; + tcod_lut_t l = tcod_lut[state0][in]; + parity[idx] = l.output; + state0 = l.next_state; + } + } + + uint32_t checksum = (uint32_t) srslte_crc_checksum_get(crc_cb); + for (int i = 0; i < crc_cb->order / 8; i++) { + int mask_shift = 8 * (crc_cb->order / 8 - i - 1); + int idx = (long_cb - crc_cb->order) / 8 + i; uint8_t in = (uint8_t) ((checksum >> mask_shift) & 0xff); input[idx] = in; @@ -239,11 +266,31 @@ int srslte_tcod_encode_lut(srslte_tcod_t *h, srslte_crc_t *crc, uint8_t *input, } else { /* No CRC given */ - for (uint32_t i = 0; i < long_cb / 8; i++) { - tcod_lut_t l = tcod_lut[state0][input[i]]; + int block_size_nocrc = (long_cb - ((last_cb) ? crc_tb->order : 0)) / 8; + + for (uint32_t i = 0; i < block_size_nocrc; i++) { + uint8_t in = input[i]; + + srslte_crc_checksum_put_byte(crc_tb, in); + + tcod_lut_t l = tcod_lut[state0][in]; parity[i] = l.output; state0 = l.next_state; } + + if (last_cb) { + uint32_t checksum = (uint32_t) srslte_crc_checksum_get(crc_tb); + for (int i = 0; i < crc_tb->order / 8; i++) { + int mask_shift = 8 * (crc_tb->order / 8 - i - 1); + int idx = block_size_nocrc + i; + uint8_t in = (uint8_t) ((checksum >> mask_shift) & 0xff); + + input[idx] = in; + tcod_lut_t l = tcod_lut[state0][in]; + parity[idx] = l.output; + state0 = l.next_state; + } + } } parity[long_cb / 8] = 0; // will put tail here later diff --git a/lib/src/phy/fec/turbodecoder.c b/lib/src/phy/fec/turbodecoder.c index 54a0ed109..422175357 100644 --- a/lib/src/phy/fec/turbodecoder.c +++ b/lib/src/phy/fec/turbodecoder.c @@ -24,151 +24,543 @@ * */ - -#include #include -#include #include +#include +#include +#include +#include "srslte/phy/utils/vector.h" #include "srslte/phy/fec/turbodecoder.h" + +#define debug_enabled 0 + +/* Generic (no SSE) implementation */ #include "srslte/phy/fec/turbodecoder_gen.h" +srslte_tdec_16bit_impl_t gen_impl = { + tdec_gen_init, + tdec_gen_free, + tdec_gen_dec, + tdec_gen_extract_input, + tdec_gen_decision_byte +}; +/* SSE no-window implementation */ +#include "srslte/phy/fec/turbodecoder_sse.h" +srslte_tdec_16bit_impl_t sse_impl = { + tdec_sse_init, + tdec_sse_free, + tdec_sse_dec, + tdec_sse_extract_input, + tdec_sse_decision_byte +}; +/* SSE window implementation */ #ifdef LV_HAVE_SSE -#include "srslte/phy/fec/turbodecoder_simd.h" +#define WINIMP_IS_SSE16 +#include "srslte/phy/fec/turbodecoder_win.h" +#undef WINIMP_IS_SSE16 + +srslte_tdec_16bit_impl_t sse16_win_impl = { + tdec_winsse16_init, + tdec_winsse16_free, + tdec_winsse16_dec, + tdec_winsse16_extract_input, + tdec_winsse16_decision_byte +}; #endif -#include "srslte/phy/utils/vector.h" +/* AVX window implementation */ +#ifdef LV_HAVE_AVX +#define WINIMP_IS_AVX16 +#include "srslte/phy/fec/turbodecoder_win.h" +#undef WINIMP_IS_AVX16 +srslte_tdec_16bit_impl_t avx16_win_impl = { + tdec_winavx16_init, + tdec_winavx16_free, + tdec_winavx16_dec, + tdec_winavx16_extract_input, + tdec_winavx16_decision_byte +}; +#endif + +/* SSE window implementation */ +#ifdef LV_HAVE_SSE +#define WINIMP_IS_SSE8 +#include "srslte/phy/fec/turbodecoder_win.h" +#undef WINIMP_IS_SSE8 + +srslte_tdec_8bit_impl_t sse8_win_impl = { + tdec_winsse8_init, + tdec_winsse8_free, + tdec_winsse8_dec, + tdec_winsse8_extract_input, + tdec_winsse8_decision_byte +}; +#endif + +/* AVX window implementation */ +#ifdef LV_HAVE_AVX +#define WINIMP_IS_AVX8 +#include "srslte/phy/fec/turbodecoder_win.h" +#undef WINIMP_IS_AVX8 +srslte_tdec_8bit_impl_t avx8_win_impl = { + tdec_winavx8_init, + tdec_winavx8_free, + tdec_winavx8_dec, + tdec_winavx8_extract_input, + tdec_winavx8_decision_byte +}; +#endif + +#define AUTO_16_SSE 0 +#define AUTO_16_SSEWIN 1 +#define AUTO_16_AVXWIN 2 +#define AUTO_8_SSEWIN 0 +#define AUTO_8_AVXWIN 1 + + +// Include interfaces for 8 and 16 bit decoder implementations +#define LLR_IS_8BIT +#include "srslte/phy/fec/turbodecoder_iter.h" +#undef LLR_IS_8BIT + +#define LLR_IS_16BIT +#include "srslte/phy/fec/turbodecoder_iter.h" +#undef LLR_IS_16BIT int srslte_tdec_init(srslte_tdec_t * h, uint32_t max_long_cb) { -#ifdef LV_HAVE_SSE - return srslte_tdec_simd_init(&h->tdec_simd, SRSLTE_TDEC_MAX_NPAR, max_long_cb); -#else - h->input_conv = srslte_vec_malloc(sizeof(float) * (3*max_long_cb+12)); + return srslte_tdec_init_manual(h, max_long_cb, SRSLTE_TDEC_AUTO); +} + +uint32_t interleaver_idx(uint32_t nof_subblocks) { + switch (nof_subblocks) { + case 32: + return 3; + case 16: + return 2; + case 8: + return 1; + case 1: + return 0; + default: + return 0; + } +} + +/* Initializes the turbo decoder object */ +int srslte_tdec_init_manual(srslte_tdec_t * h, uint32_t max_long_cb, srslte_tdec_impl_type_t dec_type) +{ + int ret = -1; + bzero(h, sizeof(srslte_tdec_t)); + uint32_t len = max_long_cb + SRSLTE_TCOD_TOTALTAIL; + + h->dec_type = dec_type; + + // Set manual + switch(dec_type) { + case SRSLTE_TDEC_AUTO: + break; + case SRSLTE_TDEC_SSE: + h->dec16[0] = &sse_impl; + h->current_llr_type = SRSLTE_TDEC_16; + break; + case SRSLTE_TDEC_SSE_WINDOW: + h->dec16[0] = &sse16_win_impl; + h->current_llr_type = SRSLTE_TDEC_16; + break; + case SRSLTE_TDEC_GENERIC: + h->dec16[0] = &gen_impl; + h->current_llr_type = SRSLTE_TDEC_16; + break; + case SRSLTE_TDEC_SSE8_WINDOW: + h->dec8[0] = &sse8_win_impl; + h->current_llr_type = SRSLTE_TDEC_8; + break; +#ifdef LV_HAVE_AVX + case SRSLTE_TDEC_AVX_WINDOW: + h->dec16[0] = &avx16_win_impl; + h->current_llr_type = SRSLTE_TDEC_16; + break; + case SRSLTE_TDEC_AVX8_WINDOW: + h->dec8[0] = &avx8_win_impl; + h->current_llr_type = SRSLTE_TDEC_8; + break; +#endif + default: + fprintf(stderr, "Error decoder %d not supported\n", dec_type); + goto clean_and_exit; + } + + h->max_long_cb = max_long_cb; + + h->app1 = srslte_vec_malloc(sizeof(int16_t) * len); + if (!h->app1) { + perror("srslte_vec_malloc"); + goto clean_and_exit; + } + h->app2 = srslte_vec_malloc(sizeof(int16_t) * len); + if (!h->app2) { + perror("srslte_vec_malloc"); + goto clean_and_exit; + } + h->ext1 = srslte_vec_malloc(sizeof(int16_t) * len); + if (!h->ext1) { + perror("srslte_vec_malloc"); + goto clean_and_exit; + } + h->ext2 = srslte_vec_malloc(sizeof(int16_t) * len); + if (!h->ext2) { + perror("srslte_vec_malloc"); + goto clean_and_exit; + } + h->syst0 = srslte_vec_malloc(sizeof(int16_t) * len); + if (!h->syst0) { + perror("srslte_vec_malloc"); + goto clean_and_exit; + } + h->parity0 = srslte_vec_malloc(sizeof(int16_t) * len); + if (!h->parity0) { + perror("srslte_vec_malloc"); + goto clean_and_exit; + } + h->parity1 = srslte_vec_malloc(sizeof(int16_t) * len); + if (!h->parity1) { + perror("srslte_vec_malloc"); + goto clean_and_exit; + } + h->input_conv = srslte_vec_malloc(sizeof(int16_t) * (len * 3+32*3)); if (!h->input_conv) { - perror("malloc"); - return -1; + perror("srslte_vec_malloc"); + goto clean_and_exit; } - return srslte_tdec_gen_init(&h->tdec_gen, max_long_cb); + + if (dec_type == SRSLTE_TDEC_AUTO) { + h->dec16[AUTO_16_SSE] = &sse_impl; + h->dec16[AUTO_16_SSEWIN] = &sse16_win_impl; + h->dec8[AUTO_8_SSEWIN] = &sse8_win_impl; +#ifdef LV_HAVE_AVX + h->dec16[AUTO_16_AVXWIN] = &avx16_win_impl; + h->dec8[AUTO_8_AVXWIN] = &avx8_win_impl; #endif + + for (int td=0;tddec16[td]) { + if ((h->nof_blocks16[td] = h->dec16[td]->tdec_init(&h->dec16_hdlr[td], h->max_long_cb))<0) { + goto clean_and_exit; + } + } + } + for (int td=0;tddec8[td]) { + if ((h->nof_blocks8[td] = h->dec8[td]->tdec_init(&h->dec8_hdlr[td], h->max_long_cb))<0) { + goto clean_and_exit; + } + } + } + + // Compute 1 interleaver for each possible nof_subblocks (1, 8, 16 or 32) + for (int s=0;s<4;s++) { + for (int i=0;iinterleaver[s][i], srslte_cbsegm_cbsize(i)) < 0) { + goto clean_and_exit; + } + srslte_tc_interl_LTE_gen_interl(&h->interleaver[s][i], srslte_cbsegm_cbsize(i), s?(8<<(s-1)):1); + } + } + } else { + uint32_t nof_subblocks; + if (dec_type < SRSLTE_TDEC_SSE8_WINDOW) { + if ((h->nof_blocks16[0] = h->dec16[0]->tdec_init(&h->dec16_hdlr[0], h->max_long_cb))<0) { + goto clean_and_exit; + } + nof_subblocks = h->nof_blocks16[0]; + } else { + if ((h->nof_blocks8[0] = h->dec8[0]->tdec_init(&h->dec8_hdlr[0], h->max_long_cb))<0) { + goto clean_and_exit; + } + nof_subblocks = h->nof_blocks8[0]; + } + for (int i=0;iinterleaver[interleaver_idx(nof_subblocks)][i], srslte_cbsegm_cbsize(i)) < 0) { + goto clean_and_exit; + } + srslte_tc_interl_LTE_gen_interl(&h->interleaver[interleaver_idx(nof_subblocks)][i], srslte_cbsegm_cbsize(i), nof_subblocks); + } + } + + h->current_cbidx = -1; + ret = 0; + +clean_and_exit: + if (ret == -1) { + srslte_tdec_free(h); + } + return ret; } -void srslte_tdec_free(srslte_tdec_t * h) { -#ifdef LV_HAVE_SSE - srslte_tdec_simd_free(&h->tdec_simd); -#else +void srslte_tdec_free(srslte_tdec_t * h) +{ + if (h->app1) { + free(h->app1); + } + if (h->app2) { + free(h->app2); + } + if (h->ext1) { + free(h->ext1); + } + if (h->ext2) { + free(h->ext2); + } + if (h->syst0) { + free(h->syst0); + } + if (h->parity0) { + free(h->parity0); + } + if (h->parity1) { + free(h->parity1); + } if (h->input_conv) { free(h->input_conv); } - srslte_tdec_gen_free(&h->tdec_gen); -#endif -} + for (int td=0;tddec8[td] && h->dec8_hdlr[td]) { + h->dec8[td]->tdec_free(h->dec8_hdlr[td]); + } + } + for (int td=0;tddec16[td] && h->dec16_hdlr[td]) { + h->dec16[td]->tdec_free(h->dec16_hdlr[td]); + } + } + for (int s=0;s<4;s++) { + for (int i=0;iinterleaver[s][i]); + } + } -int srslte_tdec_reset(srslte_tdec_t * h, uint32_t long_cb) { -#ifdef LV_HAVE_SSE - return srslte_tdec_simd_reset(&h->tdec_simd, long_cb); -#else - return srslte_tdec_gen_reset(&h->tdec_gen, long_cb); -#endif + bzero(h, sizeof(srslte_tdec_t)); } -int srslte_tdec_reset_cb(srslte_tdec_t * h, uint32_t cb_idx) { -#ifdef LV_HAVE_SSE - return srslte_tdec_simd_reset_cb(&h->tdec_simd, cb_idx); -#else - return srslte_tdec_gen_reset(&h->tdec_gen, h->tdec_gen.current_cb_len); -#endif +void srslte_tdec_force_not_sb(srslte_tdec_t *h) { + h->force_not_sb = true; } -int srslte_tdec_get_nof_iterations_cb(srslte_tdec_t * h, uint32_t cb_idx) +static void tdec_decision_byte(srslte_tdec_t * h, uint8_t *output) { -#ifdef LV_HAVE_SSE - return srslte_tdec_simd_get_nof_iterations_cb(&h->tdec_simd, cb_idx); -#else - return h->tdec_gen.n_iter; -#endif + if (h->current_llr_type == SRSLTE_TDEC_16) { + h->dec16[h->current_dec]->tdec_decision_byte(!(h->n_iter%2)?h->app1:h->ext1, output, h->current_long_cb); + } else { + h->dec8[h->current_dec]->tdec_decision_byte(!(h->n_iter%2)?(int8_t*)h->app1:(int8_t*)h->ext1, output, h->current_long_cb); + } } -void srslte_tdec_iteration_par(srslte_tdec_t * h, int16_t* input[SRSLTE_TDEC_MAX_NPAR], uint32_t long_cb) { -#ifdef LV_HAVE_SSE - srslte_tdec_simd_iteration(&h->tdec_simd, input, long_cb); -#else - srslte_vec_convert_if(input[0], 0.01, h->input_conv, 3*long_cb+12); - srslte_tdec_gen_iteration(&h->tdec_gen, h->input_conv, long_cb); + +/* Returns number of subblocks in automatic mode for this long_cb */ +uint32_t srslte_tdec_autoimp_get_subblocks(uint32_t long_cb) +{ +#ifdef LV_HAVE_AVX + if (!(long_cb%16) && long_cb > 800) { + return 16; + } else #endif + if (!(long_cb%8) && long_cb > 400) { + return 8; + } else { + return 0; + } } -void srslte_tdec_iteration(srslte_tdec_t * h, int16_t* input, uint32_t long_cb) { - int16_t *input_par[SRSLTE_TDEC_MAX_NPAR]; - input_par[0] = input; - return srslte_tdec_iteration_par(h, input_par, long_cb); +static int tdec_sb_idx(uint32_t long_cb) { + uint32_t nof_sb = srslte_tdec_autoimp_get_subblocks(long_cb); + switch(nof_sb) { + case 16: + return AUTO_16_AVXWIN; + case 8: + return AUTO_16_SSEWIN; + case 0: + return AUTO_16_SSE; + } + fprintf(stderr, "Error in tdec_sb_idx() invalid nof_sb=%d\n", nof_sb); + return 0; } -void srslte_tdec_decision_par(srslte_tdec_t * h, uint8_t *output[SRSLTE_TDEC_MAX_NPAR], uint32_t long_cb) { -#ifdef LV_HAVE_SSE - return srslte_tdec_simd_decision(&h->tdec_simd, output, long_cb); -#else - return srslte_tdec_gen_decision(&h->tdec_gen, output[0], long_cb); +uint32_t srslte_tdec_autoimp_get_subblocks_8bit(uint32_t long_cb) +{ +#ifdef LV_HAVE_AVX + if (!(long_cb%32) && long_cb > 2048) { + return 32; + } else #endif + if (!(long_cb%16) && long_cb > 800) { + return 16; + } else if (!(long_cb%8) && long_cb > 400) { + return 8; + } else { + return 0; + } } -uint32_t srslte_tdec_get_nof_parallel(srslte_tdec_t *h) { -#ifdef LV_HAVE_AVX2 - return 2; -#else - return 1; -#endif +static int tdec_sb_idx_8(uint32_t long_cb) { + uint32_t nof_sb = srslte_tdec_autoimp_get_subblocks_8bit(long_cb); + switch(nof_sb) { + case 32: + return AUTO_8_AVXWIN; + case 16: + return AUTO_8_SSEWIN; + case 8: + return 10+AUTO_16_SSEWIN; + case 0: + return 10+AUTO_16_SSE; + } + fprintf(stderr, "Error in tdec_sb_idx_8() invalid nof_sb=%d\n", nof_sb); + return 0; } -void srslte_tdec_decision(srslte_tdec_t * h, uint8_t *output, uint32_t long_cb) { - uint8_t *output_par[SRSLTE_TDEC_MAX_NPAR]; - output_par[0] = output; - srslte_tdec_decision_par(h, output_par, long_cb); +// TODO: Implement SSE version. Don't really a problem since this only called at very low rates +static void convert_8_to_16(int8_t *in, int16_t *out, uint32_t len) +{ + for (int i=0;itdec_simd, output, long_cb); -#else - srslte_tdec_gen_decision_byte(&h->tdec_gen, output[0], long_cb); -#endif +static void convert_16_to_8(int16_t *in, int8_t *out, uint32_t len) +{ + for (int i=0;itdec_simd, output, cb_idx, long_cb); -#else - srslte_tdec_gen_decision_byte(&h->tdec_gen, output, long_cb); -#endif +static void tdec_iteration_8(srslte_tdec_t * h, int8_t * input) +{ + // Select decoder if in auto mode + if (h->dec_type == SRSLTE_TDEC_AUTO) { + h->current_llr_type = SRSLTE_TDEC_8; + h->current_dec = tdec_sb_idx_8(h->current_long_cb); + h->current_inter_idx = interleaver_idx(h->nof_blocks8[h->current_dec]); + + // If long_cb is not multiple of any 8-bit decoder, use a 16-bit decoder and do type conversion + if (h->current_dec >= 10) { + h->current_llr_type = SRSLTE_TDEC_16; + h->current_dec -= 10; + h->current_inter_idx = interleaver_idx(h->nof_blocks16[h->current_dec]); + } + } else { + h->current_dec = 0; + } + + if (h->current_llr_type == SRSLTE_TDEC_16) { + if (!h->n_iter) { + convert_8_to_16(input, h->input_conv, 3*h->current_long_cb+12); + } + run_tdec_iteration_16bit(h, h->input_conv); + } else { + run_tdec_iteration_8bit(h, input); + } +} + +static void tdec_iteration_16(srslte_tdec_t * h, int16_t * input) +{ + // Select decoder if in auto mode + if (h->dec_type == SRSLTE_TDEC_AUTO) { + h->current_llr_type = SRSLTE_TDEC_16; + h->current_dec = tdec_sb_idx(h->current_long_cb); + } else { + h->current_dec = 0; + } + h->current_inter_idx = interleaver_idx(h->nof_blocks16[h->current_dec]); + + if (h->current_llr_type == SRSLTE_TDEC_8) { + + h->current_inter_idx = interleaver_idx(h->nof_blocks8[h->current_dec]); + + if (!h->n_iter) { + convert_16_to_8(input, h->input_conv, 3*h->current_long_cb+12); + } + run_tdec_iteration_8bit(h, h->input_conv); + } else { + run_tdec_iteration_16bit(h, input); + } +} + +/* Resets the decoder and sets the codeblock length */ +int srslte_tdec_new_cb(srslte_tdec_t * h, uint32_t long_cb) +{ + if (long_cb > h->max_long_cb) { + fprintf(stderr, "TDEC was initialized for max_long_cb=%d\n", + h->max_long_cb); + return -1; + } + + h->n_iter = 0; + h->current_long_cb = long_cb; + h->current_cbidx = srslte_cbsegm_cbindex(long_cb); + if (h->current_cbidx < 0) { + fprintf(stderr, "Invalid CB length %d\n", long_cb); + return -1; + } + return 0; } -void srslte_tdec_decision_byte(srslte_tdec_t * h, uint8_t *output, uint32_t long_cb) { - uint8_t *output_par[SRSLTE_TDEC_MAX_NPAR]; - output_par[0] = output; - srslte_tdec_decision_byte_par(h, output_par, long_cb); +void srslte_tdec_iteration(srslte_tdec_t * h, int16_t * input, uint8_t *output) +{ + if (h->current_cbidx >= 0) { + tdec_iteration_16(h, input); + tdec_decision_byte(h, output); + } } -int srslte_tdec_run_all_par(srslte_tdec_t * h, int16_t * input[SRSLTE_TDEC_MAX_NPAR], - uint8_t *output[SRSLTE_TDEC_MAX_NPAR], - uint32_t nof_iterations, uint32_t long_cb) { -#ifdef LV_HAVE_SSE - return srslte_tdec_simd_run_all(&h->tdec_simd, input, output, nof_iterations, long_cb); -#else - srslte_vec_convert_if(input[0], 0.01, h->input_conv, 3*long_cb+12); - return srslte_tdec_gen_run_all(&h->tdec_gen, h->input_conv, output[0], nof_iterations, long_cb); -#endif +/* Runs nof_iterations iterations and decides the output bits */ +int srslte_tdec_run_all(srslte_tdec_t * h, int16_t * input, uint8_t *output, + uint32_t nof_iterations, uint32_t long_cb) +{ + if (srslte_tdec_new_cb(h, long_cb)) { + return SRSLTE_ERROR; + } + + do { + tdec_iteration_16(h, input); + } while (h->n_iter < nof_iterations); + + tdec_decision_byte(h, output); + + return SRSLTE_SUCCESS; +} + + +void srslte_tdec_iteration_8bit(srslte_tdec_t * h, int8_t * input, uint8_t *output) +{ + if (h->current_cbidx >= 0) { + tdec_iteration_8(h, input); + tdec_decision_byte(h, output); + } +} + +/* Runs nof_iterations iterations and decides the output bits */ +int srslte_tdec_run_all_8bit(srslte_tdec_t * h, int8_t * input, uint8_t *output, + uint32_t nof_iterations, uint32_t long_cb) +{ + if (srslte_tdec_new_cb(h, long_cb)) { + return SRSLTE_ERROR; + } + + do { + tdec_iteration_8(h, input); + } while (h->n_iter < nof_iterations); + + tdec_decision_byte(h, output); + + return SRSLTE_SUCCESS; } -int srslte_tdec_run_all(srslte_tdec_t * h, int16_t * input, uint8_t *output, uint32_t nof_iterations, uint32_t long_cb) +int srslte_tdec_get_nof_iterations(srslte_tdec_t * h) { - uint8_t *output_par[SRSLTE_TDEC_MAX_NPAR]; - output_par[0] = output; - int16_t *input_par[SRSLTE_TDEC_MAX_NPAR]; - input_par[0] = input; - - return srslte_tdec_run_all_par(h, input_par, output_par, nof_iterations, long_cb); + return h->n_iter; } diff --git a/lib/src/phy/fec/turbodecoder_avx.c b/lib/src/phy/fec/turbodecoder_avx.c deleted file mode 100644 index 2e877cbde..000000000 --- a/lib/src/phy/fec/turbodecoder_avx.c +++ /dev/null @@ -1,475 +0,0 @@ -/** - * - * \section COPYRIGHT - * - * Copyright 2013-2015 Software Radio Systems Limited - * - * \section LICENSE - * - * This file is part of the srsLTE library. - * - * srsLTE is free software: you can redistribute it and/or modify - * it under the terms of the GNU Affero General Public License as - * published by the Free Software Foundation, either version 3 of - * the License, or (at your option) any later version. - * - * srsLTE is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Affero General Public License for more details. - * - * A copy of the GNU Affero General Public License can be found in - * the LICENSE file in the top-level directory of this distribution - * and at http://www.gnu.org/licenses/. - * - */ - -#include -#include -#include -#include -#include -#include - -#include "srslte/phy/fec/turbodecoder_simd.h" -#include "srslte/phy/utils/vector.h" - -#include - -#define NUMSTATES 8 -#define NINPUTS 2 -#define TAIL 3 -#define TOTALTAIL 12 - -#define INF 10000 -#define ZERO 0 - - -#ifdef LV_HAVE_AVX2 - -#include -#include - - -// Number of CB processed in parllel in AVX -#define NCB 2 - -/* -static void print_256i(__m256i x) { - int16_t *s = (int16_t*) &x; - printf("[%d", s[0]); - for (int i=1;i<16;i++) { - printf(",%d", s[i]); - } - printf("]\n"); -} -*/ - -/* Computes the horizontal MAX from 8 16-bit integers using the minpos_epu16 SSE4.1 instruction */ -static inline int16_t hMax0(__m256i masked_value) -{ - __m128i tmp1 = _mm256_extractf128_si256(masked_value, 0); - __m128i tmp3 = _mm_minpos_epu16(tmp1); - return (int16_t)(_mm_cvtsi128_si32(tmp3)); -} - -static inline int16_t hMax1(__m256i masked_value) -{ - __m128i tmp1 = _mm256_extractf128_si256(masked_value, 1); - __m128i tmp3 = _mm_minpos_epu16(tmp1); - return (int16_t)(_mm_cvtsi128_si32(tmp3)); -} - -/* Computes beta values */ -void map_avx_beta(map_gen_t * s, int16_t * output[SRSLTE_TDEC_MAX_NPAR], uint32_t long_cb) -{ - int k; - uint32_t end = long_cb + 3; - const __m256i *alphaPtr = (const __m256i*) s->alpha; - - __m256i beta_k = _mm256_set_epi16(-INF, -INF, -INF, -INF, -INF, -INF, -INF, 0, -INF, -INF, -INF, -INF, -INF, -INF, -INF, 0); - __m256i g, bp, bn, alpha_k; - - /* Define the shuffle constant for the positive beta */ - __m256i shuf_bp = _mm256_set_epi8( - // 1st CB - 15+16, 14+16, // 7 - 7+16, 6+16, // 3 - 5+16, 4+16, // 2 - 13+16, 12+16, // 6 - 11+16, 10+16, // 5 - 3+16, 2+16, // 1 - 1+16, 0+16, // 0 - 9+16, 8+16, // 4 - - // 2nd CB - 15, 14, // 7 - 7, 6, // 3 - 5, 4, // 2 - 13, 12, // 6 - 11, 10, // 5 - 3, 2, // 1 - 1, 0, // 0 - 9, 8 // 4 - ); - - /* Define the shuffle constant for the negative beta */ - __m256i shuf_bn = _mm256_set_epi8( - 7+16, 6+16, // 3 - 15+16, 14+16, // 7 - 13+16, 12+16, // 6 - 5+16, 4+16, // 2 - 3+16, 2+16, // 1 - 11+16, 10+16, // 5 - 9+16, 8+16, // 4 - 1+16, 0+16, // 0 - - 7, 6, // 3 - 15, 14, // 7 - 13, 12, // 6 - 5, 4, // 2 - 3, 2, // 1 - 11, 10, // 5 - 9, 8, // 4 - 1, 0 // 0 - ); - - alphaPtr += long_cb-1; - - /* Define shuffle for branch costs */ - __m256i shuf_g[4]; - shuf_g[3] = _mm256_set_epi8(3+16,2+16,1+16,0+16,1+16,0+16,3+16,2+16,3+16,2+16,1+16,0+16,1+16,0+16,3+16,2+16, - 3,2,1,0,1,0,3,2,3,2,1,0,1,0,3,2); - shuf_g[2] = _mm256_set_epi8(7+16,6+16,5+16,4+16,5+16,4+16,7+16,6+16,7+16,6+16,5+16,4+16,5+16,4+16,7+16,6+16, - 7,6,5,4,5,4,7,6,7,6,5,4,5,4,7,6); - shuf_g[1] = _mm256_set_epi8(11+16,10+16,9+16,8+16,9+16,8+16,11+16,10+16,11+16,10+16,9+16,8+16,9+16,8+16,11+16,10+16, - 11,10,9,8,9,8,11,10,11,10,9,8,9,8,11,10); - shuf_g[0] = _mm256_set_epi8(15+16,14+16,13+16,12+16,13+16,12+16,15+16,14+16,15+16,14+16,13+16,12+16,13+16,12+16,15+16,14+16, - 15,14,13,12,13,12,15,14,15,14,13,12,13,12,15,14); - - /* Define shuffle for beta normalization */ - __m256i shuf_norm = _mm256_set_epi8(17,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0); - - __m256i gv; - int16_t *b = &s->branch[2*NCB*long_cb-16]; - __m256i *gPtr = (__m256i*) b; - - /* This defines a beta computation step: - * Adds and substracts the branch metrics to the previous beta step, - * shuffles the states according to the trellis path and selects maximum state - */ -#define BETA_STEP(g) bp = _mm256_add_epi16(beta_k, g);\ - bn = _mm256_sub_epi16(beta_k, g);\ - bp = _mm256_shuffle_epi8(bp, shuf_bp);\ - bn = _mm256_shuffle_epi8(bn, shuf_bn);\ - beta_k = _mm256_max_epi16(bp, bn); - - /* Loads the alpha metrics from memory and adds them to the temporal bn and bp - * metrics. Then computes horizontal maximum of both metrics and computes difference - */ -#define BETA_STEP_CNT(c,d) g = _mm256_shuffle_epi8(gv, shuf_g[c]);\ - BETA_STEP(g)\ - alpha_k = _mm256_load_si256(alphaPtr);\ - alphaPtr--;\ - bp = _mm256_add_epi16(bp, alpha_k);\ - bn = _mm256_add_epi16(bn, alpha_k);\ - bn = _mm256_sub_epi16(_mm256_set1_epi16(0x7FFF), bn);\ - bp = _mm256_sub_epi16(_mm256_set1_epi16(0x7FFF), bp);\ - output[0][k-d] = hMax0(bn) - hMax0(bp);\ - output[1][k-d] = hMax1(bn) - hMax1(bp); - - /* The tail does not require to load alpha or produce outputs. Only update - * beta metrics accordingly */ - for (k=end-1; k>=long_cb; k--) { - int16_t g0_1 = s->branch[2*NCB*k]; - int16_t g1_1 = s->branch[2*NCB*k+1]; - int16_t g0_2 = s->branch[2*NCB*k+6]; - int16_t g1_2 = s->branch[2*NCB*k+6+1]; - g = _mm256_set_epi16(g1_2, g0_2, g0_2, g1_2, g1_2, g0_2, g0_2, g1_2, g1_1, g0_1, g0_1, g1_1, g1_1, g0_1, g0_1, g1_1); - BETA_STEP(g); - } - - /* We inline 2 trelis steps for each normalization */ - __m256i norm; - for (; k >= 0; k-=8) { - gv = _mm256_load_si256(gPtr); - gPtr--; - BETA_STEP_CNT(0,0); - BETA_STEP_CNT(1,1); - BETA_STEP_CNT(2,2); - BETA_STEP_CNT(3,3); - norm = _mm256_shuffle_epi8(beta_k, shuf_norm); - beta_k = _mm256_sub_epi16(beta_k, norm); - gv = _mm256_load_si256(gPtr); - gPtr--; - BETA_STEP_CNT(0,4); - BETA_STEP_CNT(1,5); - BETA_STEP_CNT(2,6); - BETA_STEP_CNT(3,7); - norm = _mm256_shuffle_epi8(beta_k, shuf_norm); - beta_k = _mm256_sub_epi16(beta_k, norm); - } -} - -/* Computes alpha metrics */ -void map_avx_alpha(map_gen_t * s, uint32_t long_cb) -{ - uint32_t k; - int16_t *alpha1 = s->alpha; - int16_t *alpha2 = &s->alpha[8]; - uint32_t i; - - alpha1[0] = 0; - alpha2[0] = 0; - for (i = 1; i < 8; i++) { - alpha1[i] = -INF; - alpha2[i] = -INF; - } - - /* Define the shuffle constant for the positive alpha */ - __m256i shuf_ap = _mm256_set_epi8( - - // 1st CB - 31, 30, // 7 - 25, 24, // 4 - 23, 22, // 3 - 17, 16, // 0 - 29, 28, // 6 - 27, 26, // 5 - 21, 20, // 2 - 19, 18, // 1 - - // 2nd CB - 15, 14, // 7 - 9, 8, // 4 - 7, 6, // 3 - 1, 0, // 0 - 13, 12, // 6 - 11, 10, // 5 - 5, 4, // 2 - 3, 2 // 1 - ); - - /* Define the shuffle constant for the negative alpha */ - __m256i shuf_an = _mm256_set_epi8( - - // 1nd CB - 29, 28, // 6 - 27, 26, // 5 - 21, 20, // 2 - 19, 18, // 1 - 31, 30, // 7 - 25, 24, // 4 - 23, 22, // 3 - 17, 16, // 0 - - // 2nd CB - 13, 12, // 6 - 11, 10, // 5 - 5, 4, // 2 - 3, 2, // 1 - 15, 14, // 7 - 9, 8, // 4 - 7, 6, // 3 - 1, 0 // 0 - ); - - /* Define shuffle for branch costs */ - __m256i shuf_g[4]; - shuf_g[0] = _mm256_set_epi8(3+16,2+16,3+16,2+16,1+16,0+16,1+16,0+16,1+16,0+16,1+16,0+16,3+16,2+16,3+16,2+16, - 3,2,3,2,1,0,1,0,1,0,1,0,3,2,3,2); - shuf_g[1] = _mm256_set_epi8(7+16,6+16,7+16,6+16,5+16,4+16,5+16,4+16,5+16,4+16,5+16,4+16,7+16,6+16,7+16,6+16, - 7,6,7,6,5,4,5,4,5,4,5,4,7,6,7,6); - shuf_g[2] = _mm256_set_epi8(11+16,10+16,11+16,10+16,9+16,8+16,9+16,8+16,9+16,8+16,9+16,8+16,11+16,10+16,11+16,10+16, - 11,10,11,10,9,8,9,8,9,8,9,8,11,10,11,10); - shuf_g[3] = _mm256_set_epi8(15+16,14+16,15+16,14+16,13+16,12+16,13+16,12+16,13+16,12+16,13+16,12+16,15+16,14+16,15+16,14+16, - 15,14,15,14,13,12,13,12,13,12,13,12,15,14,15,14); - - __m256i shuf_norm = _mm256_set_epi8(17,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0); - - __m256i* alphaPtr = (__m256i*) s->alpha; - alphaPtr++; - - __m256i gv; - __m256i *gPtr = (__m256i*) s->branch; - __m256i g, ap, an; - - __m256i alpha_k = _mm256_set_epi16(-INF, -INF, -INF, -INF, -INF, -INF, -INF, 0, -INF, -INF, -INF, -INF, -INF, -INF, -INF, 0); - - /* This defines a alpha computation step: - * Adds and substracts the branch metrics to the previous alpha step, - * shuffles the states according to the trellis path and selects maximum state - */ -#define ALPHA_STEP(c) g = _mm256_shuffle_epi8(gv, shuf_g[c]); \ - ap = _mm256_add_epi16(alpha_k, g);\ - an = _mm256_sub_epi16(alpha_k, g);\ - ap = _mm256_shuffle_epi8(ap, shuf_ap);\ - an = _mm256_shuffle_epi8(an, shuf_an);\ - alpha_k = _mm256_max_epi16(ap, an);\ - _mm256_store_si256(alphaPtr, alpha_k);\ - alphaPtr++;\ - - - /* In this loop, we compute 8 steps and normalize twice for each branch metrics memory load */ - __m256i norm; - for (k = 0; k < long_cb/8; k++) { - gv = _mm256_load_si256(gPtr); - - gPtr++; - ALPHA_STEP(0); - ALPHA_STEP(1); - ALPHA_STEP(2); - ALPHA_STEP(3); - norm = _mm256_shuffle_epi8(alpha_k, shuf_norm); - alpha_k = _mm256_sub_epi16(alpha_k, norm); - gv = _mm256_load_si256(gPtr); - gPtr++; - ALPHA_STEP(0); - ALPHA_STEP(1); - ALPHA_STEP(2); - ALPHA_STEP(3); - norm = _mm256_shuffle_epi8(alpha_k, shuf_norm); - alpha_k = _mm256_sub_epi16(alpha_k, norm); - } -} - -void map_sse_gamma_single(int16_t *output, int16_t *input, int16_t *app, int16_t *parity) -{ - __m128i res00, res10, res01, res11, res0, res1; - __m128i in, ap, pa, g1, g0; - - __m128i *inPtr = (__m128i*) input; - __m128i *appPtr = (__m128i*) app; - __m128i *paPtr = (__m128i*) parity; - __m128i *resPtr = (__m128i*) output; - - __m128i res00_mask = _mm_set_epi8(0xff,0xff,7,6,0xff,0xff,5,4,0xff,0xff,3,2,0xff,0xff,1,0); - __m128i res10_mask = _mm_set_epi8(0xff,0xff,15,14,0xff,0xff,13,12,0xff,0xff,11,10,0xff,0xff,9,8); - __m128i res01_mask = _mm_set_epi8(7,6,0xff,0xff,5,4,0xff,0xff,3,2,0xff,0xff,1,0,0xff,0xff); - __m128i res11_mask = _mm_set_epi8(15,14,0xff,0xff,13,12,0xff,0xff,11,10,0xff,0xff,9,8,0xff,0xff); - - in = _mm_load_si128(inPtr); - inPtr++; - pa = _mm_load_si128(paPtr); - paPtr++; - - if (appPtr) { - ap = _mm_load_si128(appPtr); - appPtr++; - in = _mm_add_epi16(ap, in); - } - - g1 = _mm_add_epi16(in, pa); - g0 = _mm_sub_epi16(in, pa); - - g1 = _mm_srai_epi16(g1, 1); - g0 = _mm_srai_epi16(g0, 1); - - res00 = _mm_shuffle_epi8(g0, res00_mask); - res10 = _mm_shuffle_epi8(g0, res10_mask); - res01 = _mm_shuffle_epi8(g1, res01_mask); - res11 = _mm_shuffle_epi8(g1, res11_mask); - - res0 = _mm_or_si128(res00, res01); - res1 = _mm_or_si128(res10, res11); - - _mm_store_si128(resPtr, res0); - resPtr++; - _mm_store_si128(resPtr, res1); - resPtr++; -} - - -/* Compute branch metrics (gamma) */ -void map_avx_gamma(map_gen_t * h, int16_t *input, int16_t *app, int16_t *parity, uint32_t cbidx, uint32_t long_cb) -{ - __m128i res10, res20, res11, res21, res1, res2; - __m256i in, ap, pa, g1, g0; - - __m256i *inPtr = (__m256i*) input; - __m256i *appPtr = (__m256i*) app; - __m256i *paPtr = (__m256i*) parity; - __m128i *resPtr = (__m128i*) h->branch; - - if (cbidx) { - resPtr++; - } - - __m128i res10_mask = _mm_set_epi8(0xff,0xff,7,6,0xff,0xff,5,4,0xff,0xff,3,2,0xff,0xff,1,0); - __m128i res11_mask = _mm_set_epi8(7,6,0xff,0xff,5,4,0xff,0xff,3,2,0xff,0xff,1,0,0xff,0xff); - - __m128i res20_mask = _mm_set_epi8(0xff,0xff,15,14,0xff,0xff,13,12,0xff,0xff,11,10,0xff,0xff,9,8); - __m128i res21_mask = _mm_set_epi8(15,14,0xff,0xff,13,12,0xff,0xff,11,10,0xff,0xff,9,8,0xff,0xff); - - for (int i=0;ibranch[2*i*NCB+cbidx*6] = (input[i] - parity[i])/2; - h->branch[2*i*NCB+cbidx*6+1] = (input[i] + parity[i])/2; - } -} - - -#endif - - diff --git a/lib/src/phy/fec/turbodecoder_gen.c b/lib/src/phy/fec/turbodecoder_gen.c index 649c3363a..4cabe05ce 100644 --- a/lib/src/phy/fec/turbodecoder_gen.c +++ b/lib/src/phy/fec/turbodecoder_gen.c @@ -1,400 +1,273 @@ -/** - * - * \section COPYRIGHT - * - * Copyright 2013-2015 Software Radio Systems Limited - * - * \section LICENSE - * - * This file is part of the srsLTE library. - * - * srsLTE is free software: you can redistribute it and/or modify - * it under the terms of the GNU Affero General Public License as - * published by the Free Software Foundation, either version 3 of - * the License, or (at your option) any later version. - * - * srsLTE is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Affero General Public License for more details. - * - * A copy of the GNU Affero General Public License can be found in - * the LICENSE file in the top-level directory of this distribution - * and at http://www.gnu.org/licenses/. - * - */ - -#include -#include -#include -#include -#include -#include - -#include "srslte/phy/fec/turbodecoder_gen.h" -#include "srslte/phy/utils/vector.h" - -#define NUMSTATES 8 -#define NINPUTS 2 -#define TAIL 3 -#define TOTALTAIL 12 - -#define INF 9e4 -#define ZERO 9e-4 - -/************************************************ - * - * MAP_GEN is the MAX-LOG-MAP generic implementation of the - * Decoder - * - ************************************************/ -static void map_gen_beta(srslte_map_gen_vl_t * s, float * input, float * parity, - uint32_t long_cb) -{ - float m_b[8], new[8], old[8]; - float x, y, xy; - int k; - uint32_t end = long_cb + SRSLTE_TCOD_RATE; - float *beta = s->beta; - uint32_t i; - - for (i = 0; i < 8; i++) { - old[i] = beta[8 * (end) + i]; - } - - for (k = end - 1; k >= 0; k--) { - x = input[k]; - y = parity[k]; - - xy = x + y; - - m_b[0] = old[4] + xy; - m_b[1] = old[4]; - m_b[2] = old[5] + y; - m_b[3] = old[5] + x; - m_b[4] = old[6] + x; - m_b[5] = old[6] + y; - m_b[6] = old[7]; - m_b[7] = old[7] + xy; - - new[0] = old[0]; - new[1] = old[0] + xy; - new[2] = old[1] + x; - new[3] = old[1] + y; - new[4] = old[2] + y; - new[5] = old[2] + x; - new[6] = old[3] + xy; - new[7] = old[3]; - - for (i = 0; i < 8; i++) { - if (m_b[i] > new[i]) - new[i] = m_b[i]; - old[i] = new[i]; - beta[8 * k + i] = old[i]; - } - } -} - -static void map_gen_alpha(srslte_map_gen_vl_t * s, float * input, float * parity, float * output, - uint32_t long_cb) -{ - float m_b[8], new[8], old[8], max1[8], max0[8]; - float m1, m0; - float x, y, xy; - float out; - uint32_t k; - uint32_t end = long_cb; - float *beta = s->beta; - uint32_t i; - - old[0] = 0; - for (i = 1; i < 8; i++) { - old[i] = -INF; - } - - for (k = 1; k < end + 1; k++) { - x = input[k - 1]; - y = parity[k - 1]; - - xy = x + y; - - m_b[0] = old[0]; - m_b[1] = old[3] + y; - m_b[2] = old[4] + y; - m_b[3] = old[7]; - m_b[4] = old[1]; - m_b[5] = old[2] + y; - m_b[6] = old[5] + y; - m_b[7] = old[6]; - - new[0] = old[1] + xy; - new[1] = old[2] + x; - new[2] = old[5] + x; - new[3] = old[6] + xy; - new[4] = old[0] + xy; - new[5] = old[3] + x; - new[6] = old[4] + x; - new[7] = old[7] + xy; - - for (i = 0; i < 8; i++) { - max0[i] = m_b[i] + beta[8 * k + i]; - max1[i] = new[i] + beta[8 * k + i]; - } - - m1 = max1[0]; - m0 = max0[0]; - - for (i = 1; i < 8; i++) { - if (max1[i] > m1) - m1 = max1[i]; - if (max0[i] > m0) - m0 = max0[i]; - } - - for (i = 0; i < 8; i++) { - if (m_b[i] > new[i]) - new[i] = m_b[i]; - old[i] = new[i]; - } - - out = m1 - m0; - output[k - 1] = out; - } -} - -static int map_gen_init(srslte_map_gen_vl_t * h, int max_long_cb) -{ - bzero(h, sizeof(srslte_map_gen_vl_t)); - h->beta = srslte_vec_malloc(sizeof(float) * (max_long_cb + SRSLTE_TCOD_TOTALTAIL + 1) * NUMSTATES); - if (!h->beta) { - perror("srslte_vec_malloc"); - return -1; - } - h->max_long_cb = max_long_cb; - return 0; -} - -static void map_gen_free(srslte_map_gen_vl_t * h) -{ - if (h->beta) { - free(h->beta); - } - bzero(h, sizeof(srslte_map_gen_vl_t)); -} - -static void map_gen_dec(srslte_map_gen_vl_t * h, float * input, float * parity, float * output, - uint32_t long_cb) -{ - uint32_t k; - - h->beta[(long_cb + TAIL) * NUMSTATES] = 0; - for (k = 1; k < NUMSTATES; k++) - h->beta[(long_cb + TAIL) * NUMSTATES + k] = -INF; - - map_gen_beta(h, input, parity, long_cb); - map_gen_alpha(h, input, parity, output, long_cb); -} - -/************************************************ - * - * TURBO DECODER INTERFACE - * - ************************************************/ -int srslte_tdec_gen_init(srslte_tdec_gen_t * h, uint32_t max_long_cb) -{ - int ret = -1; - bzero(h, sizeof(srslte_tdec_gen_t)); - uint32_t len = max_long_cb + SRSLTE_TCOD_TOTALTAIL; - - h->max_long_cb = max_long_cb; - - h->llr1 = srslte_vec_malloc(sizeof(float) * len); - if (!h->llr1) { - perror("srslte_vec_malloc"); - goto clean_and_exit; - } - h->llr2 = srslte_vec_malloc(sizeof(float) * len); - if (!h->llr2) { - perror("srslte_vec_malloc"); - goto clean_and_exit; - } - h->w = srslte_vec_malloc(sizeof(float) * len); - if (!h->w) { - perror("srslte_vec_malloc"); - goto clean_and_exit; - } - h->syst = srslte_vec_malloc(sizeof(float) * len); - if (!h->syst) { - perror("srslte_vec_malloc"); - goto clean_and_exit; - } - h->parity = srslte_vec_malloc(sizeof(float) * len); - if (!h->parity) { - perror("srslte_vec_malloc"); - goto clean_and_exit; - } - - if (map_gen_init(&h->dec, h->max_long_cb)) { - goto clean_and_exit; - } - - for (int i=0;iinterleaver[i], srslte_cbsegm_cbsize(i)) < 0) { - goto clean_and_exit; - } - srslte_tc_interl_LTE_gen(&h->interleaver[i], srslte_cbsegm_cbsize(i)); - } - h->current_cbidx = -1; - ret = 0; -clean_and_exit:if (ret == -1) { - srslte_tdec_gen_free(h); - } - return ret; -} - -void srslte_tdec_gen_free(srslte_tdec_gen_t * h) -{ - if (h->llr1) { - free(h->llr1); - } - if (h->llr2) { - free(h->llr2); - } - if (h->w) { - free(h->w); - } - if (h->syst) { - free(h->syst); - } - if (h->parity) { - free(h->parity); - } - - map_gen_free(&h->dec); - - for (int i=0;iinterleaver[i]); - } - - bzero(h, sizeof(srslte_tdec_gen_t)); -} - -void srslte_tdec_gen_iteration(srslte_tdec_gen_t * h, float * input, uint32_t long_cb) -{ - uint32_t i; - - if (h->current_cbidx >= 0) { - - uint16_t *inter = h->interleaver[h->current_cbidx].forward; - uint16_t *deinter = h->interleaver[h->current_cbidx].reverse; - - // Prepare systematic and parity bits for MAP DEC #1 - for (i = 0; i < long_cb; i++) { - h->syst[i] = input[SRSLTE_TCOD_RATE * i] + h->w[i]; - h->parity[i] = input[SRSLTE_TCOD_RATE * i + 1]; - } - for (i = long_cb; i < long_cb + SRSLTE_TCOD_RATE; i++) { - h->syst[i] = input[SRSLTE_TCOD_RATE * long_cb + NINPUTS * (i - long_cb)]; - h->parity[i] = input[SRSLTE_TCOD_RATE * long_cb + NINPUTS * (i - long_cb) + 1]; - } - - // Run MAP DEC #1 - map_gen_dec(&h->dec, h->syst, h->parity, h->llr1, long_cb); - - // Prepare systematic and parity bits for MAP DEC #1 - for (i = 0; i < long_cb; i++) { - h->syst[i] = h->llr1[inter[i]] - - h->w[inter[i]]; - h->parity[i] = input[SRSLTE_TCOD_RATE * i + 2]; - } - for (i = long_cb; i < long_cb + SRSLTE_TCOD_RATE; i++) { - h->syst[i] = - input[SRSLTE_TCOD_RATE * long_cb + NINPUTS * SRSLTE_TCOD_RATE + NINPUTS * (i - long_cb)]; - h->parity[i] = input[SRSLTE_TCOD_RATE * long_cb + NINPUTS * SRSLTE_TCOD_RATE - + NINPUTS * (i - long_cb) + 1]; - } - - // Run MAP DEC #2 - map_gen_dec(&h->dec, h->syst, h->parity, h->llr2, long_cb); - - //printf("llr2="); - //srslte_vec_fprint_f(stdout, h->llr2, long_cb); - - - // Update a-priori LLR from the last iteration - for (i = 0; i < long_cb; i++) { - h->w[i] += h->llr2[deinter[i]] - h->llr1[i]; - } - } else { - fprintf(stderr, "Error CB index not set (call srslte_tdec_gen_reset() first\n"); - } - - // Increase number of iterations - h->n_iter++; -} - -int srslte_tdec_gen_reset(srslte_tdec_gen_t * h, uint32_t long_cb) -{ - if (long_cb > h->max_long_cb) { - fprintf(stderr, "TDEC was initialized for max_long_cb=%d\n", - h->max_long_cb); - return -1; - } - memset(h->w, 0, sizeof(float) * long_cb); - h->current_cbidx = srslte_cbsegm_cbindex(long_cb); - h->current_cb_len = long_cb; - if (h->current_cbidx < 0) { - fprintf(stderr, "Invalid CB length %d\n", long_cb); - return -1; - } - return 0; -} - -void srslte_tdec_gen_decision(srslte_tdec_gen_t * h, uint8_t *output, uint32_t long_cb) -{ - uint16_t *deinter = h->interleaver[h->current_cbidx].reverse; - uint32_t i; - for (i = 0; i < long_cb; i++) { - output[i] = (h->llr2[deinter[i]] > 0) ? 1 : 0; - } -} - -void srslte_tdec_gen_decision_byte(srslte_tdec_gen_t * h, uint8_t *output, uint32_t long_cb) -{ - uint32_t i; - uint8_t mask[8] = {0x80, 0x40, 0x20, 0x10, 0x8, 0x4, 0x2, 0x1}; - uint16_t *deinter = h->interleaver[h->current_cbidx].reverse; - - // long_cb is always byte aligned - for (i = 0; i < long_cb/8; i++) { - uint8_t out0 = h->llr2[deinter[8*i+0]]>0?mask[0]:0; - uint8_t out1 = h->llr2[deinter[8*i+1]]>0?mask[1]:0; - uint8_t out2 = h->llr2[deinter[8*i+2]]>0?mask[2]:0; - uint8_t out3 = h->llr2[deinter[8*i+3]]>0?mask[3]:0; - uint8_t out4 = h->llr2[deinter[8*i+4]]>0?mask[4]:0; - uint8_t out5 = h->llr2[deinter[8*i+5]]>0?mask[5]:0; - uint8_t out6 = h->llr2[deinter[8*i+6]]>0?mask[6]:0; - uint8_t out7 = h->llr2[deinter[8*i+7]]>0?mask[7]:0; - - output[i] = out0 | out1 | out2 | out3 | out4 | out5 | out6 | out7; - } -} - -int srslte_tdec_gen_run_all(srslte_tdec_gen_t * h, float * input, uint8_t *output, - uint32_t nof_iterations, uint32_t long_cb) -{ - uint32_t iter = 0; - - if (srslte_tdec_gen_reset(h, long_cb)) { - return SRSLTE_ERROR; - } - - do { - srslte_tdec_gen_iteration(h, input, long_cb); - iter++; - } while (iter < nof_iterations); - - srslte_tdec_gen_decision_byte(h, output, long_cb); - - return SRSLTE_SUCCESS; -} +/** + * + * \section COPYRIGHT + * + * Copyright 2013-2015 Software Radio Systems Limited + * + * \section LICENSE + * + * This file is part of the srsLTE library. + * + * srsLTE is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License as + * published by the Free Software Foundation, either version 3 of + * the License, or (at your option) any later version. + * + * srsLTE is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * A copy of the GNU Affero General Public License can be found in + * the LICENSE file in the top-level directory of this distribution + * and at http://www.gnu.org/licenses/. + * + */ + +#include +#include +#include +#include +#include +#include + +#include "srslte/phy/fec/turbodecoder_gen.h" +#include "srslte/phy/utils/vector.h" + +#define NUMSTATES 8 +#define NINPUTS 2 +#define TAIL 3 +#define TOTALTAIL 12 + +#define INF 10000 + +#define debug_enabled 0 + +#if debug_enabled +#define debug_state printf("k=%5d, in=%5d, pa=%3d, out=%5d, alpha=", k, x, parity[k-1], out); srslte_vec_fprint_s(stdout, alpha, 8); \ + printf(", beta="); srslte_vec_fprint_s(stdout, &beta[8*(k)], 8); printf("\n"); +#else +#define debug_state +#endif + +/************************************************ + * + * MAP_GEN is the MAX-LOG-MAP generic implementation of the + * Decoder + * + ************************************************/ +static void map_gen_beta(tdec_gen_t *s, int16_t *input, int16_t *app, int16_t *parity, uint32_t long_cb) +{ + int16_t m_b[8], new[8], old[8]; + int16_t x, y, xy; + int k; + uint32_t end = long_cb + SRSLTE_TCOD_RATE; + int16_t *beta = s->beta; + uint32_t i; + + for (i = 0; i < 8; i++) { + old[i] = beta[8 * (end) + i]; + } + + for (k = end - 1; k >= 0; k--) { + x = input[k]; + if (app && k new[i]) + new[i] = m_b[i]; + old[i] = new[i]; + beta[8 * k + i] = old[i]; + } + + if ((k%4)==0 && k < long_cb) { + for (i = 1; i < 8; i++) { + old[i] -= old[0]; + } + old[0] = 0; + } + } +} + +static void map_gen_alpha(tdec_gen_t *s, int16_t *input, int16_t *app, int16_t *parity, int16_t *output, uint32_t long_cb) +{ + int16_t m_b[8], new[8], old[8], max1[8], max0[8]; + int16_t m1, m0; + int16_t x, y, xy; + int16_t out; + uint32_t k; + uint32_t end = long_cb; + int16_t *beta = s->beta; + uint32_t i; + + old[0] = 0; + for (i = 1; i < 8; i++) { + old[i] = -INF; + } + +#if debug_enabled + int16_t alpha[8]; +#endif + + for (k = 1; k < end + 1; k++) { + x = input[k - 1]; + if (app) { + x += app[k - 1]; + } + y = parity[k - 1]; + + xy = x + y; + +#if debug_enabled + memcpy(alpha, old, sizeof(int16_t)*8); +#endif + + m_b[0] = old[0]; + m_b[1] = old[3] + y; + m_b[2] = old[4] + y; + m_b[3] = old[7]; + m_b[4] = old[1]; + m_b[5] = old[2] + y; + m_b[6] = old[5] + y; + m_b[7] = old[6]; + + new[0] = old[1] + xy; + new[1] = old[2] + x; + new[2] = old[5] + x; + new[3] = old[6] + xy; + new[4] = old[0] + xy; + new[5] = old[3] + x; + new[6] = old[4] + x; + new[7] = old[7] + xy; + + for (i = 0; i < 8; i++) { + max0[i] = m_b[i] + beta[8 * k + i]; + max1[i] = new[i] + beta[8 * k + i]; + } + + m1 = max1[0]; + m0 = max0[0]; + + for (i = 1; i < 8; i++) { + if (max1[i] > m1) + m1 = max1[i]; + if (max0[i] > m0) + m0 = max0[i]; + } + for (i = 0; i < 8; i++) { + if (m_b[i] > new[i]) + new[i] = m_b[i]; + old[i] = new[i]; } + + if ((k%4)==0) { + for (i = 1; i < 8; i++) { + old[i] -= old[0]; + } + old[0] = 0; + } + + out = m1 - m0; + output[k - 1] = out; + + debug_state; + + } +} + +int tdec_gen_init(void **hh, uint32_t max_long_cb) +{ + *hh = calloc(1, sizeof(tdec_gen_t)); + + tdec_gen_t *h = (tdec_gen_t*) *hh; + + h->beta = srslte_vec_malloc(sizeof(int16_t) * (max_long_cb + SRSLTE_TCOD_TOTALTAIL + 1) * NUMSTATES); + if (!h->beta) { + perror("srslte_vec_malloc"); + return -1; + } + h->max_long_cb = max_long_cb; + return 1; +} + +void tdec_gen_free(void *hh) +{ + tdec_gen_t *h = (tdec_gen_t*) hh; + if (h->beta) { + free(h->beta); + } + bzero(h, sizeof(tdec_gen_t)); +} + +void tdec_gen_dec(void *hh, int16_t *input, int16_t *app, int16_t *parity, int16_t *output, uint32_t long_cb) +{ + tdec_gen_t *h = (tdec_gen_t*) hh; + + h->beta[(long_cb + TAIL) * NUMSTATES] = 0; + for (uint32_t k = 1; k < NUMSTATES; k++) + h->beta[(long_cb + TAIL) * NUMSTATES + k] = -INF; + + map_gen_beta(h, input, app, parity, long_cb); + map_gen_alpha(h, input, app, parity, output, long_cb); +} + +void tdec_gen_extract_input(int16_t *input, int16_t *syst, int16_t *app2, int16_t *parity0, int16_t *parity1, uint32_t long_cb) +{ + // Prepare systematic and parity bits for MAP DEC #1 + for (uint32_t i = 0; i < long_cb; i++) { + syst[i] = input[SRSLTE_TCOD_RATE * i]; + parity0[i] = input[SRSLTE_TCOD_RATE * i + 1]; + parity1[i] = input[SRSLTE_TCOD_RATE * i + 2]; + } + for (uint32_t i = long_cb; i < long_cb + SRSLTE_TCOD_RATE; i++) { + syst[i] = input[SRSLTE_TCOD_RATE * long_cb + NINPUTS * (i - long_cb)]; + parity0[i] = input[SRSLTE_TCOD_RATE * long_cb + NINPUTS * (i - long_cb) + 1]; + + app2[i] = input[SRSLTE_TCOD_RATE * long_cb + NINPUTS * SRSLTE_TCOD_RATE + NINPUTS * (i - long_cb)]; + parity1[i] = input[SRSLTE_TCOD_RATE * long_cb + NINPUTS * SRSLTE_TCOD_RATE + + NINPUTS * (i - long_cb) + 1]; + } + +} + +void tdec_gen_decision_byte(int16_t *app1, uint8_t *output, uint32_t long_cb) +{ + uint8_t mask[8] = {0x80, 0x40, 0x20, 0x10, 0x8, 0x4, 0x2, 0x1}; + + // long_cb is always byte aligned + for (uint32_t i = 0; i < long_cb/8; i++) { + uint8_t out0 = app1[8*i+0]>0?mask[0]:0; + uint8_t out1 = app1[8*i+1]>0?mask[1]:0; + uint8_t out2 = app1[8*i+2]>0?mask[2]:0; + uint8_t out3 = app1[8*i+3]>0?mask[3]:0; + uint8_t out4 = app1[8*i+4]>0?mask[4]:0; + uint8_t out5 = app1[8*i+5]>0?mask[5]:0; + uint8_t out6 = app1[8*i+6]>0?mask[6]:0; + uint8_t out7 = app1[8*i+7]>0?mask[7]:0; + + output[i] = out0 | out1 | out2 | out3 | out4 | out5 | out6 | out7; + } +} + diff --git a/lib/src/phy/fec/turbodecoder_simd.c b/lib/src/phy/fec/turbodecoder_simd.c deleted file mode 100644 index f9bfbbd44..000000000 --- a/lib/src/phy/fec/turbodecoder_simd.c +++ /dev/null @@ -1,542 +0,0 @@ -/** - * - * \section COPYRIGHT - * - * Copyright 2013-2015 Software Radio Systems Limited - * - * \section LICENSE - * - * This file is part of the srsLTE library. - * - * srsLTE is free software: you can redistribute it and/or modify - * it under the terms of the GNU Affero General Public License as - * published by the Free Software Foundation, either version 3 of - * the License, or (at your option) any later version. - * - * srsLTE is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Affero General Public License for more details. - * - * A copy of the GNU Affero General Public License can be found in - * the LICENSE file in the top-level directory of this distribution - * and at http://www.gnu.org/licenses/. - * - */ - -#include -#include -#include -#include -#include -#include - -#include "srslte/phy/fec/turbodecoder_simd.h" -#include "srslte/phy/utils/vector.h" - -#include - -#define NUMSTATES 8 -#define NINPUTS 2 -#define TAIL 3 -#define TOTALTAIL 12 - -#define INF 10000 -#define ZERO 0 - - -#ifdef LV_HAVE_SSE -#include - -// Define SSE/AVX implementations -void map_sse_beta(map_gen_t * s, int16_t * output, uint32_t long_cb); -void map_sse_alpha(map_gen_t * s, uint32_t long_cb); -void map_sse_gamma(map_gen_t * h, int16_t *input, int16_t *app, int16_t *parity, uint32_t long_cb); - -#ifdef LV_HAVE_AVX2 -void map_avx_beta(map_gen_t * s, int16_t * output[SRSLTE_TDEC_MAX_NPAR], uint32_t long_cb); -void map_avx_alpha(map_gen_t * s, uint32_t long_cb); -void map_avx_gamma(map_gen_t * h, int16_t *input, int16_t *app, int16_t *parity, uint32_t cbidx, uint32_t long_cb); -#endif - - -void map_simd_beta(map_gen_t * s, int16_t * output[SRSLTE_TDEC_MAX_NPAR], uint32_t nof_cb, uint32_t long_cb) -{ - if (nof_cb == 1) { - map_sse_beta(s, output[0], long_cb); - } -#ifdef LV_HAVE_AVX2 - else if (nof_cb == 2) { - map_avx_beta(s, output, long_cb); - } -#endif -} - -void map_simd_alpha(map_gen_t * s, uint32_t nof_cb, uint32_t long_cb) -{ - if (nof_cb == 1) { - map_sse_alpha(s, long_cb); - } -#ifdef LV_HAVE_AVX2 - else if (nof_cb == 2) { - map_avx_alpha(s, long_cb); - } -#endif -} -void map_simd_gamma(map_gen_t * s, int16_t *input, int16_t *app, int16_t *parity, uint32_t cbidx, uint32_t nof_cb, uint32_t long_cb) -{ - if (nof_cb == 1) { - map_sse_gamma(s, input, app, parity, long_cb); - } -#ifdef LV_HAVE_AVX2 - else if (nof_cb == 2) { - map_avx_gamma(s, input, app, parity, cbidx, long_cb); - } -#endif -} - -/* Inititalizes constituent decoder object */ -int map_simd_init(map_gen_t * h, uint32_t max_par_cb, uint32_t max_long_cb) -{ - bzero(h, sizeof(map_gen_t)); - - h->max_par_cb = max_par_cb; - h->max_long_cb = max_long_cb; - - h->alpha = srslte_vec_malloc(sizeof(int16_t) * (max_long_cb + SRSLTE_TCOD_TOTALTAIL + 1) * NUMSTATES * h->max_par_cb); - if (!h->alpha) { - perror("srslte_vec_malloc"); - return -1; - } - h->branch = srslte_vec_malloc(sizeof(int16_t) * (max_long_cb + SRSLTE_TCOD_TOTALTAIL + 1) * NUMSTATES * h->max_par_cb); - if (!h->branch) { - perror("srslte_vec_malloc"); - return -1; - } - return 0; -} - -void map_simd_free(map_gen_t * h) -{ - if (h->alpha) { - free(h->alpha); - } - if (h->branch) { - free(h->branch); - } - bzero(h, sizeof(map_gen_t)); -} - -/* Runs one instance of a decoder */ -void map_simd_dec(map_gen_t * h, int16_t * input[SRSLTE_TDEC_MAX_NPAR], int16_t *app[SRSLTE_TDEC_MAX_NPAR], int16_t * parity[SRSLTE_TDEC_MAX_NPAR], - int16_t *output[SRSLTE_TDEC_MAX_NPAR], uint32_t cb_mask, uint32_t long_cb) -{ - - uint32_t nof_cb = 1; - int16_t *outptr[SRSLTE_TDEC_MAX_NPAR] = { NULL, NULL }; - - // Compute branch metrics - switch(cb_mask) { - case 1: - nof_cb = 1; - outptr[0] = output[0]; - map_simd_gamma(h, input[0], app?app[0]:NULL, parity[0], 0, 1, long_cb); - break; - case 2: - nof_cb = 1; - outptr[0] = output[1]; - map_simd_gamma(h, input[1], app?app[1]:NULL, parity[1], 0, 1, long_cb); - break; - case 3: - nof_cb = 2; - for (int i=0;i<2;i++) { - outptr[i] = output[i]; - map_simd_gamma(h, input[i], app?app[i]:NULL, parity[i], i, 2, long_cb); - } - break; - } - - // Forward recursion - map_simd_alpha(h, nof_cb, long_cb); - - // Backwards recursion + LLR computation - map_simd_beta(h, outptr, nof_cb, long_cb); -} - -/* Initializes the turbo decoder object */ -int srslte_tdec_simd_init(srslte_tdec_simd_t * h, uint32_t max_par_cb, uint32_t max_long_cb) -{ - int ret = -1; - bzero(h, sizeof(srslte_tdec_simd_t)); - uint32_t len = max_long_cb + SRSLTE_TCOD_TOTALTAIL; - - h->max_long_cb = max_long_cb; - h->max_par_cb = max_par_cb; - - for (int i=0;imax_par_cb;i++) { - h->app1[i] = srslte_vec_malloc(sizeof(int16_t) * len); - if (!h->app1[i]) { - perror("srslte_vec_malloc"); - goto clean_and_exit; - } - h->app2[i] = srslte_vec_malloc(sizeof(int16_t) * len); - if (!h->app2[i]) { - perror("srslte_vec_malloc"); - goto clean_and_exit; - } - h->ext1[i] = srslte_vec_malloc(sizeof(int16_t) * len); - if (!h->ext1[i]) { - perror("srslte_vec_malloc"); - goto clean_and_exit; - } - h->ext2[i] = srslte_vec_malloc(sizeof(int16_t) * len); - if (!h->ext2[i]) { - perror("srslte_vec_malloc"); - goto clean_and_exit; - } - h->syst[i] = srslte_vec_malloc(sizeof(int16_t) * len); - if (!h->syst[i]) { - perror("srslte_vec_malloc"); - goto clean_and_exit; - } - h->parity0[i] = srslte_vec_malloc(sizeof(int16_t) * len); - if (!h->parity0[i]) { - perror("srslte_vec_malloc"); - goto clean_and_exit; - } - h->parity1[i] = srslte_vec_malloc(sizeof(int16_t) * len); - if (!h->parity1[i]) { - perror("srslte_vec_malloc"); - goto clean_and_exit; - } - - } - - if (map_simd_init(&h->dec, h->max_par_cb, h->max_long_cb)) { - goto clean_and_exit; - } - - for (int i=0;iinterleaver[i], srslte_cbsegm_cbsize(i)) < 0) { - goto clean_and_exit; - } - srslte_tc_interl_LTE_gen(&h->interleaver[i], srslte_cbsegm_cbsize(i)); - } - h->current_cbidx = -1; - h->cb_mask = 0; - ret = 0; -clean_and_exit:if (ret == -1) { - srslte_tdec_simd_free(h); - } - return ret; -} - -void srslte_tdec_simd_free(srslte_tdec_simd_t * h) -{ - for (int i=0;imax_par_cb;i++) { - if (h->app1[i]) { - free(h->app1[i]); - } - if (h->app2[i]) { - free(h->app2[i]); - } - if (h->ext1[i]) { - free(h->ext1[i]); - } - if (h->ext2[i]) { - free(h->ext2[i]); - } - if (h->syst[i]) { - free(h->syst[i]); - } - if (h->parity0[i]) { - free(h->parity0[i]); - } - if (h->parity1[i]) { - free(h->parity1[i]); - } - } - - map_simd_free(&h->dec); - - for (int i=0;iinterleaver[i]); - } - - bzero(h, sizeof(srslte_tdec_simd_t)); -} - -/* Deinterleaves the 3 streams from the input (systematic and 2 parity bits) into - * 3 buffers ready to be used by compute_gamma() - */ -void deinterleave_input_simd(srslte_tdec_simd_t *h, int16_t *input, uint32_t cbidx, uint32_t long_cb) { - uint32_t i; - - __m128i *inputPtr = (__m128i*) input; - __m128i in0, in1, in2; - __m128i s0, s1, s2, s; - __m128i p00, p01, p02, p0; - __m128i p10, p11, p12, p1; - - __m128i *sysPtr = (__m128i*) h->syst[cbidx]; - __m128i *pa0Ptr = (__m128i*) h->parity0[cbidx]; - __m128i *pa1Ptr = (__m128i*) h->parity1[cbidx]; - - // pick bits 0, 3, 6 from 1st word - __m128i s0_mask = _mm_set_epi8(0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,13,12,7,6,1,0); - // pick bits 1, 4, 7 from 2st word - __m128i s1_mask = _mm_set_epi8(0xff,0xff,0xff,0xff,15,14,9,8,3,2,0xff,0xff,0xff,0xff,0xff,0xff); - // pick bits 2, 5 from 3rd word - __m128i s2_mask = _mm_set_epi8(11,10,5,4,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff); - - // pick bits 1, 4, 7 from 1st word - __m128i p00_mask = _mm_set_epi8(0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,15,14,9,8,3,2); - // pick bits 2, 5, from 2st word - __m128i p01_mask = _mm_set_epi8(0xff,0xff,0xff,0xff,0xff,0xff,11,10,5,4,0xff,0xff,0xff,0xff,0xff,0xff); - // pick bits 0, 3, 6 from 3rd word - __m128i p02_mask = _mm_set_epi8(13,12,7,6,1,0,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff); - - // pick bits 2, 5 from 1st word - __m128i p10_mask = _mm_set_epi8(0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,11,10,5,4); - // pick bits 0, 3, 6, from 2st word - __m128i p11_mask = _mm_set_epi8(0xff,0xff,0xff,0xff,0xff,0xff,13,12,7,6,1,0,0xff,0xff,0xff,0xff); - // pick bits 1, 4, 7 from 3rd word - __m128i p12_mask = _mm_set_epi8(15,14,9,8,3,2,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff); - - // Split systematic and parity bits - for (i = 0; i < long_cb/8; i++) { - - in0 = _mm_load_si128(inputPtr); inputPtr++; - in1 = _mm_load_si128(inputPtr); inputPtr++; - in2 = _mm_load_si128(inputPtr); inputPtr++; - - /* Deinterleave Systematic bits */ - s0 = _mm_shuffle_epi8(in0, s0_mask); - s1 = _mm_shuffle_epi8(in1, s1_mask); - s2 = _mm_shuffle_epi8(in2, s2_mask); - s = _mm_or_si128(s0, s1); - s = _mm_or_si128(s, s2); - - _mm_store_si128(sysPtr, s); - sysPtr++; - - /* Deinterleave parity 0 bits */ - p00 = _mm_shuffle_epi8(in0, p00_mask); - p01 = _mm_shuffle_epi8(in1, p01_mask); - p02 = _mm_shuffle_epi8(in2, p02_mask); - p0 = _mm_or_si128(p00, p01); - p0 = _mm_or_si128(p0, p02); - - _mm_store_si128(pa0Ptr, p0); - pa0Ptr++; - - /* Deinterleave parity 1 bits */ - p10 = _mm_shuffle_epi8(in0, p10_mask); - p11 = _mm_shuffle_epi8(in1, p11_mask); - p12 = _mm_shuffle_epi8(in2, p12_mask); - p1 = _mm_or_si128(p10, p11); - p1 = _mm_or_si128(p1, p12); - - _mm_store_si128(pa1Ptr, p1); - pa1Ptr++; - - } - - for (i = 0; i < 3; i++) { - h->syst[cbidx][i+long_cb] = input[3*long_cb + 2*i]; - h->parity0[cbidx][i+long_cb] = input[3*long_cb + 2*i + 1]; - } - for (i = 0; i < 3; i++) { - h->app2[cbidx][i+long_cb] = input[3*long_cb + 6 + 2*i]; - h->parity1[cbidx][i+long_cb] = input[3*long_cb + 6 + 2*i + 1]; - } - -} - -/* Runs 1 turbo decoder iteration */ -void srslte_tdec_simd_iteration(srslte_tdec_simd_t * h, int16_t * input[SRSLTE_TDEC_MAX_NPAR], uint32_t long_cb) -{ - - int16_t *tmp_app[SRSLTE_TDEC_MAX_NPAR]; - - if (h->current_cbidx >= 0) { - uint16_t *inter = h->interleaver[h->current_cbidx].forward; - uint16_t *deinter = h->interleaver[h->current_cbidx].reverse; - -#ifndef LV_HAVE_AVX2 - input[1] = NULL; -#endif - - h->cb_mask = (input[0]?1:0) | (input[1]?2:0); - - for (int i=0;imax_par_cb;i++) { - if (h->n_iter[i] == 0 && input[i]) { - //printf("deinterleaveing %d\n",i); - deinterleave_input_simd(h, input[i], i, long_cb); - } - } - - // Add apriori information to decoder 1 - for (int i=0;imax_par_cb;i++) { - if (h->n_iter[i] > 0 && input[i]) { - srslte_vec_sub_sss(h->app1[i], h->ext1[i], h->app1[i], long_cb); - } - } - - // Run MAP DEC #1 - for (int i=0;imax_par_cb;i++) { - if (input[i]) { - tmp_app[i] = h->n_iter[i]?h->app1[i]:NULL; - } else { - tmp_app[i] = NULL; - } - } - map_simd_dec(&h->dec, h->syst, tmp_app, h->parity0, h->ext1, h->cb_mask, long_cb); - - // Convert aposteriori information into extrinsic information - for (int i=0;imax_par_cb;i++) { - if (h->n_iter[i] > 0 && input[i]) { - srslte_vec_sub_sss(h->ext1[i], h->app1[i], h->ext1[i], long_cb); - } - } - - // Interleave extrinsic output of DEC1 to form apriori info for decoder 2 - for (int i=0;imax_par_cb;i++) { - if (input[i]) { - srslte_vec_lut_sss(h->ext1[i], deinter, h->app2[i], long_cb); - } - } - - // Run MAP DEC #2. 2nd decoder uses apriori information as systematic bits - map_simd_dec(&h->dec, h->app2, NULL, h->parity1, h->ext2, h->cb_mask, long_cb); - - // Deinterleaved extrinsic bits become apriori info for decoder 1 - for (int i=0;imax_par_cb;i++) { - if (input[i]) { - srslte_vec_lut_sss(h->ext2[i], inter, h->app1[i], long_cb); - } - } - - for (int i=0;imax_par_cb;i++) { - if (input[i]) { - h->n_iter[i]++; - } - } - } else { - fprintf(stderr, "Error CB index not set (call srslte_tdec_simd_reset() first\n"); - } -} - -/* Resets the decoder and sets the codeblock length */ -int srslte_tdec_simd_reset(srslte_tdec_simd_t * h, uint32_t long_cb) -{ - if (long_cb > h->max_long_cb) { - fprintf(stderr, "TDEC was initialized for max_long_cb=%d\n", - h->max_long_cb); - return -1; - } - for (int i=0;imax_par_cb;i++) { - h->n_iter[i] = 0; - } - h->cb_mask = 0; - h->current_cbidx = srslte_cbsegm_cbindex(long_cb); - if (h->current_cbidx < 0) { - fprintf(stderr, "Invalid CB length %d\n", long_cb); - return -1; - } - return 0; -} - -int srslte_tdec_simd_reset_cb(srslte_tdec_simd_t * h, uint32_t cb_idx) -{ - h->n_iter[cb_idx] = 0; - return 0; -} - -int srslte_tdec_simd_get_nof_iterations_cb(srslte_tdec_simd_t * h, uint32_t cb_idx) -{ - return h->n_iter[cb_idx]; -} - -void tdec_simd_decision(srslte_tdec_simd_t * h, uint8_t *output, uint32_t cbidx, uint32_t long_cb) -{ - __m128i zero = _mm_set1_epi16(0); - __m128i lsb_mask = _mm_set1_epi16(1); - - __m128i *appPtr = (__m128i*) h->app1[cbidx]; - __m128i *outPtr = (__m128i*) output; - __m128i ap, out, out0, out1; - - for (uint32_t i = 0; i < long_cb/16; i++) { - ap = _mm_load_si128(appPtr); appPtr++; - out0 = _mm_and_si128(_mm_cmpgt_epi16(ap, zero), lsb_mask); - ap = _mm_load_si128(appPtr); appPtr++; - out1 = _mm_and_si128(_mm_cmpgt_epi16(ap, zero), lsb_mask); - - out = _mm_packs_epi16(out0, out1); - _mm_store_si128(outPtr, out); - outPtr++; - } - if (long_cb%16) { - for (int i=0;i<8;i++) { - output[long_cb-8+i] = h->app1[cbidx][long_cb-8+i]>0?1:0; - } - } -} - -void srslte_tdec_simd_decision(srslte_tdec_simd_t * h, uint8_t *output[SRSLTE_TDEC_MAX_NPAR], uint32_t long_cb) -{ - for (int i=0;imax_par_cb;i++) { - tdec_simd_decision(h, output[i], i, long_cb); - } -} - -void srslte_tdec_simd_decision_byte_cb(srslte_tdec_simd_t * h, uint8_t *output, uint32_t cbidx, uint32_t long_cb) -{ - uint8_t mask[8] = {0x80, 0x40, 0x20, 0x10, 0x8, 0x4, 0x2, 0x1}; - - // long_cb is always byte aligned - for (uint32_t i = 0; i < long_cb/8; i++) { - uint8_t out0 = h->app1[cbidx][8*i+0]>0?mask[0]:0; - uint8_t out1 = h->app1[cbidx][8*i+1]>0?mask[1]:0; - uint8_t out2 = h->app1[cbidx][8*i+2]>0?mask[2]:0; - uint8_t out3 = h->app1[cbidx][8*i+3]>0?mask[3]:0; - uint8_t out4 = h->app1[cbidx][8*i+4]>0?mask[4]:0; - uint8_t out5 = h->app1[cbidx][8*i+5]>0?mask[5]:0; - uint8_t out6 = h->app1[cbidx][8*i+6]>0?mask[6]:0; - uint8_t out7 = h->app1[cbidx][8*i+7]>0?mask[7]:0; - - output[i] = out0 | out1 | out2 | out3 | out4 | out5 | out6 | out7; - } -} - -void srslte_tdec_simd_decision_byte(srslte_tdec_simd_t * h, uint8_t *output[SRSLTE_TDEC_MAX_NPAR], uint32_t long_cb) -{ - for (int i=0;imax_par_cb;i++) { - if (output[i]) { - srslte_tdec_simd_decision_byte_cb(h, output[i], i, long_cb); - } - } -} - - -/* Runs nof_iterations iterations and decides the output bits */ -int srslte_tdec_simd_run_all(srslte_tdec_simd_t * h, int16_t * input[SRSLTE_TDEC_MAX_NPAR], uint8_t *output[SRSLTE_TDEC_MAX_NPAR], - uint32_t nof_iterations, uint32_t long_cb) -{ - if (srslte_tdec_simd_reset(h, long_cb)) { - return SRSLTE_ERROR; - } - - do { - srslte_tdec_simd_iteration(h, input, long_cb); - } while (h->n_iter[0] < nof_iterations); - - srslte_tdec_simd_decision_byte(h, output, long_cb); - - return SRSLTE_SUCCESS; -} - -#endif - - diff --git a/lib/src/phy/fec/turbodecoder_simd_inter.c b/lib/src/phy/fec/turbodecoder_simd_inter.c deleted file mode 100644 index 3c04e2136..000000000 --- a/lib/src/phy/fec/turbodecoder_simd_inter.c +++ /dev/null @@ -1,299 +0,0 @@ -/** - * - * \section COPYRIGHT - * - * Copyright 2013-2015 Software Radio Systems Limited - * - * \section LICENSE - * - * This file is part of the srsLTE library. - * - * srsLTE is free software: you can redistribute it and/or modify - * it under the terms of the GNU Affero General Public License as - * published by the Free Software Foundation, either version 3 of - * the License, or (at your option) any later version. - * - * srsLTE is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Affero General Public License for more details. - * - * A copy of the GNU Affero General Public License can be found in - * the LICENSE file in the top-level directory of this distribution - * and at http://www.gnu.org/licenses/. - * - */ - -#include -#include -#include -#include -#include -#include - -#include "srslte/phy/fec/turbodecoder_simd_inter.h" -#include "srslte/phy/utils/vector.h" - -#define TOTALTAIL 12 - -#ifdef LV_HAVE_SSE -#include - -void map_see_inter_alpha(srslte_tdec_simd_inter_t * s, int16_t *input, int16_t *parity, uint32_t long_cb); -void map_sse_inter_beta(srslte_tdec_simd_inter_t * s, int16_t *input, int16_t *parity, int16_t * output, uint32_t long_cb); -void sse_inter_update_w(srslte_tdec_simd_inter_t *h, uint16_t *deinter, uint32_t long_cb); -void sse_inter_extract_syst1(srslte_tdec_simd_inter_t *h, uint16_t *inter, uint32_t long_cb); - - -static void map_sse_inter_dec(srslte_tdec_simd_inter_t * h, int16_t * input, int16_t * parity, int16_t * output, - uint32_t long_cb) -{ - map_see_inter_alpha(h, input, parity, long_cb); - map_sse_inter_beta(h, input, parity, output, long_cb); -} - -/************************************************ - * - * TURBO DECODER INTERFACE - * - ************************************************/ -int srslte_tdec_simd_inter_init(srslte_tdec_simd_inter_t * h, uint32_t max_par_cb, uint32_t max_long_cb) -{ - int ret = -1; - bzero(h, sizeof(srslte_tdec_simd_inter_t)); - uint32_t len = max_long_cb + 12; - - h->max_long_cb = max_long_cb; - h->max_par_cb = max_par_cb; - - h->llr1 = srslte_vec_malloc(sizeof(int16_t) * len * h->max_par_cb); - if (!h->llr1) { - perror("srslte_vec_malloc"); - goto clean_and_exit; - } - h->llr2 = srslte_vec_malloc(sizeof(int16_t) * len * h->max_par_cb); - if (!h->llr2) { - perror("srslte_vec_malloc"); - goto clean_and_exit; - } - h->w = srslte_vec_malloc(sizeof(int16_t) * len * h->max_par_cb); - if (!h->w) { - perror("srslte_vec_malloc"); - goto clean_and_exit; - } - h->syst0 = srslte_vec_malloc(sizeof(int16_t) * len * h->max_par_cb); - if (!h->syst0) { - perror("srslte_vec_malloc"); - goto clean_and_exit; - } - h->syst1 = srslte_vec_malloc(sizeof(int16_t) * len * h->max_par_cb); - if (!h->syst1) { - perror("srslte_vec_malloc"); - goto clean_and_exit; - } - h->parity0 = srslte_vec_malloc(sizeof(int16_t) * len * h->max_par_cb); - if (!h->parity0) { - perror("srslte_vec_malloc"); - goto clean_and_exit; - } - h->parity1 = srslte_vec_malloc(sizeof(int16_t) * len * h->max_par_cb); - if (!h->parity1) { - perror("srslte_vec_malloc"); - goto clean_and_exit; - } - h->alpha = srslte_vec_malloc(sizeof(int16_t) * 8*(len+12) * h->max_par_cb); - if (!h->alpha) { - perror("srslte_vec_malloc"); - goto clean_and_exit; - } - - for (int i=0;iinterleaver[i], srslte_cbsegm_cbsize(i)) < 0) { - goto clean_and_exit; - } - srslte_tc_interl_LTE_gen(&h->interleaver[i], srslte_cbsegm_cbsize(i)); - } - h->current_cbidx = -1; - ret = 0; -clean_and_exit:if (ret == -1) { - srslte_tdec_simd_inter_free(h); - } - return ret; -} - -void srslte_tdec_simd_inter_free(srslte_tdec_simd_inter_t * h) -{ - if (h->llr1) { - free(h->llr1); - } - if (h->llr2) { - free(h->llr2); - } - if (h->w) { - free(h->w); - } - if (h->syst0) { - free(h->syst0); - } - if (h->syst1) { - free(h->syst1); - } - if (h->parity0) { - free(h->parity0); - } - if (h->parity1) { - free(h->parity1); - } - if (h->alpha) { - free(h->alpha); - } - - for (int i=0;iinterleaver[i]); - } - - bzero(h, sizeof(srslte_tdec_simd_inter_t)); -} - - -/* Deinterleave for inter-frame parallelization */ -void extract_input(srslte_tdec_simd_inter_t *h, int16_t *input, uint32_t cbidx, uint32_t long_cb) -{ - for (int i=0;isyst0[h->max_par_cb*i+cbidx] = input[3*i+0]; - h->parity0[h->max_par_cb*i+cbidx] = input[3*i+1]; - h->parity1[h->max_par_cb*i+cbidx] = input[3*i+2]; - } - for (int i = long_cb; i < long_cb + 3; i++) { - h->syst0[h->max_par_cb*i+cbidx] = input[3*long_cb + 2*(i - long_cb)]; - h->syst1[h->max_par_cb*i+cbidx] = input[3*long_cb + 2*(i - long_cb)]; - h->parity0[h->max_par_cb*i+cbidx] = input[3*long_cb + 2*(i - long_cb) + 1]; - h->parity0[h->max_par_cb*i+cbidx] = input[3*long_cb + 2*(i - long_cb) + 2]; - } -} - -void srslte_tdec_simd_inter_iteration(srslte_tdec_simd_inter_t * h, int16_t *input[SRSLTE_TDEC_MAX_NPAR], uint32_t nof_cb, uint32_t long_cb) -{ - - if (h->current_cbidx >= 0) { - - uint16_t *inter = h->interleaver[h->current_cbidx].forward; - uint16_t *deinter = h->interleaver[h->current_cbidx].reverse; - - // Prepare systematic and parity bits for MAP DEC #1 - for (int i=0;in_iter[i] == 0) { - extract_input(h, input[i], i, long_cb); - } - srslte_vec_sum_sss(h->syst0, h->w, h->syst0, long_cb*h->max_par_cb); - } - - // Run MAP DEC #1 - map_sse_inter_dec(h, h->syst0, h->parity0, h->llr1, long_cb); - - // Prepare systematic and parity bits for MAP DEC #1 - sse_inter_extract_syst1(h, inter, long_cb); - - // Run MAP DEC #2 - map_sse_inter_dec(h, h->syst1, h->parity1, h->llr2, long_cb); - - // Update a-priori LLR from the last iteration - sse_inter_update_w(h, deinter, long_cb); - - } else { - fprintf(stderr, "Error CB index not set (call srslte_tdec_simd_inter_reset() first\n"); - } -} - -int srslte_tdec_simd_inter_reset_cb(srslte_tdec_simd_inter_t * h, uint32_t cb_idx) -{ - for (int i=0;icurrent_long_cb;i++) { - h->w[h->max_par_cb*i+cb_idx] = 0; - } - return 0; -} - -int srslte_tdec_simd_inter_reset(srslte_tdec_simd_inter_t * h, uint32_t long_cb) -{ - if (long_cb > h->max_long_cb) { - fprintf(stderr, "TDEC was initialized for max_long_cb=%d\n", - h->max_long_cb); - return -1; - } - h->current_long_cb = long_cb; - h->current_cbidx = srslte_cbsegm_cbindex(long_cb); - if (h->current_cbidx < 0) { - fprintf(stderr, "Invalid CB length %d\n", long_cb); - return -1; - } - memset(h->w, 0, sizeof(int16_t) * long_cb * h->max_par_cb); - return 0; -} - -void srslte_tdec_simd_inter_decision_cb(srslte_tdec_simd_inter_t * h, uint8_t *output, uint32_t cb_idx, uint32_t long_cb) -{ - uint16_t *deinter = h->interleaver[h->current_cbidx].reverse; - uint32_t i; - for (i = 0; i < long_cb; i++) { - output[i] = (h->llr2[h->max_par_cb*deinter[i]+cb_idx] > 0) ? 1 : 0; - } -} - -void srslte_tdec_simd_inter_decision(srslte_tdec_simd_inter_t * h, uint8_t *output[SRSLTE_TDEC_MAX_NPAR], uint32_t nof_cb, uint32_t long_cb) -{ - for (int i=0;iinterleaver[h->current_cbidx].reverse; - -#define indexOf_cb(idx, cb) (h->max_par_cb*(deinter[8*i+idx])+cb) - - // long_cb is always byte aligned - for (i = 0; i < long_cb/8; i++) { - uint8_t out0 = h->llr2[indexOf_cb(0, cb_idx)]>0?mask[0]:0; - uint8_t out1 = h->llr2[indexOf_cb(1, cb_idx)]>0?mask[1]:0; - uint8_t out2 = h->llr2[indexOf_cb(2, cb_idx)]>0?mask[2]:0; - uint8_t out3 = h->llr2[indexOf_cb(3, cb_idx)]>0?mask[3]:0; - uint8_t out4 = h->llr2[indexOf_cb(4, cb_idx)]>0?mask[4]:0; - uint8_t out5 = h->llr2[indexOf_cb(5, cb_idx)]>0?mask[5]:0; - uint8_t out6 = h->llr2[indexOf_cb(6, cb_idx)]>0?mask[6]:0; - uint8_t out7 = h->llr2[indexOf_cb(7, cb_idx)]>0?mask[7]:0; - - output[i] = out0 | out1 | out2 | out3 | out4 | out5 | out6 | out7; - } -} - -void srslte_tdec_simd_inter_decision_byte(srslte_tdec_simd_inter_t * h, uint8_t *output[SRSLTE_TDEC_MAX_NPAR], uint32_t nof_cb, uint32_t long_cb) -{ - for (int i=0;i -#include -#include -#include -#include -#include - -#include "srslte/phy/fec/turbodecoder_simd.h" -#include "srslte/phy/utils/vector.h" - -#include - -#ifdef LV_HAVE_SSE -#include -#endif - - -#define NUMSTATES 8 -#define NINPUTS 2 -#define TAIL 3 -#define TOTALTAIL 12 - -#define INF 10000 -#define ZERO 0 - - -#ifdef LV_HAVE_SSE - -/* -static void print_128i(__m128i x) { - int16_t *s = (int16_t*) &x; - printf("[%d", s[0]); - for (int i=1;i<8;i++) { - printf(",%d", s[i]); - } - printf("]\n"); -} -*/ -//#define use_beta_transposed_max - -#ifndef use_beta_transposed_max - -/* Computes the horizontal MAX from 8 16-bit integers using the minpos_epu16 SSE4.1 instruction */ -static inline int16_t hMax(__m128i buffer) -{ - __m128i tmp1 = _mm_sub_epi16(_mm_set1_epi16(0x7FFF), buffer); - __m128i tmp3 = _mm_minpos_epu16(tmp1); - return (int16_t)(_mm_cvtsi128_si32(tmp3)); -} - -/* Computes beta values */ -void map_sse_beta(map_gen_t * s, int16_t * output, uint32_t long_cb) -{ - int k; - uint32_t end = long_cb + 3; - const __m128i *alphaPtr = (const __m128i*) s->alpha; - - __m128i beta_k = _mm_set_epi16(-INF, -INF, -INF, -INF, -INF, -INF, -INF, 0); - __m128i g, bp, bn, alpha_k; - - /* Define the shuffle constant for the positive beta */ - __m128i shuf_bp = _mm_set_epi8( - 15, 14, // 7 - 7, 6, // 3 - 5, 4, // 2 - 13, 12, // 6 - 11, 10, // 5 - 3, 2, // 1 - 1, 0, // 0 - 9, 8 // 4 - ); - - /* Define the shuffle constant for the negative beta */ - __m128i shuf_bn = _mm_set_epi8( - 7, 6, // 3 - 15, 14, // 7 - 13, 12, // 6 - 5, 4, // 2 - 3, 2, // 1 - 11, 10, // 5 - 9, 8, // 4 - 1, 0 // 0 - ); - - alphaPtr += long_cb-1; - - /* Define shuffle for branch costs */ - __m128i shuf_g[4]; - shuf_g[3] = _mm_set_epi8(3,2,1,0,1,0,3,2,3,2,1,0,1,0,3,2); - shuf_g[2] = _mm_set_epi8(7,6,5,4,5,4,7,6,7,6,5,4,5,4,7,6); - shuf_g[1] = _mm_set_epi8(11,10,9,8,9,8,11,10,11,10,9,8,9,8,11,10); - shuf_g[0] = _mm_set_epi8(15,14,13,12,13,12,15,14,15,14,13,12,13,12,15,14); - __m128i gv; - int16_t *b = &s->branch[2*long_cb-8]; - __m128i *gPtr = (__m128i*) b; - /* Define shuffle for beta normalization */ - __m128i shuf_norm = _mm_set_epi8(1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0); - - /* This defines a beta computation step: - * Adds and substracts the branch metrics to the previous beta step, - * shuffles the states according to the trellis path and selects maximum state - */ -#define BETA_STEP(g) bp = _mm_add_epi16(beta_k, g);\ - bn = _mm_sub_epi16(beta_k, g);\ - bp = _mm_shuffle_epi8(bp, shuf_bp);\ - bn = _mm_shuffle_epi8(bn, shuf_bn);\ - beta_k = _mm_max_epi16(bp, bn); - - /* Loads the alpha metrics from memory and adds them to the temporal bn and bp - * metrics. Then computes horizontal maximum of both metrics and computes difference - */ -#define BETA_STEP_CNT(c,d) g = _mm_shuffle_epi8(gv, shuf_g[c]);\ - BETA_STEP(g)\ - alpha_k = _mm_load_si128(alphaPtr);\ - alphaPtr--;\ - bp = _mm_add_epi16(bp, alpha_k);\ - bn = _mm_add_epi16(bn, alpha_k);\ - output[k-d] = hMax(bn)-hMax(bp); - - /* The tail does not require to load alpha or produce outputs. Only update - * beta metrics accordingly */ - for (k=end-1; k>=long_cb; k--) { - int16_t g0 = s->branch[2*k]; - int16_t g1 = s->branch[2*k+1]; - g = _mm_set_epi16(g1, g0, g0, g1, g1, g0, g0, g1); - BETA_STEP(g); - } - - /* We inline 2 trelis steps for each normalization */ - __m128i norm; - for (; k >= 0; k-=8) { - gv = _mm_load_si128(gPtr); - gPtr--; - - BETA_STEP_CNT(0,0); - BETA_STEP_CNT(1,1); - BETA_STEP_CNT(2,2); - BETA_STEP_CNT(3,3); - norm = _mm_shuffle_epi8(beta_k, shuf_norm); - beta_k = _mm_sub_epi16(beta_k, norm); - gv = _mm_load_si128(gPtr); - gPtr--; - BETA_STEP_CNT(0,4); - BETA_STEP_CNT(1,5); - BETA_STEP_CNT(2,6); - BETA_STEP_CNT(3,7); - - norm = _mm_shuffle_epi8(beta_k, shuf_norm); - beta_k = _mm_sub_epi16(beta_k, norm); - } -} - -#endif - -/* Computes alpha metrics */ -void map_sse_alpha(map_gen_t * s, uint32_t long_cb) -{ - uint32_t k; - int16_t *alpha = s->alpha; - uint32_t i; - - alpha[0] = 0; - for (i = 1; i < 8; i++) { - alpha[i] = -INF; - } - - /* Define the shuffle constant for the positive alpha */ - __m128i shuf_ap = _mm_set_epi8( - 15, 14, // 7 - 9, 8, // 4 - 7, 6, // 3 - 1, 0, // 0 - 13, 12, // 6 - 11, 10, // 5 - 5, 4, // 2 - 3, 2 // 1 - ); - - /* Define the shuffle constant for the negative alpha */ - __m128i shuf_an = _mm_set_epi8( - 13, 12, // 6 - 11, 10, // 5 - 5, 4, // 2 - 3, 2, // 1 - 15, 14, // 7 - 9, 8, // 4 - 7, 6, // 3 - 1, 0 // 0 - ); - - /* Define shuffle for branch costs */ - __m128i shuf_g[4]; - shuf_g[0] = _mm_set_epi8(3,2,3,2,1,0,1,0,1,0,1,0,3,2,3,2); - shuf_g[1] = _mm_set_epi8(7,6,7,6,5,4,5,4,5,4,5,4,7,6,7,6); - shuf_g[2] = _mm_set_epi8(11,10,11,10,9,8,9,8,9,8,9,8,11,10,11,10); - shuf_g[3] = _mm_set_epi8(15,14,15,14,13,12,13,12,13,12,13,12,15,14,15,14); - - __m128i shuf_norm = _mm_set_epi8(1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0); - - __m128i* alphaPtr = (__m128i*) alpha; - alphaPtr++; - - __m128i gv; - __m128i *gPtr = (__m128i*) s->branch; - __m128i g, ap, an; - - __m128i alpha_k = _mm_set_epi16(-INF, -INF, -INF, -INF, -INF, -INF, -INF, 0); - - /* This defines a alpha computation step: - * Adds and substracts the branch metrics to the previous alpha step, - * shuffles the states according to the trellis path and selects maximum state - */ -#define ALPHA_STEP(c) g = _mm_shuffle_epi8(gv, shuf_g[c]); \ - ap = _mm_add_epi16(alpha_k, g);\ - an = _mm_sub_epi16(alpha_k, g);\ - ap = _mm_shuffle_epi8(ap, shuf_ap);\ - an = _mm_shuffle_epi8(an, shuf_an);\ - alpha_k = _mm_max_epi16(ap, an);\ - _mm_store_si128(alphaPtr, alpha_k);\ - alphaPtr++; \ - - /* In this loop, we compute 8 steps and normalize twice for each branch metrics memory load */ - __m128i norm; - for (k = 0; k < long_cb/8; k++) { - gv = _mm_load_si128(gPtr); - gPtr++; - ALPHA_STEP(0); - ALPHA_STEP(1); - ALPHA_STEP(2); - ALPHA_STEP(3); - norm = _mm_shuffle_epi8(alpha_k, shuf_norm); - alpha_k = _mm_sub_epi16(alpha_k, norm); - gv = _mm_load_si128(gPtr); - gPtr++; - ALPHA_STEP(0); - ALPHA_STEP(1); - ALPHA_STEP(2); - ALPHA_STEP(3); - norm = _mm_shuffle_epi8(alpha_k, shuf_norm); - alpha_k = _mm_sub_epi16(alpha_k, norm); - } -} - -/* Compute branch metrics (gamma) */ -void map_sse_gamma(map_gen_t * h, int16_t *input, int16_t *app, int16_t *parity, uint32_t long_cb) -{ - __m128i res00, res10, res01, res11, res0, res1; - __m128i in, ap, pa, g1, g0; - - __m128i *inPtr = (__m128i*) input; - __m128i *appPtr = (__m128i*) app; - __m128i *paPtr = (__m128i*) parity; - __m128i *resPtr = (__m128i*) h->branch; - - __m128i res00_mask = _mm_set_epi8(0xff,0xff,7,6,0xff,0xff,5,4,0xff,0xff,3,2,0xff,0xff,1,0); - __m128i res10_mask = _mm_set_epi8(0xff,0xff,15,14,0xff,0xff,13,12,0xff,0xff,11,10,0xff,0xff,9,8); - __m128i res01_mask = _mm_set_epi8(7,6,0xff,0xff,5,4,0xff,0xff,3,2,0xff,0xff,1,0,0xff,0xff); - __m128i res11_mask = _mm_set_epi8(15,14,0xff,0xff,13,12,0xff,0xff,11,10,0xff,0xff,9,8,0xff,0xff); - - for (int i=0;ibranch[2*i] = (input[i] - parity[i])/2; - h->branch[2*i+1] = (input[i] + parity[i])/2; - } -} - - - - - - -/*********************** - * - * This is an attempt to parallelize the horizontal max - * by doing a 8x8 tranpose of the vectors and computing max - * in cascade. However since we need to store 16 registers - * for the positive and negative values the performance is not very good - */ - - -#ifdef use_beta_transposed_max - -static inline __m128i transposed_max(__m128i a, __m128i b, __m128i c, __m128i d, - __m128i e, __m128i f, __m128i g, __m128i h) -{ - // Transpose 8 vectors - __m128i t0 = _mm_unpacklo_epi16(a, b); - __m128i t1 = _mm_unpacklo_epi16(c, d); - __m128i t2 = _mm_unpacklo_epi16(e, f); - __m128i t3 = _mm_unpacklo_epi16(g, h); - __m128i t4 = _mm_unpackhi_epi16(a, b); - __m128i t5 = _mm_unpackhi_epi16(c, d); - __m128i t6 = _mm_unpackhi_epi16(e, f); - __m128i t7 = _mm_unpackhi_epi16(g, h); - - __m128i s0 = _mm_unpacklo_epi32(t0, t1); - __m128i s1 = _mm_unpackhi_epi32(t0, t1); - __m128i s2 = _mm_unpacklo_epi32(t2, t3); - __m128i s3 = _mm_unpackhi_epi32(t2, t3); - __m128i s4 = _mm_unpacklo_epi32(t4, t5); - __m128i s5 = _mm_unpackhi_epi32(t4, t5); - __m128i s6 = _mm_unpacklo_epi32(t6, t7); - __m128i s7 = _mm_unpackhi_epi32(t6, t7); - - __m128i x0 = _mm_unpacklo_epi64(s0, s2); - __m128i x1 = _mm_unpackhi_epi64(s0, s2); - __m128i x2 = _mm_unpacklo_epi64(s1, s3); - __m128i x3 = _mm_unpackhi_epi64(s1, s3); - __m128i x4 = _mm_unpacklo_epi64(s4, s6); - __m128i x5 = _mm_unpackhi_epi64(s4, s6); - __m128i x6 = _mm_unpacklo_epi64(s5, s7); - __m128i x7 = _mm_unpackhi_epi64(s5, s7); - - // Cascade max on the transposed vector - __m128i res = _mm_max_epi16(x0, - _mm_max_epi16(x1, - _mm_max_epi16(x2, - _mm_max_epi16(x3, - _mm_max_epi16(x4, - _mm_max_epi16(x5, - _mm_max_epi16(x6, - x7))))))); - - return res; -} - -void map_sse_beta(map_gen_t * s, int16_t * output, uint32_t long_cb) -{ - int k; - uint32_t end = long_cb + 3; - const __m128i *alphaPtr = (const __m128i*) s->alpha; - - __m128i beta_k = _mm_set_epi16(-INF, -INF, -INF, -INF, -INF, -INF, -INF, 0); - __m128i g, alpha_k; - __m128i bn, bn_0, bn_1, bn_2, bn_3, bn_4, bn_5, bn_6, bn_7; - __m128i bp, bp_0, bp_1, bp_2, bp_3, bp_4, bp_5, bp_6, bp_7; - - /* Define the shuffle constant for the positive beta */ - __m128i shuf_bp = _mm_set_epi8( - 15, 14, // 7 - 7, 6, // 3 - 5, 4, // 2 - 13, 12, // 6 - 11, 10, // 5 - 3, 2, // 1 - 1, 0, // 0 - 9, 8 // 4 - ); - - /* Define the shuffle constant for the negative beta */ - __m128i shuf_bn = _mm_set_epi8( - 7, 6, // 3 - 15, 14, // 7 - 13, 12, // 6 - 5, 4, // 2 - 3, 2, // 1 - 11, 10, // 5 - 9, 8, // 4 - 1, 0 // 0 - ); - - alphaPtr += long_cb-1; - - /* Define shuffle for branch costs */ - __m128i shuf_g[4]; - shuf_g[3] = _mm_set_epi8(3,2,1,0,1,0,3,2,3,2,1,0,1,0,3,2); - shuf_g[2] = _mm_set_epi8(7,6,5,4,5,4,7,6,7,6,5,4,5,4,7,6); - shuf_g[1] = _mm_set_epi8(11,10,9,8,9,8,11,10,11,10,9,8,9,8,11,10); - shuf_g[0] = _mm_set_epi8(15,14,13,12,13,12,15,14,15,14,13,12,13,12,15,14); - __m128i gv; - int16_t *b = &s->branch[2*long_cb-8]; - __m128i *gPtr = (__m128i*) b; - /* Define shuffle for beta normalization */ - __m128i shuf_norm = _mm_set_epi8(1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0); - - /* This defines a beta computation step: - * Adds and substracts the branch metrics to the previous beta step, - * shuffles the states according to the trellis path and selects maximum state - */ -#define BETA_STEP(g) bp = _mm_add_epi16(beta_k, g);\ - bn = _mm_sub_epi16(beta_k, g);\ - bp = _mm_shuffle_epi8(bp, shuf_bp);\ - bn = _mm_shuffle_epi8(bn, shuf_bn);\ - beta_k = _mm_max_epi16(bp, bn); - - /* Loads the alpha metrics from memory and adds them to the temporal bn and bp - * metrics. - */ -#define BETA_STEP_CNT(c,d) g = _mm_shuffle_epi8(gv, shuf_g[c]);\ - BETA_STEP(g)\ - alpha_k = _mm_load_si128(alphaPtr);\ - alphaPtr--;\ - bp_##d = _mm_add_epi16(bp, alpha_k);\ - bn_##d = _mm_add_epi16(bn, alpha_k);\ - - /* The tail does not require to load alpha or produce outputs. Only update - * beta metrics accordingly */ - for (k=end-1; k>=long_cb; k--) { - int16_t g0 = s->branch[2*k]; - int16_t g1 = s->branch[2*k+1]; - g = _mm_set_epi16(g1, g0, g0, g1, g1, g0, g0, g1); - BETA_STEP(g); - } - - /* We inline 2 trelis steps for each normalization */ - __m128i norm; - __m128i *outPtr = (__m128i*) &output[long_cb-8]; - for (; k >= 0; k-=8) { - gv = _mm_load_si128(gPtr); - gPtr--; - - BETA_STEP_CNT(0,0); - BETA_STEP_CNT(1,1); - BETA_STEP_CNT(2,2); - BETA_STEP_CNT(3,3); - norm = _mm_shuffle_epi8(beta_k, shuf_norm); - beta_k = _mm_sub_epi16(beta_k, norm); - gv = _mm_load_si128(gPtr); - gPtr--; - BETA_STEP_CNT(0,4); - BETA_STEP_CNT(1,5); - BETA_STEP_CNT(2,6); - BETA_STEP_CNT(3,7); - norm = _mm_shuffle_epi8(beta_k, shuf_norm); - beta_k = _mm_sub_epi16(beta_k, norm); - - __m128i bn_transp = transposed_max(bn_7, bn_6, bn_5, bn_4, bn_3, bn_2, bn_1, bn_0); - __m128i bp_transp = transposed_max(bp_7, bp_6, bp_5, bp_4, bp_3, bp_2, bp_1, bp_0); - __m128i outval = _mm_sub_epi16(bp_transp,bn_transp); - _mm_store_si128(outPtr, outval); - outPtr--; - } -} -#endif - - - - -#endif - - +/** + * + * \section COPYRIGHT + * + * Copyright 2013-2015 Software Radio Systems Limited + * + * \section LICENSE + * + * This file is part of the srsLTE library. + * + * srsLTE is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License as + * published by the Free Software Foundation, either version 3 of + * the License, or (at your option) any later version. + * + * srsLTE is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * A copy of the GNU Affero General Public License can be found in + * the LICENSE file in the top-level directory of this distribution + * and at http://www.gnu.org/licenses/. + * + */ + +#include +#include +#include +#include +#include +#include + +#include "srslte/phy/fec/turbodecoder_sse.h" +#include "srslte/phy/utils/vector.h" + +#include + +#ifdef LV_HAVE_SSE +#include +#include + +#endif + + +#define NUMSTATES 8 +#define NINPUTS 2 +#define TAIL 3 +#define TOTALTAIL 12 + +#define INF 10000 + + +#ifdef LV_HAVE_SSE + + +#define debug_enabled 0 + +#if debug_enabled +#define debug_state(c,d) printf("k=%5d, in=%5d, pa=%5d, out=%5d, alpha=", k-d,\ + s->branch[2*(k-d)] + s->branch[2*(k-d)+1], \ + -s->branch[2*(k-d)] + s->branch[2*(k-d)+1], output[k-d]);print_128i(alpha_k);\ + printf(", beta=");print_128i(beta_k);printf("\n"); + +static void print_128i(__m128i x) { + int16_t *s = (int16_t*) &x; + printf("[%5d", s[0]); + for (int i=1;i<8;i++) { + printf(",%5d", s[i]); + } + printf("]"); +} + +static uint32_t max_128i(__m128i x) { + int16_t *s = (int16_t*) &x; + int16_t m = -INF; + uint32_t max = 0; + for (int i=1;i<8;i++) { + if (s[i] > m) { + max = i; + m = s[i]; + } + } + return max; +} + +#else +#define debug_state(c,d) +#endif + + +//#define use_beta_transposed_max + +#ifndef use_beta_transposed_max + +/* Computes the horizontal MAX from 8 16-bit integers using the minpos_epu16 SSE4.1 instruction */ +static inline int16_t hMax(__m128i buffer) +{ + __m128i tmp1 = _mm_sub_epi16(_mm_set1_epi16(0x7FFF), buffer); + __m128i tmp3 = _mm_minpos_epu16(tmp1); + return (int16_t)(_mm_cvtsi128_si32(tmp3)); +} + +/* Computes beta values */ +void tdec_sse_beta(tdec_sse_t * s, int16_t * output, uint32_t long_cb) +{ + int k; + uint32_t end = long_cb + 3; + const __m128i *alphaPtr = (const __m128i*) s->alpha; + + __m128i beta_k = _mm_set_epi16(-INF, -INF, -INF, -INF, -INF, -INF, -INF, 0); + __m128i g, bp, bn, alpha_k; + + /* Define the shuffle constant for the positive beta */ + __m128i shuf_bp = _mm_set_epi8( + 15, 14, // 7 + 7, 6, // 3 + 5, 4, // 2 + 13, 12, // 6 + 11, 10, // 5 + 3, 2, // 1 + 1, 0, // 0 + 9, 8 // 4 + ); + + /* Define the shuffle constant for the negative beta */ + __m128i shuf_bn = _mm_set_epi8( + 7, 6, // 3 + 15, 14, // 7 + 13, 12, // 6 + 5, 4, // 2 + 3, 2, // 1 + 11, 10, // 5 + 9, 8, // 4 + 1, 0 // 0 + ); + + alphaPtr += long_cb-1; + + /* Define shuffle for branch costs */ + __m128i shuf_g[4]; + shuf_g[3] = _mm_set_epi8(3,2,1,0,1,0,3,2,3,2,1,0,1,0,3,2); + shuf_g[2] = _mm_set_epi8(7,6,5,4,5,4,7,6,7,6,5,4,5,4,7,6); + shuf_g[1] = _mm_set_epi8(11,10,9,8,9,8,11,10,11,10,9,8,9,8,11,10); + shuf_g[0] = _mm_set_epi8(15,14,13,12,13,12,15,14,15,14,13,12,13,12,15,14); + __m128i gv; + int16_t *b = &s->branch[2*long_cb-8]; + __m128i *gPtr = (__m128i*) b; + /* Define shuffle for beta normalization */ + __m128i shuf_norm = _mm_set_epi8(1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0); + + /* This defines a beta computation step: + * Adds and substracts the branch metrics to the previous beta step, + * shuffles the states according to the trellis path and selects maximum state + */ +#define BETA_STEP(g) bp = _mm_add_epi16(beta_k, g);\ + bn = _mm_sub_epi16(beta_k, g);\ + bp = _mm_shuffle_epi8(bp, shuf_bp);\ + bn = _mm_shuffle_epi8(bn, shuf_bn);\ + beta_k = _mm_max_epi16(bp, bn); + + /* Loads the alpha metrics from memory and adds them to the temporal bn and bp + * metrics. Then computes horizontal maximum of both metrics and computes difference + */ +#define BETA_STEP_CNT(c,d) g = _mm_shuffle_epi8(gv, shuf_g[c]);\ + BETA_STEP(g)\ + alpha_k = _mm_load_si128(alphaPtr);\ + alphaPtr--;\ + bp = _mm_add_epi16(bp, alpha_k);\ + bn = _mm_add_epi16(bn, alpha_k);\ + output[k-d] = hMax(bn)-hMax(bp);\ + debug_state(c,d); + + + /* The tail does not require to load alpha or produce outputs. Only update + * beta metrics accordingly */ + for (k=end-1; k>=long_cb; k--) { + int16_t g0 = s->branch[2*k]; + int16_t g1 = s->branch[2*k+1]; + g = _mm_set_epi16(g1, g0, g0, g1, g1, g0, g0, g1); + BETA_STEP(g); + } + + /* We inline 2 trelis steps for each normalization */ + __m128i norm; + for (; k >= 0; k-=8) { + gv = _mm_load_si128(gPtr); + gPtr--; + + BETA_STEP_CNT(0,0); + BETA_STEP_CNT(1,1); + BETA_STEP_CNT(2,2); + BETA_STEP_CNT(3,3); + norm = _mm_shuffle_epi8(beta_k, shuf_norm); + beta_k = _mm_sub_epi16(beta_k, norm); + gv = _mm_load_si128(gPtr); + gPtr--; + BETA_STEP_CNT(0,4); + BETA_STEP_CNT(1,5); + BETA_STEP_CNT(2,6); + BETA_STEP_CNT(3,7); + + norm = _mm_shuffle_epi8(beta_k, shuf_norm); + beta_k = _mm_sub_epi16(beta_k, norm); + } +} + +#endif + +/* Computes alpha metrics */ +void tdec_sse_alpha(tdec_sse_t * s, uint32_t long_cb) +{ + uint32_t k; + int16_t *alpha = s->alpha; + uint32_t i; + + alpha[0] = 0; + for (i = 1; i < 8; i++) { + alpha[i] = -INF; + } + + /* Define the shuffle constant for the positive alpha */ + __m128i shuf_ap = _mm_set_epi8( + 15, 14, // 7 + 9, 8, // 4 + 7, 6, // 3 + 1, 0, // 0 + 13, 12, // 6 + 11, 10, // 5 + 5, 4, // 2 + 3, 2 // 1 + ); + + /* Define the shuffle constant for the negative alpha */ + __m128i shuf_an = _mm_set_epi8( + 13, 12, // 6 + 11, 10, // 5 + 5, 4, // 2 + 3, 2, // 1 + 15, 14, // 7 + 9, 8, // 4 + 7, 6, // 3 + 1, 0 // 0 + ); + + /* Define shuffle for branch costs */ + __m128i shuf_g[4]; + shuf_g[0] = _mm_set_epi8(3,2,3,2,1,0,1,0,1,0,1,0,3,2,3,2); + shuf_g[1] = _mm_set_epi8(7,6,7,6,5,4,5,4,5,4,5,4,7,6,7,6); + shuf_g[2] = _mm_set_epi8(11,10,11,10,9,8,9,8,9,8,9,8,11,10,11,10); + shuf_g[3] = _mm_set_epi8(15,14,15,14,13,12,13,12,13,12,13,12,15,14,15,14); + + __m128i shuf_norm = _mm_set_epi8(1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0); + + __m128i* alphaPtr = (__m128i*) alpha; + alphaPtr++; + + __m128i gv; + __m128i *gPtr = (__m128i*) s->branch; + __m128i g, ap, an; + + __m128i alpha_k = _mm_set_epi16(-INF, -INF, -INF, -INF, -INF, -INF, -INF, 0); + + /* This defines a alpha computation step: + * Adds and substracts the branch metrics to the previous alpha step, + * shuffles the states according to the trellis path and selects maximum state + */ +#define ALPHA_STEP(c) g = _mm_shuffle_epi8(gv, shuf_g[c]); \ + ap = _mm_add_epi16(alpha_k, g);\ + an = _mm_sub_epi16(alpha_k, g);\ + ap = _mm_shuffle_epi8(ap, shuf_ap);\ + an = _mm_shuffle_epi8(an, shuf_an);\ + alpha_k = _mm_max_epi16(ap, an);\ + _mm_store_si128(alphaPtr, alpha_k);\ + alphaPtr++; \ + + /* In this loop, we compute 8 steps and normalize twice for each branch metrics memory load */ + __m128i norm; + for (k = 0; k < long_cb/8; k++) { + gv = _mm_load_si128(gPtr); + gPtr++; + ALPHA_STEP(0); + ALPHA_STEP(1); + ALPHA_STEP(2); + ALPHA_STEP(3); + norm = _mm_shuffle_epi8(alpha_k, shuf_norm); + alpha_k = _mm_sub_epi16(alpha_k, norm); + gv = _mm_load_si128(gPtr); + gPtr++; + ALPHA_STEP(0); + ALPHA_STEP(1); + ALPHA_STEP(2); + ALPHA_STEP(3); + norm = _mm_shuffle_epi8(alpha_k, shuf_norm); + alpha_k = _mm_sub_epi16(alpha_k, norm); + } +} + +/* Compute branch metrics (gamma) */ +void tdec_sse_gamma(tdec_sse_t * h, int16_t *input, int16_t *app, int16_t *parity, uint32_t long_cb) +{ + __m128i res00, res10, res01, res11, res0, res1; + __m128i in, ap, pa, g1, g0; + + __m128i *inPtr = (__m128i*) input; + __m128i *appPtr = (__m128i*) app; + __m128i *paPtr = (__m128i*) parity; + __m128i *resPtr = (__m128i*) h->branch; + + __m128i res00_mask = _mm_set_epi8(0xff,0xff,7,6,0xff,0xff,5,4,0xff,0xff,3,2,0xff,0xff,1,0); + __m128i res10_mask = _mm_set_epi8(0xff,0xff,15,14,0xff,0xff,13,12,0xff,0xff,11,10,0xff,0xff,9,8); + __m128i res01_mask = _mm_set_epi8(7,6,0xff,0xff,5,4,0xff,0xff,3,2,0xff,0xff,1,0,0xff,0xff); + __m128i res11_mask = _mm_set_epi8(15,14,0xff,0xff,13,12,0xff,0xff,11,10,0xff,0xff,9,8,0xff,0xff); + + for (int i=0;ibranch[2*i], h->branch[2*i+1]); + } + + for (int i=long_cb;ibranch[2*i] = (input[i] - parity[i])/2; + h->branch[2*i+1] = (input[i] + parity[i])/2; + } +} + + +/* Inititalizes constituent decoder object */ +int tdec_sse_init(void **hh, uint32_t max_long_cb) +{ + *hh = calloc(1, sizeof(tdec_sse_t)); + + tdec_sse_t *h = (tdec_sse_t*) *hh; + + h->max_long_cb = max_long_cb; + + h->alpha = srslte_vec_malloc(sizeof(int16_t) * (max_long_cb + TOTALTAIL + 1) * NUMSTATES); + if (!h->alpha) { + perror("srslte_vec_malloc"); + return -1; + } + h->branch = srslte_vec_malloc(sizeof(int16_t) * (max_long_cb + TOTALTAIL + 1) * NUMSTATES); + if (!h->branch) { + perror("srslte_vec_malloc"); + return -1; + } + return 1; +} + +void tdec_sse_free(void *hh) +{ + tdec_sse_t *h = (tdec_sse_t*) hh; + + if (h->alpha) { + free(h->alpha); + } + if (h->branch) { + free(h->branch); + } + bzero(h, sizeof(tdec_sse_t)); +} + +/* Runs one instance of a decoder */ +void tdec_sse_dec(void *hh, int16_t * input, int16_t *app, int16_t * parity, + int16_t *output, uint32_t long_cb) +{ + tdec_sse_t *h = (tdec_sse_t*) hh; + + // Compute branch metrics + tdec_sse_gamma(h, input, app, parity, long_cb); + + // Forward recursion + tdec_sse_alpha(h, long_cb); + + // Backwards recursion + LLR computation + tdec_sse_beta(h, output, long_cb); +} + +/* Deinterleaves the 3 streams from the input (systematic and 2 parity bits) into + * 3 buffers ready to be used by compute_gamma() + */ +void tdec_sse_extract_input(int16_t *input, int16_t *syst0, int16_t *app2, int16_t *parity0, int16_t *parity1, uint32_t long_cb) { + uint32_t i; + + __m128i *inputPtr = (__m128i*) input; + __m128i in0, in1, in2; + __m128i s0, s1, s2, s; + __m128i p00, p01, p02, p0; + __m128i p10, p11, p12, p1; + + __m128i *sysPtr = (__m128i*) syst0; + __m128i *pa0Ptr = (__m128i*) parity0; + __m128i *pa1Ptr = (__m128i*) parity1; + + // pick bits 0, 3, 6 from 1st word + __m128i s0_mask = _mm_set_epi8(0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,13,12,7,6,1,0); + // pick bits 1, 4, 7 from 2st word + __m128i s1_mask = _mm_set_epi8(0xff,0xff,0xff,0xff,15,14,9,8,3,2,0xff,0xff,0xff,0xff,0xff,0xff); + // pick bits 2, 5 from 3rd word + __m128i s2_mask = _mm_set_epi8(11,10,5,4,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff); + + // pick bits 1, 4, 7 from 1st word + __m128i p00_mask = _mm_set_epi8(0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,15,14,9,8,3,2); + // pick bits 2, 5, from 2st word + __m128i p01_mask = _mm_set_epi8(0xff,0xff,0xff,0xff,0xff,0xff,11,10,5,4,0xff,0xff,0xff,0xff,0xff,0xff); + // pick bits 0, 3, 6 from 3rd word + __m128i p02_mask = _mm_set_epi8(13,12,7,6,1,0,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff); + + // pick bits 2, 5 from 1st word + __m128i p10_mask = _mm_set_epi8(0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,11,10,5,4); + // pick bits 0, 3, 6, from 2st word + __m128i p11_mask = _mm_set_epi8(0xff,0xff,0xff,0xff,0xff,0xff,13,12,7,6,1,0,0xff,0xff,0xff,0xff); + // pick bits 1, 4, 7 from 3rd word + __m128i p12_mask = _mm_set_epi8(15,14,9,8,3,2,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff); + + // Split systematic and parity bits + for (i = 0; i < long_cb/8; i++) { + + in0 = _mm_load_si128(inputPtr); inputPtr++; + in1 = _mm_load_si128(inputPtr); inputPtr++; + in2 = _mm_load_si128(inputPtr); inputPtr++; + + /* Deinterleave Systematic bits */ + s0 = _mm_shuffle_epi8(in0, s0_mask); + s1 = _mm_shuffle_epi8(in1, s1_mask); + s2 = _mm_shuffle_epi8(in2, s2_mask); + s = _mm_or_si128(s0, s1); + s = _mm_or_si128(s, s2); + + _mm_store_si128(sysPtr, s); + sysPtr++; + + /* Deinterleave parity 0 bits */ + p00 = _mm_shuffle_epi8(in0, p00_mask); + p01 = _mm_shuffle_epi8(in1, p01_mask); + p02 = _mm_shuffle_epi8(in2, p02_mask); + p0 = _mm_or_si128(p00, p01); + p0 = _mm_or_si128(p0, p02); + + _mm_store_si128(pa0Ptr, p0); + pa0Ptr++; + + /* Deinterleave parity 1 bits */ + p10 = _mm_shuffle_epi8(in0, p10_mask); + p11 = _mm_shuffle_epi8(in1, p11_mask); + p12 = _mm_shuffle_epi8(in2, p12_mask); + p1 = _mm_or_si128(p10, p11); + p1 = _mm_or_si128(p1, p12); + + _mm_store_si128(pa1Ptr, p1); + pa1Ptr++; + + } + + for (i = 0; i < 3; i++) { + syst0[i+long_cb] = input[3*long_cb + 2*i]; + parity0[i+long_cb] = input[3*long_cb + 2*i + 1]; + } + for (i = 0; i < 3; i++) { + app2[i+long_cb] = input[3*long_cb + 6 + 2*i]; + parity1[i+long_cb] = input[3*long_cb + 6 + 2*i + 1]; + } +} + +void tdec_sse_decision_byte(int16_t *app1, uint8_t *output, uint32_t long_cb) +{ + uint8_t mask[8] = {0x80, 0x40, 0x20, 0x10, 0x8, 0x4, 0x2, 0x1}; + + // long_cb is always byte aligned + for (uint32_t i = 0; i < long_cb/8; i++) { + uint8_t out0 = app1[8*i+0]>0?mask[0]:0; + uint8_t out1 = app1[8*i+1]>0?mask[1]:0; + uint8_t out2 = app1[8*i+2]>0?mask[2]:0; + uint8_t out3 = app1[8*i+3]>0?mask[3]:0; + uint8_t out4 = app1[8*i+4]>0?mask[4]:0; + uint8_t out5 = app1[8*i+5]>0?mask[5]:0; + uint8_t out6 = app1[8*i+6]>0?mask[6]:0; + uint8_t out7 = app1[8*i+7]>0?mask[7]:0; + + output[i] = out0 | out1 | out2 | out3 | out4 | out5 | out6 | out7; + } +} + + + + +/*********************** + * + * This is an attempt to parallelize the horizontal max + * by doing a 8x8 tranpose of the vectors and computing max + * in cascade. However since we need to store 16 registers + * for the positive and negative values the performance is not very good + */ + + +#ifdef use_beta_transposed_max + +static inline __m128i transposed_max(__m128i a, __m128i b, __m128i c, __m128i d, + __m128i e, __m128i f, __m128i g, __m128i h) +{ + // Transpose 8 vectors + __m128i t0 = _mm_unpacklo_epi16(a, b); + __m128i t1 = _mm_unpacklo_epi16(c, d); + __m128i t2 = _mm_unpacklo_epi16(e, f); + __m128i t3 = _mm_unpacklo_epi16(g, h); + __m128i t4 = _mm_unpackhi_epi16(a, b); + __m128i t5 = _mm_unpackhi_epi16(c, d); + __m128i t6 = _mm_unpackhi_epi16(e, f); + __m128i t7 = _mm_unpackhi_epi16(g, h); + + __m128i s0 = _mm_unpacklo_epi32(t0, t1); + __m128i s1 = _mm_unpackhi_epi32(t0, t1); + __m128i s2 = _mm_unpacklo_epi32(t2, t3); + __m128i s3 = _mm_unpackhi_epi32(t2, t3); + __m128i s4 = _mm_unpacklo_epi32(t4, t5); + __m128i s5 = _mm_unpackhi_epi32(t4, t5); + __m128i s6 = _mm_unpacklo_epi32(t6, t7); + __m128i s7 = _mm_unpackhi_epi32(t6, t7); + + __m128i x0 = _mm_unpacklo_epi64(s0, s2); + __m128i x1 = _mm_unpackhi_epi64(s0, s2); + __m128i x2 = _mm_unpacklo_epi64(s1, s3); + __m128i x3 = _mm_unpackhi_epi64(s1, s3); + __m128i x4 = _mm_unpacklo_epi64(s4, s6); + __m128i x5 = _mm_unpackhi_epi64(s4, s6); + __m128i x6 = _mm_unpacklo_epi64(s5, s7); + __m128i x7 = _mm_unpackhi_epi64(s5, s7); + + // Cascade max on the transposed vector + __m128i res = _mm_max_epi16(x0, + _mm_max_epi16(x1, + _mm_max_epi16(x2, + _mm_max_epi16(x3, + _mm_max_epi16(x4, + _mm_max_epi16(x5, + _mm_max_epi16(x6, + x7))))))); + + return res; +} + +void tdec_sse_beta(tdec_sse_t * s, int16_t * output, uint32_t long_cb) +{ + int k; + uint32_t end = long_cb + 3; + const __m128i *alphaPtr = (const __m128i*) s->alpha; + + __m128i beta_k = _mm_set_epi16(-INF, -INF, -INF, -INF, -INF, -INF, -INF, 0); + __m128i g, alpha_k; + __m128i bn, bn_0, bn_1, bn_2, bn_3, bn_4, bn_5, bn_6, bn_7; + __m128i bp, bp_0, bp_1, bp_2, bp_3, bp_4, bp_5, bp_6, bp_7; + + /* Define the shuffle constant for the positive beta */ + __m128i shuf_bp = _mm_set_epi8( + 15, 14, // 7 + 7, 6, // 3 + 5, 4, // 2 + 13, 12, // 6 + 11, 10, // 5 + 3, 2, // 1 + 1, 0, // 0 + 9, 8 // 4 + ); + + /* Define the shuffle constant for the negative beta */ + __m128i shuf_bn = _mm_set_epi8( + 7, 6, // 3 + 15, 14, // 7 + 13, 12, // 6 + 5, 4, // 2 + 3, 2, // 1 + 11, 10, // 5 + 9, 8, // 4 + 1, 0 // 0 + ); + + alphaPtr += long_cb-1; + + /* Define shuffle for branch costs */ + __m128i shuf_g[4]; + shuf_g[3] = _mm_set_epi8(3,2,1,0,1,0,3,2,3,2,1,0,1,0,3,2); + shuf_g[2] = _mm_set_epi8(7,6,5,4,5,4,7,6,7,6,5,4,5,4,7,6); + shuf_g[1] = _mm_set_epi8(11,10,9,8,9,8,11,10,11,10,9,8,9,8,11,10); + shuf_g[0] = _mm_set_epi8(15,14,13,12,13,12,15,14,15,14,13,12,13,12,15,14); + __m128i gv; + int16_t *b = &s->branch[2*long_cb-8]; + __m128i *gPtr = (__m128i*) b; + /* Define shuffle for beta normalization */ + __m128i shuf_norm = _mm_set_epi8(1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0); + + /* This defines a beta computation step: + * Adds and substracts the branch metrics to the previous beta step, + * shuffles the states according to the trellis path and selects maximum state + */ +#define BETA_STEP(g) bp = _mm_add_epi16(beta_k, g);\ + bn = _mm_sub_epi16(beta_k, g);\ + bp = _mm_shuffle_epi8(bp, shuf_bp);\ + bn = _mm_shuffle_epi8(bn, shuf_bn);\ + beta_k = _mm_max_epi16(bp, bn); + + /* Loads the alpha metrics from memory and adds them to the temporal bn and bp + * metrics. + */ +#define BETA_STEP_CNT(c,d) g = _mm_shuffle_epi8(gv, shuf_g[c]);\ + BETA_STEP(g)\ + alpha_k = _mm_load_si128(alphaPtr);\ + alphaPtr--;\ + bp_##d = _mm_add_epi16(bp, alpha_k);\ + bn_##d = _mm_add_epi16(bn, alpha_k);\ + + /* The tail does not require to load alpha or produce outputs. Only update + * beta metrics accordingly */ + for (k=end-1; k>=long_cb; k--) { + int16_t g0 = s->branch[2*k]; + int16_t g1 = s->branch[2*k+1]; + g = _mm_set_epi16(g1, g0, g0, g1, g1, g0, g0, g1); + BETA_STEP(g); + } + + /* We inline 2 trelis steps for each normalization */ + __m128i norm; + __m128i *outPtr = (__m128i*) &output[long_cb-8]; + for (; k >= 0; k-=8) { + gv = _mm_load_si128(gPtr); + gPtr--; + + BETA_STEP_CNT(0,0); + BETA_STEP_CNT(1,1); + BETA_STEP_CNT(2,2); + BETA_STEP_CNT(3,3); + norm = _mm_shuffle_epi8(beta_k, shuf_norm); + beta_k = _mm_sub_epi16(beta_k, norm); + gv = _mm_load_si128(gPtr); + gPtr--; + BETA_STEP_CNT(0,4); + BETA_STEP_CNT(1,5); + BETA_STEP_CNT(2,6); + BETA_STEP_CNT(3,7); + norm = _mm_shuffle_epi8(beta_k, shuf_norm); + beta_k = _mm_sub_epi16(beta_k, norm); + + __m128i bn_transp = transposed_max(bn_7, bn_6, bn_5, bn_4, bn_3, bn_2, bn_1, bn_0); + __m128i bp_transp = transposed_max(bp_7, bp_6, bp_5, bp_4, bp_3, bp_2, bp_1, bp_0); + __m128i outval = _mm_sub_epi16(bp_transp,bn_transp); + _mm_store_si128(outPtr, outval); + outPtr--; + } +} +#endif + + + + +#endif + + diff --git a/lib/src/phy/fec/turbodecoder_sse_inter.c b/lib/src/phy/fec/turbodecoder_sse_inter.c deleted file mode 100644 index d75c8a649..000000000 --- a/lib/src/phy/fec/turbodecoder_sse_inter.c +++ /dev/null @@ -1,202 +0,0 @@ -/** - * - * \section COPYRIGHT - * - * Copyright 2013-2015 Software Radio Systems Limited - * - * \section LICENSE - * - * This file is part of the srsLTE library. - * - * srsLTE is free software: you can redistribute it and/or modify - * it under the terms of the GNU Affero General Public License as - * published by the Free Software Foundation, either version 3 of - * the License, or (at your option) any later version. - * - * srsLTE is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Affero General Public License for more details. - * - * A copy of the GNU Affero General Public License can be found in - * the LICENSE file in the top-level directory of this distribution - * and at http://www.gnu.org/licenses/. - * - */ - -#include -#include -#include -#include -#include -#include - -#include "srslte/phy/fec/turbodecoder_simd_inter.h" -#include "srslte/phy/utils/vector.h" - - -#define NCB 8 - -#define INF 10000 - -#ifdef LV_HAVE_SSE -#include - -void sse_inter_extract_syst1(srslte_tdec_simd_inter_t *h, uint16_t *inter, uint32_t long_cb) -{ - __m128i *llr1Ptr = (__m128i*) h->llr1; - __m128i *wPtr = (__m128i*) h->w; - __m128i *syst1Ptr = (__m128i*) h->syst1; - - for (int i = 0; i < long_cb; i++) { - __m128i llr1 = _mm_load_si128(&llr1Ptr[inter[i]]); - __m128i w = _mm_load_si128(&wPtr[inter[i]]); - _mm_store_si128(syst1Ptr++, _mm_sub_epi16(llr1, w)); - } -} - -void sse_inter_update_w(srslte_tdec_simd_inter_t *h, uint16_t *deinter, uint32_t long_cb) -{ - __m128i *llr1Ptr = (__m128i*) h->llr1; - __m128i *llr2Ptr = (__m128i*) h->llr2; - __m128i *wPtr = (__m128i*) h->w; - __m128i *syst1Ptr = (__m128i*) h->syst1; - - for (int i = 0; i < long_cb; i++) { - __m128i llr1 = _mm_load_si128(llr1Ptr++); - __m128i w = _mm_load_si128(wPtr++); - __m128i llr2 = _mm_load_si128(&llr2Ptr[deinter[i]]); - - _mm_store_si128(syst1Ptr++, _mm_add_epi16(w, _mm_sub_epi16(llr2, llr1))); - } -} - -/* Computes beta values */ -void map_sse_inter_beta(srslte_tdec_simd_inter_t * s, int16_t *input, int16_t *parity, int16_t * output, uint32_t long_cb) -{ - __m128i m_b[8], new[8], old[8], max1[8], max0[8]; - __m128i x, y, xy; - __m128i m1, m0; - uint32_t end = long_cb + 3; - uint32_t i; - - __m128i *inputPtr = (__m128i*) input; - __m128i *parityPtr = (__m128i*) parity; - __m128i *outputPtr = (__m128i*) output; - __m128i *alphaPtr = (__m128i*) s->alpha; - - for (int i = 0; i < 8; i++) { - old[i] = _mm_set1_epi16(0); - } - - for (int k = end - 1; k >= 0; k--) { - x = _mm_load_si128(inputPtr++); - y = _mm_load_si128(parityPtr++); - - xy = _mm_add_epi16(x,y); - - m_b[0] = _mm_add_epi16(old[4], xy); - m_b[1] = old[4]; - m_b[2] = _mm_add_epi16(old[5], y); - m_b[3] = _mm_add_epi16(old[5], x); - m_b[4] = _mm_add_epi16(old[6], x); - m_b[5] = _mm_add_epi16(old[6], y); - m_b[6] = old[7]; - m_b[7] = _mm_add_epi16(old[7], xy); - - new[0] = old[0]; - new[1] = _mm_add_epi16(old[0], xy); - new[2] = _mm_add_epi16(old[1], x); - new[3] = _mm_add_epi16(old[1], y); - new[4] = _mm_add_epi16(old[2], y); - new[5] = _mm_add_epi16(old[2], x); - new[6] = _mm_add_epi16(old[3], xy); - new[7] = old[3]; - - for (i = 0; i < 8; i++) { - __m128i alpha = _mm_load_si128(alphaPtr++); - max0[i] = _mm_add_epi16(alpha, m_b[i]); - max1[i] = _mm_add_epi16(alpha, new[i]); - } - - m1 = _mm_max_epi16(max1[0], max1[1]); - m0 = _mm_max_epi16(max0[0], max0[1]); - - for (i = 2; i < 8; i++) { - m1 = _mm_max_epi16(m1, max1[i]); - m0 = _mm_max_epi16(m0, max0[i]); - } - - for (i = 0; i < 8; i++) { - new[i] = _mm_max_epi16(m_b[i], new[i]); - old[i] = new[i]; - } - - __m128i out = _mm_sub_epi16(m1, m0); - _mm_store_si128(outputPtr++, out); - - // normalize - if ((k%4)==0) { - for (int i=1;i<8;i++) { - _mm_sub_epi16(old[i], old[0]); - } - } - } -} - -/* Computes alpha metrics */ -void map_see_inter_alpha(srslte_tdec_simd_inter_t * s, int16_t *input, int16_t *parity, uint32_t long_cb) -{ - __m128i m_b[8], new[8], old[8]; - __m128i x, y, xy; - uint32_t k; - - __m128i *inputPtr = (__m128i*) input; - __m128i *parityPtr = (__m128i*) parity; - __m128i *alphaPtr = (__m128i*) s->alpha; - - old[0] = _mm_set1_epi16(0); - for (int i = 1; i < 8; i++) { - old[i] = _mm_set1_epi16(-INF); - } - - for (k = 0; k < long_cb; k++) { - x = _mm_load_si128(inputPtr++); - y = _mm_load_si128(parityPtr++); - - xy = _mm_add_epi16(x,y); - - m_b[0] = old[0]; - m_b[1] = _mm_add_epi16(old[3], y); - m_b[2] = _mm_add_epi16(old[4], y); - m_b[3] = old[7]; - m_b[4] = old[1]; - m_b[5] = _mm_add_epi16(old[2], y); - m_b[6] = _mm_add_epi16(old[5], y); - m_b[7] = old[6]; - - new[0] = _mm_add_epi16(old[1], xy); - new[1] = _mm_add_epi16(old[2], x); - new[2] = _mm_add_epi16(old[5], x); - new[3] = _mm_add_epi16(old[6], xy); - new[4] = _mm_add_epi16(old[0], xy); - new[5] = _mm_add_epi16(old[3], x); - new[6] = _mm_add_epi16(old[4], x); - new[7] = _mm_add_epi16(old[7], xy); - - for (int i = 0; i < 8; i++) { - new[i] = _mm_max_epi16(m_b[i], new[i]); - old[i] = new[i]; - _mm_store_si128(alphaPtr++, old[i]); - } - - // normalize - if ((k%4)==0) { - for (int i=1;i<8;i++) { - _mm_sub_epi16(old[i], old[0]); - } - } - } -} - -#endif diff --git a/lib/src/phy/modem/demod_soft.c b/lib/src/phy/modem/demod_soft.c index 0ea3ce938..d7edd78eb 100644 --- a/lib/src/phy/modem/demod_soft.c +++ b/lib/src/phy/modem/demod_soft.c @@ -32,18 +32,26 @@ #include "srslte/phy/utils/bit.h" #include "srslte/phy/modem/demod_soft.h" -// AVX implementation not useful for integers. Wait for AVX2 #ifdef LV_HAVE_SSE #include void demod_16qam_lte_s_sse(const cf_t *symbols, short *llr, int nsymbols); #endif - #define SCALE_SHORT_CONV_QPSK 100 #define SCALE_SHORT_CONV_QAM16 400 #define SCALE_SHORT_CONV_QAM64 700 +#define SCALE_BYTE_CONV_QPSK 20 +#define SCALE_BYTE_CONV_QAM16 30 +#define SCALE_BYTE_CONV_QAM64 40 + +void demod_bpsk_lte_b(const cf_t *symbols, int8_t *llr, int nsymbols) { + for (int i=0;i 0) { mean_texec_s = SRSLTE_VEC_CMA((float) t[0].tv_usec, mean_texec_s, n-1); } - + + gettimeofday(&t[1], NULL); + srslte_demod_soft_demodulate_b(modulation, symbols, llr_b, num_bits / mod.nbits_x_symbol); + gettimeofday(&t[2], NULL); + get_time_interval(t); + + if (n > 0) { + mean_texec_b = SRSLTE_VEC_CMA((float) t[0].tv_usec, mean_texec_b, n-1); + } + if (SRSLTE_VERBOSE_ISDEBUG()) { printf("bits="); srslte_vec_fprint_b(stdout, input, num_bits); @@ -200,6 +217,9 @@ int main(int argc, char **argv) { printf("llr_s="); srslte_vec_fprint_s(stdout, llr_s, num_bits); + printf("llr_b="); + srslte_vec_fprint_bs(stdout, llr_b, num_bits); + } // Check demodulation errors @@ -212,7 +232,9 @@ int main(int argc, char **argv) { } ret = 0; -clean_exit: +clean_exit: + free(llr_b); + free(llr_s); free(llr); free(symbols); free(output); @@ -220,7 +242,7 @@ clean_exit: srslte_modem_table_free(&mod); - printf("Mean Throughput: %.2f/%.2f. Mbps ExTime: %.2f/%.2f us\n", - num_bits/mean_texec, num_bits/mean_texec_s, mean_texec, mean_texec_s); + printf("Mean Throughput: %.2f/%.2f/%.2f. Mbps ExTime: %.2f/%.2f/%.2f us\n", + num_bits/mean_texec, num_bits/mean_texec_s, num_bits/mean_texec_b, mean_texec, mean_texec_s, mean_texec_b); exit(ret); } diff --git a/lib/src/phy/phch/pdsch.c b/lib/src/phy/phch/pdsch.c index e6dd9f52b..360664819 100644 --- a/lib/src/phy/phch/pdsch.c +++ b/lib/src/phy/phch/pdsch.c @@ -669,6 +669,50 @@ static int srslte_pdsch_codeword_encode(srslte_pdsch_t *q, srslte_pdsch_cfg_t *c return SRSLTE_SUCCESS; } +static void csi_correction(srslte_pdsch_t *q, srslte_pdsch_cfg_t *cfg, uint32_t codeword_idx, uint32_t tb_idx, void *e) +{ + + srslte_ra_nbits_t *nbits = &cfg->nbits[tb_idx]; + uint32_t qm = 0; + switch(cfg->grant.mcs[tb_idx].mod) { + + case SRSLTE_MOD_BPSK: + qm = 1; + break; + case SRSLTE_MOD_QPSK: + qm = 2; + break; + case SRSLTE_MOD_16QAM: + qm = 4; + break; + case SRSLTE_MOD_64QAM: + qm = 6; + break; + default: + ERROR("No modulation"); + } + + const uint32_t csi_max_idx = srslte_vec_max_fi(q->csi[codeword_idx], nbits->nof_bits / qm); + float csi_max = 1.0f; + if (csi_max_idx < nbits->nof_bits / qm) { + csi_max = q->csi[codeword_idx][csi_max_idx]; + } + int8_t *e_b = e; + int16_t *e_s = e; + for (int i = 0; i < nbits->nof_bits / qm; i++) { + const float csi = q->csi[codeword_idx][i] / csi_max; + if (q->llr_is_8bit) { + for (int k = 0; k < qm; k++) { + e_b[qm * i + k] = (int8_t) ((float) e_b[qm * i + k] * csi); + } + } else { + for (int k = 0; k < qm; k++) { + e_s[qm * i + k] = (int16_t) ((float) e_s[qm * i + k] * csi); + } + } + } +} + static int srslte_pdsch_codeword_decode(srslte_pdsch_t *q, srslte_pdsch_cfg_t *cfg, srslte_sch_t *dl_sch, srslte_softbuffer_rx_t *softbuffer, uint16_t rnti, uint8_t *data, uint32_t codeword_idx, uint32_t tb_idx, bool *ack) { @@ -686,47 +730,24 @@ static int srslte_pdsch_codeword_decode(srslte_pdsch_t *q, srslte_pdsch_cfg_t *c * The MAX-log-MAP algorithm used in turbo decoding is unsensitive to SNR estimation, * thus we don't need tot set it in the LLRs normalization */ - srslte_demod_soft_demodulate_s(mcs->mod, q->d[codeword_idx], q->e[codeword_idx], nbits->nof_re); + if (q->llr_is_8bit) { + srslte_demod_soft_demodulate_b(mcs->mod, q->d[codeword_idx], q->e[codeword_idx], nbits->nof_re); + } else { + srslte_demod_soft_demodulate_s(mcs->mod, q->d[codeword_idx], q->e[codeword_idx], nbits->nof_re); + } /* Select scrambling sequence */ srslte_sequence_t *seq = get_user_sequence(q, rnti, codeword_idx, cfg->sf_idx, nbits->nof_bits); /* Bit scrambling */ - srslte_scrambling_s_offset(seq, q->e[codeword_idx], 0, nbits->nof_bits); - - uint32_t qm = 0; - switch(cfg->grant.mcs[tb_idx].mod) { - - case SRSLTE_MOD_BPSK: - qm = 1; - break; - case SRSLTE_MOD_QPSK: - qm = 2; - break; - case SRSLTE_MOD_16QAM: - qm = 4; - break; - case SRSLTE_MOD_64QAM: - qm = 6; - break; - default: - ERROR("No modulation"); + if (q->llr_is_8bit) { + srslte_scrambling_sb_offset(seq, q->e[codeword_idx], 0, nbits->nof_bits); + } else { + srslte_scrambling_s_offset(seq, q->e[codeword_idx], 0, nbits->nof_bits); } - int16_t *e = q->e[codeword_idx]; - if (q->csi_enabled) { - const uint32_t csi_max_idx = srslte_vec_max_fi(q->csi[codeword_idx], nbits->nof_bits / qm); - float csi_max = 1.0f; - if (csi_max_idx < nbits->nof_bits / qm) { - csi_max = q->csi[codeword_idx][csi_max_idx]; - } - for (int i = 0; i < nbits->nof_bits / qm; i++) { - const float csi = q->csi[codeword_idx][i] / csi_max; - for (int k = 0; k < qm; k++) { - e[qm * i + k] = (int16_t) ((float) e[qm * i + k] * csi); - } - } + csi_correction(q, cfg, codeword_idx, tb_idx, q->e[codeword_idx]); } /* Return */ diff --git a/lib/src/phy/phch/pusch.c b/lib/src/phy/phch/pusch.c index 43bd08537..42022efc0 100644 --- a/lib/src/phy/phch/pusch.c +++ b/lib/src/phy/phch/pusch.c @@ -603,7 +603,11 @@ int srslte_pusch_decode(srslte_pusch_t *q, srslte_dft_precoding(&q->dft_precoding, q->z, q->d, cfg->grant.L_prb, cfg->nbits.nof_symb); // Soft demodulation - srslte_demod_soft_demodulate_s(cfg->grant.mcs.mod, q->d, q->q, cfg->nbits.nof_re); + if (q->llr_is_8bit) { + srslte_demod_soft_demodulate_b(cfg->grant.mcs.mod, q->d, q->q, cfg->nbits.nof_re); + } else { + srslte_demod_soft_demodulate_s(cfg->grant.mcs.mod, q->d, q->q, cfg->nbits.nof_re); + } // Generate scrambling sequence if not pre-generated srslte_sequence_t *seq = get_user_sequence(q, rnti, cfg->sf_idx, cfg->nbits.nof_bits); @@ -632,7 +636,11 @@ int srslte_pusch_decode(srslte_pusch_t *q, } // Descrambling - srslte_scrambling_s_offset(seq, q->q, 0, cfg->nbits.nof_bits); + if (q->llr_is_8bit) { + srslte_scrambling_sb_offset(seq, q->q, 0, cfg->nbits.nof_bits); + } else { + srslte_scrambling_s_offset(seq, q->q, 0, cfg->nbits.nof_bits); + } // Decode ret = srslte_ulsch_uci_decode(&q->ul_sch, cfg, softbuffer, q->q, q->g, data, uci_data); diff --git a/lib/src/phy/phch/sch.c b/lib/src/phy/phch/sch.c index 2d7290e9d..f9ea054a0 100644 --- a/lib/src/phy/phch/sch.c +++ b/lib/src/phy/phch/sch.c @@ -32,12 +32,18 @@ #include #include #include +#include #include "srslte/phy/phch/pdsch.h" #include "srslte/phy/utils/bit.h" #include "srslte/phy/utils/debug.h" #include "srslte/phy/utils/vector.h" -#define SRSLTE_PDSCH_MAX_TDEC_ITERS 4 +#define SRSLTE_PDSCH_MAX_TDEC_ITERS 10 + +#ifdef LV_HAVE_SSE +#include +#endif /* LV_HAVE_SSE */ + /* 36.213 Table 8.6.3-1: Mapping of HARQ-ACK offset values and the index signalled by higher layers */ float beta_harq_offset[16] = {2.0, 2.5, 3.125, 4.0, 5.0, 6.250, 8.0, 10.0, @@ -184,8 +190,6 @@ static int encode_tb_off(srslte_sch_t *q, uint32_t Qm, uint32_t rv, uint32_t nof_e_bits, uint8_t *data, uint8_t *e_bits, uint32_t w_offset) { - uint8_t parity[3] = {0, 0, 0}; - uint32_t par; uint32_t i; uint32_t cb_len=0, rp=0, wp=0, rlen=0, n_e=0; int ret = SRSLTE_ERROR_INVALID_INPUTS; @@ -213,17 +217,9 @@ static int encode_tb_off(srslte_sch_t *q, gamma = Gp%cb_segm->C; } - if (data) { + /* Reset TB CRC */ + srslte_crc_set_init(&q->crc_tb, 0); - /* Compute transport block CRC */ - par = srslte_crc_checksum_byte(&q->crc_tb, data, cb_segm->tbs); - - /* parity bits will be appended later */ - parity[0] = (par&(0xff<<16))>>16; - parity[1] = (par&(0xff<<8))>>8; - parity[2] = par&0xff; - } - wp = 0; rp = 0; for (i = 0; i < cb_segm->C; i++) { @@ -252,6 +248,7 @@ static int encode_tb_off(srslte_sch_t *q, cb_len, rlen, wp, rp, n_e); if (data) { + bool last_cb = false; /* Copy data to another buffer, making space for the Codeblock CRC */ if (i < cb_segm->C - 1) { @@ -263,13 +260,19 @@ static int encode_tb_off(srslte_sch_t *q, /* Append Transport Block parity bits to the last CB */ memcpy(q->cb_in, &data[rp/8], (rlen - 24) * sizeof(uint8_t)/8); - memcpy(&q->cb_in[(rlen - 24)/8], parity, 3 * sizeof(uint8_t)); + last_cb = true; } /* Turbo Encoding * If Codeblock CRC is required it is given the CRC instance pointer, otherwise CRC pointer shall be NULL */ - srslte_tcod_encode_lut(&q->encoder, (cb_segm->C > 1) ? &q->crc_cb : NULL, q->cb_in, q->parity_bits, cblen_idx); + srslte_tcod_encode_lut(&q->encoder, + &q->crc_tb, + (cb_segm->C > 1) ? &q->crc_cb : NULL, + q->cb_in, + q->parity_bits, + cblen_idx, + last_cb); } DEBUG("RM cblen_idx=%d, n_e=%d, wp=%d, nof_e_bits=%d\n",cblen_idx, n_e, wp, nof_e_bits); @@ -304,142 +307,117 @@ static int encode_tb(srslte_sch_t *q, bool decode_tb_cb(srslte_sch_t *q, srslte_softbuffer_rx_t *softbuffer, srslte_cbsegm_t *cb_segm, uint32_t Qm, uint32_t rv, uint32_t nof_e_bits, - int16_t *e_bits, uint8_t *data, - uint32_t cb_size_group) + void *e_bits, uint8_t *data) { - bool cb_map[SRSLTE_MAX_CODEBLOCKS]; - - uint32_t cb_idx[SRSLTE_TDEC_MAX_NPAR]; - int16_t *decoder_input[SRSLTE_TDEC_MAX_NPAR]; - - uint32_t nof_cb = cb_size_group?cb_segm->C2:cb_segm->C1; - uint32_t first_cb = cb_size_group?cb_segm->C1:0; - uint32_t cb_len = cb_size_group?cb_segm->K2:cb_segm->K1; - uint32_t cb_len_idx = cb_size_group?cb_segm->K2_idx:cb_segm->K1_idx; - - uint32_t rlen = cb_segm->C==1?cb_len:(cb_len-24); - uint32_t Gp = nof_e_bits / Qm; - uint32_t gamma = cb_segm->C>0?Gp%cb_segm->C:Gp; - uint32_t n_e = Qm * (Gp/cb_segm->C); - - if (nof_cb > SRSLTE_MAX_CODEBLOCKS) { + int8_t *e_bits_b = e_bits; + int16_t *e_bits_s = e_bits; + + if (cb_segm->C > SRSLTE_MAX_CODEBLOCKS) { fprintf(stderr, "Error SRSLTE_MAX_CODEBLOCKS=%d\n", SRSLTE_MAX_CODEBLOCKS); - return false; - } - - for (int i=0;idecoder);i++) { - cb_idx[i] = i+first_cb; - decoder_input[i] = NULL; - } - - uint32_t remaining_cb = 0; - for (int i=0;icb_crc[i]; - if (softbuffer->cb_crc[i] == false) { - remaining_cb ++; - } + return false; } - - srslte_tdec_reset(&q->decoder, cb_len); - + q->nof_iterations = 0; - while(remaining_cb>0) { - - // Unratematch the codeblocks left to decode - for (int i=0;idecoder);i++) { - - if (!decoder_input[i] && remaining_cb > 0) { - // Find an unprocessed CB - cb_idx[i]=first_cb; - while(cb_idx[i]C;cb_idx++) + { + /* Do not process blocks with CRC Ok */ + if (softbuffer->cb_crc[cb_idx] == false) { + + uint32_t cb_len = cb_idxC1?cb_segm->K1:cb_segm->K2; + uint32_t cb_len_idx = cb_idxC1?cb_segm->K1_idx:cb_segm->K2_idx; + + uint32_t rlen = cb_segm->C==1?cb_len:(cb_len-24); + uint32_t Gp = nof_e_bits / Qm; + uint32_t gamma = cb_segm->C>0?Gp%cb_segm->C:Gp; + uint32_t n_e = Qm * (Gp/cb_segm->C); + + uint32_t rp = cb_idx*n_e; + uint32_t n_e2 = n_e; + + if (cb_idx > cb_segm->C - gamma) { + n_e2 = n_e+Qm; + rp = (cb_segm->C - gamma)*n_e + (cb_idx-(cb_segm->C - gamma))*n_e2; + } + + if (q->llr_is_8bit) { + if (srslte_rm_turbo_rx_lut_8bit(&e_bits_b[rp], (int8_t*) softbuffer->buffer_f[cb_idx], n_e2, cb_len_idx, rv)) { + fprintf(stderr, "Error in rate matching\n"); + return SRSLTE_ERROR; } - if (cb_map[cb_idx[i]] == false) { - cb_map[cb_idx[i]] = true; - - uint32_t rp = cb_idx[i]*n_e; - uint32_t n_e2 = n_e; - - if (cb_idx[i] > cb_segm->C - gamma) { - n_e2 = n_e+Qm; - rp = (cb_segm->C - gamma)*n_e + (cb_idx[i]-(cb_segm->C - gamma))*n_e2; - } - - INFO("CB %d: rp=%d, n_e=%d, i=%d\n", cb_idx[i], rp, n_e2, i); - if (srslte_rm_turbo_rx_lut(&e_bits[rp], softbuffer->buffer_f[cb_idx[i]], n_e2, cb_len_idx, rv)) { - fprintf(stderr, "Error in rate matching\n"); - return SRSLTE_ERROR; - } - - decoder_input[i] = softbuffer->buffer_f[cb_idx[i]]; + } else { + if (srslte_rm_turbo_rx_lut(&e_bits_s[rp], softbuffer->buffer_f[cb_idx], n_e2, cb_len_idx, rv)) { + fprintf(stderr, "Error in rate matching\n"); + return SRSLTE_ERROR; } } - } - - // Run 1 iteration for the codeblocks in queue - srslte_tdec_iteration_par(&q->decoder, decoder_input, cb_len); - // Decide output bits and compute CRC - for (int i=0;idecoder);i++) { - if (decoder_input[i]) { - srslte_tdec_decision_byte_par_cb(&q->decoder, q->cb_in, i, cb_len); + srslte_tdec_new_cb(&q->decoder, cb_len); + + // Run iterations and use CRC for early stopping + bool early_stop = false; + uint32_t cb_noi = 0; + do { + if (q->llr_is_8bit) { + srslte_tdec_iteration_8bit(&q->decoder, (int8_t*) softbuffer->buffer_f[cb_idx], &data[cb_idx*rlen/8]); + } else { + srslte_tdec_iteration(&q->decoder, softbuffer->buffer_f[cb_idx], &data[cb_idx*rlen/8]); + } + q->nof_iterations++; + cb_noi++; + + uint32_t len_crc; + srslte_crc_t *crc_ptr; - uint32_t len_crc; - srslte_crc_t *crc_ptr; - if (cb_segm->C > 1) { - len_crc = cb_len; - crc_ptr = &q->crc_cb; + len_crc = cb_len; + crc_ptr = &q->crc_cb; } else { - len_crc = cb_segm->tbs+24; - crc_ptr = &q->crc_tb; + len_crc = cb_segm->tbs+24; + crc_ptr = &q->crc_tb; } // CRC is OK - if (!srslte_crc_checksum_byte(crc_ptr, q->cb_in, len_crc)) { - - memcpy(softbuffer->data[cb_idx[i]], q->cb_in, rlen/8 * sizeof(uint8_t)); - softbuffer->cb_crc[cb_idx[i]] = true; - - q->nof_iterations += srslte_tdec_get_nof_iterations_cb(&q->decoder, i); - - // Reset number of iterations for that CB in the decoder - srslte_tdec_reset_cb(&q->decoder, i); - remaining_cb--; - decoder_input[i] = NULL; - cb_idx[i] = 0; - - // CRC is error and exceeded maximum iterations for this CB. - // Early stop the whole transport block. - } else if (srslte_tdec_get_nof_iterations_cb(&q->decoder, i) >= q->max_iterations) { - INFO("CB %d: Error. CB is erroneous. remaining_cb=%d, i=%d, first_cb=%d, nof_cb=%d\n", - cb_idx[i], remaining_cb, i, first_cb, nof_cb); - - q->nof_iterations += q->max_iterations; - srslte_tdec_reset_cb(&q->decoder, i); - remaining_cb--; - decoder_input[i] = NULL; - cb_idx[i] = 0; + if (!srslte_crc_checksum_byte(crc_ptr, &data[cb_idx*rlen/8], len_crc)) { + + softbuffer->cb_crc[cb_idx] = true; + early_stop = true; + + // CRC is error and exceeded maximum iterations for this CB. + // Early stop the whole transport block. } - } - } + + } while (cb_noi < q->max_iterations && !early_stop); + + INFO("CB %d: rp=%d, n_e=%d, cb_len=%d, CRC=%s, rlen=%d, iterations=%d/%d\n", + cb_idx, rp, n_e2, cb_len, early_stop?"OK":"KO", rlen, cb_noi, q->max_iterations); + + } else { + // Copy decoded data from previous transmissions + uint32_t cb_len = cb_idxC1?cb_segm->K1:cb_segm->K2; + uint32_t rlen = cb_segm->C==1?cb_len:(cb_len-24); + memcpy(&data[cb_idx*rlen/8], softbuffer->data[cb_idx], rlen/8 * sizeof(uint8_t)); + } } softbuffer->tb_crc = true; - for (int i = 0; i < nof_cb && softbuffer->tb_crc; i++) { + for (int i = 0; i < cb_segm->C && softbuffer->tb_crc; i++) { /* If one CB failed return false */ softbuffer->tb_crc = softbuffer->cb_crc[i]; } - if (softbuffer->tb_crc) { - for (int i = 0; i < nof_cb; i++) { - memcpy(&data[i * rlen / 8], softbuffer->data[i], rlen/8 * sizeof(uint8_t)); + // If TB CRC failed, save correct CB for next retransmission + if (!softbuffer->tb_crc) { + for (int i = 0; i < cb_segm->C; i++) { + if (softbuffer->cb_crc[i]) { + uint32_t cb_len = iC1?cb_segm->K1:cb_segm->K2; + uint32_t rlen = cb_segm->C==1?cb_len:(cb_len-24); + memcpy(softbuffer->data[i], &data[i * rlen / 8], rlen/8 * sizeof(uint8_t)); + } } } - q->nof_iterations /= nof_cb; + q->nof_iterations /= cb_segm->C; return softbuffer->tb_crc; } @@ -484,18 +462,14 @@ static int decode_tb(srslte_sch_t *q, } bool crc_ok = true; - - uint32_t nof_cb_groups = cb_segm->C2>0?2:1; - + data[cb_segm->tbs/8+0] = 0; data[cb_segm->tbs/8+1] = 0; data[cb_segm->tbs/8+2] = 0; - // Process Codeblocks in groups of equal CB size to parallelize according to SRSLTE_TDEC_MAX_NPAR - for (uint32_t i=0;i> (6 - read_bit_idx)) & (uint8_t) 0x03; + q_bits[write_byte_idx] |= w << (6 - write_bit_idx); + + bit_read_idx += 2; + } + } + + for (uint32_t j = ri_min_row; j < rows; j++) { + for (uint32_t i = 0; i < cols; i++) { + uint32_t k = (i * rows + j) * 2; + + if (ri_present[k]) { + /* do nothing */ + } else { + uint32_t read_byte_idx = bit_read_idx / 8; + uint32_t read_bit_idx = bit_read_idx % 8; + uint32_t write_byte_idx = k / 8; + uint32_t write_bit_idx = k % 8; + uint8_t w = (g_bits[read_byte_idx] >> (6 - read_bit_idx)) & (uint8_t) 0x03; + q_bits[write_byte_idx] |= w << (6 - write_bit_idx); + + bit_read_idx += 2; + } + } + } +} + +static void ulsch_interleave_qm4(uint8_t *g_bits, uint32_t rows, uint32_t cols, uint8_t *q_bits, uint32_t ri_min_row, const uint8_t *ri_present) { + uint32_t bit_read_idx = 0; + + for (uint32_t j = 0; j < ri_min_row; j++) { + int32_t i = 0; + +#ifndef LV_HAVE_SSE + __m128i _counter = _mm_slli_epi32(_mm_add_epi32(_mm_mullo_epi32(_counter0,_rows),_mm_set1_epi32(j)), 2); + uint8_t *_g_bits = &g_bits[bit_read_idx/8]; + + /* First bits are aligned to byte */ + if (0 == (bit_read_idx & 0x3)) { + for (; i < (cols - 3); i += 4) { + + uint8_t w1 = *(_g_bits++); + uint8_t w2 = *(_g_bits++); + + __m128i _write_byte_idx = _mm_srli_epi32(_counter, 3); + __m128i _write_bit_idx = _mm_and_si128(_counter, _7); + __m128i _write_shift = _mm_sub_epi32(_4, _write_bit_idx); + + q_bits[_mm_extract_epi32(_write_byte_idx, 0)] |= (w1 >> 0x4) << _mm_extract_epi32(_write_shift, 0); + q_bits[_mm_extract_epi32(_write_byte_idx, 1)] |= (w1 & 0xf) << _mm_extract_epi32(_write_shift, 1); + q_bits[_mm_extract_epi32(_write_byte_idx, 2)] |= (w2 >> 0x4) << _mm_extract_epi32(_write_shift, 2); + q_bits[_mm_extract_epi32(_write_byte_idx, 3)] |= (w2 & 0xf) << _mm_extract_epi32(_write_shift, 3); + _counter = _mm_add_epi32(_counter, _inc); + } + } else { + for (; i < (cols - 3); i += 4) { + __m128i _write_byte_idx = _mm_srli_epi32(_counter, 3); + __m128i _write_bit_idx = _mm_and_si128(_counter, _7); + __m128i _write_shift = _mm_sub_epi32(_4, _write_bit_idx); + + uint8_t w1 = *(_g_bits); + uint8_t w2 = *(_g_bits++); + uint8_t w3 = *(_g_bits++); + q_bits[_mm_extract_epi32(_write_byte_idx, 0)] |= (w1 & 0xf) << _mm_extract_epi32(_write_shift, 0); + q_bits[_mm_extract_epi32(_write_byte_idx, 1)] |= (w2 >> 0x4) << _mm_extract_epi32(_write_shift, 1); + q_bits[_mm_extract_epi32(_write_byte_idx, 2)] |= (w2 & 0xf) << _mm_extract_epi32(_write_shift, 2); + q_bits[_mm_extract_epi32(_write_byte_idx, 3)] |= (w3 >> 0x4) << _mm_extract_epi32(_write_shift, 3); + + _counter = _mm_add_epi32(_counter, _inc); + } + } + bit_read_idx += i * 4; +#endif /* LV_HAVE_SSE */ + + /* Spare bits */ + for (; i < cols; i++) { + uint32_t k = (i * rows + j) * 4; + + uint32_t read_byte_idx = bit_read_idx / 8; + uint32_t read_bit_idx = bit_read_idx % 8; + uint32_t write_byte_idx = k / 8; + uint32_t write_bit_idx = k % 8; + uint8_t w = (g_bits[read_byte_idx] >> (4 - read_bit_idx)) & (uint8_t) 0x0f; + q_bits[write_byte_idx] |= w << (4 - write_bit_idx); + + bit_read_idx += 4; + } + } + + /* Do rows containing RI */ + for (uint32_t j = ri_min_row; j < rows; j++) { + for (uint32_t i = 0; i < cols; i++) { + uint32_t k = (i * rows + j) * 4; + + if (ri_present[k]) { + /* do nothing */ + } else { + uint32_t read_byte_idx = bit_read_idx / 8; + uint32_t read_bit_idx = bit_read_idx % 8; + uint32_t write_byte_idx = k / 8; + uint32_t write_bit_idx = k % 8; + uint8_t w = (g_bits[read_byte_idx] >> (4 - read_bit_idx)) & (uint8_t) 0x0f; + q_bits[write_byte_idx] |= w << (4 - write_bit_idx); + + bit_read_idx += 4; + } + } + + } +} + +static void ulsch_interleave_qm6(const uint8_t *g_bits, + uint32_t rows, + uint32_t cols, + uint8_t *q_bits, + uint32_t ri_min_row, + const uint8_t *ri_present) { + uint32_t bit_read_idx = 0; + + for (uint32_t j = 0; j < ri_min_row; j++) { + for (uint32_t i = 0; i < cols; i++) { + uint32_t k = (i * rows + j) * 6; + + uint32_t read_byte_idx = bit_read_idx / 8; + uint32_t read_bit_idx = bit_read_idx % 8; + uint32_t write_byte_idx = k / 8; + uint32_t write_bit_idx = k % 8; + uint8_t w; + + switch (read_bit_idx) { + case 0: + w = g_bits[read_byte_idx] >> 2; + break; + case 2: + w = g_bits[read_byte_idx] & (uint8_t) 0x3f; + break; + case 4: + w = ((g_bits[read_byte_idx] << 2) | (g_bits[read_byte_idx + 1] >> 6)) & (uint8_t) 0x3f; + break; + case 6: + w = ((g_bits[read_byte_idx] << 4) | (g_bits[read_byte_idx + 1] >> 4)) & (uint8_t) 0x3f; + break; + default: + w = 0; + } + + switch (write_bit_idx) { + case 0: + q_bits[write_byte_idx] |= w << 2; + break; + case 2: + q_bits[write_byte_idx] |= w; + break; + case 4: + q_bits[write_byte_idx] |= w >> 2; + q_bits[write_byte_idx + 1] |= w << 6; + break; + case 6: + q_bits[write_byte_idx] |= w >> 4; + q_bits[write_byte_idx + 1] |= w << 4; + break; + default: + /* Do nothing */; + } + + bit_read_idx += 6; + } + } + + for (uint32_t j = ri_min_row; j < rows; j++) { + for (uint32_t i = 0; i < cols; i++) { + uint32_t k = (i * rows + j) * 6; + + if (ri_present[k]) { + /* do nothing */ + } else { + uint32_t read_byte_idx = bit_read_idx / 8; + uint32_t read_bit_idx = bit_read_idx % 8; + uint32_t write_byte_idx = k / 8; + uint32_t write_bit_idx = k % 8; + uint8_t w; + + switch (read_bit_idx) { + case 0: + w = g_bits[read_byte_idx] >> 2; + break; + case 2: + w = g_bits[read_byte_idx] & (uint8_t) 0x3f; + break; + case 4: + w = ((g_bits[read_byte_idx] << 2) | (g_bits[read_byte_idx + 1] >> 6)) & (uint8_t) 0x3f; + break; + case 6: + w = ((g_bits[read_byte_idx] << 4) | (g_bits[read_byte_idx + 1] >> 4)) & (uint8_t) 0x3f; + break; + default: + w = 0; + } + + switch (write_bit_idx) { + case 0: + q_bits[write_byte_idx] |= w << 2; + break; + case 2: + q_bits[write_byte_idx] |= w; + break; + case 4: + q_bits[write_byte_idx] |= w >> 2; + q_bits[write_byte_idx + 1] |= w << 6; + break; + case 6: + q_bits[write_byte_idx] |= w >> 4; + q_bits[write_byte_idx + 1] |= w << 4; + break; + default: + /* Do nothing */; + } + + bit_read_idx += 6; + } + } + } +} + /* UL-SCH channel interleaver according to 5.2.2.8 of 36.212 */ void ulsch_interleave(uint8_t *g_bits, uint32_t Qm, uint32_t H_prime_total, uint32_t N_pusch_symbs, uint8_t *q_bits, srslte_uci_bit_t *ri_bits, uint32_t nof_ri_bits, uint8_t *ri_present, uint32_t *inteleaver_lut) { - + + const uint32_t nof_bits = H_prime_total * Qm; + uint32_t rows = H_prime_total / N_pusch_symbs; + uint32_t cols = N_pusch_symbs; + uint32_t ri_min_row = rows; + // Prepare ri_bits for fast search using temp_buffer if (nof_ri_bits > 0) { for (uint32_t i=0;i 0) { for (uint32_t i=0;icur_len); - srslte_vec_prod_sss(data, &s->c_short[offset], data, len); + srslte_vec_neg_sss(data, &s->c_short[offset], data, len); +} + +void srslte_scrambling_sb_offset(srslte_sequence_t *s, int8_t *data, int offset, int len) { + assert (len + offset <= s->cur_len); + srslte_vec_neg_bbb(data, &s->c_char[offset], data, len); } void srslte_scrambling_c(srslte_sequence_t *s, cf_t *data) { diff --git a/lib/src/phy/scrambling/test/scrambling_test.c b/lib/src/phy/scrambling/test/scrambling_test.c index 40722f09b..10a197cc1 100644 --- a/lib/src/phy/scrambling/test/scrambling_test.c +++ b/lib/src/phy/scrambling/test/scrambling_test.c @@ -177,6 +177,83 @@ int main(int argc, char **argv) { free(input_f); free(scrambled_f); + + int16_t *input_s, *scrambled_s; + + // Scramble also shorts + input_s= malloc(sizeof(int16_t) * seq.cur_len); + if (!input_s) { + perror("malloc"); + exit(-1); + } + scrambled_s = malloc(sizeof(int16_t) * seq.cur_len); + if (!scrambled_s) { + perror("malloc"); + exit(-1); + } + + for (i=0;i(&args->expert.phy.pusch_max_its)->default_value(4), + bpo::value(&args->expert.phy.pusch_max_its)->default_value(8), "Maximum number of turbo decoder iterations") - ("expert.tx_amplitude", + ("expert.pusch_8bit_decoder", + bpo::value(&args->expert.phy.pusch_8bit_decoder)->default_value(false), + "Use 8-bit for LLR representation and turbo decoder trellis computation (Experimental)") + + ("expert.tx_amplitude", bpo::value(&args->expert.phy.tx_amplitude)->default_value(0.6), "Transmit amplitude factor") diff --git a/srsenb/src/phy/phch_worker.cc b/srsenb/src/phy/phch_worker.cc index 36a260b6c..b1b6cb67e 100644 --- a/srsenb/src/phy/phch_worker.cc +++ b/srsenb/src/phy/phch_worker.cc @@ -153,7 +153,11 @@ void phch_worker::init(phch_common* phy_, srslte::log *log_h_) srslte_enb_dl_set_amp(&enb_dl, phy->params.tx_amplitude); Info("Worker %d configured cell %d PRB\n", get_id(), phy->cell.nof_prb); - + + if (phy->params.pusch_8bit_decoder) { + enb_ul.pusch.llr_is_8bit = true; + enb_ul.pusch.ul_sch.llr_is_8bit = true; + } initiated = true; running = true; diff --git a/srsue/src/main.cc b/srsue/src/main.cc index cc1638169..8204808b9 100644 --- a/srsue/src/main.cc +++ b/srsue/src/main.cc @@ -214,7 +214,7 @@ void parse_args(all_args_t *args, int argc, char *argv[]) { "Sets the noise estimation algorithm. (Default refs)") ("expert.pdsch_max_its", - bpo::value(&args->expert.phy.pdsch_max_its)->default_value(4), + bpo::value(&args->expert.phy.pdsch_max_its)->default_value(8), "Maximum number of turbo decoder iterations") ("expert.attach_enable_64qam", @@ -307,7 +307,11 @@ void parse_args(all_args_t *args, int argc, char *argv[]) { bpo::value(&args->expert.phy.pdsch_csi_enabled)->default_value(true), "Stores the Channel State Information and uses it for weightening the softbits. It is only used in TM1.") - ("rf_calibration.tx_corr_dc_gain", bpo::value(&args->rf_cal.tx_corr_dc_gain)->default_value(0.0), + ("expert.pdsch_8bit_decoder", + bpo::value(&args->expert.phy.pdsch_8bit_decoder)->default_value(false), + "Use 8-bit for LLR representation and turbo decoder trellis computation (Experimental)") + + ("rf_calibration.tx_corr_dc_gain", bpo::value(&args->rf_cal.tx_corr_dc_gain)->default_value(0.0), "TX DC offset gain correction") ("rf_calibration.tx_corr_dc_phase", bpo::value(&args->rf_cal.tx_corr_dc_phase)->default_value(0.0), "TX DC offset phase correction") diff --git a/srsue/src/phy/phch_worker.cc b/srsue/src/phy/phch_worker.cc index 7c1f8329a..3f688b5b1 100644 --- a/srsue/src/phy/phch_worker.cc +++ b/srsue/src/phy/phch_worker.cc @@ -139,6 +139,10 @@ bool phch_worker::init(uint32_t max_prb, srslte::log *log_h, srslte::log *log_ph return false; } + if (phy->args->pdsch_8bit_decoder) { + ue_dl.pdsch.llr_is_8bit = true; + ue_dl.pdsch.dl_sch.llr_is_8bit = true; + } srslte_chest_dl_set_rsrp_neighbour(&ue_dl.chest, true); srslte_chest_dl_average_subframe(&ue_dl.chest, phy->args->average_subframe_enabled); diff --git a/srsue/ue.conf.example b/srsue/ue.conf.example index 5822ab0c9..4073ad96f 100644 --- a/srsue/ue.conf.example +++ b/srsue/ue.conf.example @@ -204,6 +204,8 @@ enable = false # pdsch_csi_enabled: Stores the Channel State Information and uses it for weightening the softbits. It is only # used in TM1. It is True by default. # +# pdsch_8bit_decoder: Use 8-bit for LLR representation and turbo decoder trellis computation (Experimental) +# ##################################################################### [expert] #ip_netmask = 255.255.255.0 @@ -215,7 +217,7 @@ enable = false #cqi_fixed = 10 #snr_ema_coeff = 0.1 #snr_estim_alg = refs -#pdsch_max_its = 4 +#pdsch_max_its = 8 # These are half iterations #attach_enable_64qam = false #nof_phy_threads = 2 #equalizer_mode = mmse @@ -234,6 +236,7 @@ enable = false #metrics_period_secs = 1 #metrics_csv_filename = /tmp/ue_metrics.csv #pdsch_csi_enabled = true +#pdsch_8bit_decoder = false # CFO related values #cfo_is_doppler = false