New optimization on the PHY for both UE and eNodeB (#251)

* New parallel Turbodecoder implementation in SSE/AVX 16-bit and 8-bit

* Optimised UL Interleaver

* Include TB CRC calculation in FEC encoder

* New threading priorities
master
Ismael Gomez 6 years ago committed by GitHub
parent 695990f297
commit bc9d342959
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@ -279,8 +279,8 @@ if(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang")
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -ggdb -DBUILD_TYPE_RELWITHDEBINFO") set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -ggdb -DBUILD_TYPE_RELWITHDEBINFO")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -ggdb -DBUILD_TYPE_RELWITHDEBINFO") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -ggdb -DBUILD_TYPE_RELWITHDEBINFO")
else(${CMAKE_BUILD_TYPE} STREQUAL "RelWithDebInfo") else(${CMAKE_BUILD_TYPE} STREQUAL "RelWithDebInfo")
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -O3 -DBUILD_TYPE_RELEASE") set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -O3 -fno-trapping-math -fno-math-errno -DBUILD_TYPE_RELEASE")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3 -DBUILD_TYPE_RELEASE") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3 -fno-trapping-math -fno-math-errno -DBUILD_TYPE_RELEASE")
endif(${CMAKE_BUILD_TYPE} STREQUAL "RelWithDebInfo") endif(${CMAKE_BUILD_TYPE} STREQUAL "RelWithDebInfo")
endif(${CMAKE_BUILD_TYPE} STREQUAL "Debug") endif(${CMAKE_BUILD_TYPE} STREQUAL "Debug")

@ -52,6 +52,14 @@
#endif #endif
// Useful macros for templates
#define CONCAT(a, b) a##b
#define CONCAT2(a, b) CONCAT(a,b)
#define STRING2(x) #x
#define STRING(x) STRING2(x)
// Common error codes // Common error codes
#define SRSLTE_SUCCESS 0 #define SRSLTE_SUCCESS 0
#define SRSLTE_ERROR -1 #define SRSLTE_ERROR -1

@ -545,6 +545,7 @@ typedef struct {
bool sic_pss_enabled; bool sic_pss_enabled;
float rx_gain_offset; float rx_gain_offset;
bool pdsch_csi_enabled; bool pdsch_csi_enabled;
bool pdsch_8bit_decoder;
uint32_t intra_freq_meas_len_ms; uint32_t intra_freq_meas_len_ms;
uint32_t intra_freq_meas_period_ms; uint32_t intra_freq_meas_period_ms;
} phy_args_t; } phy_args_t;

@ -44,6 +44,7 @@ typedef struct SRSLTE_API {
uint8_t *c_bytes; uint8_t *c_bytes;
float *c_float; float *c_float;
short *c_short; short *c_short;
int8_t *c_char;
uint32_t cur_len; uint32_t cur_len;
uint32_t max_len; uint32_t max_len;
} srslte_sequence_t; } srslte_sequence_t;

@ -36,6 +36,7 @@
#define SRSLTE_RM_TURBO_H #define SRSLTE_RM_TURBO_H
#include "srslte/config.h" #include "srslte/config.h"
#include "srslte/phy/fec/turbodecoder.h"
#ifndef SRSLTE_RX_NULL #ifndef SRSLTE_RX_NULL
#define SRSLTE_RX_NULL 10000 #define SRSLTE_RX_NULL 10000
@ -47,7 +48,6 @@
#include "srslte/config.h" #include "srslte/config.h"
SRSLTE_API int srslte_rm_turbo_tx(uint8_t *w_buff, SRSLTE_API int srslte_rm_turbo_tx(uint8_t *w_buff,
uint32_t buff_len, uint32_t buff_len,
uint8_t *input, uint8_t *input,
@ -84,5 +84,17 @@ SRSLTE_API int srslte_rm_turbo_rx_lut(int16_t *input,
uint32_t cb_idx, uint32_t cb_idx,
uint32_t rv_idx); uint32_t rv_idx);
SRSLTE_API int srslte_rm_turbo_rx_lut_(int16_t *input,
int16_t *output,
uint32_t in_len,
uint32_t cb_idx,
uint32_t rv_idx,
bool enable_input_tdec);
SRSLTE_API int srslte_rm_turbo_rx_lut_8bit(int8_t *input,
int8_t *output,
uint32_t in_len,
uint32_t cb_idx,
uint32_t rv_idx);
#endif // SRSLTE_RM_TURBO_H #endif // SRSLTE_RM_TURBO_H

@ -47,8 +47,9 @@ typedef struct SRSLTE_API {
SRSLTE_API int srslte_tc_interl_LTE_gen(srslte_tc_interl_t *h, SRSLTE_API int srslte_tc_interl_LTE_gen(srslte_tc_interl_t *h,
uint32_t long_cb); uint32_t long_cb);
SRSLTE_API int srslte_tc_interl_UMTS_gen(srslte_tc_interl_t *h, SRSLTE_API int srslte_tc_interl_LTE_gen_interl(srslte_tc_interl_t *h,
uint32_t long_cb); uint32_t long_cb,
uint32_t interl_win);
SRSLTE_API int srslte_tc_interl_init(srslte_tc_interl_t *h, SRSLTE_API int srslte_tc_interl_init(srslte_tc_interl_t *h,
uint32_t max_long_cb); uint32_t max_long_cb);

@ -70,10 +70,12 @@ SRSLTE_API int srslte_tcod_encode(srslte_tcod_t *h,
uint32_t long_cb); uint32_t long_cb);
SRSLTE_API int srslte_tcod_encode_lut(srslte_tcod_t *h, SRSLTE_API int srslte_tcod_encode_lut(srslte_tcod_t *h,
srslte_crc_t *crc, srslte_crc_t *crc_tb,
srslte_crc_t *crc_cb,
uint8_t *input, uint8_t *input,
uint8_t *parity, uint8_t *parity,
uint32_t cblen_idx); uint32_t cblen_idx,
bool last_cb);
SRSLTE_API void srslte_tcod_gentable(); SRSLTE_API void srslte_tcod_gentable();

@ -47,46 +47,81 @@
#define SRSLTE_TCOD_TOTALTAIL 12 #define SRSLTE_TCOD_TOTALTAIL 12
#define SRSLTE_TCOD_MAX_LEN_CB 6144 #define SRSLTE_TCOD_MAX_LEN_CB 6144
#define SRSLTE_TCOD_MAX_LEN_CODED (SRSLTE_TCOD_RATE*SRSLTE_TCOD_MAX_LEN_CB+SRSLTE_TCOD_TOTALTAIL)
#include "srslte/phy/fec/turbodecoder_gen.h" // Expect the input to be aligned for sub-block window processing.
#include "srslte/phy/fec/turbodecoder_simd.h" #define SRSLTE_TDEC_EXPECT_INPUT_SB 1
// Include interfaces for 8 and 16 bit decoder implementations
#define LLR_IS_8BIT
#include "srslte/phy/fec/turbodecoder_impl.h"
#undef LLR_IS_8BIT
#define LLR_IS_16BIT
#include "srslte/phy/fec/turbodecoder_impl.h"
#undef LLR_IS_16BIT
#define SRSLTE_TDEC_NOF_AUTO_MODES_8 2
#define SRSLTE_TDEC_NOF_AUTO_MODES_16 3
typedef enum {SRSLTE_TDEC_8, SRSLTE_TDEC_16} srslte_tdec_llr_type_t;
typedef struct SRSLTE_API { typedef struct SRSLTE_API {
float *input_conv; uint32_t max_long_cb;
union {
srslte_tdec_simd_t tdec_simd; void *dec8_hdlr[SRSLTE_TDEC_NOF_AUTO_MODES_8];
srslte_tdec_gen_t tdec_gen; void *dec16_hdlr[SRSLTE_TDEC_NOF_AUTO_MODES_16];
}; srslte_tdec_8bit_impl_t *dec8[SRSLTE_TDEC_NOF_AUTO_MODES_8];
srslte_tdec_16bit_impl_t *dec16[SRSLTE_TDEC_NOF_AUTO_MODES_16];
int nof_blocks8[SRSLTE_TDEC_NOF_AUTO_MODES_8];
int nof_blocks16[SRSLTE_TDEC_NOF_AUTO_MODES_16];
// Declare as void types as can be int8 or int16
void *app1;
void *app2;
void *ext1;
void *ext2;
void *syst0;
void *parity0;
void *parity1;
void *input_conv;
bool force_not_sb;
srslte_tdec_impl_type_t dec_type;
srslte_tdec_llr_type_t current_llr_type;
uint32_t current_dec;
uint32_t current_long_cb;
uint32_t current_inter_idx;
int current_cbidx;
srslte_tc_interl_t interleaver[4][SRSLTE_NOF_TC_CB_SIZES];
int n_iter;
} srslte_tdec_t; } srslte_tdec_t;
SRSLTE_API int srslte_tdec_init(srslte_tdec_t * h, SRSLTE_API int srslte_tdec_init(srslte_tdec_t * h,
uint32_t max_long_cb); uint32_t max_long_cb);
SRSLTE_API int srslte_tdec_init_manual(srslte_tdec_t * h,
uint32_t max_long_cb,
srslte_tdec_impl_type_t dec_type);
SRSLTE_API void srslte_tdec_free(srslte_tdec_t * h); SRSLTE_API void srslte_tdec_free(srslte_tdec_t * h);
SRSLTE_API int srslte_tdec_reset(srslte_tdec_t * h, SRSLTE_API void srslte_tdec_force_not_sb(srslte_tdec_t *h);
SRSLTE_API int srslte_tdec_new_cb(srslte_tdec_t * h,
uint32_t long_cb); uint32_t long_cb);
SRSLTE_API int srslte_tdec_reset_cb(srslte_tdec_t * h, SRSLTE_API int srslte_tdec_get_nof_iterations(srslte_tdec_t * h);
uint32_t cb_idx);
SRSLTE_API int srslte_tdec_get_nof_iterations_cb(srslte_tdec_t * h, SRSLTE_API uint32_t srslte_tdec_autoimp_get_subblocks(uint32_t long_cb);
uint32_t cb_idx);
SRSLTE_API uint32_t srslte_tdec_get_nof_parallel(srslte_tdec_t * h); SRSLTE_API uint32_t srslte_tdec_autoimp_get_subblocks_8bit(uint32_t long_cb);
SRSLTE_API void srslte_tdec_iteration(srslte_tdec_t * h, SRSLTE_API void srslte_tdec_iteration(srslte_tdec_t * h,
int16_t* input, int16_t* input,
uint32_t long_cb); uint8_t *output);
SRSLTE_API void srslte_tdec_decision(srslte_tdec_t * h,
uint8_t *output,
uint32_t long_cb);
SRSLTE_API void srslte_tdec_decision_byte(srslte_tdec_t * h,
uint8_t *output,
uint32_t long_cb);
SRSLTE_API int srslte_tdec_run_all(srslte_tdec_t * h, SRSLTE_API int srslte_tdec_run_all(srslte_tdec_t * h,
int16_t * input, int16_t * input,
@ -94,27 +129,15 @@ SRSLTE_API int srslte_tdec_run_all(srslte_tdec_t * h,
uint32_t nof_iterations, uint32_t nof_iterations,
uint32_t long_cb); uint32_t long_cb);
SRSLTE_API void srslte_tdec_iteration_par(srslte_tdec_t * h, SRSLTE_API void srslte_tdec_iteration_8bit(srslte_tdec_t * h,
int16_t* input[SRSLTE_TDEC_MAX_NPAR], int8_t* input,
uint32_t long_cb); uint8_t *output);
SRSLTE_API void srslte_tdec_decision_par(srslte_tdec_t * h,
uint8_t *output[SRSLTE_TDEC_MAX_NPAR],
uint32_t long_cb);
SRSLTE_API void srslte_tdec_decision_byte_par(srslte_tdec_t * h,
uint8_t *output[SRSLTE_TDEC_MAX_NPAR],
uint32_t long_cb);
SRSLTE_API void srslte_tdec_decision_byte_par_cb(srslte_tdec_t * h, SRSLTE_API int srslte_tdec_run_all_8bit(srslte_tdec_t * h,
int8_t * input,
uint8_t *output, uint8_t *output,
uint32_t cb_idx,
uint32_t long_cb);
SRSLTE_API int srslte_tdec_run_all_par(srslte_tdec_t * h,
int16_t * input[SRSLTE_TDEC_MAX_NPAR],
uint8_t *output[SRSLTE_TDEC_MAX_NPAR],
uint32_t nof_iterations, uint32_t nof_iterations,
uint32_t long_cb); uint32_t long_cb);
#endif // SRSLTE_TURBODECODER_H #endif // SRSLTE_TURBODECODER_H

@ -47,53 +47,16 @@
#define SRSLTE_TCOD_TOTALTAIL 12 #define SRSLTE_TCOD_TOTALTAIL 12
#define SRSLTE_TCOD_MAX_LEN_CB 6144 #define SRSLTE_TCOD_MAX_LEN_CB 6144
#define SRSLTE_TCOD_MAX_LEN_CODED (SRSLTE_TCOD_RATE*SRSLTE_TCOD_MAX_LEN_CB+SRSLTE_TCOD_TOTALTAIL)
typedef struct SRSLTE_API { typedef struct SRSLTE_API {
int max_long_cb; uint32_t max_long_cb;
float *beta; int16_t *beta;
} srslte_map_gen_vl_t; } tdec_gen_t;
typedef struct SRSLTE_API { int tdec_gen_init(void **h, uint32_t max_long_cb);
int max_long_cb; void tdec_gen_free(void *h);
void tdec_gen_dec(void *h, int16_t * input, int16_t *app, int16_t * parity, int16_t *output, uint32_t long_cb);
srslte_map_gen_vl_t dec; void tdec_gen_extract_input(int16_t *input, int16_t *syst, int16_t *parity0, int16_t *parity1, int16_t *app2, uint32_t long_cb);
void tdec_gen_decision_byte(int16_t *app1, uint8_t *output, uint32_t long_cb);
float *llr1;
float *llr2;
float *w;
float *syst;
float *parity;
int current_cbidx;
uint32_t current_cb_len;
uint32_t n_iter;
srslte_tc_interl_t interleaver[SRSLTE_NOF_TC_CB_SIZES];
} srslte_tdec_gen_t;
SRSLTE_API int srslte_tdec_gen_init(srslte_tdec_gen_t * h,
uint32_t max_long_cb);
SRSLTE_API void srslte_tdec_gen_free(srslte_tdec_gen_t * h);
SRSLTE_API int srslte_tdec_gen_reset(srslte_tdec_gen_t * h, uint32_t long_cb);
SRSLTE_API void srslte_tdec_gen_iteration(srslte_tdec_gen_t * h,
float * input,
uint32_t long_cb);
SRSLTE_API void srslte_tdec_gen_decision(srslte_tdec_gen_t * h,
uint8_t *output,
uint32_t long_cb);
SRSLTE_API void srslte_tdec_gen_decision_byte(srslte_tdec_gen_t * h,
uint8_t *output,
uint32_t long_cb);
SRSLTE_API int srslte_tdec_gen_run_all(srslte_tdec_gen_t * h,
float * input,
uint8_t *output,
uint32_t nof_iterations,
uint32_t long_cb);
#endif // SRSLTE_TURBODECODER_GEN_H #endif // SRSLTE_TURBODECODER_GEN_H

@ -0,0 +1,68 @@
/**
*
* \section COPYRIGHT
*
* Copyright 2013-2015 Software Radio Systems Limited
*
* \section LICENSE
*
* This file is part of the srsLTE library.
*
* srsLTE is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as
* published by the Free Software Foundation, either version 3 of
* the License, or (at your option) any later version.
*
* srsLTE is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* A copy of the GNU Affero General Public License can be found in
* the LICENSE file in the top-level directory of this distribution
* and at http://www.gnu.org/licenses/.
*
*/
#ifndef SRSLTE_TURBODECODER_IMPL_H
#define SRSLTE_TURBODECODER_IMPL_H
#include "srslte/config.h"
/* Interface for internal decoder implementation */
typedef enum SRSLTE_API {
SRSLTE_TDEC_AUTO = 0,
SRSLTE_TDEC_GENERIC,
SRSLTE_TDEC_SSE,
SRSLTE_TDEC_SSE_WINDOW,
SRSLTE_TDEC_AVX_WINDOW,
SRSLTE_TDEC_SSE8_WINDOW,
SRSLTE_TDEC_AVX8_WINDOW,
SRSLTE_TDEC_NOF_IMP
} srslte_tdec_impl_type_t;
#endif
#ifdef LLR_IS_8BIT
#define llr_t int8_t
#define type_name srslte_tdec_8bit_impl_t
#else
#ifdef LLR_IS_16BIT
#define llr_t int16_t
#define type_name srslte_tdec_16bit_impl_t
#else
#error "Unsupported LLR mode"
#endif
#endif
typedef struct SRSLTE_API {
int (*tdec_init)(void **h, uint32_t max_long_cb);
void (*tdec_free)(void *h);
void (*tdec_dec)(void *h, llr_t * input, llr_t *app, llr_t * parity, llr_t *output, uint32_t long_cb);
void (*tdec_extract_input)(llr_t *input, llr_t *syst, llr_t *parity0, llr_t *parity1, llr_t *app2, uint32_t long_cb);
void (*tdec_decision_byte)(llr_t *app1, uint8_t *output, uint32_t long_cb);
} type_name;
#undef llr_t
#undef type_name

@ -0,0 +1,158 @@
/**
*
* \section COPYRIGHT
*
* Copyright 2013-2015 Software Radio Systems Limited
*
* \section LICENSE
*
* This file is part of the srsLTE library.
*
* srsLTE is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as
* published by the Free Software Foundation, either version 3 of
* the License, or (at your option) any later version.
*
* srsLTE is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* A copy of the GNU Affero General Public License can be found in
* the LICENSE file in the top-level directory of this distribution
* and at http://www.gnu.org/licenses/.
*
*/
#include "srslte/config.h"
#define MAKE_CALL(a) CONCAT2(a,type_name)
#define MAKE_VEC(a) CONCAT2(a,vec_suffix)
#define PRINT CONCAT2(srslte_vec_fprint,print_suffix)
#ifdef LLR_IS_8BIT
#define llr_t int8_t
#define type_name _8bit
#define vec_suffix _bbb
#define print_suffix _bs
#define decptr h->dec8[h->current_dec]
#define dechdlr h->dec8_hdlr[h->current_dec]
#define input_is_interleaved 1
#else
#ifdef LLR_IS_16BIT
#define llr_t int16_t
#define vec_suffix _sss
#define print_suffix _s
#define decptr h->dec16[h->current_dec]
#define dechdlr h->dec16_hdlr[h->current_dec]
#define input_is_interleaved (h->current_dec > 0)
#define type_name _16bit
#else
#warning "Unsupported LLR mode"
#endif
#endif
#define debug_enabled_iter 0
#define debug_len 20
#define debug_vec(a) if (debug_enabled_iter) {printf("%s it=%d: ", STRING(a), n_iter);PRINT(stdout, a, debug_len);}
static void MAKE_CALL(extract_input_tail_sb)(llr_t *input, llr_t *syst, llr_t *app2, llr_t *parity0, llr_t *parity1, uint32_t long_cb)
{
for (int i = long_cb; i < long_cb + 3; i++) {
syst[i] = input[3*(long_cb+32) + 2*(i - long_cb)];
parity0[i] = input[3*(long_cb+32)+ 2*(i - long_cb) + 1];
app2[i] = input[3*(long_cb+32) + 6 + 2*(i - long_cb)];
parity1[i] = input[3*(long_cb+32) + 6 + 2*(i - long_cb) + 1];
}
}
/* Runs 1 turbo decoder iteration */
void MAKE_CALL(run_tdec_iteration)(srslte_tdec_t * h, llr_t * input)
{
if (h->current_cbidx >= 0) {
uint16_t *inter = h->interleaver[h->current_inter_idx][h->current_cbidx].forward;
uint16_t *deinter = h->interleaver[h->current_inter_idx][h->current_cbidx].reverse;
llr_t *syst = (llr_t*) h->syst0;
llr_t *parity0 = (llr_t*) h->parity0;
llr_t *parity1 = (llr_t*) h->parity1;
llr_t *app1 = (llr_t*) h->app1;
llr_t *app2 = (llr_t*) h->app2;
llr_t *ext1 = (llr_t*) h->ext1;
llr_t *ext2 = (llr_t*) h->ext2;
uint32_t long_cb = h->current_long_cb;
uint32_t n_iter = h->n_iter;
if (SRSLTE_TDEC_EXPECT_INPUT_SB && !h->force_not_sb && input_is_interleaved) {
syst = input;
// align to 32 bytes (warning: must be same alignment as in rm_turbo.c)
parity0 = &input[long_cb+32];
parity1 = &input[2*(long_cb+32)];
if (n_iter == 0) {
MAKE_CALL(extract_input_tail_sb)(input, syst, app2, parity0, parity1, long_cb);
}
} else {
if (n_iter == 0) {
decptr->tdec_extract_input(input, syst, app2, parity0, parity1, long_cb);
}
}
if ((n_iter%2) == 0) {
// Add apriori information to decoder 1
if (n_iter) {
MAKE_VEC(srslte_vec_sub)(app1, ext1, app1, long_cb);
}
// Run MAP DEC #1
decptr->tdec_dec(dechdlr, syst, n_iter ? app1 : NULL, parity0, ext1, long_cb);
}
// Interleave extrinsic output of DEC1 to form apriori info for decoder 2
if (n_iter%2) {
// Convert aposteriori information into extrinsic information
if (n_iter > 1) {
MAKE_VEC(srslte_vec_sub)(ext1, app1, ext1, long_cb);
}
MAKE_VEC(srslte_vec_lut)(ext1, deinter, app2, long_cb);
// Run MAP DEC #2. 2nd decoder uses apriori information as systematic bits
decptr->tdec_dec(dechdlr, app2, NULL, parity1, ext2, long_cb);
// Deinterleaved extrinsic bits become apriori info for decoder 1
MAKE_VEC(srslte_vec_lut)(ext2, inter, app1, long_cb);
}
if (h->n_iter == 0) {
debug_vec(syst);
debug_vec(parity0);
debug_vec(parity1);
}
debug_vec(ext1);
debug_vec(ext2);
debug_vec(app1);
debug_vec(app2);
h->n_iter++;
} else {
fprintf(stderr, "Error CB index not set (call srslte_tdec_new_cb() first\n");
}
}
#undef debug_enabled
#undef debug_len
#undef debug_vec
#undef llr_t
#undef vec_suffix
#undef print_suffix
#undef decptr
#undef dechdlr
#undef type_name
#undef input_is_interleaved

@ -1,122 +0,0 @@
/**
*
* \section COPYRIGHT
*
* Copyright 2013-2015 Software Radio Systems Limited
*
* \section LICENSE
*
* This file is part of the srsLTE library.
*
* srsLTE is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as
* published by the Free Software Foundation, either version 3 of
* the License, or (at your option) any later version.
*
* srsLTE is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* A copy of the GNU Affero General Public License can be found in
* the LICENSE file in the top-level directory of this distribution
* and at http://www.gnu.org/licenses/.
*
*/
/**********************************************************************************************
* File: turbodecoder.h
*
* Description: Turbo Decoder.
* Parallel Concatenated Convolutional Code (PCCC) with two 8-state constituent
* encoders and one turbo code internal interleaver. The coding rate of turbo
* encoder is 1/3.
* MAP_GEN is the MAX-LOG-MAP generic implementation of the decoder.
*
* Reference: 3GPP TS 36.212 version 10.0.0 Release 10 Sec. 5.1.3.2
*********************************************************************************************/
#ifndef SRSLTE_TURBODECODER_SIMD_H
#define SRSLTE_TURBODECODER_SIMD_H
#include "srslte/config.h"
#include "srslte/phy/fec/tc_interl.h"
#include "srslte/phy/fec/cbsegm.h"
// Define maximum number of CB decoded in parallel (2 for AVX2)
#define SRSLTE_TDEC_MAX_NPAR 2
#define SRSLTE_TCOD_RATE 3
#define SRSLTE_TCOD_TOTALTAIL 12
#define SRSLTE_TCOD_MAX_LEN_CB 6144
#define SRSLTE_TCOD_MAX_LEN_CODED (SRSLTE_TCOD_RATE*SRSLTE_TCOD_MAX_LEN_CB+SRSLTE_TCOD_TOTALTAIL)
typedef struct SRSLTE_API {
uint32_t max_long_cb;
uint32_t max_par_cb;
int16_t *alpha;
int16_t *branch;
} map_gen_t;
typedef struct SRSLTE_API {
uint32_t max_long_cb;
uint32_t max_par_cb;
map_gen_t dec;
int16_t *app1[SRSLTE_TDEC_MAX_NPAR];
int16_t *app2[SRSLTE_TDEC_MAX_NPAR];
int16_t *ext1[SRSLTE_TDEC_MAX_NPAR];
int16_t *ext2[SRSLTE_TDEC_MAX_NPAR];
int16_t *syst[SRSLTE_TDEC_MAX_NPAR];
int16_t *parity0[SRSLTE_TDEC_MAX_NPAR];
int16_t *parity1[SRSLTE_TDEC_MAX_NPAR];
int cb_mask;
int current_cbidx;
srslte_tc_interl_t interleaver[SRSLTE_NOF_TC_CB_SIZES];
int n_iter[SRSLTE_TDEC_MAX_NPAR];
} srslte_tdec_simd_t;
SRSLTE_API int srslte_tdec_simd_init(srslte_tdec_simd_t * h,
uint32_t max_par_cb,
uint32_t max_long_cb);
SRSLTE_API void srslte_tdec_simd_free(srslte_tdec_simd_t * h);
SRSLTE_API int srslte_tdec_simd_reset(srslte_tdec_simd_t * h,
uint32_t long_cb);
SRSLTE_API
SRSLTE_API int srslte_tdec_simd_get_nof_iterations_cb(srslte_tdec_simd_t * h,
uint32_t cb_idx);
SRSLTE_API int srslte_tdec_simd_reset_cb(srslte_tdec_simd_t * h,
uint32_t cb_idx);
SRSLTE_API void srslte_tdec_simd_iteration(srslte_tdec_simd_t * h,
int16_t * input[SRSLTE_TDEC_MAX_NPAR],
uint32_t long_cb);
SRSLTE_API void srslte_tdec_simd_decision(srslte_tdec_simd_t * h,
uint8_t *output[SRSLTE_TDEC_MAX_NPAR],
uint32_t long_cb);
SRSLTE_API void srslte_tdec_simd_decision_byte(srslte_tdec_simd_t * h,
uint8_t *output[SRSLTE_TDEC_MAX_NPAR],
uint32_t long_cb);
SRSLTE_API void srslte_tdec_simd_decision_byte_cb(srslte_tdec_simd_t * h,
uint8_t *output,
uint32_t cbidx,
uint32_t long_cb);
SRSLTE_API int srslte_tdec_simd_run_all(srslte_tdec_simd_t * h,
int16_t * input[SRSLTE_TDEC_MAX_NPAR],
uint8_t *output[SRSLTE_TDEC_MAX_NPAR],
uint32_t nof_iterations,
uint32_t long_cb);
#endif // SRSLTE_TURBODECODER_SIMD_H

@ -1,119 +0,0 @@
/**
*
* \section COPYRIGHT
*
* Copyright 2013-2015 Software Radio Systems Limited
*
* \section LICENSE
*
* This file is part of the srsLTE library.
*
* srsLTE is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as
* published by the Free Software Foundation, either version 3 of
* the License, or (at your option) any later version.
*
* srsLTE is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* A copy of the GNU Affero General Public License can be found in
* the LICENSE file in the top-level directory of this distribution
* and at http://www.gnu.org/licenses/.
*
*/
/**********************************************************************************************
* File: turbodecoder.h
*
* Description: Turbo Decoder.
* Parallel Concatenated Convolutional Code (PCCC) with two 8-state constituent
* encoders and one turbo code internal interleaver. The coding rate of turbo
* encoder is 1/3.
* MAP_GEN is the MAX-LOG-MAP generic implementation of the decoder.
*
* Reference: 3GPP TS 36.212 version 10.0.0 Release 10 Sec. 5.1.3.2
*********************************************************************************************/
#ifndef SRSLTE_TURBODECODER_SIMD_INTER_H
#define SRSLTE_TURBODECODER_SIMD_INTER_H
/** This is an simd inter-frame parallel turbo decoder. Parallizes 8 code-blocks using SSE
* This implementation is currently not functional and not used by the rest of the code
*/
#include "srslte/config.h"
#include "srslte/phy/fec/tc_interl.h"
#include "srslte/phy/fec/cbsegm.h"
#if LV_HAVE_AVX2
#define SRSLTE_TDEC_MAX_NPAR 16
#else
#define SRSLTE_TDEC_MAX_NPAR 8
#endif
typedef struct SRSLTE_API {
int max_long_cb;
int16_t *syst0;
int16_t *parity0;
int16_t *syst1;
int16_t *parity1;
int16_t *llr1;
int16_t *llr2;
int16_t *w;
int16_t *alpha;
uint32_t max_par_cb;
int current_cbidx;
uint32_t current_long_cb;
srslte_tc_interl_t interleaver[SRSLTE_NOF_TC_CB_SIZES];
int n_iter[SRSLTE_TDEC_MAX_NPAR];
} srslte_tdec_simd_inter_t;
SRSLTE_API int srslte_tdec_simd_inter_init(srslte_tdec_simd_inter_t * h,
uint32_t max_par_cb,
uint32_t max_long_cb);
SRSLTE_API void srslte_tdec_simd_inter_free(srslte_tdec_simd_inter_t * h);
SRSLTE_API int srslte_tdec_simd_inter_reset(srslte_tdec_simd_inter_t * h,
uint32_t long_cb);
SRSLTE_API int srslte_tdec_simd_inter_get_nof_iterations_cb(srslte_tdec_simd_inter_t * h,
uint32_t cb_idx);
SRSLTE_API int srslte_tdec_simd_inter_reset_cb(srslte_tdec_simd_inter_t * h,
uint32_t cb_idx);
SRSLTE_API void srslte_tdec_simd_inter_iteration(srslte_tdec_simd_inter_t * h,
int16_t * input[SRSLTE_TDEC_MAX_NPAR],
uint32_t nof_cb,
uint32_t long_cb);
SRSLTE_API void srslte_tdec_simd_inter_decision(srslte_tdec_simd_inter_t * h,
uint8_t *output[SRSLTE_TDEC_MAX_NPAR],
uint32_t nof_cb,
uint32_t long_cb);
SRSLTE_API void srslte_tdec_simd_inter_decision_byte(srslte_tdec_simd_inter_t * h,
uint8_t *output[SRSLTE_TDEC_MAX_NPAR],
uint32_t nof_cb,
uint32_t long_cb);
SRSLTE_API void srslte_tdec_simd_inter_decision_byte_cb(srslte_tdec_simd_inter_t * h,
uint8_t *output,
uint32_t cbidx,
uint32_t long_cb);
SRSLTE_API int srslte_tdec_simd_inter_run_all(srslte_tdec_simd_inter_t * h,
int16_t *input[SRSLTE_TDEC_MAX_NPAR],
uint8_t *output[SRSLTE_TDEC_MAX_NPAR],
uint32_t nof_iterations,
uint32_t nof_cb,
uint32_t long_cb);
#endif // SRSLTE_TURBODECODER_SIMD_INTER_H

@ -24,78 +24,22 @@
* *
*/ */
/********************************************************************************************** #ifndef SRSLTE_TURBODECODER_SSE_H
* File: turbodecoder.h #define SRSLTE_TURBODECODER_SSE_H
*
* Description: Turbo Decoder.
* Parallel Concatenated Convolutional Code (PCCC) with two 8-state constituent
* encoders and one turbo code internal interleaver. The coding rate of turbo
* encoder is 1/3.
* MAP_GEN is the MAX-LOG-MAP generic implementation of the decoder.
*
* Reference: 3GPP TS 36.212 version 10.0.0 Release 10 Sec. 5.1.3.2
*********************************************************************************************/
#ifndef SRSLTE_TURBODECODER_SSE_
#define SRSLTE_TURBODECODER_SSE_
#include "srslte/config.h" #include "srslte/config.h"
#include "srslte/phy/fec/tc_interl.h"
#include "srslte/phy/fec/cbsegm.h"
#define SRSLTE_TCOD_RATE 3
#define SRSLTE_TCOD_TOTALTAIL 12
#define SRSLTE_TCOD_MAX_LEN_CB 6144
#define SRSLTE_TCOD_MAX_LEN_CODED (SRSLTE_TCOD_RATE*SRSLTE_TCOD_MAX_LEN_CB+SRSLTE_TCOD_TOTALTAIL)
typedef struct SRSLTE_API { typedef struct SRSLTE_API {
int max_long_cb; uint32_t max_long_cb;
int16_t *alpha; int16_t *alpha;
int16_t *branch; int16_t *branch;
} map_gen_t; } tdec_sse_t;
typedef struct SRSLTE_API {
int max_long_cb;
map_gen_t dec;
int16_t *app1;
int16_t *app2;
int16_t *ext1;
int16_t *ext2;
int16_t *syst;
int16_t *parity0;
int16_t *parity1;
int current_cbidx;
srslte_tc_interl_t interleaver[SRSLTE_NOF_TC_CB_SIZES];
int n_iter;
} srslte_tdec_sse_t;
SRSLTE_API int srslte_tdec_sse_init(srslte_tdec_sse_t * h,
uint32_t max_long_cb);
SRSLTE_API void srslte_tdec_sse_free(srslte_tdec_sse_t * h);
SRSLTE_API int srslte_tdec_sse_reset(srslte_tdec_sse_t * h, uint32_t long_cb);
SRSLTE_API void srslte_tdec_sse_iteration(srslte_tdec_sse_t * h,
int16_t * input,
uint32_t long_cb);
SRSLTE_API void srslte_tdec_sse_decision(srslte_tdec_sse_t * h,
uint8_t *output,
uint32_t long_cb);
SRSLTE_API void srslte_tdec_sse_decision_byte(srslte_tdec_sse_t * h,
uint8_t *output,
uint32_t long_cb);
SRSLTE_API int srslte_tdec_sse_run_all(srslte_tdec_sse_t * h, int tdec_sse_init(void **h, uint32_t max_long_cb);
int16_t * input, void tdec_sse_free(void *h);
uint8_t *output, void tdec_sse_dec(void *h, int16_t * input, int16_t *app, int16_t * parity,
uint32_t nof_iterations, int16_t *output, uint32_t long_cb);
uint32_t long_cb); void tdec_sse_extract_input(int16_t *input, int16_t *syst, int16_t *parity0, int16_t *parity1, int16_t *app2, uint32_t long_cb);
void tdec_sse_decision_byte(int16_t *app1, uint8_t *output, uint32_t long_cb);
#endif // SRSLTE_TURBODECODER_SSE_ #endif // SRSLTE_TURBODECODER_SSE_H

@ -0,0 +1,750 @@
/**
*
* \section COPYRIGHT
*
* Copyright 2013-2015 Software Radio Systems Limited
*
* \section LICENSE
*
* This file is part of the srsLTE library.
*
* srsLTE is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as
* published by the Free Software Foundation, either version 3 of
* the License, or (at your option) any later version.
*
* srsLTE is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* A copy of the GNU Affero General Public License can be found in
* the LICENSE file in the top-level directory of this distribution
* and at http://www.gnu.org/licenses/.
*
*/
#include "srslte/config.h"
#define MAKE_FUNC(a) CONCAT2(CONCAT2(tdec_win,WINIMP),CONCAT2(_,a))
#define MAKE_TYPE CONCAT2(CONCAT2(tdec_win_,WINIMP),_t)
#ifdef WINIMP_IS_SSE16
#ifndef LV_HAVE_SSE
#error "Selected SSE window decoder but instruction set not supported"
#endif
#include <nmmintrin.h>
#define WINIMP sse16
#define nof_blocks 8
#define llr_t int16_t
#define simd_type_t __m128i
#define simd_load _mm_load_si128
#define simd_store _mm_store_si128
#define simd_add _mm_adds_epi16
#define simd_sub _mm_subs_epi16
#define simd_max _mm_max_epi16
#define simd_set1 _mm_set1_epi16
#define simd_insert _mm_insert_epi16
#define simd_shuffle _mm_shuffle_epi8
#define move_right _mm_set_epi8(15,14,15,14,13,12,11,10,9,8,7,6,5,4,3,2)
#define move_left _mm_set_epi8(13,12,11,10,9,8,7,6,5,4,3,2,1,0,1,0)
#define simd_rb_shift _mm_srai_epi16
#define normalize_period 2
#define win_overlap_len 40
#define divide_output 1
#define INF 10000
#else
#ifdef WINIMP_IS_AVX16
#ifndef LV_HAVE_AVX
#error "Selected AVX window decoder but instruction set not supported"
#endif
#include <immintrin.h>
#define WINIMP avx16
#define nof_blocks 16
#define llr_t int16_t
#define simd_type_t __m256i
#define simd_load _mm256_load_si256
#define simd_store _mm256_store_si256
#define simd_add _mm256_adds_epi16
#define simd_sub _mm256_subs_epi16
#define simd_max _mm256_max_epi16
#define simd_set1 _mm256_set1_epi16
#define simd_insert _mm256_insert_epi16
#define simd_shuffle _mm256_shuffle_epi8
#define move_right _mm256_set_epi8(31,30,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2)
#define move_left _mm256_set_epi8(29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,1,0)
#define normalize_period 2
#define win_overlap_len 40
#define INF 10000
#else
#ifdef WINIMP_IS_SSE8
#ifndef LV_HAVE_SSE
#error "Selected SSE window decoder but instruction set not supported"
#endif
#include <nmmintrin.h>
#define WINIMP sse8
#define nof_blocks 16
#define llr_t int8_t
#define simd_type_t __m128i
#define simd_load _mm_load_si128
#define simd_store _mm_store_si128
#define simd_add _mm_adds_epi8
#define simd_sub _mm_subs_epi8
#define simd_max _mm_max_epi8
#define simd_set1 _mm_set1_epi8
#define simd_insert _mm_insert_epi8
#define simd_shuffle _mm_shuffle_epi8
#define move_right _mm_set_epi8(15,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1)
#define move_left _mm_set_epi8(14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,0)
#define simd_rb_shift simd_rb_shift_128
#define normalize_max
#define normalize_period 1
#define win_overlap_len 40
#define use_saturated_add
#define divide_output 1
#define INF 0
inline static simd_type_t simd_rb_shift_128(simd_type_t v, const int l) {
__m128i low = _mm_srai_epi16(_mm_slli_epi16(v,8), l+8);
__m128i hi = _mm_srai_epi16(v,l);
return _mm_blendv_epi8(hi, low, _mm_set1_epi32(0x00FF00FF));
}
#else
#ifdef WINIMP_IS_AVX8
#ifndef LV_HAVE_AVX
#error "Selected AVX window decoder but instruction set not supported"
#endif
#include <immintrin.h>
#define WINIMP avx8
#define nof_blocks 32
#define llr_t int8_t
#define simd_type_t __m256i
#define simd_load _mm256_load_si256
#define simd_store _mm256_store_si256
#define simd_add _mm256_adds_epi8
#define simd_sub _mm256_subs_epi8
#define simd_max _mm256_max_epi8
#define simd_set1 _mm256_set1_epi8
#define simd_insert _mm256_insert_epi8
#define simd_shuffle _mm256_shuffle_epi8
#define move_right _mm256_set_epi8(31,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1)
#define move_left _mm256_set_epi8(30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,0)
#define simd_rb_shift simd_rb_shift_256
#define INF 0
#define normalize_max
#define normalize_period 1
#define win_overlap_len 40
#define use_saturated_add
#define divide_output 1
inline static simd_type_t simd_rb_shift_256(simd_type_t v, const int l) {
__m256i low = _mm256_srai_epi16(_mm256_slli_epi16(v,8), l+8);
__m256i hi = _mm256_srai_epi16(v,l);
return _mm256_blendv_epi8(hi, low, _mm256_set1_epi32(0x00FF00FF));
}
#else
#error "Unknown WINIMP value"
#endif
#endif
#endif
#endif
typedef struct SRSLTE_API {
uint32_t max_long_cb;
llr_t *beta;
} MAKE_TYPE;
#define long_sb (long_cb/nof_blocks)
#define debug_enabled_win 0
#if debug_enabled_win
#define debug_state(d) printf("k=%5d, in=%5d, pa=%3d, out=%5d, alpha=[", d*long_sb+k+1, MAKE_FUNC(get_simd)(x,d), MAKE_FUNC(get_simd)(y,d), MAKE_FUNC(get_simd)(out,d)); \
for (int j=0;j<8;j++) printf("%5d, ", MAKE_FUNC(get_simd)(old[j],d)); \
printf("], beta=["); \
for (int j=0;j<8;j++) printf("%5d, ", MAKE_FUNC(get_simd)(beta_save[j], d));printf("\n");
#define debug_state_pre(d) printf("pre-window k=%5d, in=%5d, pa=%3d, alpha=[", (d+1)*long_sb-loop_len+k+1, MAKE_FUNC(get_simd)(x,d), MAKE_FUNC(get_simd)(y,d)); \
for (int j=0;j<8;j++) printf("%5d, ", MAKE_FUNC(get_simd)(old[j],d)); \
printf("]\n");
#define debug_state_beta(d) printf("k=%5d, in=%5d, pa=%3d, beta=[", d*long_sb+k, MAKE_FUNC(get_simd)(x,d), MAKE_FUNC(get_simd)(y,d)); \
for (int j=0;j<8;j++) printf("%5d, ", MAKE_FUNC(get_simd)(old[j],d));\
printf("\n");
static llr_t MAKE_FUNC(get_simd)(simd_type_t x, uint32_t pos) {
llr_t *s = (llr_t*) &x;
return s[pos];
}
#else
#define debug_state(a)
#define debug_state_pre(a)
#define debug_state_beta(a)
#endif
/*
static void MAKE_FUNC(print_simd)(simd_type_t x) {
llr_t *s = (llr_t*) &x;
printf("[");
for (int i=0;i<nof_blocks;i++) {
printf("%4d, ", s[i]);
}
printf("]\n");
}*/
inline static llr_t MAKE_FUNC(sadd)(llr_t x, llr_t y) {
#ifndef use_saturated_add
return x+y;
#else
int16_t z = (int16_t) x+y;
return z>127?127:(int8_t) z;
#endif
}
inline static void MAKE_FUNC(normalize)(uint32_t k, simd_type_t old[8]) {
if ((k % normalize_period) == 0 && k != 0) {
#ifdef normalize_max
simd_type_t m = simd_max(old[0],old[1]);
for (int i=2;i<8;i++) {
m = simd_max(m,old[i]);
}
for (int i=0;i<8;i++) {
old[i] = simd_sub(old[i], m);
}
#else
for (int i = 1; i < 8; i++) {
old[i] = simd_sub(old[i], old[0]);
}
old[0] = simd_set1(0);
#endif
}
}
static void MAKE_FUNC(beta_trellis)(llr_t *input, llr_t *parity, uint32_t long_cb, llr_t old[8])
{
llr_t m_b[8], new[8];
llr_t x, y, xy;
/* Calculate last state using Tail. No need to use SIMD here */
old[0] = 0;
for (int i = 1; i < 8; i++) {
old[i] = -INF;
}
for (int k=long_cb+2;k >= long_cb; k--) {
x = input[k];
y = parity[k];
xy = MAKE_FUNC(sadd)(x, y);
m_b[0] = MAKE_FUNC(sadd)(old[4],xy);
m_b[1] = old[4];
m_b[2] = MAKE_FUNC(sadd)(old[5], y);
m_b[3] = MAKE_FUNC(sadd)(old[5], x);
m_b[4] = MAKE_FUNC(sadd)(old[6], x);
m_b[5] = MAKE_FUNC(sadd)(old[6], y);
m_b[6] = old[7];
m_b[7] = MAKE_FUNC(sadd)(old[7], xy);
new[0] = old[0];
new[1] = MAKE_FUNC(sadd)(old[0], xy);
new[2] = MAKE_FUNC(sadd)(old[1], x);
new[3] = MAKE_FUNC(sadd)(old[1], y);
new[4] = MAKE_FUNC(sadd)(old[2], y);
new[5] = MAKE_FUNC(sadd)(old[2], x);
new[6] = MAKE_FUNC(sadd)(old[3], xy);
new[7] = old[3];
#if debug_enabled_win
printf("trellis: k=%d, in=%d, pa=%d, beta: ", k, x, y); for (int i=0;i<8;i++) {printf("%d,", old[i]);} printf("\n");
#endif
for (int i = 0; i < 8; i++) {
if (m_b[i] > new[i])
new[i] = m_b[i];
old[i] = new[i];
}
}
}
/* Computes beta values */
static void MAKE_FUNC(beta)(MAKE_TYPE * s, llr_t *input, llr_t *app, llr_t *parity, uint32_t long_cb)
{
simd_type_t m_b[8], new[8], old[8];
simd_type_t x, y, xy, ap;
simd_type_t *inputPtr;
simd_type_t *appPtr;
simd_type_t *parityPtr;
simd_type_t *betaPtr = (simd_type_t*) s->beta;
uint32_t loop_len;
for (int j=0;j<2;j++) {
// First run L states to find initial state for all sub-blocks after first
if (j==0) {
loop_len = win_overlap_len;
} else {
loop_len = long_sb;
}
// When passing through all window pick estimated initial states (known state for sb=0)
if (loop_len == long_sb) {
// shuffle across 128-bit boundary manually
#ifdef WINIMP_IS_AVX16
llr_t tmp[8];
for (int i = 0; i < 8; i++) {
tmp[i] = _mm256_extract_epi16(old[i], 8);
}
#endif
#ifdef WINIMP_IS_AVX8
llr_t tmp[8];
for (int i = 0; i < 8; i++) {
tmp[i] = _mm256_extract_epi8(old[i], 16);
}
#endif
for (int i = 0; i < 8; i++) {
old[i] = simd_shuffle(old[i], move_right);
}
// last sub-block state is calculated from the trellis
llr_t trellis_old[8];
MAKE_FUNC(beta_trellis)(input, parity, long_cb, trellis_old);
for (int i = 0; i < 8; i++) {
old[i] = simd_insert(old[i], trellis_old[i], nof_blocks-1);
}
#ifdef WINIMP_IS_AVX16
for (int i = 0; i < 8; i++) {
old[i] = _mm256_insert_epi16(old[i], tmp[i], 7);
}
#endif
#ifdef WINIMP_IS_AVX8
for (int i = 0; i < 8; i++) {
old[i] = _mm256_insert_epi8(old[i], tmp[i], 15);
}
#endif
inputPtr = (simd_type_t*) &input[long_cb-nof_blocks];
appPtr = (simd_type_t*) &app[long_cb-nof_blocks];
parityPtr = (simd_type_t*) &parity[long_cb-nof_blocks];
for (int i = 0; i < 8; i++) {
simd_store(&betaPtr[8*long_sb + i], old[i]);
}
} else {
// when estimating states, just set all to unknown
for (int i = 0; i < 8; i++) {
old[i] = simd_set1(-INF);
}
inputPtr = (simd_type_t*) &input[nof_blocks*(loop_len-1)];
appPtr = (simd_type_t*) &app[nof_blocks*(loop_len-1)];
parityPtr = (simd_type_t*) &parity[nof_blocks*(loop_len-1)];
}
for (int k = loop_len - 1; k >= 0; k--) {
x = simd_load(inputPtr--);
y = simd_load(parityPtr--);
if (app) {
ap = simd_load(appPtr--);
x = simd_add(ap, x);
}
xy = simd_add(x, y);
m_b[0] = simd_add(old[4], xy);
m_b[1] = old[4];
m_b[2] = simd_add(old[5], y);
m_b[3] = simd_add(old[5], x);
m_b[4] = simd_add(old[6], x);
m_b[5] = simd_add(old[6], y);
m_b[6] = old[7];
m_b[7] = simd_add(old[7], xy);
new[0] = old[0];
new[1] = simd_add(old[0], xy);
new[2] = simd_add(old[1], x);
new[3] = simd_add(old[1], y);
new[4] = simd_add(old[2], y);
new[5] = simd_add(old[2], x);
new[6] = simd_add(old[3], xy);
new[7] = old[3];
// Calculate maximum metric
for (int i = 0; i < 8; i++) {
old[i] = simd_max(m_b[i], new[i]);
}
// Store metric only when doing the final pass
if (loop_len == long_sb) {
for (int i = 0; i < 8; i++) {
simd_store(&betaPtr[8*k + i], old[i]);
}
}
if (loop_len!=long_sb) {
debug_state_beta(0);
} else {
debug_state_beta(0);
}
// normalize
MAKE_FUNC(normalize)(k, old);
}
}
}
/* Computes alpha metrics */
static void MAKE_FUNC(alpha)(MAKE_TYPE * s, llr_t *input, llr_t *app, llr_t *parity, llr_t * output, uint32_t long_cb)
{
simd_type_t m_b[8], new[8], old[8], max1[8], max0[8];
simd_type_t x, y, xy, ap;
simd_type_t m1, m0;
simd_type_t *inputPtr;
simd_type_t *appPtr;
simd_type_t *parityPtr;
simd_type_t *betaPtr = (simd_type_t*) s->beta;
simd_type_t *outputPtr = (simd_type_t*) output;
#if debug_enabled_win
simd_type_t beta_save[8];
#endif
// Skip state 0
betaPtr+=8;
uint32_t loop_len;
for (int j=0;j<2;j++) {
// First run L states to find initial state for all sub-blocks after first
if (j==0) {
loop_len = win_overlap_len;
} else {
loop_len = long_sb;
}
// When passing through all window pick estimated initial states (known state for sb=0)
if (loop_len == long_sb) {
#ifdef WINIMP_IS_AVX16
llr_t tmp[8];
for (int i=0;i<8;i++) {
tmp[i] = _mm256_extract_epi16(old[i], 7);
}
#endif
#ifdef WINIMP_IS_AVX8
llr_t tmp[8];
for (int i=0;i<8;i++) {
tmp[i] = _mm256_extract_epi8(old[i], 15);
}
#endif
for (int i = 0; i < 8; i++) {
old[i] = simd_shuffle(old[i], move_left);
}
#ifdef WINIMP_IS_AVX16
for (int i=0;i<8;i++) {
old[i] = _mm256_insert_epi16(old[i], tmp[i], 8);
}
#endif
#ifdef WINIMP_IS_AVX8
for (int i=0;i<8;i++) {
old[i] = _mm256_insert_epi8(old[i], tmp[i], 16);
}
#endif
// 1st sub-block state is known
old[0] = simd_insert(old[0], 0, 0);
for (int i = 1; i < 8; i++) {
old[i] = simd_insert(old[i], -INF, 0);
}
} else {
// when estimating states, just set all to unknown
for (int i = 0; i < 8; i++) {
old[i] = simd_set1(-INF);
}
}
inputPtr = (simd_type_t*) &input[nof_blocks*(long_sb-loop_len)];
appPtr = (simd_type_t*) &app[nof_blocks*(long_sb-loop_len)];
parityPtr = (simd_type_t*) &parity[nof_blocks*(long_sb-loop_len)];
for (int k = 0; k < loop_len; k++) {
x = simd_load(inputPtr++);
y = simd_load(parityPtr++);
if (app) {
ap = simd_load(appPtr++);
x = simd_add(ap, x);
}
xy = simd_add(x,y);
m_b[0] = old[0];
m_b[1] = simd_add(old[3], y);
m_b[2] = simd_add(old[4], y);
m_b[3] = old[7];
m_b[4] = old[1];
m_b[5] = simd_add(old[2], y);
m_b[6] = simd_add(old[5], y);
m_b[7] = old[6];
new[0] = simd_add(old[1], xy);
new[1] = simd_add(old[2], x);
new[2] = simd_add(old[5], x);
new[3] = simd_add(old[6], xy);
new[4] = simd_add(old[0], xy);
new[5] = simd_add(old[3], x);
new[6] = simd_add(old[4], x);
new[7] = simd_add(old[7], xy);
// Load beta and compute output only when passing through all window
if (loop_len == long_sb) {
simd_type_t beta;
for (int i = 0; i < 8; i++) {
beta = simd_load(betaPtr++);
max0[i] = simd_add(beta, m_b[i]);
max1[i] = simd_add(beta, new[i]);
#if debug_enabled_win
beta_save[i] = beta;
#endif
}
m1 = simd_max(max1[0], max1[1]);
m0 = simd_max(max0[0], max0[1]);
for (int i = 2; i < 8; i++) {
m1 = simd_max(m1, max1[i]);
m0 = simd_max(m0, max0[i]);
}
simd_type_t out = simd_sub(m1, m0);
// Divide output when using 8-bit arithmetic
#ifdef divide_output
out = simd_rb_shift(out, divide_output);
#endif
simd_store(outputPtr++, out);
debug_state(0);
}
for (int i = 0; i < 8; i++) {
old[i] = simd_max(m_b[i], new[i]);
}
// normalize
MAKE_FUNC(normalize)(k, old);
if (loop_len != long_sb) {
debug_state_pre(0);
}
}
}
}
int MAKE_FUNC(init)(void **hh, uint32_t max_long_cb)
{
*hh = calloc(1, sizeof(MAKE_TYPE));
MAKE_TYPE *h = (MAKE_TYPE*) *hh;
h->beta = srslte_vec_malloc(sizeof(llr_t) * 8 * max_long_cb * nof_blocks);
if (!h->beta) {
perror("srslte_vec_malloc");
return -1;
}
h->max_long_cb = max_long_cb;
return nof_blocks;
}
void MAKE_FUNC(free)(void *hh)
{
MAKE_TYPE *h = (MAKE_TYPE*) hh;
if (h->beta) {
free(h->beta);
}
bzero(h, sizeof(MAKE_TYPE));
}
void MAKE_FUNC(dec)(void *hh, llr_t *input, llr_t *app, llr_t *parity, llr_t *output, uint32_t long_cb)
{
MAKE_TYPE *h = (MAKE_TYPE*) hh;
MAKE_FUNC(beta)(h, input, app, parity, long_cb);
MAKE_FUNC(alpha)(h, input, app, parity, output, long_cb);
#if debug_enabled_win
printf("running win decoder: %s\n", STRING(WINIMP));
#endif
}
#define INSERT8_INPUT(reg, st, off) reg = simd_insert(reg, input[3*(i+(st+0)*long_sb)+off], st+0);\
reg = simd_insert(reg, input[3*(i+(st+1)*long_sb)+off], st+1);\
reg = simd_insert(reg, input[3*(i+(st+2)*long_sb)+off], st+2);\
reg = simd_insert(reg, input[3*(i+(st+3)*long_sb)+off], st+3);\
reg = simd_insert(reg, input[3*(i+(st+4)*long_sb)+off], st+4);\
reg = simd_insert(reg, input[3*(i+(st+5)*long_sb)+off], st+5);\
reg = simd_insert(reg, input[3*(i+(st+6)*long_sb)+off], st+6);\
reg = simd_insert(reg, input[3*(i+(st+7)*long_sb)+off], st+7);
void MAKE_FUNC(extract_input)(llr_t *input, llr_t *systematic, llr_t *app2, llr_t *parity_0, llr_t *parity_1, uint32_t long_cb)
{
simd_type_t *systPtr = (simd_type_t*) systematic;
simd_type_t *parity0Ptr = (simd_type_t*) parity_0;
simd_type_t *parity1Ptr = (simd_type_t*) parity_1;
simd_type_t syst, parity0, parity1;
for (int i=0;i<long_sb;i++) {
INSERT8_INPUT(syst, 0, 0);
INSERT8_INPUT(parity0, 0, 1);
INSERT8_INPUT(parity1, 0, 2);
#if nof_blocks >= 16
INSERT8_INPUT(syst, 8, 0);
INSERT8_INPUT(parity0, 8, 1);
INSERT8_INPUT(parity1, 8, 2);
#endif
#if nof_blocks >= 32
INSERT8_INPUT(syst, 16, 0);
INSERT8_INPUT(parity0, 16, 1);
INSERT8_INPUT(parity1, 16, 2);
INSERT8_INPUT(syst, 24, 0);
INSERT8_INPUT(parity0, 24, 1);
INSERT8_INPUT(parity1, 24, 2);
#endif
simd_store(systPtr++, syst);
simd_store(parity0Ptr++, parity0);
simd_store(parity1Ptr++, parity1);
}
for (int i = long_cb; i < long_cb + 3; i++) {
systematic[i] = input[3*long_cb + 2*(i - long_cb)];
parity_0[i] = input[3*long_cb + 2*(i - long_cb) + 1];
app2[i] = input[3*long_cb + 6 + 2*(i - long_cb)];
parity_1[i] = input[3*long_cb + 6 + 2*(i - long_cb) + 1];
}
}
#define deinter(x,win) ((x%(long_cb/win))*(win)+x/(long_cb/win))
#define reset_cnt(a,b) if(!((a+1)%b)) { \
k+=b*nof_blocks; \
if (k >= long_cb) { \
k -= (long_cb-1);\
}\
}
#define insert_bit(a,b) ap = _mm_insert_epi16(ap, app1[k+(a%b)*nof_blocks], 7-a); \
reset_cnt(a,b); \
#define decide_for(b) for (uint32_t i = 0; i < long_cb/8; i++) { \
insert_bit(0,b);\
insert_bit(1,b);\
insert_bit(2,b);\
insert_bit(3,b);\
insert_bit(4,b);\
insert_bit(5,b);\
insert_bit(6,b);\
insert_bit(7,b);\
output[i] = (uint8_t) _mm_movemask_epi8(_mm_cmpgt_epi8(_mm_packs_epi16(ap,zeros),zeros));\
}
/* No improvement to use AVX here */
void MAKE_FUNC(decision_byte)(llr_t *app1, uint8_t *output, uint32_t long_cb)
{
uint32_t k=0;
__m128i zeros = _mm_setzero_si128();
__m128i ap;
if ((long_cb%(nof_blocks*8)) == 0) {
decide_for(8);
} else if ((long_cb%(nof_blocks*4)) == 0) {
decide_for(4);
} else if ((long_cb%(nof_blocks*2)) == 0) {
decide_for(2);
} else {
decide_for(1);
}
}
#undef WINIMP
#undef nof_blocks
#undef llr_t
#undef normalize_period
#undef INF
#undef win_overlap_len
#undef simd_type_t
#undef simd_load
#undef simd_store
#undef simd_add
#undef simd_sub
#undef simd_max
#undef simd_set1
#undef simd_insert
#undef simd_shuffle
#undef move_right
#undef move_left
#undef debug_enabled_win
#ifdef normalize_max
#undef normalize_max
#endif
#ifdef use_saturated_add
#undef use_saturated_add
#endif
#ifdef simd_rb_shift
#undef simd_rb_shift
#endif
#ifdef divide_output
#undef divide_output
#endif

@ -53,4 +53,9 @@ SRSLTE_API int srslte_demod_soft_demodulate_s(srslte_mod_t modulation,
short* llr, short* llr,
int nsymbols); int nsymbols);
SRSLTE_API int srslte_demod_soft_demodulate_b(srslte_mod_t modulation,
const cf_t* symbols,
int8_t* llr,
int nsymbols);
#endif // SRSLTE_DEMOD_SOFT_H #endif // SRSLTE_DEMOD_SOFT_H

@ -65,6 +65,8 @@ typedef struct SRSLTE_API {
uint16_t ue_rnti; uint16_t ue_rnti;
bool is_ue; bool is_ue;
bool llr_is_8bit;
/* Power allocation parameter 3GPP 36.213 Clause 5.2 Rho_b */ /* Power allocation parameter 3GPP 36.213 Clause 5.2 Rho_b */
float rho_a; float rho_a;

@ -74,6 +74,8 @@ typedef struct SRSLTE_API {
uint16_t ue_rnti; uint16_t ue_rnti;
uint32_t max_re; uint32_t max_re;
bool llr_is_8bit;
srslte_dft_precoding_t dft_precoding; srslte_dft_precoding_t dft_precoding;
/* buffers */ /* buffers */

@ -59,6 +59,8 @@ typedef struct SRSLTE_API {
uint32_t max_iterations; uint32_t max_iterations;
uint32_t nof_iterations; uint32_t nof_iterations;
bool llr_is_8bit;
/* buffers */ /* buffers */
uint8_t *cb_in; uint8_t *cb_in;
uint8_t *parity_bits; uint8_t *parity_bits;

@ -68,6 +68,11 @@ SRSLTE_API void srslte_scrambling_s_offset(srslte_sequence_t *s,
int offset, int offset,
int len); int len);
SRSLTE_API void srslte_scrambling_sb_offset(srslte_sequence_t *s,
int8_t *data,
int offset,
int len);
SRSLTE_API void srslte_scrambling_c(srslte_sequence_t *s, SRSLTE_API void srslte_scrambling_c(srslte_sequence_t *s,
cf_t *data); cf_t *data);

@ -100,6 +100,7 @@
#define SRSLTE_SIMD_I_SIZE 16 #define SRSLTE_SIMD_I_SIZE 16
#define SRSLTE_SIMD_B_SIZE 64
#define SRSLTE_SIMD_S_SIZE 32 #define SRSLTE_SIMD_S_SIZE 32
#define SRSLTE_SIMD_C16_SIZE 0 #define SRSLTE_SIMD_C16_SIZE 0
@ -111,6 +112,7 @@
#define SRSLTE_SIMD_I_SIZE 8 #define SRSLTE_SIMD_I_SIZE 8
#define SRSLTE_SIMD_B_SIZE 32
#define SRSLTE_SIMD_S_SIZE 16 #define SRSLTE_SIMD_S_SIZE 16
#define SRSLTE_SIMD_C16_SIZE 16 #define SRSLTE_SIMD_C16_SIZE 16
@ -122,6 +124,7 @@
#define SRSLTE_SIMD_I_SIZE 4 #define SRSLTE_SIMD_I_SIZE 4
#define SRSLTE_SIMD_B_SIZE 16
#define SRSLTE_SIMD_S_SIZE 8 #define SRSLTE_SIMD_S_SIZE 8
#define SRSLTE_SIMD_C16_SIZE 8 #define SRSLTE_SIMD_C16_SIZE 8
@ -132,7 +135,7 @@
#define SRSLTE_SIMD_CF_SIZE 4 #define SRSLTE_SIMD_CF_SIZE 4
#define SRSLTE_SIMD_I_SIZE 4 #define SRSLTE_SIMD_I_SIZE 4
#define SRSLTE_SIMD_B_SIZE 16
#define SRSLTE_SIMD_S_SIZE 8 #define SRSLTE_SIMD_S_SIZE 8
#define SRSLTE_SIMD_C16_SIZE 8 #define SRSLTE_SIMD_C16_SIZE 8
@ -141,7 +144,7 @@
#define SRSLTE_SIMD_CF_SIZE 0 #define SRSLTE_SIMD_CF_SIZE 0
#define SRSLTE_SIMD_I_SIZE 0 #define SRSLTE_SIMD_I_SIZE 0
#define SRSLTE_SIMD_B_SIZE 0
#define SRSLTE_SIMD_S_SIZE 0 #define SRSLTE_SIMD_S_SIZE 0
#define SRSLTE_SIMD_C16_SIZE 0 #define SRSLTE_SIMD_C16_SIZE 0
@ -1336,6 +1339,24 @@ static inline simd_s_t srslte_simd_s_mul(simd_s_t a, simd_s_t b) {
#endif /* LV_HAVE_AVX512 */ #endif /* LV_HAVE_AVX512 */
} }
static inline simd_s_t srslte_simd_s_neg(simd_s_t a, simd_s_t b) {
#ifdef LV_HAVE_AVX512
#error sign instruction not available in avx512
#else /* LV_HAVE_AVX512 */
#ifdef LV_HAVE_AVX2
return _mm256_sign_epi16(a, b);
#else /* LV_HAVE_AVX2 */
#ifdef LV_HAVE_SSE
return _mm_sign_epi16(a, b);
#else /* LV_HAVE_SSE */
#ifdef HAVE_NEON
#error sign instruction not available in Neon
#endif /* HAVE_NEON */
#endif /* LV_HAVE_SSE */
#endif /* LV_HAVE_AVX2 */
#endif /* LV_HAVE_AVX512 */
}
static inline simd_s_t srslte_simd_s_add(simd_s_t a, simd_s_t b) { static inline simd_s_t srslte_simd_s_add(simd_s_t a, simd_s_t b) {
#ifdef LV_HAVE_AVX512 #ifdef LV_HAVE_AVX512
return _mm512_add_epi16(a, b); return _mm512_add_epi16(a, b);
@ -1681,7 +1702,7 @@ typedef int8x16_t simd_b_t;
static inline simd_b_t srslte_simd_b_load(int8_t *ptr){ static inline simd_b_t srslte_simd_b_load(const int8_t *ptr){
#ifdef LV_HAVE_AVX512 #ifdef LV_HAVE_AVX512
return _mm512_load_si512(ptr); return _mm512_load_si512(ptr);
#else /* LV_HAVE_AVX512 */ #else /* LV_HAVE_AVX512 */
@ -1699,7 +1720,7 @@ static inline simd_b_t srslte_simd_b_load(int8_t *ptr){
#endif /* LV_HAVE_AVX512 */ #endif /* LV_HAVE_AVX512 */
} }
static inline simd_b_t srslte_simd_b_loadu(int8_t *ptr){ static inline simd_b_t srslte_simd_b_loadu(const int8_t *ptr){
#ifdef LV_HAVE_AVX512 #ifdef LV_HAVE_AVX512
return _mm512_loadu_si512(ptr); return _mm512_loadu_si512(ptr);
#else /* LV_HAVE_AVX512 */ #else /* LV_HAVE_AVX512 */
@ -1773,6 +1794,44 @@ static inline simd_b_t srslte_simd_b_xor(simd_b_t a, simd_b_t b) {
#endif /* LV_HAVE_AVX512 */ #endif /* LV_HAVE_AVX512 */
} }
static inline simd_s_t srslte_simd_b_sub(simd_s_t a, simd_s_t b) {
#ifdef LV_HAVE_AVX512
return _mm512_subs_epi8(a, b);
#else /* LV_HAVE_AVX512 */
#ifdef LV_HAVE_AVX2
return _mm256_subs_epi8(a, b);
#else /* LV_HAVE_AVX2 */
#ifdef LV_HAVE_SSE
return _mm_subs_epi8(a, b);
#else /* LV_HAVE_SSE */
#ifdef HAVE_NEON
return vsubqs_s8(a, b);
#endif /* HAVE_NEON */
#endif /* LV_HAVE_SSE */
#endif /* LV_HAVE_AVX2 */
#endif /* LV_HAVE_AVX512 */
}
static inline simd_s_t srslte_simd_b_neg(simd_b_t a, simd_b_t b) {
#ifdef LV_HAVE_AVX512
#error sign instruction not available in avx512
#else /* LV_HAVE_AVX512 */
#ifdef LV_HAVE_AVX2
return _mm256_sign_epi8(a, b);
#else /* LV_HAVE_AVX2 */
#ifdef LV_HAVE_SSE
return _mm_sign_epi8(a, b);
#else /* LV_HAVE_SSE */
#ifdef HAVE_NEON
#error sign instruction not available in Neon
#endif /* HAVE_NEON */
#endif /* LV_HAVE_SSE */
#endif /* LV_HAVE_AVX2 */
#endif /* LV_HAVE_AVX512 */
}
#endif /*SRSLTE_SIMD_B_SIZE */ #endif /*SRSLTE_SIMD_B_SIZE */

@ -69,6 +69,7 @@ SRSLTE_API void *srslte_vec_realloc(void *ptr, uint32_t old_size, uint32_t new_s
SRSLTE_API void srslte_vec_fprint_c(FILE *stream, cf_t *x, const uint32_t len); SRSLTE_API void srslte_vec_fprint_c(FILE *stream, cf_t *x, const uint32_t len);
SRSLTE_API void srslte_vec_fprint_f(FILE *stream, float *x, const uint32_t len); SRSLTE_API void srslte_vec_fprint_f(FILE *stream, float *x, const uint32_t len);
SRSLTE_API void srslte_vec_fprint_b(FILE *stream, uint8_t *x, const uint32_t len); SRSLTE_API void srslte_vec_fprint_b(FILE *stream, uint8_t *x, const uint32_t len);
SRSLTE_API void srslte_vec_fprint_bs(FILE *stream, int8_t *x, const uint32_t len);
SRSLTE_API void srslte_vec_fprint_byte(FILE *stream, uint8_t *x, const uint32_t len); SRSLTE_API void srslte_vec_fprint_byte(FILE *stream, uint8_t *x, const uint32_t len);
SRSLTE_API void srslte_vec_fprint_i(FILE *stream, int *x, const uint32_t len); SRSLTE_API void srslte_vec_fprint_i(FILE *stream, int *x, const uint32_t len);
SRSLTE_API void srslte_vec_fprint_s(FILE *stream, short *x, const uint32_t len); SRSLTE_API void srslte_vec_fprint_s(FILE *stream, short *x, const uint32_t len);
@ -82,12 +83,13 @@ SRSLTE_API void srslte_vec_load_file(char *filename, void *buffer, const uint32_
/* sum two vectors */ /* sum two vectors */
SRSLTE_API void srslte_vec_sum_fff(const float *x, const float *y, float *z, const uint32_t len); SRSLTE_API void srslte_vec_sum_fff(const float *x, const float *y, float *z, const uint32_t len);
SRSLTE_API void srslte_vec_sum_ccc(const cf_t *x, const cf_t *y, cf_t *z, const uint32_t len); SRSLTE_API void srslte_vec_sum_ccc(const cf_t *x, const cf_t *y, cf_t *z, const uint32_t len);
SRSLTE_API void srslte_vec_sub_sss(const int16_t *x, const int16_t *y, int16_t *z, const uint32_t len);
SRSLTE_API void srslte_vec_sum_sss(const int16_t *x, const int16_t *y, int16_t *z, const uint32_t len); SRSLTE_API void srslte_vec_sum_sss(const int16_t *x, const int16_t *y, int16_t *z, const uint32_t len);
/* substract two vectors z=x-y */ /* substract two vectors z=x-y */
SRSLTE_API void srslte_vec_sub_fff(const float *x, const float *y, float *z, const uint32_t len); SRSLTE_API void srslte_vec_sub_fff(const float *x, const float *y, float *z, const uint32_t len);
SRSLTE_API void srslte_vec_sub_ccc(const cf_t *x, const cf_t *y, cf_t *z, const uint32_t len); SRSLTE_API void srslte_vec_sub_ccc(const cf_t *x, const cf_t *y, cf_t *z, const uint32_t len);
SRSLTE_API void srslte_vec_sub_sss(const int16_t *x, const int16_t *y, int16_t *z, const uint32_t len);
SRSLTE_API void srslte_vec_sub_bbb(const int8_t *x, const int8_t *y, int8_t *z, const uint32_t len);
/* scalar product */ /* scalar product */
SRSLTE_API void srslte_vec_sc_prod_cfc(const cf_t *x, const float h, cf_t *z, const uint32_t len); SRSLTE_API void srslte_vec_sc_prod_cfc(const cf_t *x, const float h, cf_t *z, const uint32_t len);
@ -97,8 +99,10 @@ SRSLTE_API void srslte_vec_sc_prod_fff(const float *x, const float h, float *z,
SRSLTE_API void srslte_vec_convert_fi(const float *x, const float scale, int16_t *z, const uint32_t len); SRSLTE_API void srslte_vec_convert_fi(const float *x, const float scale, int16_t *z, const uint32_t len);
SRSLTE_API void srslte_vec_convert_if(const int16_t *x, const float scale, float *z, const uint32_t len); SRSLTE_API void srslte_vec_convert_if(const int16_t *x, const float scale, float *z, const uint32_t len);
SRSLTE_API void srslte_vec_convert_fb(const float *x, const float scale, int8_t *z, const uint32_t len);
SRSLTE_API void srslte_vec_lut_sss(const short *x, const unsigned short *lut, short *y, const uint32_t len); SRSLTE_API void srslte_vec_lut_sss(const short *x, const unsigned short *lut, short *y, const uint32_t len);
SRSLTE_API void srslte_vec_lut_bbb(const int8_t *x, const unsigned short *lut, int8_t *y, const uint32_t len);
SRSLTE_API void srslte_vec_lut_sis(const short *x, const unsigned int *lut, short *y, const uint32_t len); SRSLTE_API void srslte_vec_lut_sis(const short *x, const unsigned int *lut, short *y, const uint32_t len);
/* vector product (element-wise) */ /* vector product (element-wise) */
@ -115,6 +119,10 @@ SRSLTE_API void srslte_vec_prod_conj_ccc(const cf_t *x, const cf_t *y, cf_t *z,
SRSLTE_API void srslte_vec_prod_fff(const float *x, const float *y, float *z, const uint32_t len); SRSLTE_API void srslte_vec_prod_fff(const float *x, const float *y, float *z, const uint32_t len);
SRSLTE_API void srslte_vec_prod_sss(const int16_t *x, const int16_t *y, int16_t *z, const uint32_t len); SRSLTE_API void srslte_vec_prod_sss(const int16_t *x, const int16_t *y, int16_t *z, const uint32_t len);
// Negate sign (scrambling)
SRSLTE_API void srslte_vec_neg_sss(const int16_t *x, const int16_t *y, int16_t *z, const uint32_t len);
SRSLTE_API void srslte_vec_neg_bbb(const int8_t *x, const int8_t *y, int8_t *z, const uint32_t len);
/* Dot-product */ /* Dot-product */
SRSLTE_API cf_t srslte_vec_dot_prod_cfc(const cf_t *x, const float *y, const uint32_t len); SRSLTE_API cf_t srslte_vec_dot_prod_cfc(const cf_t *x, const float *y, const uint32_t len);
SRSLTE_API cf_t srslte_vec_dot_prod_ccc(const cf_t *x, const cf_t *y, const uint32_t len); SRSLTE_API cf_t srslte_vec_dot_prod_ccc(const cf_t *x, const cf_t *y, const uint32_t len);

@ -62,6 +62,8 @@ SRSLTE_API void srslte_vec_sum_sss_simd(const int16_t *x, const int16_t *y, int1
SRSLTE_API void srslte_vec_sub_sss_simd(const int16_t *x, const int16_t *y, int16_t *z, int len); SRSLTE_API void srslte_vec_sub_sss_simd(const int16_t *x, const int16_t *y, int16_t *z, int len);
SRSLTE_API void srslte_vec_sub_bbb_simd(const int8_t *x, const int8_t *y, int8_t *z, int len);
SRSLTE_API float srslte_vec_acc_ff_simd(const float *x, int len); SRSLTE_API float srslte_vec_acc_ff_simd(const float *x, int len);
SRSLTE_API cf_t srslte_vec_acc_cc_simd(const cf_t *x, int len); SRSLTE_API cf_t srslte_vec_acc_cc_simd(const cf_t *x, int len);
@ -86,6 +88,10 @@ SRSLTE_API void srslte_vec_prod_ccc_c16_simd(const int16_t *a_re, const int16_t
SRSLTE_API void srslte_vec_prod_sss_simd(const int16_t *x, const int16_t *y, int16_t *z, const int len); SRSLTE_API void srslte_vec_prod_sss_simd(const int16_t *x, const int16_t *y, int16_t *z, const int len);
SRSLTE_API void srslte_vec_neg_sss_simd(const int16_t *x, const int16_t *y, int16_t *z, const int len);
SRSLTE_API void srslte_vec_neg_bbb_simd(const int8_t *x, const int8_t *y, int8_t *z, const int len);
SRSLTE_API void srslte_vec_prod_cfc_simd(const cf_t *x, const float *y, cf_t *z, const int len); SRSLTE_API void srslte_vec_prod_cfc_simd(const cf_t *x, const float *y, cf_t *z, const int len);
SRSLTE_API void srslte_vec_prod_fff_simd(const float *x, const float *y, float *z, const int len); SRSLTE_API void srslte_vec_prod_fff_simd(const float *x, const float *y, float *z, const int len);
@ -120,10 +126,14 @@ SRSLTE_API void srslte_vec_abs_square_cf_simd(const cf_t *x, float *z, const int
/* Other Functions */ /* Other Functions */
SRSLTE_API void srslte_vec_lut_sss_simd(const short *x, const unsigned short *lut, short *y, const int len); SRSLTE_API void srslte_vec_lut_sss_simd(const short *x, const unsigned short *lut, short *y, const int len);
SRSLTE_API void srslte_vec_lut_bbb_simd(const int8_t *x, const unsigned short *lut, int8_t *y, const int len);
SRSLTE_API void srslte_vec_convert_if_simd(const int16_t *x, float *z, const float scale, const int len); SRSLTE_API void srslte_vec_convert_if_simd(const int16_t *x, float *z, const float scale, const int len);
SRSLTE_API void srslte_vec_convert_fi_simd(const float *x, int16_t *z, const float scale, const int len); SRSLTE_API void srslte_vec_convert_fi_simd(const float *x, int16_t *z, const float scale, const int len);
SRSLTE_API void srslte_vec_convert_fb_simd(const float *x, int8_t *z, const float scale, const int len);
SRSLTE_API void srslte_vec_cp_simd(const cf_t *src, cf_t *dst, int len); SRSLTE_API void srslte_vec_cp_simd(const cf_t *src, cf_t *dst, int len);
SRSLTE_API void srslte_vec_interleave_simd(const cf_t *x, const cf_t *y, cf_t *z, const int len); SRSLTE_API void srslte_vec_interleave_simd(const cf_t *x, const cf_t *y, cf_t *z, const int len);

@ -53,6 +53,7 @@ bool threads_new_rt_cpu(pthread_t *thread, void *(*start_routine) (void*), void
cpu_set_t cpuset; cpu_set_t cpuset;
bool attr_enable = false; bool attr_enable = false;
#ifdef PER_THREAD_PRIO
if (prio_offset >= 0) { if (prio_offset >= 0) {
param.sched_priority = sched_get_priority_max(SCHED_FIFO) - prio_offset; param.sched_priority = sched_get_priority_max(SCHED_FIFO) - prio_offset;
pthread_attr_init(&attr); pthread_attr_init(&attr);
@ -82,6 +83,25 @@ bool threads_new_rt_cpu(pthread_t *thread, void *(*start_routine) (void*), void
} }
attr_enable = true; attr_enable = true;
} else if (prio_offset == -2) { } else if (prio_offset == -2) {
#else
// All threads have normal priority except prio_offset=0,1,2,3,4
if (prio_offset >= 0 && prio_offset < 5) {
param.sched_priority = 50;
pthread_attr_init(&attr);
if (pthread_attr_setinheritsched(&attr, PTHREAD_EXPLICIT_SCHED)) {
perror("pthread_attr_setinheritsched");
}
if (pthread_attr_setschedpolicy(&attr, SCHED_FIFO)) {
perror("pthread_attr_setschedpolicy");
}
if (pthread_attr_setschedparam(&attr, &param)) {
perror("pthread_attr_setschedparam");
fprintf(stderr, "Error not enough privileges to set Scheduling priority\n");
}
attr_enable = true;
} else {
#endif
param.sched_priority = 0; param.sched_priority = 0;
pthread_attr_init(&attr); pthread_attr_init(&attr);
if (pthread_attr_setinheritsched(&attr, PTHREAD_EXPLICIT_SCHED)) { if (pthread_attr_setinheritsched(&attr, PTHREAD_EXPLICIT_SCHED)) {

@ -139,6 +139,7 @@ int srslte_sequence_LTE_pr(srslte_sequence_t *q, uint32_t len, uint32_t seed) {
for (int i=0;i<len;i++) { for (int i=0;i<len;i++) {
q->c_float[i] = (1-2*q->c[i]); q->c_float[i] = (1-2*q->c[i]);
q->c_short[i] = (int16_t) q->c_float[i]; q->c_short[i] = (int16_t) q->c_float[i];
q->c_char[i] = (int8_t) q->c_float[i];;
} }
return SRSLTE_SUCCESS; return SRSLTE_SUCCESS;
} }
@ -164,6 +165,10 @@ int srslte_sequence_init(srslte_sequence_t *q, uint32_t len) {
if (!q->c_short) { if (!q->c_short) {
return SRSLTE_ERROR; return SRSLTE_ERROR;
} }
q->c_char = srslte_vec_malloc(len * sizeof(int8_t));
if (!q->c_char) {
return SRSLTE_ERROR;
}
q->max_len = len; q->max_len = len;
} }
return SRSLTE_SUCCESS; return SRSLTE_SUCCESS;
@ -182,6 +187,9 @@ void srslte_sequence_free(srslte_sequence_t *q) {
if (q->c_short) { if (q->c_short) {
free(q->c_short); free(q->c_short);
} }
if (q->c_char) {
free(q->c_char);
}
bzero(q, sizeof(srslte_sequence_t)); bzero(q, sizeof(srslte_sequence_t));
} }

@ -43,13 +43,15 @@
#endif #endif
#ifdef LV_HAVE_SSE #ifdef LV_HAVE_SSE
#include <smmintrin.h> #include <x86intrin.h>
int srslte_rm_turbo_rx_lut_sse(int16_t *input, int16_t *output, uint32_t in_len, uint32_t cb_idx, uint32_t rv_idx); int srslte_rm_turbo_rx_lut_sse(int16_t *input, int16_t *output, uint16_t *deinter, uint32_t in_len, uint32_t cb_idx, uint32_t rv_idx);
int srslte_rm_turbo_rx_lut_sse_8bit(int8_t *input, int8_t *output, uint16_t *deinter, uint32_t in_len, uint32_t cb_idx, uint32_t rv_idx);
#endif #endif
#ifdef LV_HAVE_AVX #ifdef LV_HAVE_AVX
#include <immintrin.h> #include <x86intrin.h>
int srslte_rm_turbo_rx_lut_avx(int16_t *input, int16_t *output, uint32_t in_len, uint32_t cb_idx, uint32_t rv_idx); int srslte_rm_turbo_rx_lut_avx(int16_t *input, int16_t *output, uint16_t *deinter, uint32_t in_len, uint32_t cb_idx, uint32_t rv_idx);
int srslte_rm_turbo_rx_lut_avx_8bit(int8_t *input, int8_t *output, uint16_t *deinter, uint32_t in_len, uint32_t cb_idx, uint32_t rv_idx);
#endif #endif
#define NCOLS 32 #define NCOLS 32
@ -69,9 +71,28 @@ static int k0_vec[SRSLTE_NOF_TC_CB_SIZES][4][2];
static bool rm_turbo_tables_generated = false; static bool rm_turbo_tables_generated = false;
// Store deinterleaver version for sub-block turbo decoder
#if SRSLTE_TDEC_EXPECT_INPUT_SB == 1
// Prepare bit for sub-block decoder processing. These are the nof subblock sizes
#define NOF_DEINTER_TABLE_SB_IDX 3
const static int deinter_table_sb_idx[NOF_DEINTER_TABLE_SB_IDX] = {8, 16, 32};
int deinter_table_idx_from_sb_len(uint32_t nof_subblocks) {
for (int i=0;i<NOF_DEINTER_TABLE_SB_IDX;i++) {
if (deinter_table_sb_idx[i] == nof_subblocks) {
return i;
}
}
if (nof_subblocks != 0) {
fprintf(stderr, "Error number of sub-blocks %d not supported in RM\n", nof_subblocks);
}
return -1;
}
static uint16_t deinterleaver_sb[NOF_DEINTER_TABLE_SB_IDX][192][4][18448];
#endif
static uint16_t temp_table1[3*6176], temp_table2[3*6176]; static uint16_t temp_table1[3*6176], temp_table2[3*6176];
void srslte_rm_turbo_gentable_systematic(uint16_t *table_bits, int k0_vec[4][2], uint32_t nrows, int ndummy) { static void srslte_rm_turbo_gentable_systematic(uint16_t *table_bits, int k0_vec[4][2], uint32_t nrows, int ndummy) {
bool last_is_null=true; bool last_is_null=true;
int k_b=0, buff_idx=0; int k_b=0, buff_idx=0;
@ -96,7 +117,7 @@ void srslte_rm_turbo_gentable_systematic(uint16_t *table_bits, int k0_vec[4][2],
} }
} }
void srslte_rm_turbo_gentable_parity(uint16_t *table_parity, int k0_vec[4][2], int offset, uint16_t nrows, int ndummy) { static void srslte_rm_turbo_gentable_parity(uint16_t *table_parity, int k0_vec[4][2], int offset, uint16_t nrows, int ndummy) {
bool last_is_null=true; bool last_is_null=true;
int k_b=0, buff_idx0=0; int k_b=0, buff_idx0=0;
@ -140,9 +161,7 @@ void srslte_rm_turbo_gentable_parity(uint16_t *table_parity, int k0_vec[4][2], i
} }
} }
static void srslte_rm_turbo_gentable_receive(uint16_t *table, uint32_t cb_len, uint32_t rv_idx)
void srslte_rm_turbo_gentable_receive(uint16_t *table, uint32_t cb_len, uint32_t rv_idx)
{ {
int nrows = (uint32_t) (cb_len / 3 - 1) / NCOLS + 1; int nrows = (uint32_t) (cb_len / 3 - 1) / NCOLS + 1;
@ -217,6 +236,33 @@ void srslte_rm_turbo_gentable_receive(uint16_t *table, uint32_t cb_len, uint32_t
table[i] = temp_table2[temp_table1[i]]; table[i] = temp_table2[temp_table1[i]];
} }
} }
#if SRSLTE_TDEC_EXPECT_INPUT_SB==1
#define inter(x,win) ((x%(long_cb/win))*(win)+x/(long_cb/win))
/* Prepare output for sliding window decoder:
* (0..long_cb-1) bits are systematic
* (long_cb..2*long_cb-1) are parity0
* (2*long_cb..3*long_cb-1) are parity1
* then tail bits
*
* Within each block, bits are interleaved every nof_sb
*/
static void interleave_table_sb(uint16_t *in, uint16_t *out, uint32_t cb_idx, uint32_t nof_sb)
{
int long_cb = srslte_cbsegm_cbsize(cb_idx);
int out_len = 3*long_cb+12;
for (int i=0;i<out_len;i++) {
// Do not change tail bit order
if (in[i] < 3*long_cb) {
// align to 32 bytes (warning: must be same alignment as in rm_turbo.c)
out[i] = (in[i]%3)*(long_cb+32)+inter(in[i]/3,nof_sb);
} else {
out[i] = (in[i]-3*long_cb)+3*(long_cb+32);
}
}
}
#endif
void srslte_rm_turbo_gentables() { void srslte_rm_turbo_gentables() {
if (!rm_turbo_tables_generated) { if (!rm_turbo_tables_generated) {
@ -246,6 +292,13 @@ void srslte_rm_turbo_gentables() {
for (int i = 0; i < 4; i++) { for (int i = 0; i < 4; i++) {
srslte_rm_turbo_gentable_receive(deinterleaver[cb_idx][i], in_len, i); srslte_rm_turbo_gentable_receive(deinterleaver[cb_idx][i], in_len, i);
#if SRSLTE_TDEC_EXPECT_INPUT_SB == 1
for (uint32_t s = 0; s < NOF_DEINTER_TABLE_SB_IDX; s++) {
interleave_table_sb(deinterleaver[cb_idx][i], deinterleaver_sb[s][cb_idx][i], cb_idx,
deinter_table_sb_idx[s]);
}
#endif
} }
} }
} }
@ -322,6 +375,10 @@ int srslte_rm_turbo_tx_lut(uint8_t *w_buff, uint8_t *systematic, uint8_t *parity
} }
} }
int srslte_rm_turbo_rx_lut(int16_t *input, int16_t *output, uint32_t in_len, uint32_t cb_idx, uint32_t rv_idx)
{
return srslte_rm_turbo_rx_lut_(input, output, in_len, cb_idx, rv_idx, true);
}
/** /**
* Undoes rate matching for LTE Turbo Coder. Expands rate matched buffer to full size buffer. * Undoes rate matching for LTE Turbo Coder. Expands rate matched buffer to full size buffer.
* *
@ -331,37 +388,93 @@ int srslte_rm_turbo_tx_lut(uint8_t *w_buff, uint8_t *systematic, uint8_t *parity
* @param[in] rv_idx Redundancy Version from DCI control message * @param[in] rv_idx Redundancy Version from DCI control message
* @return Error code * @return Error code
*/ */
int srslte_rm_turbo_rx_lut(int16_t *input, int16_t *output, uint32_t in_len, uint32_t cb_idx, uint32_t rv_idx) int srslte_rm_turbo_rx_lut_(int16_t *input, int16_t *output, uint32_t in_len, uint32_t cb_idx, uint32_t rv_idx, bool enable_input_tdec)
{ {
if (rv_idx < 4 && cb_idx < SRSLTE_NOF_TC_CB_SIZES) {
#if SRSLTE_TDEC_EXPECT_INPUT_SB == 1
int cb_len=srslte_cbsegm_cbsize(cb_idx);
int idx = deinter_table_idx_from_sb_len(srslte_tdec_autoimp_get_subblocks(cb_len));
uint16_t *deinter = NULL;
if (idx < 0 || !enable_input_tdec) {
deinter = deinterleaver[cb_idx][rv_idx];
} else if (idx < NOF_DEINTER_TABLE_SB_IDX) {
deinter = deinterleaver_sb[idx][cb_idx][rv_idx];
} else {
fprintf(stderr, "Sub-block size index %d not supported in srslte_rm_turbo_rx_lut()\n", idx);
return -1;
}
#else
uint16_t *deinter = deinterleaver[cb_idx][rv_idx];
#endif
#ifdef LV_HAVE_AVX #ifdef LV_HAVE_AVX
return srslte_rm_turbo_rx_lut_avx(input, output, in_len, cb_idx, rv_idx); return srslte_rm_turbo_rx_lut_avx(input, output, deinter, in_len, cb_idx, rv_idx);
#else #else
#ifdef LV_HAVE_SSE #ifdef LV_HAVE_SSE
return srslte_rm_turbo_rx_lut_sse(input, output, in_len, cb_idx, rv_idx); return srslte_rm_turbo_rx_lut_sse(input, output, deinter, in_len, cb_idx, rv_idx);
#else #else
if (rv_idx < 4 && cb_idx < SRSLTE_NOF_TC_CB_SIZES) {
uint32_t out_len = 3*srslte_cbsegm_cbsize(cb_idx)+12; uint32_t out_len = 3*srslte_cbsegm_cbsize(cb_idx)+12;
uint16_t *deinter = deinterleaver[cb_idx][rv_idx];
for (int i=0;i<in_len;i++) { for (int i=0;i<in_len;i++) {
output[deinter[i%out_len]] += input[i]; output[deinter[i%out_len]] += input[i];
} }
return 0; return 0;
#endif
#endif
} else { } else {
printf("Invalid inputs rv_idx=%d, cb_idx=%d\n", rv_idx, cb_idx); printf("Invalid inputs rv_idx=%d, cb_idx=%d\n", rv_idx, cb_idx);
return SRSLTE_ERROR_INVALID_INPUTS; return SRSLTE_ERROR_INVALID_INPUTS;
} }
}
int srslte_rm_turbo_rx_lut_8bit(int8_t *input, int8_t *output, uint32_t in_len, uint32_t cb_idx, uint32_t rv_idx)
{
if (rv_idx < 4 && cb_idx < SRSLTE_NOF_TC_CB_SIZES) {
#if SRSLTE_TDEC_EXPECT_INPUT_SB == 1
int cb_len=srslte_cbsegm_cbsize(cb_idx);
int idx = deinter_table_idx_from_sb_len(srslte_tdec_autoimp_get_subblocks_8bit(cb_len));
uint16_t *deinter = NULL;
if (idx < 0) {
deinter = deinterleaver[cb_idx][rv_idx];
} else if (idx < NOF_DEINTER_TABLE_SB_IDX) {
deinter = deinterleaver_sb[idx][cb_idx][rv_idx];
} else {
fprintf(stderr, "Sub-block size index %d not supported in srslte_rm_turbo_rx_lut()\n", idx);
return -1;
}
#else
uint16_t *deinter = deinterleaver[cb_idx][rv_idx];
#endif #endif
// FIXME: AVX version of rm_turbo_rx_lut not working
// Warning: Need to check if 8-bit sse version is correct
#ifdef LV_HAVE_SSE
return srslte_rm_turbo_rx_lut_sse_8bit(input, output, deinter, in_len, cb_idx, rv_idx);
#else
uint32_t out_len = 3*srslte_cbsegm_cbsize(cb_idx)+12;
for (int i=0;i<in_len;i++) {
output[deinter[i%out_len]] += input[i];
}
return 0;
#endif #endif
} else {
printf("Invalid inputs rv_idx=%d, cb_idx=%d\n", rv_idx, cb_idx);
return SRSLTE_ERROR_INVALID_INPUTS;
}
} }
#ifdef LV_HAVE_SSE #ifdef LV_HAVE_SSE
int srslte_rm_turbo_rx_lut_sse(int16_t *input, int16_t *output, uint32_t in_len, uint32_t cb_idx, uint32_t rv_idx) int srslte_rm_turbo_rx_lut_sse(int16_t *input, int16_t *output, uint16_t *deinter, uint32_t in_len, uint32_t cb_idx, uint32_t rv_idx)
{ {
if (rv_idx < 4 && cb_idx < SRSLTE_NOF_TC_CB_SIZES) { if (rv_idx < 4 && cb_idx < SRSLTE_NOF_TC_CB_SIZES) {
uint32_t out_len = 3*srslte_cbsegm_cbsize(cb_idx)+12; uint32_t out_len = 3*srslte_cbsegm_cbsize(cb_idx)+12;
uint16_t *deinter = deinterleaver[cb_idx][rv_idx];
const __m128i* xPtr = (const __m128i*) input; const __m128i* xPtr = (const __m128i*) input;
const __m128i* lutPtr = (const __m128i*) deinter; const __m128i* lutPtr = (const __m128i*) deinter;
@ -427,6 +540,97 @@ int srslte_rm_turbo_rx_lut_sse(int16_t *input, int16_t *output, uint32_t in_len,
} }
} }
int srslte_rm_turbo_rx_lut_sse_8bit(int8_t *input, int8_t *output, uint16_t *deinter, uint32_t in_len, uint32_t cb_idx, uint32_t rv_idx)
{
if (rv_idx < 4 && cb_idx < SRSLTE_NOF_TC_CB_SIZES) {
uint32_t out_len = 3*srslte_cbsegm_cbsize(cb_idx)+12;
const __m128i* xPtr = (const __m128i*) input;
const __m128i* lutPtr = (const __m128i*) deinter;
__m128i xVal, lutVal1, lutVal2;
/* Simplify load if we do not need to wrap (ie high rates) */
if (in_len <= out_len) {
for (int i=0;i<in_len/16;i++) {
xVal = _mm_loadu_si128(xPtr);
xPtr ++;
lutVal1 = _mm_loadu_si128(lutPtr);
lutPtr++;
lutVal2 = _mm_loadu_si128(lutPtr);
lutPtr ++;
for (int j=0;j<8;j++) {
int8_t x = (int8_t) _mm_extract_epi8(xVal, j);
uint16_t l = (uint16_t) _mm_extract_epi16(lutVal1, j);
output[l] += x;
}
for (int j=0;j<8;j++) {
int8_t x = (int8_t) _mm_extract_epi8(xVal, j+8);
uint16_t l = (uint16_t) _mm_extract_epi16(lutVal2, j);
output[l] += x;
}
}
for (int i=16*(in_len/16);i<in_len;i++) {
output[deinter[i%out_len]] += input[i];
}
} else {
int intCnt = 16;
int inputCnt = 0;
int nwrapps = 0;
while(inputCnt < in_len - 16) {
xVal = _mm_loadu_si128(xPtr);
xPtr ++;
lutVal1 = _mm_loadu_si128(lutPtr);
lutPtr++;
lutVal2 = _mm_loadu_si128(lutPtr);
lutPtr ++;
for (int j=0;j<8;j++) {
int8_t x = (int8_t) _mm_extract_epi8(xVal, j);
uint16_t l = (uint16_t) _mm_extract_epi16(lutVal1, j);
output[l] += x;
}
for (int j=0;j<8;j++) {
int8_t x = (int8_t) _mm_extract_epi8(xVal, j+8);
uint16_t l = (uint16_t) _mm_extract_epi16(lutVal2, j);
output[l] += x;
}
intCnt += 16;
inputCnt += 16;
if (intCnt >= out_len && inputCnt < in_len - 16) {
/* Copy last elements */
if ((out_len%16) == 12) {
for (int j=(nwrapps+1)*out_len-12;j<(nwrapps+1)*out_len;j++) {
output[deinter[j%out_len]] += input[j];
inputCnt++;
}
} else {
for (int j=(nwrapps+1)*out_len-4;j<(nwrapps+1)*out_len;j++) {
output[deinter[j%out_len]] += input[j];
inputCnt++;
}
}
/* And wrap pointers */
nwrapps++;
intCnt = 16;
xPtr = (const __m128i*) &input[nwrapps*out_len];
lutPtr = (const __m128i*) deinter;
}
}
for (int i=inputCnt;i<in_len;i++) {
output[deinter[i%out_len]] += input[i];
}
}
return 0;
} else {
printf("Invalid inputs rv_idx=%d, cb_idx=%d\n", rv_idx, cb_idx);
return SRSLTE_ERROR_INVALID_INPUTS;
}
}
#endif #endif
@ -436,12 +640,10 @@ int srslte_rm_turbo_rx_lut_sse(int16_t *input, int16_t *output, uint32_t in_len,
l = (uint16_t) _mm256_extract_epi16(lutVal, j);\ l = (uint16_t) _mm256_extract_epi16(lutVal, j);\
output[l] += x; output[l] += x;
int srslte_rm_turbo_rx_lut_avx(int16_t *input, int16_t *output, uint16_t *deinter, uint32_t in_len, uint32_t cb_idx, uint32_t rv_idx)
int srslte_rm_turbo_rx_lut_avx(int16_t *input, int16_t *output, uint32_t in_len, uint32_t cb_idx, uint32_t rv_idx)
{ {
if (rv_idx < 4 && cb_idx < SRSLTE_NOF_TC_CB_SIZES) { if (rv_idx < 4 && cb_idx < SRSLTE_NOF_TC_CB_SIZES) {
uint32_t out_len = 3*srslte_cbsegm_cbsize(cb_idx)+12; uint32_t out_len = 3*srslte_cbsegm_cbsize(cb_idx)+12;
uint16_t *deinter = deinterleaver[cb_idx][rv_idx];
const __m256i* xPtr = (const __m256i*) input; const __m256i* xPtr = (const __m256i*) input;
const __m256i* lutPtr = (const __m256i*) deinter; const __m256i* lutPtr = (const __m256i*) deinter;
@ -539,6 +741,155 @@ int srslte_rm_turbo_rx_lut_avx(int16_t *input, int16_t *output, uint32_t in_len,
} }
} }
#define SAVE_OUTPUT8(j) x = (int8_t) _mm256_extract_epi8(xVal, j);\
l = (uint16_t) _mm256_extract_epi16(lutVal1, j);\
output[l] += x;
#define SAVE_OUTPUT8_2(j) x = (int8_t) _mm256_extract_epi8(xVal, j+8);\
l = (uint16_t) _mm256_extract_epi16(lutVal2, j);\
output[l] += x;
int srslte_rm_turbo_rx_lut_avx_8bit(int8_t *input, int8_t *output, uint16_t *deinter, uint32_t in_len, uint32_t cb_idx, uint32_t rv_idx)
{
if (rv_idx < 4 && cb_idx < SRSLTE_NOF_TC_CB_SIZES) {
uint32_t out_len = 3*srslte_cbsegm_cbsize(cb_idx)+12;
const __m256i* xPtr = (const __m256i*) input;
const __m256i* lutPtr = (const __m256i*) deinter;
__m256i xVal, lutVal1, lutVal2;
int8_t x;
uint16_t l;
/* Simplify load if we do not need to wrap (ie high rates) */
if (in_len <= out_len) {
for (int i=0;i<in_len/32;i++) {
xVal = _mm256_loadu_si256(xPtr);
xPtr ++;
lutVal1 = _mm256_loadu_si256(lutPtr);
lutPtr ++;
lutVal2 = _mm256_loadu_si256(lutPtr);
lutPtr ++;
SAVE_OUTPUT8(0);
SAVE_OUTPUT8(1);
SAVE_OUTPUT8(2);
SAVE_OUTPUT8(3);
SAVE_OUTPUT8(4);
SAVE_OUTPUT8(5);
SAVE_OUTPUT8(6);
SAVE_OUTPUT8(7);
SAVE_OUTPUT8(8);
SAVE_OUTPUT8(9);
SAVE_OUTPUT8(10);
SAVE_OUTPUT8(11);
SAVE_OUTPUT8(12);
SAVE_OUTPUT8(13);
SAVE_OUTPUT8(14);
SAVE_OUTPUT8(15);
SAVE_OUTPUT8_2(0);
SAVE_OUTPUT8_2(1);
SAVE_OUTPUT8_2(2);
SAVE_OUTPUT8_2(3);
SAVE_OUTPUT8_2(4);
SAVE_OUTPUT8_2(5);
SAVE_OUTPUT8_2(6);
SAVE_OUTPUT8_2(7);
SAVE_OUTPUT8_2(8);
SAVE_OUTPUT8_2(9);
SAVE_OUTPUT8_2(10);
SAVE_OUTPUT8_2(11);
SAVE_OUTPUT8_2(12);
SAVE_OUTPUT8_2(13);
SAVE_OUTPUT8_2(14);
SAVE_OUTPUT8_2(15);
}
for (int i=32*(in_len/32);i<in_len;i++) {
output[deinter[i%out_len]] += input[i];
}
} else {
printf("wraps not implemented!\n");
#ifdef kk
int intCnt = 32;
int inputCnt = 0;
int nwrapps = 0;
while(inputCnt < in_len - 32) {
xVal = _mm256_loadu_si256(xPtr);
xPtr ++;
lutVal1 = _mm256_loadu_si256(lutPtr);
lutPtr ++;
lutVal2 = _mm256_loadu_si256(lutPtr);
lutPtr ++;
SAVE_OUTPUT8(0);
SAVE_OUTPUT8(1);
SAVE_OUTPUT8(2);
SAVE_OUTPUT8(3);
SAVE_OUTPUT8(4);
SAVE_OUTPUT8(5);
SAVE_OUTPUT8(6);
SAVE_OUTPUT8(7);
SAVE_OUTPUT8(8);
SAVE_OUTPUT8(9);
SAVE_OUTPUT8(10);
SAVE_OUTPUT8(11);
SAVE_OUTPUT8(12);
SAVE_OUTPUT8(13);
SAVE_OUTPUT8(14);
SAVE_OUTPUT8(15);
SAVE_OUTPUT8_2(0);
SAVE_OUTPUT8_2(1);
SAVE_OUTPUT8_2(2);
SAVE_OUTPUT8_2(3);
SAVE_OUTPUT8_2(4);
SAVE_OUTPUT8_2(5);
SAVE_OUTPUT8_2(6);
SAVE_OUTPUT8_2(7);
SAVE_OUTPUT8_2(8);
SAVE_OUTPUT8_2(9);
SAVE_OUTPUT8_2(10);
SAVE_OUTPUT8_2(11);
SAVE_OUTPUT8_2(12);
SAVE_OUTPUT8_2(13);
SAVE_OUTPUT8_2(14);
SAVE_OUTPUT8_2(15);
intCnt += 32;
inputCnt += 32;
if (intCnt >= out_len && inputCnt < in_len - 32) {
printf("warning rate matching wrapping remainder %d\n", out_len%32);
/* Copy last elements */
for (int j=(nwrapps+1)*out_len-(out_len%32) ;j<(nwrapps+1)*out_len;j++) {
output[deinter[j%out_len]] += input[j];
inputCnt++;
}
/* And wrap pointers */
nwrapps++;
intCnt = 32;
xPtr = (const __m256i*) &input[nwrapps*out_len];
lutPtr = (const __m256i*) deinter;
}
}
for (int i=inputCnt;i<in_len;i++) {
output[deinter[i%out_len]] += input[i];
}
#endif
}
return 0;
} else {
printf("Invalid inputs rv_idx=%d, cb_idx=%d\n", rv_idx, cb_idx);
return SRSLTE_ERROR_INVALID_INPUTS;
}
}
#endif #endif

@ -66,7 +66,16 @@ const uint32_t f2_list[SRSLTE_NOF_TC_CB_SIZES] = { 10, 12, 42, 16, 18, 20, 22, 2
280, 142, 480, 146, 444, 120, 152, 462, 234, 158, 80, 96, 902, 166, 336, 280, 142, 480, 146, 444, 120, 152, 462, 234, 158, 80, 96, 902, 166, 336,
170, 86, 174, 176, 178, 120, 182, 184, 186, 94, 190, 480 }; 170, 86, 174, 176, 178, 120, 182, 184, 186, 94, 190, 480 };
int srslte_tc_interl_LTE_gen(srslte_tc_interl_t *h, uint32_t long_cb) { int srslte_tc_interl_LTE_gen(srslte_tc_interl_t *h, uint32_t long_cb)
{
return srslte_tc_interl_LTE_gen_interl(h, long_cb, 1);
}
#define deinter(x,win) ((x%(long_cb/win))*(win)+x/(long_cb/win))
#define inter(x,win) ((x%win)*(long_cb/win)+x/win)
int srslte_tc_interl_LTE_gen_interl(srslte_tc_interl_t *h, uint32_t long_cb, uint32_t interl_win) {
uint32_t cb_table_idx, f1, f2; uint32_t cb_table_idx, f1, f2;
uint64_t i, j; uint64_t i, j;
@ -92,6 +101,19 @@ int srslte_tc_interl_LTE_gen(srslte_tc_interl_t *h, uint32_t long_cb) {
h->forward[i] = (uint32_t) j; h->forward[i] = (uint32_t) j;
h->reverse[j] = (uint32_t) i; h->reverse[j] = (uint32_t) i;
} }
if (interl_win != 1) {
uint16_t *f = malloc(long_cb*sizeof(uint16_t));
uint16_t *r = malloc(long_cb*sizeof(uint16_t));
memcpy(f, h->forward, long_cb*sizeof(uint16_t));
memcpy(r, h->reverse, long_cb*sizeof(uint16_t));
for (i = 0; i < long_cb; i++) {
h->forward[i] = deinter(f[inter(i,interl_win)],interl_win);
h->reverse[i] = deinter(r[inter(i,interl_win)],interl_win);
}
free(f);
free(r);
}
return 0; return 0;
} }

@ -30,7 +30,6 @@
#include <stdint.h> #include <stdint.h>
#include "srslte/phy/fec/tc_interl.h" #include "srslte/phy/fec/tc_interl.h"
#include "srslte/phy/fec/turbocoder.h"
#define TURBO_SRSLTE_TCOD_RATE 3 #define TURBO_SRSLTE_TCOD_RATE 3

@ -183,7 +183,7 @@ int main(int argc, char **argv) {
srslte_rm_turbo_rx(buff_f, BUFFSZ, rm_bits_f, nof_e_bits, bits_f, long_cb_enc, rv_idx, 0); srslte_rm_turbo_rx(buff_f, BUFFSZ, rm_bits_f, nof_e_bits, bits_f, long_cb_enc, rv_idx, 0);
bzero(bits2_s, long_cb_enc*sizeof(short)); bzero(bits2_s, long_cb_enc*sizeof(short));
srslte_rm_turbo_rx_lut(rm_bits_s, bits2_s, nof_e_bits, cb_idx, rv_idx); srslte_rm_turbo_rx_lut_(rm_bits_s, bits2_s, nof_e_bits, cb_idx, rv_idx, false);
for (int i=0;i<long_cb_enc;i++) { for (int i=0;i<long_cb_enc;i++) {
if (bits_f[i] != bits2_s[i]) { if (bits_f[i] != bits2_s[i]) {

@ -96,8 +96,15 @@ int main(int argc, char **argv) {
} }
} }
/* Create CRC for Transport Block, it is not currently used but it is required */
srslte_crc_t crc_tb = {};
if (srslte_crc_init(&crc_tb, SRSLTE_LTE_CRC24A, 24)) {
printf("error initialising CRC\n");
exit(-1);
}
srslte_tcod_encode(&tcod, input_bits, output_bits, long_cb); srslte_tcod_encode(&tcod, input_bits, output_bits, long_cb);
srslte_tcod_encode_lut(&tcod, NULL, input_bytes, parity, len); srslte_tcod_encode_lut(&tcod, &crc_tb, NULL, input_bytes, parity, len, false);
srslte_bit_unpack_vector(parity, parity_bits, 2*(long_cb+4)); srslte_bit_unpack_vector(parity, parity_bits, 2*(long_cb+4));

@ -52,12 +52,14 @@ int test_known_data = 0;
int test_errors = 0; int test_errors = 0;
int nof_repetitions = 1; int nof_repetitions = 1;
srslte_tdec_impl_type_t tdec_type;
#define SNR_POINTS 4 #define SNR_POINTS 4
#define SNR_MIN 1.0 #define SNR_MIN 1.0
#define SNR_MAX 8.0 #define SNR_MAX 8.0
void usage(char *prog) { void usage(char *prog) {
printf("Usage: %s [nlesv]\n", prog); printf("Usage: %s [kcinNledts]\n", prog);
printf( printf(
"\t-k Test with known data (ignores frame_length) [Default disabled]\n"); "\t-k Test with known data (ignores frame_length) [Default disabled]\n");
printf("\t-c nof_cb in parallel [Default %d]\n", nof_cb); printf("\t-c nof_cb in parallel [Default %d]\n", nof_cb);
@ -66,13 +68,14 @@ void usage(char *prog) {
printf("\t-N nof_repetitions [Default %d]\n", nof_repetitions); printf("\t-N nof_repetitions [Default %d]\n", nof_repetitions);
printf("\t-l frame_length [Default %d]\n", frame_length); printf("\t-l frame_length [Default %d]\n", frame_length);
printf("\t-e ebno in dB [Default scan]\n"); printf("\t-e ebno in dB [Default scan]\n");
printf("\t-d Decoder implementation type: 0: Generic, 1: SSE, 2: SSE-window\n");
printf("\t-t test: check errors on exit [Default disabled]\n"); printf("\t-t test: check errors on exit [Default disabled]\n");
printf("\t-s seed [Default 0=time]\n"); printf("\t-s seed [Default 0=time]\n");
} }
void parse_args(int argc, char **argv) { void parse_args(int argc, char **argv) {
int opt; int opt;
while ((opt = getopt(argc, argv, "cinNlstvekt")) != -1) { while ((opt = getopt(argc, argv, "kcinNledts")) != -1) {
switch (opt) { switch (opt) {
case 'c': case 'c':
nof_cb = atoi(argv[optind]); nof_cb = atoi(argv[optind]);
@ -95,6 +98,9 @@ void parse_args(int argc, char **argv) {
case 'l': case 'l':
frame_length = atoi(argv[optind]); frame_length = atoi(argv[optind]);
break; break;
case 'd':
tdec_type = (srslte_tdec_impl_type_t) atoi(argv[optind]);
break;
case 'e': case 'e':
ebno_db = atof(argv[optind]); ebno_db = atof(argv[optind]);
break; break;
@ -117,7 +123,7 @@ int main(int argc, char **argv) {
float *llr; float *llr;
short *llr_s; short *llr_s;
uint8_t *llr_c; uint8_t *llr_c;
uint8_t *data_tx, *data_rx, *data_rx_bytes[SRSLTE_TDEC_MAX_NPAR], *symbols; uint8_t *data_tx, *data_rx, *data_rx_bytes, *symbols;
uint32_t i, j; uint32_t i, j;
float var[SNR_POINTS]; float var[SNR_POINTS];
uint32_t snr_points; uint32_t snr_points;
@ -159,13 +165,11 @@ int main(int argc, char **argv) {
perror("malloc"); perror("malloc");
exit(-1); exit(-1);
} }
for (int cb=0;cb<SRSLTE_TDEC_MAX_NPAR;cb++) { data_rx_bytes = srslte_vec_malloc(frame_length * sizeof(uint8_t));
data_rx_bytes[cb] = srslte_vec_malloc(frame_length * sizeof(uint8_t)); if (!data_rx_bytes) {
if (!data_rx_bytes[cb]) {
perror("malloc"); perror("malloc");
exit(-1); exit(-1);
} }
}
symbols = srslte_vec_malloc(coded_length * sizeof(uint8_t)); symbols = srslte_vec_malloc(coded_length * sizeof(uint8_t));
if (!symbols) { if (!symbols) {
@ -193,11 +197,13 @@ int main(int argc, char **argv) {
exit(-1); exit(-1);
} }
if (srslte_tdec_init(&tdec, frame_length)) { if (srslte_tdec_init_manual(&tdec, frame_length, tdec_type)) {
fprintf(stderr, "Error initiating Turbo decoder\n"); fprintf(stderr, "Error initiating Turbo decoder\n");
exit(-1); exit(-1);
} }
srslte_tdec_force_not_sb(&tdec);
float ebno_inc, esno_db; float ebno_inc, esno_db;
ebno_inc = (SNR_MAX - SNR_MIN) / SNR_POINTS; ebno_inc = (SNR_MAX - SNR_MIN) / SNR_POINTS;
if (ebno_db == 100.0) { if (ebno_db == 100.0) {
@ -246,7 +252,7 @@ int main(int argc, char **argv) {
} }
/* decoder */ /* decoder */
srslte_tdec_reset(&tdec, frame_length); srslte_tdec_new_cb(&tdec, frame_length);
uint32_t t; uint32_t t;
if (nof_iterations == -1) { if (nof_iterations == -1) {
@ -255,36 +261,21 @@ int main(int argc, char **argv) {
t = nof_iterations; t = nof_iterations;
} }
int16_t *input[SRSLTE_TDEC_MAX_NPAR];
uint8_t *output[SRSLTE_TDEC_MAX_NPAR];
for (int n=0;n<SRSLTE_TDEC_MAX_NPAR;n++) {
if (n < nof_cb) {
input[n] = llr_s;
output[n] = data_rx_bytes[n];
} else {
input[n] = NULL;
output[n] = NULL;
}
}
gettimeofday(&tdata[1], NULL); gettimeofday(&tdata[1], NULL);
for (int k=0;k<nof_repetitions;k++) { for (int k=0;k<nof_repetitions;k++) {
srslte_tdec_run_all_par(&tdec, input, output, t, frame_length); srslte_tdec_run_all(&tdec, llr_s, data_rx_bytes, t, frame_length);
} }
gettimeofday(&tdata[2], NULL); gettimeofday(&tdata[2], NULL);
get_time_interval(tdata); get_time_interval(tdata);
mean_usec = (float) mean_usec * 0.9 + (float) (tdata[0].tv_usec/nof_repetitions) * 0.1; mean_usec = (tdata[0].tv_sec*1e6+tdata[0].tv_usec)/nof_repetitions;
frame_cnt++; frame_cnt++;
uint32_t errors_this = 0; uint32_t errors_this = 0;
for (int cb=0;cb<nof_cb;cb++) { srslte_bit_unpack_vector(data_rx_bytes, data_rx, frame_length);
srslte_bit_unpack_vector(data_rx_bytes[cb], data_rx, frame_length);
errors_this=srslte_bit_diff(data_tx, data_rx, frame_length); errors_this=srslte_bit_diff(data_tx, data_rx, frame_length);
//printf("error[%d]=%d\n", cb, errors_this); //printf("error[%d]=%d\n", cb, errors_this);
errors += errors_this; errors += errors_this;
}
printf("Eb/No: %2.2f %10d/%d ", SNR_MIN + i * ebno_inc, frame_cnt, nof_frames); printf("Eb/No: %2.2f %10d/%d ", SNR_MIN + i * ebno_inc, frame_cnt, nof_frames);
printf("BER: %.2e ", (float) errors / (nof_cb*frame_cnt * frame_length)); printf("BER: %.2e ", (float) errors / (nof_cb*frame_cnt * frame_length));
printf("%3.1f Mbps (%6.2f usec)", (float) (nof_cb*frame_length) / mean_usec, mean_usec); printf("%3.1f Mbps (%6.2f usec)", (float) (nof_cb*frame_length) / mean_usec, mean_usec);
@ -301,10 +292,8 @@ int main(int argc, char **argv) {
} }
} }
for (int cb=0;cb<SRSLTE_TDEC_MAX_NPAR;cb++) { if (data_rx_bytes) {
if (data_rx_bytes[cb]) { free(data_rx_bytes);
free(data_rx_bytes[cb]);
}
} }
free(data_tx); free(data_tx);
free(symbols); free(symbols);

@ -193,7 +193,13 @@ int srslte_tcod_encode(srslte_tcod_t *h, uint8_t *input, uint8_t *output, uint32
} }
/* Expects bytes and produces bytes. The systematic and parity bits are interlaced in the output */ /* Expects bytes and produces bytes. The systematic and parity bits are interlaced in the output */
int srslte_tcod_encode_lut(srslte_tcod_t *h, srslte_crc_t *crc, uint8_t *input, uint8_t *parity, uint32_t cblen_idx) int srslte_tcod_encode_lut(srslte_tcod_t *h,
srslte_crc_t *crc_tb,
srslte_crc_t *crc_cb,
uint8_t *input,
uint8_t *parity,
uint32_t cblen_idx,
bool last_cb)
{ {
if (cblen_idx < 188) { if (cblen_idx < 188) {
uint32_t long_cb = (uint32_t) srslte_cbsegm_cbsize(cblen_idx); uint32_t long_cb = (uint32_t) srslte_cbsegm_cbsize(cblen_idx);
@ -204,20 +210,24 @@ int srslte_tcod_encode_lut(srslte_tcod_t *h, srslte_crc_t *crc, uint8_t *input,
} }
/* Reset CRC */ /* Reset CRC */
if (crc) { if (crc_cb) {
srslte_crc_set_init(crc, 0); srslte_crc_set_init(crc_cb, 0);
} }
/* Parity bits for the 1st constituent encoders */ /* Parity bits for the 1st constituent encoders */
uint8_t state0 = 0; uint8_t state0 = 0;
if (crc) { if (crc_cb) {
int block_size_nocrc = (long_cb - crc_cb->order - ((last_cb) ? crc_tb->order : 0)) / 8;
/* if CRC pointer is given */ /* if CRC pointer is given */
for (int i = 0; i < (long_cb - crc->order) / 8; i++) { for (int i = 0; i < block_size_nocrc; i++) {
uint8_t in = input[i]; uint8_t in = input[i];
/* Put byte in CRC and save latest checksum */ /* Put byte in TB CRC and save latest checksum */
srslte_crc_checksum_put_byte(crc, in); srslte_crc_checksum_put_byte(crc_tb, in);
/* Put byte in CB CRC and save latest checksum */
srslte_crc_checksum_put_byte(crc_cb, in);
/* Run actual encoder */ /* Run actual encoder */
tcod_lut_t l = tcod_lut[state0][in]; tcod_lut_t l = tcod_lut[state0][in];
@ -225,10 +235,27 @@ int srslte_tcod_encode_lut(srslte_tcod_t *h, srslte_crc_t *crc, uint8_t *input,
state0 = l.next_state; state0 = l.next_state;
} }
uint32_t checksum = (uint32_t) srslte_crc_checksum_get(crc); if (last_cb) {
for (int i = 0; i < crc->order / 8; i++) { uint32_t checksum = (uint32_t) srslte_crc_checksum_get(crc_tb);
int mask_shift = 8 * (crc->order / 8 - i - 1); for (int i = 0; i < crc_tb->order / 8; i++) {
int idx = (long_cb - crc->order) / 8 + i; int mask_shift = 8 * (crc_tb->order / 8 - i - 1);
int idx = block_size_nocrc + i;
uint8_t in = (uint8_t) ((checksum >> mask_shift) & 0xff);
/* Put byte in CB CRC and save latest checksum */
srslte_crc_checksum_put_byte(crc_cb, in);
input[idx] = in;
tcod_lut_t l = tcod_lut[state0][in];
parity[idx] = l.output;
state0 = l.next_state;
}
}
uint32_t checksum = (uint32_t) srslte_crc_checksum_get(crc_cb);
for (int i = 0; i < crc_cb->order / 8; i++) {
int mask_shift = 8 * (crc_cb->order / 8 - i - 1);
int idx = (long_cb - crc_cb->order) / 8 + i;
uint8_t in = (uint8_t) ((checksum >> mask_shift) & 0xff); uint8_t in = (uint8_t) ((checksum >> mask_shift) & 0xff);
input[idx] = in; input[idx] = in;
@ -239,11 +266,31 @@ int srslte_tcod_encode_lut(srslte_tcod_t *h, srslte_crc_t *crc, uint8_t *input,
} else { } else {
/* No CRC given */ /* No CRC given */
for (uint32_t i = 0; i < long_cb / 8; i++) { int block_size_nocrc = (long_cb - ((last_cb) ? crc_tb->order : 0)) / 8;
tcod_lut_t l = tcod_lut[state0][input[i]];
for (uint32_t i = 0; i < block_size_nocrc; i++) {
uint8_t in = input[i];
srslte_crc_checksum_put_byte(crc_tb, in);
tcod_lut_t l = tcod_lut[state0][in];
parity[i] = l.output; parity[i] = l.output;
state0 = l.next_state; state0 = l.next_state;
} }
if (last_cb) {
uint32_t checksum = (uint32_t) srslte_crc_checksum_get(crc_tb);
for (int i = 0; i < crc_tb->order / 8; i++) {
int mask_shift = 8 * (crc_tb->order / 8 - i - 1);
int idx = block_size_nocrc + i;
uint8_t in = (uint8_t) ((checksum >> mask_shift) & 0xff);
input[idx] = in;
tcod_lut_t l = tcod_lut[state0][in];
parity[idx] = l.output;
state0 = l.next_state;
}
}
} }
parity[long_cb / 8] = 0; // will put tail here later parity[long_cb / 8] = 0; // will put tail here later

@ -24,151 +24,543 @@
* *
*/ */
#include <stdio.h>
#include <stdint.h> #include <stdint.h>
#include <string.h>
#include <stdlib.h> #include <stdlib.h>
#include <stdio.h>
#include <strings.h>
#include <srslte/srslte.h>
#include "srslte/phy/utils/vector.h"
#include "srslte/phy/fec/turbodecoder.h" #include "srslte/phy/fec/turbodecoder.h"
#define debug_enabled 0
/* Generic (no SSE) implementation */
#include "srslte/phy/fec/turbodecoder_gen.h" #include "srslte/phy/fec/turbodecoder_gen.h"
srslte_tdec_16bit_impl_t gen_impl = {
tdec_gen_init,
tdec_gen_free,
tdec_gen_dec,
tdec_gen_extract_input,
tdec_gen_decision_byte
};
/* SSE no-window implementation */
#include "srslte/phy/fec/turbodecoder_sse.h"
srslte_tdec_16bit_impl_t sse_impl = {
tdec_sse_init,
tdec_sse_free,
tdec_sse_dec,
tdec_sse_extract_input,
tdec_sse_decision_byte
};
/* SSE window implementation */
#ifdef LV_HAVE_SSE #ifdef LV_HAVE_SSE
#include "srslte/phy/fec/turbodecoder_simd.h" #define WINIMP_IS_SSE16
#include "srslte/phy/fec/turbodecoder_win.h"
#undef WINIMP_IS_SSE16
srslte_tdec_16bit_impl_t sse16_win_impl = {
tdec_winsse16_init,
tdec_winsse16_free,
tdec_winsse16_dec,
tdec_winsse16_extract_input,
tdec_winsse16_decision_byte
};
#endif #endif
#include "srslte/phy/utils/vector.h" /* AVX window implementation */
#ifdef LV_HAVE_AVX
#define WINIMP_IS_AVX16
#include "srslte/phy/fec/turbodecoder_win.h"
#undef WINIMP_IS_AVX16
srslte_tdec_16bit_impl_t avx16_win_impl = {
tdec_winavx16_init,
tdec_winavx16_free,
tdec_winavx16_dec,
tdec_winavx16_extract_input,
tdec_winavx16_decision_byte
};
#endif
/* SSE window implementation */
#ifdef LV_HAVE_SSE
#define WINIMP_IS_SSE8
#include "srslte/phy/fec/turbodecoder_win.h"
#undef WINIMP_IS_SSE8
srslte_tdec_8bit_impl_t sse8_win_impl = {
tdec_winsse8_init,
tdec_winsse8_free,
tdec_winsse8_dec,
tdec_winsse8_extract_input,
tdec_winsse8_decision_byte
};
#endif
/* AVX window implementation */
#ifdef LV_HAVE_AVX
#define WINIMP_IS_AVX8
#include "srslte/phy/fec/turbodecoder_win.h"
#undef WINIMP_IS_AVX8
srslte_tdec_8bit_impl_t avx8_win_impl = {
tdec_winavx8_init,
tdec_winavx8_free,
tdec_winavx8_dec,
tdec_winavx8_extract_input,
tdec_winavx8_decision_byte
};
#endif
#define AUTO_16_SSE 0
#define AUTO_16_SSEWIN 1
#define AUTO_16_AVXWIN 2
#define AUTO_8_SSEWIN 0
#define AUTO_8_AVXWIN 1
// Include interfaces for 8 and 16 bit decoder implementations
#define LLR_IS_8BIT
#include "srslte/phy/fec/turbodecoder_iter.h"
#undef LLR_IS_8BIT
#define LLR_IS_16BIT
#include "srslte/phy/fec/turbodecoder_iter.h"
#undef LLR_IS_16BIT
int srslte_tdec_init(srslte_tdec_t * h, uint32_t max_long_cb) { int srslte_tdec_init(srslte_tdec_t * h, uint32_t max_long_cb) {
#ifdef LV_HAVE_SSE return srslte_tdec_init_manual(h, max_long_cb, SRSLTE_TDEC_AUTO);
return srslte_tdec_simd_init(&h->tdec_simd, SRSLTE_TDEC_MAX_NPAR, max_long_cb); }
#else
h->input_conv = srslte_vec_malloc(sizeof(float) * (3*max_long_cb+12)); uint32_t interleaver_idx(uint32_t nof_subblocks) {
switch (nof_subblocks) {
case 32:
return 3;
case 16:
return 2;
case 8:
return 1;
case 1:
return 0;
default:
return 0;
}
}
/* Initializes the turbo decoder object */
int srslte_tdec_init_manual(srslte_tdec_t * h, uint32_t max_long_cb, srslte_tdec_impl_type_t dec_type)
{
int ret = -1;
bzero(h, sizeof(srslte_tdec_t));
uint32_t len = max_long_cb + SRSLTE_TCOD_TOTALTAIL;
h->dec_type = dec_type;
// Set manual
switch(dec_type) {
case SRSLTE_TDEC_AUTO:
break;
case SRSLTE_TDEC_SSE:
h->dec16[0] = &sse_impl;
h->current_llr_type = SRSLTE_TDEC_16;
break;
case SRSLTE_TDEC_SSE_WINDOW:
h->dec16[0] = &sse16_win_impl;
h->current_llr_type = SRSLTE_TDEC_16;
break;
case SRSLTE_TDEC_GENERIC:
h->dec16[0] = &gen_impl;
h->current_llr_type = SRSLTE_TDEC_16;
break;
case SRSLTE_TDEC_SSE8_WINDOW:
h->dec8[0] = &sse8_win_impl;
h->current_llr_type = SRSLTE_TDEC_8;
break;
#ifdef LV_HAVE_AVX
case SRSLTE_TDEC_AVX_WINDOW:
h->dec16[0] = &avx16_win_impl;
h->current_llr_type = SRSLTE_TDEC_16;
break;
case SRSLTE_TDEC_AVX8_WINDOW:
h->dec8[0] = &avx8_win_impl;
h->current_llr_type = SRSLTE_TDEC_8;
break;
#endif
default:
fprintf(stderr, "Error decoder %d not supported\n", dec_type);
goto clean_and_exit;
}
h->max_long_cb = max_long_cb;
h->app1 = srslte_vec_malloc(sizeof(int16_t) * len);
if (!h->app1) {
perror("srslte_vec_malloc");
goto clean_and_exit;
}
h->app2 = srslte_vec_malloc(sizeof(int16_t) * len);
if (!h->app2) {
perror("srslte_vec_malloc");
goto clean_and_exit;
}
h->ext1 = srslte_vec_malloc(sizeof(int16_t) * len);
if (!h->ext1) {
perror("srslte_vec_malloc");
goto clean_and_exit;
}
h->ext2 = srslte_vec_malloc(sizeof(int16_t) * len);
if (!h->ext2) {
perror("srslte_vec_malloc");
goto clean_and_exit;
}
h->syst0 = srslte_vec_malloc(sizeof(int16_t) * len);
if (!h->syst0) {
perror("srslte_vec_malloc");
goto clean_and_exit;
}
h->parity0 = srslte_vec_malloc(sizeof(int16_t) * len);
if (!h->parity0) {
perror("srslte_vec_malloc");
goto clean_and_exit;
}
h->parity1 = srslte_vec_malloc(sizeof(int16_t) * len);
if (!h->parity1) {
perror("srslte_vec_malloc");
goto clean_and_exit;
}
h->input_conv = srslte_vec_malloc(sizeof(int16_t) * (len * 3+32*3));
if (!h->input_conv) { if (!h->input_conv) {
perror("malloc"); perror("srslte_vec_malloc");
return -1; goto clean_and_exit;
} }
return srslte_tdec_gen_init(&h->tdec_gen, max_long_cb);
if (dec_type == SRSLTE_TDEC_AUTO) {
h->dec16[AUTO_16_SSE] = &sse_impl;
h->dec16[AUTO_16_SSEWIN] = &sse16_win_impl;
h->dec8[AUTO_8_SSEWIN] = &sse8_win_impl;
#ifdef LV_HAVE_AVX
h->dec16[AUTO_16_AVXWIN] = &avx16_win_impl;
h->dec8[AUTO_8_AVXWIN] = &avx8_win_impl;
#endif #endif
for (int td=0;td<SRSLTE_TDEC_NOF_AUTO_MODES_16;td++) {
if (h->dec16[td]) {
if ((h->nof_blocks16[td] = h->dec16[td]->tdec_init(&h->dec16_hdlr[td], h->max_long_cb))<0) {
goto clean_and_exit;
}
}
}
for (int td=0;td<SRSLTE_TDEC_NOF_AUTO_MODES_8;td++) {
if (h->dec8[td]) {
if ((h->nof_blocks8[td] = h->dec8[td]->tdec_init(&h->dec8_hdlr[td], h->max_long_cb))<0) {
goto clean_and_exit;
}
}
} }
void srslte_tdec_free(srslte_tdec_t * h) { // Compute 1 interleaver for each possible nof_subblocks (1, 8, 16 or 32)
#ifdef LV_HAVE_SSE for (int s=0;s<4;s++) {
srslte_tdec_simd_free(&h->tdec_simd); for (int i=0;i<SRSLTE_NOF_TC_CB_SIZES;i++) {
#else if (srslte_tc_interl_init(&h->interleaver[s][i], srslte_cbsegm_cbsize(i)) < 0) {
goto clean_and_exit;
}
srslte_tc_interl_LTE_gen_interl(&h->interleaver[s][i], srslte_cbsegm_cbsize(i), s?(8<<(s-1)):1);
}
}
} else {
uint32_t nof_subblocks;
if (dec_type < SRSLTE_TDEC_SSE8_WINDOW) {
if ((h->nof_blocks16[0] = h->dec16[0]->tdec_init(&h->dec16_hdlr[0], h->max_long_cb))<0) {
goto clean_and_exit;
}
nof_subblocks = h->nof_blocks16[0];
} else {
if ((h->nof_blocks8[0] = h->dec8[0]->tdec_init(&h->dec8_hdlr[0], h->max_long_cb))<0) {
goto clean_and_exit;
}
nof_subblocks = h->nof_blocks8[0];
}
for (int i=0;i<SRSLTE_NOF_TC_CB_SIZES;i++) {
if (srslte_tc_interl_init(&h->interleaver[interleaver_idx(nof_subblocks)][i], srslte_cbsegm_cbsize(i)) < 0) {
goto clean_and_exit;
}
srslte_tc_interl_LTE_gen_interl(&h->interleaver[interleaver_idx(nof_subblocks)][i], srslte_cbsegm_cbsize(i), nof_subblocks);
}
}
h->current_cbidx = -1;
ret = 0;
clean_and_exit:
if (ret == -1) {
srslte_tdec_free(h);
}
return ret;
}
void srslte_tdec_free(srslte_tdec_t * h)
{
if (h->app1) {
free(h->app1);
}
if (h->app2) {
free(h->app2);
}
if (h->ext1) {
free(h->ext1);
}
if (h->ext2) {
free(h->ext2);
}
if (h->syst0) {
free(h->syst0);
}
if (h->parity0) {
free(h->parity0);
}
if (h->parity1) {
free(h->parity1);
}
if (h->input_conv) { if (h->input_conv) {
free(h->input_conv); free(h->input_conv);
} }
srslte_tdec_gen_free(&h->tdec_gen);
#endif
for (int td=0;td<SRSLTE_TDEC_NOF_AUTO_MODES_8;td++) {
if (h->dec8[td] && h->dec8_hdlr[td]) {
h->dec8[td]->tdec_free(h->dec8_hdlr[td]);
}
}
for (int td=0;td<SRSLTE_TDEC_NOF_AUTO_MODES_16;td++) {
if (h->dec16[td] && h->dec16_hdlr[td]) {
h->dec16[td]->tdec_free(h->dec16_hdlr[td]);
}
}
for (int s=0;s<4;s++) {
for (int i=0;i<SRSLTE_NOF_TC_CB_SIZES;i++) {
srslte_tc_interl_free(&h->interleaver[s][i]);
}
} }
int srslte_tdec_reset(srslte_tdec_t * h, uint32_t long_cb) { bzero(h, sizeof(srslte_tdec_t));
#ifdef LV_HAVE_SSE
return srslte_tdec_simd_reset(&h->tdec_simd, long_cb);
#else
return srslte_tdec_gen_reset(&h->tdec_gen, long_cb);
#endif
} }
int srslte_tdec_reset_cb(srslte_tdec_t * h, uint32_t cb_idx) { void srslte_tdec_force_not_sb(srslte_tdec_t *h) {
#ifdef LV_HAVE_SSE h->force_not_sb = true;
return srslte_tdec_simd_reset_cb(&h->tdec_simd, cb_idx);
#else
return srslte_tdec_gen_reset(&h->tdec_gen, h->tdec_gen.current_cb_len);
#endif
} }
int srslte_tdec_get_nof_iterations_cb(srslte_tdec_t * h, uint32_t cb_idx) static void tdec_decision_byte(srslte_tdec_t * h, uint8_t *output)
{ {
#ifdef LV_HAVE_SSE if (h->current_llr_type == SRSLTE_TDEC_16) {
return srslte_tdec_simd_get_nof_iterations_cb(&h->tdec_simd, cb_idx); h->dec16[h->current_dec]->tdec_decision_byte(!(h->n_iter%2)?h->app1:h->ext1, output, h->current_long_cb);
#else } else {
return h->tdec_gen.n_iter; h->dec8[h->current_dec]->tdec_decision_byte(!(h->n_iter%2)?(int8_t*)h->app1:(int8_t*)h->ext1, output, h->current_long_cb);
#endif }
} }
void srslte_tdec_iteration_par(srslte_tdec_t * h, int16_t* input[SRSLTE_TDEC_MAX_NPAR], uint32_t long_cb) {
#ifdef LV_HAVE_SSE /* Returns number of subblocks in automatic mode for this long_cb */
srslte_tdec_simd_iteration(&h->tdec_simd, input, long_cb); uint32_t srslte_tdec_autoimp_get_subblocks(uint32_t long_cb)
#else {
srslte_vec_convert_if(input[0], 0.01, h->input_conv, 3*long_cb+12); #ifdef LV_HAVE_AVX
srslte_tdec_gen_iteration(&h->tdec_gen, h->input_conv, long_cb); if (!(long_cb%16) && long_cb > 800) {
return 16;
} else
#endif #endif
if (!(long_cb%8) && long_cb > 400) {
return 8;
} else {
return 0;
}
} }
void srslte_tdec_iteration(srslte_tdec_t * h, int16_t* input, uint32_t long_cb) { static int tdec_sb_idx(uint32_t long_cb) {
int16_t *input_par[SRSLTE_TDEC_MAX_NPAR]; uint32_t nof_sb = srslte_tdec_autoimp_get_subblocks(long_cb);
input_par[0] = input; switch(nof_sb) {
return srslte_tdec_iteration_par(h, input_par, long_cb); case 16:
return AUTO_16_AVXWIN;
case 8:
return AUTO_16_SSEWIN;
case 0:
return AUTO_16_SSE;
}
fprintf(stderr, "Error in tdec_sb_idx() invalid nof_sb=%d\n", nof_sb);
return 0;
} }
void srslte_tdec_decision_par(srslte_tdec_t * h, uint8_t *output[SRSLTE_TDEC_MAX_NPAR], uint32_t long_cb) { uint32_t srslte_tdec_autoimp_get_subblocks_8bit(uint32_t long_cb)
#ifdef LV_HAVE_SSE {
return srslte_tdec_simd_decision(&h->tdec_simd, output, long_cb); #ifdef LV_HAVE_AVX
#else if (!(long_cb%32) && long_cb > 2048) {
return srslte_tdec_gen_decision(&h->tdec_gen, output[0], long_cb); return 32;
} else
#endif #endif
if (!(long_cb%16) && long_cb > 800) {
return 16;
} else if (!(long_cb%8) && long_cb > 400) {
return 8;
} else {
return 0;
}
} }
uint32_t srslte_tdec_get_nof_parallel(srslte_tdec_t *h) { static int tdec_sb_idx_8(uint32_t long_cb) {
#ifdef LV_HAVE_AVX2 uint32_t nof_sb = srslte_tdec_autoimp_get_subblocks_8bit(long_cb);
return 2; switch(nof_sb) {
#else case 32:
return 1; return AUTO_8_AVXWIN;
#endif case 16:
return AUTO_8_SSEWIN;
case 8:
return 10+AUTO_16_SSEWIN;
case 0:
return 10+AUTO_16_SSE;
}
fprintf(stderr, "Error in tdec_sb_idx_8() invalid nof_sb=%d\n", nof_sb);
return 0;
} }
void srslte_tdec_decision(srslte_tdec_t * h, uint8_t *output, uint32_t long_cb) { // TODO: Implement SSE version. Don't really a problem since this only called at very low rates
uint8_t *output_par[SRSLTE_TDEC_MAX_NPAR]; static void convert_8_to_16(int8_t *in, int16_t *out, uint32_t len)
output_par[0] = output; {
srslte_tdec_decision_par(h, output_par, long_cb); for (int i=0;i<len;i++) {
out[i] = (int16_t) in[i];
}
} }
void srslte_tdec_decision_byte_par(srslte_tdec_t * h, uint8_t *output[SRSLTE_TDEC_MAX_NPAR], uint32_t long_cb) { static void convert_16_to_8(int16_t *in, int8_t *out, uint32_t len)
#ifdef LV_HAVE_SSE {
srslte_tdec_simd_decision_byte(&h->tdec_simd, output, long_cb); for (int i=0;i<len;i++) {
#else out[i] = (int8_t) in[i];
srslte_tdec_gen_decision_byte(&h->tdec_gen, output[0], long_cb); }
#endif
} }
void srslte_tdec_decision_byte_par_cb(srslte_tdec_t * h, uint8_t *output, uint32_t cb_idx, uint32_t long_cb) { static void tdec_iteration_8(srslte_tdec_t * h, int8_t * input)
#ifdef LV_HAVE_SSE {
srslte_tdec_simd_decision_byte_cb(&h->tdec_simd, output, cb_idx, long_cb); // Select decoder if in auto mode
#else if (h->dec_type == SRSLTE_TDEC_AUTO) {
srslte_tdec_gen_decision_byte(&h->tdec_gen, output, long_cb); h->current_llr_type = SRSLTE_TDEC_8;
#endif h->current_dec = tdec_sb_idx_8(h->current_long_cb);
h->current_inter_idx = interleaver_idx(h->nof_blocks8[h->current_dec]);
// If long_cb is not multiple of any 8-bit decoder, use a 16-bit decoder and do type conversion
if (h->current_dec >= 10) {
h->current_llr_type = SRSLTE_TDEC_16;
h->current_dec -= 10;
h->current_inter_idx = interleaver_idx(h->nof_blocks16[h->current_dec]);
}
} else {
h->current_dec = 0;
} }
void srslte_tdec_decision_byte(srslte_tdec_t * h, uint8_t *output, uint32_t long_cb) { if (h->current_llr_type == SRSLTE_TDEC_16) {
uint8_t *output_par[SRSLTE_TDEC_MAX_NPAR]; if (!h->n_iter) {
output_par[0] = output; convert_8_to_16(input, h->input_conv, 3*h->current_long_cb+12);
srslte_tdec_decision_byte_par(h, output_par, long_cb); }
run_tdec_iteration_16bit(h, h->input_conv);
} else {
run_tdec_iteration_8bit(h, input);
}
} }
int srslte_tdec_run_all_par(srslte_tdec_t * h, int16_t * input[SRSLTE_TDEC_MAX_NPAR], static void tdec_iteration_16(srslte_tdec_t * h, int16_t * input)
uint8_t *output[SRSLTE_TDEC_MAX_NPAR], {
uint32_t nof_iterations, uint32_t long_cb) { // Select decoder if in auto mode
#ifdef LV_HAVE_SSE if (h->dec_type == SRSLTE_TDEC_AUTO) {
return srslte_tdec_simd_run_all(&h->tdec_simd, input, output, nof_iterations, long_cb); h->current_llr_type = SRSLTE_TDEC_16;
#else h->current_dec = tdec_sb_idx(h->current_long_cb);
srslte_vec_convert_if(input[0], 0.01, h->input_conv, 3*long_cb+12); } else {
return srslte_tdec_gen_run_all(&h->tdec_gen, h->input_conv, output[0], nof_iterations, long_cb); h->current_dec = 0;
#endif }
h->current_inter_idx = interleaver_idx(h->nof_blocks16[h->current_dec]);
if (h->current_llr_type == SRSLTE_TDEC_8) {
h->current_inter_idx = interleaver_idx(h->nof_blocks8[h->current_dec]);
if (!h->n_iter) {
convert_16_to_8(input, h->input_conv, 3*h->current_long_cb+12);
}
run_tdec_iteration_8bit(h, h->input_conv);
} else {
run_tdec_iteration_16bit(h, input);
}
} }
int srslte_tdec_run_all(srslte_tdec_t * h, int16_t * input, uint8_t *output, uint32_t nof_iterations, uint32_t long_cb) /* Resets the decoder and sets the codeblock length */
int srslte_tdec_new_cb(srslte_tdec_t * h, uint32_t long_cb)
{ {
uint8_t *output_par[SRSLTE_TDEC_MAX_NPAR]; if (long_cb > h->max_long_cb) {
output_par[0] = output; fprintf(stderr, "TDEC was initialized for max_long_cb=%d\n",
int16_t *input_par[SRSLTE_TDEC_MAX_NPAR]; h->max_long_cb);
input_par[0] = input; return -1;
}
return srslte_tdec_run_all_par(h, input_par, output_par, nof_iterations, long_cb); h->n_iter = 0;
h->current_long_cb = long_cb;
h->current_cbidx = srslte_cbsegm_cbindex(long_cb);
if (h->current_cbidx < 0) {
fprintf(stderr, "Invalid CB length %d\n", long_cb);
return -1;
}
return 0;
}
void srslte_tdec_iteration(srslte_tdec_t * h, int16_t * input, uint8_t *output)
{
if (h->current_cbidx >= 0) {
tdec_iteration_16(h, input);
tdec_decision_byte(h, output);
}
}
/* Runs nof_iterations iterations and decides the output bits */
int srslte_tdec_run_all(srslte_tdec_t * h, int16_t * input, uint8_t *output,
uint32_t nof_iterations, uint32_t long_cb)
{
if (srslte_tdec_new_cb(h, long_cb)) {
return SRSLTE_ERROR;
}
do {
tdec_iteration_16(h, input);
} while (h->n_iter < nof_iterations);
tdec_decision_byte(h, output);
return SRSLTE_SUCCESS;
}
void srslte_tdec_iteration_8bit(srslte_tdec_t * h, int8_t * input, uint8_t *output)
{
if (h->current_cbidx >= 0) {
tdec_iteration_8(h, input);
tdec_decision_byte(h, output);
}
}
/* Runs nof_iterations iterations and decides the output bits */
int srslte_tdec_run_all_8bit(srslte_tdec_t * h, int8_t * input, uint8_t *output,
uint32_t nof_iterations, uint32_t long_cb)
{
if (srslte_tdec_new_cb(h, long_cb)) {
return SRSLTE_ERROR;
}
do {
tdec_iteration_8(h, input);
} while (h->n_iter < nof_iterations);
tdec_decision_byte(h, output);
return SRSLTE_SUCCESS;
}
int srslte_tdec_get_nof_iterations(srslte_tdec_t * h)
{
return h->n_iter;
} }

@ -1,475 +0,0 @@
/**
*
* \section COPYRIGHT
*
* Copyright 2013-2015 Software Radio Systems Limited
*
* \section LICENSE
*
* This file is part of the srsLTE library.
*
* srsLTE is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as
* published by the Free Software Foundation, either version 3 of
* the License, or (at your option) any later version.
*
* srsLTE is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* A copy of the GNU Affero General Public License can be found in
* the LICENSE file in the top-level directory of this distribution
* and at http://www.gnu.org/licenses/.
*
*/
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <strings.h>
#include <math.h>
#include "srslte/phy/fec/turbodecoder_simd.h"
#include "srslte/phy/utils/vector.h"
#include <inttypes.h>
#define NUMSTATES 8
#define NINPUTS 2
#define TAIL 3
#define TOTALTAIL 12
#define INF 10000
#define ZERO 0
#ifdef LV_HAVE_AVX2
#include <smmintrin.h>
#include <immintrin.h>
// Number of CB processed in parllel in AVX
#define NCB 2
/*
static void print_256i(__m256i x) {
int16_t *s = (int16_t*) &x;
printf("[%d", s[0]);
for (int i=1;i<16;i++) {
printf(",%d", s[i]);
}
printf("]\n");
}
*/
/* Computes the horizontal MAX from 8 16-bit integers using the minpos_epu16 SSE4.1 instruction */
static inline int16_t hMax0(__m256i masked_value)
{
__m128i tmp1 = _mm256_extractf128_si256(masked_value, 0);
__m128i tmp3 = _mm_minpos_epu16(tmp1);
return (int16_t)(_mm_cvtsi128_si32(tmp3));
}
static inline int16_t hMax1(__m256i masked_value)
{
__m128i tmp1 = _mm256_extractf128_si256(masked_value, 1);
__m128i tmp3 = _mm_minpos_epu16(tmp1);
return (int16_t)(_mm_cvtsi128_si32(tmp3));
}
/* Computes beta values */
void map_avx_beta(map_gen_t * s, int16_t * output[SRSLTE_TDEC_MAX_NPAR], uint32_t long_cb)
{
int k;
uint32_t end = long_cb + 3;
const __m256i *alphaPtr = (const __m256i*) s->alpha;
__m256i beta_k = _mm256_set_epi16(-INF, -INF, -INF, -INF, -INF, -INF, -INF, 0, -INF, -INF, -INF, -INF, -INF, -INF, -INF, 0);
__m256i g, bp, bn, alpha_k;
/* Define the shuffle constant for the positive beta */
__m256i shuf_bp = _mm256_set_epi8(
// 1st CB
15+16, 14+16, // 7
7+16, 6+16, // 3
5+16, 4+16, // 2
13+16, 12+16, // 6
11+16, 10+16, // 5
3+16, 2+16, // 1
1+16, 0+16, // 0
9+16, 8+16, // 4
// 2nd CB
15, 14, // 7
7, 6, // 3
5, 4, // 2
13, 12, // 6
11, 10, // 5
3, 2, // 1
1, 0, // 0
9, 8 // 4
);
/* Define the shuffle constant for the negative beta */
__m256i shuf_bn = _mm256_set_epi8(
7+16, 6+16, // 3
15+16, 14+16, // 7
13+16, 12+16, // 6
5+16, 4+16, // 2
3+16, 2+16, // 1
11+16, 10+16, // 5
9+16, 8+16, // 4
1+16, 0+16, // 0
7, 6, // 3
15, 14, // 7
13, 12, // 6
5, 4, // 2
3, 2, // 1
11, 10, // 5
9, 8, // 4
1, 0 // 0
);
alphaPtr += long_cb-1;
/* Define shuffle for branch costs */
__m256i shuf_g[4];
shuf_g[3] = _mm256_set_epi8(3+16,2+16,1+16,0+16,1+16,0+16,3+16,2+16,3+16,2+16,1+16,0+16,1+16,0+16,3+16,2+16,
3,2,1,0,1,0,3,2,3,2,1,0,1,0,3,2);
shuf_g[2] = _mm256_set_epi8(7+16,6+16,5+16,4+16,5+16,4+16,7+16,6+16,7+16,6+16,5+16,4+16,5+16,4+16,7+16,6+16,
7,6,5,4,5,4,7,6,7,6,5,4,5,4,7,6);
shuf_g[1] = _mm256_set_epi8(11+16,10+16,9+16,8+16,9+16,8+16,11+16,10+16,11+16,10+16,9+16,8+16,9+16,8+16,11+16,10+16,
11,10,9,8,9,8,11,10,11,10,9,8,9,8,11,10);
shuf_g[0] = _mm256_set_epi8(15+16,14+16,13+16,12+16,13+16,12+16,15+16,14+16,15+16,14+16,13+16,12+16,13+16,12+16,15+16,14+16,
15,14,13,12,13,12,15,14,15,14,13,12,13,12,15,14);
/* Define shuffle for beta normalization */
__m256i shuf_norm = _mm256_set_epi8(17,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0);
__m256i gv;
int16_t *b = &s->branch[2*NCB*long_cb-16];
__m256i *gPtr = (__m256i*) b;
/* This defines a beta computation step:
* Adds and substracts the branch metrics to the previous beta step,
* shuffles the states according to the trellis path and selects maximum state
*/
#define BETA_STEP(g) bp = _mm256_add_epi16(beta_k, g);\
bn = _mm256_sub_epi16(beta_k, g);\
bp = _mm256_shuffle_epi8(bp, shuf_bp);\
bn = _mm256_shuffle_epi8(bn, shuf_bn);\
beta_k = _mm256_max_epi16(bp, bn);
/* Loads the alpha metrics from memory and adds them to the temporal bn and bp
* metrics. Then computes horizontal maximum of both metrics and computes difference
*/
#define BETA_STEP_CNT(c,d) g = _mm256_shuffle_epi8(gv, shuf_g[c]);\
BETA_STEP(g)\
alpha_k = _mm256_load_si256(alphaPtr);\
alphaPtr--;\
bp = _mm256_add_epi16(bp, alpha_k);\
bn = _mm256_add_epi16(bn, alpha_k);\
bn = _mm256_sub_epi16(_mm256_set1_epi16(0x7FFF), bn);\
bp = _mm256_sub_epi16(_mm256_set1_epi16(0x7FFF), bp);\
output[0][k-d] = hMax0(bn) - hMax0(bp);\
output[1][k-d] = hMax1(bn) - hMax1(bp);
/* The tail does not require to load alpha or produce outputs. Only update
* beta metrics accordingly */
for (k=end-1; k>=long_cb; k--) {
int16_t g0_1 = s->branch[2*NCB*k];
int16_t g1_1 = s->branch[2*NCB*k+1];
int16_t g0_2 = s->branch[2*NCB*k+6];
int16_t g1_2 = s->branch[2*NCB*k+6+1];
g = _mm256_set_epi16(g1_2, g0_2, g0_2, g1_2, g1_2, g0_2, g0_2, g1_2, g1_1, g0_1, g0_1, g1_1, g1_1, g0_1, g0_1, g1_1);
BETA_STEP(g);
}
/* We inline 2 trelis steps for each normalization */
__m256i norm;
for (; k >= 0; k-=8) {
gv = _mm256_load_si256(gPtr);
gPtr--;
BETA_STEP_CNT(0,0);
BETA_STEP_CNT(1,1);
BETA_STEP_CNT(2,2);
BETA_STEP_CNT(3,3);
norm = _mm256_shuffle_epi8(beta_k, shuf_norm);
beta_k = _mm256_sub_epi16(beta_k, norm);
gv = _mm256_load_si256(gPtr);
gPtr--;
BETA_STEP_CNT(0,4);
BETA_STEP_CNT(1,5);
BETA_STEP_CNT(2,6);
BETA_STEP_CNT(3,7);
norm = _mm256_shuffle_epi8(beta_k, shuf_norm);
beta_k = _mm256_sub_epi16(beta_k, norm);
}
}
/* Computes alpha metrics */
void map_avx_alpha(map_gen_t * s, uint32_t long_cb)
{
uint32_t k;
int16_t *alpha1 = s->alpha;
int16_t *alpha2 = &s->alpha[8];
uint32_t i;
alpha1[0] = 0;
alpha2[0] = 0;
for (i = 1; i < 8; i++) {
alpha1[i] = -INF;
alpha2[i] = -INF;
}
/* Define the shuffle constant for the positive alpha */
__m256i shuf_ap = _mm256_set_epi8(
// 1st CB
31, 30, // 7
25, 24, // 4
23, 22, // 3
17, 16, // 0
29, 28, // 6
27, 26, // 5
21, 20, // 2
19, 18, // 1
// 2nd CB
15, 14, // 7
9, 8, // 4
7, 6, // 3
1, 0, // 0
13, 12, // 6
11, 10, // 5
5, 4, // 2
3, 2 // 1
);
/* Define the shuffle constant for the negative alpha */
__m256i shuf_an = _mm256_set_epi8(
// 1nd CB
29, 28, // 6
27, 26, // 5
21, 20, // 2
19, 18, // 1
31, 30, // 7
25, 24, // 4
23, 22, // 3
17, 16, // 0
// 2nd CB
13, 12, // 6
11, 10, // 5
5, 4, // 2
3, 2, // 1
15, 14, // 7
9, 8, // 4
7, 6, // 3
1, 0 // 0
);
/* Define shuffle for branch costs */
__m256i shuf_g[4];
shuf_g[0] = _mm256_set_epi8(3+16,2+16,3+16,2+16,1+16,0+16,1+16,0+16,1+16,0+16,1+16,0+16,3+16,2+16,3+16,2+16,
3,2,3,2,1,0,1,0,1,0,1,0,3,2,3,2);
shuf_g[1] = _mm256_set_epi8(7+16,6+16,7+16,6+16,5+16,4+16,5+16,4+16,5+16,4+16,5+16,4+16,7+16,6+16,7+16,6+16,
7,6,7,6,5,4,5,4,5,4,5,4,7,6,7,6);
shuf_g[2] = _mm256_set_epi8(11+16,10+16,11+16,10+16,9+16,8+16,9+16,8+16,9+16,8+16,9+16,8+16,11+16,10+16,11+16,10+16,
11,10,11,10,9,8,9,8,9,8,9,8,11,10,11,10);
shuf_g[3] = _mm256_set_epi8(15+16,14+16,15+16,14+16,13+16,12+16,13+16,12+16,13+16,12+16,13+16,12+16,15+16,14+16,15+16,14+16,
15,14,15,14,13,12,13,12,13,12,13,12,15,14,15,14);
__m256i shuf_norm = _mm256_set_epi8(17,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0);
__m256i* alphaPtr = (__m256i*) s->alpha;
alphaPtr++;
__m256i gv;
__m256i *gPtr = (__m256i*) s->branch;
__m256i g, ap, an;
__m256i alpha_k = _mm256_set_epi16(-INF, -INF, -INF, -INF, -INF, -INF, -INF, 0, -INF, -INF, -INF, -INF, -INF, -INF, -INF, 0);
/* This defines a alpha computation step:
* Adds and substracts the branch metrics to the previous alpha step,
* shuffles the states according to the trellis path and selects maximum state
*/
#define ALPHA_STEP(c) g = _mm256_shuffle_epi8(gv, shuf_g[c]); \
ap = _mm256_add_epi16(alpha_k, g);\
an = _mm256_sub_epi16(alpha_k, g);\
ap = _mm256_shuffle_epi8(ap, shuf_ap);\
an = _mm256_shuffle_epi8(an, shuf_an);\
alpha_k = _mm256_max_epi16(ap, an);\
_mm256_store_si256(alphaPtr, alpha_k);\
alphaPtr++;\
/* In this loop, we compute 8 steps and normalize twice for each branch metrics memory load */
__m256i norm;
for (k = 0; k < long_cb/8; k++) {
gv = _mm256_load_si256(gPtr);
gPtr++;
ALPHA_STEP(0);
ALPHA_STEP(1);
ALPHA_STEP(2);
ALPHA_STEP(3);
norm = _mm256_shuffle_epi8(alpha_k, shuf_norm);
alpha_k = _mm256_sub_epi16(alpha_k, norm);
gv = _mm256_load_si256(gPtr);
gPtr++;
ALPHA_STEP(0);
ALPHA_STEP(1);
ALPHA_STEP(2);
ALPHA_STEP(3);
norm = _mm256_shuffle_epi8(alpha_k, shuf_norm);
alpha_k = _mm256_sub_epi16(alpha_k, norm);
}
}
void map_sse_gamma_single(int16_t *output, int16_t *input, int16_t *app, int16_t *parity)
{
__m128i res00, res10, res01, res11, res0, res1;
__m128i in, ap, pa, g1, g0;
__m128i *inPtr = (__m128i*) input;
__m128i *appPtr = (__m128i*) app;
__m128i *paPtr = (__m128i*) parity;
__m128i *resPtr = (__m128i*) output;
__m128i res00_mask = _mm_set_epi8(0xff,0xff,7,6,0xff,0xff,5,4,0xff,0xff,3,2,0xff,0xff,1,0);
__m128i res10_mask = _mm_set_epi8(0xff,0xff,15,14,0xff,0xff,13,12,0xff,0xff,11,10,0xff,0xff,9,8);
__m128i res01_mask = _mm_set_epi8(7,6,0xff,0xff,5,4,0xff,0xff,3,2,0xff,0xff,1,0,0xff,0xff);
__m128i res11_mask = _mm_set_epi8(15,14,0xff,0xff,13,12,0xff,0xff,11,10,0xff,0xff,9,8,0xff,0xff);
in = _mm_load_si128(inPtr);
inPtr++;
pa = _mm_load_si128(paPtr);
paPtr++;
if (appPtr) {
ap = _mm_load_si128(appPtr);
appPtr++;
in = _mm_add_epi16(ap, in);
}
g1 = _mm_add_epi16(in, pa);
g0 = _mm_sub_epi16(in, pa);
g1 = _mm_srai_epi16(g1, 1);
g0 = _mm_srai_epi16(g0, 1);
res00 = _mm_shuffle_epi8(g0, res00_mask);
res10 = _mm_shuffle_epi8(g0, res10_mask);
res01 = _mm_shuffle_epi8(g1, res01_mask);
res11 = _mm_shuffle_epi8(g1, res11_mask);
res0 = _mm_or_si128(res00, res01);
res1 = _mm_or_si128(res10, res11);
_mm_store_si128(resPtr, res0);
resPtr++;
_mm_store_si128(resPtr, res1);
resPtr++;
}
/* Compute branch metrics (gamma) */
void map_avx_gamma(map_gen_t * h, int16_t *input, int16_t *app, int16_t *parity, uint32_t cbidx, uint32_t long_cb)
{
__m128i res10, res20, res11, res21, res1, res2;
__m256i in, ap, pa, g1, g0;
__m256i *inPtr = (__m256i*) input;
__m256i *appPtr = (__m256i*) app;
__m256i *paPtr = (__m256i*) parity;
__m128i *resPtr = (__m128i*) h->branch;
if (cbidx) {
resPtr++;
}
__m128i res10_mask = _mm_set_epi8(0xff,0xff,7,6,0xff,0xff,5,4,0xff,0xff,3,2,0xff,0xff,1,0);
__m128i res11_mask = _mm_set_epi8(7,6,0xff,0xff,5,4,0xff,0xff,3,2,0xff,0xff,1,0,0xff,0xff);
__m128i res20_mask = _mm_set_epi8(0xff,0xff,15,14,0xff,0xff,13,12,0xff,0xff,11,10,0xff,0xff,9,8);
__m128i res21_mask = _mm_set_epi8(15,14,0xff,0xff,13,12,0xff,0xff,11,10,0xff,0xff,9,8,0xff,0xff);
for (int i=0;i<long_cb/16;i++) {
in = _mm256_load_si256(inPtr);
inPtr++;
pa = _mm256_load_si256(paPtr);
paPtr++;
if (appPtr) {
ap = _mm256_load_si256(appPtr);
appPtr++;
in = _mm256_add_epi16(ap, in);
}
g0 = _mm256_sub_epi16(in, pa);
g1 = _mm256_add_epi16(in, pa);
g0 = _mm256_srai_epi16(g0, 1);
g1 = _mm256_srai_epi16(g1, 1);
__m128i g0_t = _mm256_extractf128_si256(g0, 0);
__m128i g1_t = _mm256_extractf128_si256(g1, 0);
res10 = _mm_shuffle_epi8(g0_t, res10_mask);
res11 = _mm_shuffle_epi8(g1_t, res11_mask);
res20 = _mm_shuffle_epi8(g0_t, res20_mask);
res21 = _mm_shuffle_epi8(g1_t, res21_mask);
res1 = _mm_or_si128(res10, res11);
res2 = _mm_or_si128(res20, res21);
_mm_store_si128(resPtr, res1);
resPtr++;
resPtr++;
_mm_store_si128(resPtr, res2);
resPtr++;
resPtr++;
g0_t = _mm256_extractf128_si256(g0, 1);
g1_t = _mm256_extractf128_si256(g1, 1);
res10 = _mm_shuffle_epi8(g0_t, res10_mask);
res11 = _mm_shuffle_epi8(g1_t, res11_mask);
res20 = _mm_shuffle_epi8(g0_t, res20_mask);
res21 = _mm_shuffle_epi8(g1_t, res21_mask);
res1 = _mm_or_si128(res10, res11);
res2 = _mm_or_si128(res20, res21);
_mm_store_si128(resPtr, res1);
resPtr++;
resPtr++;
_mm_store_si128(resPtr, res2);
resPtr++;
resPtr++;
}
if (long_cb%16) {
map_sse_gamma_single((int16_t*) resPtr, (int16_t*) inPtr, (int16_t*) appPtr, (int16_t*) paPtr);
}
for (int i=long_cb;i<long_cb+3;i++) {
h->branch[2*i*NCB+cbidx*6] = (input[i] - parity[i])/2;
h->branch[2*i*NCB+cbidx*6+1] = (input[i] + parity[i])/2;
}
}
#endif

@ -39,8 +39,16 @@
#define TAIL 3 #define TAIL 3
#define TOTALTAIL 12 #define TOTALTAIL 12
#define INF 9e4 #define INF 10000
#define ZERO 9e-4
#define debug_enabled 0
#if debug_enabled
#define debug_state printf("k=%5d, in=%5d, pa=%3d, out=%5d, alpha=", k, x, parity[k-1], out); srslte_vec_fprint_s(stdout, alpha, 8); \
printf(", beta="); srslte_vec_fprint_s(stdout, &beta[8*(k)], 8); printf("\n");
#else
#define debug_state
#endif
/************************************************ /************************************************
* *
@ -48,14 +56,13 @@
* Decoder * Decoder
* *
************************************************/ ************************************************/
static void map_gen_beta(srslte_map_gen_vl_t * s, float * input, float * parity, static void map_gen_beta(tdec_gen_t *s, int16_t *input, int16_t *app, int16_t *parity, uint32_t long_cb)
uint32_t long_cb)
{ {
float m_b[8], new[8], old[8]; int16_t m_b[8], new[8], old[8];
float x, y, xy; int16_t x, y, xy;
int k; int k;
uint32_t end = long_cb + SRSLTE_TCOD_RATE; uint32_t end = long_cb + SRSLTE_TCOD_RATE;
float *beta = s->beta; int16_t *beta = s->beta;
uint32_t i; uint32_t i;
for (i = 0; i < 8; i++) { for (i = 0; i < 8; i++) {
@ -64,6 +71,9 @@ static void map_gen_beta(srslte_map_gen_vl_t * s, float * input, float * parity,
for (k = end - 1; k >= 0; k--) { for (k = end - 1; k >= 0; k--) {
x = input[k]; x = input[k];
if (app && k<long_cb) {
x += app[k];
}
y = parity[k]; y = parity[k];
xy = x + y; xy = x + y;
@ -92,19 +102,25 @@ static void map_gen_beta(srslte_map_gen_vl_t * s, float * input, float * parity,
old[i] = new[i]; old[i] = new[i];
beta[8 * k + i] = old[i]; beta[8 * k + i] = old[i];
} }
if ((k%4)==0 && k < long_cb) {
for (i = 1; i < 8; i++) {
old[i] -= old[0];
}
old[0] = 0;
}
} }
} }
static void map_gen_alpha(srslte_map_gen_vl_t * s, float * input, float * parity, float * output, static void map_gen_alpha(tdec_gen_t *s, int16_t *input, int16_t *app, int16_t *parity, int16_t *output, uint32_t long_cb)
uint32_t long_cb)
{ {
float m_b[8], new[8], old[8], max1[8], max0[8]; int16_t m_b[8], new[8], old[8], max1[8], max0[8];
float m1, m0; int16_t m1, m0;
float x, y, xy; int16_t x, y, xy;
float out; int16_t out;
uint32_t k; uint32_t k;
uint32_t end = long_cb; uint32_t end = long_cb;
float *beta = s->beta; int16_t *beta = s->beta;
uint32_t i; uint32_t i;
old[0] = 0; old[0] = 0;
@ -112,12 +128,23 @@ static void map_gen_alpha(srslte_map_gen_vl_t * s, float * input, float * parity
old[i] = -INF; old[i] = -INF;
} }
#if debug_enabled
int16_t alpha[8];
#endif
for (k = 1; k < end + 1; k++) { for (k = 1; k < end + 1; k++) {
x = input[k - 1]; x = input[k - 1];
if (app) {
x += app[k - 1];
}
y = parity[k - 1]; y = parity[k - 1];
xy = x + y; xy = x + y;
#if debug_enabled
memcpy(alpha, old, sizeof(int16_t)*8);
#endif
m_b[0] = old[0]; m_b[0] = old[0];
m_b[1] = old[3] + y; m_b[1] = old[3] + y;
m_b[2] = old[4] + y; m_b[2] = old[4] + y;
@ -150,251 +177,97 @@ static void map_gen_alpha(srslte_map_gen_vl_t * s, float * input, float * parity
if (max0[i] > m0) if (max0[i] > m0)
m0 = max0[i]; m0 = max0[i];
} }
for (i = 0; i < 8; i++) { for (i = 0; i < 8; i++) {
if (m_b[i] > new[i]) if (m_b[i] > new[i])
new[i] = m_b[i]; new[i] = m_b[i];
old[i] = new[i]; old[i] = new[i]; }
if ((k%4)==0) {
for (i = 1; i < 8; i++) {
old[i] -= old[0];
}
old[0] = 0;
} }
out = m1 - m0; out = m1 - m0;
output[k - 1] = out; output[k - 1] = out;
debug_state;
} }
} }
static int map_gen_init(srslte_map_gen_vl_t * h, int max_long_cb) int tdec_gen_init(void **hh, uint32_t max_long_cb)
{ {
bzero(h, sizeof(srslte_map_gen_vl_t)); *hh = calloc(1, sizeof(tdec_gen_t));
h->beta = srslte_vec_malloc(sizeof(float) * (max_long_cb + SRSLTE_TCOD_TOTALTAIL + 1) * NUMSTATES);
tdec_gen_t *h = (tdec_gen_t*) *hh;
h->beta = srslte_vec_malloc(sizeof(int16_t) * (max_long_cb + SRSLTE_TCOD_TOTALTAIL + 1) * NUMSTATES);
if (!h->beta) { if (!h->beta) {
perror("srslte_vec_malloc"); perror("srslte_vec_malloc");
return -1; return -1;
} }
h->max_long_cb = max_long_cb; h->max_long_cb = max_long_cb;
return 0; return 1;
} }
static void map_gen_free(srslte_map_gen_vl_t * h) void tdec_gen_free(void *hh)
{ {
tdec_gen_t *h = (tdec_gen_t*) hh;
if (h->beta) { if (h->beta) {
free(h->beta); free(h->beta);
} }
bzero(h, sizeof(srslte_map_gen_vl_t)); bzero(h, sizeof(tdec_gen_t));
} }
static void map_gen_dec(srslte_map_gen_vl_t * h, float * input, float * parity, float * output, void tdec_gen_dec(void *hh, int16_t *input, int16_t *app, int16_t *parity, int16_t *output, uint32_t long_cb)
uint32_t long_cb)
{ {
uint32_t k; tdec_gen_t *h = (tdec_gen_t*) hh;
h->beta[(long_cb + TAIL) * NUMSTATES] = 0; h->beta[(long_cb + TAIL) * NUMSTATES] = 0;
for (k = 1; k < NUMSTATES; k++) for (uint32_t k = 1; k < NUMSTATES; k++)
h->beta[(long_cb + TAIL) * NUMSTATES + k] = -INF; h->beta[(long_cb + TAIL) * NUMSTATES + k] = -INF;
map_gen_beta(h, input, parity, long_cb); map_gen_beta(h, input, app, parity, long_cb);
map_gen_alpha(h, input, parity, output, long_cb); map_gen_alpha(h, input, app, parity, output, long_cb);
}
/************************************************
*
* TURBO DECODER INTERFACE
*
************************************************/
int srslte_tdec_gen_init(srslte_tdec_gen_t * h, uint32_t max_long_cb)
{
int ret = -1;
bzero(h, sizeof(srslte_tdec_gen_t));
uint32_t len = max_long_cb + SRSLTE_TCOD_TOTALTAIL;
h->max_long_cb = max_long_cb;
h->llr1 = srslte_vec_malloc(sizeof(float) * len);
if (!h->llr1) {
perror("srslte_vec_malloc");
goto clean_and_exit;
}
h->llr2 = srslte_vec_malloc(sizeof(float) * len);
if (!h->llr2) {
perror("srslte_vec_malloc");
goto clean_and_exit;
}
h->w = srslte_vec_malloc(sizeof(float) * len);
if (!h->w) {
perror("srslte_vec_malloc");
goto clean_and_exit;
}
h->syst = srslte_vec_malloc(sizeof(float) * len);
if (!h->syst) {
perror("srslte_vec_malloc");
goto clean_and_exit;
}
h->parity = srslte_vec_malloc(sizeof(float) * len);
if (!h->parity) {
perror("srslte_vec_malloc");
goto clean_and_exit;
}
if (map_gen_init(&h->dec, h->max_long_cb)) {
goto clean_and_exit;
}
for (int i=0;i<SRSLTE_NOF_TC_CB_SIZES;i++) {
if (srslte_tc_interl_init(&h->interleaver[i], srslte_cbsegm_cbsize(i)) < 0) {
goto clean_and_exit;
}
srslte_tc_interl_LTE_gen(&h->interleaver[i], srslte_cbsegm_cbsize(i));
}
h->current_cbidx = -1;
ret = 0;
clean_and_exit:if (ret == -1) {
srslte_tdec_gen_free(h);
}
return ret;
}
void srslte_tdec_gen_free(srslte_tdec_gen_t * h)
{
if (h->llr1) {
free(h->llr1);
}
if (h->llr2) {
free(h->llr2);
}
if (h->w) {
free(h->w);
}
if (h->syst) {
free(h->syst);
}
if (h->parity) {
free(h->parity);
}
map_gen_free(&h->dec);
for (int i=0;i<SRSLTE_NOF_TC_CB_SIZES;i++) {
srslte_tc_interl_free(&h->interleaver[i]);
}
bzero(h, sizeof(srslte_tdec_gen_t));
} }
void srslte_tdec_gen_iteration(srslte_tdec_gen_t * h, float * input, uint32_t long_cb) void tdec_gen_extract_input(int16_t *input, int16_t *syst, int16_t *app2, int16_t *parity0, int16_t *parity1, uint32_t long_cb)
{ {
uint32_t i;
if (h->current_cbidx >= 0) {
uint16_t *inter = h->interleaver[h->current_cbidx].forward;
uint16_t *deinter = h->interleaver[h->current_cbidx].reverse;
// Prepare systematic and parity bits for MAP DEC #1 // Prepare systematic and parity bits for MAP DEC #1
for (i = 0; i < long_cb; i++) { for (uint32_t i = 0; i < long_cb; i++) {
h->syst[i] = input[SRSLTE_TCOD_RATE * i] + h->w[i]; syst[i] = input[SRSLTE_TCOD_RATE * i];
h->parity[i] = input[SRSLTE_TCOD_RATE * i + 1]; parity0[i] = input[SRSLTE_TCOD_RATE * i + 1];
} parity1[i] = input[SRSLTE_TCOD_RATE * i + 2];
for (i = long_cb; i < long_cb + SRSLTE_TCOD_RATE; i++) {
h->syst[i] = input[SRSLTE_TCOD_RATE * long_cb + NINPUTS * (i - long_cb)];
h->parity[i] = input[SRSLTE_TCOD_RATE * long_cb + NINPUTS * (i - long_cb) + 1];
} }
for (uint32_t i = long_cb; i < long_cb + SRSLTE_TCOD_RATE; i++) {
syst[i] = input[SRSLTE_TCOD_RATE * long_cb + NINPUTS * (i - long_cb)];
parity0[i] = input[SRSLTE_TCOD_RATE * long_cb + NINPUTS * (i - long_cb) + 1];
// Run MAP DEC #1 app2[i] = input[SRSLTE_TCOD_RATE * long_cb + NINPUTS * SRSLTE_TCOD_RATE + NINPUTS * (i - long_cb)];
map_gen_dec(&h->dec, h->syst, h->parity, h->llr1, long_cb); parity1[i] = input[SRSLTE_TCOD_RATE * long_cb + NINPUTS * SRSLTE_TCOD_RATE
// Prepare systematic and parity bits for MAP DEC #1
for (i = 0; i < long_cb; i++) {
h->syst[i] = h->llr1[inter[i]]
- h->w[inter[i]];
h->parity[i] = input[SRSLTE_TCOD_RATE * i + 2];
}
for (i = long_cb; i < long_cb + SRSLTE_TCOD_RATE; i++) {
h->syst[i] =
input[SRSLTE_TCOD_RATE * long_cb + NINPUTS * SRSLTE_TCOD_RATE + NINPUTS * (i - long_cb)];
h->parity[i] = input[SRSLTE_TCOD_RATE * long_cb + NINPUTS * SRSLTE_TCOD_RATE
+ NINPUTS * (i - long_cb) + 1]; + NINPUTS * (i - long_cb) + 1];
} }
// Run MAP DEC #2
map_gen_dec(&h->dec, h->syst, h->parity, h->llr2, long_cb);
//printf("llr2=");
//srslte_vec_fprint_f(stdout, h->llr2, long_cb);
// Update a-priori LLR from the last iteration
for (i = 0; i < long_cb; i++) {
h->w[i] += h->llr2[deinter[i]] - h->llr1[i];
}
} else {
fprintf(stderr, "Error CB index not set (call srslte_tdec_gen_reset() first\n");
}
// Increase number of iterations
h->n_iter++;
}
int srslte_tdec_gen_reset(srslte_tdec_gen_t * h, uint32_t long_cb)
{
if (long_cb > h->max_long_cb) {
fprintf(stderr, "TDEC was initialized for max_long_cb=%d\n",
h->max_long_cb);
return -1;
}
memset(h->w, 0, sizeof(float) * long_cb);
h->current_cbidx = srslte_cbsegm_cbindex(long_cb);
h->current_cb_len = long_cb;
if (h->current_cbidx < 0) {
fprintf(stderr, "Invalid CB length %d\n", long_cb);
return -1;
}
return 0;
} }
void srslte_tdec_gen_decision(srslte_tdec_gen_t * h, uint8_t *output, uint32_t long_cb) void tdec_gen_decision_byte(int16_t *app1, uint8_t *output, uint32_t long_cb)
{ {
uint16_t *deinter = h->interleaver[h->current_cbidx].reverse;
uint32_t i;
for (i = 0; i < long_cb; i++) {
output[i] = (h->llr2[deinter[i]] > 0) ? 1 : 0;
}
}
void srslte_tdec_gen_decision_byte(srslte_tdec_gen_t * h, uint8_t *output, uint32_t long_cb)
{
uint32_t i;
uint8_t mask[8] = {0x80, 0x40, 0x20, 0x10, 0x8, 0x4, 0x2, 0x1}; uint8_t mask[8] = {0x80, 0x40, 0x20, 0x10, 0x8, 0x4, 0x2, 0x1};
uint16_t *deinter = h->interleaver[h->current_cbidx].reverse;
// long_cb is always byte aligned // long_cb is always byte aligned
for (i = 0; i < long_cb/8; i++) { for (uint32_t i = 0; i < long_cb/8; i++) {
uint8_t out0 = h->llr2[deinter[8*i+0]]>0?mask[0]:0; uint8_t out0 = app1[8*i+0]>0?mask[0]:0;
uint8_t out1 = h->llr2[deinter[8*i+1]]>0?mask[1]:0; uint8_t out1 = app1[8*i+1]>0?mask[1]:0;
uint8_t out2 = h->llr2[deinter[8*i+2]]>0?mask[2]:0; uint8_t out2 = app1[8*i+2]>0?mask[2]:0;
uint8_t out3 = h->llr2[deinter[8*i+3]]>0?mask[3]:0; uint8_t out3 = app1[8*i+3]>0?mask[3]:0;
uint8_t out4 = h->llr2[deinter[8*i+4]]>0?mask[4]:0; uint8_t out4 = app1[8*i+4]>0?mask[4]:0;
uint8_t out5 = h->llr2[deinter[8*i+5]]>0?mask[5]:0; uint8_t out5 = app1[8*i+5]>0?mask[5]:0;
uint8_t out6 = h->llr2[deinter[8*i+6]]>0?mask[6]:0; uint8_t out6 = app1[8*i+6]>0?mask[6]:0;
uint8_t out7 = h->llr2[deinter[8*i+7]]>0?mask[7]:0; uint8_t out7 = app1[8*i+7]>0?mask[7]:0;
output[i] = out0 | out1 | out2 | out3 | out4 | out5 | out6 | out7; output[i] = out0 | out1 | out2 | out3 | out4 | out5 | out6 | out7;
} }
} }
int srslte_tdec_gen_run_all(srslte_tdec_gen_t * h, float * input, uint8_t *output,
uint32_t nof_iterations, uint32_t long_cb)
{
uint32_t iter = 0;
if (srslte_tdec_gen_reset(h, long_cb)) {
return SRSLTE_ERROR;
}
do {
srslte_tdec_gen_iteration(h, input, long_cb);
iter++;
} while (iter < nof_iterations);
srslte_tdec_gen_decision_byte(h, output, long_cb);
return SRSLTE_SUCCESS;
}

@ -1,542 +0,0 @@
/**
*
* \section COPYRIGHT
*
* Copyright 2013-2015 Software Radio Systems Limited
*
* \section LICENSE
*
* This file is part of the srsLTE library.
*
* srsLTE is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as
* published by the Free Software Foundation, either version 3 of
* the License, or (at your option) any later version.
*
* srsLTE is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* A copy of the GNU Affero General Public License can be found in
* the LICENSE file in the top-level directory of this distribution
* and at http://www.gnu.org/licenses/.
*
*/
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <strings.h>
#include <math.h>
#include "srslte/phy/fec/turbodecoder_simd.h"
#include "srslte/phy/utils/vector.h"
#include <inttypes.h>
#define NUMSTATES 8
#define NINPUTS 2
#define TAIL 3
#define TOTALTAIL 12
#define INF 10000
#define ZERO 0
#ifdef LV_HAVE_SSE
#include <smmintrin.h>
// Define SSE/AVX implementations
void map_sse_beta(map_gen_t * s, int16_t * output, uint32_t long_cb);
void map_sse_alpha(map_gen_t * s, uint32_t long_cb);
void map_sse_gamma(map_gen_t * h, int16_t *input, int16_t *app, int16_t *parity, uint32_t long_cb);
#ifdef LV_HAVE_AVX2
void map_avx_beta(map_gen_t * s, int16_t * output[SRSLTE_TDEC_MAX_NPAR], uint32_t long_cb);
void map_avx_alpha(map_gen_t * s, uint32_t long_cb);
void map_avx_gamma(map_gen_t * h, int16_t *input, int16_t *app, int16_t *parity, uint32_t cbidx, uint32_t long_cb);
#endif
void map_simd_beta(map_gen_t * s, int16_t * output[SRSLTE_TDEC_MAX_NPAR], uint32_t nof_cb, uint32_t long_cb)
{
if (nof_cb == 1) {
map_sse_beta(s, output[0], long_cb);
}
#ifdef LV_HAVE_AVX2
else if (nof_cb == 2) {
map_avx_beta(s, output, long_cb);
}
#endif
}
void map_simd_alpha(map_gen_t * s, uint32_t nof_cb, uint32_t long_cb)
{
if (nof_cb == 1) {
map_sse_alpha(s, long_cb);
}
#ifdef LV_HAVE_AVX2
else if (nof_cb == 2) {
map_avx_alpha(s, long_cb);
}
#endif
}
void map_simd_gamma(map_gen_t * s, int16_t *input, int16_t *app, int16_t *parity, uint32_t cbidx, uint32_t nof_cb, uint32_t long_cb)
{
if (nof_cb == 1) {
map_sse_gamma(s, input, app, parity, long_cb);
}
#ifdef LV_HAVE_AVX2
else if (nof_cb == 2) {
map_avx_gamma(s, input, app, parity, cbidx, long_cb);
}
#endif
}
/* Inititalizes constituent decoder object */
int map_simd_init(map_gen_t * h, uint32_t max_par_cb, uint32_t max_long_cb)
{
bzero(h, sizeof(map_gen_t));
h->max_par_cb = max_par_cb;
h->max_long_cb = max_long_cb;
h->alpha = srslte_vec_malloc(sizeof(int16_t) * (max_long_cb + SRSLTE_TCOD_TOTALTAIL + 1) * NUMSTATES * h->max_par_cb);
if (!h->alpha) {
perror("srslte_vec_malloc");
return -1;
}
h->branch = srslte_vec_malloc(sizeof(int16_t) * (max_long_cb + SRSLTE_TCOD_TOTALTAIL + 1) * NUMSTATES * h->max_par_cb);
if (!h->branch) {
perror("srslte_vec_malloc");
return -1;
}
return 0;
}
void map_simd_free(map_gen_t * h)
{
if (h->alpha) {
free(h->alpha);
}
if (h->branch) {
free(h->branch);
}
bzero(h, sizeof(map_gen_t));
}
/* Runs one instance of a decoder */
void map_simd_dec(map_gen_t * h, int16_t * input[SRSLTE_TDEC_MAX_NPAR], int16_t *app[SRSLTE_TDEC_MAX_NPAR], int16_t * parity[SRSLTE_TDEC_MAX_NPAR],
int16_t *output[SRSLTE_TDEC_MAX_NPAR], uint32_t cb_mask, uint32_t long_cb)
{
uint32_t nof_cb = 1;
int16_t *outptr[SRSLTE_TDEC_MAX_NPAR] = { NULL, NULL };
// Compute branch metrics
switch(cb_mask) {
case 1:
nof_cb = 1;
outptr[0] = output[0];
map_simd_gamma(h, input[0], app?app[0]:NULL, parity[0], 0, 1, long_cb);
break;
case 2:
nof_cb = 1;
outptr[0] = output[1];
map_simd_gamma(h, input[1], app?app[1]:NULL, parity[1], 0, 1, long_cb);
break;
case 3:
nof_cb = 2;
for (int i=0;i<2;i++) {
outptr[i] = output[i];
map_simd_gamma(h, input[i], app?app[i]:NULL, parity[i], i, 2, long_cb);
}
break;
}
// Forward recursion
map_simd_alpha(h, nof_cb, long_cb);
// Backwards recursion + LLR computation
map_simd_beta(h, outptr, nof_cb, long_cb);
}
/* Initializes the turbo decoder object */
int srslte_tdec_simd_init(srslte_tdec_simd_t * h, uint32_t max_par_cb, uint32_t max_long_cb)
{
int ret = -1;
bzero(h, sizeof(srslte_tdec_simd_t));
uint32_t len = max_long_cb + SRSLTE_TCOD_TOTALTAIL;
h->max_long_cb = max_long_cb;
h->max_par_cb = max_par_cb;
for (int i=0;i<h->max_par_cb;i++) {
h->app1[i] = srslte_vec_malloc(sizeof(int16_t) * len);
if (!h->app1[i]) {
perror("srslte_vec_malloc");
goto clean_and_exit;
}
h->app2[i] = srslte_vec_malloc(sizeof(int16_t) * len);
if (!h->app2[i]) {
perror("srslte_vec_malloc");
goto clean_and_exit;
}
h->ext1[i] = srslte_vec_malloc(sizeof(int16_t) * len);
if (!h->ext1[i]) {
perror("srslte_vec_malloc");
goto clean_and_exit;
}
h->ext2[i] = srslte_vec_malloc(sizeof(int16_t) * len);
if (!h->ext2[i]) {
perror("srslte_vec_malloc");
goto clean_and_exit;
}
h->syst[i] = srslte_vec_malloc(sizeof(int16_t) * len);
if (!h->syst[i]) {
perror("srslte_vec_malloc");
goto clean_and_exit;
}
h->parity0[i] = srslte_vec_malloc(sizeof(int16_t) * len);
if (!h->parity0[i]) {
perror("srslte_vec_malloc");
goto clean_and_exit;
}
h->parity1[i] = srslte_vec_malloc(sizeof(int16_t) * len);
if (!h->parity1[i]) {
perror("srslte_vec_malloc");
goto clean_and_exit;
}
}
if (map_simd_init(&h->dec, h->max_par_cb, h->max_long_cb)) {
goto clean_and_exit;
}
for (int i=0;i<SRSLTE_NOF_TC_CB_SIZES;i++) {
if (srslte_tc_interl_init(&h->interleaver[i], srslte_cbsegm_cbsize(i)) < 0) {
goto clean_and_exit;
}
srslte_tc_interl_LTE_gen(&h->interleaver[i], srslte_cbsegm_cbsize(i));
}
h->current_cbidx = -1;
h->cb_mask = 0;
ret = 0;
clean_and_exit:if (ret == -1) {
srslte_tdec_simd_free(h);
}
return ret;
}
void srslte_tdec_simd_free(srslte_tdec_simd_t * h)
{
for (int i=0;i<h->max_par_cb;i++) {
if (h->app1[i]) {
free(h->app1[i]);
}
if (h->app2[i]) {
free(h->app2[i]);
}
if (h->ext1[i]) {
free(h->ext1[i]);
}
if (h->ext2[i]) {
free(h->ext2[i]);
}
if (h->syst[i]) {
free(h->syst[i]);
}
if (h->parity0[i]) {
free(h->parity0[i]);
}
if (h->parity1[i]) {
free(h->parity1[i]);
}
}
map_simd_free(&h->dec);
for (int i=0;i<SRSLTE_NOF_TC_CB_SIZES;i++) {
srslte_tc_interl_free(&h->interleaver[i]);
}
bzero(h, sizeof(srslte_tdec_simd_t));
}
/* Deinterleaves the 3 streams from the input (systematic and 2 parity bits) into
* 3 buffers ready to be used by compute_gamma()
*/
void deinterleave_input_simd(srslte_tdec_simd_t *h, int16_t *input, uint32_t cbidx, uint32_t long_cb) {
uint32_t i;
__m128i *inputPtr = (__m128i*) input;
__m128i in0, in1, in2;
__m128i s0, s1, s2, s;
__m128i p00, p01, p02, p0;
__m128i p10, p11, p12, p1;
__m128i *sysPtr = (__m128i*) h->syst[cbidx];
__m128i *pa0Ptr = (__m128i*) h->parity0[cbidx];
__m128i *pa1Ptr = (__m128i*) h->parity1[cbidx];
// pick bits 0, 3, 6 from 1st word
__m128i s0_mask = _mm_set_epi8(0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,13,12,7,6,1,0);
// pick bits 1, 4, 7 from 2st word
__m128i s1_mask = _mm_set_epi8(0xff,0xff,0xff,0xff,15,14,9,8,3,2,0xff,0xff,0xff,0xff,0xff,0xff);
// pick bits 2, 5 from 3rd word
__m128i s2_mask = _mm_set_epi8(11,10,5,4,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff);
// pick bits 1, 4, 7 from 1st word
__m128i p00_mask = _mm_set_epi8(0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,15,14,9,8,3,2);
// pick bits 2, 5, from 2st word
__m128i p01_mask = _mm_set_epi8(0xff,0xff,0xff,0xff,0xff,0xff,11,10,5,4,0xff,0xff,0xff,0xff,0xff,0xff);
// pick bits 0, 3, 6 from 3rd word
__m128i p02_mask = _mm_set_epi8(13,12,7,6,1,0,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff);
// pick bits 2, 5 from 1st word
__m128i p10_mask = _mm_set_epi8(0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,11,10,5,4);
// pick bits 0, 3, 6, from 2st word
__m128i p11_mask = _mm_set_epi8(0xff,0xff,0xff,0xff,0xff,0xff,13,12,7,6,1,0,0xff,0xff,0xff,0xff);
// pick bits 1, 4, 7 from 3rd word
__m128i p12_mask = _mm_set_epi8(15,14,9,8,3,2,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff);
// Split systematic and parity bits
for (i = 0; i < long_cb/8; i++) {
in0 = _mm_load_si128(inputPtr); inputPtr++;
in1 = _mm_load_si128(inputPtr); inputPtr++;
in2 = _mm_load_si128(inputPtr); inputPtr++;
/* Deinterleave Systematic bits */
s0 = _mm_shuffle_epi8(in0, s0_mask);
s1 = _mm_shuffle_epi8(in1, s1_mask);
s2 = _mm_shuffle_epi8(in2, s2_mask);
s = _mm_or_si128(s0, s1);
s = _mm_or_si128(s, s2);
_mm_store_si128(sysPtr, s);
sysPtr++;
/* Deinterleave parity 0 bits */
p00 = _mm_shuffle_epi8(in0, p00_mask);
p01 = _mm_shuffle_epi8(in1, p01_mask);
p02 = _mm_shuffle_epi8(in2, p02_mask);
p0 = _mm_or_si128(p00, p01);
p0 = _mm_or_si128(p0, p02);
_mm_store_si128(pa0Ptr, p0);
pa0Ptr++;
/* Deinterleave parity 1 bits */
p10 = _mm_shuffle_epi8(in0, p10_mask);
p11 = _mm_shuffle_epi8(in1, p11_mask);
p12 = _mm_shuffle_epi8(in2, p12_mask);
p1 = _mm_or_si128(p10, p11);
p1 = _mm_or_si128(p1, p12);
_mm_store_si128(pa1Ptr, p1);
pa1Ptr++;
}
for (i = 0; i < 3; i++) {
h->syst[cbidx][i+long_cb] = input[3*long_cb + 2*i];
h->parity0[cbidx][i+long_cb] = input[3*long_cb + 2*i + 1];
}
for (i = 0; i < 3; i++) {
h->app2[cbidx][i+long_cb] = input[3*long_cb + 6 + 2*i];
h->parity1[cbidx][i+long_cb] = input[3*long_cb + 6 + 2*i + 1];
}
}
/* Runs 1 turbo decoder iteration */
void srslte_tdec_simd_iteration(srslte_tdec_simd_t * h, int16_t * input[SRSLTE_TDEC_MAX_NPAR], uint32_t long_cb)
{
int16_t *tmp_app[SRSLTE_TDEC_MAX_NPAR];
if (h->current_cbidx >= 0) {
uint16_t *inter = h->interleaver[h->current_cbidx].forward;
uint16_t *deinter = h->interleaver[h->current_cbidx].reverse;
#ifndef LV_HAVE_AVX2
input[1] = NULL;
#endif
h->cb_mask = (input[0]?1:0) | (input[1]?2:0);
for (int i=0;i<h->max_par_cb;i++) {
if (h->n_iter[i] == 0 && input[i]) {
//printf("deinterleaveing %d\n",i);
deinterleave_input_simd(h, input[i], i, long_cb);
}
}
// Add apriori information to decoder 1
for (int i=0;i<h->max_par_cb;i++) {
if (h->n_iter[i] > 0 && input[i]) {
srslte_vec_sub_sss(h->app1[i], h->ext1[i], h->app1[i], long_cb);
}
}
// Run MAP DEC #1
for (int i=0;i<h->max_par_cb;i++) {
if (input[i]) {
tmp_app[i] = h->n_iter[i]?h->app1[i]:NULL;
} else {
tmp_app[i] = NULL;
}
}
map_simd_dec(&h->dec, h->syst, tmp_app, h->parity0, h->ext1, h->cb_mask, long_cb);
// Convert aposteriori information into extrinsic information
for (int i=0;i<h->max_par_cb;i++) {
if (h->n_iter[i] > 0 && input[i]) {
srslte_vec_sub_sss(h->ext1[i], h->app1[i], h->ext1[i], long_cb);
}
}
// Interleave extrinsic output of DEC1 to form apriori info for decoder 2
for (int i=0;i<h->max_par_cb;i++) {
if (input[i]) {
srslte_vec_lut_sss(h->ext1[i], deinter, h->app2[i], long_cb);
}
}
// Run MAP DEC #2. 2nd decoder uses apriori information as systematic bits
map_simd_dec(&h->dec, h->app2, NULL, h->parity1, h->ext2, h->cb_mask, long_cb);
// Deinterleaved extrinsic bits become apriori info for decoder 1
for (int i=0;i<h->max_par_cb;i++) {
if (input[i]) {
srslte_vec_lut_sss(h->ext2[i], inter, h->app1[i], long_cb);
}
}
for (int i=0;i<h->max_par_cb;i++) {
if (input[i]) {
h->n_iter[i]++;
}
}
} else {
fprintf(stderr, "Error CB index not set (call srslte_tdec_simd_reset() first\n");
}
}
/* Resets the decoder and sets the codeblock length */
int srslte_tdec_simd_reset(srslte_tdec_simd_t * h, uint32_t long_cb)
{
if (long_cb > h->max_long_cb) {
fprintf(stderr, "TDEC was initialized for max_long_cb=%d\n",
h->max_long_cb);
return -1;
}
for (int i=0;i<h->max_par_cb;i++) {
h->n_iter[i] = 0;
}
h->cb_mask = 0;
h->current_cbidx = srslte_cbsegm_cbindex(long_cb);
if (h->current_cbidx < 0) {
fprintf(stderr, "Invalid CB length %d\n", long_cb);
return -1;
}
return 0;
}
int srslte_tdec_simd_reset_cb(srslte_tdec_simd_t * h, uint32_t cb_idx)
{
h->n_iter[cb_idx] = 0;
return 0;
}
int srslte_tdec_simd_get_nof_iterations_cb(srslte_tdec_simd_t * h, uint32_t cb_idx)
{
return h->n_iter[cb_idx];
}
void tdec_simd_decision(srslte_tdec_simd_t * h, uint8_t *output, uint32_t cbidx, uint32_t long_cb)
{
__m128i zero = _mm_set1_epi16(0);
__m128i lsb_mask = _mm_set1_epi16(1);
__m128i *appPtr = (__m128i*) h->app1[cbidx];
__m128i *outPtr = (__m128i*) output;
__m128i ap, out, out0, out1;
for (uint32_t i = 0; i < long_cb/16; i++) {
ap = _mm_load_si128(appPtr); appPtr++;
out0 = _mm_and_si128(_mm_cmpgt_epi16(ap, zero), lsb_mask);
ap = _mm_load_si128(appPtr); appPtr++;
out1 = _mm_and_si128(_mm_cmpgt_epi16(ap, zero), lsb_mask);
out = _mm_packs_epi16(out0, out1);
_mm_store_si128(outPtr, out);
outPtr++;
}
if (long_cb%16) {
for (int i=0;i<8;i++) {
output[long_cb-8+i] = h->app1[cbidx][long_cb-8+i]>0?1:0;
}
}
}
void srslte_tdec_simd_decision(srslte_tdec_simd_t * h, uint8_t *output[SRSLTE_TDEC_MAX_NPAR], uint32_t long_cb)
{
for (int i=0;i<h->max_par_cb;i++) {
tdec_simd_decision(h, output[i], i, long_cb);
}
}
void srslte_tdec_simd_decision_byte_cb(srslte_tdec_simd_t * h, uint8_t *output, uint32_t cbidx, uint32_t long_cb)
{
uint8_t mask[8] = {0x80, 0x40, 0x20, 0x10, 0x8, 0x4, 0x2, 0x1};
// long_cb is always byte aligned
for (uint32_t i = 0; i < long_cb/8; i++) {
uint8_t out0 = h->app1[cbidx][8*i+0]>0?mask[0]:0;
uint8_t out1 = h->app1[cbidx][8*i+1]>0?mask[1]:0;
uint8_t out2 = h->app1[cbidx][8*i+2]>0?mask[2]:0;
uint8_t out3 = h->app1[cbidx][8*i+3]>0?mask[3]:0;
uint8_t out4 = h->app1[cbidx][8*i+4]>0?mask[4]:0;
uint8_t out5 = h->app1[cbidx][8*i+5]>0?mask[5]:0;
uint8_t out6 = h->app1[cbidx][8*i+6]>0?mask[6]:0;
uint8_t out7 = h->app1[cbidx][8*i+7]>0?mask[7]:0;
output[i] = out0 | out1 | out2 | out3 | out4 | out5 | out6 | out7;
}
}
void srslte_tdec_simd_decision_byte(srslte_tdec_simd_t * h, uint8_t *output[SRSLTE_TDEC_MAX_NPAR], uint32_t long_cb)
{
for (int i=0;i<h->max_par_cb;i++) {
if (output[i]) {
srslte_tdec_simd_decision_byte_cb(h, output[i], i, long_cb);
}
}
}
/* Runs nof_iterations iterations and decides the output bits */
int srslte_tdec_simd_run_all(srslte_tdec_simd_t * h, int16_t * input[SRSLTE_TDEC_MAX_NPAR], uint8_t *output[SRSLTE_TDEC_MAX_NPAR],
uint32_t nof_iterations, uint32_t long_cb)
{
if (srslte_tdec_simd_reset(h, long_cb)) {
return SRSLTE_ERROR;
}
do {
srslte_tdec_simd_iteration(h, input, long_cb);
} while (h->n_iter[0] < nof_iterations);
srslte_tdec_simd_decision_byte(h, output, long_cb);
return SRSLTE_SUCCESS;
}
#endif

@ -1,299 +0,0 @@
/**
*
* \section COPYRIGHT
*
* Copyright 2013-2015 Software Radio Systems Limited
*
* \section LICENSE
*
* This file is part of the srsLTE library.
*
* srsLTE is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as
* published by the Free Software Foundation, either version 3 of
* the License, or (at your option) any later version.
*
* srsLTE is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* A copy of the GNU Affero General Public License can be found in
* the LICENSE file in the top-level directory of this distribution
* and at http://www.gnu.org/licenses/.
*
*/
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <strings.h>
#include <math.h>
#include "srslte/phy/fec/turbodecoder_simd_inter.h"
#include "srslte/phy/utils/vector.h"
#define TOTALTAIL 12
#ifdef LV_HAVE_SSE
#include <smmintrin.h>
void map_see_inter_alpha(srslte_tdec_simd_inter_t * s, int16_t *input, int16_t *parity, uint32_t long_cb);
void map_sse_inter_beta(srslte_tdec_simd_inter_t * s, int16_t *input, int16_t *parity, int16_t * output, uint32_t long_cb);
void sse_inter_update_w(srslte_tdec_simd_inter_t *h, uint16_t *deinter, uint32_t long_cb);
void sse_inter_extract_syst1(srslte_tdec_simd_inter_t *h, uint16_t *inter, uint32_t long_cb);
static void map_sse_inter_dec(srslte_tdec_simd_inter_t * h, int16_t * input, int16_t * parity, int16_t * output,
uint32_t long_cb)
{
map_see_inter_alpha(h, input, parity, long_cb);
map_sse_inter_beta(h, input, parity, output, long_cb);
}
/************************************************
*
* TURBO DECODER INTERFACE
*
************************************************/
int srslte_tdec_simd_inter_init(srslte_tdec_simd_inter_t * h, uint32_t max_par_cb, uint32_t max_long_cb)
{
int ret = -1;
bzero(h, sizeof(srslte_tdec_simd_inter_t));
uint32_t len = max_long_cb + 12;
h->max_long_cb = max_long_cb;
h->max_par_cb = max_par_cb;
h->llr1 = srslte_vec_malloc(sizeof(int16_t) * len * h->max_par_cb);
if (!h->llr1) {
perror("srslte_vec_malloc");
goto clean_and_exit;
}
h->llr2 = srslte_vec_malloc(sizeof(int16_t) * len * h->max_par_cb);
if (!h->llr2) {
perror("srslte_vec_malloc");
goto clean_and_exit;
}
h->w = srslte_vec_malloc(sizeof(int16_t) * len * h->max_par_cb);
if (!h->w) {
perror("srslte_vec_malloc");
goto clean_and_exit;
}
h->syst0 = srslte_vec_malloc(sizeof(int16_t) * len * h->max_par_cb);
if (!h->syst0) {
perror("srslte_vec_malloc");
goto clean_and_exit;
}
h->syst1 = srslte_vec_malloc(sizeof(int16_t) * len * h->max_par_cb);
if (!h->syst1) {
perror("srslte_vec_malloc");
goto clean_and_exit;
}
h->parity0 = srslte_vec_malloc(sizeof(int16_t) * len * h->max_par_cb);
if (!h->parity0) {
perror("srslte_vec_malloc");
goto clean_and_exit;
}
h->parity1 = srslte_vec_malloc(sizeof(int16_t) * len * h->max_par_cb);
if (!h->parity1) {
perror("srslte_vec_malloc");
goto clean_and_exit;
}
h->alpha = srslte_vec_malloc(sizeof(int16_t) * 8*(len+12) * h->max_par_cb);
if (!h->alpha) {
perror("srslte_vec_malloc");
goto clean_and_exit;
}
for (int i=0;i<SRSLTE_NOF_TC_CB_SIZES;i++) {
if (srslte_tc_interl_init(&h->interleaver[i], srslte_cbsegm_cbsize(i)) < 0) {
goto clean_and_exit;
}
srslte_tc_interl_LTE_gen(&h->interleaver[i], srslte_cbsegm_cbsize(i));
}
h->current_cbidx = -1;
ret = 0;
clean_and_exit:if (ret == -1) {
srslte_tdec_simd_inter_free(h);
}
return ret;
}
void srslte_tdec_simd_inter_free(srslte_tdec_simd_inter_t * h)
{
if (h->llr1) {
free(h->llr1);
}
if (h->llr2) {
free(h->llr2);
}
if (h->w) {
free(h->w);
}
if (h->syst0) {
free(h->syst0);
}
if (h->syst1) {
free(h->syst1);
}
if (h->parity0) {
free(h->parity0);
}
if (h->parity1) {
free(h->parity1);
}
if (h->alpha) {
free(h->alpha);
}
for (int i=0;i<SRSLTE_NOF_TC_CB_SIZES;i++) {
srslte_tc_interl_free(&h->interleaver[i]);
}
bzero(h, sizeof(srslte_tdec_simd_inter_t));
}
/* Deinterleave for inter-frame parallelization */
void extract_input(srslte_tdec_simd_inter_t *h, int16_t *input, uint32_t cbidx, uint32_t long_cb)
{
for (int i=0;i<long_cb;i++) {
h->syst0[h->max_par_cb*i+cbidx] = input[3*i+0];
h->parity0[h->max_par_cb*i+cbidx] = input[3*i+1];
h->parity1[h->max_par_cb*i+cbidx] = input[3*i+2];
}
for (int i = long_cb; i < long_cb + 3; i++) {
h->syst0[h->max_par_cb*i+cbidx] = input[3*long_cb + 2*(i - long_cb)];
h->syst1[h->max_par_cb*i+cbidx] = input[3*long_cb + 2*(i - long_cb)];
h->parity0[h->max_par_cb*i+cbidx] = input[3*long_cb + 2*(i - long_cb) + 1];
h->parity0[h->max_par_cb*i+cbidx] = input[3*long_cb + 2*(i - long_cb) + 2];
}
}
void srslte_tdec_simd_inter_iteration(srslte_tdec_simd_inter_t * h, int16_t *input[SRSLTE_TDEC_MAX_NPAR], uint32_t nof_cb, uint32_t long_cb)
{
if (h->current_cbidx >= 0) {
uint16_t *inter = h->interleaver[h->current_cbidx].forward;
uint16_t *deinter = h->interleaver[h->current_cbidx].reverse;
// Prepare systematic and parity bits for MAP DEC #1
for (int i=0;i<nof_cb;i++) {
if (h->n_iter[i] == 0) {
extract_input(h, input[i], i, long_cb);
}
srslte_vec_sum_sss(h->syst0, h->w, h->syst0, long_cb*h->max_par_cb);
}
// Run MAP DEC #1
map_sse_inter_dec(h, h->syst0, h->parity0, h->llr1, long_cb);
// Prepare systematic and parity bits for MAP DEC #1
sse_inter_extract_syst1(h, inter, long_cb);
// Run MAP DEC #2
map_sse_inter_dec(h, h->syst1, h->parity1, h->llr2, long_cb);
// Update a-priori LLR from the last iteration
sse_inter_update_w(h, deinter, long_cb);
} else {
fprintf(stderr, "Error CB index not set (call srslte_tdec_simd_inter_reset() first\n");
}
}
int srslte_tdec_simd_inter_reset_cb(srslte_tdec_simd_inter_t * h, uint32_t cb_idx)
{
for (int i=0;i<h->current_long_cb;i++) {
h->w[h->max_par_cb*i+cb_idx] = 0;
}
return 0;
}
int srslte_tdec_simd_inter_reset(srslte_tdec_simd_inter_t * h, uint32_t long_cb)
{
if (long_cb > h->max_long_cb) {
fprintf(stderr, "TDEC was initialized for max_long_cb=%d\n",
h->max_long_cb);
return -1;
}
h->current_long_cb = long_cb;
h->current_cbidx = srslte_cbsegm_cbindex(long_cb);
if (h->current_cbidx < 0) {
fprintf(stderr, "Invalid CB length %d\n", long_cb);
return -1;
}
memset(h->w, 0, sizeof(int16_t) * long_cb * h->max_par_cb);
return 0;
}
void srslte_tdec_simd_inter_decision_cb(srslte_tdec_simd_inter_t * h, uint8_t *output, uint32_t cb_idx, uint32_t long_cb)
{
uint16_t *deinter = h->interleaver[h->current_cbidx].reverse;
uint32_t i;
for (i = 0; i < long_cb; i++) {
output[i] = (h->llr2[h->max_par_cb*deinter[i]+cb_idx] > 0) ? 1 : 0;
}
}
void srslte_tdec_simd_inter_decision(srslte_tdec_simd_inter_t * h, uint8_t *output[SRSLTE_TDEC_MAX_NPAR], uint32_t nof_cb, uint32_t long_cb)
{
for (int i=0;i<nof_cb;i++) {
srslte_tdec_simd_inter_decision_cb(h, output[i], i, long_cb);
}
}
void srslte_tdec_simd_inter_decision_byte_cb(srslte_tdec_simd_inter_t * h, uint8_t *output, uint32_t cb_idx, uint32_t long_cb)
{
uint32_t i;
uint8_t mask[8] = {0x80, 0x40, 0x20, 0x10, 0x8, 0x4, 0x2, 0x1};
uint16_t *deinter = h->interleaver[h->current_cbidx].reverse;
#define indexOf_cb(idx, cb) (h->max_par_cb*(deinter[8*i+idx])+cb)
// long_cb is always byte aligned
for (i = 0; i < long_cb/8; i++) {
uint8_t out0 = h->llr2[indexOf_cb(0, cb_idx)]>0?mask[0]:0;
uint8_t out1 = h->llr2[indexOf_cb(1, cb_idx)]>0?mask[1]:0;
uint8_t out2 = h->llr2[indexOf_cb(2, cb_idx)]>0?mask[2]:0;
uint8_t out3 = h->llr2[indexOf_cb(3, cb_idx)]>0?mask[3]:0;
uint8_t out4 = h->llr2[indexOf_cb(4, cb_idx)]>0?mask[4]:0;
uint8_t out5 = h->llr2[indexOf_cb(5, cb_idx)]>0?mask[5]:0;
uint8_t out6 = h->llr2[indexOf_cb(6, cb_idx)]>0?mask[6]:0;
uint8_t out7 = h->llr2[indexOf_cb(7, cb_idx)]>0?mask[7]:0;
output[i] = out0 | out1 | out2 | out3 | out4 | out5 | out6 | out7;
}
}
void srslte_tdec_simd_inter_decision_byte(srslte_tdec_simd_inter_t * h, uint8_t *output[SRSLTE_TDEC_MAX_NPAR], uint32_t nof_cb, uint32_t long_cb)
{
for (int i=0;i<nof_cb;i++) {
srslte_tdec_simd_inter_decision_byte_cb(h, output[i], i, long_cb);
}
}
int srslte_tdec_simd_inter_run_all(srslte_tdec_simd_inter_t * h,
int16_t *input[SRSLTE_TDEC_MAX_NPAR], uint8_t *output[SRSLTE_TDEC_MAX_NPAR],
uint32_t nof_iterations, uint32_t nof_cb, uint32_t long_cb)
{
uint32_t iter = 0;
if (srslte_tdec_simd_inter_reset(h, long_cb)) {
return SRSLTE_ERROR;
}
do {
srslte_tdec_simd_inter_iteration(h, input, nof_cb, long_cb);
iter++;
} while (iter < nof_iterations);
srslte_tdec_simd_inter_decision_byte(h, output, nof_cb, long_cb);
return SRSLTE_SUCCESS;
}
#endif

@ -31,13 +31,15 @@
#include <strings.h> #include <strings.h>
#include <math.h> #include <math.h>
#include "srslte/phy/fec/turbodecoder_simd.h" #include "srslte/phy/fec/turbodecoder_sse.h"
#include "srslte/phy/utils/vector.h" #include "srslte/phy/utils/vector.h"
#include <inttypes.h> #include <inttypes.h>
#ifdef LV_HAVE_SSE #ifdef LV_HAVE_SSE
#include <smmintrin.h> #include <smmintrin.h>
#include <srslte/phy/fec/turbodecoder_sse.h>
#endif #endif
@ -47,21 +49,46 @@
#define TOTALTAIL 12 #define TOTALTAIL 12
#define INF 10000 #define INF 10000
#define ZERO 0
#ifdef LV_HAVE_SSE #ifdef LV_HAVE_SSE
/*
#define debug_enabled 0
#if debug_enabled
#define debug_state(c,d) printf("k=%5d, in=%5d, pa=%5d, out=%5d, alpha=", k-d,\
s->branch[2*(k-d)] + s->branch[2*(k-d)+1], \
-s->branch[2*(k-d)] + s->branch[2*(k-d)+1], output[k-d]);print_128i(alpha_k);\
printf(", beta=");print_128i(beta_k);printf("\n");
static void print_128i(__m128i x) { static void print_128i(__m128i x) {
int16_t *s = (int16_t*) &x; int16_t *s = (int16_t*) &x;
printf("[%d", s[0]); printf("[%5d", s[0]);
for (int i=1;i<8;i++) { for (int i=1;i<8;i++) {
printf(",%d", s[i]); printf(",%5d", s[i]);
} }
printf("]\n"); printf("]");
} }
*/
static uint32_t max_128i(__m128i x) {
int16_t *s = (int16_t*) &x;
int16_t m = -INF;
uint32_t max = 0;
for (int i=1;i<8;i++) {
if (s[i] > m) {
max = i;
m = s[i];
}
}
return max;
}
#else
#define debug_state(c,d)
#endif
//#define use_beta_transposed_max //#define use_beta_transposed_max
#ifndef use_beta_transposed_max #ifndef use_beta_transposed_max
@ -75,7 +102,7 @@ static inline int16_t hMax(__m128i buffer)
} }
/* Computes beta values */ /* Computes beta values */
void map_sse_beta(map_gen_t * s, int16_t * output, uint32_t long_cb) void tdec_sse_beta(tdec_sse_t * s, int16_t * output, uint32_t long_cb)
{ {
int k; int k;
uint32_t end = long_cb + 3; uint32_t end = long_cb + 3;
@ -141,7 +168,9 @@ void map_sse_beta(map_gen_t * s, int16_t * output, uint32_t long_cb)
alphaPtr--;\ alphaPtr--;\
bp = _mm_add_epi16(bp, alpha_k);\ bp = _mm_add_epi16(bp, alpha_k);\
bn = _mm_add_epi16(bn, alpha_k);\ bn = _mm_add_epi16(bn, alpha_k);\
output[k-d] = hMax(bn)-hMax(bp); output[k-d] = hMax(bn)-hMax(bp);\
debug_state(c,d);
/* The tail does not require to load alpha or produce outputs. Only update /* The tail does not require to load alpha or produce outputs. Only update
* beta metrics accordingly */ * beta metrics accordingly */
@ -179,7 +208,7 @@ void map_sse_beta(map_gen_t * s, int16_t * output, uint32_t long_cb)
#endif #endif
/* Computes alpha metrics */ /* Computes alpha metrics */
void map_sse_alpha(map_gen_t * s, uint32_t long_cb) void tdec_sse_alpha(tdec_sse_t * s, uint32_t long_cb)
{ {
uint32_t k; uint32_t k;
int16_t *alpha = s->alpha; int16_t *alpha = s->alpha;
@ -268,7 +297,7 @@ void map_sse_alpha(map_gen_t * s, uint32_t long_cb)
} }
/* Compute branch metrics (gamma) */ /* Compute branch metrics (gamma) */
void map_sse_gamma(map_gen_t * h, int16_t *input, int16_t *app, int16_t *parity, uint32_t long_cb) void tdec_sse_gamma(tdec_sse_t * h, int16_t *input, int16_t *app, int16_t *parity, uint32_t long_cb)
{ {
__m128i res00, res10, res01, res11, res0, res1; __m128i res00, res10, res01, res11, res0, res1;
__m128i in, ap, pa, g1, g0; __m128i in, ap, pa, g1, g0;
@ -313,6 +342,8 @@ void map_sse_gamma(map_gen_t * h, int16_t *input, int16_t *app, int16_t *parity,
resPtr++; resPtr++;
_mm_store_si128(resPtr, res1); _mm_store_si128(resPtr, res1);
resPtr++; resPtr++;
//printf("k=%d, in=%d, pa=%d, g0=%d, g1=%d\n", i, input[i], parity[i], h->branch[2*i], h->branch[2*i+1]);
} }
for (int i=long_cb;i<long_cb+3;i++) { for (int i=long_cb;i<long_cb+3;i++) {
@ -322,6 +353,161 @@ void map_sse_gamma(map_gen_t * h, int16_t *input, int16_t *app, int16_t *parity,
} }
/* Inititalizes constituent decoder object */
int tdec_sse_init(void **hh, uint32_t max_long_cb)
{
*hh = calloc(1, sizeof(tdec_sse_t));
tdec_sse_t *h = (tdec_sse_t*) *hh;
h->max_long_cb = max_long_cb;
h->alpha = srslte_vec_malloc(sizeof(int16_t) * (max_long_cb + TOTALTAIL + 1) * NUMSTATES);
if (!h->alpha) {
perror("srslte_vec_malloc");
return -1;
}
h->branch = srslte_vec_malloc(sizeof(int16_t) * (max_long_cb + TOTALTAIL + 1) * NUMSTATES);
if (!h->branch) {
perror("srslte_vec_malloc");
return -1;
}
return 1;
}
void tdec_sse_free(void *hh)
{
tdec_sse_t *h = (tdec_sse_t*) hh;
if (h->alpha) {
free(h->alpha);
}
if (h->branch) {
free(h->branch);
}
bzero(h, sizeof(tdec_sse_t));
}
/* Runs one instance of a decoder */
void tdec_sse_dec(void *hh, int16_t * input, int16_t *app, int16_t * parity,
int16_t *output, uint32_t long_cb)
{
tdec_sse_t *h = (tdec_sse_t*) hh;
// Compute branch metrics
tdec_sse_gamma(h, input, app, parity, long_cb);
// Forward recursion
tdec_sse_alpha(h, long_cb);
// Backwards recursion + LLR computation
tdec_sse_beta(h, output, long_cb);
}
/* Deinterleaves the 3 streams from the input (systematic and 2 parity bits) into
* 3 buffers ready to be used by compute_gamma()
*/
void tdec_sse_extract_input(int16_t *input, int16_t *syst0, int16_t *app2, int16_t *parity0, int16_t *parity1, uint32_t long_cb) {
uint32_t i;
__m128i *inputPtr = (__m128i*) input;
__m128i in0, in1, in2;
__m128i s0, s1, s2, s;
__m128i p00, p01, p02, p0;
__m128i p10, p11, p12, p1;
__m128i *sysPtr = (__m128i*) syst0;
__m128i *pa0Ptr = (__m128i*) parity0;
__m128i *pa1Ptr = (__m128i*) parity1;
// pick bits 0, 3, 6 from 1st word
__m128i s0_mask = _mm_set_epi8(0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,13,12,7,6,1,0);
// pick bits 1, 4, 7 from 2st word
__m128i s1_mask = _mm_set_epi8(0xff,0xff,0xff,0xff,15,14,9,8,3,2,0xff,0xff,0xff,0xff,0xff,0xff);
// pick bits 2, 5 from 3rd word
__m128i s2_mask = _mm_set_epi8(11,10,5,4,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff);
// pick bits 1, 4, 7 from 1st word
__m128i p00_mask = _mm_set_epi8(0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,15,14,9,8,3,2);
// pick bits 2, 5, from 2st word
__m128i p01_mask = _mm_set_epi8(0xff,0xff,0xff,0xff,0xff,0xff,11,10,5,4,0xff,0xff,0xff,0xff,0xff,0xff);
// pick bits 0, 3, 6 from 3rd word
__m128i p02_mask = _mm_set_epi8(13,12,7,6,1,0,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff);
// pick bits 2, 5 from 1st word
__m128i p10_mask = _mm_set_epi8(0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,11,10,5,4);
// pick bits 0, 3, 6, from 2st word
__m128i p11_mask = _mm_set_epi8(0xff,0xff,0xff,0xff,0xff,0xff,13,12,7,6,1,0,0xff,0xff,0xff,0xff);
// pick bits 1, 4, 7 from 3rd word
__m128i p12_mask = _mm_set_epi8(15,14,9,8,3,2,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff);
// Split systematic and parity bits
for (i = 0; i < long_cb/8; i++) {
in0 = _mm_load_si128(inputPtr); inputPtr++;
in1 = _mm_load_si128(inputPtr); inputPtr++;
in2 = _mm_load_si128(inputPtr); inputPtr++;
/* Deinterleave Systematic bits */
s0 = _mm_shuffle_epi8(in0, s0_mask);
s1 = _mm_shuffle_epi8(in1, s1_mask);
s2 = _mm_shuffle_epi8(in2, s2_mask);
s = _mm_or_si128(s0, s1);
s = _mm_or_si128(s, s2);
_mm_store_si128(sysPtr, s);
sysPtr++;
/* Deinterleave parity 0 bits */
p00 = _mm_shuffle_epi8(in0, p00_mask);
p01 = _mm_shuffle_epi8(in1, p01_mask);
p02 = _mm_shuffle_epi8(in2, p02_mask);
p0 = _mm_or_si128(p00, p01);
p0 = _mm_or_si128(p0, p02);
_mm_store_si128(pa0Ptr, p0);
pa0Ptr++;
/* Deinterleave parity 1 bits */
p10 = _mm_shuffle_epi8(in0, p10_mask);
p11 = _mm_shuffle_epi8(in1, p11_mask);
p12 = _mm_shuffle_epi8(in2, p12_mask);
p1 = _mm_or_si128(p10, p11);
p1 = _mm_or_si128(p1, p12);
_mm_store_si128(pa1Ptr, p1);
pa1Ptr++;
}
for (i = 0; i < 3; i++) {
syst0[i+long_cb] = input[3*long_cb + 2*i];
parity0[i+long_cb] = input[3*long_cb + 2*i + 1];
}
for (i = 0; i < 3; i++) {
app2[i+long_cb] = input[3*long_cb + 6 + 2*i];
parity1[i+long_cb] = input[3*long_cb + 6 + 2*i + 1];
}
}
void tdec_sse_decision_byte(int16_t *app1, uint8_t *output, uint32_t long_cb)
{
uint8_t mask[8] = {0x80, 0x40, 0x20, 0x10, 0x8, 0x4, 0x2, 0x1};
// long_cb is always byte aligned
for (uint32_t i = 0; i < long_cb/8; i++) {
uint8_t out0 = app1[8*i+0]>0?mask[0]:0;
uint8_t out1 = app1[8*i+1]>0?mask[1]:0;
uint8_t out2 = app1[8*i+2]>0?mask[2]:0;
uint8_t out3 = app1[8*i+3]>0?mask[3]:0;
uint8_t out4 = app1[8*i+4]>0?mask[4]:0;
uint8_t out5 = app1[8*i+5]>0?mask[5]:0;
uint8_t out6 = app1[8*i+6]>0?mask[6]:0;
uint8_t out7 = app1[8*i+7]>0?mask[7]:0;
output[i] = out0 | out1 | out2 | out3 | out4 | out5 | out6 | out7;
}
}
@ -381,7 +567,7 @@ static inline __m128i transposed_max(__m128i a, __m128i b, __m128i c, __m128i d,
return res; return res;
} }
void map_sse_beta(map_gen_t * s, int16_t * output, uint32_t long_cb) void tdec_sse_beta(tdec_sse_t * s, int16_t * output, uint32_t long_cb)
{ {
int k; int k;
uint32_t end = long_cb + 3; uint32_t end = long_cb + 3;

@ -1,202 +0,0 @@
/**
*
* \section COPYRIGHT
*
* Copyright 2013-2015 Software Radio Systems Limited
*
* \section LICENSE
*
* This file is part of the srsLTE library.
*
* srsLTE is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as
* published by the Free Software Foundation, either version 3 of
* the License, or (at your option) any later version.
*
* srsLTE is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* A copy of the GNU Affero General Public License can be found in
* the LICENSE file in the top-level directory of this distribution
* and at http://www.gnu.org/licenses/.
*
*/
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <strings.h>
#include <math.h>
#include "srslte/phy/fec/turbodecoder_simd_inter.h"
#include "srslte/phy/utils/vector.h"
#define NCB 8
#define INF 10000
#ifdef LV_HAVE_SSE
#include <smmintrin.h>
void sse_inter_extract_syst1(srslte_tdec_simd_inter_t *h, uint16_t *inter, uint32_t long_cb)
{
__m128i *llr1Ptr = (__m128i*) h->llr1;
__m128i *wPtr = (__m128i*) h->w;
__m128i *syst1Ptr = (__m128i*) h->syst1;
for (int i = 0; i < long_cb; i++) {
__m128i llr1 = _mm_load_si128(&llr1Ptr[inter[i]]);
__m128i w = _mm_load_si128(&wPtr[inter[i]]);
_mm_store_si128(syst1Ptr++, _mm_sub_epi16(llr1, w));
}
}
void sse_inter_update_w(srslte_tdec_simd_inter_t *h, uint16_t *deinter, uint32_t long_cb)
{
__m128i *llr1Ptr = (__m128i*) h->llr1;
__m128i *llr2Ptr = (__m128i*) h->llr2;
__m128i *wPtr = (__m128i*) h->w;
__m128i *syst1Ptr = (__m128i*) h->syst1;
for (int i = 0; i < long_cb; i++) {
__m128i llr1 = _mm_load_si128(llr1Ptr++);
__m128i w = _mm_load_si128(wPtr++);
__m128i llr2 = _mm_load_si128(&llr2Ptr[deinter[i]]);
_mm_store_si128(syst1Ptr++, _mm_add_epi16(w, _mm_sub_epi16(llr2, llr1)));
}
}
/* Computes beta values */
void map_sse_inter_beta(srslte_tdec_simd_inter_t * s, int16_t *input, int16_t *parity, int16_t * output, uint32_t long_cb)
{
__m128i m_b[8], new[8], old[8], max1[8], max0[8];
__m128i x, y, xy;
__m128i m1, m0;
uint32_t end = long_cb + 3;
uint32_t i;
__m128i *inputPtr = (__m128i*) input;
__m128i *parityPtr = (__m128i*) parity;
__m128i *outputPtr = (__m128i*) output;
__m128i *alphaPtr = (__m128i*) s->alpha;
for (int i = 0; i < 8; i++) {
old[i] = _mm_set1_epi16(0);
}
for (int k = end - 1; k >= 0; k--) {
x = _mm_load_si128(inputPtr++);
y = _mm_load_si128(parityPtr++);
xy = _mm_add_epi16(x,y);
m_b[0] = _mm_add_epi16(old[4], xy);
m_b[1] = old[4];
m_b[2] = _mm_add_epi16(old[5], y);
m_b[3] = _mm_add_epi16(old[5], x);
m_b[4] = _mm_add_epi16(old[6], x);
m_b[5] = _mm_add_epi16(old[6], y);
m_b[6] = old[7];
m_b[7] = _mm_add_epi16(old[7], xy);
new[0] = old[0];
new[1] = _mm_add_epi16(old[0], xy);
new[2] = _mm_add_epi16(old[1], x);
new[3] = _mm_add_epi16(old[1], y);
new[4] = _mm_add_epi16(old[2], y);
new[5] = _mm_add_epi16(old[2], x);
new[6] = _mm_add_epi16(old[3], xy);
new[7] = old[3];
for (i = 0; i < 8; i++) {
__m128i alpha = _mm_load_si128(alphaPtr++);
max0[i] = _mm_add_epi16(alpha, m_b[i]);
max1[i] = _mm_add_epi16(alpha, new[i]);
}
m1 = _mm_max_epi16(max1[0], max1[1]);
m0 = _mm_max_epi16(max0[0], max0[1]);
for (i = 2; i < 8; i++) {
m1 = _mm_max_epi16(m1, max1[i]);
m0 = _mm_max_epi16(m0, max0[i]);
}
for (i = 0; i < 8; i++) {
new[i] = _mm_max_epi16(m_b[i], new[i]);
old[i] = new[i];
}
__m128i out = _mm_sub_epi16(m1, m0);
_mm_store_si128(outputPtr++, out);
// normalize
if ((k%4)==0) {
for (int i=1;i<8;i++) {
_mm_sub_epi16(old[i], old[0]);
}
}
}
}
/* Computes alpha metrics */
void map_see_inter_alpha(srslte_tdec_simd_inter_t * s, int16_t *input, int16_t *parity, uint32_t long_cb)
{
__m128i m_b[8], new[8], old[8];
__m128i x, y, xy;
uint32_t k;
__m128i *inputPtr = (__m128i*) input;
__m128i *parityPtr = (__m128i*) parity;
__m128i *alphaPtr = (__m128i*) s->alpha;
old[0] = _mm_set1_epi16(0);
for (int i = 1; i < 8; i++) {
old[i] = _mm_set1_epi16(-INF);
}
for (k = 0; k < long_cb; k++) {
x = _mm_load_si128(inputPtr++);
y = _mm_load_si128(parityPtr++);
xy = _mm_add_epi16(x,y);
m_b[0] = old[0];
m_b[1] = _mm_add_epi16(old[3], y);
m_b[2] = _mm_add_epi16(old[4], y);
m_b[3] = old[7];
m_b[4] = old[1];
m_b[5] = _mm_add_epi16(old[2], y);
m_b[6] = _mm_add_epi16(old[5], y);
m_b[7] = old[6];
new[0] = _mm_add_epi16(old[1], xy);
new[1] = _mm_add_epi16(old[2], x);
new[2] = _mm_add_epi16(old[5], x);
new[3] = _mm_add_epi16(old[6], xy);
new[4] = _mm_add_epi16(old[0], xy);
new[5] = _mm_add_epi16(old[3], x);
new[6] = _mm_add_epi16(old[4], x);
new[7] = _mm_add_epi16(old[7], xy);
for (int i = 0; i < 8; i++) {
new[i] = _mm_max_epi16(m_b[i], new[i]);
old[i] = new[i];
_mm_store_si128(alphaPtr++, old[i]);
}
// normalize
if ((k%4)==0) {
for (int i=1;i<8;i++) {
_mm_sub_epi16(old[i], old[0]);
}
}
}
}
#endif

@ -32,18 +32,26 @@
#include "srslte/phy/utils/bit.h" #include "srslte/phy/utils/bit.h"
#include "srslte/phy/modem/demod_soft.h" #include "srslte/phy/modem/demod_soft.h"
// AVX implementation not useful for integers. Wait for AVX2
#ifdef LV_HAVE_SSE #ifdef LV_HAVE_SSE
#include <smmintrin.h> #include <smmintrin.h>
void demod_16qam_lte_s_sse(const cf_t *symbols, short *llr, int nsymbols); void demod_16qam_lte_s_sse(const cf_t *symbols, short *llr, int nsymbols);
#endif #endif
#define SCALE_SHORT_CONV_QPSK 100 #define SCALE_SHORT_CONV_QPSK 100
#define SCALE_SHORT_CONV_QAM16 400 #define SCALE_SHORT_CONV_QAM16 400
#define SCALE_SHORT_CONV_QAM64 700 #define SCALE_SHORT_CONV_QAM64 700
#define SCALE_BYTE_CONV_QPSK 20
#define SCALE_BYTE_CONV_QAM16 30
#define SCALE_BYTE_CONV_QAM64 40
void demod_bpsk_lte_b(const cf_t *symbols, int8_t *llr, int nsymbols) {
for (int i=0;i<nsymbols;i++) {
llr[i] = (int8_t) -SCALE_BYTE_CONV_QPSK*(crealf(symbols[i]) + cimagf(symbols[i]))/sqrt(2);
}
}
void demod_bpsk_lte_s(const cf_t *symbols, short *llr, int nsymbols) { void demod_bpsk_lte_s(const cf_t *symbols, short *llr, int nsymbols) {
for (int i=0;i<nsymbols;i++) { for (int i=0;i<nsymbols;i++) {
llr[i] = (short) -SCALE_SHORT_CONV_QPSK*(crealf(symbols[i]) + cimagf(symbols[i]))/sqrt(2); llr[i] = (short) -SCALE_SHORT_CONV_QPSK*(crealf(symbols[i]) + cimagf(symbols[i]))/sqrt(2);
@ -56,6 +64,10 @@ void demod_bpsk_lte(const cf_t *symbols, float *llr, int nsymbols) {
} }
} }
void demod_qpsk_lte_b(const cf_t *symbols, int8_t *llr, int nsymbols) {
srslte_vec_convert_fb((const float*) symbols, -SCALE_BYTE_CONV_QPSK*sqrt(2), llr, nsymbols*2);
}
void demod_qpsk_lte_s(const cf_t *symbols, short *llr, int nsymbols) { void demod_qpsk_lte_s(const cf_t *symbols, short *llr, int nsymbols) {
srslte_vec_convert_fi((const float*) symbols, -SCALE_SHORT_CONV_QPSK*sqrt(2), llr, nsymbols*2); srslte_vec_convert_fi((const float*) symbols, -SCALE_SHORT_CONV_QPSK*sqrt(2), llr, nsymbols*2);
} }
@ -87,9 +99,11 @@ void demod_16qam_lte_s_sse(const cf_t *symbols, short *llr, int nsymbols) {
__m128i result11, result12, result22, result21; __m128i result11, result12, result22, result21;
__m128 scale_v = _mm_set1_ps(-SCALE_SHORT_CONV_QAM16); __m128 scale_v = _mm_set1_ps(-SCALE_SHORT_CONV_QAM16);
__m128i shuffle_negated_1 = _mm_set_epi8(0xff,0xff,0xff,0xff,7,6,5,4,0xff,0xff,0xff,0xff,3,2,1,0); __m128i shuffle_negated_1 = _mm_set_epi8(0xff,0xff,0xff,0xff,7,6,5,4,0xff,0xff,0xff,0xff,3,2,1,0);
__m128i shuffle_negated_2 = _mm_set_epi8(0xff,0xff,0xff,0xff,15,14,13,12,0xff,0xff,0xff,0xff,11,10,9,8);
__m128i shuffle_abs_1 = _mm_set_epi8(7,6,5,4,0xff,0xff,0xff,0xff,3,2,1,0,0xff,0xff,0xff,0xff); __m128i shuffle_abs_1 = _mm_set_epi8(7,6,5,4,0xff,0xff,0xff,0xff,3,2,1,0,0xff,0xff,0xff,0xff);
__m128i shuffle_negated_2 = _mm_set_epi8(0xff,0xff,0xff,0xff,15,14,13,12,0xff,0xff,0xff,0xff,11,10,9,8);
__m128i shuffle_abs_2 = _mm_set_epi8(15,14,13,12,0xff,0xff,0xff,0xff,11,10,9,8,0xff,0xff,0xff,0xff); __m128i shuffle_abs_2 = _mm_set_epi8(15,14,13,12,0xff,0xff,0xff,0xff,11,10,9,8,0xff,0xff,0xff,0xff);
for (int i=0;i<nsymbols/4;i++) { for (int i=0;i<nsymbols/4;i++) {
symbol1 = _mm_load_ps(symbolsPtr); symbolsPtr+=4; symbol1 = _mm_load_ps(symbolsPtr); symbolsPtr+=4;
symbol2 = _mm_load_ps(symbolsPtr); symbolsPtr+=4; symbol2 = _mm_load_ps(symbolsPtr); symbolsPtr+=4;
@ -120,6 +134,60 @@ void demod_16qam_lte_s_sse(const cf_t *symbols, short *llr, int nsymbols) {
llr[4*i+3] = abs(yim)-2*SCALE_SHORT_CONV_QAM16/sqrt(10); llr[4*i+3] = abs(yim)-2*SCALE_SHORT_CONV_QAM16/sqrt(10);
} }
} }
void demod_16qam_lte_b_sse(const cf_t *symbols, int8_t *llr, int nsymbols) {
float *symbolsPtr = (float*) symbols;
__m128i *resultPtr = (__m128i*) llr;
__m128 symbol1, symbol2, symbol3, symbol4;
__m128i symbol_i1, symbol_i2, symbol_i3, symbol_i4, symbol_i, symbol_abs, symbol_12, symbol_34;
__m128i offset = _mm_set1_epi8(2*SCALE_BYTE_CONV_QAM16/sqrt(10));
__m128i result1n, result1a, result2n, result2a;
__m128 scale_v = _mm_set1_ps(-SCALE_BYTE_CONV_QAM16);
__m128i shuffle_negated_1 = _mm_set_epi8(0xff,0xff,7,6,0xff,0xff,5,4,0xff,0xff,3,2,0xff,0xff,1,0);
__m128i shuffle_abs_1 = _mm_set_epi8(7,6,0xff,0xff,5,4,0xff,0xff,3,2,0xff,0xff,1,0,0xff,0xff);
__m128i shuffle_negated_2 = _mm_set_epi8(0xff,0xff,15,14,0xff,0xff,13,12,0xff,0xff,11,10,0xff,0xff,9,8);
__m128i shuffle_abs_2 = _mm_set_epi8(15,14,0xff,0xff,13,12,0xff,0xff,11,10,0xff,0xff,9,8,0xff,0xff);
for (int i=0;i<nsymbols/8;i++) {
symbol1 = _mm_load_ps(symbolsPtr); symbolsPtr+=4;
symbol2 = _mm_load_ps(symbolsPtr); symbolsPtr+=4;
symbol3 = _mm_load_ps(symbolsPtr); symbolsPtr+=4;
symbol4 = _mm_load_ps(symbolsPtr); symbolsPtr+=4;
symbol_i1 = _mm_cvtps_epi32(_mm_mul_ps(symbol1, scale_v));
symbol_i2 = _mm_cvtps_epi32(_mm_mul_ps(symbol2, scale_v));
symbol_i3 = _mm_cvtps_epi32(_mm_mul_ps(symbol3, scale_v));
symbol_i4 = _mm_cvtps_epi32(_mm_mul_ps(symbol4, scale_v));
symbol_12 = _mm_packs_epi32(symbol_i1, symbol_i2);
symbol_34 = _mm_packs_epi32(symbol_i3, symbol_i4);
symbol_i = _mm_packs_epi16(symbol_12, symbol_34);
symbol_abs = _mm_abs_epi8(symbol_i);
symbol_abs = _mm_sub_epi8(symbol_abs, offset);
result1n = _mm_shuffle_epi8(symbol_i, shuffle_negated_1);
result1a = _mm_shuffle_epi8(symbol_abs, shuffle_abs_1);
result2n = _mm_shuffle_epi8(symbol_i, shuffle_negated_2);
result2a = _mm_shuffle_epi8(symbol_abs, shuffle_abs_2);
_mm_store_si128(resultPtr, _mm_or_si128(result1n, result1a)); resultPtr++;
_mm_store_si128(resultPtr, _mm_or_si128(result2n, result2a)); resultPtr++;
}
// Demodulate last symbols
for (int i=8*(nsymbols/8);i<nsymbols;i++) {
short yre = (int8_t) (SCALE_BYTE_CONV_QAM16*crealf(symbols[i]));
short yim = (int8_t) (SCALE_BYTE_CONV_QAM16*cimagf(symbols[i]));
llr[4*i+0] = -yre;
llr[4*i+1] = -yim;
llr[4*i+2] = abs(yre)-2*SCALE_BYTE_CONV_QAM16/sqrt(10);
llr[4*i+3] = abs(yim)-2*SCALE_BYTE_CONV_QAM16/sqrt(10);
}
}
#endif #endif
void demod_16qam_lte_s(const cf_t *symbols, short *llr, int nsymbols) { void demod_16qam_lte_s(const cf_t *symbols, short *llr, int nsymbols) {
@ -138,6 +206,22 @@ void demod_16qam_lte_s(const cf_t *symbols, short *llr, int nsymbols) {
#endif #endif
} }
void demod_16qam_lte_b(const cf_t *symbols, int8_t *llr, int nsymbols) {
#ifdef LV_HAVE_SSE
demod_16qam_lte_b_sse(symbols, llr, nsymbols);
#else
for (int i=0;i<nsymbols;i++) {
int8_t yre = (int8_t) (SCALE_BYTE_CONV_QAM16*crealf(symbols[i]));
int8_t yim = (int8_t) (SCALE_BYTE_CONV_QAM16*cimagf(symbols[i]));
llr[4*i+0] = -yre;
llr[4*i+1] = -yim;
llr[4*i+2] = abs(yre)-2*SCALE_BYTE_CONV_QAM16/sqrt(10);
llr[4*i+3] = abs(yim)-2*SCALE_BYTE_CONV_QAM16/sqrt(10);
}
#endif
}
void demod_64qam_lte(const cf_t *symbols, float *llr, int nsymbols) void demod_64qam_lte(const cf_t *symbols, float *llr, int nsymbols)
{ {
for (int i=0;i<nsymbols;i++) { for (int i=0;i<nsymbols;i++) {
@ -219,6 +303,76 @@ void demod_64qam_lte_s_sse(const cf_t *symbols, short *llr, int nsymbols)
} }
} }
void demod_64qam_lte_b_sse(const cf_t *symbols, int8_t *llr, int nsymbols)
{
float *symbolsPtr = (float*) symbols;
__m128i *resultPtr = (__m128i*) llr;
__m128 symbol1, symbol2, symbol3, symbol4;
__m128i symbol_i1, symbol_i2, symbol_i3, symbol_i4, symbol_i, symbol_abs, symbol_abs2,symbol_12, symbol_34;
__m128i offset1 = _mm_set1_epi8(4*SCALE_BYTE_CONV_QAM64/sqrt(42));
__m128i offset2 = _mm_set1_epi8(2*SCALE_BYTE_CONV_QAM64/sqrt(42));
__m128 scale_v = _mm_set1_ps(-SCALE_BYTE_CONV_QAM64);
__m128i result11, result12, result13, result22, result21,result23, result31, result32, result33;
__m128i shuffle_negated_1 = _mm_set_epi8(0xff,0xff,5,4,0xff,0xff,0xff,0xff,3,2,0xff,0xff,0xff,0xff,1,0);
__m128i shuffle_negated_2 = _mm_set_epi8(11,10,0xff,0xff,0xff,0xff,9,8,0xff,0xff,0xff,0xff,7,6,0xff,0xff);
__m128i shuffle_negated_3 = _mm_set_epi8(0xff,0xff,0xff,0xff,15,14,0xff,0xff,0xff,0xff,13,12,0xff,0xff,0xff,0xff);
__m128i shuffle_abs_1 = _mm_set_epi8(5,4,0xff,0xff,0xff,0xff,3,2,0xff,0xff,0xff,0xff,1,0,0xff,0xff);
__m128i shuffle_abs_2 = _mm_set_epi8(0xff,0xff,0xff,0xff,9,8,0xff,0xff,0xff,0xff,7,6,0xff,0xff,0xff,0xff);
__m128i shuffle_abs_3 = _mm_set_epi8(0xff,0xff,15,14,0xff,0xff,0xff,0xff,13,12,0xff,0xff,0xff,0xff,11,10);
__m128i shuffle_abs2_1 = _mm_set_epi8(0xff,0xff,0xff,0xff,3,2,0xff,0xff,0xff,0xff,1,0,0xff,0xff,0xff,0xff);
__m128i shuffle_abs2_2 = _mm_set_epi8(0xff,0xff,9,8,0xff,0xff,0xff,0xff,7,6,0xff,0xff,0xff,0xff,5,4);
__m128i shuffle_abs2_3 = _mm_set_epi8(15,14,0xff,0xff,0xff,0xff,13,12,0xff,0xff,0xff,0xff,11,10,0xff,0xff);
for (int i=0;i<nsymbols/8;i++) {
symbol1 = _mm_load_ps(symbolsPtr); symbolsPtr+=4;
symbol2 = _mm_load_ps(symbolsPtr); symbolsPtr+=4;
symbol3 = _mm_load_ps(symbolsPtr); symbolsPtr+=4;
symbol4 = _mm_load_ps(symbolsPtr); symbolsPtr+=4;
symbol_i1 = _mm_cvtps_epi32(_mm_mul_ps(symbol1, scale_v));
symbol_i2 = _mm_cvtps_epi32(_mm_mul_ps(symbol2, scale_v));
symbol_i3 = _mm_cvtps_epi32(_mm_mul_ps(symbol3, scale_v));
symbol_i4 = _mm_cvtps_epi32(_mm_mul_ps(symbol4, scale_v));
symbol_12 = _mm_packs_epi32(symbol_i1, symbol_i2);
symbol_34 = _mm_packs_epi32(symbol_i3, symbol_i4);
symbol_i = _mm_packs_epi16(symbol_12, symbol_34);
symbol_abs = _mm_abs_epi8(symbol_i);
symbol_abs = _mm_sub_epi8(symbol_abs, offset1);
symbol_abs2 = _mm_sub_epi8(_mm_abs_epi8(symbol_abs), offset2);
result11 = _mm_shuffle_epi8(symbol_i, shuffle_negated_1);
result12 = _mm_shuffle_epi8(symbol_abs, shuffle_abs_1);
result13 = _mm_shuffle_epi8(symbol_abs2, shuffle_abs2_1);
result21 = _mm_shuffle_epi8(symbol_i, shuffle_negated_2);
result22 = _mm_shuffle_epi8(symbol_abs, shuffle_abs_2);
result23 = _mm_shuffle_epi8(symbol_abs2, shuffle_abs2_2);
result31 = _mm_shuffle_epi8(symbol_i, shuffle_negated_3);
result32 = _mm_shuffle_epi8(symbol_abs, shuffle_abs_3);
result33 = _mm_shuffle_epi8(symbol_abs2, shuffle_abs2_3);
_mm_store_si128(resultPtr, _mm_or_si128(_mm_or_si128(result11, result12),result13)); resultPtr++;
_mm_store_si128(resultPtr, _mm_or_si128(_mm_or_si128(result21, result22),result23)); resultPtr++;
_mm_store_si128(resultPtr, _mm_or_si128(_mm_or_si128(result31, result32),result33)); resultPtr++;
}
for (int i=8*(nsymbols/8);i<nsymbols;i++) {
float yre = (int8_t) (SCALE_BYTE_CONV_QAM64*crealf(symbols[i]));
float yim = (int8_t) (SCALE_BYTE_CONV_QAM64*cimagf(symbols[i]));
llr[6*i+0] = -yre;
llr[6*i+1] = -yim;
llr[6*i+2] = abs(yre)-4*SCALE_BYTE_CONV_QAM64/sqrt(42);
llr[6*i+3] = abs(yim)-4*SCALE_BYTE_CONV_QAM64/sqrt(42);
llr[6*i+4] = abs(llr[6*i+2])-2*SCALE_BYTE_CONV_QAM64/sqrt(42);
llr[6*i+5] = abs(llr[6*i+3])-2*SCALE_BYTE_CONV_QAM64/sqrt(42);
}
}
#endif #endif
void demod_64qam_lte_s(const cf_t *symbols, short *llr, int nsymbols) void demod_64qam_lte_s(const cf_t *symbols, short *llr, int nsymbols)
@ -240,6 +394,25 @@ void demod_64qam_lte_s(const cf_t *symbols, short *llr, int nsymbols)
#endif #endif
} }
void demod_64qam_lte_b(const cf_t *symbols, int8_t *llr, int nsymbols)
{
#ifdef LV_HAVE_SSE
demod_64qam_lte_b_sse(symbols, llr, nsymbols);
#else
for (int i=0;i<nsymbols;i++) {
float yre = (int8_t) (SCALE_BYTE_CONV_QAM64*crealf(symbols[i]));
float yim = (int8_t) (SCALE_BYTE_CONV_QAM64*cimagf(symbols[i]));
llr[6*i+0] = -yre;
llr[6*i+1] = -yim;
llr[6*i+2] = abs(yre)-4*SCALE_BYTE_CONV_QAM64/sqrt(42);
llr[6*i+3] = abs(yim)-4*SCALE_BYTE_CONV_QAM64/sqrt(42);
llr[6*i+4] = abs(llr[6*i+2])-2*SCALE_BYTE_CONV_QAM64/sqrt(42);
llr[6*i+5] = abs(llr[6*i+3])-2*SCALE_BYTE_CONV_QAM64/sqrt(42);
}
#endif
}
int srslte_demod_soft_demodulate(srslte_mod_t modulation, const cf_t* symbols, float* llr, int nsymbols) { int srslte_demod_soft_demodulate(srslte_mod_t modulation, const cf_t* symbols, float* llr, int nsymbols) {
switch(modulation) { switch(modulation) {
case SRSLTE_MOD_BPSK: case SRSLTE_MOD_BPSK:
@ -281,3 +454,24 @@ int srslte_demod_soft_demodulate_s(srslte_mod_t modulation, const cf_t* symbols,
} }
return 0; return 0;
} }
int srslte_demod_soft_demodulate_b(srslte_mod_t modulation, const cf_t* symbols, int8_t* llr, int nsymbols) {
switch(modulation) {
case SRSLTE_MOD_BPSK:
demod_bpsk_lte_b(symbols, llr, nsymbols);
break;
case SRSLTE_MOD_QPSK:
demod_qpsk_lte_b(symbols, llr, nsymbols);
break;
case SRSLTE_MOD_16QAM:
demod_16qam_lte_b(symbols, llr, nsymbols);
break;
case SRSLTE_MOD_64QAM:
demod_64qam_lte_b(symbols, llr, nsymbols);
break;
default:
fprintf(stderr, "Invalid modulation %d\n", modulation);
return -1;
}
return 0;
}

@ -112,6 +112,7 @@ int main(int argc, char **argv) {
cf_t *symbols; cf_t *symbols;
float *llr; float *llr;
short *llr_s; short *llr_s;
int8_t *llr_b;
parse_args(argc, argv); parse_args(argc, argv);
@ -153,6 +154,12 @@ int main(int argc, char **argv) {
exit(-1); exit(-1);
} }
llr_b = srslte_vec_malloc(sizeof(int8_t) * num_bits);
if (!llr_b) {
perror("malloc");
exit(-1);
}
/* generate random data */ /* generate random data */
srand(0); srand(0);
@ -160,6 +167,7 @@ int main(int argc, char **argv) {
struct timeval t[3]; struct timeval t[3];
float mean_texec = 0.0; float mean_texec = 0.0;
float mean_texec_s = 0.0; float mean_texec_s = 0.0;
float mean_texec_b = 0.0;
for (int n=0;n<nof_frames;n++) { for (int n=0;n<nof_frames;n++) {
for (i=0;i<num_bits;i++) { for (i=0;i<num_bits;i++) {
input[i] = rand()%2; input[i] = rand()%2;
@ -187,6 +195,15 @@ int main(int argc, char **argv) {
mean_texec_s = SRSLTE_VEC_CMA((float) t[0].tv_usec, mean_texec_s, n-1); mean_texec_s = SRSLTE_VEC_CMA((float) t[0].tv_usec, mean_texec_s, n-1);
} }
gettimeofday(&t[1], NULL);
srslte_demod_soft_demodulate_b(modulation, symbols, llr_b, num_bits / mod.nbits_x_symbol);
gettimeofday(&t[2], NULL);
get_time_interval(t);
if (n > 0) {
mean_texec_b = SRSLTE_VEC_CMA((float) t[0].tv_usec, mean_texec_b, n-1);
}
if (SRSLTE_VERBOSE_ISDEBUG()) { if (SRSLTE_VERBOSE_ISDEBUG()) {
printf("bits="); printf("bits=");
srslte_vec_fprint_b(stdout, input, num_bits); srslte_vec_fprint_b(stdout, input, num_bits);
@ -200,6 +217,9 @@ int main(int argc, char **argv) {
printf("llr_s="); printf("llr_s=");
srslte_vec_fprint_s(stdout, llr_s, num_bits); srslte_vec_fprint_s(stdout, llr_s, num_bits);
printf("llr_b=");
srslte_vec_fprint_bs(stdout, llr_b, num_bits);
} }
// Check demodulation errors // Check demodulation errors
@ -213,6 +233,8 @@ int main(int argc, char **argv) {
ret = 0; ret = 0;
clean_exit: clean_exit:
free(llr_b);
free(llr_s);
free(llr); free(llr);
free(symbols); free(symbols);
free(output); free(output);
@ -220,7 +242,7 @@ clean_exit:
srslte_modem_table_free(&mod); srslte_modem_table_free(&mod);
printf("Mean Throughput: %.2f/%.2f. Mbps ExTime: %.2f/%.2f us\n", printf("Mean Throughput: %.2f/%.2f/%.2f. Mbps ExTime: %.2f/%.2f/%.2f us\n",
num_bits/mean_texec, num_bits/mean_texec_s, mean_texec, mean_texec_s); num_bits/mean_texec, num_bits/mean_texec_s, num_bits/mean_texec_b, mean_texec, mean_texec_s, mean_texec_b);
exit(ret); exit(ret);
} }

@ -669,31 +669,10 @@ static int srslte_pdsch_codeword_encode(srslte_pdsch_t *q, srslte_pdsch_cfg_t *c
return SRSLTE_SUCCESS; return SRSLTE_SUCCESS;
} }
static int srslte_pdsch_codeword_decode(srslte_pdsch_t *q, srslte_pdsch_cfg_t *cfg, srslte_sch_t *dl_sch, static void csi_correction(srslte_pdsch_t *q, srslte_pdsch_cfg_t *cfg, uint32_t codeword_idx, uint32_t tb_idx, void *e)
srslte_softbuffer_rx_t *softbuffer, uint16_t rnti, uint8_t *data, {
uint32_t codeword_idx, uint32_t tb_idx, bool *ack) {
srslte_ra_nbits_t *nbits = &cfg->nbits[tb_idx];
srslte_ra_mcs_t *mcs = &cfg->grant.mcs[tb_idx];
uint32_t rv = cfg->rv[tb_idx];
int ret = SRSLTE_ERROR_INVALID_INPUTS;
if (softbuffer && data && ack && nbits->nof_bits && nbits->nof_re) {
INFO("Decoding PDSCH SF: %d (CW%d -> TB%d), Mod %s, NofBits: %d, NofSymbols: %d, NofBitsE: %d, rv_idx: %d\n",
cfg->sf_idx, codeword_idx, tb_idx, srslte_mod_string(mcs->mod), mcs->tbs,
nbits->nof_re, nbits->nof_bits, rv);
/* demodulate symbols
* The MAX-log-MAP algorithm used in turbo decoding is unsensitive to SNR estimation,
* thus we don't need tot set it in the LLRs normalization
*/
srslte_demod_soft_demodulate_s(mcs->mod, q->d[codeword_idx], q->e[codeword_idx], nbits->nof_re);
/* Select scrambling sequence */
srslte_sequence_t *seq = get_user_sequence(q, rnti, codeword_idx, cfg->sf_idx, nbits->nof_bits);
/* Bit scrambling */
srslte_scrambling_s_offset(seq, q->e[codeword_idx], 0, nbits->nof_bits);
srslte_ra_nbits_t *nbits = &cfg->nbits[tb_idx];
uint32_t qm = 0; uint32_t qm = 0;
switch(cfg->grant.mcs[tb_idx].mod) { switch(cfg->grant.mcs[tb_idx].mod) {
@ -713,22 +692,64 @@ static int srslte_pdsch_codeword_decode(srslte_pdsch_t *q, srslte_pdsch_cfg_t *c
ERROR("No modulation"); ERROR("No modulation");
} }
int16_t *e = q->e[codeword_idx];
if (q->csi_enabled) {
const uint32_t csi_max_idx = srslte_vec_max_fi(q->csi[codeword_idx], nbits->nof_bits / qm); const uint32_t csi_max_idx = srslte_vec_max_fi(q->csi[codeword_idx], nbits->nof_bits / qm);
float csi_max = 1.0f; float csi_max = 1.0f;
if (csi_max_idx < nbits->nof_bits / qm) { if (csi_max_idx < nbits->nof_bits / qm) {
csi_max = q->csi[codeword_idx][csi_max_idx]; csi_max = q->csi[codeword_idx][csi_max_idx];
} }
int8_t *e_b = e;
int16_t *e_s = e;
for (int i = 0; i < nbits->nof_bits / qm; i++) { for (int i = 0; i < nbits->nof_bits / qm; i++) {
const float csi = q->csi[codeword_idx][i] / csi_max; const float csi = q->csi[codeword_idx][i] / csi_max;
if (q->llr_is_8bit) {
for (int k = 0; k < qm; k++) { for (int k = 0; k < qm; k++) {
e[qm * i + k] = (int16_t) ((float) e[qm * i + k] * csi); e_b[qm * i + k] = (int8_t) ((float) e_b[qm * i + k] * csi);
}
} else {
for (int k = 0; k < qm; k++) {
e_s[qm * i + k] = (int16_t) ((float) e_s[qm * i + k] * csi);
}
} }
} }
} }
static int srslte_pdsch_codeword_decode(srslte_pdsch_t *q, srslte_pdsch_cfg_t *cfg, srslte_sch_t *dl_sch,
srslte_softbuffer_rx_t *softbuffer, uint16_t rnti, uint8_t *data,
uint32_t codeword_idx, uint32_t tb_idx, bool *ack) {
srslte_ra_nbits_t *nbits = &cfg->nbits[tb_idx];
srslte_ra_mcs_t *mcs = &cfg->grant.mcs[tb_idx];
uint32_t rv = cfg->rv[tb_idx];
int ret = SRSLTE_ERROR_INVALID_INPUTS;
if (softbuffer && data && ack && nbits->nof_bits && nbits->nof_re) {
INFO("Decoding PDSCH SF: %d (CW%d -> TB%d), Mod %s, NofBits: %d, NofSymbols: %d, NofBitsE: %d, rv_idx: %d\n",
cfg->sf_idx, codeword_idx, tb_idx, srslte_mod_string(mcs->mod), mcs->tbs,
nbits->nof_re, nbits->nof_bits, rv);
/* demodulate symbols
* The MAX-log-MAP algorithm used in turbo decoding is unsensitive to SNR estimation,
* thus we don't need tot set it in the LLRs normalization
*/
if (q->llr_is_8bit) {
srslte_demod_soft_demodulate_b(mcs->mod, q->d[codeword_idx], q->e[codeword_idx], nbits->nof_re);
} else {
srslte_demod_soft_demodulate_s(mcs->mod, q->d[codeword_idx], q->e[codeword_idx], nbits->nof_re);
}
/* Select scrambling sequence */
srslte_sequence_t *seq = get_user_sequence(q, rnti, codeword_idx, cfg->sf_idx, nbits->nof_bits);
/* Bit scrambling */
if (q->llr_is_8bit) {
srslte_scrambling_sb_offset(seq, q->e[codeword_idx], 0, nbits->nof_bits);
} else {
srslte_scrambling_s_offset(seq, q->e[codeword_idx], 0, nbits->nof_bits);
}
if (q->csi_enabled) {
csi_correction(q, cfg, codeword_idx, tb_idx, q->e[codeword_idx]);
}
/* Return */ /* Return */
ret = srslte_dlsch_decode2(dl_sch, cfg, softbuffer, q->e[codeword_idx], data, tb_idx); ret = srslte_dlsch_decode2(dl_sch, cfg, softbuffer, q->e[codeword_idx], data, tb_idx);

@ -603,7 +603,11 @@ int srslte_pusch_decode(srslte_pusch_t *q,
srslte_dft_precoding(&q->dft_precoding, q->z, q->d, cfg->grant.L_prb, cfg->nbits.nof_symb); srslte_dft_precoding(&q->dft_precoding, q->z, q->d, cfg->grant.L_prb, cfg->nbits.nof_symb);
// Soft demodulation // Soft demodulation
if (q->llr_is_8bit) {
srslte_demod_soft_demodulate_b(cfg->grant.mcs.mod, q->d, q->q, cfg->nbits.nof_re);
} else {
srslte_demod_soft_demodulate_s(cfg->grant.mcs.mod, q->d, q->q, cfg->nbits.nof_re); srslte_demod_soft_demodulate_s(cfg->grant.mcs.mod, q->d, q->q, cfg->nbits.nof_re);
}
// Generate scrambling sequence if not pre-generated // Generate scrambling sequence if not pre-generated
srslte_sequence_t *seq = get_user_sequence(q, rnti, cfg->sf_idx, cfg->nbits.nof_bits); srslte_sequence_t *seq = get_user_sequence(q, rnti, cfg->sf_idx, cfg->nbits.nof_bits);
@ -632,7 +636,11 @@ int srslte_pusch_decode(srslte_pusch_t *q,
} }
// Descrambling // Descrambling
if (q->llr_is_8bit) {
srslte_scrambling_sb_offset(seq, q->q, 0, cfg->nbits.nof_bits);
} else {
srslte_scrambling_s_offset(seq, q->q, 0, cfg->nbits.nof_bits); srslte_scrambling_s_offset(seq, q->q, 0, cfg->nbits.nof_bits);
}
// Decode // Decode
ret = srslte_ulsch_uci_decode(&q->ul_sch, cfg, softbuffer, q->q, q->g, data, uci_data); ret = srslte_ulsch_uci_decode(&q->ul_sch, cfg, softbuffer, q->q, q->g, data, uci_data);

@ -32,12 +32,18 @@
#include <stdbool.h> #include <stdbool.h>
#include <assert.h> #include <assert.h>
#include <math.h> #include <math.h>
#include <srslte/phy/phch/sch.h>
#include "srslte/phy/phch/pdsch.h" #include "srslte/phy/phch/pdsch.h"
#include "srslte/phy/utils/bit.h" #include "srslte/phy/utils/bit.h"
#include "srslte/phy/utils/debug.h" #include "srslte/phy/utils/debug.h"
#include "srslte/phy/utils/vector.h" #include "srslte/phy/utils/vector.h"
#define SRSLTE_PDSCH_MAX_TDEC_ITERS 4 #define SRSLTE_PDSCH_MAX_TDEC_ITERS 10
#ifdef LV_HAVE_SSE
#include <immintrin.h>
#endif /* LV_HAVE_SSE */
/* 36.213 Table 8.6.3-1: Mapping of HARQ-ACK offset values and the index signalled by higher layers */ /* 36.213 Table 8.6.3-1: Mapping of HARQ-ACK offset values and the index signalled by higher layers */
float beta_harq_offset[16] = {2.0, 2.5, 3.125, 4.0, 5.0, 6.250, 8.0, 10.0, float beta_harq_offset[16] = {2.0, 2.5, 3.125, 4.0, 5.0, 6.250, 8.0, 10.0,
@ -184,8 +190,6 @@ static int encode_tb_off(srslte_sch_t *q,
uint32_t Qm, uint32_t rv, uint32_t nof_e_bits, uint32_t Qm, uint32_t rv, uint32_t nof_e_bits,
uint8_t *data, uint8_t *e_bits, uint32_t w_offset) uint8_t *data, uint8_t *e_bits, uint32_t w_offset)
{ {
uint8_t parity[3] = {0, 0, 0};
uint32_t par;
uint32_t i; uint32_t i;
uint32_t cb_len=0, rp=0, wp=0, rlen=0, n_e=0; uint32_t cb_len=0, rp=0, wp=0, rlen=0, n_e=0;
int ret = SRSLTE_ERROR_INVALID_INPUTS; int ret = SRSLTE_ERROR_INVALID_INPUTS;
@ -213,16 +217,8 @@ static int encode_tb_off(srslte_sch_t *q,
gamma = Gp%cb_segm->C; gamma = Gp%cb_segm->C;
} }
if (data) { /* Reset TB CRC */
srslte_crc_set_init(&q->crc_tb, 0);
/* Compute transport block CRC */
par = srslte_crc_checksum_byte(&q->crc_tb, data, cb_segm->tbs);
/* parity bits will be appended later */
parity[0] = (par&(0xff<<16))>>16;
parity[1] = (par&(0xff<<8))>>8;
parity[2] = par&0xff;
}
wp = 0; wp = 0;
rp = 0; rp = 0;
@ -252,6 +248,7 @@ static int encode_tb_off(srslte_sch_t *q,
cb_len, rlen, wp, rp, n_e); cb_len, rlen, wp, rp, n_e);
if (data) { if (data) {
bool last_cb = false;
/* Copy data to another buffer, making space for the Codeblock CRC */ /* Copy data to another buffer, making space for the Codeblock CRC */
if (i < cb_segm->C - 1) { if (i < cb_segm->C - 1) {
@ -263,13 +260,19 @@ static int encode_tb_off(srslte_sch_t *q,
/* Append Transport Block parity bits to the last CB */ /* Append Transport Block parity bits to the last CB */
memcpy(q->cb_in, &data[rp/8], (rlen - 24) * sizeof(uint8_t)/8); memcpy(q->cb_in, &data[rp/8], (rlen - 24) * sizeof(uint8_t)/8);
memcpy(&q->cb_in[(rlen - 24)/8], parity, 3 * sizeof(uint8_t)); last_cb = true;
} }
/* Turbo Encoding /* Turbo Encoding
* If Codeblock CRC is required it is given the CRC instance pointer, otherwise CRC pointer shall be NULL * If Codeblock CRC is required it is given the CRC instance pointer, otherwise CRC pointer shall be NULL
*/ */
srslte_tcod_encode_lut(&q->encoder, (cb_segm->C > 1) ? &q->crc_cb : NULL, q->cb_in, q->parity_bits, cblen_idx); srslte_tcod_encode_lut(&q->encoder,
&q->crc_tb,
(cb_segm->C > 1) ? &q->crc_cb : NULL,
q->cb_in,
q->parity_bits,
cblen_idx,
last_cb);
} }
DEBUG("RM cblen_idx=%d, n_e=%d, wp=%d, nof_e_bits=%d\n",cblen_idx, n_e, wp, nof_e_bits); DEBUG("RM cblen_idx=%d, n_e=%d, wp=%d, nof_e_bits=%d\n",cblen_idx, n_e, wp, nof_e_bits);
@ -304,88 +307,65 @@ static int encode_tb(srslte_sch_t *q,
bool decode_tb_cb(srslte_sch_t *q, bool decode_tb_cb(srslte_sch_t *q,
srslte_softbuffer_rx_t *softbuffer, srslte_cbsegm_t *cb_segm, srslte_softbuffer_rx_t *softbuffer, srslte_cbsegm_t *cb_segm,
uint32_t Qm, uint32_t rv, uint32_t nof_e_bits, uint32_t Qm, uint32_t rv, uint32_t nof_e_bits,
int16_t *e_bits, uint8_t *data, void *e_bits, uint8_t *data)
uint32_t cb_size_group)
{ {
bool cb_map[SRSLTE_MAX_CODEBLOCKS]; int8_t *e_bits_b = e_bits;
int16_t *e_bits_s = e_bits;
uint32_t cb_idx[SRSLTE_TDEC_MAX_NPAR];
int16_t *decoder_input[SRSLTE_TDEC_MAX_NPAR];
uint32_t nof_cb = cb_size_group?cb_segm->C2:cb_segm->C1; if (cb_segm->C > SRSLTE_MAX_CODEBLOCKS) {
uint32_t first_cb = cb_size_group?cb_segm->C1:0;
uint32_t cb_len = cb_size_group?cb_segm->K2:cb_segm->K1;
uint32_t cb_len_idx = cb_size_group?cb_segm->K2_idx:cb_segm->K1_idx;
uint32_t rlen = cb_segm->C==1?cb_len:(cb_len-24);
uint32_t Gp = nof_e_bits / Qm;
uint32_t gamma = cb_segm->C>0?Gp%cb_segm->C:Gp;
uint32_t n_e = Qm * (Gp/cb_segm->C);
if (nof_cb > SRSLTE_MAX_CODEBLOCKS) {
fprintf(stderr, "Error SRSLTE_MAX_CODEBLOCKS=%d\n", SRSLTE_MAX_CODEBLOCKS); fprintf(stderr, "Error SRSLTE_MAX_CODEBLOCKS=%d\n", SRSLTE_MAX_CODEBLOCKS);
return false; return false;
} }
for (int i=0;i<srslte_tdec_get_nof_parallel(&q->decoder);i++) {
cb_idx[i] = i+first_cb;
decoder_input[i] = NULL;
}
uint32_t remaining_cb = 0;
for (int i=0;i<nof_cb;i++) {
/* Do not process blocks with CRC Ok */
cb_map[i] = softbuffer->cb_crc[i];
if (softbuffer->cb_crc[i] == false) {
remaining_cb ++;
}
}
srslte_tdec_reset(&q->decoder, cb_len);
q->nof_iterations = 0; q->nof_iterations = 0;
while(remaining_cb>0) { for (int cb_idx=0;cb_idx<cb_segm->C;cb_idx++)
{
/* Do not process blocks with CRC Ok */
if (softbuffer->cb_crc[cb_idx] == false) {
// Unratematch the codeblocks left to decode uint32_t cb_len = cb_idx<cb_segm->C1?cb_segm->K1:cb_segm->K2;
for (int i=0;i<srslte_tdec_get_nof_parallel(&q->decoder);i++) { uint32_t cb_len_idx = cb_idx<cb_segm->C1?cb_segm->K1_idx:cb_segm->K2_idx;
if (!decoder_input[i] && remaining_cb > 0) { uint32_t rlen = cb_segm->C==1?cb_len:(cb_len-24);
// Find an unprocessed CB uint32_t Gp = nof_e_bits / Qm;
cb_idx[i]=first_cb; uint32_t gamma = cb_segm->C>0?Gp%cb_segm->C:Gp;
while(cb_idx[i]<first_cb+nof_cb-1 && cb_map[cb_idx[i]]) { uint32_t n_e = Qm * (Gp/cb_segm->C);
cb_idx[i]++;
}
if (cb_map[cb_idx[i]] == false) {
cb_map[cb_idx[i]] = true;
uint32_t rp = cb_idx[i]*n_e; uint32_t rp = cb_idx*n_e;
uint32_t n_e2 = n_e; uint32_t n_e2 = n_e;
if (cb_idx[i] > cb_segm->C - gamma) { if (cb_idx > cb_segm->C - gamma) {
n_e2 = n_e+Qm; n_e2 = n_e+Qm;
rp = (cb_segm->C - gamma)*n_e + (cb_idx[i]-(cb_segm->C - gamma))*n_e2; rp = (cb_segm->C - gamma)*n_e + (cb_idx-(cb_segm->C - gamma))*n_e2;
} }
INFO("CB %d: rp=%d, n_e=%d, i=%d\n", cb_idx[i], rp, n_e2, i); if (q->llr_is_8bit) {
if (srslte_rm_turbo_rx_lut(&e_bits[rp], softbuffer->buffer_f[cb_idx[i]], n_e2, cb_len_idx, rv)) { if (srslte_rm_turbo_rx_lut_8bit(&e_bits_b[rp], (int8_t*) softbuffer->buffer_f[cb_idx], n_e2, cb_len_idx, rv)) {
fprintf(stderr, "Error in rate matching\n"); fprintf(stderr, "Error in rate matching\n");
return SRSLTE_ERROR; return SRSLTE_ERROR;
} }
} else {
decoder_input[i] = softbuffer->buffer_f[cb_idx[i]]; if (srslte_rm_turbo_rx_lut(&e_bits_s[rp], softbuffer->buffer_f[cb_idx], n_e2, cb_len_idx, rv)) {
} fprintf(stderr, "Error in rate matching\n");
return SRSLTE_ERROR;
} }
} }
// Run 1 iteration for the codeblocks in queue srslte_tdec_new_cb(&q->decoder, cb_len);
srslte_tdec_iteration_par(&q->decoder, decoder_input, cb_len);
// Decide output bits and compute CRC // Run iterations and use CRC for early stopping
for (int i=0;i<srslte_tdec_get_nof_parallel(&q->decoder);i++) { bool early_stop = false;
if (decoder_input[i]) { uint32_t cb_noi = 0;
srslte_tdec_decision_byte_par_cb(&q->decoder, q->cb_in, i, cb_len); do {
if (q->llr_is_8bit) {
srslte_tdec_iteration_8bit(&q->decoder, (int8_t*) softbuffer->buffer_f[cb_idx], &data[cb_idx*rlen/8]);
} else {
srslte_tdec_iteration(&q->decoder, softbuffer->buffer_f[cb_idx], &data[cb_idx*rlen/8]);
}
q->nof_iterations++;
cb_noi++;
uint32_t len_crc; uint32_t len_crc;
srslte_crc_t *crc_ptr; srslte_crc_t *crc_ptr;
@ -399,47 +379,45 @@ bool decode_tb_cb(srslte_sch_t *q,
} }
// CRC is OK // CRC is OK
if (!srslte_crc_checksum_byte(crc_ptr, q->cb_in, len_crc)) { if (!srslte_crc_checksum_byte(crc_ptr, &data[cb_idx*rlen/8], len_crc)) {
memcpy(softbuffer->data[cb_idx[i]], q->cb_in, rlen/8 * sizeof(uint8_t)); softbuffer->cb_crc[cb_idx] = true;
softbuffer->cb_crc[cb_idx[i]] = true; early_stop = true;
q->nof_iterations += srslte_tdec_get_nof_iterations_cb(&q->decoder, i);
// Reset number of iterations for that CB in the decoder
srslte_tdec_reset_cb(&q->decoder, i);
remaining_cb--;
decoder_input[i] = NULL;
cb_idx[i] = 0;
// CRC is error and exceeded maximum iterations for this CB. // CRC is error and exceeded maximum iterations for this CB.
// Early stop the whole transport block. // Early stop the whole transport block.
} else if (srslte_tdec_get_nof_iterations_cb(&q->decoder, i) >= q->max_iterations) {
INFO("CB %d: Error. CB is erroneous. remaining_cb=%d, i=%d, first_cb=%d, nof_cb=%d\n",
cb_idx[i], remaining_cb, i, first_cb, nof_cb);
q->nof_iterations += q->max_iterations;
srslte_tdec_reset_cb(&q->decoder, i);
remaining_cb--;
decoder_input[i] = NULL;
cb_idx[i] = 0;
}
} }
} while (cb_noi < q->max_iterations && !early_stop);
INFO("CB %d: rp=%d, n_e=%d, cb_len=%d, CRC=%s, rlen=%d, iterations=%d/%d\n",
cb_idx, rp, n_e2, cb_len, early_stop?"OK":"KO", rlen, cb_noi, q->max_iterations);
} else {
// Copy decoded data from previous transmissions
uint32_t cb_len = cb_idx<cb_segm->C1?cb_segm->K1:cb_segm->K2;
uint32_t rlen = cb_segm->C==1?cb_len:(cb_len-24);
memcpy(&data[cb_idx*rlen/8], softbuffer->data[cb_idx], rlen/8 * sizeof(uint8_t));
} }
} }
softbuffer->tb_crc = true; softbuffer->tb_crc = true;
for (int i = 0; i < nof_cb && softbuffer->tb_crc; i++) { for (int i = 0; i < cb_segm->C && softbuffer->tb_crc; i++) {
/* If one CB failed return false */ /* If one CB failed return false */
softbuffer->tb_crc = softbuffer->cb_crc[i]; softbuffer->tb_crc = softbuffer->cb_crc[i];
} }
if (softbuffer->tb_crc) { // If TB CRC failed, save correct CB for next retransmission
for (int i = 0; i < nof_cb; i++) { if (!softbuffer->tb_crc) {
memcpy(&data[i * rlen / 8], softbuffer->data[i], rlen/8 * sizeof(uint8_t)); for (int i = 0; i < cb_segm->C; i++) {
if (softbuffer->cb_crc[i]) {
uint32_t cb_len = i<cb_segm->C1?cb_segm->K1:cb_segm->K2;
uint32_t rlen = cb_segm->C==1?cb_len:(cb_len-24);
memcpy(softbuffer->data[i], &data[i * rlen / 8], rlen/8 * sizeof(uint8_t));
}
} }
} }
q->nof_iterations /= nof_cb; q->nof_iterations /= cb_segm->C;
return softbuffer->tb_crc; return softbuffer->tb_crc;
} }
@ -485,16 +463,12 @@ static int decode_tb(srslte_sch_t *q,
bool crc_ok = true; bool crc_ok = true;
uint32_t nof_cb_groups = cb_segm->C2>0?2:1;
data[cb_segm->tbs/8+0] = 0; data[cb_segm->tbs/8+0] = 0;
data[cb_segm->tbs/8+1] = 0; data[cb_segm->tbs/8+1] = 0;
data[cb_segm->tbs/8+2] = 0; data[cb_segm->tbs/8+2] = 0;
// Process Codeblocks in groups of equal CB size to parallelize according to SRSLTE_TDEC_MAX_NPAR // Process Codeblocks
for (uint32_t i=0;i<nof_cb_groups && crc_ok;i++) { crc_ok = decode_tb_cb(q, softbuffer, cb_segm, Qm, rv, nof_e_bits, e_bits, data);
crc_ok = decode_tb_cb(q, softbuffer, cb_segm, Qm, rv, nof_e_bits, e_bits, data, i);
}
if (crc_ok) { if (crc_ok) {
@ -593,22 +567,286 @@ static void ulsch_interleave_gen(uint32_t H_prime_total, uint32_t N_pusch_symbs,
} }
} }
static void ulsch_interleave_qm2(const uint8_t *g_bits, uint32_t rows, uint32_t cols, uint8_t *q_bits, uint32_t ri_min_row, const uint8_t *ri_present) {
uint32_t bit_read_idx = 0;
for (uint32_t j = 0; j < ri_min_row; j++) {
for (uint32_t i = 0; i < cols; i++) {
uint32_t k = (i * rows + j) * 2;
uint32_t read_byte_idx = bit_read_idx / 8;
uint32_t read_bit_idx = bit_read_idx % 8;
uint32_t write_byte_idx = k / 8;
uint32_t write_bit_idx = k % 8;
uint8_t w = (g_bits[read_byte_idx] >> (6 - read_bit_idx)) & (uint8_t) 0x03;
q_bits[write_byte_idx] |= w << (6 - write_bit_idx);
bit_read_idx += 2;
}
}
for (uint32_t j = ri_min_row; j < rows; j++) {
for (uint32_t i = 0; i < cols; i++) {
uint32_t k = (i * rows + j) * 2;
if (ri_present[k]) {
/* do nothing */
} else {
uint32_t read_byte_idx = bit_read_idx / 8;
uint32_t read_bit_idx = bit_read_idx % 8;
uint32_t write_byte_idx = k / 8;
uint32_t write_bit_idx = k % 8;
uint8_t w = (g_bits[read_byte_idx] >> (6 - read_bit_idx)) & (uint8_t) 0x03;
q_bits[write_byte_idx] |= w << (6 - write_bit_idx);
bit_read_idx += 2;
}
}
}
}
static void ulsch_interleave_qm4(uint8_t *g_bits, uint32_t rows, uint32_t cols, uint8_t *q_bits, uint32_t ri_min_row, const uint8_t *ri_present) {
uint32_t bit_read_idx = 0;
for (uint32_t j = 0; j < ri_min_row; j++) {
int32_t i = 0;
#ifndef LV_HAVE_SSE
__m128i _counter = _mm_slli_epi32(_mm_add_epi32(_mm_mullo_epi32(_counter0,_rows),_mm_set1_epi32(j)), 2);
uint8_t *_g_bits = &g_bits[bit_read_idx/8];
/* First bits are aligned to byte */
if (0 == (bit_read_idx & 0x3)) {
for (; i < (cols - 3); i += 4) {
uint8_t w1 = *(_g_bits++);
uint8_t w2 = *(_g_bits++);
__m128i _write_byte_idx = _mm_srli_epi32(_counter, 3);
__m128i _write_bit_idx = _mm_and_si128(_counter, _7);
__m128i _write_shift = _mm_sub_epi32(_4, _write_bit_idx);
q_bits[_mm_extract_epi32(_write_byte_idx, 0)] |= (w1 >> 0x4) << _mm_extract_epi32(_write_shift, 0);
q_bits[_mm_extract_epi32(_write_byte_idx, 1)] |= (w1 & 0xf) << _mm_extract_epi32(_write_shift, 1);
q_bits[_mm_extract_epi32(_write_byte_idx, 2)] |= (w2 >> 0x4) << _mm_extract_epi32(_write_shift, 2);
q_bits[_mm_extract_epi32(_write_byte_idx, 3)] |= (w2 & 0xf) << _mm_extract_epi32(_write_shift, 3);
_counter = _mm_add_epi32(_counter, _inc);
}
} else {
for (; i < (cols - 3); i += 4) {
__m128i _write_byte_idx = _mm_srli_epi32(_counter, 3);
__m128i _write_bit_idx = _mm_and_si128(_counter, _7);
__m128i _write_shift = _mm_sub_epi32(_4, _write_bit_idx);
uint8_t w1 = *(_g_bits);
uint8_t w2 = *(_g_bits++);
uint8_t w3 = *(_g_bits++);
q_bits[_mm_extract_epi32(_write_byte_idx, 0)] |= (w1 & 0xf) << _mm_extract_epi32(_write_shift, 0);
q_bits[_mm_extract_epi32(_write_byte_idx, 1)] |= (w2 >> 0x4) << _mm_extract_epi32(_write_shift, 1);
q_bits[_mm_extract_epi32(_write_byte_idx, 2)] |= (w2 & 0xf) << _mm_extract_epi32(_write_shift, 2);
q_bits[_mm_extract_epi32(_write_byte_idx, 3)] |= (w3 >> 0x4) << _mm_extract_epi32(_write_shift, 3);
_counter = _mm_add_epi32(_counter, _inc);
}
}
bit_read_idx += i * 4;
#endif /* LV_HAVE_SSE */
/* Spare bits */
for (; i < cols; i++) {
uint32_t k = (i * rows + j) * 4;
uint32_t read_byte_idx = bit_read_idx / 8;
uint32_t read_bit_idx = bit_read_idx % 8;
uint32_t write_byte_idx = k / 8;
uint32_t write_bit_idx = k % 8;
uint8_t w = (g_bits[read_byte_idx] >> (4 - read_bit_idx)) & (uint8_t) 0x0f;
q_bits[write_byte_idx] |= w << (4 - write_bit_idx);
bit_read_idx += 4;
}
}
/* Do rows containing RI */
for (uint32_t j = ri_min_row; j < rows; j++) {
for (uint32_t i = 0; i < cols; i++) {
uint32_t k = (i * rows + j) * 4;
if (ri_present[k]) {
/* do nothing */
} else {
uint32_t read_byte_idx = bit_read_idx / 8;
uint32_t read_bit_idx = bit_read_idx % 8;
uint32_t write_byte_idx = k / 8;
uint32_t write_bit_idx = k % 8;
uint8_t w = (g_bits[read_byte_idx] >> (4 - read_bit_idx)) & (uint8_t) 0x0f;
q_bits[write_byte_idx] |= w << (4 - write_bit_idx);
bit_read_idx += 4;
}
}
}
}
static void ulsch_interleave_qm6(const uint8_t *g_bits,
uint32_t rows,
uint32_t cols,
uint8_t *q_bits,
uint32_t ri_min_row,
const uint8_t *ri_present) {
uint32_t bit_read_idx = 0;
for (uint32_t j = 0; j < ri_min_row; j++) {
for (uint32_t i = 0; i < cols; i++) {
uint32_t k = (i * rows + j) * 6;
uint32_t read_byte_idx = bit_read_idx / 8;
uint32_t read_bit_idx = bit_read_idx % 8;
uint32_t write_byte_idx = k / 8;
uint32_t write_bit_idx = k % 8;
uint8_t w;
switch (read_bit_idx) {
case 0:
w = g_bits[read_byte_idx] >> 2;
break;
case 2:
w = g_bits[read_byte_idx] & (uint8_t) 0x3f;
break;
case 4:
w = ((g_bits[read_byte_idx] << 2) | (g_bits[read_byte_idx + 1] >> 6)) & (uint8_t) 0x3f;
break;
case 6:
w = ((g_bits[read_byte_idx] << 4) | (g_bits[read_byte_idx + 1] >> 4)) & (uint8_t) 0x3f;
break;
default:
w = 0;
}
switch (write_bit_idx) {
case 0:
q_bits[write_byte_idx] |= w << 2;
break;
case 2:
q_bits[write_byte_idx] |= w;
break;
case 4:
q_bits[write_byte_idx] |= w >> 2;
q_bits[write_byte_idx + 1] |= w << 6;
break;
case 6:
q_bits[write_byte_idx] |= w >> 4;
q_bits[write_byte_idx + 1] |= w << 4;
break;
default:
/* Do nothing */;
}
bit_read_idx += 6;
}
}
for (uint32_t j = ri_min_row; j < rows; j++) {
for (uint32_t i = 0; i < cols; i++) {
uint32_t k = (i * rows + j) * 6;
if (ri_present[k]) {
/* do nothing */
} else {
uint32_t read_byte_idx = bit_read_idx / 8;
uint32_t read_bit_idx = bit_read_idx % 8;
uint32_t write_byte_idx = k / 8;
uint32_t write_bit_idx = k % 8;
uint8_t w;
switch (read_bit_idx) {
case 0:
w = g_bits[read_byte_idx] >> 2;
break;
case 2:
w = g_bits[read_byte_idx] & (uint8_t) 0x3f;
break;
case 4:
w = ((g_bits[read_byte_idx] << 2) | (g_bits[read_byte_idx + 1] >> 6)) & (uint8_t) 0x3f;
break;
case 6:
w = ((g_bits[read_byte_idx] << 4) | (g_bits[read_byte_idx + 1] >> 4)) & (uint8_t) 0x3f;
break;
default:
w = 0;
}
switch (write_bit_idx) {
case 0:
q_bits[write_byte_idx] |= w << 2;
break;
case 2:
q_bits[write_byte_idx] |= w;
break;
case 4:
q_bits[write_byte_idx] |= w >> 2;
q_bits[write_byte_idx + 1] |= w << 6;
break;
case 6:
q_bits[write_byte_idx] |= w >> 4;
q_bits[write_byte_idx + 1] |= w << 4;
break;
default:
/* Do nothing */;
}
bit_read_idx += 6;
}
}
}
}
/* UL-SCH channel interleaver according to 5.2.2.8 of 36.212 */ /* UL-SCH channel interleaver according to 5.2.2.8 of 36.212 */
void ulsch_interleave(uint8_t *g_bits, uint32_t Qm, uint32_t H_prime_total, void ulsch_interleave(uint8_t *g_bits, uint32_t Qm, uint32_t H_prime_total,
uint32_t N_pusch_symbs, uint8_t *q_bits, srslte_uci_bit_t *ri_bits, uint32_t nof_ri_bits, uint32_t N_pusch_symbs, uint8_t *q_bits, srslte_uci_bit_t *ri_bits, uint32_t nof_ri_bits,
uint8_t *ri_present, uint32_t *inteleaver_lut) uint8_t *ri_present, uint32_t *inteleaver_lut)
{ {
const uint32_t nof_bits = H_prime_total * Qm;
uint32_t rows = H_prime_total / N_pusch_symbs;
uint32_t cols = N_pusch_symbs;
uint32_t ri_min_row = rows;
// Prepare ri_bits for fast search using temp_buffer // Prepare ri_bits for fast search using temp_buffer
if (nof_ri_bits > 0) { if (nof_ri_bits > 0) {
for (uint32_t i=0;i<nof_ri_bits;i++) { for (uint32_t i=0;i<nof_ri_bits;i++) {
uint32_t ri_row = (ri_bits[i].position / Qm) % rows;
if (ri_row < ri_min_row) {
ri_min_row = ri_row;
}
ri_present[ri_bits[i].position] = 1; ri_present[ri_bits[i].position] = 1;
} }
} }
#if 1
bzero(q_bits, nof_bits / 8);
switch (Qm) {
case 2:
ulsch_interleave_qm2(g_bits, rows, cols, q_bits, ri_min_row, ri_present);
break;
case 4:
ulsch_interleave_qm4(g_bits, rows, cols, q_bits, ri_min_row, ri_present);
break;
case 6:
ulsch_interleave_qm6(g_bits, rows, cols, q_bits, ri_min_row, ri_present);
break;
default:
/* This line should never be reached */
fprintf(stderr, "Wrong Qm (%d)\n", Qm);
}
#else
// Genearate interleaver table and interleave bits // Genearate interleaver table and interleave bits
ulsch_interleave_gen(H_prime_total, N_pusch_symbs, Qm, ri_present, inteleaver_lut); ulsch_interleave_gen(H_prime_total, N_pusch_symbs, Qm, ri_present, inteleaver_lut);
srslte_bit_interleave_i(g_bits, q_bits, inteleaver_lut, H_prime_total*Qm); srslte_bit_interleave_i(g_bits, q_bits, inteleaver_lut, H_prime_total*Qm);
#endif
// Reset temp_buffer because will be reused next time // Reset temp_buffer because will be reused next time
if (nof_ri_bits > 0) { if (nof_ri_bits > 0) {

@ -63,18 +63,23 @@ bool tb_cw_swap = false;
bool enable_coworker = false; bool enable_coworker = false;
uint32_t pmi = 0; uint32_t pmi = 0;
char *input_file = NULL; char *input_file = NULL;
int M=1;
bool use_8_bit = false;
void usage(char *prog) { void usage(char *prog) {
printf("Usage: %s [fmMcsrtRFpnwav] \n", prog); printf("Usage: %s [fmMbcsrtRFpnwav] \n", prog);
printf("\t-f read signal from file [Default generate it with pdsch_encode()]\n"); printf("\t-f read signal from file [Default generate it with pdsch_encode()]\n");
printf("\t-m MCS [Default %d]\n", mcs[0]); printf("\t-m MCS [Default %d]\n", mcs[0]);
printf("\t-M MCS2 [Default %d]\n", mcs[1]); printf("\t-M MCS2 [Default %d]\n", mcs[1]);
printf("\t-c cell id [Default %d]\n", cell.id); printf("\t-c cell id [Default %d]\n", cell.id);
printf("\t-b Use 8-bit LLR [Default 16-bit]\n");
printf("\t-s subframe [Default %d]\n", subframe); printf("\t-s subframe [Default %d]\n", subframe);
printf("\t-r rv_idx [Default %d]\n", rv_idx[0]); printf("\t-r rv_idx [Default %d]\n", rv_idx[0]);
printf("\t-t rv_idx2 [Default %d]\n", rv_idx[1]); printf("\t-t rv_idx2 [Default %d]\n", rv_idx[1]);
printf("\t-R rnti [Default %d]\n", rnti); printf("\t-R rnti [Default %d]\n", rnti);
printf("\t-F cfi [Default %d]\n", cfi); printf("\t-F cfi [Default %d]\n", cfi);
printf("\t-X Number of repetitions for time measurement [Default %d]\n", M);
printf("\t-x Transmission mode [single|diversity|cdd|multiplex] [Default %s]\n", mimo_type_str); printf("\t-x Transmission mode [single|diversity|cdd|multiplex] [Default %s]\n", mimo_type_str);
printf("\t-n cell.nof_prb [Default %d]\n", cell.nof_prb); printf("\t-n cell.nof_prb [Default %d]\n", cell.nof_prb);
printf("\t-a nof_rx_antennas [Default %d]\n", nof_rx_antennas); printf("\t-a nof_rx_antennas [Default %d]\n", nof_rx_antennas);
@ -86,7 +91,7 @@ void usage(char *prog) {
void parse_args(int argc, char **argv) { void parse_args(int argc, char **argv) {
int opt; int opt;
while ((opt = getopt(argc, argv, "fmMcsrtRFpnawvxj")) != -1) { while ((opt = getopt(argc, argv, "fmMcsbrtRFpnawvXxj")) != -1) {
switch(opt) { switch(opt) {
case 'f': case 'f':
input_file = argv[optind]; input_file = argv[optind];
@ -94,12 +99,18 @@ void parse_args(int argc, char **argv) {
case 'm': case 'm':
mcs[0] = (uint32_t) atoi(argv[optind]); mcs[0] = (uint32_t) atoi(argv[optind]);
break; break;
case 'b':
use_8_bit = true;
break;
case 'M': case 'M':
mcs[1] = (uint32_t) atoi(argv[optind]); mcs[1] = (uint32_t) atoi(argv[optind]);
break; break;
case 's': case 's':
subframe = atoi(argv[optind]); subframe = atoi(argv[optind]);
break; break;
case 'X':
M = (uint32_t) atoi(argv[optind]);
break;
case 'r': case 'r':
rv_idx[0] = (uint32_t) atoi(argv[optind]); rv_idx[0] = (uint32_t) atoi(argv[optind]);
break; break;
@ -166,7 +177,6 @@ int main(int argc, char **argv) {
int ret = -1; int ret = -1;
struct timeval t[3]; struct timeval t[3];
srslte_softbuffer_tx_t *softbuffers_tx[SRSLTE_MAX_CODEWORDS]; srslte_softbuffer_tx_t *softbuffers_tx[SRSLTE_MAX_CODEWORDS];
int M=1;
bool acks[SRSLTE_MAX_CODEWORDS] = {false}; bool acks[SRSLTE_MAX_CODEWORDS] = {false};
parse_args(argc,argv); parse_args(argc,argv);
@ -319,6 +329,9 @@ int main(int argc, char **argv) {
goto quit; goto quit;
} }
pdsch_rx.llr_is_8bit = use_8_bit;
pdsch_rx.dl_sch.llr_is_8bit = use_8_bit;
srslte_pdsch_set_rnti(&pdsch_rx, rnti); srslte_pdsch_set_rnti(&pdsch_rx, rnti);
for (i = 0; i < SRSLTE_MAX_CODEWORDS; i++) { for (i = 0; i < SRSLTE_MAX_CODEWORDS; i++) {
@ -517,8 +530,8 @@ int main(int argc, char **argv) {
for (int byte = 0; byte < grant.mcs[tb].tbs / 8; byte++) { for (int byte = 0; byte < grant.mcs[tb].tbs / 8; byte++) {
if (data_tx[tb][byte] != data_rx[tb][byte]) { if (data_tx[tb][byte] != data_rx[tb][byte]) {
ERROR("Found BYTE (%d) error in TB %d (%02X != %02X), quiting...", byte, tb, data_tx[tb][byte], data_rx[tb][byte]); ERROR("Found BYTE (%d) error in TB %d (%02X != %02X), quiting...", byte, tb, data_tx[tb][byte], data_rx[tb][byte]);
printf("Tx: "); srslte_vec_fprint_byte(stdout, data_tx[tb], grant.mcs[tb].tbs / 8); //printf("Tx: "); srslte_vec_fprint_byte(stdout, data_tx[tb], grant.mcs[tb].tbs / 8);
printf("Rx: "); srslte_vec_fprint_byte(stdout, data_rx[tb], grant.mcs[tb].tbs / 8); //printf("Rx: "); srslte_vec_fprint_byte(stdout, data_rx[tb], grant.mcs[tb].tbs / 8);
ret = SRSLTE_ERROR; ret = SRSLTE_ERROR;
goto quit; goto quit;
} }

@ -47,7 +47,12 @@ void srslte_scrambling_s(srslte_sequence_t *s, short *data) {
void srslte_scrambling_s_offset(srslte_sequence_t *s, short *data, int offset, int len) { void srslte_scrambling_s_offset(srslte_sequence_t *s, short *data, int offset, int len) {
assert (len + offset <= s->cur_len); assert (len + offset <= s->cur_len);
srslte_vec_prod_sss(data, &s->c_short[offset], data, len); srslte_vec_neg_sss(data, &s->c_short[offset], data, len);
}
void srslte_scrambling_sb_offset(srslte_sequence_t *s, int8_t *data, int offset, int len) {
assert (len + offset <= s->cur_len);
srslte_vec_neg_bbb(data, &s->c_char[offset], data, len);
} }
void srslte_scrambling_c(srslte_sequence_t *s, cf_t *data) { void srslte_scrambling_c(srslte_sequence_t *s, cf_t *data) {

@ -177,6 +177,83 @@ int main(int argc, char **argv) {
free(input_f); free(input_f);
free(scrambled_f); free(scrambled_f);
int16_t *input_s, *scrambled_s;
// Scramble also shorts
input_s= malloc(sizeof(int16_t) * seq.cur_len);
if (!input_s) {
perror("malloc");
exit(-1);
}
scrambled_s = malloc(sizeof(int16_t) * seq.cur_len);
if (!scrambled_s) {
perror("malloc");
exit(-1);
}
for (i=0;i<seq.cur_len;i++) {
input_s[i] = 100*(rand()/RAND_MAX)-50;
scrambled_s[i] = input_s[i];
}
gettimeofday(&t[1], NULL);
srslte_scrambling_s(&seq, scrambled_s);
gettimeofday(&t[2], NULL);
srslte_scrambling_s(&seq, scrambled_s);
get_time_interval(t);
printf("Texec short=%ld us for %d bits\n", t[0].tv_usec, seq.cur_len);
for (i=0;i<seq.cur_len;i++) {
if (scrambled_s[i] != input_s[i]) {
printf("Error in %d\n", i);
exit(-1);
}
}
free(input_s);
free(scrambled_s);
int8_t *input_b, *scrambled_b;
// Scramble also bytes
input_b= malloc(sizeof(int8_t) * seq.cur_len);
if (!input_b) {
perror("malloc");
exit(-1);
}
scrambled_b = malloc(sizeof(int8_t) * seq.cur_len);
if (!scrambled_b) {
perror("malloc");
exit(-1);
}
for (i=0;i<seq.cur_len;i++) {
input_b[i] = 100*(rand()/RAND_MAX)-50;
scrambled_b[i] = input_b[i];
}
gettimeofday(&t[1], NULL);
srslte_scrambling_sb_offset(&seq, scrambled_b, 0, seq.cur_len);
gettimeofday(&t[2], NULL);
srslte_scrambling_sb_offset(&seq, scrambled_b, 0, seq.cur_len);
get_time_interval(t);
printf("Texec char=%ld us for %d bits\n", t[0].tv_usec, seq.cur_len);
for (i=0;i<seq.cur_len;i++) {
if (scrambled_b[i] != input_b[i]) {
printf("Error in %d\n", i);
exit(-1);
}
}
free(input_b);
free(scrambled_b);
} }
printf("Ok\n"); printf("Ok\n");
srslte_sequence_free(&seq); srslte_sequence_free(&seq);

@ -58,6 +58,10 @@ void srslte_vec_sub_sss(const int16_t *x, const int16_t *y, int16_t *z, const ui
srslte_vec_sub_sss_simd(x, y, z, len); srslte_vec_sub_sss_simd(x, y, z, len);
} }
void srslte_vec_sub_bbb(const int8_t *x, const int8_t *y, int8_t *z, const uint32_t len) {
srslte_vec_sub_bbb_simd(x, y, z, len);
}
// Noise estimation in chest_dl, interpolation // Noise estimation in chest_dl, interpolation
void srslte_vec_sub_ccc(const cf_t *x, const cf_t *y, cf_t *z, const uint32_t len) { void srslte_vec_sub_ccc(const cf_t *x, const cf_t *y, cf_t *z, const uint32_t len) {
return srslte_vec_sub_fff((const float*) x,(const float*) y,(float*) z, 2*len); return srslte_vec_sub_fff((const float*) x,(const float*) y,(float*) z, 2*len);
@ -100,10 +104,18 @@ void srslte_vec_convert_fi(const float *x, const float scale, int16_t *z, const
srslte_vec_convert_fi_simd(x, z, scale, len); srslte_vec_convert_fi_simd(x, z, scale, len);
} }
void srslte_vec_convert_fb(const float *x, const float scale, int8_t *z, const uint32_t len) {
srslte_vec_convert_fb_simd(x, z, scale, len);
}
void srslte_vec_lut_sss(const short *x, const unsigned short *lut, short *y, const uint32_t len) { void srslte_vec_lut_sss(const short *x, const unsigned short *lut, short *y, const uint32_t len) {
srslte_vec_lut_sss_simd(x, lut, y, len); srslte_vec_lut_sss_simd(x, lut, y, len);
} }
void srslte_vec_lut_bbb(const int8_t *x, const unsigned short *lut, int8_t *y, const uint32_t len) {
srslte_vec_lut_bbb_simd(x, lut, y, len);
}
void srslte_vec_lut_sis(const short *x, const unsigned int *lut, short *y, const uint32_t len) { void srslte_vec_lut_sis(const short *x, const unsigned int *lut, short *y, const uint32_t len) {
for (int i=0; i < len; i++) { for (int i=0; i < len; i++) {
y[lut[i]] = x[i]; y[lut[i]] = x[i];
@ -163,6 +175,15 @@ void srslte_vec_fprint_b(FILE *stream, uint8_t *x, const uint32_t len) {
fprintf(stream, "];\n"); fprintf(stream, "];\n");
} }
void srslte_vec_fprint_bs(FILE *stream, int8_t *x, const uint32_t len) {
int i;
fprintf(stream, "[");
for (i=0;i<len;i++) {
fprintf(stream, "%4d, ", x[i]);
}
fprintf(stream, "];\n");
}
void srslte_vec_fprint_byte(FILE *stream, uint8_t *x, const uint32_t len) { void srslte_vec_fprint_byte(FILE *stream, uint8_t *x, const uint32_t len) {
int i; int i;
fprintf(stream, "["); fprintf(stream, "[");
@ -185,7 +206,7 @@ void srslte_vec_fprint_s(FILE *stream, short *x, const uint32_t len) {
int i; int i;
fprintf(stream, "["); fprintf(stream, "[");
for (i=0;i<len;i++) { for (i=0;i<len;i++) {
fprintf(stream, "%d, ", x[i]); fprintf(stream, "%4d, ", x[i]);
} }
fprintf(stream, "];\n"); fprintf(stream, "];\n");
} }
@ -271,11 +292,18 @@ void srslte_vec_prod_fff(const float *x, const float *y, float *z, const uint32_
srslte_vec_prod_fff_simd(x, y, z, len); srslte_vec_prod_fff_simd(x, y, z, len);
} }
// Scrambling Short
void srslte_vec_prod_sss(const int16_t *x, const int16_t *y, int16_t *z, const uint32_t len) { void srslte_vec_prod_sss(const int16_t *x, const int16_t *y, int16_t *z, const uint32_t len) {
srslte_vec_prod_sss_simd(x,y,z,len); srslte_vec_prod_sss_simd(x,y,z,len);
} }
// Scrambling
void srslte_vec_neg_sss(const int16_t *x, const int16_t *y, int16_t *z, const uint32_t len) {
srslte_vec_neg_sss_simd(x,y,z,len);
}
void srslte_vec_neg_bbb(const int8_t *x, const int8_t *y, int8_t *z, const uint32_t len) {
srslte_vec_neg_bbb_simd(x,y,z,len);
}
// CFO and OFDM processing // CFO and OFDM processing
void srslte_vec_prod_ccc(const cf_t *x, const cf_t *y, cf_t *z, const uint32_t len) { void srslte_vec_prod_ccc(const cf_t *x, const cf_t *y, cf_t *z, const uint32_t len) {
srslte_vec_prod_ccc_simd(x,y,z,len); srslte_vec_prod_ccc_simd(x,y,z,len);

@ -162,6 +162,35 @@ void srslte_vec_sub_sss_simd(const int16_t *x, const int16_t *y, int16_t *z, con
} }
} }
void srslte_vec_sub_bbb_simd(const int8_t *x, const int8_t *y, int8_t *z, const int len) {
int i = 0;
#if SRSLTE_SIMD_B_SIZE
if (SRSLTE_IS_ALIGNED(x) && SRSLTE_IS_ALIGNED(y) && SRSLTE_IS_ALIGNED(z)) {
for (; i < len - SRSLTE_SIMD_B_SIZE + 1; i += SRSLTE_SIMD_B_SIZE) {
simd_s_t a = srslte_simd_b_load(&x[i]);
simd_s_t b = srslte_simd_b_load(&y[i]);
simd_s_t r = srslte_simd_b_sub(a, b);
srslte_simd_b_store(&z[i], r);
}
} else {
for (; i < len - SRSLTE_SIMD_S_SIZE + 1; i += SRSLTE_SIMD_S_SIZE) {
simd_s_t a = srslte_simd_b_loadu(&x[i]);
simd_s_t b = srslte_simd_b_loadu(&y[i]);
simd_s_t r = srslte_simd_b_sub(a, b);
srslte_simd_b_storeu(&z[i], r);
}
}
#endif /* SRSLTE_SIMD_S_SIZE */
for(; i < len; i++){
z[i] = x[i] - y[i];
}
}
void srslte_vec_prod_sss_simd(const int16_t *x, const int16_t *y, int16_t *z, const int len) { void srslte_vec_prod_sss_simd(const int16_t *x, const int16_t *y, int16_t *z, const int len) {
int i = 0; int i = 0;
#if SRSLTE_SIMD_S_SIZE #if SRSLTE_SIMD_S_SIZE
@ -191,12 +220,69 @@ void srslte_vec_prod_sss_simd(const int16_t *x, const int16_t *y, int16_t *z, co
} }
} }
void srslte_vec_neg_sss_simd(const int16_t *x, const int16_t *y, int16_t *z, const int len) {
int i = 0;
#if SRSLTE_SIMD_S_SIZE
if (SRSLTE_IS_ALIGNED(x) && SRSLTE_IS_ALIGNED(y) && SRSLTE_IS_ALIGNED(z)) {
for (; i < len - SRSLTE_SIMD_S_SIZE + 1; i += SRSLTE_SIMD_S_SIZE) {
simd_s_t a = srslte_simd_s_load(&x[i]);
simd_s_t b = srslte_simd_s_load(&y[i]);
simd_s_t r = srslte_simd_s_neg(a, b);
srslte_simd_s_store(&z[i], r);
}
} else {
for (; i < len - SRSLTE_SIMD_S_SIZE + 1; i += SRSLTE_SIMD_S_SIZE) {
simd_s_t a = srslte_simd_s_loadu(&x[i]);
simd_s_t b = srslte_simd_s_loadu(&y[i]);
simd_s_t r = srslte_simd_s_neg(a, b);
srslte_simd_s_storeu(&z[i], r);
}
}
#endif /* SRSLTE_SIMD_S_SIZE */
for(; i < len; i++){
z[i] = y[i]<0?-x[i]:x[i];
}
}
void srslte_vec_neg_bbb_simd(const int8_t *x, const int8_t *y, int8_t *z, const int len) {
int i = 0;
#if SRSLTE_SIMD_B_SIZE
if (SRSLTE_IS_ALIGNED(x) && SRSLTE_IS_ALIGNED(y) && SRSLTE_IS_ALIGNED(z)) {
for (; i < len - SRSLTE_SIMD_B_SIZE + 1; i += SRSLTE_SIMD_B_SIZE) {
simd_s_t a = srslte_simd_b_load(&x[i]);
simd_s_t b = srslte_simd_b_load(&y[i]);
simd_s_t r = srslte_simd_b_neg(a, b);
srslte_simd_b_store(&z[i], r);
}
} else {
for (; i < len - SRSLTE_SIMD_B_SIZE + 1; i += SRSLTE_SIMD_B_SIZE) {
simd_s_t a = srslte_simd_b_loadu(&x[i]);
simd_s_t b = srslte_simd_b_loadu(&y[i]);
simd_s_t r = srslte_simd_b_neg(a, b);
srslte_simd_b_storeu(&z[i], r);
}
}
#endif /* SRSLTE_SIMD_S_SIZE */
for(; i < len; i++){
z[i] = y[i]<0?-x[i]:x[i];
}
}
/* No improvement with AVX */ /* No improvement with AVX */
void srslte_vec_lut_sss_simd(const short *x, const unsigned short *lut, short *y, const int len) { void srslte_vec_lut_sss_simd(const short *x, const unsigned short *lut, short *y, const int len) {
int i = 0; int i = 0;
#ifdef LV_HAVE_SSE #ifdef LV_HAVE_SSE
#if CMAKE_BUILD_TYPE!=Debug #ifndef DEBUG_MODE
if (SRSLTE_IS_ALIGNED(x) && SRSLTE_IS_ALIGNED(lut)) { if (SRSLTE_IS_ALIGNED(x) && SRSLTE_IS_ALIGNED(lut)) {
for (; i < len - 7; i += 8) { for (; i < len - 7; i += 8) {
__m128i xVal = _mm_load_si128((__m128i *) &x[i]); __m128i xVal = _mm_load_si128((__m128i *) &x[i]);
@ -228,6 +314,53 @@ void srslte_vec_lut_sss_simd(const short *x, const unsigned short *lut, short *y
} }
} }
void srslte_vec_lut_bbb_simd(const int8_t *x, const unsigned short *lut, int8_t *y, const int len) {
int i = 0;
#ifdef LV_HAVE_SSE
#ifndef DEBUG_MODE
if (SRSLTE_IS_ALIGNED(x) && SRSLTE_IS_ALIGNED(lut)) {
for (; i < len - 15; i += 16) {
__m128i xVal = _mm_load_si128((__m128i *) &x[i]);
__m128i lutVal1 = _mm_load_si128((__m128i *) &lut[i]);
__m128i lutVal2 = _mm_load_si128((__m128i *) &lut[i+8]);
for (int k = 0; k < 8; k++) {
int8_t x = (int8_t) _mm_extract_epi8(xVal, k);
uint16_t l = (uint16_t) _mm_extract_epi16(lutVal1, k);
y[l] = (char) x;
}
for (int k = 0; k < 8; k++) {
int8_t x = (int8_t) _mm_extract_epi8(xVal, k+8);
uint16_t l = (uint16_t) _mm_extract_epi16(lutVal2, k);
y[l] = (char) x;
}
}
} else {
for (; i < len - 15; i += 16) {
__m128i xVal = _mm_loadu_si128((__m128i *) &x[i]);
__m128i lutVal1 = _mm_loadu_si128((__m128i *) &lut[i]);
__m128i lutVal2 = _mm_loadu_si128((__m128i *) &lut[i+8]);
for (int k = 0; k < 8; k++) {
int8_t x = (int8_t) _mm_extract_epi8(xVal, k);
uint16_t l = (uint16_t) _mm_extract_epi16(lutVal1, k);
y[l] = (char) x;
}
for (int k = 0; k < 8; k++) {
int8_t x = (int8_t) _mm_extract_epi8(xVal, k+8);
uint16_t l = (uint16_t) _mm_extract_epi16(lutVal2, k);
y[l] = (char) x;
}
}
}
#endif
#endif
for (; i < len; i++) {
y[lut[i]] = x[i];
}
}
void srslte_vec_convert_if_simd(const int16_t *x, float *z, const float scale, const int len) { void srslte_vec_convert_if_simd(const int16_t *x, float *z, const float scale, const int len) {
int i = 0; int i = 0;
const float gain = 1.0f / scale; const float gain = 1.0f / scale;
@ -295,6 +428,73 @@ void srslte_vec_convert_fi_simd(const float *x, int16_t *z, const float scale, c
} }
} }
#define SRSLTE_IS_ALIGNED_SSE(PTR) (((size_t)(PTR) & 0x0F) == 0)
void srslte_vec_convert_fb_simd(const float *x, int8_t *z, const float scale, const int len) {
int i = 0;
// Force the use of SSE here instead of AVX since the implementations requires too many permutes across 128-bit boundaries
#ifdef LV_HAVE_SSE
__m128 s = _mm_set1_ps(scale);
if (SRSLTE_IS_ALIGNED_SSE(x) && SRSLTE_IS_ALIGNED_SSE(z)) {
for (; i < len - 16 + 1; i += 16) {
__m128 a = _mm_load_ps(&x[i]);
__m128 b = _mm_load_ps(&x[i + 1*4]);
__m128 c = _mm_load_ps(&x[i + 2*4]);
__m128 d = _mm_load_ps(&x[i + 3*4]);
__m128 sa = _mm_mul_ps(a, s);
__m128 sb = _mm_mul_ps(b, s);
__m128 sc = _mm_mul_ps(c, s);
__m128 sd = _mm_mul_ps(d, s);
__m128i ai = _mm_cvttps_epi32(sa);
__m128i bi = _mm_cvttps_epi32(sb);
__m128i ci = _mm_cvttps_epi32(sc);
__m128i di = _mm_cvttps_epi32(sd);
__m128i ab = _mm_packs_epi32(ai, bi);
__m128i cd = _mm_packs_epi32(ci, di);
__m128i i8 =_mm_packs_epi16(ab, cd);
_mm_store_si128((__m128i*)&z[i], i8);
}
} else {
for (; i < len - 16 + 1; i += 16) {
__m128 a = _mm_load_ps(&x[i]);
__m128 b = _mm_load_ps(&x[i + 1*4]);
__m128 c = _mm_load_ps(&x[i + 2*4]);
__m128 d = _mm_load_ps(&x[i + 3*4]);
__m128 sa = _mm_mul_ps(a, s);
__m128 sb = _mm_mul_ps(b, s);
__m128 sc = _mm_mul_ps(c, s);
__m128 sd = _mm_mul_ps(d, s);
__m128i ai = _mm_cvttps_epi32(sa);
__m128i bi = _mm_cvttps_epi32(sb);
__m128i ci = _mm_cvttps_epi32(sc);
__m128i di = _mm_cvttps_epi32(sd);
__m128i ab = _mm_packs_epi32(ai, bi);
__m128i cd = _mm_packs_epi32(ci, di);
__m128i i8 =_mm_packs_epi16(ab, cd);
_mm_storeu_si128((__m128i*)&z[i], i8);
}
}
#endif
#ifdef HAVE_NEON
#warning srslte_vec_convert_fb_simd not implemented in neon
#endif /* HAVE_NEON */
for(; i < len; i++){
z[i] = (int8_t) (x[i] * scale);
}
}
float srslte_vec_acc_ff_simd(const float *x, const int len) { float srslte_vec_acc_ff_simd(const float *x, const int len) {
int i = 0; int i = 0;
float acc_sum = 0.0f; float acc_sum = 0.0f;

@ -141,7 +141,8 @@ nof_ctrl_symbols = 3
##################################################################### #####################################################################
# Expert configuration options # Expert configuration options
# #
# pdsch_max_its: Maximum number of turbo decoder iterations (Default 4) # pusch_max_its: Maximum number of turbo decoder iterations (Default 4)
# pusch_8bit_decoder: Use 8-bit for LLR representation and turbo decoder trellis computation (Experimental)
# nof_phy_threads: Selects the number of PHY threads (maximum 4, minimum 1, default 2) # nof_phy_threads: Selects the number of PHY threads (maximum 4, minimum 1, default 2)
# metrics_period_secs: Sets the period at which metrics are requested from the UE. # metrics_period_secs: Sets the period at which metrics are requested from the UE.
# pregenerate_signals: Pregenerate uplink signals after attach. Improves CPU performance. # pregenerate_signals: Pregenerate uplink signals after attach. Improves CPU performance.
@ -152,7 +153,8 @@ nof_ctrl_symbols = 3
# #
##################################################################### #####################################################################
[expert] [expert]
#pdsch_max_its = 4 #pusch_max_its = 8 # These are half iterations
#pusch_8bit_decoder = false
#nof_phy_threads = 2 #nof_phy_threads = 2
#pregenerate_signals = false #pregenerate_signals = false
#tx_amplitude = 0.6 #tx_amplitude = 0.6

@ -41,6 +41,7 @@ namespace srsenb {
typedef struct { typedef struct {
float max_prach_offset_us; float max_prach_offset_us;
int pusch_max_its; int pusch_max_its;
bool pusch_8bit_decoder;
float tx_amplitude; float tx_amplitude;
int nof_phy_threads; int nof_phy_threads;
std::string equalizer_mode; std::string equalizer_mode;

@ -157,9 +157,13 @@ void parse_args(all_args_t *args, int argc, char* argv[]) {
"Pregenerate uplink signals after attach. Improves CPU performance.") "Pregenerate uplink signals after attach. Improves CPU performance.")
("expert.pusch_max_its", ("expert.pusch_max_its",
bpo::value<int>(&args->expert.phy.pusch_max_its)->default_value(4), bpo::value<int>(&args->expert.phy.pusch_max_its)->default_value(8),
"Maximum number of turbo decoder iterations") "Maximum number of turbo decoder iterations")
("expert.pusch_8bit_decoder",
bpo::value<bool>(&args->expert.phy.pusch_8bit_decoder)->default_value(false),
"Use 8-bit for LLR representation and turbo decoder trellis computation (Experimental)")
("expert.tx_amplitude", ("expert.tx_amplitude",
bpo::value<float>(&args->expert.phy.tx_amplitude)->default_value(0.6), bpo::value<float>(&args->expert.phy.tx_amplitude)->default_value(0.6),
"Transmit amplitude factor") "Transmit amplitude factor")

@ -154,6 +154,10 @@ void phch_worker::init(phch_common* phy_, srslte::log *log_h_)
Info("Worker %d configured cell %d PRB\n", get_id(), phy->cell.nof_prb); Info("Worker %d configured cell %d PRB\n", get_id(), phy->cell.nof_prb);
if (phy->params.pusch_8bit_decoder) {
enb_ul.pusch.llr_is_8bit = true;
enb_ul.pusch.ul_sch.llr_is_8bit = true;
}
initiated = true; initiated = true;
running = true; running = true;

@ -214,7 +214,7 @@ void parse_args(all_args_t *args, int argc, char *argv[]) {
"Sets the noise estimation algorithm. (Default refs)") "Sets the noise estimation algorithm. (Default refs)")
("expert.pdsch_max_its", ("expert.pdsch_max_its",
bpo::value<int>(&args->expert.phy.pdsch_max_its)->default_value(4), bpo::value<int>(&args->expert.phy.pdsch_max_its)->default_value(8),
"Maximum number of turbo decoder iterations") "Maximum number of turbo decoder iterations")
("expert.attach_enable_64qam", ("expert.attach_enable_64qam",
@ -307,6 +307,10 @@ void parse_args(all_args_t *args, int argc, char *argv[]) {
bpo::value<bool>(&args->expert.phy.pdsch_csi_enabled)->default_value(true), bpo::value<bool>(&args->expert.phy.pdsch_csi_enabled)->default_value(true),
"Stores the Channel State Information and uses it for weightening the softbits. It is only used in TM1.") "Stores the Channel State Information and uses it for weightening the softbits. It is only used in TM1.")
("expert.pdsch_8bit_decoder",
bpo::value<bool>(&args->expert.phy.pdsch_8bit_decoder)->default_value(false),
"Use 8-bit for LLR representation and turbo decoder trellis computation (Experimental)")
("rf_calibration.tx_corr_dc_gain", bpo::value<float>(&args->rf_cal.tx_corr_dc_gain)->default_value(0.0), ("rf_calibration.tx_corr_dc_gain", bpo::value<float>(&args->rf_cal.tx_corr_dc_gain)->default_value(0.0),
"TX DC offset gain correction") "TX DC offset gain correction")
("rf_calibration.tx_corr_dc_phase", bpo::value<float>(&args->rf_cal.tx_corr_dc_phase)->default_value(0.0), ("rf_calibration.tx_corr_dc_phase", bpo::value<float>(&args->rf_cal.tx_corr_dc_phase)->default_value(0.0),

@ -139,6 +139,10 @@ bool phch_worker::init(uint32_t max_prb, srslte::log *log_h, srslte::log *log_ph
return false; return false;
} }
if (phy->args->pdsch_8bit_decoder) {
ue_dl.pdsch.llr_is_8bit = true;
ue_dl.pdsch.dl_sch.llr_is_8bit = true;
}
srslte_chest_dl_set_rsrp_neighbour(&ue_dl.chest, true); srslte_chest_dl_set_rsrp_neighbour(&ue_dl.chest, true);
srslte_chest_dl_average_subframe(&ue_dl.chest, phy->args->average_subframe_enabled); srslte_chest_dl_average_subframe(&ue_dl.chest, phy->args->average_subframe_enabled);

@ -204,6 +204,8 @@ enable = false
# pdsch_csi_enabled: Stores the Channel State Information and uses it for weightening the softbits. It is only # pdsch_csi_enabled: Stores the Channel State Information and uses it for weightening the softbits. It is only
# used in TM1. It is True by default. # used in TM1. It is True by default.
# #
# pdsch_8bit_decoder: Use 8-bit for LLR representation and turbo decoder trellis computation (Experimental)
#
##################################################################### #####################################################################
[expert] [expert]
#ip_netmask = 255.255.255.0 #ip_netmask = 255.255.255.0
@ -215,7 +217,7 @@ enable = false
#cqi_fixed = 10 #cqi_fixed = 10
#snr_ema_coeff = 0.1 #snr_ema_coeff = 0.1
#snr_estim_alg = refs #snr_estim_alg = refs
#pdsch_max_its = 4 #pdsch_max_its = 8 # These are half iterations
#attach_enable_64qam = false #attach_enable_64qam = false
#nof_phy_threads = 2 #nof_phy_threads = 2
#equalizer_mode = mmse #equalizer_mode = mmse
@ -234,6 +236,7 @@ enable = false
#metrics_period_secs = 1 #metrics_period_secs = 1
#metrics_csv_filename = /tmp/ue_metrics.csv #metrics_csv_filename = /tmp/ue_metrics.csv
#pdsch_csi_enabled = true #pdsch_csi_enabled = true
#pdsch_8bit_decoder = false
# CFO related values # CFO related values
#cfo_is_doppler = false #cfo_is_doppler = false

Loading…
Cancel
Save