adding windowed neon turbodecoder (can be further optimized)

master
yagoda 6 years ago
parent 77fb9c3ad5
commit 3762738bc1

@ -35,6 +35,7 @@ typedef enum SRSLTE_API {
SRSLTE_TDEC_GENERIC,
SRSLTE_TDEC_SSE,
SRSLTE_TDEC_SSE_WINDOW,
SRSLTE_TDEC_NEON_WINDOW,
SRSLTE_TDEC_AVX_WINDOW,
SRSLTE_TDEC_SSE8_WINDOW,
SRSLTE_TDEC_AVX8_WINDOW,

@ -178,9 +178,102 @@
}
#else
#if HAVE_NEON
#include <arm_neon.h>
#define WINIMP arm16
#define nof_blocks 8
#define llr_t int16_t
#define v_insert_s16(a, b, imm) \
({ \
(vsetq_lane_s16((b), (a), (imm))); \
})
#define int8x16_to_8x8x2(v) ((int8x8x2_t) {{ vget_low_s8(v), vget_high_s8(v) }})// TODO
static inline int movemask_neon(uint8x16_t movemask_low_in) {
uint8x8_t mask_and = vdup_n_u8(0x80);
int8_t __attribute__((aligned(16))) xr[8];
for(int i = 0; i <8;i++)
xr[i] = i-7;
int8x8_t mask_shift = vld1_s8(xr);
uint8x8_t lo = vget_low_u8(movemask_low_in);
uint8x8_t hi = vget_high_u8(movemask_low_in);
lo = vand_u8(lo, mask_and);
lo = vshl_u8(lo, mask_shift);
hi = vand_u8(hi, mask_and);
hi = vshl_u8(hi, mask_shift);
lo = vpadd_u8(lo, lo);
lo = vpadd_u8(lo, lo);
lo = vpadd_u8(lo, lo);
hi = vpadd_u8(hi, hi);
hi = vpadd_u8(hi, hi);
hi = vpadd_u8(hi, hi);
return ((hi[0] << 8) | (lo[0] & 0xFF));
}
inline static int16x8_t vshuff_s8(int16x8_t in, uint8x16_t mask)
{
int8x8x2_t x = int8x16_to_8x8x2((int8x16_t)in);
int8x8_t u = (int8x8_t)vget_low_u8(mask);
int8x8_t eq = vtbl2_s8(x,u);
int8x8x2_t x2 = int8x16_to_8x8x2((int8x16_t)in);
int8x8_t u2 = (int8x8_t)vget_high_u8(mask);
int8x8_t eq2 = vtbl2_s8(x2,u2);
return (int16x8_t)vcombine_s8(eq,eq2);
}
static inline int16x8_t v_packs_s16(int16x8_t a, int16x8_t b)
{
return (int16x8_t)(vcombine_s8(vqmovn_s16((a)), vqmovn_s16((b))));
}
inline static int16x8_t v_srai_s16(const int16x8_t a, const int count) {
int16x8_t b = vmovq_n_s16(-count);
return vshlq_s16(a,b);
}
inline static uint8x16_t v_load_s8(int i15, int i14, int i13, int i12, int i11, int i10, int i9, int i8, int i7, int i6, int i5, int i4, int i3, int i2, int i1, int i0)
{
uint8_t __attribute__((aligned(16))) data[16] = {i0, i1, i2, i3, i4, i5, i6, i7, i8, i9, i10, i11, i12, i13, i14, i15};
return vld1q_u8(data);
}
#define simd_type_t int16x8_t
#define simd_load(x) vld1q_s16((int16_t*)x)
#define simd_store(x,y) vst1q_s16((int16_t*)x,y)
#define simd_add vaddq_s16
#define simd_sub vsubq_s16
#define simd_max vmaxq_s16
#define simd_set1 vdupq_n_s16
#define simd_insert v_insert_s16
#define simd_shuffle vshuff_s8
#define move_right v_load_s8(15,14,15,14,13,12,11,10,9,8,7,6,5,4,3,2)
#define move_left v_load_s8(13,12,11,10,9,8,7,6,5,4,3,2,1,0,1,0)
#define simd_rb_shift v_srai_s16
#define normalize_period 2
#define win_overlap_len 40
#define divide_output 1
#define INF 10000
#else
#error "Unknown WINIMP value"
#endif
#endif
#endif
#endif
#endif
@ -681,10 +774,20 @@ void MAKE_FUNC(extract_input)(llr_t *input, llr_t *systematic, llr_t *app2, llr_
k -= (long_cb-1);\
}\
}
#ifdef HAVE_NEON
#define insert_bit(a,b) ap = v_insert_s16(ap, app1[k+(a%b)*nof_blocks], 7-a); \
reset_cnt(a,b);
#else
#define insert_bit(a,b) ap = _mm_insert_epi16(ap, app1[k+(a%b)*nof_blocks], 7-a); \
reset_cnt(a,b); \
reset_cnt(a,b);
#endif
#ifndef HAVE_NEON
#define decide_for(b) for (uint32_t i = 0; i < long_cb/8; i++) { \
insert_bit(0,b);\
insert_bit(1,b);\
@ -696,14 +799,31 @@ void MAKE_FUNC(extract_input)(llr_t *input, llr_t *systematic, llr_t *app2, llr_
insert_bit(7,b);\
output[i] = (uint8_t) _mm_movemask_epi8(_mm_cmpgt_epi8(_mm_packs_epi16(ap,zeros),zeros));\
}
#else
#define decide_for(b) for (uint32_t i = 0; i < long_cb/8; i++) { \
insert_bit(0,b);\
insert_bit(1,b);\
insert_bit(2,b);\
insert_bit(3,b);\
insert_bit(4,b);\
insert_bit(5,b);\
insert_bit(6,b);\
insert_bit(7,b);\
output[i] = (uint8_t) movemask_neon((uint8x16_t)vcgtq_s8((int8x16_t)v_packs_s16(ap,(int16x8_t)zeros),zeros));\
}
#endif
/* No improvement to use AVX here */
void MAKE_FUNC(decision_byte)(llr_t *app1, uint8_t *output, uint32_t long_cb)
{
uint32_t k=0;
#ifdef HAVE_NEON
int8_t z = 0;
int8x16_t zeros = vld1q_dup_s8(&z);
int16x8_t ap;
#else
__m128i zeros = _mm_setzero_si128();
__m128i ap;
#endif
if ((long_cb%(nof_blocks*8)) == 0) {
decide_for(8);
} else if ((long_cb%(nof_blocks*4)) == 0) {

@ -197,11 +197,16 @@ int main(int argc, char **argv) {
exit(-1);
}
#ifdef HAVE_NEON
tdec_type = SRSLTE_TDEC_NEON_WINDOW;
#else
// tdec_type = SRSLTE_TDEC_SSE_WINDOW;
#endif
if (srslte_tdec_init_manual(&tdec, frame_length, tdec_type)) {
fprintf(stderr, "Error initiating Turbo decoder\n");
exit(-1);
}
srslte_tdec_force_not_sb(&tdec);
float ebno_inc, esno_db;

@ -100,6 +100,22 @@ srslte_tdec_8bit_impl_t sse8_win_impl = {
};
#endif
#ifdef HAVE_NEON
//#include "srslte/phy/fec/turbodecoder_neon.h"
#define WINIMP_IS_NEON16
#include "srslte/phy/fec/turbodecoder_win.h"
#undef WINIMP_IS_NEON16
srslte_tdec_16bit_impl_t arm16_win_impl = {
tdec_winarm16_init,
tdec_winarm16_free,
tdec_winarm16_dec,
tdec_winarm16_extract_input,
tdec_winarm16_decision_byte
};
#endif
/* AVX window implementation */
#ifdef LV_HAVE_AVX2
#define WINIMP_IS_AVX8
@ -119,6 +135,8 @@ srslte_tdec_8bit_impl_t avx8_win_impl = {
#define AUTO_16_AVXWIN 2
#define AUTO_8_SSEWIN 0
#define AUTO_8_AVXWIN 1
#define AUTO_16_GEN 0
#define AUTO_16_NEONWIN 1
// Include interfaces for 8 and 16 bit decoder implementations
@ -177,10 +195,17 @@ int srslte_tdec_init_manual(srslte_tdec_t * h, uint32_t max_long_cb, srslte_tdec
h->current_llr_type = SRSLTE_TDEC_8;
break;
#endif
#ifdef HAVE_NEON
case SRSLTE_TDEC_NEON_WINDOW:
h->dec16[0] = &arm16_win_impl;
h->current_llr_type = SRSLTE_TDEC_16;
break;
#else
case SRSLTE_TDEC_GENERIC:
h->dec16[0] = &gen_impl;
h->current_llr_type = SRSLTE_TDEC_16;
break;
#endif
#ifdef LV_HAVE_AVX2
case SRSLTE_TDEC_AVX_WINDOW:
h->dec16[0] = &avx16_win_impl;
@ -241,18 +266,21 @@ int srslte_tdec_init_manual(srslte_tdec_t * h, uint32_t max_long_cb, srslte_tdec
if (dec_type == SRSLTE_TDEC_AUTO) {
#ifdef HAVE_NEON
h->dec16[0] = &gen_impl;
h->current_llr_type = SRSLTE_TDEC_16;
//h->dec8[0] = &gen_impl;
#else
h->dec16[AUTO_16_SSE] = &sse_impl;
h->dec16[AUTO_16_GEN] = &gen_impl;
h->dec16[AUTO_16_NEONWIN] = &arm16_win_impl;
#elif LV_HAVE_SSE
h->dec16[AUTO_16_SSE] = &gen_impl;
h->dec16[AUTO_16_SSEWIN] = &sse16_win_impl;
h->dec8[AUTO_8_SSEWIN] = &sse8_win_impl;
#ifdef LV_HAVE_AVX2
h->dec16[AUTO_16_AVXWIN] = &avx16_win_impl;
h->dec8[AUTO_8_AVXWIN] = &avx8_win_impl;
#endif
#else
h->dec16[AUTO_16_SSE] = &gen_impl;
h->dec16[AUTO_16_SSEWIN] = &gen_impl;
#endif /* HAVE_NEON */
for (int td=0;td<SRSLTE_TDEC_NOF_AUTO_MODES_16;td++) {
if (h->dec16[td]) {
if ((h->nof_blocks16[td] = h->dec16[td]->tdec_init(&h->dec16_hdlr[td], h->max_long_cb))<0) {

@ -1499,9 +1499,6 @@ int phch_recv::scell_recv::find_cells(cf_t *input_buffer, float rx_gain_offset,
*/
void phch_recv::meas_reset() {
if (enable_raa_searcher) {
raa_searcher->stop(-1);
}
// Stop all measurements
intra_freq_meas.clear_cells();
}

Loading…
Cancel
Save