diff --git a/lib/include/srslte/phy/fec/viterbi.h b/lib/include/srslte/phy/fec/viterbi.h index d69750fb3..689b636e9 100644 --- a/lib/include/srslte/phy/fec/viterbi.h +++ b/lib/include/srslte/phy/fec/viterbi.h @@ -57,10 +57,13 @@ typedef struct SRSLTE_API{ float gain_quant; int16_t gain_quant_s; int (*decode) (void*, uint8_t*, uint8_t*, uint32_t); + int (*decode_s) (void*, uint16_t*, uint8_t*, uint32_t); int (*decode_f) (void*, float*, uint8_t*, uint32_t); void (*free) (void*); uint8_t *tmp; + uint16_t *tmp_s; uint8_t *symbols_uc; + uint16_t *symbols_us; }srslte_viterbi_t; SRSLTE_API int srslte_viterbi_init(srslte_viterbi_t *q, @@ -87,6 +90,8 @@ SRSLTE_API int srslte_viterbi_decode_s(srslte_viterbi_t *q, uint8_t *data, uint32_t frame_length); +SRSLTE_API int srslte_viterbi_decode_us(srslte_viterbi_t *q, uint16_t *symbols, uint8_t *data, uint32_t frame_length); + SRSLTE_API int srslte_viterbi_decode_uc(srslte_viterbi_t *q, uint8_t *symbols, uint8_t *data, diff --git a/lib/include/srslte/phy/utils/vector.h b/lib/include/srslte/phy/utils/vector.h index a4028e495..ffcabebcb 100644 --- a/lib/include/srslte/phy/utils/vector.h +++ b/lib/include/srslte/phy/utils/vector.h @@ -141,8 +141,9 @@ SRSLTE_API uint32_t srslte_vec_max_abs_ci(const cf_t *x, const uint32_t len); /* quantify vector of floats or int16 and convert to uint8_t */ SRSLTE_API void srslte_vec_quant_fuc(const float *in, uint8_t *out, const float gain, const float offset, const float clip, const uint32_t len); +SRSLTE_API void srslte_vec_quant_fus(float *in, uint16_t *out, float gain, float offset, float clip, uint32_t len); SRSLTE_API void srslte_vec_quant_suc(const int16_t *in, uint8_t *out, const float gain, const int16_t offset, const int16_t clip, const uint32_t len); - +SRSLTE_API void srslte_vec_quant_sus(const int16_t *in, uint16_t *out, const float gain, const int16_t offset, const uint32_t len); /* magnitude of each vector element */ SRSLTE_API void srslte_vec_abs_cf(const cf_t *x, float *abs, const uint32_t len); SRSLTE_API void srslte_vec_abs_square_cf(const cf_t *x, float *abs_square, const uint32_t len); diff --git a/lib/src/phy/fec/test/viterbi_test.c b/lib/src/phy/fec/test/viterbi_test.c index f619b50b9..91be1bdd9 100644 --- a/lib/src/phy/fec/test/viterbi_test.c +++ b/lib/src/phy/fec/test/viterbi_test.c @@ -36,8 +36,10 @@ #include "viterbi_test.h" +#define VITERBI_16 -int frame_length = 1000, nof_frames = 128; + +int frame_length = 1000, nof_frames = 256; float ebno_db = 100.0; uint32_t seed = 0; bool tail_biting = false; @@ -84,6 +86,7 @@ void parse_args(int argc, char **argv) { int main(int argc, char **argv) { int frame_cnt; float *llr; + uint16_t *llr_s; uint8_t *llr_c; uint8_t *data_tx, *data_rx, *data_rx2, *symbols; int i, j; @@ -154,6 +157,11 @@ int main(int argc, char **argv) { perror("malloc"); exit(-1); } + llr_s = malloc(2 * coded_length * sizeof(uint16_t)); + if (!llr_s) { + perror("malloc"); + exit(-1); + } llr_c = malloc(2 * coded_length * sizeof(uint8_t)); if (!llr_c) { perror("malloc"); @@ -177,7 +185,7 @@ int main(int argc, char **argv) { snr_points = 1; } - float Gain = 32; + float Gain = 2500; for (i = 0; i < snr_points; i++) { frame_cnt = 0; @@ -206,17 +214,22 @@ int main(int argc, char **argv) { } srslte_ch_awgn_f(llr, llr, var[i], coded_length); + //srslte_vec_fprint_f(stdout, llr, 100); + + srslte_vec_quant_fuc(llr, llr_c, 32, 127.5, 255, coded_length); + srslte_vec_quant_fus(llr, llr_s, 8192, 32767.5, 65535, coded_length); - srslte_vec_quant_fuc(llr, llr_c, Gain, 127.5, 255, coded_length); - struct timeval t[3]; gettimeofday(&t[1], NULL); int M = 1; - - //srslte_vec_fprint_b(stdout, data_tx, frame_length); + for (int i=0;iframebits); return -1; } + + /* Initialize Viterbi decoder */ + init_viterbi37_avx2_16bit(q->ptr, q->tail_biting?-1:0); + + /* Decode block */ + if (q->tail_biting) { + for (int i=0;itmp_s[i*3*frame_length], symbols, 3*frame_length*sizeof(uint16_t)); + } + update_viterbi37_blk_avx2_16bit(q->ptr, q->tmp_s, TB_ITER*frame_length, &best_state); + chainback_viterbi37_avx2_16bit(q->ptr, q->tmp, TB_ITER*frame_length, best_state); + memcpy(data, &q->tmp[((int) (TB_ITER/2))*frame_length], frame_length*sizeof(uint8_t)); + } else { + update_viterbi37_blk_avx2_16bit(q->ptr, symbols, frame_length+q->K-1, NULL); + chainback_viterbi37_avx2_16bit(q->ptr, data, frame_length, 0); + } + + return q->framebits; +} + +void free37_avx2_16bit(void *o) { + srslte_viterbi_t *q = o; + + if (q->symbols_uc) { + free(q->symbols_uc); + } + if (q->tmp) { + free(q->tmp); + } + delete_viterbi37_avx2_16bit(q->ptr); +} + + +int decode37_avx2(void *o, uint8_t *symbols, uint8_t *data, uint32_t frame_length) { + srslte_viterbi_t *q = o; + uint32_t best_state; + + if (frame_length > q->framebits) { + fprintf(stderr, "Initialized decoder for max frame length %d bits\n", + q->framebits); + return -1; + } /* Initialize Viterbi decoder */ init_viterbi37_avx2(q->ptr, q->tail_biting?-1:0); - /* Decode block */ if (q->tail_biting) { for (int i=0;iptr); } + #endif #ifdef HAVE_NEON @@ -368,6 +417,44 @@ int init37_avx2(srslte_viterbi_t *q, int poly[3], uint32_t framebits, bool tail_ return 0; } } + +int init37_avx2_16bit(srslte_viterbi_t *q, int poly[3], uint32_t framebits, bool tail_biting) { + q->K = 7; + q->R = 3; + q->framebits = framebits; + q->gain_quant_s = 4; + q->gain_quant = DEFAULT_GAIN; + q->tail_biting = tail_biting; + q->decode_s = decode37_avx2_16bit; + q->free = free37_avx2_16bit; + q->decode_f = NULL; + q->symbols_uc = srslte_vec_malloc(3 * (q->framebits + q->K - 1) * sizeof(uint8_t)); + q->symbols_us = srslte_vec_malloc(3 * (q->framebits + q->K - 1) * sizeof(uint16_t)); + if (!q->symbols_uc || !q->symbols_us) { + perror("malloc"); + return -1; + } + if (q->tail_biting) { + q->tmp = srslte_vec_malloc(TB_ITER*3*(q->framebits + q->K - 1) * sizeof(uint8_t)); + q->tmp_s = srslte_vec_malloc(TB_ITER*3*(q->framebits + q->K - 1) * sizeof(uint16_t)); + if (!q->tmp) { + perror("malloc"); + free37(q); + return -1; + } + } else { + q->tmp = NULL; + } + //printf("pt0\n"); + if ((q->ptr = create_viterbi37_avx2_16bit(poly, TB_ITER*framebits)) == NULL) { + fprintf(stderr, "create_viterbi37 failed\n"); + free37(q); + return -1; + } else { + return 0; + } +} + #endif void srslte_viterbi_set_gain_quant(srslte_viterbi_t *q, float gain_quant) { @@ -383,8 +470,13 @@ int srslte_viterbi_init(srslte_viterbi_t *q, srslte_viterbi_type_t type, int pol switch (type) { case SRSLTE_VITERBI_37: #ifdef LV_HAVE_SSE + #ifdef LV_HAVE_AVX2 + #ifdef VITERBI_16 + return init37_avx2_16bit(q, poly, max_frame_length, tail_bitting); + #else return init37_avx2(q, poly, max_frame_length, tail_bitting); + #endif #else return init37_sse(q, poly, max_frame_length, tail_bitting); #endif @@ -444,8 +536,13 @@ int srslte_viterbi_decode_f(srslte_viterbi_t *q, float *symbols, uint8_t *data, max = fabs(symbols[i]); } } - srslte_vec_quant_fuc(symbols, q->symbols_uc, q->gain_quant/max, 127.5, 255, len); - return srslte_viterbi_decode_uc(q, q->symbols_uc, data, frame_length); +#ifdef VITERBI_16 + srslte_vec_quant_fus(symbols, q->symbols_us, DEFAULT_GAIN_16/max, 32767.5, 65535, len); + return srslte_viterbi_decode_us(q, q->symbols_us, data, frame_length); +#else + srslte_vec_quant_fuc(symbols, q->symbols_uc, q->gain_quant/max, 127.5, 255, len); + return srslte_viterbi_decode_uc(q, q->symbols_uc, data, frame_length); +#endif } else { return q->decode_f(q, symbols, data, frame_length); } @@ -472,8 +569,20 @@ int srslte_viterbi_decode_s(srslte_viterbi_t *q, int16_t *symbols, uint8_t *data max = abs(symbols[i]); } } - srslte_vec_quant_suc(symbols, q->symbols_uc, (float) q->gain_quant/max, 127, 255, len); - return srslte_viterbi_decode_uc(q, q->symbols_uc, data, frame_length); +#ifdef VITERBI_16 + srslte_vec_quant_sus(symbols, q->symbols_us, 1, 32767, len); + return srslte_viterbi_decode_us(q, q->symbols_us, data, frame_length); +#else + srslte_vec_quant_suc(symbols, q->symbols_uc, (float) q->gain_quant/max, 127, 255, len); + return srslte_viterbi_decode_uc(q, q->symbols_uc, data, frame_length); +#endif + + +} + +int srslte_viterbi_decode_us(srslte_viterbi_t *q, uint16_t *symbols, uint8_t *data, uint32_t frame_length) +{ + return q->decode_s(q, symbols, data, frame_length); } diff --git a/lib/src/phy/fec/viterbi37.h b/lib/src/phy/fec/viterbi37.h index 574f4fd87..a77145592 100644 --- a/lib/src/phy/fec/viterbi37.h +++ b/lib/src/phy/fec/viterbi37.h @@ -110,3 +110,23 @@ int update_viterbi37_blk_avx2(void *p, uint32_t *best_state); +void *create_viterbi37_avx2_16bit(int polys[3], + uint32_t len); + +int init_viterbi37_avx2_16bit(void *p, + int starting_state); + + +void reset_blk_avx2_16bit(void *p, int nbits); + +int chainback_viterbi37_avx2_16bit(void *p, + uint8_t *data, + uint32_t nbits, + uint32_t endstate); + +void delete_viterbi37_avx2_16bit(void *p); + +int update_viterbi37_blk_avx2_16bit(void *p, + uint16_t *syms, + uint32_t nbits, + uint32_t *best_state); \ No newline at end of file diff --git a/lib/src/phy/fec/viterbi37_avx2.c b/lib/src/phy/fec/viterbi37_avx2.c index a00bb494b..8735bf7ea 100644 --- a/lib/src/phy/fec/viterbi37_avx2.c +++ b/lib/src/phy/fec/viterbi37_avx2.c @@ -76,14 +76,16 @@ int init_viterbi37_avx2(void *p, int starting_state) { struct v37 *vp = p; uint32_t i; firstGo = 1; + for(i=0;i<64;i++) vp->metrics1.c[i] = 63; - + clear_v37_avx2(vp); vp->old_metrics = &vp->metrics1; vp->new_metrics = &vp->metrics2; vp->dp = vp->decisions; + if (starting_state != -1) { vp->old_metrics->c[starting_state & 63] = 0; /* Bias known start state */ } @@ -259,7 +261,7 @@ void update_viterbi37_blk_avx2(void *p,unsigned char *syms,int nbits, uint32_t * d->s[0] = (short) y; d->s[1] = (short) x; d->s[2] = (short) (y >>16); - d->s[3] = (short)(x>> 16); + d->s[3] = (short) (x >>16); __m256i unpack; diff --git a/lib/src/phy/fec/viterbi37_avx2_16bit.c b/lib/src/phy/fec/viterbi37_avx2_16bit.c new file mode 100644 index 000000000..cda292017 --- /dev/null +++ b/lib/src/phy/fec/viterbi37_avx2_16bit.c @@ -0,0 +1,363 @@ +/* Adapted Phil Karn's r=1/3 k=9 viterbi decoder to r=1/3 k=7 + * + * K=15 r=1/6 Viterbi decoder for x86 SSE2 + * Copyright Mar 2004, Phil Karn, KA9Q + * May be used under the terms of the GNU Lesser General Public License (LGPL) + */ + +#include +#include +#include +#include +#include +#include "parity.h" + +//#define DEBUG + +#ifdef LV_HAVE_AVX2 + + +#include +#include +#include +#include + +typedef union { + //unsigned char c[64]; + //__m128i v[4]; + unsigned short c[64]; + __m256i v[4]; +} metric_t; + +typedef union { + unsigned int w[2]; + unsigned char c[8]; + unsigned short s[4]; + __m64 v[1]; +} decision_t; + +union branchtab27 { + + //unsigned char c[32]; + //__m128i v[2]; + unsigned short c[32]; + __m256i v[2]; + +} Branchtab37_sse2[3]; + +int firstGo; +/* State info for instance of Viterbi decoder */ +struct v37 { + metric_t metrics1; /* path metric buffer 1 */ + metric_t metrics2; /* path metric buffer 2 */ + decision_t *dp; /* Pointer to current decision */ + metric_t *old_metrics,*new_metrics; /* Pointers to path metrics, swapped on every bit */ + decision_t *decisions; /* Beginning of decisions for block */ + uint32_t len; +}; + +void set_viterbi37_polynomial_avx2_16bit(int polys[3]) { + int state; + for(state=0;state < 32;state++){ + Branchtab37_sse2[0].c[state] = (polys[0] < 0) ^ parity((2*state) & polys[0]) ? 65535:0; + Branchtab37_sse2[1].c[state] = (polys[1] < 0) ^ parity((2*state) & polys[1]) ? 65535:0; + Branchtab37_sse2[2].c[state] = (polys[2] < 0) ^ parity((2*state) & polys[2]) ? 65535:0; + } +} + +void clear_v37_avx2_16bit(struct v37 *vp) { + bzero(vp->decisions, sizeof(decision_t)*vp->len); + vp->dp = NULL; + bzero(&vp->metrics1, sizeof(metric_t)); + bzero(&vp->metrics2, sizeof(metric_t)); + vp->old_metrics = NULL; + vp->new_metrics = NULL; +} + + +/* Initialize Viterbi decoder for start of new frame */ +int init_viterbi37_avx2_16bit(void *p, int starting_state) { + + struct v37 *vp = p; + uint32_t i; + + for(i=0;i<64;i++) + vp->metrics1.c[i] = 63; + + clear_v37_avx2_16bit(vp); +firstGo = 1; + vp->old_metrics = &vp->metrics1; + vp->new_metrics = &vp->metrics2; + vp->dp = vp->decisions; + if (starting_state != -1) { + vp->old_metrics->c[starting_state & 63] = 0; /* Bias known start state */ + } + return 0; +} + +/* Create a new instance of a Viterbi decoder */ +void *create_viterbi37_avx2_16bit(int polys[3], uint32_t len) { + void *p; + struct v37 *vp; + + set_viterbi37_polynomial_avx2_16bit(polys); + + /* Ordinary malloc() only returns 8-byte alignment, we need 16 */ + if(posix_memalign(&p, sizeof(__m256i),sizeof(struct v37))) + return NULL; + + vp = (struct v37 *)p; + if(posix_memalign(&p, sizeof(__m256i),(len+6)*sizeof(decision_t))) { + free(vp); + return NULL; + } + vp->decisions = (decision_t *)p; + vp->len = len+6; + return vp; +} + + +/* Viterbi chainback */ +int chainback_viterbi37_avx2_16bit( + void *p, + uint8_t *data, /* Decoded output data */ + uint32_t nbits, /* Number of data bits */ + uint32_t endstate) { /* Terminal encoder state */ + struct v37 *vp = p; + + if (p == NULL) + return -1; + + decision_t *d = (decision_t *)vp->decisions; + + /* Make room beyond the end of the encoder register so we can + * accumulate a full byte of decoded data + */ + endstate %= 64; + endstate <<= 2; + + /* The store into data[] only needs to be done every 8 bits. + * But this avoids a conditional branch, and the writes will + * combine in the cache anyway + */ + d += 6; /* Look past tail */ + while(nbits--) { + int k; + + k = (d[nbits].c[(endstate>>2)/8] >> ((endstate>>2)%8)) & 1; + endstate = (endstate >> 1) | (k << 7); + data[nbits] = k; + //printf("nbits=%d, endstate=%3d, k=%d, w[0]=%d, w[1]=%d, c=%d\n", nbits, endstate, k, d[nbits].s[1]&1, d[nbits].s[2]&1, d[nbits].c[(endstate>>2)/8]&1); + } + return 0; +} + +/* Delete instance of a Viterbi decoder */ +void delete_viterbi37_avx2_16bit(void *p){ + struct v37 *vp = p; + + if(vp != NULL){ + free(vp->decisions); + free(vp); + } +} + +void print_256i(char *s, __m256i val) { + + printf("%s: ", s); + + uint16_t *x = (uint16_t*) &val; + for (int i=0;i<16;i++) { + printf("%.5f, ", (float)x[i]/65535); + } + printf("\n"); +} +void print_256i_char(char *s, __m256i val) { + + printf("%s: ", s); + + uint8_t *x = (uint8_t*) &val; + for (int i=0;i<32;i++) { + printf("%d, ",x[31-i]); + } + printf("\n"); +} + + +inline unsigned short my_mm256_movemask_epi16(__m256i x) { + uint32_t x1 = _mm256_movemask_epi8(x); + uint16_t tmp = 0; + for(int i = 0; i<16;i++){ + tmp |= ((x1 >> ((i*2)+1)) & 0x01) << i; + } + + return (tmp); + } + + +void update_viterbi37_blk_avx2_16bit(void *p, unsigned short *syms, int nbits, uint32_t *best_state) { + struct v37 *vp = p; + decision_t *d; + + if(p == NULL) + return; + +#ifdef DEBUG + printf("["); +#endif + + d = (decision_t *) vp->dp; + + for (int s=0;sold_metrics->v[i],metric); + m3 = _mm256_add_epi16(vp->old_metrics->v[2+i],metric); + m1 = _mm256_add_epi16(vp->old_metrics->v[2+i],m_metric); + m2 = _mm256_add_epi16(vp->old_metrics->v[i],m_metric); + + + /* Compare and select, using modulo arithmetic */ + + decision0 = _mm256_cmpgt_epi16(_mm256_sub_epi16(m0,m1),_mm256_setzero_si256()); + decision1 = _mm256_cmpgt_epi16(_mm256_sub_epi16(m2,m3),_mm256_setzero_si256()); + survivor0 = _mm256_or_si256(_mm256_and_si256(decision0,m1),_mm256_andnot_si256(decision0,m0)); + survivor1 = _mm256_or_si256(_mm256_and_si256(decision1,m3),_mm256_andnot_si256(decision1,m2)); + + + + /* Pack each set of decisions into 16 bits */ + + decision0 = _mm256_permute4x64_epi64(decision0,216); + decision1 = _mm256_permute4x64_epi64(decision1,216); + + __m256i packed = _mm256_packus_epi16( _mm256_srli_epi16(_mm256_unpacklo_epi16(decision0,decision1),8),_mm256_srli_epi16(_mm256_unpackhi_epi16(decision0,decision1),8)); + + d->w[i] = _mm256_movemask_epi8(packed); + + unsigned char temp_char1 = d->c[4*i + 1]; + unsigned char temp_char2 = d->c[4*i + 2]; + + d->c[4*i+1] = temp_char2; + d->c[4*i+2] = temp_char1; + + /* Store surviving metrics */ + survivor0 = _mm256_permute4x64_epi64(survivor0,216); + survivor1 = _mm256_permute4x64_epi64(survivor1,216); + + vp->new_metrics->v[2*i] = _mm256_unpacklo_epi16(survivor0,survivor1); + vp->new_metrics->v[2*i+1] = _mm256_unpackhi_epi16(survivor0,survivor1); + + } + + // See if we need to normalize + if (vp->new_metrics->c[0] > 25600) { + int i; + + uint16_t adjust; + __m256i adjustv; + union { __m256i v; signed short w[8]; } t; + + adjustv = vp->new_metrics->v[0]; + for(i=1;i<4;i++) { + adjustv = _mm256_min_epu16(adjustv,vp->new_metrics->v[i]); + } + + adjustv = _mm256_min_epu16(adjustv,_mm256_srli_si256(adjustv,16)); + adjustv = _mm256_min_epu16(adjustv,_mm256_srli_si256(adjustv,8)); + adjustv = _mm256_min_epu16(adjustv,_mm256_srli_si256(adjustv,4)); + + + t.v = adjustv; + adjust = t.w[0]; + adjustv = _mm256_set1_epi16(adjust); + + /* We cannot use a saturated subtract, because we often have to adjust by more than SHRT_MAX + * This is okay since it can't overflow anyway + */ + for(i=0;i<4;i++) + vp->new_metrics->v[i] = _mm256_sub_epi16(vp->new_metrics->v[i],adjustv); + } + + + d++; + /* Swap pointers to old and new metrics */ + tmp = vp->old_metrics; + vp->old_metrics = vp->new_metrics; + vp->new_metrics = tmp; + } + + if (best_state) { + uint32_t i, bst=0; + + uint16_t minmetric= UINT16_MAX; + for (i=0;i<64;i++) { + if (vp->old_metrics->c[i] <= minmetric) { + bst = i; + minmetric = vp->old_metrics->c[i]; + } + } + *best_state = bst; + } + + #ifdef DEBUG + printf("];\n===========================================\n"); +#endif + + vp->dp = d; +} + +#endif + + + diff --git a/lib/src/phy/utils/vector.c b/lib/src/phy/utils/vector.c index 35457fcb5..324d80b99 100644 --- a/lib/src/phy/utils/vector.c +++ b/lib/src/phy/utils/vector.c @@ -363,6 +363,20 @@ uint32_t srslte_vec_max_abs_ci(const cf_t *x, const uint32_t len) { return srslte_vec_max_ci_simd(x, len); } +void srslte_vec_quant_fus(float *in, uint16_t *out, float gain, float offset, float clip, uint32_t len) { + int i; + long tmp; + + for (i=0;i clip) + tmp = clip; + out[i] = (uint16_t) tmp; + } +} + void srslte_vec_quant_fuc(const float *in, uint8_t *out, const float gain, const float offset, const float clip, const uint32_t len) { int i; int tmp; @@ -391,6 +405,18 @@ void srslte_vec_quant_suc(const int16_t *in, uint8_t *out, const float gain, con } } +void srslte_vec_quant_sus(const int16_t *in, uint16_t *out, const float gain, const int16_t offset, const uint32_t len) { + int i; + int16_t tmp; + + for (i=0;i