From 15e75b2c65b35c573bab2fd68c7fb1d553741307 Mon Sep 17 00:00:00 2001 From: Xavier Arteaga Date: Mon, 23 Oct 2017 14:23:03 +0200 Subject: [PATCH] Created Bit interleaver object and SSE optimised interleaver --- lib/include/srslte/phy/fec/rm_turbo.h | 2 + lib/include/srslte/phy/utils/bit.h | 19 +++ lib/src/phy/fec/rm_turbo.c | 19 ++- lib/src/phy/fec/test/rm_turbo_test.c | 1 + lib/src/phy/fec/turbocoder.c | 10 +- lib/src/phy/phch/sch.c | 2 + lib/src/phy/utils/bit.c | 173 +++++++++++++++++++++++++- 7 files changed, 221 insertions(+), 5 deletions(-) diff --git a/lib/include/srslte/phy/fec/rm_turbo.h b/lib/include/srslte/phy/fec/rm_turbo.h index df0720b95..182b49fb8 100644 --- a/lib/include/srslte/phy/fec/rm_turbo.h +++ b/lib/include/srslte/phy/fec/rm_turbo.h @@ -58,6 +58,8 @@ SRSLTE_API int srslte_rm_turbo_tx(uint8_t *w_buff, SRSLTE_API void srslte_rm_turbo_gentables(); +SRSLTE_API void srslte_rm_turbo_free_tables(); + SRSLTE_API int srslte_rm_turbo_tx_lut(uint8_t *w_buff, uint8_t *systematic, uint8_t *parity, diff --git a/lib/include/srslte/phy/utils/bit.h b/lib/include/srslte/phy/utils/bit.h index c02ecd6bb..e4289b790 100644 --- a/lib/include/srslte/phy/utils/bit.h +++ b/lib/include/srslte/phy/utils/bit.h @@ -40,6 +40,25 @@ #include "srslte/config.h" +typedef struct { + uint32_t nof_bits; + uint16_t *interleaver; + uint16_t *byte_idx; + uint8_t *bit_mask; + uint8_t n_128; +} srslte_bit_interleaver_t; + +SRSLTE_API void srslte_bit_interleaver_init(srslte_bit_interleaver_t *q, + uint16_t *interleaver, + uint32_t nof_bits); + +SRSLTE_API void srslte_bit_interleaver_free(srslte_bit_interleaver_t *q); + +SRSLTE_API void srslte_bit_interleaver_run(srslte_bit_interleaver_t *q, + uint8_t *input, + uint8_t *output, + uint16_t w_offset); + SRSLTE_API void srslte_bit_interleave(uint8_t *input, uint8_t *output, uint16_t *interleaver, diff --git a/lib/src/phy/fec/rm_turbo.c b/lib/src/phy/fec/rm_turbo.c index cdc8ac88d..b1cc95a8c 100644 --- a/lib/src/phy/fec/rm_turbo.c +++ 
b/lib/src/phy/fec/rm_turbo.c @@ -61,7 +61,9 @@ static uint8_t RM_PERM_TC[NCOLS] = { 0, 16, 8, 24, 4, 20, 12, 28, 2, 18, 10, 26, /* Align tables to 16-byte boundary */ static uint16_t interleaver_systematic_bits[192][6160]; // 4 tail bits +static srslte_bit_interleaver_t bit_interleavers_systematic_bits[192]; static uint16_t interleaver_parity_bits[192][2*6160]; +static srslte_bit_interleaver_t bit_interleavers_parity_bits[192]; static uint16_t deinterleaver[192][4][18448]; static int k0_vec[SRSLTE_NOF_TC_CB_SIZES][4][2]; static bool rm_turbo_tables_generated = false; @@ -235,7 +237,12 @@ void srslte_rm_turbo_gentables() { k0_vec[cb_idx][i][1] = -1; } srslte_rm_turbo_gentable_systematic(interleaver_systematic_bits[cb_idx], k0_vec[cb_idx], nrows, ndummy); + srslte_bit_interleaver_init(&bit_interleavers_systematic_bits[cb_idx], interleaver_systematic_bits[cb_idx], + (uint32_t) srslte_cbsegm_cbsize(cb_idx)+4); + srslte_rm_turbo_gentable_parity(interleaver_parity_bits[cb_idx], k0_vec[cb_idx], in_len/3, nrows, ndummy); + srslte_bit_interleaver_init(&bit_interleavers_parity_bits[cb_idx], interleaver_parity_bits[cb_idx], + (uint32_t) (srslte_cbsegm_cbsize(cb_idx)+4)*2); for (int i=0;i<4;i++) { srslte_rm_turbo_gentable_receive(deinterleaver[cb_idx][i], in_len, i); @@ -244,6 +251,12 @@ void srslte_rm_turbo_gentables() { } } +void srslte_rm_turbo_free_tables () { + for (int i = 0; i < SRSLTE_NOF_TC_CB_SIZES; i++) { + srslte_bit_interleaver_free(&bit_interleavers_systematic_bits[i]); + srslte_bit_interleaver_free(&bit_interleavers_parity_bits[i]); + } +} /** * Rate matching for LTE Turbo Coder @@ -274,11 +287,13 @@ int srslte_rm_turbo_tx_lut(uint8_t *w_buff, uint8_t *systematic, uint8_t *parity if (rv_idx == 0) { // Systematic bits - srslte_bit_interleave(systematic, w_buff, interleaver_systematic_bits[cb_idx], in_len/3); + //srslte_bit_interleave(systematic, w_buff, interleaver_systematic_bits[cb_idx], in_len/3); + 
srslte_bit_interleaver_run(&bit_interleavers_systematic_bits[cb_idx], systematic, w_buff, 0); // Parity bits - srslte_bit_interleave_w_offset(parity, &w_buff[in_len/24], interleaver_parity_bits[cb_idx], 2*in_len/3, 4); + //srslte_bit_interleave_w_offset(parity, &w_buff[in_len/24], interleaver_parity_bits[cb_idx], 2*in_len/3, 4); + srslte_bit_interleaver_run(&bit_interleavers_parity_bits[cb_idx], parity, &w_buff[in_len/24], 4); } /* Bit selection and transmission 5.1.4.1.2 */ diff --git a/lib/src/phy/fec/test/rm_turbo_test.c b/lib/src/phy/fec/test/rm_turbo_test.c index 6943c72d1..1c5b2abf5 100644 --- a/lib/src/phy/fec/test/rm_turbo_test.c +++ b/lib/src/phy/fec/test/rm_turbo_test.c @@ -197,6 +197,7 @@ int main(int argc, char **argv) { } } + srslte_rm_turbo_free_tables(); free(rm_bits); free(rm_bits2); free(rm_bits2_bytes); diff --git a/lib/src/phy/fec/turbocoder.c b/lib/src/phy/fec/turbocoder.c index b7785980c..64f05b5e8 100644 --- a/lib/src/phy/fec/turbocoder.c +++ b/lib/src/phy/fec/turbocoder.c @@ -43,6 +43,7 @@ uint8_t tcod_lut_next_state[188][8][256]; uint8_t tcod_lut_output[188][8][256]; uint16_t tcod_per_fw[188][6144]; +static srslte_bit_interleaver_t tcod_interleavers[188]; static bool table_initiated = false; @@ -63,6 +64,9 @@ void srslte_tcod_free(srslte_tcod_t *h) { if (h->temp) { free(h->temp); } + for (int i = 0; i < 188; i++) { + srslte_bit_interleaver_free(&tcod_interleavers[i]); + } } /* Expects bits (1 byte = 1 bit) and produces bits. 
The systematic and parity bits are interlaced in the output */ @@ -198,8 +202,9 @@ int srslte_tcod_encode_lut(srslte_tcod_t *h, uint8_t *input, uint8_t *parity, ui } parity[long_cb/8] = 0; // will put tail here later - /* Interleave input */ - srslte_bit_interleave(input, h->temp, tcod_per_fw[cblen_idx], long_cb); + /* Interleave input */ + srslte_bit_interleaver_run(&tcod_interleavers[cblen_idx], input, h->temp, 0); + //srslte_bit_interleave(input, h->temp, tcod_per_fw[cblen_idx], long_cb); /* Parity bits for the 2nd constituent encoders */ uint8_t state1 = 0; @@ -297,6 +302,7 @@ void srslte_tcod_gentable() { for (uint32_t i=0;icb_in) { free(q->cb_in); } diff --git a/lib/src/phy/utils/bit.c b/lib/src/phy/utils/bit.c index b1ae383a6..809d4c392 100644 --- a/lib/src/phy/utils/bit.c +++ b/lib/src/phy/utils/bit.c @@ -30,6 +30,7 @@ #include #include #include +#include #ifdef LV_HAVE_SSE @@ -38,6 +39,172 @@ #endif /* LV_HAVE_SSE */ #include "srslte/phy/utils/bit.h" +#include "srslte/phy/utils/vector.h" + +void srslte_bit_interleaver_init(srslte_bit_interleaver_t *q, + uint16_t *interleaver, + uint32_t nof_bits) { + static const uint8_t mask[] = { 0x80, 0x40, 0x20, 0x10, 0x8, 0x4, 0x2, 0x1 }; + + bzero(q, sizeof(srslte_bit_interleaver_t)); + + q->interleaver = srslte_vec_malloc(sizeof(uint16_t)*nof_bits); + q->byte_idx = srslte_vec_malloc(sizeof(uint16_t)*nof_bits); + q->bit_mask = srslte_vec_malloc(sizeof(uint8_t)*nof_bits); + q->nof_bits = nof_bits; + + for (int i = 0; i < nof_bits; i++) { + uint16_t i_px = interleaver[i]; + q->interleaver[i] = i_px; + q->byte_idx[i] = (uint16_t) (interleaver[i] / 8); + q->bit_mask[i] = (uint8_t) (mask[i_px%8]); + } +} + +void srslte_bit_interleaver_free(srslte_bit_interleaver_t *q) { + if (q->interleaver) { + free(q->interleaver); + } + + if (q->byte_idx) { + free(q->byte_idx); + } + + if (q->bit_mask) { + free(q->bit_mask); + } + + bzero(q, sizeof(srslte_bit_interleaver_t)); +} + +void 
srslte_bit_interleaver_run(srslte_bit_interleaver_t *q, uint8_t *input, uint8_t *output, uint16_t w_offset) { + static const uint8_t mask[] = { 0x80, 0x40, 0x20, 0x10, 0x8, 0x4, 0x2, 0x1 }; + uint16_t *byte_idx = q->byte_idx; + uint8_t *bit_mask = q->bit_mask; + uint8_t *output_ptr = output; + + uint32_t st=0, w_offset_p=0; + + if (w_offset < 8 && w_offset > 0) { + st=1; + for (uint32_t j=0;j<8-w_offset;j++) { + uint16_t i_p = q->interleaver[j]; + if (input[i_p/8] & mask[i_p%8]) { + output[0] |= mask[j+w_offset]; + } else { + output[0] &= ~(mask[j+w_offset]); + } + } + w_offset_p=8-w_offset; + } + + uint32_t i = st * 8; + + byte_idx += i - w_offset_p; + bit_mask += i - w_offset_p; + output_ptr += st; + +#ifdef LV_HAVE_SSE + for(; i < q->nof_bits - 15; i += 16) { + __m128i in128; + in128 = _mm_insert_epi8(in128, input[*(byte_idx++)], 0x7); + in128 = _mm_insert_epi8(in128, input[*(byte_idx++)], 0x6); + in128 = _mm_insert_epi8(in128, input[*(byte_idx++)], 0x5); + in128 = _mm_insert_epi8(in128, input[*(byte_idx++)], 0x4); + in128 = _mm_insert_epi8(in128, input[*(byte_idx++)], 0x3); + in128 = _mm_insert_epi8(in128, input[*(byte_idx++)], 0x2); + in128 = _mm_insert_epi8(in128, input[*(byte_idx++)], 0x1); + in128 = _mm_insert_epi8(in128, input[*(byte_idx++)], 0x0); + in128 = _mm_insert_epi8(in128, input[*(byte_idx++)], 0xF); + in128 = _mm_insert_epi8(in128, input[*(byte_idx++)], 0xE); + in128 = _mm_insert_epi8(in128, input[*(byte_idx++)], 0xD); + in128 = _mm_insert_epi8(in128, input[*(byte_idx++)], 0xC); + in128 = _mm_insert_epi8(in128, input[*(byte_idx++)], 0xB); + in128 = _mm_insert_epi8(in128, input[*(byte_idx++)], 0xA); + in128 = _mm_insert_epi8(in128, input[*(byte_idx++)], 0x9); + in128 = _mm_insert_epi8(in128, input[*(byte_idx++)], 0x8); + + __m128i mask128 = _mm_loadu_si128((__m128i *) bit_mask); + mask128 = _mm_shuffle_epi8(mask128, _mm_set_epi8(0x8, 0x9, 0xA, 0xB, 0xC, 0xD, 0xE, 0xF, + 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7)); + + __m128i cmp128 = 
_mm_cmpeq_epi8(_mm_and_si128(in128, mask128), mask128); + *((uint16_t *) (output_ptr)) = (uint16_t) _mm_movemask_epi8(cmp128); + + bit_mask += 16; + output_ptr += 2; + } + +#endif /* LV_HAVE_SSE */ + + for(; i < q->nof_bits; i += 8) { + uint8_t out0 = (input[*(byte_idx++)] & *(bit_mask++))?mask[0]:(uint8_t)0; + uint8_t out1 = (input[*(byte_idx++)] & *(bit_mask++))?mask[1]:(uint8_t)0; + uint8_t out2 = (input[*(byte_idx++)] & *(bit_mask++))?mask[2]:(uint8_t)0; + uint8_t out3 = (input[*(byte_idx++)] & *(bit_mask++))?mask[3]:(uint8_t)0; + uint8_t out4 = (input[*(byte_idx++)] & *(bit_mask++))?mask[4]:(uint8_t)0; + uint8_t out5 = (input[*(byte_idx++)] & *(bit_mask++))?mask[5]:(uint8_t)0; + uint8_t out6 = (input[*(byte_idx++)] & *(bit_mask++))?mask[6]:(uint8_t)0; + uint8_t out7 = (input[*(byte_idx++)] & *(bit_mask++))?mask[7]:(uint8_t)0; + + *output_ptr = out0 | out1 | out2 | out3 | out4 | out5 | out6 | out7; + output_ptr++; + } + + for (uint32_t j=0;j<q->nof_bits%8;j++) { + uint16_t i_p = q->interleaver[(q->nof_bits/8)*8+j-w_offset]; + if (input[i_p/8] & mask[i_p%8]) { + output[q->nof_bits/8] |= mask[j]; + } else { + output[q->nof_bits/8] &= ~(mask[j]); + } + } + for (uint32_t j=0;j<w_offset;j++) { + uint16_t i_p = q->interleaver[(q->nof_bits/8)*8+j-w_offset]; + if (input[i_p/8] & (1<<(7-i_p%8))) { + output[q->nof_bits/8] |= mask[j]; + } else { + output[q->nof_bits/8] &= ~(mask[j]); + } + } + +#if 0 + /* THIS PIECE OF CODE IS FOR CHECKING SIMD BEHAVIOUR. DO NOT ENABLE. 
*/ + uint8_t *output2 = malloc(q->nof_bits/8); + for (i=st;i<q->nof_bits/8;i++) { + + uint16_t i_p0 = q->interleaver[i*8+0-w_offset_p]; + uint16_t i_p1 = q->interleaver[i*8+1-w_offset_p]; + uint16_t i_p2 = q->interleaver[i*8+2-w_offset_p]; + uint16_t i_p3 = q->interleaver[i*8+3-w_offset_p]; + uint16_t i_p4 = q->interleaver[i*8+4-w_offset_p]; + uint16_t i_p5 = q->interleaver[i*8+5-w_offset_p]; + uint16_t i_p6 = q->interleaver[i*8+6-w_offset_p]; + uint16_t i_p7 = q->interleaver[i*8+7-w_offset_p]; + + uint8_t out0 = (input[i_p0/8] & mask[i_p0%8])?mask[0]:(uint8_t)0; + uint8_t out1 = (input[i_p1/8] & mask[i_p1%8])?mask[1]:(uint8_t)0; + uint8_t out2 = (input[i_p2/8] & mask[i_p2%8])?mask[2]:(uint8_t)0; + uint8_t out3 = (input[i_p3/8] & mask[i_p3%8])?mask[3]:(uint8_t)0; + uint8_t out4 = (input[i_p4/8] & mask[i_p4%8])?mask[4]:(uint8_t)0; + uint8_t out5 = (input[i_p5/8] & mask[i_p5%8])?mask[5]:(uint8_t)0; + uint8_t out6 = (input[i_p6/8] & mask[i_p6%8])?mask[6]:(uint8_t)0; + uint8_t out7 = (input[i_p7/8] & mask[i_p7%8])?mask[7]:(uint8_t)0; + + output2[i] = out0 | out1 | out2 | out3 | out4 | out5 | out6 | out7; + } + + for(i = st; i < q->nof_bits/8; i++) { + if (true || output[i] != output2[i]) { + printf("%05d/%05d %02X %02X\n", i, q->nof_bits/8, output[i], output2[i]); + } + //output[i] = output2[i]; + } + free(output2); +#endif +} + + void srslte_bit_interleave(uint8_t *input, uint8_t *output, uint16_t *interleaver, uint32_t nof_bits) { srslte_bit_interleave_w_offset(input, output, interleaver, nof_bits, 0); @@ -90,7 +257,11 @@ void srslte_bit_interleave_w_offset(uint8_t *input, uint8_t *output, uint16_t *i epx2.m128 = _mm_shuffle_epi8(ipx2.m128, _mm_set_epi8(0x00, 0x02, 0x04, 0x06, 0x08, 0x0A, 0x0C, 0x0E, 0x00, 0x02, 0x04, 0x06, 0x08, 0x0A, 0x0C, 0x0E)); - epx.m64.reg_b = epx2.m64.reg_a; + epx.m128 = _mm_blendv_epi8(epx.m128, epx2.m128, _mm_setr_epi8(0, 0, 0, 0, 0, 0, 0, 0, + (uint8_t) 0xFF, (uint8_t) 0xFF, + (uint8_t) 0xFF, (uint8_t) 0xFF, + (uint8_t) 0xFF, (uint8_t) 0xFF, + 
(uint8_t) 0xFF, (uint8_t) 0xFF)); b128.m128 = _mm_and_si128(epx.m128, _mm_set1_epi8(0x7)); b128.m128 = _mm_shuffle_epi8(m128mask, b128.m128);