From c1b296eb2cb516329b6f66a48549c81cfff02ff0 Mon Sep 17 00:00:00 2001
From: Xavier Arteaga
Date: Fri, 20 Oct 2017 18:17:07 +0200
Subject: [PATCH] SSE optimization for srslte_bit_interleave_w_offset

---
 lib/src/phy/utils/bit.c | 126 ++++++++++++++++++++++++++++++++++++++++
 1 file changed, 126 insertions(+)

diff --git a/lib/src/phy/utils/bit.c b/lib/src/phy/utils/bit.c
index 9ef53c35a..b1ae383a6 100644
--- a/lib/src/phy/utils/bit.c
+++ b/lib/src/phy/utils/bit.c
@@ -31,6 +31,12 @@
 #include <stdio.h>
 #include <stdlib.h>
 
+#ifdef LV_HAVE_SSE
+
+#include <immintrin.h>
+
+#endif /* LV_HAVE_SSE */
+
 #include "srslte/phy/utils/bit.h"
 
 void srslte_bit_interleave(uint8_t *input, uint8_t *output, uint16_t *interleaver, uint32_t nof_bits) {
@@ -53,6 +59,125 @@ void srslte_bit_interleave_w_offset(uint8_t *input, uint8_t *output, uint16_t *i
     }
     w_offset_p=8-w_offset;
   }
+#ifdef LV_HAVE_SSE
+  __m64 m64mask = _mm_setr_pi8((uint8_t) 0x80, 0x40, 0x20, 0x10, 0x8, 0x4, 0x2, 0x1);
+  __m128i m128mask = _mm_set1_epi64(m64mask);
+
+  union {
+    uint8_t v[8];
+    __m64 m64;
+  } a, b, c;
+
+  union {
+    __m128i m128;
+    uint16_t u16[8];
+    uint8_t u8[16];
+    struct {
+      __m64 reg_a;
+      __m64 reg_b;
+    } m64;
+    struct {
+      uint16_t i0, i1, i2, i3, i4, i5, i6, i7;
+    } v;
+  } ipx, epx, ipx2, epx2, b128, a128, c128;
+
+  uint32_t i = st;
+  for (; i < (nof_bits / 8 - 1); i += 2) {
+    ipx.m128 = _mm_loadu_si128((__m128i *) (interleaver + (i * 8) - w_offset_p));
+    epx.m128 = _mm_shuffle_epi8(ipx.m128, _mm_set_epi8(0x00, 0x02, 0x04, 0x06, 0x08, 0x0A, 0x0C, 0x0E,
+                                                       0x00, 0x02, 0x04, 0x06, 0x08, 0x0A, 0x0C, 0x0E));
+    ipx2.m128 = _mm_loadu_si128((__m128i *) (interleaver + ((i + 1) * 8) - w_offset_p));
+    epx2.m128 = _mm_shuffle_epi8(ipx2.m128, _mm_set_epi8(0x00, 0x02, 0x04, 0x06, 0x08, 0x0A, 0x0C, 0x0E,
+                                                         0x00, 0x02, 0x04, 0x06, 0x08, 0x0A, 0x0C, 0x0E));
+
+    epx.m64.reg_b = epx2.m64.reg_a;
+
+    b128.m128 = _mm_and_si128(epx.m128, _mm_set1_epi8(0x7));
+    b128.m128 = _mm_shuffle_epi8(m128mask, b128.m128);
+
+    ipx.m128 = _mm_srli_epi16(ipx.m128, 3);
+    ipx2.m128 = _mm_srli_epi16(ipx2.m128, 3);
+
+    a128.m128 = _mm_set_epi8(input[ipx2.v.i0],
+                             input[ipx2.v.i1],
+                             input[ipx2.v.i2],
+                             input[ipx2.v.i3],
+                             input[ipx2.v.i4],
+                             input[ipx2.v.i5],
+                             input[ipx2.v.i6],
+                             input[ipx2.v.i7],
+                             input[ipx.v.i0],
+                             input[ipx.v.i1],
+                             input[ipx.v.i2],
+                             input[ipx.v.i3],
+                             input[ipx.v.i4],
+                             input[ipx.v.i5],
+                             input[ipx.v.i6],
+                             input[ipx.v.i7]);
+
+    c128.m128 = _mm_cmpeq_epi8(_mm_and_si128(a128.m128, b128.m128), b128.m128);
+    uint16_t o = (uint16_t) _mm_movemask_epi8(c128.m128);
+    *((uint16_t *) (output + i)) = o;
+  }
+
+  for (; i < nof_bits / 8; i++) {
+    ipx.m128 = _mm_loadu_si128((__m128i *) (interleaver + i * 8 - w_offset_p));
+    epx.m128 = _mm_shuffle_epi8(ipx.m128, _mm_set_epi8(0x00, 0x02, 0x04, 0x06, 0x08, 0x0A, 0x0C, 0x0E,
+                                                       0x00, 0x02, 0x04, 0x06, 0x08, 0x0A, 0x0C, 0x0E));
+    b.m64 = _mm_and_si64(epx.m64.reg_a, _mm_set1_pi8(0x7));
+    b.m64 = _mm_shuffle_pi8(m64mask, b.m64);
+
+    ipx.m128 = _mm_srli_epi16(ipx.m128, 3);
+
+    a.m64 = _mm_set_pi8(input[ipx.v.i0],
+                        input[ipx.v.i1],
+                        input[ipx.v.i2],
+                        input[ipx.v.i3],
+                        input[ipx.v.i4],
+                        input[ipx.v.i5],
+                        input[ipx.v.i6],
+                        input[ipx.v.i7]);
+
+    c.m64 = _mm_cmpeq_pi8(_mm_and_si64(a.m64, b.m64), b.m64);
+    output[i] = (uint8_t) _mm_movemask_pi8(c.m64);
+  }
+
+#if 0
+  /* THIS PIECE OF CODE IS FOR CHECKING SIMD BEHAVIOUR. DO NOT ENABLE. */
+  uint8_t *output2 = malloc(nof_bits/8);
+  for (i=st;i
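
[Review note, not part of the patch] The patch as archived above is cut off
inside the "#if 0" self-check block; from context, that block recomputes the
interleaving with plain scalar code into output2 and compares it against the
SIMD output. For reference, here is a minimal scalar sketch of the rule both
paths implement (the function name and standalone signature are illustrative,
not from the patch): output bit j of byte i is input bit
interleaver[i*8 + j - w_offset_p], with bit 0 of a byte being the MSB.

#include <stdint.h>

/* Scalar reference: build each output byte from eight interleaved input
 * bits, most-significant bit first. */
static void bit_interleave_ref(const uint8_t *input, uint8_t *output,
                               const uint16_t *interleaver,
                               uint32_t nof_bits, uint32_t st,
                               uint32_t w_offset_p)
{
  static const uint8_t mask[8] = {0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01};

  for (uint32_t i = st; i < nof_bits / 8; i++) {
    uint8_t w = 0;
    for (uint32_t j = 0; j < 8; j++) {
      uint16_t i_p = interleaver[i * 8 + j - w_offset_p];
      if (input[i_p / 8] & mask[i_p % 8]) {
        w |= mask[j]; /* set output bit j if the selected input bit is set */
      }
    }
    output[i] = w;
  }
}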
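
[Review note, not part of the patch] The vector loop hinges on two tricks.
Each interleaver entry i_p is split into a byte index (i_p >> 3, the
_mm_srli_epi16 step) and a bit index (i_p & 7, the _mm_set1_epi8(0x7) AND);
for example, i_p = 43 selects input byte 43 >> 3 = 5, bit 43 & 7 = 3, i.e.
mask 0x80 >> 3 = 0x10. Trick one uses _mm_shuffle_epi8 as a 16-lane lookup
table mapping each bit index to its single-bit mask; trick two tests the
selected bit per lane with AND + _mm_cmpeq_epi8 and packs all sixteen results
into one uint16_t with _mm_movemask_epi8, which is why the main loop emits
two output bytes per iteration. A self-contained demo of both tricks (all
names are mine; build with e.g. gcc -mssse3 demo.c):

#include <stdint.h>
#include <stdio.h>
#include <tmmintrin.h> /* SSSE3: _mm_shuffle_epi8 */

int main(void) {
  /* Lookup table: lane k holds the mask 0x80 >> k, as m128mask does above. */
  const __m128i lut = _mm_setr_epi8((char)0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01,
                                    (char)0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01);

  /* Sixteen bit indices, one per lane (would come from interleaver entries). */
  const __m128i pos = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 7, 6, 5, 4, 3, 2, 1, 0);

  /* Sixteen gathered input bytes whose bit 'pos' is to be tested. */
  const __m128i src = _mm_setr_epi8((char)0xFF, 0, (char)0xFF, 0, (char)0xFF, 0, (char)0xFF, 0,
                                    0, (char)0xFF, 0, (char)0xFF, 0, (char)0xFF, 0, (char)0xFF);

  __m128i idx = _mm_and_si128(pos, _mm_set1_epi8(0x7));  /* keep bit index 0..7 */
  __m128i bitmask = _mm_shuffle_epi8(lut, idx);          /* trick 1: LUT        */
  __m128i eq = _mm_cmpeq_epi8(_mm_and_si128(src, bitmask), bitmask);
  uint16_t packed = (uint16_t) _mm_movemask_epi8(eq);    /* trick 2: pack bits  */

  printf("packed bits: 0x%04X\n", packed); /* prints 0xAA55 for this input */
  return 0;
}

Note that the gather itself stays scalar (_mm_set_epi8 with sixteen indexed
loads), since SSE has no gather instruction. One thing worth double-checking
in review: the remainder loop uses MMX (__m64) intrinsics, and x86 requires
an _mm_empty() before any subsequent x87 floating-point code once MMX
registers have been touched.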