From a44671fc773c3eb47014f8fdb1142005b40d6352 Mon Sep 17 00:00:00 2001 From: Vasil Velichkov Date: Thu, 17 Oct 2019 02:22:37 +0300 Subject: [PATCH] PHY: Manually unroll loops that use _mm_extract_epi(8/16) clang does not unroll those loops even though it supports the -funroll-loops command line option, adding various #pragma unroll options also does not help. The unroll is needed to make the second argument a constant integer. Enable the SSE/AVX turbo rate matching when compiling in debug mode. srsLTE/lib/src/phy/fec/rm_turbo.c:590:33: error: argument to '__builtin_ia32_vec_ext_v16qi' must be a constant integer int8_t x = (int8_t) _mm_extract_epi8(xVal, j+8); ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~ /usr/lib64/clang/7.0.1/include/smmintrin.h:1048:23: note: expanded from macro '_mm_extract_epi8' (int)(unsigned char)__builtin_ia32_vec_ext_v16qi((__v16qi)(__m128i)(X), \ ^ srsLTE/lib/src/phy/fec/rm_turbo.c:591:35: error: argument to '__builtin_ia32_vec_ext_v8hi' must be a constant integer uint16_t l = (uint16_t) _mm_extract_epi16(lutVal2, j); ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~ /usr/lib64/clang/7.0.1/include/emmintrin.h:4273:24: note: expanded from macro '_mm_extract_epi16' (int)(unsigned short)__builtin_ia32_vec_ext_v8hi((__v8hi)(__m128i)(a), \ --- lib/src/phy/fec/rm_turbo.c | 123 +++++++++++++++++++++----------- lib/src/phy/utils/vector_simd.c | 111 +++++++++++++++++++--------- 2 files changed, 158 insertions(+), 76 deletions(-) diff --git a/lib/src/phy/fec/rm_turbo.c b/lib/src/phy/fec/rm_turbo.c index ee2bfd8d1..9d99d6974 100644 --- a/lib/src/phy/fec/rm_turbo.c +++ b/lib/src/phy/fec/rm_turbo.c @@ -32,12 +32,6 @@ #include "srslte/phy/utils/debug.h" #include "srslte/phy/utils/vector.h" -#ifdef DEBUG_MODE -#pragma message "FIXME: Disabling SSE/AVX turbo rate matching" -#undef LV_HAVE_SSE -#undef LV_HAVE_AVX -#endif - #ifdef LV_HAVE_SSE #include int srslte_rm_turbo_rx_lut_sse(int16_t *input, int16_t *output, uint16_t *deinter, uint32_t in_len, uint32_t cb_idx, uint32_t rv_idx); @@ -470,6 +464,11 @@ int srslte_rm_turbo_rx_lut_8bit(int8_t *input, int8_t *output, uint32_t in_len, #ifdef LV_HAVE_SSE +#define SAVE_OUTPUT_16_SSE(j) \ + x = (int16_t)_mm_extract_epi16(xVal, j); \ + l = (uint16_t)_mm_extract_epi16(lutVal, j); \ + output[l] += x; + int srslte_rm_turbo_rx_lut_sse(int16_t *input, int16_t *output, uint16_t *deinter, uint32_t in_len, uint32_t cb_idx, uint32_t rv_idx) { if (rv_idx < 4 && cb_idx < SRSLTE_NOF_TC_CB_SIZES) { @@ -478,18 +477,25 @@ int srslte_rm_turbo_rx_lut_sse(int16_t *input, int16_t *output, uint16_t *deinte const __m128i* xPtr = (const __m128i*) input; const __m128i* lutPtr = (const __m128i*) deinter; __m128i xVal, lutVal; - + + int16_t x; + uint16_t l; + /* Simplify load if we do not need to wrap (ie high rates) */ if (in_len <= out_len) { for (int i=0;i= out_len && inputCnt < in_len - 16) { @@ -635,9 +673,10 @@ int srslte_rm_turbo_rx_lut_sse_8bit(int8_t *input, int8_t *output, uint16_t *dei #ifdef LV_HAVE_AVX -#define SAVE_OUTPUT(j) x = (int16_t) _mm256_extract_epi16(xVal, j);\ - l = (uint16_t) _mm256_extract_epi16(lutVal, j);\ - output[l] += x; +#define SAVE_OUTPUT(j) \ + x = (int16_t)_mm256_extract_epi16(xVal, j); \ + l = (uint16_t)_mm256_extract_epi16(lutVal, j); \ + output[l] += x; int srslte_rm_turbo_rx_lut_avx(int16_t *input, int16_t *output, uint16_t *deinter, uint32_t in_len, uint32_t cb_idx, uint32_t rv_idx) { diff --git a/lib/src/phy/utils/vector_simd.c b/lib/src/phy/utils/vector_simd.c index ba67a73a7..9213ee12a 100644 --- a/lib/src/phy/utils/vector_simd.c +++ b/lib/src/phy/utils/vector_simd.c @@ -276,35 +276,50 @@ void srslte_vec_neg_bbb_simd(const int8_t *x, const int8_t *y, int8_t *z, const } } +#define SAVE_OUTPUT_16_SSE(j) \ + x = (int16_t)_mm_extract_epi16(xVal, j); \ + l = (uint16_t)_mm_extract_epi16(lutVal, j); \ + y[l] = (short)x; + /* No improvement with AVX */ void srslte_vec_lut_sss_simd(const short *x, const unsigned short *lut, short *y, const int len) { int i = 0; #ifdef LV_HAVE_SSE -#ifndef DEBUG_MODE if (SRSLTE_IS_ALIGNED(x) && SRSLTE_IS_ALIGNED(lut)) { for (; i < len - 7; i += 8) { __m128i xVal = _mm_load_si128((__m128i *) &x[i]); __m128i lutVal = _mm_load_si128((__m128i *) &lut[i]); - for (int k = 0; k < 8; k++) { - int16_t x = (int16_t) _mm_extract_epi16(xVal, k); - uint16_t l = (uint16_t) _mm_extract_epi16(lutVal, k); - y[l] = (short) x; - } + int16_t x; + uint16_t l; + + SAVE_OUTPUT_16_SSE(0); + SAVE_OUTPUT_16_SSE(1); + SAVE_OUTPUT_16_SSE(2); + SAVE_OUTPUT_16_SSE(3); + SAVE_OUTPUT_16_SSE(4); + SAVE_OUTPUT_16_SSE(5); + SAVE_OUTPUT_16_SSE(6); + SAVE_OUTPUT_16_SSE(7); } } else { for (; i < len - 7; i += 8) { __m128i xVal = _mm_loadu_si128((__m128i *) &x[i]); __m128i lutVal = _mm_loadu_si128((__m128i *) &lut[i]); - for (int k = 0; k < 8; k++) { - int16_t x = (int16_t) _mm_extract_epi16(xVal, k); - uint16_t l = (uint16_t) _mm_extract_epi16(lutVal, k); - y[l] = (short) x; - } + int16_t x; + uint16_t l; + + SAVE_OUTPUT_16_SSE(0); + SAVE_OUTPUT_16_SSE(1); + SAVE_OUTPUT_16_SSE(2); + SAVE_OUTPUT_16_SSE(3); + SAVE_OUTPUT_16_SSE(4); + SAVE_OUTPUT_16_SSE(5); + SAVE_OUTPUT_16_SSE(6); + SAVE_OUTPUT_16_SSE(7); } } -#endif #endif for (; i < len; i++) { @@ -312,26 +327,45 @@ void srslte_vec_lut_sss_simd(const short *x, const unsigned short *lut, short *y } } +#define SAVE_OUTPUT_SSE_8(j) \ + x = (int8_t)_mm_extract_epi8(xVal, j); \ + l = (uint16_t)_mm_extract_epi16(lutVal1, j); \ + y[l] = (char)x; + +#define SAVE_OUTPUT_SSE_8_2(j) \ + x = (int8_t)_mm_extract_epi8(xVal, j + 8); \ + l = (uint16_t)_mm_extract_epi16(lutVal2, j); \ + y[l] = (char)x; + void srslte_vec_lut_bbb_simd(const int8_t *x, const unsigned short *lut, int8_t *y, const int len) { int i = 0; #ifdef LV_HAVE_SSE -#ifndef DEBUG_MODE if (SRSLTE_IS_ALIGNED(x) && SRSLTE_IS_ALIGNED(lut)) { for (; i < len - 15; i += 16) { __m128i xVal = _mm_load_si128((__m128i *) &x[i]); __m128i lutVal1 = _mm_load_si128((__m128i *) &lut[i]); __m128i lutVal2 = _mm_load_si128((__m128i *) &lut[i+8]); - for (int k = 0; k < 8; k++) { - int8_t x = (int8_t) _mm_extract_epi8(xVal, k); - uint16_t l = (uint16_t) _mm_extract_epi16(lutVal1, k); - y[l] = (char) x; - } - for (int k = 0; k < 8; k++) { - int8_t x = (int8_t) _mm_extract_epi8(xVal, k+8); - uint16_t l = (uint16_t) _mm_extract_epi16(lutVal2, k); - y[l] = (char) x; - } + int8_t x; + uint16_t l; + + SAVE_OUTPUT_SSE_8(0); + SAVE_OUTPUT_SSE_8(1); + SAVE_OUTPUT_SSE_8(2); + SAVE_OUTPUT_SSE_8(3); + SAVE_OUTPUT_SSE_8(4); + SAVE_OUTPUT_SSE_8(5); + SAVE_OUTPUT_SSE_8(6); + SAVE_OUTPUT_SSE_8(7); + + SAVE_OUTPUT_SSE_8_2(0); + SAVE_OUTPUT_SSE_8_2(1); + SAVE_OUTPUT_SSE_8_2(2); + SAVE_OUTPUT_SSE_8_2(3); + SAVE_OUTPUT_SSE_8_2(4); + SAVE_OUTPUT_SSE_8_2(5); + SAVE_OUTPUT_SSE_8_2(6); + SAVE_OUTPUT_SSE_8_2(7); } } else { for (; i < len - 15; i += 16) { @@ -339,19 +373,28 @@ void srslte_vec_lut_bbb_simd(const int8_t *x, const unsigned short *lut, int8_t __m128i lutVal1 = _mm_loadu_si128((__m128i *) &lut[i]); __m128i lutVal2 = _mm_loadu_si128((__m128i *) &lut[i+8]); - for (int k = 0; k < 8; k++) { - int8_t x = (int8_t) _mm_extract_epi8(xVal, k); - uint16_t l = (uint16_t) _mm_extract_epi16(lutVal1, k); - y[l] = (char) x; - } - for (int k = 0; k < 8; k++) { - int8_t x = (int8_t) _mm_extract_epi8(xVal, k+8); - uint16_t l = (uint16_t) _mm_extract_epi16(lutVal2, k); - y[l] = (char) x; - } + int8_t x; + uint16_t l; + + SAVE_OUTPUT_SSE_8(0); + SAVE_OUTPUT_SSE_8(1); + SAVE_OUTPUT_SSE_8(2); + SAVE_OUTPUT_SSE_8(3); + SAVE_OUTPUT_SSE_8(4); + SAVE_OUTPUT_SSE_8(5); + SAVE_OUTPUT_SSE_8(6); + SAVE_OUTPUT_SSE_8(7); + + SAVE_OUTPUT_SSE_8_2(0); + SAVE_OUTPUT_SSE_8_2(1); + SAVE_OUTPUT_SSE_8_2(2); + SAVE_OUTPUT_SSE_8_2(3); + SAVE_OUTPUT_SSE_8_2(4); + SAVE_OUTPUT_SSE_8_2(5); + SAVE_OUTPUT_SSE_8_2(6); + SAVE_OUTPUT_SSE_8_2(7); } } -#endif #endif for (; i < len; i++) {