PHY: Manually unroll loops that use _mm_extract_epi(8/16)

clang does not unroll those loops even though it supports the
-funroll-loops command line option, adding various #pragma unroll
options also does not help.

The manual unroll is needed so that the second argument of _mm_extract_epi8/_mm_extract_epi16 is a compile-time constant integer, as the intrinsics require.

Enable the SSE/AVX turbo rate matching when compiling in debug mode.

srsLTE/lib/src/phy/fec/rm_turbo.c:590:33: error: argument to '__builtin_ia32_vec_ext_v16qi' must be a constant integer
          int8_t x  = (int8_t)  _mm_extract_epi8(xVal,   j+8);
                                ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~
/usr/lib64/clang/7.0.1/include/smmintrin.h:1048:23: note: expanded from macro '_mm_extract_epi8'
  (int)(unsigned char)__builtin_ia32_vec_ext_v16qi((__v16qi)(__m128i)(X), \
                      ^
srsLTE/lib/src/phy/fec/rm_turbo.c:591:35: error: argument to '__builtin_ia32_vec_ext_v8hi' must be a constant integer
          uint16_t l = (uint16_t) _mm_extract_epi16(lutVal2, j);
                                  ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~
/usr/lib64/clang/7.0.1/include/emmintrin.h:4273:24: note: expanded from macro '_mm_extract_epi16'
  (int)(unsigned short)__builtin_ia32_vec_ext_v8hi((__v8hi)(__m128i)(a), \
master
Vasil Velichkov 5 years ago committed by Andre Puschmann
parent af2b4ecc79
commit a44671fc77

@@ -32,12 +32,6 @@
#include "srslte/phy/utils/debug.h" #include "srslte/phy/utils/debug.h"
#include "srslte/phy/utils/vector.h" #include "srslte/phy/utils/vector.h"
#ifdef DEBUG_MODE
#pragma message "FIXME: Disabling SSE/AVX turbo rate matching"
#undef LV_HAVE_SSE
#undef LV_HAVE_AVX
#endif
#ifdef LV_HAVE_SSE #ifdef LV_HAVE_SSE
#include <x86intrin.h> #include <x86intrin.h>
int srslte_rm_turbo_rx_lut_sse(int16_t *input, int16_t *output, uint16_t *deinter, uint32_t in_len, uint32_t cb_idx, uint32_t rv_idx); int srslte_rm_turbo_rx_lut_sse(int16_t *input, int16_t *output, uint16_t *deinter, uint32_t in_len, uint32_t cb_idx, uint32_t rv_idx);
@@ -470,6 +464,11 @@ int srslte_rm_turbo_rx_lut_8bit(int8_t *input, int8_t *output, uint32_t in_len,
#ifdef LV_HAVE_SSE #ifdef LV_HAVE_SSE
#define SAVE_OUTPUT_16_SSE(j) \
x = (int16_t)_mm_extract_epi16(xVal, j); \
l = (uint16_t)_mm_extract_epi16(lutVal, j); \
output[l] += x;
int srslte_rm_turbo_rx_lut_sse(int16_t *input, int16_t *output, uint16_t *deinter, uint32_t in_len, uint32_t cb_idx, uint32_t rv_idx) int srslte_rm_turbo_rx_lut_sse(int16_t *input, int16_t *output, uint16_t *deinter, uint32_t in_len, uint32_t cb_idx, uint32_t rv_idx)
{ {
if (rv_idx < 4 && cb_idx < SRSLTE_NOF_TC_CB_SIZES) { if (rv_idx < 4 && cb_idx < SRSLTE_NOF_TC_CB_SIZES) {
@@ -478,18 +477,25 @@ int srslte_rm_turbo_rx_lut_sse(int16_t *input, int16_t *output, uint16_t *deinte
const __m128i* xPtr = (const __m128i*) input; const __m128i* xPtr = (const __m128i*) input;
const __m128i* lutPtr = (const __m128i*) deinter; const __m128i* lutPtr = (const __m128i*) deinter;
__m128i xVal, lutVal; __m128i xVal, lutVal;
int16_t x;
uint16_t l;
/* Simplify load if we do not need to wrap (ie high rates) */ /* Simplify load if we do not need to wrap (ie high rates) */
if (in_len <= out_len) { if (in_len <= out_len) {
for (int i=0;i<in_len/8;i++) { for (int i=0;i<in_len/8;i++) {
xVal = _mm_loadu_si128(xPtr); xVal = _mm_loadu_si128(xPtr);
lutVal = _mm_loadu_si128(lutPtr); lutVal = _mm_loadu_si128(lutPtr);
for (int j=0;j<8;j++) { SAVE_OUTPUT_16_SSE(0);
int16_t x = (int16_t) _mm_extract_epi16(xVal, j); SAVE_OUTPUT_16_SSE(1);
uint16_t l = (uint16_t) _mm_extract_epi16(lutVal, j); SAVE_OUTPUT_16_SSE(2);
output[l] += x; SAVE_OUTPUT_16_SSE(3);
} SAVE_OUTPUT_16_SSE(4);
SAVE_OUTPUT_16_SSE(5);
SAVE_OUTPUT_16_SSE(6);
SAVE_OUTPUT_16_SSE(7);
xPtr ++; xPtr ++;
lutPtr ++; lutPtr ++;
} }
@@ -503,12 +509,16 @@ int srslte_rm_turbo_rx_lut_sse(int16_t *input, int16_t *output, uint16_t *deinte
while(inputCnt < in_len - 8) { while(inputCnt < in_len - 8) {
xVal = _mm_loadu_si128(xPtr); xVal = _mm_loadu_si128(xPtr);
lutVal = _mm_loadu_si128(lutPtr); lutVal = _mm_loadu_si128(lutPtr);
for (int j=0;j<8;j++) { SAVE_OUTPUT_16_SSE(0);
int16_t x = (int16_t) _mm_extract_epi16(xVal, j); SAVE_OUTPUT_16_SSE(1);
uint16_t l = (uint16_t) _mm_extract_epi16(lutVal, j); SAVE_OUTPUT_16_SSE(2);
output[l] += x; SAVE_OUTPUT_16_SSE(3);
} SAVE_OUTPUT_16_SSE(4);
SAVE_OUTPUT_16_SSE(5);
SAVE_OUTPUT_16_SSE(6);
SAVE_OUTPUT_16_SSE(7);
xPtr++; xPtr++;
lutPtr++; lutPtr++;
intCnt += 8; intCnt += 8;
@@ -539,6 +549,16 @@ int srslte_rm_turbo_rx_lut_sse(int16_t *input, int16_t *output, uint16_t *deinte
} }
} }
#define SAVE_OUTPUT_SSE_8(j) \
x = (int8_t)_mm_extract_epi8(xVal, j); \
l = (uint16_t)_mm_extract_epi16(lutVal1, j); \
output[l] += x;
#define SAVE_OUTPUT_SSE_8_2(j) \
x = (int8_t)_mm_extract_epi8(xVal, j + 8); \
l = (uint16_t)_mm_extract_epi16(lutVal2, j); \
output[l] += x;
int srslte_rm_turbo_rx_lut_sse_8bit(int8_t *input, int8_t *output, uint16_t *deinter, uint32_t in_len, uint32_t cb_idx, uint32_t rv_idx) int srslte_rm_turbo_rx_lut_sse_8bit(int8_t *input, int8_t *output, uint16_t *deinter, uint32_t in_len, uint32_t cb_idx, uint32_t rv_idx)
{ {
if (rv_idx < 4 && cb_idx < SRSLTE_NOF_TC_CB_SIZES) { if (rv_idx < 4 && cb_idx < SRSLTE_NOF_TC_CB_SIZES) {
@@ -548,6 +568,9 @@ int srslte_rm_turbo_rx_lut_sse_8bit(int8_t *input, int8_t *output, uint16_t *dei
const __m128i* lutPtr = (const __m128i*) deinter; const __m128i* lutPtr = (const __m128i*) deinter;
__m128i xVal, lutVal1, lutVal2; __m128i xVal, lutVal1, lutVal2;
int8_t x;
uint16_t l;
/* Simplify load if we do not need to wrap (ie high rates) */ /* Simplify load if we do not need to wrap (ie high rates) */
if (in_len <= out_len) { if (in_len <= out_len) {
for (int i=0;i<in_len/16;i++) { for (int i=0;i<in_len/16;i++) {
@@ -558,16 +581,23 @@ int srslte_rm_turbo_rx_lut_sse_8bit(int8_t *input, int8_t *output, uint16_t *dei
lutVal2 = _mm_loadu_si128(lutPtr); lutVal2 = _mm_loadu_si128(lutPtr);
lutPtr ++; lutPtr ++;
for (int j=0;j<8;j++) { SAVE_OUTPUT_SSE_8(0);
int8_t x = (int8_t) _mm_extract_epi8(xVal, j); SAVE_OUTPUT_SSE_8(1);
uint16_t l = (uint16_t) _mm_extract_epi16(lutVal1, j); SAVE_OUTPUT_SSE_8(2);
output[l] += x; SAVE_OUTPUT_SSE_8(3);
} SAVE_OUTPUT_SSE_8(4);
for (int j=0;j<8;j++) { SAVE_OUTPUT_SSE_8(5);
int8_t x = (int8_t) _mm_extract_epi8(xVal, j+8); SAVE_OUTPUT_SSE_8(6);
uint16_t l = (uint16_t) _mm_extract_epi16(lutVal2, j); SAVE_OUTPUT_SSE_8(7);
output[l] += x;
} SAVE_OUTPUT_SSE_8_2(0);
SAVE_OUTPUT_SSE_8_2(1);
SAVE_OUTPUT_SSE_8_2(2);
SAVE_OUTPUT_SSE_8_2(3);
SAVE_OUTPUT_SSE_8_2(4);
SAVE_OUTPUT_SSE_8_2(5);
SAVE_OUTPUT_SSE_8_2(6);
SAVE_OUTPUT_SSE_8_2(7);
} }
for (int i=16*(in_len/16);i<in_len;i++) { for (int i=16*(in_len/16);i<in_len;i++) {
output[deinter[i%out_len]] += input[i]; output[deinter[i%out_len]] += input[i];
@@ -584,16 +614,24 @@ int srslte_rm_turbo_rx_lut_sse_8bit(int8_t *input, int8_t *output, uint16_t *dei
lutVal2 = _mm_loadu_si128(lutPtr); lutVal2 = _mm_loadu_si128(lutPtr);
lutPtr ++; lutPtr ++;
for (int j=0;j<8;j++) { SAVE_OUTPUT_SSE_8(0);
int8_t x = (int8_t) _mm_extract_epi8(xVal, j); SAVE_OUTPUT_SSE_8(1);
uint16_t l = (uint16_t) _mm_extract_epi16(lutVal1, j); SAVE_OUTPUT_SSE_8(2);
output[l] += x; SAVE_OUTPUT_SSE_8(3);
} SAVE_OUTPUT_SSE_8(4);
for (int j=0;j<8;j++) { SAVE_OUTPUT_SSE_8(5);
int8_t x = (int8_t) _mm_extract_epi8(xVal, j+8); SAVE_OUTPUT_SSE_8(6);
uint16_t l = (uint16_t) _mm_extract_epi16(lutVal2, j); SAVE_OUTPUT_SSE_8(7);
output[l] += x;
} SAVE_OUTPUT_SSE_8_2(0);
SAVE_OUTPUT_SSE_8_2(1);
SAVE_OUTPUT_SSE_8_2(2);
SAVE_OUTPUT_SSE_8_2(3);
SAVE_OUTPUT_SSE_8_2(4);
SAVE_OUTPUT_SSE_8_2(5);
SAVE_OUTPUT_SSE_8_2(6);
SAVE_OUTPUT_SSE_8_2(7);
intCnt += 16; intCnt += 16;
inputCnt += 16; inputCnt += 16;
if (intCnt >= out_len && inputCnt < in_len - 16) { if (intCnt >= out_len && inputCnt < in_len - 16) {
@@ -635,9 +673,10 @@ int srslte_rm_turbo_rx_lut_sse_8bit(int8_t *input, int8_t *output, uint16_t *dei
#ifdef LV_HAVE_AVX #ifdef LV_HAVE_AVX
#define SAVE_OUTPUT(j) x = (int16_t) _mm256_extract_epi16(xVal, j);\ #define SAVE_OUTPUT(j) \
l = (uint16_t) _mm256_extract_epi16(lutVal, j);\ x = (int16_t)_mm256_extract_epi16(xVal, j); \
output[l] += x; l = (uint16_t)_mm256_extract_epi16(lutVal, j); \
output[l] += x;
int srslte_rm_turbo_rx_lut_avx(int16_t *input, int16_t *output, uint16_t *deinter, uint32_t in_len, uint32_t cb_idx, uint32_t rv_idx) int srslte_rm_turbo_rx_lut_avx(int16_t *input, int16_t *output, uint16_t *deinter, uint32_t in_len, uint32_t cb_idx, uint32_t rv_idx)
{ {

@@ -276,35 +276,50 @@ void srslte_vec_neg_bbb_simd(const int8_t *x, const int8_t *y, int8_t *z, const
} }
} }
#define SAVE_OUTPUT_16_SSE(j) \
x = (int16_t)_mm_extract_epi16(xVal, j); \
l = (uint16_t)_mm_extract_epi16(lutVal, j); \
y[l] = (short)x;
/* No improvement with AVX */ /* No improvement with AVX */
void srslte_vec_lut_sss_simd(const short *x, const unsigned short *lut, short *y, const int len) { void srslte_vec_lut_sss_simd(const short *x, const unsigned short *lut, short *y, const int len) {
int i = 0; int i = 0;
#ifdef LV_HAVE_SSE #ifdef LV_HAVE_SSE
#ifndef DEBUG_MODE
if (SRSLTE_IS_ALIGNED(x) && SRSLTE_IS_ALIGNED(lut)) { if (SRSLTE_IS_ALIGNED(x) && SRSLTE_IS_ALIGNED(lut)) {
for (; i < len - 7; i += 8) { for (; i < len - 7; i += 8) {
__m128i xVal = _mm_load_si128((__m128i *) &x[i]); __m128i xVal = _mm_load_si128((__m128i *) &x[i]);
__m128i lutVal = _mm_load_si128((__m128i *) &lut[i]); __m128i lutVal = _mm_load_si128((__m128i *) &lut[i]);
for (int k = 0; k < 8; k++) { int16_t x;
int16_t x = (int16_t) _mm_extract_epi16(xVal, k); uint16_t l;
uint16_t l = (uint16_t) _mm_extract_epi16(lutVal, k);
y[l] = (short) x; SAVE_OUTPUT_16_SSE(0);
} SAVE_OUTPUT_16_SSE(1);
SAVE_OUTPUT_16_SSE(2);
SAVE_OUTPUT_16_SSE(3);
SAVE_OUTPUT_16_SSE(4);
SAVE_OUTPUT_16_SSE(5);
SAVE_OUTPUT_16_SSE(6);
SAVE_OUTPUT_16_SSE(7);
} }
} else { } else {
for (; i < len - 7; i += 8) { for (; i < len - 7; i += 8) {
__m128i xVal = _mm_loadu_si128((__m128i *) &x[i]); __m128i xVal = _mm_loadu_si128((__m128i *) &x[i]);
__m128i lutVal = _mm_loadu_si128((__m128i *) &lut[i]); __m128i lutVal = _mm_loadu_si128((__m128i *) &lut[i]);
for (int k = 0; k < 8; k++) { int16_t x;
int16_t x = (int16_t) _mm_extract_epi16(xVal, k); uint16_t l;
uint16_t l = (uint16_t) _mm_extract_epi16(lutVal, k);
y[l] = (short) x; SAVE_OUTPUT_16_SSE(0);
} SAVE_OUTPUT_16_SSE(1);
SAVE_OUTPUT_16_SSE(2);
SAVE_OUTPUT_16_SSE(3);
SAVE_OUTPUT_16_SSE(4);
SAVE_OUTPUT_16_SSE(5);
SAVE_OUTPUT_16_SSE(6);
SAVE_OUTPUT_16_SSE(7);
} }
} }
#endif
#endif #endif
for (; i < len; i++) { for (; i < len; i++) {
@@ -312,26 +327,45 @@ void srslte_vec_lut_sss_simd(const short *x, const unsigned short *lut, short *y
} }
} }
#define SAVE_OUTPUT_SSE_8(j) \
x = (int8_t)_mm_extract_epi8(xVal, j); \
l = (uint16_t)_mm_extract_epi16(lutVal1, j); \
y[l] = (char)x;
#define SAVE_OUTPUT_SSE_8_2(j) \
x = (int8_t)_mm_extract_epi8(xVal, j + 8); \
l = (uint16_t)_mm_extract_epi16(lutVal2, j); \
y[l] = (char)x;
void srslte_vec_lut_bbb_simd(const int8_t *x, const unsigned short *lut, int8_t *y, const int len) { void srslte_vec_lut_bbb_simd(const int8_t *x, const unsigned short *lut, int8_t *y, const int len) {
int i = 0; int i = 0;
#ifdef LV_HAVE_SSE #ifdef LV_HAVE_SSE
#ifndef DEBUG_MODE
if (SRSLTE_IS_ALIGNED(x) && SRSLTE_IS_ALIGNED(lut)) { if (SRSLTE_IS_ALIGNED(x) && SRSLTE_IS_ALIGNED(lut)) {
for (; i < len - 15; i += 16) { for (; i < len - 15; i += 16) {
__m128i xVal = _mm_load_si128((__m128i *) &x[i]); __m128i xVal = _mm_load_si128((__m128i *) &x[i]);
__m128i lutVal1 = _mm_load_si128((__m128i *) &lut[i]); __m128i lutVal1 = _mm_load_si128((__m128i *) &lut[i]);
__m128i lutVal2 = _mm_load_si128((__m128i *) &lut[i+8]); __m128i lutVal2 = _mm_load_si128((__m128i *) &lut[i+8]);
for (int k = 0; k < 8; k++) { int8_t x;
int8_t x = (int8_t) _mm_extract_epi8(xVal, k); uint16_t l;
uint16_t l = (uint16_t) _mm_extract_epi16(lutVal1, k);
y[l] = (char) x; SAVE_OUTPUT_SSE_8(0);
} SAVE_OUTPUT_SSE_8(1);
for (int k = 0; k < 8; k++) { SAVE_OUTPUT_SSE_8(2);
int8_t x = (int8_t) _mm_extract_epi8(xVal, k+8); SAVE_OUTPUT_SSE_8(3);
uint16_t l = (uint16_t) _mm_extract_epi16(lutVal2, k); SAVE_OUTPUT_SSE_8(4);
y[l] = (char) x; SAVE_OUTPUT_SSE_8(5);
} SAVE_OUTPUT_SSE_8(6);
SAVE_OUTPUT_SSE_8(7);
SAVE_OUTPUT_SSE_8_2(0);
SAVE_OUTPUT_SSE_8_2(1);
SAVE_OUTPUT_SSE_8_2(2);
SAVE_OUTPUT_SSE_8_2(3);
SAVE_OUTPUT_SSE_8_2(4);
SAVE_OUTPUT_SSE_8_2(5);
SAVE_OUTPUT_SSE_8_2(6);
SAVE_OUTPUT_SSE_8_2(7);
} }
} else { } else {
for (; i < len - 15; i += 16) { for (; i < len - 15; i += 16) {
@@ -339,19 +373,28 @@ void srslte_vec_lut_bbb_simd(const int8_t *x, const unsigned short *lut, int8_t
__m128i lutVal1 = _mm_loadu_si128((__m128i *) &lut[i]); __m128i lutVal1 = _mm_loadu_si128((__m128i *) &lut[i]);
__m128i lutVal2 = _mm_loadu_si128((__m128i *) &lut[i+8]); __m128i lutVal2 = _mm_loadu_si128((__m128i *) &lut[i+8]);
for (int k = 0; k < 8; k++) { int8_t x;
int8_t x = (int8_t) _mm_extract_epi8(xVal, k); uint16_t l;
uint16_t l = (uint16_t) _mm_extract_epi16(lutVal1, k);
y[l] = (char) x; SAVE_OUTPUT_SSE_8(0);
} SAVE_OUTPUT_SSE_8(1);
for (int k = 0; k < 8; k++) { SAVE_OUTPUT_SSE_8(2);
int8_t x = (int8_t) _mm_extract_epi8(xVal, k+8); SAVE_OUTPUT_SSE_8(3);
uint16_t l = (uint16_t) _mm_extract_epi16(lutVal2, k); SAVE_OUTPUT_SSE_8(4);
y[l] = (char) x; SAVE_OUTPUT_SSE_8(5);
} SAVE_OUTPUT_SSE_8(6);
SAVE_OUTPUT_SSE_8(7);
SAVE_OUTPUT_SSE_8_2(0);
SAVE_OUTPUT_SSE_8_2(1);
SAVE_OUTPUT_SSE_8_2(2);
SAVE_OUTPUT_SSE_8_2(3);
SAVE_OUTPUT_SSE_8_2(4);
SAVE_OUTPUT_SSE_8_2(5);
SAVE_OUTPUT_SSE_8_2(6);
SAVE_OUTPUT_SSE_8_2(7);
} }
} }
#endif
#endif #endif
for (; i < len; i++) { for (; i < len; i++) {

Loading…
Cancel
Save