From 949d4b8df84a1f7895c03375c50ce6ed77eac155 Mon Sep 17 00:00:00 2001
From: Andre Puschmann
Date: Wed, 25 Jan 2017 10:18:27 +0100
Subject: [PATCH] novolk: use unaligned load/store SSE intrinsics, allow debug builds

The lookup loops in srslte_rm_turbo_rx_lut_sse() and
srslte_vec_lut_sss_simd() no longer pass the loop counter as the lane
argument of _mm_extract_epi16(). Instead, lane j (or i) is moved into
lane 0 with _mm_shuffle_epi8() and lane 0 is extracted. The shuffle
result is kept in a temporary, and the control mask carries the byte
pair (2*j, 2*j+1) in every 16-bit lane so that both bytes of the
selected element end up in lane 0.

srslte_vec_sum_fff_simd() and srslte_vec_prod_ccc_simd() switch from
_mm_load_ps()/_mm_store_ps() to _mm_loadu_ps()/_mm_storeu_ps().

---
Blob ids on the "index" lines are those of the patch as first posted; the
added lines that call _mm_shuffle_epi8() were corrected afterwards.
Indentation and blank context lines were reconstructed; use
"git apply --ignore-whitespace" if the context does not match.

 srslte/lib/fec/rm_turbo.c      | 13 +++++++++----
 srslte/lib/utils/vector_simd.c | 18 ++++++++++--------
 2 files changed, 19 insertions(+), 12 deletions(-)

diff --git a/srslte/lib/fec/rm_turbo.c b/srslte/lib/fec/rm_turbo.c
index 5856a4e46..751c9fe0c 100644
--- a/srslte/lib/fec/rm_turbo.c
+++ b/srslte/lib/fec/rm_turbo.c
@@ -327,8 +327,11 @@ int srslte_rm_turbo_rx_lut_sse(int16_t *input, int16_t *output, uint32_t in_len,
         lutVal = _mm_loadu_si128(lutPtr);
 
         for (int j=0;j<8;j++) {
-          int16_t x = (int16_t) _mm_extract_epi16(xVal, j);
-          uint16_t l = (uint16_t) _mm_extract_epi16(lutVal, j);
+          // For -O0 builds: shuffle j-th element to pos 0 and extract from there
+          __m128i xSel = _mm_shuffle_epi8(xVal, _mm_set1_epi16((short) (((2*j+1) << 8) | (2*j))));
+          int16_t x = (int16_t) _mm_extract_epi16(xSel, 0);
+          __m128i lutSel = _mm_shuffle_epi8(lutVal, _mm_set1_epi16((short) (((2*j+1) << 8) | (2*j))));
+          uint16_t l = (uint16_t) _mm_extract_epi16(lutSel, 0);
           output[l] += x;
         }
         xPtr ++;
@@ -346,8 +349,10 @@ int srslte_rm_turbo_rx_lut_sse(int16_t *input, int16_t *output, uint32_t in_len,
         lutVal = _mm_loadu_si128(lutPtr);
 
         for (int j=0;j<8;j++) {
-          int16_t x = (int16_t) _mm_extract_epi16(xVal, j);
-          uint16_t l = (uint16_t) _mm_extract_epi16(lutVal, j);
+          __m128i xSel = _mm_shuffle_epi8(xVal, _mm_set1_epi16((short) (((2*j+1) << 8) | (2*j))));
+          int16_t x = (int16_t) _mm_extract_epi16(xSel, 0);
+          __m128i lutSel = _mm_shuffle_epi8(lutVal, _mm_set1_epi16((short) (((2*j+1) << 8) | (2*j))));
+          uint16_t l = (uint16_t) _mm_extract_epi16(lutSel, 0);
           output[l] += x;
         }
         xPtr++;
diff --git a/srslte/lib/utils/vector_simd.c b/srslte/lib/utils/vector_simd.c
index c150209ca..8d91c5f42 100644
--- a/srslte/lib/utils/vector_simd.c
+++ b/srslte/lib/utils/vector_simd.c
@@ -227,8 +227,10 @@ void srslte_vec_lut_sss_simd(short *x, unsigned short *lut, short *y, uint32_t l
     lutVal = _mm_load_si128(lutPtr);
 
     for (int i=0;i<8;i++) {
-      int16_t x = (int16_t) _mm_extract_epi16(xVal, i);
-      uint16_t l = (uint16_t) _mm_extract_epi16(lutVal, i);
+      __m128i xSel = _mm_shuffle_epi8(xVal, _mm_set1_epi16((short) (((2*i+1) << 8) | (2*i))));
+      int16_t x = (int16_t) _mm_extract_epi16(xSel, 0);
+      __m128i lutSel = _mm_shuffle_epi8(lutVal, _mm_set1_epi16((short) (((2*i+1) << 8) | (2*i))));
+      uint16_t l = (uint16_t) _mm_extract_epi16(lutSel, 0);
       y[l] = x;
     }
     xPtr ++;
@@ -295,12 +297,12 @@ void srslte_vec_sum_fff_simd(float *x, float *y, float *z, uint32_t len) {
   __m128 xVal, yVal, zVal;
   for(;number < points; number++){
 
-    xVal = _mm_load_ps(xPtr);
-    yVal = _mm_load_ps(yPtr);
+    xVal = _mm_loadu_ps(xPtr);
+    yVal = _mm_loadu_ps(yPtr);
 
     zVal = _mm_add_ps(xVal, yVal);
 
-    _mm_store_ps(zPtr, zVal);
+    _mm_storeu_ps(zPtr, zVal);
 
     xPtr += 4;
     yPtr += 4;
@@ -338,10 +340,10 @@ void srslte_vec_prod_ccc_simd(cf_t *x,cf_t *y, cf_t *z, uint32_t len)
   const float* yPtr = (const float*) y;
 
   for(; number < halfPoints; number++){
-    xVal = _mm_load_ps(xPtr);
-    yVal = _mm_load_ps(yPtr);
+    xVal = _mm_loadu_ps(xPtr);
+    yVal = _mm_loadu_ps(yPtr);
     zVal = _mm_complexmul_ps(xVal, yVal);
-    _mm_store_ps(zPtr, zVal);
+    _mm_storeu_ps(zPtr, zVal);
 
     xPtr += 4;
     yPtr += 4;