novolk: use unaligned load/store SSE intrinsics, allow debug builds

master
Andre Puschmann 8 years ago
parent fbe2be98a5
commit 949d4b8df8

@ -327,8 +327,11 @@ int srslte_rm_turbo_rx_lut_sse(int16_t *input, int16_t *output, uint32_t in_len,
lutVal = _mm_loadu_si128(lutPtr);
for (int j=0;j<8;j++) {
int16_t x = (int16_t) _mm_extract_epi16(xVal, j);
uint16_t l = (uint16_t) _mm_extract_epi16(lutVal, j);
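// For -O0 builds: _mm_extract_epi16 needs a compile-time constant index, so the intent is to shuffle the j-th element to pos 0 and extract from there. NOTE(review): the _mm_shuffle_epi8 return values are not assigned, so xVal/lutVal are unchanged and element 0 is extracted on every iteration -- confirm and fix.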
_mm_shuffle_epi8(xVal,_mm_set1_epi8(j));
int16_t x = (int16_t) _mm_extract_epi16(xVal, 0);
_mm_shuffle_epi8(lutVal,_mm_set1_epi8(j));
uint16_t l = (uint16_t) _mm_extract_epi16(lutVal, 0);
output[l] += x;
}
xPtr ++;
@ -346,8 +349,10 @@ int srslte_rm_turbo_rx_lut_sse(int16_t *input, int16_t *output, uint32_t in_len,
lutVal = _mm_loadu_si128(lutPtr);
for (int j=0;j<8;j++) {
int16_t x = (int16_t) _mm_extract_epi16(xVal, j);
uint16_t l = (uint16_t) _mm_extract_epi16(lutVal, j);
_mm_shuffle_epi8(xVal,_mm_set1_epi8(j));
int16_t x = (int16_t) _mm_extract_epi16(xVal, 0);
_mm_shuffle_epi8(lutVal,_mm_set1_epi8(j));
uint16_t l = (uint16_t) _mm_extract_epi16(lutVal, 0);
output[l] += x;
}
xPtr++;

@ -227,8 +227,10 @@ void srslte_vec_lut_sss_simd(short *x, unsigned short *lut, short *y, uint32_t l
lutVal = _mm_load_si128(lutPtr);
for (int i=0;i<8;i++) {
int16_t x = (int16_t) _mm_extract_epi16(xVal, i);
uint16_t l = (uint16_t) _mm_extract_epi16(lutVal, i);
_mm_shuffle_epi8(xVal,_mm_set1_epi8(i));
int16_t x = (int16_t) _mm_extract_epi16(xVal, 0);
_mm_shuffle_epi8(lutVal,_mm_set1_epi8(i));
uint16_t l = (uint16_t) _mm_extract_epi16(lutVal, 0);
y[l] = x;
}
xPtr ++;
@ -295,12 +297,12 @@ void srslte_vec_sum_fff_simd(float *x, float *y, float *z, uint32_t len) {
__m128 xVal, yVal, zVal;
for(;number < points; number++){
xVal = _mm_load_ps(xPtr);
yVal = _mm_load_ps(yPtr);
xVal = _mm_loadu_ps(xPtr);
yVal = _mm_loadu_ps(yPtr);
zVal = _mm_add_ps(xVal, yVal);
_mm_store_ps(zPtr, zVal);
_mm_storeu_ps(zPtr, zVal);
xPtr += 4;
yPtr += 4;
@ -338,10 +340,10 @@ void srslte_vec_prod_ccc_simd(cf_t *x,cf_t *y, cf_t *z, uint32_t len)
const float* yPtr = (const float*) y;
for(; number < halfPoints; number++){
xVal = _mm_load_ps(xPtr);
yVal = _mm_load_ps(yPtr);
xVal = _mm_loadu_ps(xPtr);
yVal = _mm_loadu_ps(yPtr);
zVal = _mm_complexmul_ps(xVal, yVal);
_mm_store_ps(zPtr, zVal);
_mm_storeu_ps(zPtr, zVal);
xPtr += 4;
yPtr += 4;

Loading…
Cancel
Save