Fixed a bug in sub_sse() and added a couple of AVX functions

master
Ismael Gomez 8 years ago
parent 0dae4a00c4
commit f629e10fcf

@ -49,8 +49,12 @@ SRSLTE_API void srslte_vec_sub_sss_avx2(short *x, short *y, short *z, uint32_t l
SRSLTE_API void srslte_vec_sum_fff_sse(float *x, float *y, float *z, uint32_t len); SRSLTE_API void srslte_vec_sum_fff_sse(float *x, float *y, float *z, uint32_t len);
SRSLTE_API void srslte_vec_sum_fff_avx(float *x, float *y, float *z, uint32_t len);
SRSLTE_API void srslte_vec_sub_fff_sse(float *x, float *y, float *z, uint32_t len); SRSLTE_API void srslte_vec_sub_fff_sse(float *x, float *y, float *z, uint32_t len);
SRSLTE_API void srslte_vec_sub_fff_avx(float *x, float *y, float *z, uint32_t len);
SRSLTE_API void srslte_vec_sc_prod_fff_sse(float *x, float h, float *z, uint32_t len); SRSLTE_API void srslte_vec_sc_prod_fff_sse(float *x, float h, float *z, uint32_t len);
SRSLTE_API void srslte_vec_sc_prod_ccc_sse(cf_t *x, cf_t h, cf_t *z, uint32_t len); SRSLTE_API void srslte_vec_sc_prod_ccc_sse(cf_t *x, cf_t h, cf_t *z, uint32_t len);

@ -101,9 +101,13 @@ void srslte_vec_sub_fff(float *x, float *y, float *z, uint32_t len) {
for (i=0;i<len;i++) { for (i=0;i<len;i++) {
z[i] = x[i]-y[i]; z[i] = x[i]-y[i];
} }
#else
#ifdef LV_HAVE_AVX
srslte_vec_sub_fff_avx(x, y, z, len);
#else #else
srslte_vec_sub_fff_sse(x, y, z, len); srslte_vec_sub_fff_sse(x, y, z, len);
#endif #endif
#endif
} }
void srslte_vec_sub_sss(short *x, short *y, short *z, uint32_t len) { void srslte_vec_sub_sss(short *x, short *y, short *z, uint32_t len) {
@ -134,7 +138,11 @@ void srslte_vec_sum_fff(float *x, float *y, float *z, uint32_t len) {
z[i] = x[i]+y[i]; z[i] = x[i]+y[i];
} }
#else #else
srslte_vec_sum_fff_sse(x, y, z, len); #ifdef LV_HAVE_AVX
srslte_vec_sum_fff_avx(x, y, z, len);
#else
srslte_vec_sum_fff_sse(x, y, z, len);
#endif
#endif #endif
} }

@ -501,6 +501,36 @@ void srslte_vec_sum_fff_sse(float *x, float *y, float *z, uint32_t len) {
#endif #endif
} }
/**
 * Element-wise sum of two float vectors: z[i] = x[i] + y[i] for i in [0, len).
 *
 * When LV_HAVE_AVX is defined, complete groups of 8 floats are processed with
 * unaligned 256-bit loads/stores, so the buffers need no particular alignment.
 * The remaining elements -- or every element when AVX support is not compiled
 * in -- are handled by a plain scalar loop, so the output is always written.
 *
 * @param x   first input vector (at least len floats are read)
 * @param y   second input vector (at least len floats are read)
 * @param z   output vector (len floats are written)
 * @param len number of elements to process; 0 is a no-op
 */
void srslte_vec_sum_fff_avx(float *x, float *y, float *z, uint32_t len) {
  uint32_t i = 0;
#ifdef LV_HAVE_AVX
  /* Largest multiple of 8 that does not exceed len. */
  const uint32_t vec_len = len - (len % 8);
  for (; i < vec_len; i += 8) {
    const __m256 a = _mm256_loadu_ps(&x[i]);
    const __m256 b = _mm256_loadu_ps(&y[i]);
    _mm256_storeu_ps(&z[i], _mm256_add_ps(a, b));
  }
#endif
  /* Scalar tail; also the complete fallback when AVX is unavailable. */
  for (; i < len; i++) {
    z[i] = x[i] + y[i];
  }
}
void srslte_vec_sub_fff_sse(float *x, float *y, float *z, uint32_t len) { void srslte_vec_sub_fff_sse(float *x, float *y, float *z, uint32_t len) {
#ifdef LV_HAVE_SSE #ifdef LV_HAVE_SSE
unsigned int number = 0; unsigned int number = 0;
@ -525,14 +555,43 @@ void srslte_vec_sub_fff_sse(float *x, float *y, float *z, uint32_t len) {
zPtr += 4; zPtr += 4;
} }
number = points * 4; for(number = points * 4;number < len; number++){
for(;number < len; number++){ z[number] = x[number] - y[number];
z[number] = x[number] + y[number];
} }
#endif #endif
} }
/**
 * Element-wise difference of two float vectors: z[i] = x[i] - y[i] for
 * i in [0, len).
 *
 * The vectorised path uses 256-bit AVX intrinsics and is therefore guarded by
 * LV_HAVE_AVX (not LV_HAVE_SSE): an SSE-only build must not try to compile
 * __m256 code. Complete groups of 8 floats go through unaligned AVX
 * loads/stores; the remaining elements -- or every element when AVX support
 * is not compiled in -- are handled by a plain scalar loop, so the output is
 * always written.
 *
 * @param x   minuend vector (at least len floats are read)
 * @param y   subtrahend vector (at least len floats are read)
 * @param z   output vector (len floats are written)
 * @param len number of elements to process; 0 is a no-op
 */
void srslte_vec_sub_fff_avx(float *x, float *y, float *z, uint32_t len) {
  uint32_t i = 0;
#ifdef LV_HAVE_AVX
  /* Largest multiple of 8 that does not exceed len. */
  const uint32_t vec_len = len - (len % 8);
  for (; i < vec_len; i += 8) {
    const __m256 a = _mm256_loadu_ps(&x[i]);
    const __m256 b = _mm256_loadu_ps(&y[i]);
    _mm256_storeu_ps(&z[i], _mm256_sub_ps(a, b));
  }
#endif
  /* Scalar tail; also the complete fallback when AVX is unavailable. */
  for (; i < len; i++) {
    z[i] = x[i] - y[i];
  }
}
#ifdef LV_HAVE_SSE #ifdef LV_HAVE_SSE
static inline __m128 _mm_complexmul_ps(__m128 x, __m128 y) { static inline __m128 _mm_complexmul_ps(__m128 x, __m128 y) {
__m128 yl, yh, tmp1, tmp2; __m128 yl, yh, tmp1, tmp2;

Loading…
Cancel
Save