diff --git a/lib/include/srslte/phy/utils/simd.h b/lib/include/srslte/phy/utils/simd.h index a7809136e..2a7566e18 100644 --- a/lib/include/srslte/phy/utils/simd.h +++ b/lib/include/srslte/phy/utils/simd.h @@ -203,7 +203,7 @@ static inline simd_f_t srslte_simd_f_loadu(const float *ptr) { #ifdef LV_HAVE_SSE return _mm_loadu_ps(ptr); #else /* LV_HAVE_SSE */ - #ifdef HAVE_NEON +#ifdef HAVE_NEON return vld1q_f32(ptr); #endif /* HAVE_NEON */ #endif /* LV_HAVE_SSE */ @@ -396,7 +396,7 @@ static inline simd_f_t srslte_simd_f_swap(simd_f_t a) { #ifdef LV_HAVE_AVX2 return _mm256_permute_ps(a, 0b10110001); #else /* LV_HAVE_AVX2 */ - #ifdef LV_HAVE_SSE +#ifdef LV_HAVE_SSE return _mm_shuffle_ps(a, a, 0b10110001); #else /* LV_HAVE_SSE */ #ifdef HAVE_NEON @@ -428,7 +428,7 @@ static inline simd_f_t srslte_simd_f_hadd(simd_f_t a, simd_f_t b) { simd_f_t b1 = _mm256_permute2f128_ps(a, b, 0b00110001); return _mm256_hadd_ps(a1, b1); #else /* LV_HAVE_AVX2 */ - #ifdef LV_HAVE_SSE +#ifdef LV_HAVE_SSE return _mm_hadd_ps(a, b); #else /* LV_HAVE_SSE */ #ifdef HAVE_NEON @@ -446,7 +446,7 @@ static inline simd_f_t srslte_simd_f_sqrt(simd_f_t a) { #ifdef LV_HAVE_AVX2 return _mm256_sqrt_ps(a); #else /* LV_HAVE_AVX2 */ - #ifdef LV_HAVE_SSE +#ifdef LV_HAVE_SSE return _mm_sqrt_ps(a); #else /* LV_HAVE_SSE */ #ifdef HAVE_NEON @@ -471,7 +471,7 @@ static inline simd_f_t srslte_simd_f_neg(simd_f_t a) { #ifdef LV_HAVE_AVX2 return _mm256_xor_ps(_mm256_set1_ps(-0.0f), a); #else /* LV_HAVE_AVX2 */ - #ifdef LV_HAVE_SSE +#ifdef LV_HAVE_SSE return _mm_xor_ps(_mm_set1_ps(-0.0f), a); #else /* LV_HAVE_SSE */ #ifdef HAVE_NEON @@ -489,7 +489,7 @@ static inline simd_f_t srslte_simd_f_neg_mask(simd_f_t a, simd_f_t mask) { #ifdef LV_HAVE_AVX2 return _mm256_xor_ps(mask, a); #else /* LV_HAVE_AVX2 */ - #ifdef LV_HAVE_SSE +#ifdef LV_HAVE_SSE return _mm_xor_ps(mask, a); #else /* LV_HAVE_SSE */ #ifdef HAVE_NEON @@ -500,6 +500,25 @@ static inline simd_f_t srslte_simd_f_neg_mask(simd_f_t a, simd_f_t mask) { #endif /* LV_HAVE_AVX512 */ } +static inline simd_f_t srslte_simd_f_abs(simd_f_t a) { +#ifdef LV_HAVE_AVX512 + return _mm512_andnot_ps(_mm512_set1_ps(-0.0f), a); +#else /* LV_HAVE_AVX512 */ +#ifdef LV_HAVE_AVX2 + return _mm256_andnot_ps(_mm256_set1_ps(-0.0f), a); +#else /* LV_HAVE_AVX2 */ +#ifdef LV_HAVE_SSE + return _mm_andnot_ps(_mm_set1_ps(-0.0f), a); +#else /* LV_HAVE_SSE */ +#ifdef HAVE_NEON + return vqabsq_s32(a); +#endif /* HAVE_NEON */ +#endif /* LV_HAVE_SSE */ +#endif /* LV_HAVE_AVX2 */ +#endif /* LV_HAVE_AVX512 */ +} + + #endif /* SRSLTE_SIMD_F_SIZE */ @@ -836,7 +855,7 @@ static inline simd_cf_t srslte_simd_cf_conjprod (simd_cf_t a, simd_cf_t b) { _mm_mul_ps(a.im, b.im)); ret.im = _mm_sub_ps(_mm_mul_ps(a.im, b.re), _mm_mul_ps(a.re, b.im)); - #else +#else #ifdef HAVE_NEON ret.val[0] = vaddq_f32(vmulq_f32(a.val[0],b.val[0]), vmulq_f32(a.val[1],b.val[1])); @@ -883,7 +902,7 @@ static inline simd_cf_t srslte_simd_cf_sub (simd_cf_t a, simd_cf_t b) { ret.re = _mm256_sub_ps(a.re, b.re); ret.im = _mm256_sub_ps(a.im, b.im); #else /* LV_HAVE_AVX2 */ - #ifdef LV_HAVE_SSE +#ifdef LV_HAVE_SSE ret.re = _mm_sub_ps(a.re, b.re); ret.im = _mm_sub_ps(a.im, b.im); #else /* LV_HAVE_SSE */ @@ -942,7 +961,7 @@ static inline simd_cf_t srslte_simd_cf_rcp (simd_cf_t a) { ret.re = _mm256_mul_ps(a.re, rcp); ret.im = _mm256_mul_ps(neg_a_im, rcp); #else /* LV_HAVE_AVX2 */ - #ifdef LV_HAVE_SSE +#ifdef LV_HAVE_SSE simd_f_t a2re = _mm_mul_ps(a.re, a.re); simd_f_t a2im = _mm_mul_ps(a.im, a.im); simd_f_t mod2 = _mm_add_ps(a2re, a2im); @@ -951,7 +970,7 @@ static inline simd_cf_t srslte_simd_cf_rcp (simd_cf_t a) { ret.re = _mm_mul_ps(a.re, rcp); ret.im = _mm_mul_ps(neg_a_im, rcp); #else /* LV_HAVE_SSE */ - #ifdef HAVE_NEON +#ifdef HAVE_NEON simd_f_t a2re = vmulq_f32(a.val[0], a.val[0]); simd_f_t a2im = vmulq_f32(a.val[1], a.val[1]); simd_f_t mod2 = vaddq_f32(a2re, a2im); @@ -1074,10 +1093,10 @@ static inline simd_i_t srslte_simd_i_load(int *x) { #ifdef LV_HAVE_AVX2 return _mm256_load_si256((__m256i*)x); #else - #ifdef LV_HAVE_SSE +#ifdef LV_HAVE_SSE return _mm_load_si128((__m128i*)x); #else - #ifdef HAVE_NEON +#ifdef HAVE_NEON return vld1q_s32((int*)x); #endif /* HAVE_NEON */ #endif /* LV_HAVE_SSE */ @@ -1110,10 +1129,10 @@ static inline simd_i_t srslte_simd_i_set1(int x) { #ifdef LV_HAVE_AVX2 return _mm256_set1_epi32(x); #else - #ifdef LV_HAVE_SSE +#ifdef LV_HAVE_SSE return _mm_set1_epi32(x); #else - #ifdef HAVE_NEON +#ifdef HAVE_NEON return vdupq_n_s32(x); #endif /* HAVE_NEON */ #endif /* LV_HAVE_SSE */ @@ -1146,7 +1165,7 @@ static inline simd_sel_t srslte_simd_f_max(simd_f_t a, simd_f_t b) { #ifdef LV_HAVE_AVX2 return _mm256_cmp_ps(a, b, _CMP_GT_OS); #else /* LV_HAVE_AVX2 */ - #ifdef LV_HAVE_SSE +#ifdef LV_HAVE_SSE return (simd_sel_t) _mm_cmpgt_ps(a, b); #else /* LV_HAVE_SSE */ #ifdef HAVE_NEON @@ -1164,7 +1183,7 @@ static inline simd_i_t srslte_simd_i_select(simd_i_t a, simd_i_t b, simd_sel_t s #ifdef LV_HAVE_AVX2 return (__m256i) _mm256_blendv_ps((__m256) a,(__m256) b, selector); #else - #ifdef LV_HAVE_SSE +#ifdef LV_HAVE_SSE return (__m128i) _mm_blendv_ps((__m128)a, (__m128)b, selector); #else /* LV_HAVE_SSE */ #ifdef HAVE_NEON // CURRENTLY USES GENERIC IMPLEMENTATION FOR NEON diff --git a/lib/include/srslte/phy/utils/vector_simd.h b/lib/include/srslte/phy/utils/vector_simd.h index fae19aa02..68ddbdee0 100644 --- a/lib/include/srslte/phy/utils/vector_simd.h +++ b/lib/include/srslte/phy/utils/vector_simd.h @@ -136,6 +136,8 @@ SRSLTE_API void srslte_vec_apply_cfo_simd(const cf_t *x, float cfo, cf_t *z, int /* SIMD Find Max functions */ SRSLTE_API uint32_t srslte_vec_max_fi_simd(const float *x, const int len); +SRSLTE_API uint32_t srslte_vec_max_abs_fi_simd(const float *x, const int len); + SRSLTE_API uint32_t srslte_vec_max_ci_simd(const cf_t *x, const int len); #ifdef __cplusplus diff --git a/lib/src/phy/utils/test/vector_test.c b/lib/src/phy/utils/test/vector_test.c index 30f80c07d..85b1c3fb9 100644 --- a/lib/src/phy/utils/test/vector_test.c +++ b/lib/src/phy/utils/test/vector_test.c @@ -48,8 +48,7 @@ bool verbose = false; #define MAX_FUNCTIONS (64) #define MAX_BLOCKS (16) - -#define RANDOM_F() ((float)rand())/((float)RAND_MAX) +#define RANDOM_F() (((float) rand()) / ((float) RAND_MAX) * 2.0f - 1.0f) #define RANDOM_S() ((int16_t)(rand() & 0x800F)) #define RANDOM_B() ((int8_t)(rand() & 0x8008)) #define RANDOM_CF() (RANDOM_F() + _Complex_I*RANDOM_F()) @@ -705,6 +704,29 @@ TEST(srslte_vec_max_fi, free(x); ) +TEST(srslte_vec_max_abs_fi, + MALLOC(float, x); + + for (int i = 0; i < block_size; i++) { + x[i] = RANDOM_F(); + } + + uint32_t max_index = 0; + TEST_CALL(max_index = srslte_vec_max_abs_fi(x, block_size);) + + float gold_value = -INFINITY; + uint32_t gold_index = 0; + for (int i = 0; i < block_size; i++) { + if (gold_value < fabsf(x[i])) { + gold_value = fabsf(x[i]); + gold_index = i; + } + } + mse = (gold_index != max_index) ? 1:0; + + free(x); +) + TEST(srslte_vec_max_abs_ci, MALLOC(cf_t, x); @@ -899,6 +921,9 @@ int main(int argc, char **argv) { passed[func_count][size_count] = test_srslte_vec_max_fi(func_names[func_count], &timmings[func_count][size_count], block_size); func_count++; + passed[func_count][size_count] = test_srslte_vec_max_abs_fi(func_names[func_count], &timmings[func_count][size_count], block_size); + func_count++; + passed[func_count][size_count] = test_srslte_vec_max_abs_ci(func_names[func_count], &timmings[func_count][size_count], block_size); func_count++; diff --git a/lib/src/phy/utils/vector.c b/lib/src/phy/utils/vector.c index ff63b09ce..9c1b84fdb 100644 --- a/lib/src/phy/utils/vector.c +++ b/lib/src/phy/utils/vector.c @@ -369,7 +369,7 @@ uint32_t srslte_vec_max_fi(const float *x, const uint32_t len) { } uint32_t srslte_vec_max_abs_fi(const float *x, const uint32_t len) { - return srslte_vec_max_fi_simd(x, len); + return srslte_vec_max_abs_fi_simd(x, len); } // CP autocorr diff --git a/lib/src/phy/utils/vector_simd.c b/lib/src/phy/utils/vector_simd.c index 2835a58cf..c99f2910c 100644 --- a/lib/src/phy/utils/vector_simd.c +++ b/lib/src/phy/utils/vector_simd.c @@ -1092,6 +1092,63 @@ uint32_t srslte_vec_max_fi_simd(const float *x, const int len) { return max_index; } +uint32_t srslte_vec_max_abs_fi_simd(const float *x, const int len) { + int i = 0; + + float max_value = -INFINITY; + uint32_t max_index = 0; + +#if SRSLTE_SIMD_I_SIZE + __attribute__ ((aligned (SRSLTE_SIMD_I_SIZE*sizeof(int)))) int indexes_buffer[SRSLTE_SIMD_I_SIZE] = {0}; + __attribute__ ((aligned (SRSLTE_SIMD_I_SIZE*sizeof(float)))) float values_buffer[SRSLTE_SIMD_I_SIZE] = {0}; + + for (int k = 0; k < SRSLTE_SIMD_I_SIZE; k++) indexes_buffer[k] = k; + simd_i_t simd_inc = srslte_simd_i_set1(SRSLTE_SIMD_I_SIZE); + simd_i_t simd_indexes = srslte_simd_i_load(indexes_buffer); + simd_i_t simd_max_indexes = srslte_simd_i_set1(0); + + simd_f_t simd_max_values = srslte_simd_f_set1(-INFINITY); + + if (SRSLTE_IS_ALIGNED(x)) { + for (; i < len - SRSLTE_SIMD_I_SIZE + 1; i += SRSLTE_SIMD_I_SIZE) { + simd_f_t a = srslte_simd_f_abs(srslte_simd_f_load(&x[i])); + simd_sel_t res = srslte_simd_f_max(a, simd_max_values); + simd_max_indexes = srslte_simd_i_select(simd_max_indexes, simd_indexes, res); + simd_max_values = (simd_f_t) srslte_simd_i_select((simd_i_t) simd_max_values, (simd_i_t) a, res); + simd_indexes = srslte_simd_i_add(simd_indexes, simd_inc); + } + } else { + for (; i < len - SRSLTE_SIMD_I_SIZE + 1; i += SRSLTE_SIMD_I_SIZE) { + simd_f_t a = srslte_simd_f_abs(srslte_simd_f_loadu(&x[i])); + simd_sel_t res = srslte_simd_f_max(a, simd_max_values); + simd_max_indexes = srslte_simd_i_select(simd_max_indexes, simd_indexes, res); + simd_max_values = (simd_f_t) srslte_simd_i_select((simd_i_t) simd_max_values, (simd_i_t) a, res); + simd_indexes = srslte_simd_i_add(simd_indexes, simd_inc); + } + } + + srslte_simd_i_store(indexes_buffer, simd_max_indexes); + srslte_simd_f_store(values_buffer, simd_max_values); + + for (int k = 0; k < SRSLTE_SIMD_I_SIZE; k++) { + if (values_buffer[k] > max_value) { + max_value = values_buffer[k]; + max_index = (uint32_t) indexes_buffer[k]; + } + } +#endif /* SRSLTE_SIMD_I_SIZE */ + + for (; i < len; i++) { + float a = fabsf(x[i]); + if (a > max_value) { + max_value = a; + max_index = (uint32_t)i; + } + } + + return max_index; +} + uint32_t srslte_vec_max_ci_simd(const cf_t *x, const int len) { int i = 0;