@@ -203,7 +203,7 @@ static inline simd_f_t srslte_simd_f_loadu(const float *ptr) {
#ifdef LV_HAVE_SSE
  return _mm_loadu_ps(ptr);
#else /* LV_HAVE_SSE */
#ifdef HAVE_NEON
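  /* NEON note: vld1q_f32 does not require 16-byte alignment, so the unaligned
     load wrapper can use the same intrinsic as the aligned one on this path. */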
  return vld1q_f32(ptr);
#endif /* HAVE_NEON */
#endif /* LV_HAVE_SSE */

@@ -396,7 +396,7 @@ static inline simd_f_t srslte_simd_f_swap(simd_f_t a) {
#ifdef LV_HAVE_AVX2
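  /* The immediate 0b10110001 (= _MM_SHUFFLE(2,3,0,1)) swaps every adjacent pair
     of 32-bit lanes, i.e. exchanges real and imaginary parts when the register
     holds interleaved complex floats. The SSE shuffle below uses the same mask. */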
  return _mm256_permute_ps(a, 0b10110001);
#else /* LV_HAVE_AVX2 */
#ifdef LV_HAVE_SSE
  return _mm_shuffle_ps(a, a, 0b10110001);
#else /* LV_HAVE_SSE */
#ifdef HAVE_NEON

@@ -428,7 +428,7 @@ static inline simd_f_t srslte_simd_f_hadd(simd_f_t a, simd_f_t b) {
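  /* _mm256_hadd_ps adds horizontally only within each 128-bit lane, so a and b
     are first recombined across lanes: b1 takes the high 128-bit halves of a and b,
     and a1 (presumably built just above this hunk) the low halves, so the result
     keeps the same ordering as the SSE path below. */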
  simd_f_t b1 = _mm256_permute2f128_ps(a, b, 0b00110001);
  return _mm256_hadd_ps(a1, b1);
#else /* LV_HAVE_AVX2 */
#ifdef LV_HAVE_SSE
  return _mm_hadd_ps(a, b);
#else /* LV_HAVE_SSE */
#ifdef HAVE_NEON

@@ -446,7 +446,7 @@ static inline simd_f_t srslte_simd_f_sqrt(simd_f_t a) {
#ifdef LV_HAVE_AVX2
  return _mm256_sqrt_ps(a);
#else /* LV_HAVE_AVX2 */
#ifdef LV_HAVE_SSE
  return _mm_sqrt_ps(a);
#else /* LV_HAVE_SSE */
#ifdef HAVE_NEON
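  /* The NEON branch is not shown in this hunk. ARMv7 NEON has no vector sqrt
     instruction, so that path presumably uses a vrsqrteq_f32 estimate refined
     with Newton-Raphson steps rather than a single intrinsic. */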

@@ -471,7 +471,7 @@ static inline simd_f_t srslte_simd_f_neg(simd_f_t a) {
#ifdef LV_HAVE_AVX2
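  /* -0.0f is the IEEE-754 pattern with only the sign bit set (0x80000000), so
     XOR-ing it into every lane flips the sign of each float. */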
  return _mm256_xor_ps(_mm256_set1_ps(-0.0f), a);
#else /* LV_HAVE_AVX2 */
#ifdef LV_HAVE_SSE
  return _mm_xor_ps(_mm_set1_ps(-0.0f), a);
#else /* LV_HAVE_SSE */
#ifdef HAVE_NEON

@@ -489,7 +489,7 @@ static inline simd_f_t srslte_simd_f_neg_mask(simd_f_t a, simd_f_t mask) {
#ifdef LV_HAVE_AVX2
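  /* Selective negation: lanes where mask carries the sign bit (e.g. -0.0f) are
     negated, lanes where the mask is +0.0f pass through unchanged. */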
  return _mm256_xor_ps(mask, a);
#else /* LV_HAVE_AVX2 */
#ifdef LV_HAVE_SSE
  return _mm_xor_ps(mask, a);
#else /* LV_HAVE_SSE */
#ifdef HAVE_NEON

@@ -500,6 +500,25 @@ static inline simd_f_t srslte_simd_f_neg_mask(simd_f_t a, simd_f_t mask) {
#endif /* LV_HAVE_AVX512 */
}

static inline simd_f_t srslte_simd_f_abs(simd_f_t a) {
#ifdef LV_HAVE_AVX512
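  /* ANDNOT with -0.0f clears the sign bit of every lane and leaves the other
     bits untouched, which yields the per-lane absolute value. */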
  return _mm512_andnot_ps(_mm512_set1_ps(-0.0f), a);
#else /* LV_HAVE_AVX512 */
#ifdef LV_HAVE_AVX2
  return _mm256_andnot_ps(_mm256_set1_ps(-0.0f), a);
#else /* LV_HAVE_AVX2 */
#ifdef LV_HAVE_SSE
  return _mm_andnot_ps(_mm_set1_ps(-0.0f), a);
#else /* LV_HAVE_SSE */
#ifdef HAVE_NEON
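  /* a is a float32x4_t here, so the float absolute-value intrinsic is needed;
     vqabsq_s32 (saturating integer abs) would not type-check on this operand. */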
  return vabsq_f32(a);
#endif /* HAVE_NEON */
#endif /* LV_HAVE_SSE */
#endif /* LV_HAVE_AVX2 */
#endif /* LV_HAVE_AVX512 */
}

#endif /* SRSLTE_SIMD_F_SIZE */

@@ -836,7 +855,7 @@ static inline simd_cf_t srslte_simd_cf_conjprod (simd_cf_t a, simd_cf_t b) {
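  /* Conjugate product a * conj(b):
       re = a.re*b.re + a.im*b.im
       im = a.im*b.re - a.re*b.im
     The first half of the ret.re expression presumably sits just above this hunk. */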
                      _mm_mul_ps(a.im, b.im));
  ret.im = _mm_sub_ps(_mm_mul_ps(a.im, b.re),
                      _mm_mul_ps(a.re, b.im));
#else
#ifdef HAVE_NEON
  ret.val[0] = vaddq_f32(vmulq_f32(a.val[0],b.val[0]),
                         vmulq_f32(a.val[1],b.val[1]));

@@ -883,7 +902,7 @@ static inline simd_cf_t srslte_simd_cf_sub (simd_cf_t a, simd_cf_t b) {
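  /* simd_cf_t keeps the real and imaginary parts in separate registers
     (split-complex layout), so complex subtraction is just two independent
     element-wise subtractions. */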
  ret.re = _mm256_sub_ps(a.re, b.re);
  ret.im = _mm256_sub_ps(a.im, b.im);
#else /* LV_HAVE_AVX2 */
#ifdef LV_HAVE_SSE
  ret.re = _mm_sub_ps(a.re, b.re);
  ret.im = _mm_sub_ps(a.im, b.im);
#else /* LV_HAVE_SSE */

@@ -942,7 +961,7 @@ static inline simd_cf_t srslte_simd_cf_rcp (simd_cf_t a) {
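  /* Complex reciprocal: 1/a = conj(a) / |a|^2. mod2 accumulates |a|^2 per lane,
     rcp (defined outside this context) is presumably 1/mod2, and neg_a_im holds
     the negated imaginary part, giving re = a.re/|a|^2 and im = -a.im/|a|^2. */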
  ret.re = _mm256_mul_ps(a.re, rcp);
  ret.im = _mm256_mul_ps(neg_a_im, rcp);
#else /* LV_HAVE_AVX2 */
#ifdef LV_HAVE_SSE
  simd_f_t a2re = _mm_mul_ps(a.re, a.re);
  simd_f_t a2im = _mm_mul_ps(a.im, a.im);
  simd_f_t mod2 = _mm_add_ps(a2re, a2im);

@@ -951,7 +970,7 @@ static inline simd_cf_t srslte_simd_cf_rcp (simd_cf_t a) {
  ret.re = _mm_mul_ps(a.re, rcp);
  ret.im = _mm_mul_ps(neg_a_im, rcp);
#else /* LV_HAVE_SSE */
#ifdef HAVE_NEON
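  /* On NEON the complex vector is a float32x4x2_t: val[0] holds the real parts
     and val[1] the imaginary parts, mirroring the .re/.im fields used above. */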
  simd_f_t a2re = vmulq_f32(a.val[0], a.val[0]);
  simd_f_t a2im = vmulq_f32(a.val[1], a.val[1]);
  simd_f_t mod2 = vaddq_f32(a2re, a2im);

@@ -1074,10 +1093,10 @@ static inline simd_i_t srslte_simd_i_load(int *x) {
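  /* Aligned integer load: _mm256_load_si256 expects x to be 32-byte aligned and
     _mm_load_si128 expects 16-byte alignment, while vld1q_s32 accepts any
     alignment on NEON. */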
#ifdef LV_HAVE_AVX2
  return _mm256_load_si256((__m256i*)x);
#else
#ifdef LV_HAVE_SSE
  return _mm_load_si128((__m128i*)x);
#else
#ifdef HAVE_NEON
  return vld1q_s32((int*)x);
#endif /* HAVE_NEON */
#endif /* LV_HAVE_SSE */

@@ -1110,10 +1129,10 @@ static inline simd_i_t srslte_simd_i_set1(int x) {
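  /* Broadcast: every 32-bit lane of the result is set to the scalar x
     (8 lanes with AVX2, 4 with SSE or NEON). */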
#ifdef LV_HAVE_AVX2
  return _mm256_set1_epi32(x);
#else
#ifdef LV_HAVE_SSE
  return _mm_set1_epi32(x);
#else
#ifdef HAVE_NEON
  return vdupq_n_s32(x);
#endif /* HAVE_NEON */
#endif /* LV_HAVE_SSE */

@@ -1146,7 +1165,7 @@ static inline simd_sel_t srslte_simd_f_max(simd_f_t a, simd_f_t b) {
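  /* Despite its name this returns a per-lane comparison mask (all ones where
     a > b, zero otherwise), intended as the selector for srslte_simd_i_select
     below; _CMP_GT_OS is the ordered, signaling greater-than predicate. */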
#ifdef LV_HAVE_AVX2
  return _mm256_cmp_ps(a, b, _CMP_GT_OS);
#else /* LV_HAVE_AVX2 */
#ifdef LV_HAVE_SSE
  return (simd_sel_t) _mm_cmpgt_ps(a, b);
#else /* LV_HAVE_SSE */
#ifdef HAVE_NEON

@@ -1164,7 +1183,7 @@ static inline simd_i_t srslte_simd_i_select(simd_i_t a, simd_i_t b, simd_sel_t selector) {
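  /* blendv semantics: for each 32-bit lane, take the element from b when the
     selector's sign bit is set (i.e. where the mask from srslte_simd_f_max is
     all ones), otherwise keep the element from a; the casts simply reuse the
     float blend instruction on integer data. */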
#ifdef LV_HAVE_AVX2
  return (__m256i) _mm256_blendv_ps((__m256) a, (__m256) b, selector);
#else
#ifdef LV_HAVE_SSE
  return (__m128i) _mm_blendv_ps((__m128) a, (__m128) b, selector);
#else /* LV_HAVE_SSE */
#ifdef HAVE_NEON // CURRENTLY USES GENERIC IMPLEMENTATION FOR NEON