Added more functions

Branch: master
Author: Xavier Arteaga, 7 years ago
parent c41ad5453c
commit 9e5f999666

@@ -88,6 +88,8 @@
 #define SRSLTE_SIMD_F_SIZE 16
 #define SRSLTE_SIMD_CF_SIZE 16
+
+#define SRSLTE_SIMD_I_SIZE 16
 #define SRSLTE_SIMD_S_SIZE 32
 #define SRSLTE_SIMD_C16_SIZE 0
@@ -97,6 +99,8 @@
 #define SRSLTE_SIMD_F_SIZE 8
 #define SRSLTE_SIMD_CF_SIZE 8
+
+#define SRSLTE_SIMD_I_SIZE 8
 #define SRSLTE_SIMD_S_SIZE 16
 #define SRSLTE_SIMD_C16_SIZE 16
@@ -106,6 +110,8 @@
 #define SRSLTE_SIMD_F_SIZE 4
 #define SRSLTE_SIMD_CF_SIZE 4
+
+#define SRSLTE_SIMD_I_SIZE 4
 #define SRSLTE_SIMD_S_SIZE 8
 #define SRSLTE_SIMD_C16_SIZE 8
@@ -114,6 +120,8 @@
 #define SRSLTE_SIMD_F_SIZE 0
 #define SRSLTE_SIMD_CF_SIZE 0
+
+#define SRSLTE_SIMD_I_SIZE 0
 #define SRSLTE_SIMD_S_SIZE 0
 #define SRSLTE_SIMD_C16_SIZE 0
@@ -223,6 +231,20 @@ static inline simd_f_t srslte_simd_f_mul(simd_f_t a, simd_f_t b) {
 #endif /* LV_HAVE_AVX512 */
 }
 
+static inline simd_f_t srslte_simd_f_rcp(simd_f_t a) {
+#ifdef LV_HAVE_AVX512
+  return _mm512_rcp14_ps(a);
+#else /* LV_HAVE_AVX512 */
+#ifdef LV_HAVE_AVX2
+  return _mm256_rcp_ps(a);
+#else /* LV_HAVE_AVX2 */
+#ifdef LV_HAVE_SSE
+  return _mm_rcp_ps(a);
+#endif /* LV_HAVE_SSE */
+#endif /* LV_HAVE_AVX2 */
+#endif /* LV_HAVE_AVX512 */
+}
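
The rcp intrinsics return an approximation (about 12 mantissa bits for the SSE/AVX variants, 2^-14 relative error for _mm512_rcp14_ps), not an exact 1/a. Where full float precision matters, one Newton-Raphson step recovers it. A minimal sketch, not part of this commit, assuming the existing srslte_simd_f_set1 and srslte_simd_f_sub wrappers:

/* Illustrative only: refine r0 ~= 1/a to ~23 bits with r1 = r0*(2 - a*r0). */
static inline simd_f_t srslte_simd_f_rcp_refined(simd_f_t a) {
  simd_f_t r0  = srslte_simd_f_rcp(a);
  simd_f_t two = srslte_simd_f_set1(2.0f);
  return srslte_simd_f_mul(r0, srslte_simd_f_sub(two, srslte_simd_f_mul(a, r0)));
}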
 static inline simd_f_t srslte_simd_f_addsub(simd_f_t a, simd_f_t b) {
 #ifdef LV_HAVE_AVX512
   __m512 r = _mm512_add_ps(a, b);
@@ -600,6 +622,61 @@ static inline simd_cf_t srslte_simd_cf_add (simd_cf_t a, simd_cf_t b) {
   return ret;
 }
 
+static inline simd_cf_t srslte_simd_cf_mul (simd_cf_t a, simd_f_t b) {
+  simd_cf_t ret;
+#ifdef LV_HAVE_AVX512
+  b = _mm512_permutexvar_ps(_mm512_setr_epi32(0,4,1,5,2,6,3,7,8,12,9,13,10,14,11,15), b);
+  ret.re = _mm512_mul_ps(a.re, b);
+  ret.im = _mm512_mul_ps(a.im, b);
+#else /* LV_HAVE_AVX512 */
+#ifdef LV_HAVE_AVX2
+  b = _mm256_permutevar8x32_ps(b, _mm256_setr_epi32(0,4,1,5,2,6,3,7));
+  ret.re = _mm256_mul_ps(a.re, b);
+  ret.im = _mm256_mul_ps(a.im, b);
+#else /* LV_HAVE_AVX2 */
+#ifdef LV_HAVE_SSE
+  ret.re = _mm_mul_ps(a.re, b);
+  ret.im = _mm_mul_ps(a.im, b);
+#endif /* LV_HAVE_SSE */
+#endif /* LV_HAVE_AVX2 */
+#endif /* LV_HAVE_AVX512 */
+  return ret;
+}
+
+static inline simd_cf_t srslte_simd_cf_rcp (simd_cf_t a) {
+  simd_cf_t ret;
+#ifdef LV_HAVE_AVX512
+  simd_f_t a2re = _mm512_mul_ps(a.re, a.re);
+  simd_f_t a2im = _mm512_mul_ps(a.im, a.im);
+  simd_f_t mod2 = _mm512_add_ps(a2re, a2im);
+  simd_f_t rcp = _mm512_rcp14_ps(mod2);
+  simd_f_t neg_a_im = _mm512_xor_ps(_mm512_set1_ps(-0.0f), a.im);
+  ret.re = _mm512_mul_ps(a.re, rcp);
+  ret.im = _mm512_mul_ps(neg_a_im, rcp);
+#else /* LV_HAVE_AVX512 */
+#ifdef LV_HAVE_AVX2
+  simd_f_t a2re = _mm256_mul_ps(a.re, a.re);
+  simd_f_t a2im = _mm256_mul_ps(a.im, a.im);
+  simd_f_t mod2 = _mm256_add_ps(a2re, a2im);
+  simd_f_t rcp = _mm256_rcp_ps(mod2);
+  simd_f_t neg_a_im = _mm256_xor_ps(_mm256_set1_ps(-0.0f), a.im);
+  ret.re = _mm256_mul_ps(a.re, rcp);
+  ret.im = _mm256_mul_ps(neg_a_im, rcp);
+#else /* LV_HAVE_AVX2 */
+#ifdef LV_HAVE_SSE
+  simd_f_t a2re = _mm_mul_ps(a.re, a.re);
+  simd_f_t a2im = _mm_mul_ps(a.im, a.im);
+  simd_f_t mod2 = _mm_add_ps(a2re, a2im);
+  simd_f_t rcp = _mm_rcp_ps(mod2);
+  simd_f_t neg_a_im = _mm_xor_ps(_mm_set1_ps(-0.0f), a.im);
+  ret.re = _mm_mul_ps(a.re, rcp);
+  ret.im = _mm_mul_ps(neg_a_im, rcp);
+#endif /* LV_HAVE_SSE */
+#endif /* LV_HAVE_AVX2 */
+#endif /* LV_HAVE_AVX512 */
+  return ret;
+}
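
The complex reciprocal above uses the identity 1/a = conj(a)/|a|^2, with the division by |a|^2 itself replaced by the approximate rcp intrinsic. A scalar reference, illustrative only (requires <complex.h>; cf_t is srsLTE's float complex):

/* Sketch: what one lane of srslte_simd_cf_rcp computes. */
static inline cf_t cf_rcp_ref(cf_t a) {
  float mod2 = __real__ a * __real__ a + __imag__ a * __imag__ a;
  float rcp  = 1.0f / mod2;            /* the SIMD code approximates this */
  return (__real__ a * rcp) - I * (__imag__ a * rcp);
}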
 static inline simd_cf_t srslte_simd_cf_zero (void) {
   simd_cf_t ret;
 #ifdef LV_HAVE_AVX512
@@ -621,6 +698,106 @@ static inline simd_cf_t srslte_simd_cf_zero (void) {
 #endif /* SRSLTE_SIMD_CF_SIZE */
 
+#if SRSLTE_SIMD_I_SIZE
+
+#ifdef LV_HAVE_AVX512
+typedef __m512i simd_i_t;
+#else /* LV_HAVE_AVX512 */
+#ifdef LV_HAVE_AVX2
+typedef __m256i simd_i_t;
+#else /* LV_HAVE_AVX2 */
+#ifdef LV_HAVE_SSE
+typedef __m128i simd_i_t;
+#endif /* LV_HAVE_SSE */
+#endif /* LV_HAVE_AVX2 */
+#endif /* LV_HAVE_AVX512 */
+
+static inline simd_i_t srslte_simd_i_load(int *x) {
+#ifdef LV_HAVE_AVX512
+  return _mm512_load_epi32((__m512i*)x);
+#else /* LV_HAVE_AVX512 */
+#ifdef LV_HAVE_AVX2
+  return _mm256_load_si256((__m256i*)x);
+#else /* LV_HAVE_AVX2 */
+#ifdef LV_HAVE_SSE
+  return _mm_load_si128((__m128i*)x);
+#endif /* LV_HAVE_SSE */
+#endif /* LV_HAVE_AVX2 */
+#endif /* LV_HAVE_AVX512 */
+}
+
+static inline void srslte_simd_i_store(int *x, simd_i_t reg) {
+#ifdef LV_HAVE_AVX512
+  _mm512_store_epi32((__m512i*)x, reg);
+#else /* LV_HAVE_AVX512 */
+#ifdef LV_HAVE_AVX2
+  _mm256_store_si256((__m256i*)x, reg);
+#else /* LV_HAVE_AVX2 */
+#ifdef LV_HAVE_SSE
+  _mm_store_si128((__m128i*)x, reg);
+#endif /* LV_HAVE_SSE */
+#endif /* LV_HAVE_AVX2 */
+#endif /* LV_HAVE_AVX512 */
+}
+
+static inline simd_i_t srslte_simd_i_set1(int x) {
+#ifdef LV_HAVE_AVX512
+  return _mm512_set1_epi32(x);
+#else /* LV_HAVE_AVX512 */
+#ifdef LV_HAVE_AVX2
+  return _mm256_set1_epi32(x);
+#else /* LV_HAVE_AVX2 */
+#ifdef LV_HAVE_SSE
+  return _mm_set1_epi32(x);
+#endif /* LV_HAVE_SSE */
+#endif /* LV_HAVE_AVX2 */
+#endif /* LV_HAVE_AVX512 */
+}
+
+static inline simd_i_t srslte_simd_i_add(simd_i_t a, simd_i_t b) {
+#ifdef LV_HAVE_AVX512
+  return _mm512_add_epi32(a, b);
+#else /* LV_HAVE_AVX512 */
+#ifdef LV_HAVE_AVX2
+  return _mm256_add_epi32(a, b);
+#else /* LV_HAVE_AVX2 */
+#ifdef LV_HAVE_SSE
+  return _mm_add_epi32(a, b);
+#endif /* LV_HAVE_SSE */
+#endif /* LV_HAVE_AVX2 */
+#endif /* LV_HAVE_AVX512 */
+}
+
+/* Despite the name, this returns a greater-than selector (all ones where a > b),
+ * intended to be fed to srslte_simd_i_select below. */
+static inline simd_i_t srslte_simd_f_max(simd_f_t a, simd_f_t b) {
+#ifdef LV_HAVE_AVX512
+  return _mm512_maskz_set1_epi32(_mm512_cmp_ps_mask(a, b, _CMP_GT_OS), -1);
+#else /* LV_HAVE_AVX512 */
+#ifdef LV_HAVE_AVX2
+  return (simd_i_t) _mm256_cmp_ps(a, b, _CMP_GT_OS);
+#else /* LV_HAVE_AVX2 */
+#ifdef LV_HAVE_SSE
+  return (simd_i_t) _mm_cmpgt_ps(a, b);
+#endif /* LV_HAVE_SSE */
+#endif /* LV_HAVE_AVX2 */
+#endif /* LV_HAVE_AVX512 */
+}
+
+static inline simd_i_t srslte_simd_i_select(simd_i_t a, simd_i_t b, simd_i_t selector) {
+#ifdef LV_HAVE_AVX512
+  /* AVX-512 has no blendv; select bitwise since each selector lane is all ones or all zeros. */
+  return _mm512_or_si512(_mm512_andnot_si512(selector, a), _mm512_and_si512(selector, b));
+#else /* LV_HAVE_AVX512 */
+#ifdef LV_HAVE_AVX2
+  return (__m256i) _mm256_blendv_ps((__m256) a, (__m256) b, (__m256) selector);
+#else /* LV_HAVE_AVX2 */
+#ifdef LV_HAVE_SSE
+  return (__m128i) _mm_blendv_ps((__m128)a, (__m128)b, (__m128)selector);
+#endif /* LV_HAVE_SSE */
+#endif /* LV_HAVE_AVX2 */
+#endif /* LV_HAVE_AVX512 */
+}
+
+#endif /* SRSLTE_SIMD_I_SIZE */
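
Taken together, srslte_simd_f_max and srslte_simd_i_select implement a per-lane compare-and-select that the max-search kernels in vector_simd.c build on. A scalar model of one lane, illustrative only:

/* selector is "all ones" where candidate > current maximum */
static inline void max_lane_ref(float cand, int cand_idx, float *max_val, int *max_idx) {
  int take = (cand > *max_val);          /* srslte_simd_f_max */
  *max_idx = take ? cand_idx : *max_idx; /* srslte_simd_i_select */
  *max_val = take ? cand : *max_val;
}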
 #if SRSLTE_SIMD_S_SIZE
@@ -829,6 +1006,20 @@ static inline simd_c16_t srslte_simd_c16_load(int16_t *re, int16_t *im) {
   return ret;
 }
 
+static inline simd_c16_t srslte_simd_c16_loadu(int16_t *re, int16_t *im) {
+  simd_c16_t ret;
+#ifdef LV_HAVE_AVX2
+  ret.re.m256 = _mm256_loadu_si256((__m256i*)(re));
+  ret.im.m256 = _mm256_loadu_si256((__m256i*)(im));
+#else /* LV_HAVE_AVX2 */
+#ifdef LV_HAVE_SSE
+  ret.re.m128 = _mm_loadu_si128((__m128i*)(re));
+  ret.im.m128 = _mm_loadu_si128((__m128i*)(im));
+#endif /* LV_HAVE_SSE */
+#endif /* LV_HAVE_AVX2 */
+  return ret;
+}
 static inline void srslte_simd_c16i_store(c16_t *ptr, simd_c16_t simdreg) {
 #ifdef LV_HAVE_AVX2
   __m256i re_sw = _mm256_shufflelo_epi16(_mm256_shufflehi_epi16(simdreg.re.m256, 0b10110001), 0b10110001);
@@ -845,6 +1036,22 @@ static inline void srslte_simd_c16i_store(c16_t *ptr, simd_c16_t simdreg) {
 #endif /* LV_HAVE_AVX2 */
 }
 
+static inline void srslte_simd_c16i_storeu(c16_t *ptr, simd_c16_t simdreg) {
+#ifdef LV_HAVE_AVX2
+  __m256i re_sw = _mm256_shufflelo_epi16(_mm256_shufflehi_epi16(simdreg.re.m256, 0b10110001), 0b10110001);
+  __m256i im_sw = _mm256_shufflelo_epi16(_mm256_shufflehi_epi16(simdreg.im.m256, 0b10110001), 0b10110001);
+  _mm256_storeu_si256((__m256i *) (ptr), _mm256_blend_epi16(simdreg.re.m256, im_sw, 0b10101010));
+  _mm256_storeu_si256((__m256i *) (ptr + 8), _mm256_blend_epi16(re_sw, simdreg.im.m256, 0b10101010));
+#else /* LV_HAVE_AVX2 */
+#ifdef LV_HAVE_SSE
+  __m128i re_sw = _mm_shufflelo_epi16(_mm_shufflehi_epi16(simdreg.re.m128, 0b10110001), 0b10110001);
+  __m128i im_sw = _mm_shufflelo_epi16(_mm_shufflehi_epi16(simdreg.im.m128, 0b10110001), 0b10110001);
+  _mm_storeu_si128((__m128i *) (ptr), _mm_blend_epi16(simdreg.re.m128, im_sw, 0b10101010));
+  _mm_storeu_si128((__m128i *) (ptr + 8), _mm_blend_epi16(re_sw, simdreg.im.m128, 0b10101010));
+#endif /* LV_HAVE_SSE */
+#endif /* LV_HAVE_AVX2 */
+}
+
 static inline void srslte_simd_c16_store(int16_t *re, int16_t *im, simd_c16_t simdreg) {
 #ifdef LV_HAVE_AVX2
   _mm256_store_si256((__m256i *) re, simdreg.re.m256);
@@ -857,6 +1064,18 @@ static inline void srslte_simd_c16_store(int16_t *re, int16_t *im, simd_c16_t simdreg) {
 #endif /* LV_HAVE_AVX2 */
 }
 
+static inline void srslte_simd_c16_storeu(int16_t *re, int16_t *im, simd_c16_t simdreg) {
+#ifdef LV_HAVE_AVX2
+  _mm256_storeu_si256((__m256i *) re, simdreg.re.m256);
+  _mm256_storeu_si256((__m256i *) im, simdreg.im.m256);
+#else /* LV_HAVE_AVX2 */
+#ifdef LV_HAVE_SSE
+  _mm_storeu_si128((__m128i *) re, simdreg.re.m128);
+  _mm_storeu_si128((__m128i *) im, simdreg.im.m128);
+#endif /* LV_HAVE_SSE */
+#endif /* LV_HAVE_AVX2 */
+}
+
 static inline simd_c16_t srslte_simd_c16_prod (simd_c16_t a, simd_c16_t b) {
   simd_c16_t ret;
 #ifdef LV_HAVE_AVX2
@@ -905,6 +1124,24 @@ static inline simd_c16_t srslte_simd_c16_zero (void) {
 #endif /* SRSLTE_SIMD_C16_SIZE */
 
+#if SRSLTE_SIMD_F_SIZE && SRSLTE_SIMD_S_SIZE
+
+static inline simd_s_t srslte_simd_convert_2f_s(simd_f_t a, simd_f_t b) {
+#ifdef LV_HAVE_AVX2
+  __m256 aa = _mm256_permute2f128_ps(a, b, 0x20);
+  __m256 bb = _mm256_permute2f128_ps(a, b, 0x31);
+  __m256i ai = _mm256_cvttps_epi32(aa);
+  __m256i bi = _mm256_cvttps_epi32(bb);
+  return _mm256_packs_epi32(ai, bi);
+#else /* LV_HAVE_AVX2 */
+#ifdef LV_HAVE_SSE
+  __m128i ai = _mm_cvttps_epi32(a);
+  __m128i bi = _mm_cvttps_epi32(b);
+  return _mm_packs_epi32(ai, bi);
+#endif /* LV_HAVE_SSE */
+#endif /* LV_HAVE_AVX2 */
+}
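
The cross-lane permutes are needed because _mm256_packs_epi32 packs within each 128-bit lane; without them the int16 output would come out lane-interleaved (a0..a3, b0..b3, a4..a7, b4..b7). A scalar reference of the intended result, illustrative only:

static int16_t sat_s16(float v) {
  if (v > 32767.0f)  return 32767;
  if (v < -32768.0f) return -32768;
  return (int16_t) v;                 /* truncates toward zero, like cvttps */
}

/* out[0..7] = saturate(a[0..7]), out[8..15] = saturate(b[0..7]) */
static void convert_2f_s_ref(const float *a, const float *b, int16_t *out) {
  for (int k = 0; k < 8; k++) out[k]     = sat_s16(a[k]);
  for (int k = 0; k < 8; k++) out[8 + k] = sat_s16(b[k]);
}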
+
+#endif /* SRSLTE_SIMD_F_SIZE && SRSLTE_SIMD_S_SIZE */
 
 #endif //SRSLTE_SIMD_H_H

@@ -123,6 +123,7 @@ SRSLTE_API void srslte_vec_interleave_cf(float *real, float *imag, cf_t *x, uint32_t len);
 
 /* vector product (element-wise) */
 SRSLTE_API void srslte_vec_prod_ccc(cf_t *x, cf_t *y, cf_t *z, uint32_t len);
+SRSLTE_API void srslte_vec_prod_ccc_split(float *x_re, float *x_im, float *y_re, float *y_im, float *z_re, float *z_im, uint32_t len);
 
 /* vector product (element-wise) */
 SRSLTE_API void srslte_vec_prod_cfc(cf_t *x, float *y, cf_t *z, uint32_t len);
@@ -142,8 +143,8 @@ SRSLTE_API float srslte_vec_dot_prod_fff(float *x, float *y, uint32_t len);
 SRSLTE_API int32_t srslte_vec_dot_prod_sss(int16_t *x, int16_t *y, uint32_t len);
 
 /* z=x/y vector division (element-wise) */
-SRSLTE_API void srslte_vec_div_ccc(cf_t *x, cf_t *y, float *y_mod, cf_t *z, float *z_real, float *z_imag, uint32_t len);
-void srslte_vec_div_cfc(cf_t *x, float *y, cf_t *z, float *z_real, float *z_imag, uint32_t len);
+SRSLTE_API void srslte_vec_div_ccc(cf_t *x, cf_t *y, cf_t *z, uint32_t len);
+SRSLTE_API void srslte_vec_div_cfc(cf_t *x, float *y, cf_t *z, uint32_t len);
 SRSLTE_API void srslte_vec_div_fff(float *x, float *y, float *z, uint32_t len);
 
 /* conjugate */

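The division helpers now take the familiar (x, y, z, len) shape and no longer need caller-provided scratch buffers. A hedged usage sketch (buffer names and sizes are assumptions):

/* out[i] = num[i] / den[i], element-wise */
void equalize_example(cf_t *num, cf_t *den, cf_t *out, uint32_t n) {
  srslte_vec_div_ccc(num, den, out, n);
}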
@@ -36,26 +36,29 @@ extern "C" {
 
 #include "srslte/config.h"
 
 #ifdef LV_HAVE_AVX512
+#define SRSLTE_SIMD_BIT_ALIGN 512
 #define SRSLTE_IS_ALIGNED(PTR) (((size_t)(PTR) & 0x3F) == 0)
 #else /* LV_HAVE_AVX512 */
 #ifdef LV_HAVE_AVX
+#define SRSLTE_SIMD_BIT_ALIGN 256
 #define SRSLTE_IS_ALIGNED(PTR) (((size_t)(PTR) & 0x1F) == 0)
 #else /* LV_HAVE_AVX */
 #ifdef LV_HAVE_SSE
+#define SRSLTE_SIMD_BIT_ALIGN 128
 #define SRSLTE_IS_ALIGNED(PTR) (((size_t)(PTR) & 0x0F) == 0)
 #else /* LV_HAVE_SSE */
+#define SRSLTE_SIMD_BIT_ALIGN 64
 #define SRSLTE_IS_ALIGNED(PTR) (1)
 #endif /* LV_HAVE_SSE */
 #endif /* LV_HAVE_AVX */
 #endif /* LV_HAVE_AVX512 */
 
-SRSLTE_API int srslte_vec_dot_prod_sss_simd(int16_t *x, int16_t *y, int len);
+/* SIMD Basic vector math */
 SRSLTE_API void srslte_vec_sum_sss_simd(int16_t *x, int16_t *y, int16_t *z, int len);
 SRSLTE_API void srslte_vec_sub_sss_simd(int16_t *x, int16_t *y, int16_t *z, int len);
-SRSLTE_API void srslte_vec_sub_sss_avx2(short *x, short *y, short *z, uint32_t len);
+SRSLTE_API float srslte_vec_acc_ff_simd(float *x, int len);
 SRSLTE_API cf_t srslte_vec_acc_cc_simd(cf_t *x, int len);
 
@@ -63,58 +66,62 @@ SRSLTE_API void srslte_vec_add_fff_simd(float *x, float *y, float *z, int len);
 SRSLTE_API void srslte_vec_sub_fff_simd(float *x, float *y, float *z, int len);
 
+/* SIMD Vector Scalar Product */
+SRSLTE_API void srslte_vec_sc_prod_cfc_simd(const cf_t *x,const float h,cf_t *y,const int len);
 SRSLTE_API void srslte_vec_sc_prod_fff_simd(float *x, float h, float *z, int len);
 SRSLTE_API void srslte_vec_sc_prod_ccc_simd(cf_t *x, cf_t h, cf_t *z, int len);
 
+/* SIMD Vector Product */
+SRSLTE_API void srslte_vec_prod_ccc_split_simd(float *a_re, float *a_im, float *b_re, float *b_im, float *r_re, float *r_im, int len);
+SRSLTE_API void srslte_vec_prod_ccc_c16_simd(int16_t *a_re, int16_t *a_im, int16_t *b_re, int16_t *b_im, int16_t *r_re,
+                                             int16_t *r_im, int len);
+SRSLTE_API void srslte_vec_prod_sss_simd(int16_t *x, int16_t *y, int16_t *z, int len);
+SRSLTE_API void srslte_vec_prod_cfc_simd(cf_t *x, float *y, cf_t *z, int len);
 SRSLTE_API void srslte_vec_prod_fff_simd(float *x, float *y, float *z, int len);
 SRSLTE_API void srslte_vec_prod_ccc_simd(cf_t *x,cf_t *y, cf_t *z, int len);
 SRSLTE_API void srslte_vec_prod_conj_ccc_simd(cf_t *x,cf_t *y, cf_t *z, int len);
-SRSLTE_API void srslte_vec_prod_ccc_cf_simd(float *a_re, float *a_im, float *b_re, float *b_im, float *r_re, float *r_im, int len);
-SRSLTE_API void srslte_vec_prod_ccc_c16_simd(int16_t *a_re, int16_t *a_im, int16_t *b_re, int16_t *b_im, int16_t *r_re,
-                                             int16_t *r_im, int len);
-SRSLTE_API void srslte_vec_prod_sss_simd(int16_t *x, int16_t *y, int16_t *z, int len);
 
+/* SIMD Division */
+SRSLTE_API void srslte_vec_div_ccc_simd(cf_t *x,cf_t *y, cf_t *z, int len);
+SRSLTE_API void srslte_vec_div_cfc_simd(cf_t *x, float *y, cf_t *z, int len);
+SRSLTE_API void srslte_vec_div_fff_simd(float *x, float *y, float *z, int len);
 
+/* SIMD Dot product */
 SRSLTE_API cf_t srslte_vec_dot_prod_conj_ccc_simd(cf_t *x, cf_t *y, int len);
 SRSLTE_API cf_t srslte_vec_dot_prod_ccc_simd(cf_t *x, cf_t *y, int len);
-SRSLTE_API cf_t srslte_vec_dot_prod_ccc_sse(cf_t *x, cf_t *y, uint32_t len);
 SRSLTE_API c16_t srslte_vec_dot_prod_ccc_c16i_simd(c16_t *x, c16_t *y, int len);
-SRSLTE_API void srslte_vec_sc_div2_sss_avx2(short *x, int k, short *z, uint32_t len);
+SRSLTE_API int srslte_vec_dot_prod_sss_simd(int16_t *x, int16_t *y, int len);
 
+/* SIMD Modulus functions */
 SRSLTE_API void srslte_vec_abs_cf_simd(cf_t *x, float *z, int len);
 SRSLTE_API void srslte_vec_abs_square_cf_simd(cf_t *x, float *z, int len);
 
-SRSLTE_API void srslte_vec_prod_sss_sse(short *x, short *y, short *z, uint32_t len);
-SRSLTE_API void srslte_vec_prod_sss_avx(short *x, short *y, short *z, uint32_t len);
-SRSLTE_API void srslte_vec_sc_div2_sss_sse(short *x, int n_rightshift, short *z, uint32_t len);
-SRSLTE_API void srslte_vec_sc_div2_sss_avx(short *x, int k, short *z, uint32_t len);
-SRSLTE_API void srslte_vec_lut_sss_sse(short *x, unsigned short *lut, short *y, uint32_t len);
-SRSLTE_API void srslte_vec_convert_fi_sse(float *x, int16_t *z, float scale, uint32_t len);
-SRSLTE_API void srslte_vec_mult_scalar_cf_f_avx( cf_t *z,const cf_t *x,const float h,const uint32_t len);
-SRSLTE_API void srslte_vec_lut_sss_sse(short *x, unsigned short *lut, short *y, uint32_t len);
-SRSLTE_API void srslte_vec_convert_fi_sse(float *x, int16_t *z, float scale, uint32_t len);
-SRSLTE_API void srslte_vec_sc_prod_cfc_simd(const cf_t *x,const float h,cf_t *y,const int len);
-SRSLTE_API void srslte_vec_cp_simd(cf_t *src, cf_t *dst, int len);
+/* Other Functions */
+SRSLTE_API void srslte_vec_lut_sss_simd(short *x, unsigned short *lut, short *y, int len);
+SRSLTE_API void srslte_vec_convert_fi_simd(float *x, int16_t *z, float scale, int len);
+SRSLTE_API void srslte_vec_cp_simd(cf_t *src, cf_t *dst, int len);
+
+/* SIMD Find Max functions */
+SRSLTE_API uint32_t srslte_vec_max_fi_simd(float *x, int len);
+SRSLTE_API uint32_t srslte_vec_max_ci_simd(cf_t *x, int len);
 
 #ifdef __cplusplus
 }

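The new SRSLTE_SIMD_BIT_ALIGN constant feeds the allocators in vector.c, so buffers from srslte_vec_malloc satisfy SRSLTE_IS_ALIGNED and the kernels take their aligned load/store paths. Note the value is a bit count, so passing it straight to posix_memalign over-aligns by a factor of eight; that is harmless, just stricter than required. A hedged sketch:

/* x is at least register-aligned, so SRSLTE_IS_ALIGNED(x) holds. */
float *x = srslte_vec_malloc(1024 * sizeof(float));
if (x) {
  /* ... fill x ... */
  float sum = srslte_vec_acc_ff_simd(x, 1024);  /* takes the aligned path */
  free(x);
}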
@@ -36,17 +36,16 @@
 
 #ifdef LV_HAVE_SSE
 #include <immintrin.h>
-#include "srslte/phy/utils/mat.h"
 int srslte_predecoding_single_sse(cf_t *y[SRSLTE_MAX_PORTS], cf_t *h[SRSLTE_MAX_PORTS], cf_t *x, int nof_rxant, int nof_symbols, float noise_estimate);
 int srslte_predecoding_diversity2_sse(cf_t *y[SRSLTE_MAX_PORTS], cf_t *h[SRSLTE_MAX_PORTS][SRSLTE_MAX_PORTS], cf_t *x[SRSLTE_MAX_LAYERS], int nof_rxant, int nof_symbols);
 #endif
 
 #ifdef LV_HAVE_AVX
 #include <immintrin.h>
-#include "srslte/phy/utils/mat.h"
 int srslte_predecoding_single_avx(cf_t *y[SRSLTE_MAX_PORTS], cf_t *h[SRSLTE_MAX_PORTS], cf_t *x, int nof_rxant, int nof_symbols, float noise_estimate);
 #endif
+#include "srslte/phy/utils/mat.h"
 
 static srslte_mimo_decoder_t mimo_decoder = SRSLTE_MIMO_DECODER_MMSE;

@@ -70,14 +70,12 @@ static void corr_all_sz_partial(cf_t z[SRSLTE_SSS_N], float s[SRSLTE_SSS_N][SRSLTE_SSS_N], ...)
 static void extract_pair_sss(srslte_sss_synch_t *q, cf_t *input, cf_t *ce, cf_t y[2][SRSLTE_SSS_N]) {
   cf_t input_fft[SRSLTE_SYMBOL_SZ_MAX];
-  float ce_mod[2*SRSLTE_SSS_N], z_real[2*SRSLTE_SSS_N], z_imag[2*SRSLTE_SSS_N];
 
   srslte_dft_run_c(&q->dftp_input, input, input_fft);
 
   if (ce) {
-    srslte_vec_div_ccc(&input_fft[q->fft_size/2-SRSLTE_SSS_N], ce, ce_mod,
-                       &input_fft[q->fft_size/2-SRSLTE_SSS_N], z_real, z_imag,
-                       2*SRSLTE_SSS_N);
+    srslte_vec_div_ccc(&input_fft[q->fft_size/2-SRSLTE_SSS_N], ce,
+                       &input_fft[q->fft_size/2-SRSLTE_SSS_N], 2*SRSLTE_SSS_N);
   }
 
   for (int i = 0; i < SRSLTE_SSS_N; i++) {

@@ -44,4 +44,5 @@ add_test(algebra_2x2_zf_solver_test algebra_test -z)
 add_test(algebra_2x2_mmse_solver_test algebra_test -m)
 
 add_executable(vector_test vector_test.c)
 target_link_libraries(vector_test srslte_phy)
+add_test(vector_test vector_test)

@@ -89,6 +89,26 @@ float squared_error (cf_t a, cf_t b) {
   return diff_re*diff_re + diff_im*diff_im;
 }
 
+TEST(srslte_vec_acc_ff,
+  MALLOC(float, x);
+  float z;
+
+  float gold = 0.0f;
+  for (int i = 0; i < block_size; i++) {
+    x[i] = RANDOM_F();
+  }
+
+  TEST_CALL(z = srslte_vec_acc_ff(x, block_size))
+
+  for (int i = 0; i < block_size; i++) {
+    gold += x[i];
+  }
+
+  mse += fabs(gold - z) / gold;
+
+  free(x);
+)
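
The gold value is accumulated sequentially while the SIMD kernel keeps SRSLTE_SIMD_F_SIZE partial sums, so the two round differently and only a normalized error can be checked. A two-line illustration of the effect:

float seq = (0.1f + 0.2f) + 0.3f;  /* sequential association */
float par = 0.1f + (0.2f + 0.3f);  /* tree association; may differ in the last ulp */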
 TEST(srslte_vec_dot_prod_sss,
   MALLOC(int16_t, x);
   MALLOC(int16_t, y);
@@ -314,6 +334,37 @@ TEST(srslte_vec_prod_ccc,
   free(z);
 )
 
+TEST(srslte_vec_prod_ccc_split,
+  MALLOC(float, x_re);
+  MALLOC(float, x_im);
+  MALLOC(float, y_re);
+  MALLOC(float, y_im);
+  MALLOC(float, z_re);
+  MALLOC(float, z_im);
+
+  cf_t gold;
+  for (int i = 0; i < block_size; i++) {
+    x_re[i] = RANDOM_F();
+    x_im[i] = RANDOM_F();
+    y_re[i] = RANDOM_F();
+    y_im[i] = RANDOM_F();
+  }
+
+  TEST_CALL(srslte_vec_prod_ccc_split(x_re, x_im, y_re, y_im, z_re, z_im, block_size))
+
+  for (int i = 0; i < block_size; i++) {
+    gold = (x_re[i] + I * x_im[i]) * (y_re[i] + I * y_im[i]);
+    mse += cabsf(gold - (z_re[i] + I*z_im[i]));
+  }
+
+  free(x_re);
+  free(x_im);
+  free(y_re);
+  free(y_im);
+  free(z_re);
+  free(z_im);
+)
+
 TEST(srslte_vec_prod_conj_ccc,
   MALLOC(cf_t, x);
   MALLOC(cf_t, y);
@@ -357,6 +408,27 @@ TEST(srslte_vec_sc_prod_ccc,
   free(z);
 )
 
+TEST(srslte_vec_convert_fi,
+  MALLOC(float, x);
+  MALLOC(short, z);
+
+  float scale = 1000.0f;
+  short gold;
+  for (int i = 0; i < block_size; i++) {
+    x[i] = (float) RANDOM_F();
+  }
+
+  TEST_CALL(srslte_vec_convert_fi(x, z, scale, block_size))
+
+  for (int i = 0; i < block_size; i++) {
+    gold = (short) ((x[i] * scale));
+    mse += cabsf((float)gold - (float) z[i]);
+  }
+
+  free(x);
+  free(z);
+)
+
 TEST(srslte_vec_prod_fff,
   MALLOC(float, x);
   MALLOC(float, y);
@@ -376,6 +448,30 @@ TEST(srslte_vec_prod_fff,
   }
 
   free(x);
   free(y);
   free(z);
 )
 
+TEST(srslte_vec_prod_cfc,
+  MALLOC(cf_t, x);
+  MALLOC(float, y);
+  MALLOC(cf_t, z);
+
+  cf_t gold;
+  for (int i = 0; i < block_size; i++) {
+    x[i] = RANDOM_CF();
+    y[i] = RANDOM_F();
+  }
+
+  TEST_CALL(srslte_vec_prod_cfc(x, y, z, block_size))
+
+  for (int i = 0; i < block_size; i++) {
+    gold = x[i] * y[i];
+    mse += cabsf(gold - z[i]);
+  }
+
+  free(x);
+  free(y);
+  free(z);
+)
@@ -461,66 +557,216 @@ TEST(srslte_vec_sc_prod_cfc,
   free(z);
 )
 
+TEST(srslte_vec_div_ccc,
+  MALLOC(cf_t, x);
+  MALLOC(cf_t, y);
+  MALLOC(cf_t, z);
+
+  cf_t gold;
+  for (int i = 0; i < block_size; i++) {
+    x[i] = RANDOM_CF();
+    y[i] = RANDOM_CF();
+  }
+
+  TEST_CALL(srslte_vec_div_ccc(x, y, z, block_size))
+
+  for (int i = 0; i < block_size; i++) {
+    gold = x[i] / y[i];
+    mse += cabsf(gold - z[i]);
+  }
+  mse /= block_size;
+
+  free(x);
+  free(y);
+  free(z);
+)
+
+TEST(srslte_vec_div_cfc,
+  MALLOC(cf_t, x);
+  MALLOC(float, y);
+  MALLOC(cf_t, z);
+
+  cf_t gold;
+  for (int i = 0; i < block_size; i++) {
+    x[i] = RANDOM_CF();
+    y[i] = RANDOM_F();
+  }
+
+  TEST_CALL(srslte_vec_div_cfc(x, y, z, block_size))
+
+  for (int i = 0; i < block_size; i++) {
+    gold = x[i] / y[i];
+    mse += cabsf(gold - z[i])/cabsf(gold);
+  }
+  mse /= block_size;
+
+  free(x);
+  free(y);
+  free(z);
+)
+
+TEST(srslte_vec_div_fff,
+  MALLOC(float, x);
+  MALLOC(float, y);
+  MALLOC(float, z);
+
+  cf_t gold;
+  for (int i = 0; i < block_size; i++) {
+    x[i] = RANDOM_F();
+    y[i] = RANDOM_F();
+  }
+
+  TEST_CALL(srslte_vec_div_fff(x, y, z, block_size))
+
+  for (int i = 0; i < block_size; i++) {
+    gold = x[i] / y[i];
+    mse += cabsf(gold - z[i]);
+  }
+  mse /= block_size;
+
+  free(x);
+  free(y);
+  free(z);
+)
+
+TEST(srslte_vec_max_fi,
+  MALLOC(float, x);
+
+  for (int i = 0; i < block_size; i++) {
+    x[i] = RANDOM_F();
+  }
+
+  uint32_t max_index = 0;
+  TEST_CALL(max_index = srslte_vec_max_fi(x, block_size);)
+
+  float gold_value = -INFINITY;
+  uint32_t gold_index = 0;
+  for (int i = 0; i < block_size; i++) {
+    if (gold_value < x[i]) {
+      gold_value = x[i];
+      gold_index = i;
+    }
+  }
+  mse = (gold_index != max_index) ? 1:0;
+
+  free(x);
+)
+
+TEST(srslte_vec_max_abs_ci,
+  MALLOC(cf_t, x);
+
+  for (int i = 0; i < block_size; i++) {
+    x[i] = RANDOM_CF();
+  }
+
+  uint32_t max_index = 0;
+  TEST_CALL(max_index = srslte_vec_max_abs_ci(x, block_size);)
+
+  float gold_value = -INFINITY;
+  uint32_t gold_index = 0;
+  for (int i = 0; i < block_size; i++) {
+    cf_t a = x[i];
+    float abs2 = __real__ a * __real__ a + __imag__ a * __imag__ a;
+    if (abs2 > gold_value) {
+      gold_value = abs2;
+      gold_index = (uint32_t)i;
+    }
+  }
+  mse = (gold_index != max_index) ? 1:0;
+
+  free(x);
+)
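
For reference, the TEST, MALLOC and TEST_CALL harness macros are defined earlier in vector_test.c and are not part of this diff; a sketch of their assumed shape, for orientation only:

/* Assumed: each TEST expands to a test_<name>(fname, timing, block_size)
 * function that reports its timing and passes when mse stays small. */
#define MALLOC(TYPE, NAME) TYPE *NAME = srslte_vec_malloc(sizeof(TYPE) * block_size)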
 
 int main(int argc, char **argv) {
   char func_names[MAX_FUNCTIONS][32];
   double timmings[MAX_FUNCTIONS][MAX_BLOCKS];
   uint32_t sizes[32];
   uint32_t size_count = 0;
   uint32_t func_count = 0;
-  bool passed = true;
+  bool passed[MAX_FUNCTIONS][MAX_BLOCKS];
+  bool all_passed = true;
 
   for (uint32_t block_size = 1; block_size <= 1024*8; block_size *= 2) {
     func_count = 0;
 
-    passed &= test_srslte_vec_dot_prod_sss(func_names[func_count], &timmings[func_count][size_count], block_size);
-    func_count++;
-    passed &= test_srslte_vec_sum_sss(func_names[func_count], &timmings[func_count][size_count], block_size);
-    func_count++;
-    passed &= test_srslte_vec_sub_sss(func_names[func_count], &timmings[func_count][size_count], block_size);
-    func_count++;
-    passed &= test_srslte_vec_prod_sss(func_names[func_count], &timmings[func_count][size_count], block_size);
-    func_count++;
-    passed &= test_srslte_vec_acc_cc(func_names[func_count], &timmings[func_count][size_count], block_size);
-    func_count++;
-    passed &= test_srslte_vec_sum_fff(func_names[func_count], &timmings[func_count][size_count], block_size);
-    func_count++;
-    passed &= test_srslte_vec_sub_fff(func_names[func_count], &timmings[func_count][size_count], block_size);
-    func_count++;
-    passed &= test_srslte_vec_dot_prod_ccc(func_names[func_count], &timmings[func_count][size_count], block_size);
-    func_count++;
-    passed &= test_srslte_vec_dot_prod_conj_ccc(func_names[func_count], &timmings[func_count][size_count], block_size);
-    func_count++;
-    passed &= test_srslte_vec_prod_fff(func_names[func_count], &timmings[func_count][size_count], block_size);
-    func_count++;
-    passed &= test_srslte_vec_prod_ccc(func_names[func_count], &timmings[func_count][size_count], block_size);
-    func_count++;
-    passed &= test_srslte_vec_prod_conj_ccc(func_names[func_count], &timmings[func_count][size_count], block_size);
-    func_count++;
-    passed &= test_srslte_vec_sc_prod_ccc(func_names[func_count], &timmings[func_count][size_count], block_size);
-    func_count++;
-    passed &= test_srslte_vec_sc_prod_fff(func_names[func_count], &timmings[func_count][size_count], block_size);
-    func_count++;
-    passed &= test_srslte_vec_abs_cf(func_names[func_count], &timmings[func_count][size_count], block_size);
-    func_count++;
-    passed &= test_srslte_vec_abs_square_cf(func_names[func_count], &timmings[func_count][size_count], block_size);
-    func_count++;
-    passed &= test_srslte_vec_sc_prod_cfc(func_names[func_count], &timmings[func_count][size_count], block_size);
-    func_count++;
+    passed[func_count][size_count] = test_srslte_vec_acc_ff(func_names[func_count], &timmings[func_count][size_count], block_size);
+    func_count++;
+    passed[func_count][size_count] = test_srslte_vec_dot_prod_sss(func_names[func_count], &timmings[func_count][size_count], block_size);
+    func_count++;
+    passed[func_count][size_count] = test_srslte_vec_sum_sss(func_names[func_count], &timmings[func_count][size_count], block_size);
+    func_count++;
+    passed[func_count][size_count] = test_srslte_vec_sub_sss(func_names[func_count], &timmings[func_count][size_count], block_size);
+    func_count++;
+    passed[func_count][size_count] = test_srslte_vec_prod_sss(func_names[func_count], &timmings[func_count][size_count], block_size);
+    func_count++;
+    passed[func_count][size_count] = test_srslte_vec_acc_cc(func_names[func_count], &timmings[func_count][size_count], block_size);
+    func_count++;
+    passed[func_count][size_count] = test_srslte_vec_sum_fff(func_names[func_count], &timmings[func_count][size_count], block_size);
+    func_count++;
+    passed[func_count][size_count] = test_srslte_vec_sub_fff(func_names[func_count], &timmings[func_count][size_count], block_size);
+    func_count++;
+    passed[func_count][size_count] = test_srslte_vec_dot_prod_ccc(func_names[func_count], &timmings[func_count][size_count], block_size);
+    func_count++;
+    passed[func_count][size_count] = test_srslte_vec_dot_prod_conj_ccc(func_names[func_count], &timmings[func_count][size_count], block_size);
+    func_count++;
+    passed[func_count][size_count] = test_srslte_vec_convert_fi(func_names[func_count], &timmings[func_count][size_count], block_size);
+    func_count++;
+    passed[func_count][size_count] = test_srslte_vec_prod_fff(func_names[func_count], &timmings[func_count][size_count], block_size);
+    func_count++;
+    passed[func_count][size_count] = test_srslte_vec_prod_cfc(func_names[func_count], &timmings[func_count][size_count], block_size);
+    func_count++;
+    passed[func_count][size_count] = test_srslte_vec_prod_ccc(func_names[func_count], &timmings[func_count][size_count], block_size);
+    func_count++;
+    passed[func_count][size_count] = test_srslte_vec_prod_ccc_split(func_names[func_count], &timmings[func_count][size_count], block_size);
+    func_count++;
+    passed[func_count][size_count] = test_srslte_vec_prod_conj_ccc(func_names[func_count], &timmings[func_count][size_count], block_size);
+    func_count++;
+    passed[func_count][size_count] = test_srslte_vec_sc_prod_ccc(func_names[func_count], &timmings[func_count][size_count], block_size);
+    func_count++;
+    passed[func_count][size_count] = test_srslte_vec_sc_prod_fff(func_names[func_count], &timmings[func_count][size_count], block_size);
+    func_count++;
+    passed[func_count][size_count] = test_srslte_vec_abs_cf(func_names[func_count], &timmings[func_count][size_count], block_size);
+    func_count++;
+    passed[func_count][size_count] = test_srslte_vec_abs_square_cf(func_names[func_count], &timmings[func_count][size_count], block_size);
+    func_count++;
+    passed[func_count][size_count] = test_srslte_vec_sc_prod_cfc(func_names[func_count], &timmings[func_count][size_count], block_size);
+    func_count++;
+    passed[func_count][size_count] = test_srslte_vec_div_ccc(func_names[func_count], &timmings[func_count][size_count], block_size);
+    func_count++;
+    passed[func_count][size_count] = test_srslte_vec_div_cfc(func_names[func_count], &timmings[func_count][size_count], block_size);
+    func_count++;
+    passed[func_count][size_count] = test_srslte_vec_div_fff(func_names[func_count], &timmings[func_count][size_count], block_size);
+    func_count++;
+    passed[func_count][size_count] = test_srslte_vec_max_fi(func_names[func_count], &timmings[func_count][size_count], block_size);
+    func_count++;
+    passed[func_count][size_count] = test_srslte_vec_max_abs_ci(func_names[func_count], &timmings[func_count][size_count], block_size);
+    func_count++;
 
     sizes[size_count] = block_size;
 
@@ -546,10 +792,11 @@ int main(int argc, char **argv) {
 
   for (int i = 0; i < func_count; i++) {
     printf("%32s | ", func_names[i]);
     for (int j = 0; j < size_count; j++) {
-      printf(" %7.1f", (double) NOF_REPETITIONS*(double)sizes[j]/timmings[i][j]);
+      printf(" %s%7.1f\x1b[0m", (passed[i][j])?"":"\x1B[31m", (double) NOF_REPETITIONS*(double)sizes[j]/timmings[i][j]);
+      all_passed &= passed[i][j];
     }
     printf(" |\n");
   }
 
-  return (passed)?SRSLTE_SUCCESS:SRSLTE_ERROR;
+  return (all_passed)?SRSLTE_SUCCESS:SRSLTE_ERROR;
 }

@@ -48,18 +48,7 @@ int srslte_vec_acc_ii(int *x, uint32_t len) {
 
 // Used in PRACH detector, AGC and chest_dl for noise averaging
 float srslte_vec_acc_ff(float *x, uint32_t len) {
-#ifdef HAVE_VOLK_ACC_FUNCTION
-  float result;
-  volk_32f_accumulator_s32f(&result,x,len);
-  return result;
-#else
-  int i;
-  float z=0;
-  for (i=0;i<len;i++) {
-    z+=x[i];
-  }
-  return z;
-#endif
+  return srslte_vec_acc_ff_simd(x, len);
 }
 
 void srslte_vec_ema_filter(cf_t *new_data, cf_t *average, cf_t *output, float coeff, uint32_t len) {
 
@@ -190,14 +179,7 @@ void srslte_vec_sc_prod_cfc(cf_t *x, float h, cf_t *z, uint32_t len) {
 
 // Chest UL
 void srslte_vec_sc_prod_ccc(cf_t *x, cf_t h, cf_t *z, uint32_t len) {
-#ifndef LV_HAVE_SSE
-  int i;
-  for (i=0;i<len;i++) {
-    z[i] = x[i]*h;
-  }
-#else
   srslte_vec_sc_prod_ccc_simd(x,h,z,len);
-#endif
 }
 
 // Used in turbo decoder
 
@@ -217,14 +199,7 @@ void srslte_vec_convert_ci(int8_t *x, int16_t *z, uint32_t len) {
 }
 
 void srslte_vec_convert_fi(float *x, int16_t *z, float scale, uint32_t len) {
-#ifndef LV_HAVE_SSE
-  int i;
-  for (i=0;i<len;i++) {
-    z[i] = (int16_t) (x[i]*scale);
-  }
-#else
-  srslte_vec_convert_fi_sse(x, z, scale, len);
-#endif
+  srslte_vec_convert_fi_simd(x, z, scale, len);
 }
 
 void srslte_vec_lut_fuf(float *x, uint32_t *lut, float *y, uint32_t len) {
 
@@ -234,13 +209,7 @@ void srslte_vec_lut_fuf(float *x, uint32_t *lut, float *y, uint32_t len) {
 }
 
 void srslte_vec_lut_sss(short *x, unsigned short *lut, short *y, uint32_t len) {
-#ifndef LV_HAVE_SSE
-  for (int i=0;i<len;i++) {
-    y[lut[i]] = x[i];
-  }
-#else
-  srslte_vec_lut_sss_sse(x, lut, y, len);
-#endif
+  srslte_vec_lut_sss_simd(x, lut, y, len);
 }
 
 void srslte_vec_interleave_cf(float *real, float *imag, cf_t *x, uint32_t len) {
 
@@ -280,7 +249,7 @@ void srslte_vec_deinterleave_real_cf(cf_t *x, float *real, uint32_t len) {
  */
 void *srslte_vec_malloc(uint32_t size) {
   void *ptr;
-  if (posix_memalign(&ptr,512,size)) {
+  if (posix_memalign(&ptr, SRSLTE_SIMD_BIT_ALIGN, size)) {
     return NULL;
   } else {
     return ptr;
 
@@ -292,7 +261,7 @@ void *srslte_vec_realloc(void *ptr, uint32_t old_size, uint32_t new_size) {
   return realloc(ptr, new_size);
 #else
   void *new_ptr;
-  if (posix_memalign(&new_ptr,256,new_size)) {
+  if (posix_memalign(&new_ptr, SRSLTE_SIMD_BIT_ALIGN, new_size)) {
     return NULL;
   } else {
     memcpy(new_ptr, ptr, old_size);
 
@@ -415,6 +384,7 @@ void srslte_vec_load_file(char *filename, void *buffer, uint32_t len) {
 
 // Used in PSS
 void srslte_vec_conj_cc(cf_t *x, cf_t *y, uint32_t len) {
+  /* This function is only used during initialisation, so no optimisation is required */
   int i;
   for (i=0;i<len;i++) {
     y[i] = conjf(x[i]);
 
@@ -423,10 +393,7 @@ void srslte_vec_conj_cc(cf_t *x, cf_t *y, uint32_t len) {
 
 // Used in scrambling complex
 void srslte_vec_prod_cfc(cf_t *x, float *y, cf_t *z, uint32_t len) {
-  int i;
-  for (i=0;i<len;i++) {
-    z[i] = x[i]*y[i];
-  }
+  srslte_vec_prod_cfc_simd(x, y, z, len);
 }
 
 // Used in scrambling float
 
@@ -444,6 +411,10 @@ void srslte_vec_prod_ccc(cf_t *x,cf_t *y, cf_t *z, uint32_t len) {
   srslte_vec_prod_ccc_simd(x,y,z,len);
 }
 
+void srslte_vec_prod_ccc_split(float *x_re, float *x_im, float *y_re, float *y_im, float *z_re, float *z_im, uint32_t len) {
+  srslte_vec_prod_ccc_split_simd(x_re, x_im, y_re, y_im, z_re, z_im, len);
+}
+
 // PRACH, CHEST UL, etc.
 void srslte_vec_prod_conj_ccc(cf_t *x,cf_t *y, cf_t *z, uint32_t len) {
   srslte_vec_prod_conj_ccc_simd(x,y,z,len);
 
@@ -452,40 +423,17 @@ void srslte_vec_prod_conj_ccc(cf_t *x,cf_t *y, cf_t *z, uint32_t len) {
 
 //#define DIV_USE_VEC
 
 // Used in SSS
-/* Complex division is conjugate multiplication + real division */
-void srslte_vec_div_ccc(cf_t *x, cf_t *y, float *y_mod, cf_t *z, float *z_real, float *z_imag, uint32_t len) {
-#ifdef DIV_USE_VEC
-  srslte_vec_prod_conj_ccc(x,y,z,len);
-  srslte_vec_abs_square_cf(y,y_mod,len);
-  srslte_vec_div_cfc(z,y_mod,z,z_real,z_imag,len);
-#else
-  int i;
-  for (i=0;i<len;i++) {
-    z[i] = x[i] / y[i];
-  }
-#endif
+void srslte_vec_div_ccc(cf_t *x, cf_t *y, cf_t *z, uint32_t len) {
+  srslte_vec_div_ccc_simd(x, y, z, len);
 }
 
 /* Complex division by float z=x/y */
-void srslte_vec_div_cfc(cf_t *x, float *y, cf_t *z, float *z_real, float *z_imag, uint32_t len) {
-#ifdef DIV_USE_VEC
-  srslte_vec_deinterleave_cf(x, z_real, z_imag, len);
-  srslte_vec_div_fff(z_real, y, z_real, len);
-  srslte_vec_div_fff(z_imag, y, z_imag, len);
-  srslte_vec_interleave_cf(z_real, z_imag, z, len);
-#else
-  int i;
-  for (i=0;i<len;i++) {
-    z[i] = x[i] / y[i];
-  }
-#endif
+void srslte_vec_div_cfc(cf_t *x, float *y, cf_t *z, uint32_t len) {
  srslte_vec_div_cfc_simd(x, y, z, len);
 }
 
 void srslte_vec_div_fff(float *x, float *y, float *z, uint32_t len) {
-  int i;
-  for (i=0;i<len;i++) {
-    z[i] = x[i] / y[i];
-  }
+  srslte_vec_div_fff_simd(x, y, z, len);
 }
 
 // PSS. convolution
 
@@ -554,30 +502,7 @@ void srslte_vec_arg_cf(cf_t *x, float *arg, uint32_t len) {
 }
 
 uint32_t srslte_vec_max_fi(float *x, uint32_t len) {
-
-  // This is to solve an issue with incorrect type of 1st parameter in version 1.2 of volk
-#ifdef HAVE_VOLK_MAX_FUNCTION_32
-  uint32_t target=0;
-  volk_32f_index_max_32u(&target,x,len);
-  return target;
-#else
-#ifdef HAVE_VOLK_MAX_FUNCTION_16
-  uint32_t target=0;
-  volk_32f_index_max_16u(&target,x,len);
-  return target;
-#else
-  uint32_t i;
-  float m=-FLT_MAX;
-  uint32_t p=0;
-  for (i=0;i<len;i++) {
-    if (x[i]>m) {
-      m=x[i];
-      p=i;
-    }
-  }
-  return p;
-#endif
-#endif
+  return srslte_vec_max_fi_simd(x, len);
 }
 
 int16_t srslte_vec_max_star_si(int16_t *x, uint32_t len) {
 
@@ -616,30 +541,7 @@ void srslte_vec_max_fff(float *x, float *y, float *z, uint32_t len) {
 
 // CP autocorr
 uint32_t srslte_vec_max_abs_ci(cf_t *x, uint32_t len) {
-#ifdef HAVE_VOLK_MAX_ABS_FUNCTION_32
-  uint32_t target=0;
-  volk_32fc_index_max_32u(&target,x,len);
-  return target;
-#else
-#ifdef HAVE_VOLK_MAX_ABS_FUNCTION_16
-  uint32_t target=0;
-  volk_32fc_index_max_16u(&target,x,len);
-  return target;
-#else
-  uint32_t i;
-  float m=-FLT_MAX;
-  uint32_t p=0;
-  float tmp;
-  for (i=0;i<len;i++) {
-    tmp = crealf(x[i])*crealf(x[i]) + cimagf(x[i])*cimagf(x[i]);
-    if (tmp>m) {
-      m=tmp;
-      p=i;
-    }
-  }
-  return p;
-#endif
-#endif
+  return srslte_vec_max_ci_simd(x, len);
 }
 
 void srslte_vec_quant_fuc(float *in, uint8_t *out, float gain, float offset, float clip, uint32_t len) {

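Most scalar and VOLK paths above now funnel into the *_simd kernels below, which all follow one pattern: a SIMD main loop over whole registers, with aligned or unaligned variants chosen via SRSLTE_IS_ALIGNED, followed by a scalar tail. A generic sketch of the pattern, illustrative only:

void vec_op_pattern(float *x, float *z, int len) {
  int i = 0;
#if SRSLTE_SIMD_F_SIZE
  for (; i < len - SRSLTE_SIMD_F_SIZE + 1; i += SRSLTE_SIMD_F_SIZE) {
    /* process one full register (SRSLTE_SIMD_F_SIZE floats) here */
  }
#endif
  for (; i < len; i++) {
    /* scalar tail for the remaining len % SRSLTE_SIMD_F_SIZE elements */
  }
}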
@@ -232,143 +232,113 @@ void srslte_vec_sc_div2_sss_avx2(short *x, int k, short *z, uint32_t len)
 
 /* No improvement with AVX */
-void srslte_vec_lut_sss_sse(short *x, unsigned short *lut, short *y, uint32_t len)
-{
-#ifdef DEBUG_MODE
-  for (int i=0;i<len;i++) {
-    y[lut[i]] = x[i];
-  }
-#else
-#ifdef LV_HAVE_SSE
-  unsigned int number = 0;
-  const unsigned int points = len / 8;
-
-  const __m128i* xPtr = (const __m128i*) x;
-  const __m128i* lutPtr = (__m128i*) lut;
-
-  __m128i xVal, lutVal;
-  for(;number < points; number++){
-
-    xVal = _mm_loadu_si128(xPtr);
-    lutVal = _mm_loadu_si128(lutPtr);
-
-    for (int i=0;i<8;i++) {
-      int16_t x = (int16_t) _mm_extract_epi16(xVal, i);
-      uint16_t l = (uint16_t) _mm_extract_epi16(lutVal, i);
-      y[l] = x;
-    }
-    xPtr ++;
-    lutPtr ++;
-  }
-
-  number = points * 8;
-  for(;number < len; number++){
-    y[lut[number]] = x[number];
-  }
-#endif
-#endif
-}
+void srslte_vec_lut_sss_simd(short *x, unsigned short *lut, short *y, int len) {
+  int i = 0;
+#ifdef LV_HAVE_SSE
+#ifndef DEBUG_MODE
+  if (SRSLTE_IS_ALIGNED(x) && SRSLTE_IS_ALIGNED(lut)) {
+    for (; i < len - 7; i += 8) {
+      __m128i xVal = _mm_load_si128((__m128i *) &x[i]);
+      __m128i lutVal = _mm_load_si128((__m128i *) &lut[i]);
+
+      for (int k = 0; k < 8; k++) {
+        int16_t x = (int16_t) _mm_extract_epi16(xVal, k);
+        uint16_t l = (uint16_t) _mm_extract_epi16(lutVal, k);
+        y[l] = (short) x;
+      }
+    }
+  } else {
+    for (; i < len - 7; i += 8) {
+      __m128i xVal = _mm_loadu_si128((__m128i *) &x[i]);
+      __m128i lutVal = _mm_loadu_si128((__m128i *) &lut[i]);
+
+      for (int k = 0; k < 8; k++) {
+        int16_t x = (int16_t) _mm_extract_epi16(xVal, k);
+        uint16_t l = (uint16_t) _mm_extract_epi16(lutVal, k);
+        y[l] = (short) x;
+      }
+    }
+  }
+#endif /* DEBUG_MODE */
+#endif /* LV_HAVE_SSE */
+
+  for (; i < len; i++) {
+    y[lut[i]] = x[i];
+  }
+}
 
 /* Modified from volk_32f_s32f_convert_16i_a_simd2. Removed clipping */
-void srslte_vec_convert_fi_sse(float *x, int16_t *z, float scale, uint32_t len)
-{
-#ifdef LV_HAVE_SSE
-  unsigned int number = 0;
-  const unsigned int eighthPoints = len / 8;
-
-  const float* inputVectorPtr = (const float*)x;
-  int16_t* outputVectorPtr = z;
-
-  __m128 vScalar = _mm_set_ps1(scale);
-  __m128 inputVal1, inputVal2;
-  __m128i intInputVal1, intInputVal2;
-  __m128 ret1, ret2;
-
-  for(;number < eighthPoints; number++){
-    inputVal1 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4;
-    inputVal2 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4;
-
-    ret1 = _mm_mul_ps(inputVal1, vScalar);
-    ret2 = _mm_mul_ps(inputVal2, vScalar);
-
-    intInputVal1 = _mm_cvtps_epi32(ret1);
-    intInputVal2 = _mm_cvtps_epi32(ret2);
-
-    intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2);
-
-    _mm_storeu_si128((__m128i*)outputVectorPtr, intInputVal1);
-    outputVectorPtr += 8;
-  }
-
-  number = eighthPoints * 8;
-  for(; number < len; number++){
-    z[number] = (int16_t) (x[number] * scale);
-  }
-#endif
-}
+void srslte_vec_convert_fi_simd(float *x, int16_t *z, float scale, int len) {
+  int i = 0;
+
+#if SRSLTE_SIMD_F_SIZE && SRSLTE_SIMD_S_SIZE
+  simd_f_t s = srslte_simd_f_set1(scale);
+  if (SRSLTE_IS_ALIGNED(x) && SRSLTE_IS_ALIGNED(z)) {
+    for (; i < len - SRSLTE_SIMD_S_SIZE + 1; i += SRSLTE_SIMD_S_SIZE) {
+      simd_f_t a = srslte_simd_f_load(&x[i]);
+      simd_f_t b = srslte_simd_f_load(&x[i + SRSLTE_SIMD_F_SIZE]);
+
+      simd_f_t sa = srslte_simd_f_mul(a, s);
+      simd_f_t sb = srslte_simd_f_mul(b, s);
+
+      simd_s_t i16 = srslte_simd_convert_2f_s(sa, sb);
+
+      srslte_simd_s_store(&z[i], i16);
+    }
+  } else {
+    for (; i < len - SRSLTE_SIMD_S_SIZE + 1; i += SRSLTE_SIMD_S_SIZE) {
+      simd_f_t a = srslte_simd_f_loadu(&x[i]);
+      simd_f_t b = srslte_simd_f_loadu(&x[i + SRSLTE_SIMD_F_SIZE]);
+
+      simd_f_t sa = srslte_simd_f_mul(a, s);
+      simd_f_t sb = srslte_simd_f_mul(b, s);
+
+      simd_s_t i16 = srslte_simd_convert_2f_s(sa, sb);
+
+      srslte_simd_s_storeu(&z[i], i16);
+    }
+  }
+#endif /* SRSLTE_SIMD_F_SIZE && SRSLTE_SIMD_S_SIZE */
+
+  for (; i < len; i++) {
+    z[i] = (int16_t) (x[i] * scale);
+  }
+}
 
-// for enb no-volk
-void srslte_vec_sum_fff_sse(float *x, float *y, float *z, uint32_t len) {
-#ifdef LV_HAVE_SSE
-  unsigned int number = 0;
-  const unsigned int points = len / 4;
-
-  const float* xPtr = (const float*) x;
-  const float* yPtr = (const float*) y;
-  float* zPtr = (float*) z;
-
-  __m128 xVal, yVal, zVal;
-  for(;number < points; number++){
-
-    xVal = _mm_loadu_ps(xPtr);
-    yVal = _mm_loadu_ps(yPtr);
-
-    zVal = _mm_add_ps(xVal, yVal);
-
-    _mm_storeu_ps(zPtr, zVal);
-
-    xPtr += 4;
-    yPtr += 4;
-    zPtr += 4;
-  }
-
-  number = points * 4;
-  for(;number < len; number++){
-    z[number] = x[number] + y[number];
-  }
-#endif
-}
-
-void srslte_vec_sum_fff_avx(float *x, float *y, float *z, uint32_t len) {
-#ifdef LV_HAVE_AVX
-  unsigned int number = 0;
-  const unsigned int points = len / 8;
-
-  const float* xPtr = (const float*) x;
-  const float* yPtr = (const float*) y;
-  float* zPtr = (float*) z;
-
-  __m256 xVal, yVal, zVal;
-  for(;number < points; number++){
-
-    xVal = _mm256_loadu_ps(xPtr);
-    yVal = _mm256_loadu_ps(yPtr);
-
-    zVal = _mm256_add_ps(xVal, yVal);
-
-    _mm256_storeu_ps(zPtr, zVal);
-
-    xPtr += 8;
-    yPtr += 8;
-    zPtr += 8;
-  }
-
-  for(number = points * 8;number < len; number++){
-    z[number] = x[number] + y[number];
-  }
-#endif
-}
+float srslte_vec_acc_ff_simd(float *x, int len) {
+  int i = 0;
+  float acc_sum = 0.0f;
+
+#if SRSLTE_SIMD_F_SIZE
+  simd_f_t simd_sum = srslte_simd_f_zero();
+
+  if (SRSLTE_IS_ALIGNED(x)) {
+    for (; i < len - SRSLTE_SIMD_F_SIZE + 1; i += SRSLTE_SIMD_F_SIZE) {
+      simd_f_t a = srslte_simd_f_load(&x[i]);
+
+      simd_sum = srslte_simd_f_add(simd_sum, a);
+    }
+  } else {
+    for (; i < len - SRSLTE_SIMD_F_SIZE + 1; i += SRSLTE_SIMD_F_SIZE) {
+      simd_f_t a = srslte_simd_f_loadu(&x[i]);
+
+      simd_sum = srslte_simd_f_add(simd_sum, a);
+    }
+  }
+
+  __attribute__((aligned(SRSLTE_SIMD_F_SIZE*4))) float sum[SRSLTE_SIMD_F_SIZE];
+  srslte_simd_f_store(sum, simd_sum);
+  for (int k = 0; k < SRSLTE_SIMD_F_SIZE; k++) {
+    acc_sum += sum[k];
+  }
+#endif
+
+  for (; i < len; i++) {
+    acc_sum += x[i];
+  }
+
+  return acc_sum;
+}
 
 cf_t srslte_vec_acc_cc_simd(cf_t *x, int len) {
@@ -570,6 +540,34 @@ cf_t srslte_vec_dot_prod_conj_ccc_simd(cf_t *x, cf_t *y, int len) {
   return result;
 }
 
+void srslte_vec_prod_cfc_simd(cf_t *x, float *y, cf_t *z, int len) {
+  int i = 0;
+
+#if SRSLTE_SIMD_CF_SIZE
+  if (SRSLTE_IS_ALIGNED(x) && SRSLTE_IS_ALIGNED(y) && SRSLTE_IS_ALIGNED(z)) {
+    for (; i < len - SRSLTE_SIMD_CF_SIZE + 1; i += SRSLTE_SIMD_CF_SIZE) {
+      simd_f_t s = srslte_simd_f_load(&y[i]);
+
+      simd_cf_t a = srslte_simd_cfi_load(&x[i]);
+
+      simd_cf_t r = srslte_simd_cf_mul(a, s);
+
+      srslte_simd_cfi_store(&z[i], r);
+    }
+  } else {
+    for (; i < len - SRSLTE_SIMD_F_SIZE + 1; i += SRSLTE_SIMD_F_SIZE) {
+      simd_f_t s = srslte_simd_f_loadu(&y[i]);
+
+      simd_cf_t a = srslte_simd_cfi_loadu(&x[i]);
+
+      simd_cf_t r = srslte_simd_cf_mul(a, s);
+
+      srslte_simd_cfi_storeu(&z[i], r);
+    }
+  }
+#endif
+
+  for (; i < len; i++) {
+    z[i] = x[i] * y[i];
+  }
+}
 
 void srslte_vec_prod_fff_simd(float *x, float *y, float *z, int len) {
   int i = 0;
@@ -630,17 +628,29 @@ void srslte_vec_prod_ccc_simd(cf_t *x,cf_t *y, cf_t *z, int len) {
   }
 }
 
-void srslte_vec_prod_ccc_cf_simd(float *a_re, float *a_im, float *b_re, float *b_im, float *r_re, float *r_im, int len) {
+void srslte_vec_prod_ccc_split_simd(float *a_re, float *a_im, float *b_re, float *b_im, float *r_re, float *r_im, int len) {
   int i = 0;
 
 #if SRSLTE_SIMD_F_SIZE
-  for (; i < len - SRSLTE_SIMD_CF_SIZE + 1; i += SRSLTE_SIMD_CF_SIZE) {
-    simd_cf_t a = srslte_simd_cf_load(&a_re[i], &a_im[i]);
-    simd_cf_t b = srslte_simd_cf_load(&b_re[i], &b_im[i]);
-
-    simd_cf_t r = srslte_simd_cf_prod(a, b);
-
-    srslte_simd_cf_store(&r_re[i], &r_im[i], r);
+  if (SRSLTE_IS_ALIGNED(a_re) && SRSLTE_IS_ALIGNED(a_im) && SRSLTE_IS_ALIGNED(b_re) && SRSLTE_IS_ALIGNED(b_im) &&
+      SRSLTE_IS_ALIGNED(r_re) && SRSLTE_IS_ALIGNED(r_im)) {
+    for (; i < len - SRSLTE_SIMD_CF_SIZE + 1; i += SRSLTE_SIMD_CF_SIZE) {
+      simd_cf_t a = srslte_simd_cf_load(&a_re[i], &a_im[i]);
+      simd_cf_t b = srslte_simd_cf_load(&b_re[i], &b_im[i]);
+
+      simd_cf_t r = srslte_simd_cf_prod(a, b);
+
+      srslte_simd_cf_store(&r_re[i], &r_im[i], r);
+    }
+  } else {
+    for (; i < len - SRSLTE_SIMD_CF_SIZE + 1; i += SRSLTE_SIMD_CF_SIZE) {
+      simd_cf_t a = srslte_simd_cf_loadu(&a_re[i], &a_im[i]);
+      simd_cf_t b = srslte_simd_cf_loadu(&b_re[i], &b_im[i]);
+
+      simd_cf_t r = srslte_simd_cf_prod(a, b);
+
+      srslte_simd_cf_storeu(&r_re[i], &r_im[i], r);
+    }
   }
 #endif
@@ -655,13 +665,25 @@ void srslte_vec_prod_ccc_c16_simd(int16_t *a_re, int16_t *a_im, int16_t *b_re, int16_t *b_im, ...) {
   int i = 0;
 
 #if SRSLTE_SIMD_C16_SIZE
-  for (; i < len - SRSLTE_SIMD_C16_SIZE + 1; i += SRSLTE_SIMD_C16_SIZE) {
-    simd_c16_t a = srslte_simd_c16_load(&a_re[i], &a_im[i]);
-    simd_c16_t b = srslte_simd_c16_load(&b_re[i], &b_im[i]);
-
-    simd_c16_t r = srslte_simd_c16_prod(a, b);
-
-    srslte_simd_c16_store(&r_re[i], &r_im[i], r);
+  if (SRSLTE_IS_ALIGNED(a_re) && SRSLTE_IS_ALIGNED(a_im) && SRSLTE_IS_ALIGNED(b_re) && SRSLTE_IS_ALIGNED(b_im) &&
+      SRSLTE_IS_ALIGNED(r_re) && SRSLTE_IS_ALIGNED(r_im)) {
+    for (; i < len - SRSLTE_SIMD_C16_SIZE + 1; i += SRSLTE_SIMD_C16_SIZE) {
+      simd_c16_t a = srslte_simd_c16_load(&a_re[i], &a_im[i]);
+      simd_c16_t b = srslte_simd_c16_load(&b_re[i], &b_im[i]);
+
+      simd_c16_t r = srslte_simd_c16_prod(a, b);
+
+      srslte_simd_c16_store(&r_re[i], &r_im[i], r);
+    }
+  } else {
+    for (; i < len - SRSLTE_SIMD_C16_SIZE + 1; i += SRSLTE_SIMD_C16_SIZE) {
+      simd_c16_t a = srslte_simd_c16_loadu(&a_re[i], &a_im[i]);
+      simd_c16_t b = srslte_simd_c16_loadu(&b_re[i], &b_im[i]);
+
+      simd_c16_t r = srslte_simd_c16_prod(a, b);
+
+      srslte_simd_c16_storeu(&r_re[i], &r_im[i], r);
    }
   }
 #endif
@@ -701,6 +723,103 @@ void srslte_vec_prod_conj_ccc_simd(cf_t *x,cf_t *y, cf_t *z, int len) {
   }
 }
 
+void srslte_vec_div_ccc_simd(cf_t *x, cf_t *y, cf_t *z, int len) {
+  int i = 0;
+
+#if SRSLTE_SIMD_CF_SIZE
+  if (SRSLTE_IS_ALIGNED(x) && SRSLTE_IS_ALIGNED(y) && SRSLTE_IS_ALIGNED(z)) {
+    for (; i < len - SRSLTE_SIMD_CF_SIZE + 1; i += SRSLTE_SIMD_CF_SIZE) {
+      simd_cf_t a = srslte_simd_cfi_load(&x[i]);
+      simd_cf_t b = srslte_simd_cfi_load(&y[i]);
+
+      simd_cf_t rcpb = srslte_simd_cf_rcp(b);
+      simd_cf_t r = srslte_simd_cf_prod(a, rcpb);
+
+      srslte_simd_cfi_store(&z[i], r);
+    }
+  } else {
+    for (; i < len - SRSLTE_SIMD_CF_SIZE + 1; i += SRSLTE_SIMD_CF_SIZE) {
+      simd_cf_t a = srslte_simd_cfi_loadu(&x[i]);
+      simd_cf_t b = srslte_simd_cfi_loadu(&y[i]);
+
+      simd_cf_t rcpb = srslte_simd_cf_rcp(b);
+      simd_cf_t r = srslte_simd_cf_prod(a, rcpb);
+
+      srslte_simd_cfi_storeu(&z[i], r);
+    }
+  }
+#endif
+
+  for (; i < len; i++) {
+    z[i] = x[i] / y[i];
+  }
+}
+
+void srslte_vec_div_cfc_simd(cf_t *x, float *y, cf_t *z, int len) {
+  int i = 0;
+
+#if SRSLTE_SIMD_CF_SIZE && SRSLTE_SIMD_CF_SIZE == SRSLTE_SIMD_F_SIZE
+  if (SRSLTE_IS_ALIGNED(x) && SRSLTE_IS_ALIGNED(y) && SRSLTE_IS_ALIGNED(z)) {
+    for (; i < len - SRSLTE_SIMD_CF_SIZE + 1; i += SRSLTE_SIMD_CF_SIZE) {
+      simd_cf_t a = srslte_simd_cfi_load(&x[i]);
+      simd_f_t b = srslte_simd_f_load(&y[i]);
+
+      simd_f_t rcpb = srslte_simd_f_rcp(b);
+      simd_cf_t r = srslte_simd_cf_mul(a, rcpb);
+
+      srslte_simd_cfi_store(&z[i], r);
+    }
+  } else {
+    for (; i < len - SRSLTE_SIMD_CF_SIZE + 1; i += SRSLTE_SIMD_CF_SIZE) {
+      simd_cf_t a = srslte_simd_cfi_loadu(&x[i]);
+      simd_f_t b = srslte_simd_f_loadu(&y[i]);
+
+      simd_f_t rcpb = srslte_simd_f_rcp(b);
+      simd_cf_t r = srslte_simd_cf_mul(a, rcpb);
+
+      srslte_simd_cfi_storeu(&z[i], r);
+    }
+  }
+#endif
+
+  for (; i < len; i++) {
+    z[i] = x[i] / y[i];
+  }
+}
+
+void srslte_vec_div_fff_simd(float *x, float *y, float *z, int len) {
+  int i = 0;
+
+#if SRSLTE_SIMD_F_SIZE
+  if (SRSLTE_IS_ALIGNED(x) && SRSLTE_IS_ALIGNED(y) && SRSLTE_IS_ALIGNED(z)) {
+    for (; i < len - SRSLTE_SIMD_F_SIZE + 1; i += SRSLTE_SIMD_F_SIZE) {
+      simd_f_t a = srslte_simd_f_load(&x[i]);
+      simd_f_t b = srslte_simd_f_load(&y[i]);
+
+      simd_f_t rcpb = srslte_simd_f_rcp(b);
+      simd_f_t r = srslte_simd_f_mul(a, rcpb);
+
+      srslte_simd_f_store(&z[i], r);
+    }
+  } else {
+    for (; i < len - SRSLTE_SIMD_F_SIZE + 1; i += SRSLTE_SIMD_F_SIZE) {
+      simd_f_t a = srslte_simd_f_loadu(&x[i]);
+      simd_f_t b = srslte_simd_f_loadu(&y[i]);
+
+      simd_f_t rcpb = srslte_simd_f_rcp(b);
+      simd_f_t r = srslte_simd_f_mul(a, rcpb);
+
+      srslte_simd_f_storeu(&z[i], r);
    }
+  }
+#endif
+
+  for (; i < len; i++) {
+    z[i] = x[i] / y[i];
+  }
+}
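
The three division kernels multiply by a reciprocal estimate rather than dividing, trading accuracy (roughly 2^-12 relative error on SSE/AVX) for throughput; the vector_test cases above normalize their error metric for that reason. A quick single-value check, illustrative only and assuming SSE:

#ifdef LV_HAVE_SSE
#include <math.h>
static float div_rcp_rel_error(float x, float y) {
  float approx = _mm_cvtss_f32(_mm_mul_ps(_mm_set1_ps(x), _mm_rcp_ps(_mm_set1_ps(y))));
  return fabsf(approx - x / y) / fabsf(x / y);   /* on the order of 1e-4 */
}
#endif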
 
 void srslte_vec_sc_prod_ccc_simd(cf_t *x, cf_t h, cf_t *z, int len) {
   int i = 0;
@@ -895,3 +1014,137 @@ void srslte_vec_cp_simd(cf_t *src, cf_t *dst, int len) {
     dst[i] = src[i];
   }
 }
 
+uint32_t srslte_vec_max_fi_simd(float *x, int len) {
+  int i = 0;
+
+  float max_value = -INFINITY;
+  uint32_t max_index = 0;
+
+#if SRSLTE_SIMD_I_SIZE
+  __attribute__ ((aligned (SRSLTE_SIMD_I_SIZE*sizeof(int)))) int indexes_buffer[SRSLTE_SIMD_I_SIZE] = {0};
+  __attribute__ ((aligned (SRSLTE_SIMD_I_SIZE*sizeof(float)))) float values_buffer[SRSLTE_SIMD_I_SIZE] = {0};
+
+  for (int k = 0; k < SRSLTE_SIMD_I_SIZE; k++) indexes_buffer[k] = k;
+  simd_i_t simd_inc = srslte_simd_i_set1(SRSLTE_SIMD_I_SIZE);
+  simd_i_t simd_indexes = srslte_simd_i_load(indexes_buffer);
+  simd_i_t simd_max_indexes = srslte_simd_i_set1(0);
+
+  simd_f_t simd_max_values = srslte_simd_f_set1(-INFINITY);
+
+  if (SRSLTE_IS_ALIGNED(x)) {
+    for (; i < len - SRSLTE_SIMD_I_SIZE + 1; i += SRSLTE_SIMD_I_SIZE) {
+      simd_f_t a = srslte_simd_f_load(&x[i]);
+
+      simd_i_t res = srslte_simd_f_max(a, simd_max_values);
+
+      simd_max_indexes = srslte_simd_i_select(simd_max_indexes, simd_indexes, res);
+      simd_max_values = (simd_f_t) srslte_simd_i_select((simd_i_t) simd_max_values, (simd_i_t) a, res);
+      simd_indexes = srslte_simd_i_add(simd_indexes, simd_inc);
+    }
+  } else {
+    for (; i < len - SRSLTE_SIMD_I_SIZE + 1; i += SRSLTE_SIMD_I_SIZE) {
+      simd_f_t a = srslte_simd_f_loadu(&x[i]);
+
+      simd_i_t res = srslte_simd_f_max(a, simd_max_values);
+
+      simd_max_indexes = srslte_simd_i_select(simd_max_indexes, simd_indexes, res);
+      simd_max_values = (simd_f_t) srslte_simd_i_select((simd_i_t) simd_max_values, (simd_i_t) a, res);
+      simd_indexes = srslte_simd_i_add(simd_indexes, simd_inc);
+    }
+  }
+
+  srslte_simd_i_store(indexes_buffer, simd_max_indexes);
+  srslte_simd_f_store(values_buffer, simd_max_values);
+
+  for (int k = 0; k < SRSLTE_SIMD_I_SIZE; k++) {
+    if (values_buffer[k] > max_value) {
+      max_value = values_buffer[k];
+      max_index = (uint32_t) indexes_buffer[k];
+    }
+  }
+#endif /* SRSLTE_SIMD_I_SIZE */
+
+  for (; i < len; i++) {
+    if (x[i] > max_value) {
+      max_value = x[i];
+      max_index = (uint32_t)i;
+    }
+  }
+
+  return max_index;
+}
+
+uint32_t srslte_vec_max_ci_simd(cf_t *x, int len) {
+  int i = 0;
+
+  float max_value = -INFINITY;
+  uint32_t max_index = 0;
+
+#if SRSLTE_SIMD_I_SIZE
+  __attribute__ ((aligned (SRSLTE_SIMD_I_SIZE*sizeof(int)))) int indexes_buffer[SRSLTE_SIMD_I_SIZE] = {0};
+  __attribute__ ((aligned (SRSLTE_SIMD_I_SIZE*sizeof(float)))) float values_buffer[SRSLTE_SIMD_I_SIZE] = {0};
+
+  for (int k = 0; k < SRSLTE_SIMD_I_SIZE; k++) indexes_buffer[k] = k;
+  simd_i_t simd_inc = srslte_simd_i_set1(SRSLTE_SIMD_I_SIZE);
+  simd_i_t simd_indexes = srslte_simd_i_load(indexes_buffer);
+  simd_i_t simd_max_indexes = srslte_simd_i_set1(0);
+
+  simd_f_t simd_max_values = srslte_simd_f_set1(-INFINITY);
+
+  if (SRSLTE_IS_ALIGNED(x)) {
+    for (; i < len - SRSLTE_SIMD_I_SIZE + 1; i += SRSLTE_SIMD_I_SIZE) {
+      simd_f_t x1 = srslte_simd_f_load((float *) &x[i]);
+      simd_f_t x2 = srslte_simd_f_load((float *) &x[i + SRSLTE_SIMD_F_SIZE / 2]);
+
+      simd_f_t mul1 = srslte_simd_f_mul(x1, x1);
+      simd_f_t mul2 = srslte_simd_f_mul(x2, x2);
+
+      simd_f_t z1 = srslte_simd_f_hadd(mul1, mul2);
+
+      simd_i_t res = srslte_simd_f_max(z1, simd_max_values);
+
+      simd_max_indexes = srslte_simd_i_select(simd_max_indexes, simd_indexes, res);
+      simd_max_values = (simd_f_t) srslte_simd_i_select((simd_i_t) simd_max_values, (simd_i_t) z1, res);
+      simd_indexes = srslte_simd_i_add(simd_indexes, simd_inc);
+    }
+  } else {
+    for (; i < len - SRSLTE_SIMD_I_SIZE + 1; i += SRSLTE_SIMD_I_SIZE) {
+      simd_f_t x1 = srslte_simd_f_loadu((float *) &x[i]);
+      simd_f_t x2 = srslte_simd_f_loadu((float *) &x[i + SRSLTE_SIMD_F_SIZE / 2]);
+
+      simd_f_t mul1 = srslte_simd_f_mul(x1, x1);
+      simd_f_t mul2 = srslte_simd_f_mul(x2, x2);
+
+      simd_f_t z1 = srslte_simd_f_hadd(mul1, mul2);
+
+      simd_i_t res = srslte_simd_f_max(z1, simd_max_values);
+
+      simd_max_indexes = srslte_simd_i_select(simd_max_indexes, simd_indexes, res);
+      simd_max_values = (simd_f_t) srslte_simd_i_select((simd_i_t) simd_max_values, (simd_i_t) z1, res);
+      simd_indexes = srslte_simd_i_add(simd_indexes, simd_inc);
+    }
+  }
+
+  srslte_simd_i_store(indexes_buffer, simd_max_indexes);
+  srslte_simd_f_store(values_buffer, simd_max_values);
+
+  for (int k = 0; k < SRSLTE_SIMD_I_SIZE; k++) {
+    if (values_buffer[k] > max_value) {
+      max_value = values_buffer[k];
+      max_index = (uint32_t) indexes_buffer[k];
+    }
+  }
+#endif /* SRSLTE_SIMD_I_SIZE */
+
+  for (; i < len; i++) {
+    cf_t a = x[i];
+    float abs2 = __real__ a * __real__ a + __imag__ a * __imag__ a;
+    if (abs2 > max_value) {
+      max_value = abs2;
+      max_index = (uint32_t)i;
+    }
+  }
+
+  return max_index;
+}
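
A hedged usage sketch of the new peak-search kernels (the buffer names here are assumptions, not srsLTE symbols):

/* Index of the strongest sample, e.g. after a synchronization correlation. */
uint32_t peak_idx = srslte_vec_max_ci_simd(corr_out, corr_len);
float peak_pwr = __real__ corr_out[peak_idx] * __real__ corr_out[peak_idx]
               + __imag__ corr_out[peak_idx] * __imag__ corr_out[peak_idx];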
