Conditional SSE compilation

9 years ago · 6c194dc078
parent 438a5aa240
commit 6c194dc078
26 changed files with 434 additions and 190 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -83,8 +83,18 @@ IF(CMAKE_COMPILER_IS_GNUCXX)
    #Any additional flags for CXX
 ENDIF(CMAKE_COMPILER_IS_GNUCXX)

+FIND_PACKAGE(SSE)
+
 IF(CMAKE_COMPILER_IS_GNUCC)
-    set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Wall -Wno-write-strings -Wno-format-extra-args -Winline -Wno-unused-result -Wno-format -std=c99 -D_GNU_SOURCE -g -mfpmath=sse -mavx -O3")
+    set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Wall -Wno-write-strings -Wno-format-extra-args -Winline -Wno-unused-result -Wno-format -std=c99 -D_GNU_SOURCE -g -march=native -O3")
+    IF(AVX_FOUND)  # AVX builds also enable the SSE code paths (LV_HAVE_SSE)
+      set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mfpmath=sse -mavx -DLV_HAVE_AVX -DLV_HAVE_SSE")
+    ELSEIF(SSE4_2_FOUND)  # GCC spells the option -msse4.2 (no underscore)
+      set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mfpmath=sse -msse4.2 -DLV_HAVE_SSE")
+    ELSEIF(SSE4_1_FOUND)  # GCC spells the option -msse4.1 (no underscore)
+      set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mfpmath=sse -msse4.1 -DLV_HAVE_SSE")
+    ENDIF(AVX_FOUND)
+    
   # IF(${CMAKE_BUILD_TYPE} STREQUAL "Debug")
   #   set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Werror -Wno-error=implicit-function-declaration -Wno-error=unused-but-set-variable")
   # ENDIF(${CMAKE_BUILD_TYPE} STREQUAL "Debug")
--- a/cmake/modules/FindVolk.cmake
+++ b/cmake/modules/FindVolk.cmake
@ -47,6 +47,7 @@ CHECK_FUNCTION_EXISTS_MATH(volk_32f_x2_subtract_32f HAVE_VOLK_SUB_FLOAT_FUNCTION
 CHECK_FUNCTION_EXISTS_MATH(volk_32fc_x2_square_dist_32f HAVE_VOLK_SQUARE_DIST_FUNCTION)
 CHECK_FUNCTION_EXISTS_MATH(volk_32fc_deinterleave_real_32f HAVE_VOLK_DEINTERLEAVE_FUNCTION)
 CHECK_FUNCTION_EXISTS_MATH(volk_32fc_index_max_16u HAVE_VOLK_MAX_ABS_FUNCTION)
+CHECK_FUNCTION_EXISTS_MATH(volk_16i_s32f_convert_32f HAVE_VOLK_CONVERT_IF_FUNCTION)

 INCLUDE(FindPackageHandleStandardArgs)
 FIND_PACKAGE_HANDLE_STANDARD_ARGS(VOLK DEFAULT_MSG VOLK_LIBRARIES VOLK_INCLUDE_DIRS)
@ -54,6 +55,7 @@ MARK_AS_ADVANCED(VOLK_LIBRARIES VOLK_INCLUDE_DIRS VOLK_DEFINITIONS)

 IF(VOLK_FOUND)
  SET(CMAKE_REQUIRED_LIBRARIES ${VOLK_LIBRARIES} m)
+  CHECK_FUNCTION_EXISTS_MATH(volk_16i_s32f_convert_32f HAVE_VOLK_CONVERT_IF_FUNCTION)
  CHECK_FUNCTION_EXISTS_MATH(volk_32f_index_max_16u HAVE_VOLK_MAX_FUNCTION)
  CHECK_FUNCTION_EXISTS_MATH(volk_32f_x2_max_32f HAVE_VOLK_MAX_VEC_FUNCTION)
  CHECK_FUNCTION_EXISTS_MATH(volk_32f_accumulator_s32f HAVE_VOLK_ACC_FUNCTION)
--- a/srslte/include/srslte/mimo/precoding.h
+++ b/srslte/include/srslte/mimo/precoding.h
@ -85,8 +85,7 @@ SRSLTE_API int srslte_precoding_type(srslte_precoding_t *q,

 /* Estimates the vector "x" based on the received signal "y" and the channel estimates "h"
 */
-SRSLTE_API int srslte_predecoding_single(srslte_precoding_t *q, 
-                                         cf_t *y, 
+SRSLTE_API int srslte_predecoding_single(cf_t *y, 
                                         cf_t *h, 
                                         cf_t *x, 
                                         int nof_symbols, 
--- a/srslte/include/srslte/phch/sch.h
+++ b/srslte/include/srslte/phch/sch.h
@ -40,8 +40,7 @@
 #include "srslte/common/phy_common.h"
 #include "srslte/fec/rm_turbo.h"
 #include "srslte/fec/turbocoder.h"
-#include "srslte/fec/turbodecoder_gen.h"
-#include "srslte/fec/turbodecoder_sse.h"
+#include "srslte/fec/turbodecoder.h"
 #include "srslte/fec/crc.h"
 #include "srslte/phch/pdsch_cfg.h"
 #include "srslte/phch/pusch_cfg.h"
@ -66,12 +65,12 @@ typedef struct SRSLTE_API {
  uint8_t *parity_bits;  
  void *e;
  uint8_t *temp_g_bits;
-  uint32_t *ul_interleaver;
+  uint16_t *ul_interleaver;
  srslte_uci_bit_t ack_ri_bits[12*288];
  uint32_t nof_ri_ack_bits; 
  
  srslte_tcod_t encoder;
-  srslte_tdec_sse_t decoder;  
+  srslte_tdec_t decoder;  
  srslte_crc_t crc_tb;
  srslte_crc_t crc_cb;
  
--- a/srslte/include/srslte/srslte.h
+++ b/srslte/include/srslte/srslte.h
@ -63,8 +63,7 @@
 #include "srslte/fec/crc.h"
 #include "srslte/fec/tc_interl.h"
 #include "srslte/fec/turbocoder.h"
-#include "srslte/fec/turbodecoder_sse.h"
-#include "srslte/fec/turbodecoder_gen.h"
+#include "srslte/fec/turbodecoder.h"
 #include "srslte/fec/cbsegm.h"
 #include "srslte/fec/rm_conv.h"
 #include "srslte/fec/rm_turbo.h"
--- a/srslte/include/srslte/utils/vector.h
+++ b/srslte/include/srslte/utils/vector.h
@ -109,7 +109,7 @@ SRSLTE_API void srslte_vec_sc_div2_sss(short *x, int pow2_div, short *z, uint32_
 SRSLTE_API void srslte_vec_norm_cfc(cf_t *x, float amplitude, cf_t *y, uint32_t len);

 SRSLTE_API void srslte_vec_convert_fi(float *x, int16_t *z, float scale, uint32_t len);
-
+SRSLTE_API void srslte_vec_convert_if(int16_t *x, float *z, float scale, uint32_t len);

 SRSLTE_API void srslte_vec_lut_fuf(float *x, uint32_t *lut, float *y, uint32_t len);
 SRSLTE_API void srslte_vec_lut_sss(short *x, unsigned short *lut, short *y, uint32_t len); 
--- a/srslte/lib/ch_estimation/test/chest_test_dl.c
+++ b/srslte/lib/ch_estimation/test/chest_test_dl.c
@ -102,24 +102,24 @@ int main(int argc, char **argv) {

  num_re = 2 * cell.nof_prb * SRSLTE_NRE * SRSLTE_CP_NSYMB(cell.cp);

-  input = malloc(num_re * sizeof(cf_t));
+  input = srslte_vec_malloc(num_re * sizeof(cf_t));
  if (!input) {
-    perror("malloc");
+    perror("srslte_vec_malloc");
    goto do_exit;
  }
-  output = malloc(num_re * sizeof(cf_t));
+  output = srslte_vec_malloc(num_re * sizeof(cf_t));
  if (!output) {
-    perror("malloc");
+    perror("srslte_vec_malloc");
    goto do_exit;
  }
-  h = malloc(num_re * sizeof(cf_t));
+  h = srslte_vec_malloc(num_re * sizeof(cf_t));
  if (!h) {
-    perror("malloc");
+    perror("srslte_vec_malloc");
    goto do_exit;
  }
-  ce = malloc(num_re * sizeof(cf_t));
+  ce = srslte_vec_malloc(num_re * sizeof(cf_t));
  if (!ce) {
-    perror("malloc");
+    perror("srslte_vec_malloc");
    goto do_exit;
  }

@ -173,7 +173,7 @@ int main(int argc, char **argv) {
        
        gettimeofday(&t[1], NULL);
        for (int j=0;j<100;j++) {
-          srslte_predecoding_single(&cheq, input, ce, output, num_re, 0);
+          srslte_predecoding_single(input, ce, output, num_re, 0);
        }
        gettimeofday(&t[2], NULL);
        get_time_interval(t);
@ -188,7 +188,7 @@ int main(int argc, char **argv) {

        gettimeofday(&t[1], NULL);
        for (int j=0;j<100;j++) {
-          srslte_predecoding_single(&cheq, input, ce, output, num_re, srslte_chest_dl_get_noise_estimate(&est));
+          srslte_predecoding_single(input, ce, output, num_re, srslte_chest_dl_get_noise_estimate(&est));
        }
        gettimeofday(&t[2], NULL);
        get_time_interval(t);
--- a/srslte/lib/ch_estimation/test/chest_test_dl_mex.c
+++ b/srslte/lib/ch_estimation/test/chest_test_dl_mex.c
@ -189,7 +189,7 @@ void mexFunction(int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[])
    }    
       
    if (cell.nof_ports == 1) {
-      srslte_predecoding_single(&cheq, input_signal, ce[0], output_signal2, nof_re, srslte_chest_dl_get_noise_estimate(&chest));            
+      srslte_predecoding_single(input_signal, ce[0], output_signal2, nof_re, srslte_chest_dl_get_noise_estimate(&chest));            
    } else {
      srslte_predecoding_diversity(&cheq, input_signal, ce, output_signal, cell.nof_ports, nof_re, srslte_chest_dl_get_noise_estimate(&chest));
      srslte_layerdemap_diversity(output_signal, output_signal2, cell.nof_ports, nof_re/cell.nof_ports);
--- a/srslte/lib/dft/src/ofdm.c
+++ b/srslte/lib/dft/src/ofdm.c
@ -181,7 +181,7 @@ void srslte_ofdm_rx_sf(srslte_ofdm_t *q, cf_t *input, cf_t *output) {
    srslte_vec_prod_ccc(input, q->shift_buffer, input, 2*q->slot_sz);
  }
  for (n=0;n<2;n++) {
-    srslte_ofdm_rx_slot_zerocopy(q, &input[n*q->slot_sz], &output[n*q->nof_re*q->nof_symbols]);
+    srslte_ofdm_rx_slot(q, &input[n*q->slot_sz], &output[n*q->nof_re*q->nof_symbols]);
  }
 }

--- a/srslte/lib/fec/src/rm_turbo.c
+++ b/srslte/lib/fec/src/rm_turbo.c
@ -37,14 +37,16 @@
 #include "srslte/utils/vector.h"
 #include "srslte/fec/cbsegm.h"

-#define HAVE_SIMD

-#ifdef HAVE_SIMD
+#ifdef LV_HAVE_SSE
 #include <xmmintrin.h>
-#include <tmmintrin.h>
-
-int srslte_rm_turbo_rx_lut_simd(int16_t *input, int16_t *output, uint32_t in_len, uint32_t cb_idx, uint32_t rv_idx);
+#include <pmmintrin.h>
+int srslte_rm_turbo_rx_lut_sse(int16_t *input, int16_t *output, uint32_t in_len, uint32_t cb_idx, uint32_t rv_idx);
+#endif

+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+int srslte_rm_turbo_rx_lut_avx(int16_t *input, int16_t *output, uint32_t in_len, uint32_t cb_idx, uint32_t rv_idx);
 #endif

 #define NCOLS 32
@ -286,7 +288,12 @@ int srslte_rm_turbo_tx_lut(uint8_t *w_buff, uint8_t *systematic, uint8_t *parity

 int srslte_rm_turbo_rx_lut(int16_t *input, int16_t *output, uint32_t in_len, uint32_t cb_idx, uint32_t rv_idx) 
 {
-#ifndef HAVE_SIMD
+#ifdef LV_HAVE_AVX
+  return srslte_rm_turbo_rx_lut_avx(input, output, in_len, cb_idx, rv_idx);
+#else 
+  #ifdef LV_HAVE_SSE
+    return srslte_rm_turbo_rx_lut_sse(input, output, in_len, cb_idx, rv_idx);
+  #else
    if (rv_idx < 4 && cb_idx < SRSLTE_NOF_TC_CB_SIZES) {
      uint32_t out_len = 3*srslte_cbsegm_cbsize(cb_idx)+12;
      uint16_t *deinter = deinterleaver[cb_idx][rv_idx];
@ -300,15 +307,13 @@ int srslte_rm_turbo_rx_lut(int16_t *input, int16_t *output, uint32_t in_len, uin
      printf("Invalid inputs rv_idx=%d, cb_idx=%d\n", rv_idx, cb_idx);
      return SRSLTE_ERROR_INVALID_INPUTS; 
    }
-#else
-  return srslte_rm_turbo_rx_lut_simd(input, output, in_len, cb_idx, rv_idx);
+  #endif
 #endif
 }

-#ifdef HAVE_SIMD
-
+#ifdef LV_HAVE_SSE

-int srslte_rm_turbo_rx_lut_simd(int16_t *input, int16_t *output, uint32_t in_len, uint32_t cb_idx, uint32_t rv_idx) 
+int srslte_rm_turbo_rx_lut_sse(int16_t *input, int16_t *output, uint32_t in_len, uint32_t cb_idx, uint32_t rv_idx) 
 {
  if (rv_idx < 4 && cb_idx < SRSLTE_NOF_TC_CB_SIZES) {
    uint32_t out_len = 3*srslte_cbsegm_cbsize(cb_idx)+12;
@ -381,7 +386,116 @@ int srslte_rm_turbo_rx_lut_simd(int16_t *input, int16_t *output, uint32_t in_len
 #endif


+#ifdef LV_HAVE_AVX
+
+#define SAVE_OUTPUT(j) x  = (int16_t) _mm256_extract_epi16(xVal,   j);\
+                       l = (uint16_t) _mm256_extract_epi16(lutVal, j);\
+                       output[l] += x;
+
+
+int srslte_rm_turbo_rx_lut_avx(int16_t *input, int16_t *output, uint32_t in_len, uint32_t cb_idx, uint32_t rv_idx) 
+{
+  if (rv_idx < 4 && cb_idx < SRSLTE_NOF_TC_CB_SIZES) {
+    uint32_t out_len = 3*srslte_cbsegm_cbsize(cb_idx)+12;
+    uint16_t *deinter = deinterleaver[cb_idx][rv_idx];
+    
+    const __m256i* xPtr   = (const __m256i*) input;
+    const __m256i* lutPtr = (const __m256i*) deinter;
+    __m256i xVal, lutVal;
+    
+    int16_t x; 
+    uint16_t l;
+    
+    /* Simplify load if we do not need to wrap (ie high rates) */
+    if (in_len <= out_len) {
+      for (int i=0;i<in_len/16;i++) {
+        xVal   = _mm256_loadu_si256(xPtr);
+        lutVal = _mm256_loadu_si256(lutPtr);
+        SAVE_OUTPUT(0);
+        SAVE_OUTPUT(1);
+        SAVE_OUTPUT(2);
+        SAVE_OUTPUT(3);
+        SAVE_OUTPUT(4);
+        SAVE_OUTPUT(5);
+        SAVE_OUTPUT(6);
+        SAVE_OUTPUT(7);
+        
+        SAVE_OUTPUT(8);
+        SAVE_OUTPUT(9);
+        SAVE_OUTPUT(10);
+        SAVE_OUTPUT(11);
+        SAVE_OUTPUT(12);
+        SAVE_OUTPUT(13);
+        SAVE_OUTPUT(14);
+        SAVE_OUTPUT(15);
+        
+        xPtr ++;
+        lutPtr ++;
+      }
+      for (int i=16*(in_len/16);i<in_len;i++) {      
+        output[deinter[i%out_len]] += input[i];
+      }
+    } else {
+      int intCnt = 16;
+      int inputCnt = 0;
+      int nwrapps = 0; 
+      while(inputCnt < in_len - 16) {
+        xVal   = _mm256_loadu_si256(xPtr);
+        lutVal = _mm256_loadu_si256(lutPtr);
+      
+        SAVE_OUTPUT(0);
+        SAVE_OUTPUT(1);
+        SAVE_OUTPUT(2);
+        SAVE_OUTPUT(3);
+        SAVE_OUTPUT(4);
+        SAVE_OUTPUT(5);
+        SAVE_OUTPUT(6);
+        SAVE_OUTPUT(7);
+        
+        SAVE_OUTPUT(8);
+        SAVE_OUTPUT(9);
+        SAVE_OUTPUT(10);
+        SAVE_OUTPUT(11);
+        SAVE_OUTPUT(12);
+        SAVE_OUTPUT(13);
+        SAVE_OUTPUT(14);
+        SAVE_OUTPUT(15);
+        xPtr++;
+        lutPtr++;
+        intCnt   += 16;
+        inputCnt += 16;
+        if (intCnt >= out_len && inputCnt < in_len - 16) {
+          /* Copy last elements */
+          if ((out_len%16) == 12) {
+            for (int j=(nwrapps+1)*out_len-12;j<(nwrapps+1)*out_len;j++) {      
+              output[deinter[j%out_len]] += input[j];
+              inputCnt++;
+            }
+          } else {
+            for (int j=(nwrapps+1)*out_len-4;j<(nwrapps+1)*out_len;j++) {      
+              output[deinter[j%out_len]] += input[j];
+              inputCnt++;
+            }
+          }
+          /* And wrap pointers */
+          nwrapps++;
+          intCnt = 16; 
+          xPtr   = (const __m256i*) &input[nwrapps*out_len];
+          lutPtr = (const __m256i*) deinter;
+        }
+      }      
+      for (int i=inputCnt;i<in_len;i++) {      
+        output[deinter[i%out_len]] += input[i];
+      }
+    }    
+    return 0;    
+  } else {
+    printf("Invalid inputs rv_idx=%d, cb_idx=%d\n", rv_idx, cb_idx);
+    return SRSLTE_ERROR_INVALID_INPUTS; 
+  }
+}

+#endif



--- a/srslte/lib/fec/src/turbodecoder_gen.c
+++ b/srslte/lib/fec/src/turbodecoder_gen.c
@ -391,7 +391,7 @@ int srslte_tdec_gen_run_all(srslte_tdec_gen_t * h, float * input, uint8_t *outpu
    iter++;
  } while (iter < nof_iterations);

-  srslte_tdec_gen_decision(h, output, long_cb);
+  srslte_tdec_gen_decision_byte(h, output, long_cb);
  
  return SRSLTE_SUCCESS;
 }
--- a/srslte/lib/fec/src/turbodecoder_sse.c
+++ b/srslte/lib/fec/src/turbodecoder_sse.c
@ -37,8 +37,11 @@

 #include <inttypes.h>

+#ifdef LV_HAVE_SSE
 #include <emmintrin.h>
-#include <immintrin.h>
+#include <nmmintrin.h>
+#endif
+

 #define NUMSTATES       8
 #define NINPUTS         2
@ -55,6 +58,8 @@
 *
 ************************************************/

+#ifdef LV_HAVE_SSE
+
 static inline int16_t hMax(__m128i buffer)
 {
  __m128i tmp1 = _mm_sub_epi8(_mm_set1_epi16(0x7FFF), buffer);
@ -626,3 +631,7 @@ int srslte_tdec_sse_run_all(srslte_tdec_sse_t * h, int16_t * input, uint8_t *out
  
  return SRSLTE_SUCCESS;
 }
+
+#endif
+
+
--- a/srslte/lib/fec/test/turbodecoder_test.c
+++ b/srslte/lib/fec/test/turbodecoder_test.c
@ -118,12 +118,10 @@ int main(int argc, char **argv) {
  float var[SNR_POINTS];
  uint32_t snr_points;
  uint32_t errors;
-  uint32_t errors_gen;
  uint32_t coded_length;
  struct timeval tdata[3];
-  float mean_usec, mean_usec_gen;
-  srslte_tdec_sse_t tdec;
-  srslte_tdec_gen_t tdec_gen;
+  float mean_usec;
+  srslte_tdec_t tdec;
  srslte_tcod_t tcod;
  
  parse_args(argc, argv);
@ -189,12 +187,7 @@ int main(int argc, char **argv) {
    exit(-1);
  }

-  if (srslte_tdec_sse_init(&tdec, frame_length)) {
-    fprintf(stderr, "Error initiating Turbo decoder\n");
-    exit(-1);
-  }
-
-  if (srslte_tdec_gen_init(&tdec_gen, frame_length)) {
+  if (srslte_tdec_init(&tdec, frame_length)) {
    fprintf(stderr, "Error initiating Turbo decoder\n");
    exit(-1);
  }
@ -216,9 +209,7 @@ int main(int argc, char **argv) {
  for (i = 0; i < snr_points; i++) {

    mean_usec = 0;
-    mean_usec_gen = 0;
    errors = 0; 
-    errors_gen = 0; 
    frame_cnt = 0;
    while (frame_cnt < nof_frames) {
      /* generate data_tx */
@ -249,8 +240,7 @@ int main(int argc, char **argv) {
        llr_s[j] = (int16_t) (100*llr[j]);
      }
      /* decoder */
-      srslte_tdec_sse_reset(&tdec, frame_length);
-      srslte_tdec_gen_reset(&tdec_gen, frame_length);
+      srslte_tdec_reset(&tdec, frame_length);

      uint32_t t;
      if (nof_iterations == -1) {
@ -261,7 +251,7 @@ int main(int argc, char **argv) {

      gettimeofday(&tdata[1], NULL); 
      for (int k=0;k<nof_repetitions;k++) {     
-        srslte_tdec_sse_run_all(&tdec, llr_s, data_rx_bytes, t, frame_length);        
+        srslte_tdec_run_all(&tdec, llr_s, data_rx_bytes, t, frame_length);        
      }
      gettimeofday(&tdata[2], NULL);
      get_time_interval(tdata);
@ -271,23 +261,10 @@ int main(int argc, char **argv) {

      errors += srslte_bit_diff(data_tx, data_rx, frame_length);
      
-      gettimeofday(&tdata[1], NULL); 
-      for (int k=0;k<nof_repetitions;k++) {     
-        srslte_tdec_gen_run_all(&tdec_gen, llr, data_rx, t, frame_length);
-      }
-      gettimeofday(&tdata[2], NULL);
-      get_time_interval(tdata);
-      mean_usec_gen = (float) mean_usec_gen * 0.9 + (float) (tdata[0].tv_usec/nof_repetitions) * 0.1;
-
-      /* check errors */
-      errors_gen += srslte_bit_diff(data_tx, data_rx, frame_length);
-      
      frame_cnt++;
      printf("Eb/No: %2.2f %10d/%d   ", SNR_MIN + i * ebno_inc, frame_cnt, nof_frames);
      printf("BER: %.2e  ", (float) errors / (frame_cnt * frame_length));
-      printf("BER_gen: %.2e  ", (float) errors_gen / (frame_cnt * frame_length));
-      printf("%3.1f Mbps (%6.2f usec) -- gen: ", (float) frame_length / mean_usec, mean_usec);
-      printf("%3.1f Mbps (%6.2f usec)", (float) frame_length / mean_usec_gen, mean_usec_gen);
+      printf("%3.1f Mbps (%6.2f usec)", (float) frame_length / mean_usec, mean_usec);
      printf("\r");

    }    
@ -297,10 +274,7 @@ int main(int argc, char **argv) {
  printf("\n");
  if (snr_points == 1) {
    if (errors) {
-      printf("%d Errors in SSE\n", errors);
-    }
-    if (errors_gen) {
-      printf("%d Errors in GEN\n", errors_gen);
+      printf("%d Errors\n", errors);
    }
  }    

@ -311,8 +285,7 @@ int main(int argc, char **argv) {
  free(llr_c);
  free(data_rx);

-  srslte_tdec_sse_free(&tdec);
-  srslte_tdec_gen_free(&tdec_gen);
+  srslte_tdec_free(&tdec);
  srslte_tcod_free(&tcod);

  printf("\n");
--- a/srslte/lib/mimo/src/precoding.c
+++ b/srslte/lib/mimo/src/precoding.c
@ -35,6 +35,18 @@
 #include "srslte/mimo/precoding.h"
 #include "srslte/utils/vector.h"

+#ifdef LV_HAVE_SSE
+#include <xmmintrin.h>
+#include <pmmintrin.h>
+int srslte_predecoding_single_sse(cf_t *y, cf_t *h, cf_t *x, int nof_symbols, float noise_estimate);
+#endif
+
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+int srslte_predecoding_single_avx(cf_t *y, cf_t *h, cf_t *x, int nof_symbols, float noise_estimate);
+#endif
+
+

 /************************************************
 * 
@ -117,23 +129,138 @@ void srslte_precoding_free(srslte_precoding_t *q) {
  bzero(q, sizeof(srslte_precoding_t));
 }

-/* ZF/MMSE SISO equalizer x=y(h'h+no)^(-1)h' (ZF if n0=0.0)*/
-int srslte_predecoding_single(srslte_precoding_t *q, cf_t *y, cf_t *h, cf_t *x, int nof_symbols, float noise_estimate) {
-  if (nof_symbols <= q->max_frame_len) {
-    // h'h
-    srslte_vec_abs_square_cf(h, q->y_mod, nof_symbols);
-    if (noise_estimate > 0.0) {
-      // (h'h + n0)
-      srslte_vec_sc_add_fff(q->y_mod, noise_estimate, q->y_mod, nof_symbols);      
-    }
-    // y*h'
-    srslte_vec_prod_conj_ccc(y, h, x, nof_symbols);
-    // divide by (h'h+no)
-    srslte_vec_div_cfc(x,q->y_mod,x,q->z_real,q->z_imag, nof_symbols);
+#ifdef LV_HAVE_SSE
+
+#define PROD(a,b) _mm_addsub_ps(_mm_mul_ps(a,_mm_moveldup_ps(b)),_mm_mul_ps(_mm_shuffle_ps(a,a,0xB1),_mm_movehdup_ps(b)))
+
+int srslte_predecoding_single_sse(cf_t *y, cf_t *h, cf_t *x, int nof_symbols, float noise_estimate) {
+  
+  float *xPtr = (float*) x;
+  const float *hPtr = (const float*) h;
+  const float *yPtr = (const float*) y;
+
+  __m128 conjugator = _mm_setr_ps(0, -0.f, 0, -0.f);
+  
+  __m128 noise = _mm_set1_ps(noise_estimate);
+  __m128 h1Val, h2Val, y1Val, y2Val, h12square, h1square, h2square, h1conj, h2conj, x1Val, x2Val;
+  for (int i=0;i<nof_symbols/4;i++) {
+    y1Val = _mm_load_ps(yPtr); yPtr+=4;
+    y2Val = _mm_load_ps(yPtr); yPtr+=4;
+    h1Val = _mm_load_ps(hPtr); hPtr+=4;
+    h2Val = _mm_load_ps(hPtr); hPtr+=4;
+    
+    h12square = _mm_hadd_ps(_mm_mul_ps(h1Val, h1Val), _mm_mul_ps(h2Val, h2Val)); 
+    if (noise_estimate > 0) {
+      h12square  = _mm_add_ps(h12square, noise);
+    }
+    
+    h1square  = _mm_shuffle_ps(h12square, h12square, _MM_SHUFFLE(1, 1, 0, 0));
+    h2square  = _mm_shuffle_ps(h12square, h12square, _MM_SHUFFLE(3, 3, 2, 2));
+    
+    /* Conjugate channel */
+    h1conj = _mm_xor_ps(h1Val, conjugator); 
+    h2conj = _mm_xor_ps(h2Val, conjugator); 
+
+    /* Complex product */      
+    x1Val = PROD(y1Val, h1conj);
+    x2Val = PROD(y2Val, h2conj);
+
+    x1Val = _mm_div_ps(x1Val, h1square);
+    x2Val = _mm_div_ps(x2Val, h2square);
+    
+    _mm_store_ps(xPtr, x1Val); xPtr+=4;
+    _mm_store_ps(xPtr, x2Val); xPtr+=4;
+  }
+  for (int i=4*(nof_symbols/4);i<nof_symbols;i++) { /* scalar tail: the SSE loop above consumed 4 symbols per iteration */
+    x[i] = y[i]*conj(h[i])/(conj(h[i])*h[i]+noise_estimate);
+  }
  return nof_symbols;
+}
+
+#endif
+
+#ifdef LV_HAVE_AVX
+
+#define PROD_AVX(a,b) _mm256_addsub_ps(_mm256_mul_ps(a,_mm256_moveldup_ps(b)),_mm256_mul_ps(_mm256_shuffle_ps(a,a,0xB1),_mm256_movehdup_ps(b)))
+
+
+
+int srslte_predecoding_single_avx(cf_t *y, cf_t *h, cf_t *x, int nof_symbols, float noise_estimate) {
+  
+  float *xPtr = (float*) x;
+  const float *hPtr = (const float*) h;
+  const float *yPtr = (const float*) y;
+
+  __m256 conjugator = _mm256_setr_ps(0, -0.f, 0, -0.f, 0, -0.f, 0, -0.f);
+  
+  __m256 noise = _mm256_set1_ps(noise_estimate);
+  __m256 h1Val, h2Val, y1Val, y2Val, h12square, h1square, h2square, h1_p, h2_p, h1conj, h2conj, x1Val, x2Val;
+  
+  for (int i=0;i<nof_symbols/8;i++) {
+    y1Val = _mm256_load_ps(yPtr); yPtr+=8;
+    y2Val = _mm256_load_ps(yPtr); yPtr+=8;
+    h1Val = _mm256_load_ps(hPtr); hPtr+=8;
+    h2Val = _mm256_load_ps(hPtr); hPtr+=8;
+    
+    __m256 t1 = _mm256_mul_ps(h1Val, h1Val);
+    __m256 t2 = _mm256_mul_ps(h2Val, h2Val);
+    h12square = _mm256_hadd_ps(_mm256_permute2f128_ps(t1, t2, 0x20), _mm256_permute2f128_ps(t1, t2, 0x31)); 
+    if (noise_estimate > 0) {
+      h12square  = _mm256_add_ps(h12square, noise);
+    }
+    h1_p     = _mm256_permute_ps(h12square, _MM_SHUFFLE(1, 1, 0, 0));
+    h2_p     = _mm256_permute_ps(h12square, _MM_SHUFFLE(3, 3, 2, 2));
+    h1square = _mm256_permute2f128_ps(h1_p, h2_p, 2<<4);
+    h2square = _mm256_permute2f128_ps(h1_p, h2_p, 3<<4 | 1);
+    
+    /* Conjugate channel */
+    h1conj = _mm256_xor_ps(h1Val, conjugator); 
+    h2conj = _mm256_xor_ps(h2Val, conjugator); 
+
+    /* Complex product */      
+    x1Val = PROD_AVX(y1Val, h1conj);
+    x2Val = PROD_AVX(y2Val, h2conj);
+
+    x1Val = _mm256_div_ps(x1Val, h1square);
+    x2Val = _mm256_div_ps(x2Val, h2square);
+    
+    _mm256_store_ps(xPtr, x1Val); xPtr+=8;
+    _mm256_store_ps(xPtr, x2Val); xPtr+=8;
+  }
+  for (int i=8*(nof_symbols/8);i<nof_symbols;i++) { /* scalar tail: the AVX loop above consumed 8 symbols per iteration */
+    x[i] = y[i]*conj(h[i])/(conj(h[i])*h[i]+noise_estimate);
+  }
+  return nof_symbols;
+}
+
+#endif
+
+int srslte_predecoding_single_gen(cf_t *y, cf_t *h, cf_t *x, int nof_symbols, float noise_estimate) {
+  for (int i=0;i<nof_symbols;i++) {
+    x[i] = y[i]*conj(h[i])/(conj(h[i])*h[i]+noise_estimate);
+  }
+  return nof_symbols;
+}
+
+/* ZF/MMSE SISO equalizer x=y(h'h+no)^(-1)h' (ZF if n0=0.0)*/
+int srslte_predecoding_single(cf_t *y, cf_t *h, cf_t *x, int nof_symbols, float noise_estimate) {
+#ifdef LV_HAVE_AVX
+  if (nof_symbols > 32) {
+    return srslte_predecoding_single_avx(y, h, x, nof_symbols, noise_estimate);
  } else {
-    return SRSLTE_ERROR; 
+    return srslte_predecoding_single_gen(y, h, x, nof_symbols, noise_estimate);
+  }
+#else
+  #ifdef LV_HAVE_SSE
+    if (nof_symbols > 32) {
+      return srslte_predecoding_single_sse(y, h, x, nof_symbols, noise_estimate);
+    } else {
+      return srslte_predecoding_single_gen(y, h, x, nof_symbols, noise_estimate);      
    }
+  #else
+    return srslte_predecoding_single_gen(y, h, x, nof_symbols, noise_estimate);
+  #endif
+#endif
 }

 /* ZF/MMSE STBC equalizer x=y(H'H+n0·I)^(-1)H' (ZF is n0=0.0) 
@ -257,7 +384,7 @@ int srslte_predecoding_type(srslte_precoding_t *q, cf_t *y, cf_t *h[SRSLTE_MAX_P
  switch (type) {
  case SRSLTE_MIMO_TYPE_SINGLE_ANTENNA:
    if (nof_ports == 1 && nof_layers == 1) {
-      return srslte_predecoding_single(q, y, h[0], x[0], nof_symbols, noise_estimate);              
+      return srslte_predecoding_single(y, h[0], x[0], nof_symbols, noise_estimate);              
    } else {
      fprintf(stderr,
          "Number of ports and layers must be 1 for transmission on single antenna ports\n");
--- a/srslte/lib/mimo/test/precoding_test.c
+++ b/srslte/lib/mimo/test/precoding_test.c
@ -102,7 +102,7 @@ int main(int argc, char **argv) {
      perror("srslte_vec_malloc");
      exit(-1);
    }
-    xr[i] = calloc(1,sizeof(cf_t) * nof_symbols);
+    xr[i] = srslte_vec_malloc(sizeof(cf_t) * nof_symbols);
    if (!xr[i]) {
      perror("srslte_vec_malloc");
      exit(-1);
@ -186,7 +186,6 @@ int main(int argc, char **argv) {
  mse = 0;
  for (i = 0; i < nof_layers; i++) {
    for (j = 0; j < nof_symbols; j++) {
-      printf("%f - %f\n", crealf(xr[i][j]), crealf(x[i][j]));
      mse += cabsf(xr[i][j] - x[i][j]);
    }
  }
--- a/srslte/lib/modem/src/demod_soft.c
+++ b/srslte/lib/modem/src/demod_soft.c
@ -33,16 +33,16 @@
 #include "srslte/utils/bit.h"
 #include "srslte/modem/demod_soft.h"

-#define HAVE_SIMD
+// AVX implementation not useful for integers. Wait for AVX2

-#ifdef HAVE_SIMD
+#ifdef LV_HAVE_SSE
 #include <xmmintrin.h>
+#include <pmmintrin.h>
 #include <tmmintrin.h>
+void demod_16qam_lte_s_sse(const cf_t *symbols, short *llr, int nsymbols);
 #endif


-//#define SCALE_DEMOD16QAM
-
 #define SCALE_SHORT_CONV_QPSK  100
 #define SCALE_SHORT_CONV_QAM16 400
 #define SCALE_SHORT_CONV_QAM64 700
@ -72,47 +72,16 @@ void demod_16qam_lte(const cf_t *symbols, float *llr, int nsymbols) {
    float yre = crealf(symbols[i]);
    float yim = cimagf(symbols[i]);
    
-#ifdef SCALE_DEMOD16QAM
-
-    llr[4*i+2] = (fabsf(yre)-2/sqrt(10))*sqrt(10);
-    llr[4*i+3] = (fabsf(yim)-2/sqrt(10))*sqrt(10);    
-
-    if (llr[4*i+2] > 0) {
-      llr[4*i+0] = -yre/(3/sqrt(10));
-    } else {
-      llr[4*i+0] = -yre/(1/sqrt(10));
-    }
-    if (llr[4*i+3] > 0) {
-      llr[4*i+1] = -yim/(3/sqrt(10));
-    } else {
-      llr[4*i+1] = -yim/(1/sqrt(10));
-    }    
-
-#else
-    
    llr[4*i+0] = -yre;
    llr[4*i+1] = -yim;
    llr[4*i+2] = fabsf(yre)-2/sqrt(10);
    llr[4*i+3] = fabsf(yim)-2/sqrt(10);
-
-#endif
-    
  }
 }

-void demod_16qam_lte_s(const cf_t *symbols, short *llr, int nsymbols) {
-#ifndef HAVE_SIMD
-  for (int i=0;i<nsymbols;i++) {
-    short yre = (short) (SCALE_SHORT_CONV_QAM16*crealf(symbols[i]));
-    short yim = (short) (SCALE_SHORT_CONV_QAM16*cimagf(symbols[i]));
-        
-    llr[4*i+0] = -yre;
-    llr[4*i+1] = -yim;
-    llr[4*i+2] = abs(yre)-2*SCALE_SHORT_CONV_QAM16/sqrt(10);
-    llr[4*i+3] = abs(yim)-2*SCALE_SHORT_CONV_QAM16/sqrt(10);    
-  }
-#else
+#ifdef LV_HAVE_SSE

+void demod_16qam_lte_s_sse(const cf_t *symbols, short *llr, int nsymbols) {
    float *symbolsPtr = (float*) symbols;
  __m128i *resultPtr = (__m128i*) llr;
  __m128 symbol1, symbol2; 
@ -148,6 +117,22 @@ void demod_16qam_lte_s(const cf_t *symbols, short *llr, int nsymbols) {
    short yre = (short) (SCALE_SHORT_CONV_QAM16*crealf(symbols[i]));
    short yim = (short) (SCALE_SHORT_CONV_QAM16*cimagf(symbols[i]));
        
+    llr[4*i+0] = -yre;
+    llr[4*i+1] = -yim;
+    llr[4*i+2] = abs(yre)-2*SCALE_SHORT_CONV_QAM16/sqrt(10);
+    llr[4*i+3] = abs(yim)-2*SCALE_SHORT_CONV_QAM16/sqrt(10);    
+  }
+}
+#endif
+
+void demod_16qam_lte_s(const cf_t *symbols, short *llr, int nsymbols) {
+#ifdef LV_HAVE_SSE
+  demod_16qam_lte_s_sse(symbols, llr, nsymbols);
+#else
+  for (int i=0;i<nsymbols;i++) {
+    short yre = (short) (SCALE_SHORT_CONV_QAM16*crealf(symbols[i]));
+    short yim = (short) (SCALE_SHORT_CONV_QAM16*cimagf(symbols[i]));
+        
    llr[4*i+0] = -yre;
    llr[4*i+1] = -yim;
    llr[4*i+2] = abs(yre)-2*SCALE_SHORT_CONV_QAM16/sqrt(10);
@ -172,21 +157,10 @@ void demod_64qam_lte(const cf_t *symbols, float *llr, int nsymbols)
  
 }

-void demod_64qam_lte_s(const cf_t *symbols, short *llr, int nsymbols) 
-{
-#ifndef HAVE_SIMD
-  for (int i=0;i<nsymbols;i++) {
-    float yre = (short) (SCALE_SHORT_CONV_QAM64*crealf(symbols[i]));
-    float yim = (short) (SCALE_SHORT_CONV*cimagf(symbols[i]));
+#ifdef LV_HAVE_SSE

-    llr[6*i+0] = -yre;
-    llr[6*i+1] = -yim;
-    llr[6*i+2] = abs(yre)-4*SCALE_SHORT_CONV_QAM64/sqrt(42);
-    llr[6*i+3] = abs(yim)-4*SCALE_SHORT_CONV_QAM64/sqrt(42);
-    llr[6*i+4] = abs(llr[6*i+2])-2*SCALE_SHORT_CONV_QAM64/sqrt(42);
-    llr[6*i+5] = abs(llr[6*i+3])-2*SCALE_SHORT_CONV_QAM64/sqrt(42);        
-  }
-#else
+void demod_64qam_lte_s_sse(const cf_t *symbols, short *llr, int nsymbols) 
+{
  float *symbolsPtr = (float*) symbols;
  __m128i *resultPtr = (__m128i*) llr;
  __m128 symbol1, symbol2; 
@ -239,6 +213,26 @@ void demod_64qam_lte_s(const cf_t *symbols, short *llr, int nsymbols)
    float yre = (short) (SCALE_SHORT_CONV_QAM64*crealf(symbols[i]));
    float yim = (short) (SCALE_SHORT_CONV_QAM64*cimagf(symbols[i]));

+    llr[6*i+0] = -yre;
+    llr[6*i+1] = -yim;
+    llr[6*i+2] = abs(yre)-4*SCALE_SHORT_CONV_QAM64/sqrt(42);
+    llr[6*i+3] = abs(yim)-4*SCALE_SHORT_CONV_QAM64/sqrt(42);
+    llr[6*i+4] = abs(llr[6*i+2])-2*SCALE_SHORT_CONV_QAM64/sqrt(42);
+    llr[6*i+5] = abs(llr[6*i+3])-2*SCALE_SHORT_CONV_QAM64/sqrt(42);        
+  }
+}
+  
+#endif
+
+void demod_64qam_lte_s(const cf_t *symbols, short *llr, int nsymbols) 
+{
+#ifdef LV_HAVE_SSE
+  demod_64qam_lte_s_sse(symbols, llr, nsymbols);
+#else
+  for (int i=0;i<nsymbols;i++) {
+    float yre = (short) (SCALE_SHORT_CONV_QAM64*crealf(symbols[i]));
+    float yim = (short) (SCALE_SHORT_CONV_QAM64*cimagf(symbols[i]));
+
    llr[6*i+0] = -yre;
    llr[6*i+1] = -yim;
    llr[6*i+2] = abs(yre)-4*SCALE_SHORT_CONV_QAM64/sqrt(42);
--- a/srslte/lib/phch/src/pbch.c
+++ b/srslte/lib/phch/src/pbch.c
@ -470,8 +470,7 @@ int srslte_pbch_decode(srslte_pbch_t *q, cf_t *slot1_symbols, cf_t *ce_slot1[SRS
        /* in control channels, only diversity is supported */
        if (nant == 1) {
          /* no need for layer demapping */
-          srslte_predecoding_single(&q->precoding, q->symbols[0], q->ce[0], q->d,
-              q->nof_symbols, noise_estimate);
+          srslte_predecoding_single(q->symbols[0], q->ce[0], q->d, q->nof_symbols, noise_estimate);
        } else {
          srslte_predecoding_diversity(&q->precoding, q->symbols[0], q->ce, x, nant,
              q->nof_symbols, noise_estimate);
--- a/srslte/lib/phch/src/pcfich.c
+++ b/srslte/lib/phch/src/pcfich.c
@ -193,8 +193,7 @@ int srslte_pcfich_decode(srslte_pcfich_t *q, cf_t *slot_symbols, cf_t *ce[SRSLTE
    /* in control channels, only diversity is supported */
    if (q->cell.nof_ports == 1) {
      /* no need for layer demapping */
-      srslte_predecoding_single(&q->precoding, q->symbols[0], q->ce[0], q->d,
-          q->nof_symbols, noise_estimate);
+      srslte_predecoding_single(q->symbols[0], q->ce[0], q->d, q->nof_symbols, noise_estimate);
    } else {
      srslte_predecoding_diversity(&q->precoding, q->symbols[0], ce_precoding, x,
          q->cell.nof_ports, q->nof_symbols, noise_estimate);
--- a/srslte/lib/phch/src/pdcch.c
+++ b/srslte/lib/phch/src/pdcch.c
@ -408,7 +408,7 @@ int srslte_pdcch_extract_llr(srslte_pdcch_t *q, cf_t *sf_symbols, cf_t *ce[SRSLT
    /* in control channels, only diversity is supported */
    if (q->cell.nof_ports == 1) {
      /* no need for layer demapping */
-      srslte_predecoding_single(&q->precoding, q->symbols[0], q->ce[0], q->d, nof_symbols, noise_estimate);
+      srslte_predecoding_single(q->symbols[0], q->ce[0], q->d, nof_symbols, noise_estimate);
    } else {
      srslte_predecoding_diversity(&q->precoding, q->symbols[0], q->ce, x, q->cell.nof_ports, nof_symbols, noise_estimate);
      srslte_layerdemap_diversity(x, q->d, q->cell.nof_ports, nof_symbols / q->cell.nof_ports);
--- a/srslte/lib/phch/src/pdsch.c
+++ b/srslte/lib/phch/src/pdsch.c
@ -404,8 +404,7 @@ int srslte_pdsch_decode_rnti(srslte_pdsch_t *q,
    /* TODO: only diversity is supported */
    if (q->cell.nof_ports == 1) {
      /* no need for layer demapping */
-      srslte_predecoding_single(&q->precoding, q->symbols[0], q->ce[0], q->d,
-          cfg->nbits.nof_re, noise_estimate);
+      srslte_predecoding_single(q->symbols[0], q->ce[0], q->d, cfg->nbits.nof_re, noise_estimate);
    } else {
      srslte_predecoding_diversity(&q->precoding, q->symbols[0], q->ce, x, q->cell.nof_ports,
          cfg->nbits.nof_re, noise_estimate);
--- a/srslte/lib/phch/src/phich.c
+++ b/srslte/lib/phch/src/phich.c
@ -216,8 +216,7 @@ int srslte_phich_decode(srslte_phich_t *q, cf_t *slot_symbols, cf_t *ce[SRSLTE_M
  /* in control channels, only diversity is supported */
  if (q->cell.nof_ports == 1) {
    /* no need for layer demapping */
-    srslte_predecoding_single(&q->precoding, q->symbols[0], q->ce[0], q->d0,
-    SRSLTE_PHICH_MAX_NSYMB, noise_estimate);
+    srslte_predecoding_single(q->symbols[0], q->ce[0], q->d0, SRSLTE_PHICH_MAX_NSYMB, noise_estimate);
  } else {
    srslte_predecoding_diversity(&q->precoding, q->symbols[0], ce_precoding, x,
        q->cell.nof_ports, SRSLTE_PHICH_MAX_NSYMB, noise_estimate);
--- a/srslte/lib/phch/src/pusch.c
+++ b/srslte/lib/phch/src/pusch.c
@ -438,8 +438,7 @@ int srslte_pusch_decode(srslte_pusch_t *q,
        return SRSLTE_ERROR;
      }
      
-      srslte_predecoding_single(&q->equalizer, q->d, q->ce, q->z,
-            cfg->nbits.nof_re, noise_estimate);
+      srslte_predecoding_single(q->d, q->ce, q->z, cfg->nbits.nof_re, noise_estimate);

      srslte_dft_predecoding(&q->dft_precoding, q->z, q->d, cfg->grant.L_prb, cfg->nbits.nof_symb);
      
--- a/srslte/lib/phch/src/sch.c
+++ b/srslte/lib/phch/src/sch.c
@ -111,7 +111,7 @@ int srslte_sch_init(srslte_sch_t *q) {
      fprintf(stderr, "Error initiating Turbo Coder\n");
      goto clean;
    }
-    if (srslte_tdec_sse_init(&q->decoder, SRSLTE_TCOD_MAX_LEN_CB)) {
+    if (srslte_tdec_init(&q->decoder, SRSLTE_TCOD_MAX_LEN_CB)) {
      fprintf(stderr, "Error initiating Turbo Decoder\n");
      goto clean;
    }
@ -133,7 +133,7 @@ int srslte_sch_init(srslte_sch_t *q) {
      goto clean; 
    }
    bzero(q->temp_g_bits, SRSLTE_MAX_PRB*12*12*12);
-    q->ul_interleaver = srslte_vec_malloc(sizeof(uint32_t)*SRSLTE_MAX_PRB*12*12*12);
+    q->ul_interleaver = srslte_vec_malloc(sizeof(uint16_t)*SRSLTE_MAX_PRB*12*12*12);
    if (!q->ul_interleaver) {
      goto clean; 
    }
@ -163,7 +163,7 @@ void srslte_sch_free(srslte_sch_t *q) {
  if (q->ul_interleaver) {
    free(q->ul_interleaver);
  }
-  srslte_tdec_sse_free(&q->decoder);
+  srslte_tdec_free(&q->decoder);
  srslte_tcod_free(&q->encoder);
  srslte_uci_cqi_free(&q->uci_cqi);
  bzero(q, sizeof(srslte_sch_t));
@ -413,10 +413,10 @@ static int decode_tb(srslte_sch_t *q,
      srslte_crc_t *crc_ptr; 
      early_stop = false; 

-      srslte_tdec_sse_reset(&q->decoder, cb_len);
+      srslte_tdec_reset(&q->decoder, cb_len);
            
      do {
-        srslte_tdec_sse_iteration(&q->decoder, softbuffer->buffer_f[i], cb_len); 
+        srslte_tdec_iteration(&q->decoder, softbuffer->buffer_f[i], cb_len); 
        q->nof_iterations++;
        
        if (cb_segm->C > 1) {
@ -427,10 +427,10 @@ static int decode_tb(srslte_sch_t *q,
          crc_ptr = &q->crc_tb; 
        }

-        srslte_tdec_sse_decision_byte(&q->decoder, q->cb_in, cb_len);
+        srslte_tdec_decision_byte(&q->decoder, q->cb_in, cb_len);
                 
        if (i == 9) {
-          srslte_tdec_sse_decision(&q->decoder, q->temp_data, cb_len);
+          srslte_tdec_decision(&q->decoder, q->temp_data, cb_len);
        }
        /* Check Codeblock CRC and stop early if incorrect */
        if (!srslte_crc_checksum_byte(crc_ptr, q->cb_in, len_crc)) {
@ -525,7 +525,7 @@ int srslte_ulsch_decode(srslte_sch_t *q, srslte_pusch_cfg_t *cfg, srslte_softbuf
 /* UL-SCH channel interleaver according to 5.2.2.8 of 36.212 */
 void ulsch_interleave(uint8_t *g_bits, uint32_t Qm, uint32_t H_prime_total, 
                      uint32_t N_pusch_symbs, uint8_t *q_bits, srslte_uci_bit_t *ri_bits, uint32_t nof_ri_bits, 
-                      uint32_t *interleaver_buffer, uint8_t *temp_buffer, uint32_t buffer_sz) 
+                      uint16_t *interleaver_buffer, uint8_t *temp_buffer, uint32_t buffer_sz) 
 {
  
  uint32_t rows = H_prime_total/N_pusch_symbs;
--- a/srslte/lib/phch/test/pdsch_test.c
+++ b/srslte/lib/phch/test/pdsch_test.c
@ -147,24 +147,24 @@ int main(int argc, char **argv) {

  /* init memory */
  for (i=0;i<cell.nof_ports;i++) {
-    ce[i] = malloc(sizeof(cf_t) * SRSLTE_SF_LEN_RE(cell.nof_prb, cell.cp));
+    ce[i] = srslte_vec_malloc(sizeof(cf_t) * SRSLTE_SF_LEN_RE(cell.nof_prb, cell.cp));
    if (!ce[i]) {
-      perror("malloc");
+      perror("srslte_vec_malloc");
      goto quit;
    }
    for (j=0;j<SRSLTE_SF_LEN_RE(cell.nof_prb, cell.cp);j++) {
      ce[i][j] = 1;
    }
-    slot_symbols[i] = calloc(sizeof(cf_t) , SRSLTE_SF_LEN_RE(cell.nof_prb, cell.cp));
+    slot_symbols[i] = srslte_vec_malloc(sizeof(cf_t)*SRSLTE_SF_LEN_RE(cell.nof_prb, cell.cp));
    if (!slot_symbols[i]) {
-      perror("malloc");
+      perror("srslte_vec_malloc");
      goto quit;
    }
  }
  
  data = srslte_vec_malloc(sizeof(uint8_t) * grant.mcs.tbs/8);
  if (!data) {
-    perror("malloc");
+    perror("srslte_vec_malloc");
    goto quit;
  }

--- a/srslte/lib/utils/src/vector.c
+++ b/srslte/lib/utils/src/vector.c
@ -240,6 +240,17 @@ void srslte_vec_sc_prod_ccc(cf_t *x, cf_t h, cf_t *z, uint32_t len) {
 #endif
 }

+/* Converts len 16-bit integer samples from x into floats stored in z,
+ * dividing each converted sample by scale.
+ *
+ * The plain-C fallback divides (rather than multiplies) so that it produces
+ * the same result as volk_16i_s32f_convert_32f, which divides every output
+ * point by its scalar argument; otherwise the output would depend on whether
+ * the library was built with VOLK.
+ *
+ * x and z must each hold at least len elements. */
+void srslte_vec_convert_if(int16_t *x, float *z, float scale, uint32_t len) {
+#ifndef HAVE_VOLK_CONVERT_IF_FUNCTION
+  /* unsigned index: matches the type of len and avoids a signed/unsigned compare */
+  uint32_t i;
+  for (i=0;i<len;i++) {
+    z[i] = ((float) x[i])/scale;
+  }
+#else
+  volk_16i_s32f_convert_32f(z,x,scale,len);
+#endif  
+}
+
 void srslte_vec_convert_fi(float *x, int16_t *z, float scale, uint32_t len) {
 #ifndef HAVE_VECTOR_SIMD
  int i;
@ -303,7 +314,12 @@ void srslte_vec_deinterleave_real_cf(cf_t *x, float *real, uint32_t len) {

 void *srslte_vec_malloc(uint32_t size) {
 #ifndef HAVE_VOLK
-  return malloc(size);
+  void *ptr;
+  if (posix_memalign(&ptr,32,size)) {
+    return NULL;
+  } else {
+    return ptr;
+  }
 #else
  void *ptr;
  if (posix_memalign(&ptr,volk_get_alignment(),size)) {
--- a/srslte/lib/utils/src/vector_simd.c
+++ b/srslte/lib/utils/src/vector_simd.c
@ -37,18 +37,15 @@
 #include <inttypes.h>
 #include <stdio.h>

-#include <xmmintrin.h>
+#ifdef LV_HAVE_SSE
+#include <emmintrin.h>
+#include <nmmintrin.h>
+#endif

-void print128_num(__m128i var)
-{
-    int16_t *val = (int16_t*) &var;//can also use uint32_t instead of 16_t
-    printf("Numerical: %d %d %d %d %d %d %d %d \n", 
-           val[0], val[1], val[2], val[3], val[4], val[5], 
-           val[6], val[7]);
-}

 void srslte_vec_sum_sss_simd(short *x, short *y, short *z, uint32_t len)
 {
+#ifdef LV_HAVE_SSE
  unsigned int number = 0;
  const unsigned int points = len / 8;

@ -75,10 +72,13 @@ void srslte_vec_sum_sss_simd(short *x, short *y, short *z, uint32_t len)
  for(;number < len; number++){
    z[number] = x[number] + y[number];
  }
+#endif
+
 }

 void srslte_vec_sub_sss_simd(short *x, short *y, short *z, uint32_t len)
 {
+#ifdef LV_HAVE_SSE
  unsigned int number = 0;
  const unsigned int points = len / 8;

@ -105,10 +105,12 @@ void srslte_vec_sub_sss_simd(short *x, short *y, short *z, uint32_t len)
  for(;number < len; number++){
    z[number] = x[number] - y[number];
  }
+#endif
 }

 void srslte_vec_prod_sss_simd(short *x, short *y, short *z, uint32_t len)
 {
+#ifdef LV_HAVE_SSE
  unsigned int number = 0;
  const unsigned int points = len / 8;

@ -135,10 +137,12 @@ void srslte_vec_prod_sss_simd(short *x, short *y, short *z, uint32_t len)
  for(;number < len; number++){
    z[number] = x[number] * y[number];
  }
+#endif
 }

 void srslte_vec_sc_div2_sss_simd(short *x, int k, short *z, uint32_t len)
 {
+#ifdef LV_HAVE_SSE
  unsigned int number = 0;
  const unsigned int points = len / 8;

@ -163,10 +167,13 @@ void srslte_vec_sc_div2_sss_simd(short *x, int k, short *z, uint32_t len)
  for(;number < len; number++){
    z[number] = x[number] / divn;
  }
+#endif
 }

+/* No improvement with AVX */
 void srslte_vec_lut_sss_simd(short *x, unsigned short *lut, short *y, uint32_t len)
 {
+#ifdef LV_HAVE_SSE
  unsigned int number = 0;
  const unsigned int points = len / 8;

@ -192,12 +199,13 @@ void srslte_vec_lut_sss_simd(short *x, unsigned short *lut, short *y, uint32_t l
  for(;number < len; number++){
    y[lut[number]] = x[number];
  }
-  
+#endif  
 }

-/* Modified from volk_32f_s32f_convert_16i_a_sse2. Removed clipping */
+/* Modified from volk_32f_s32f_convert_16i_a_sse2. Removed clipping */
 void srslte_vec_convert_fi_simd(float *x, int16_t *z, float scale, uint32_t len)
 {
+#ifdef LV_HAVE_SSE
  unsigned int number = 0;

  const unsigned int eighthPoints = len / 8;
@ -230,5 +238,5 @@ void srslte_vec_convert_fi_simd(float *x, int16_t *z, float scale, uint32_t len)
  for(; number < len; number++){
    z[number] = (int16_t) (x[number] * scale);
  }
-
+#endif
 }