Turbo decoder working OK @ 100 Mbps

9 years ago · ad06998d91
parent bb56d4895d
commit ad06998d91
14 changed files with 1286 additions and 293 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -84,7 +84,7 @@ IF(CMAKE_COMPILER_IS_GNUCXX)
 ENDIF(CMAKE_COMPILER_IS_GNUCXX)

 IF(CMAKE_COMPILER_IS_GNUCC)
-    set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Wall -Wno-write-strings -Wno-format-extra-args -Winline -Wno-unused-result -Wno-format -std=c99 -D_GNU_SOURCE -g")
+    set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Wall -Wno-write-strings -Wno-format-extra-args -Winline -Wno-unused-result -Wno-format -std=c99 -D_GNU_SOURCE -g -mfpmath=sse -mavx -O3")
   # IF(${CMAKE_BUILD_TYPE} STREQUAL "Debug")
   #   set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Werror -Wno-error=implicit-function-declaration -Wno-error=unused-but-set-variable")
   # ENDIF(${CMAKE_BUILD_TYPE} STREQUAL "Debug")
--- a/srslte/include/srslte/fec/tc_interl.h
+++ b/srslte/include/srslte/fec/tc_interl.h
@ -40,8 +40,8 @@
 #include <stdint.h>

 typedef struct SRSLTE_API {
-  uint32_t *forward;
-  uint32_t *reverse;
+  uint16_t *forward;
+  uint16_t *reverse;
  uint32_t max_long_cb;
 } srslte_tc_interl_t;

--- a/srslte/include/srslte/fec/turbodecoder.h
+++ b/srslte/include/srslte/fec/turbodecoder.h
@ -50,11 +50,12 @@
 #define SRSLTE_TCOD_MAX_LEN_CB     6144
 #define SRSLTE_TCOD_MAX_LEN_CODED  (SRSLTE_TCOD_RATE*SRSLTE_TCOD_MAX_LEN_CB+SRSLTE_TCOD_TOTALTAIL)

-typedef float srslte_llr_t;
+typedef short llr_t;

 typedef struct SRSLTE_API {
  int max_long_cb;
-  srslte_llr_t *beta;
+  llr_t *alpha;
+  llr_t *branch;
 } srslte_map_gen_t;

 typedef struct SRSLTE_API {
@ -62,14 +63,17 @@ typedef struct SRSLTE_API {

  srslte_map_gen_t dec;

-  srslte_llr_t *llr1;
-  srslte_llr_t *llr2;
-  srslte_llr_t *w;
-  srslte_llr_t *syst;
-  srslte_llr_t *parity;
+  llr_t *app1;
+  llr_t *app2;
+  llr_t *ext1;
+  llr_t *ext2;
+  llr_t *syst;
+  llr_t *parity0;
+  llr_t *parity1;
  
  int current_cbidx; 
  srslte_tc_interl_t interleaver[SRSLTE_NOF_TC_CB_SIZES];
+  int n_iter;
 } srslte_tdec_t;

 SRSLTE_API int srslte_tdec_init(srslte_tdec_t * h, 
@ -80,7 +84,7 @@ SRSLTE_API void srslte_tdec_free(srslte_tdec_t * h);
 SRSLTE_API int srslte_tdec_reset(srslte_tdec_t * h, uint32_t long_cb);

 SRSLTE_API void srslte_tdec_iteration(srslte_tdec_t * h, 
-                                      srslte_llr_t * input, 
+                                      float * input, 
                                      uint32_t long_cb);

 SRSLTE_API void srslte_tdec_decision(srslte_tdec_t * h, 
@ -92,7 +96,7 @@ SRSLTE_API void srslte_tdec_decision_byte(srslte_tdec_t * h,
                                          uint32_t long_cb); 

 SRSLTE_API int srslte_tdec_run_all(srslte_tdec_t * h, 
-                                   srslte_llr_t * input, 
+                                   float * input, 
                                   uint8_t *output,
                                   uint32_t nof_iterations, 
                                   uint32_t long_cb);
--- a/srslte/include/srslte/fec/turbodecoder_vl.h
+++ b/srslte/include/srslte/fec/turbodecoder_vl.h
@ -0,0 +1,100 @@
+/**
+ *
+ * \section COPYRIGHT
+ *
+ * Copyright 2013-2015 The srsLTE Developers. See the
+ * COPYRIGHT file at the top-level directory of this distribution.
+ *
+ * \section LICENSE
+ *
+ * This file is part of the srsLTE library.
+ *
+ * srsLTE is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as
+ * published by the Free Software Foundation, either version 3 of
+ * the License, or (at your option) any later version.
+ *
+ * srsLTE is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Affero General Public License for more details.
+ *
+ * A copy of the GNU Affero General Public License can be found in
+ * the LICENSE file in the top-level directory of this distribution
+ * and at http://www.gnu.org/licenses/.
+ *
+ */
+
+/**********************************************************************************************
+ *  File:         turbodecoder_vl.h
+ *
+ *  Description:  Turbo Decoder.
+ *                Parallel Concatenated Convolutional Code (PCCC) with two 8-state constituent
+ *                encoders and one turbo code internal interleaver. The coding rate of turbo
+ *                encoder is 1/3.
+ *                MAP_GEN is the MAX-LOG-MAP generic implementation of the decoder.
+ *
+ *  Reference:    3GPP TS 36.212 version 10.0.0 Release 10 Sec. 5.1.3.2
+ *********************************************************************************************/
+
+#ifndef TURBODECODER_VL_
+#define TURBODECODER_VL_
+
+#include "srslte/config.h"
+#include "srslte/fec/tc_interl.h"
+#include "srslte/fec/cbsegm.h"
+
+#define SRSLTE_TCOD_RATE 3
+#define SRSLTE_TCOD_TOTALTAIL 12
+
+#define SRSLTE_TCOD_MAX_LEN_CB     6144
+#define SRSLTE_TCOD_MAX_LEN_CODED  (SRSLTE_TCOD_RATE*SRSLTE_TCOD_MAX_LEN_CB+SRSLTE_TCOD_TOTALTAIL)
+
+typedef float srslte_llr_t;
+
+typedef struct SRSLTE_API {
+  int max_long_cb;
+  srslte_llr_t *beta;
+} srslte_map_gen_vl_t;
+
+typedef struct SRSLTE_API {
+  int max_long_cb;
+
+  srslte_map_gen_vl_t dec;
+
+  srslte_llr_t *llr1;
+  srslte_llr_t *llr2;
+  srslte_llr_t *w;
+  srslte_llr_t *syst;
+  srslte_llr_t *parity;
+
+  int current_cbidx; 
+  srslte_tc_interl_t interleaver[SRSLTE_NOF_TC_CB_SIZES];
+} srslte_tdec_vl_t;
+
+SRSLTE_API int srslte_tdec_vl_init(srslte_tdec_vl_t * h, 
+                                uint32_t max_long_cb);
+
+SRSLTE_API void srslte_tdec_vl_free(srslte_tdec_vl_t * h);
+
+SRSLTE_API int srslte_tdec_vl_reset(srslte_tdec_vl_t * h, uint32_t long_cb);
+
+SRSLTE_API void srslte_tdec_vl_iteration(srslte_tdec_vl_t * h, 
+                                      srslte_llr_t * input, 
+                                      uint32_t long_cb);
+
+SRSLTE_API void srslte_tdec_vl_decision(srslte_tdec_vl_t * h, 
+                                     uint8_t *output, 
+                                     uint32_t long_cb);
+
+SRSLTE_API void srslte_tdec_vl_decision_byte(srslte_tdec_vl_t * h, 
+                                          uint8_t *output, 
+                                          uint32_t long_cb); 
+
+SRSLTE_API int srslte_tdec_vl_run_all(srslte_tdec_vl_t * h, 
+                                   srslte_llr_t * input, 
+                                   uint8_t *output,
+                                   uint32_t nof_iterations, 
+                                   uint32_t long_cb);
+
+#endif
--- a/srslte/include/srslte/utils/vector.h
+++ b/srslte/include/srslte/utils/vector.h
@ -69,6 +69,7 @@ SRSLTE_API void srslte_vec_fprint_f(FILE *stream, float *x, uint32_t len);
 SRSLTE_API void srslte_vec_fprint_b(FILE *stream, uint8_t *x, uint32_t len);
 SRSLTE_API void srslte_vec_fprint_byte(FILE *stream, uint8_t *x, uint32_t len);
 SRSLTE_API void srslte_vec_fprint_i(FILE *stream, int *x, uint32_t len);
+SRSLTE_API void srslte_vec_fprint_s(FILE *stream, short *x, uint32_t len); 
 SRSLTE_API void srslte_vec_fprint_hex(FILE *stream, uint8_t *x, uint32_t len);

 /* Saves/loads a vector to a file */
@ -79,6 +80,8 @@ SRSLTE_API void srslte_vec_load_file(char *filename, void *buffer, uint32_t len)
 SRSLTE_API void srslte_vec_sum_ch(uint8_t *x, uint8_t *y, char *z, uint32_t len);
 SRSLTE_API void srslte_vec_sum_fff(float *x, float *y, float *z, uint32_t len);
 SRSLTE_API void srslte_vec_sum_ccc(cf_t *x, cf_t *y, cf_t *z, uint32_t len);
+SRSLTE_API void srslte_vec_sub_sss(short *x, short *y, short *z, uint32_t len);
+SRSLTE_API void srslte_vec_sum_sss(short *x, short *y, short *z, uint32_t len);

 /* substract two vectors z=x-y */
 SRSLTE_API void srslte_vec_sub_fff(float *x, float *y, float *z, uint32_t len); 
@ -99,12 +102,18 @@ SRSLTE_API void srslte_vec_sc_add_ccc(cf_t *x, cf_t h, cf_t *z, uint32_t len);
 SRSLTE_API void srslte_vec_sc_prod_cfc(cf_t *x, float h, cf_t *z, uint32_t len);
 SRSLTE_API void srslte_vec_sc_prod_ccc(cf_t *x, cf_t h, cf_t *z, uint32_t len);
 SRSLTE_API void srslte_vec_sc_prod_fff(float *x, float h, float *z, uint32_t len); 
+SRSLTE_API void srslte_vec_sc_prod_sfs(short *x, float h, short *z, uint32_t len); 
+SRSLTE_API void srslte_vec_sc_div2_sss(short *x, int pow2_div, short *z, uint32_t len); 

 /* Normalization */
 SRSLTE_API void srslte_vec_norm_cfc(cf_t *x, float amplitude, cf_t *y, uint32_t len);

 SRSLTE_API void srslte_vec_convert_fi(float *x, int16_t *z, float scale, uint32_t len);

+
+SRSLTE_API void srslte_vec_lut_fuf(float *x, uint32_t *lut, float *y, uint32_t len);
+SRSLTE_API void srslte_vec_lut_sss(short *x, unsigned short *lut, short *y, uint32_t len); 
+
 SRSLTE_API void srslte_vec_deinterleave_cf(cf_t *x, float *real, float *imag, uint32_t len); 
 SRSLTE_API void srslte_vec_deinterleave_real_cf(cf_t *x, float *real, uint32_t len);

--- a/srslte/include/srslte/utils/vector_simd.h
+++ b/srslte/include/srslte/utils/vector_simd.h
@ -0,0 +1,50 @@
+/**
+ *
+ * \section COPYRIGHT
+ *
+ * Copyright 2013-2015 The srsLTE Developers. See the
+ * COPYRIGHT file at the top-level directory of this distribution.
+ *
+ * \section LICENSE
+ *
+ * This file is part of the srsLTE library.
+ *
+ * srsLTE is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as
+ * published by the Free Software Foundation, either version 3 of
+ * the License, or (at your option) any later version.
+ *
+ * srsLTE is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Affero General Public License for more details.
+ *
+ * A copy of the GNU Affero General Public License can be found in
+ * the LICENSE file in the top-level directory of this distribution
+ * and at http://www.gnu.org/licenses/.
+ *
+ */
+
+#ifndef VECTORSIMD_
+#define VECTORSIMD_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <stdio.h>
+#include <stdint.h>
+#include "srslte/config.h"
+
+
+SRSLTE_API void srslte_vec_sum_sss_simd(short *x, short *y, short *z, uint32_t len);
+
+SRSLTE_API void srslte_vec_sub_sss_simd(short *x, short *y, short *z, uint32_t len); 
+
+SRSLTE_API void srslte_vec_sc_div2_sss_simd(short *x, int n_rightshift, short *z, uint32_t len); 
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
--- a/srslte/lib/fec/src/tc_interl_umts.c
+++ b/srslte/lib/fec/src/tc_interl_umts.c
@ -89,7 +89,7 @@ int srslte_tc_interl_UMTS_gen(srslte_tc_interl_t *h, uint32_t long_cb) {
  uint32_t i, j;
  uint32_t res, prim, aux;
  uint32_t kp, k;
-  uint32_t *per, *desper;
+  uint16_t *per, *desper;
  uint8_t v;
  uint16_t p;
  uint16_t s[MAX_COLS], q[MAX_ROWS], r[MAX_ROWS], T[MAX_ROWS];
--- a/srslte/lib/fec/src/turbodecoder.c
+++ b/srslte/lib/fec/src/turbodecoder.c
@ -35,139 +35,272 @@
 #include "srslte/fec/turbodecoder.h"
 #include "srslte/utils/vector.h"

+#include <inttypes.h>
+
+#include <emmintrin.h>
+#include <immintrin.h>
+
 #define NUMSTATES       8
 #define NINPUTS         2
 #define TAIL            3
 #define TOTALTAIL       12

-#define INF 9e4
-#define ZERO 9e-4
+#define INF 10000
+#define ZERO 0
+#define SCALE 100

-/************************************************
- *
- *  MAP_GEN is the MAX-LOG-MAP generic implementation of the
- *  Decoder
- *
- ************************************************/
-void srslte_map_gen_beta(srslte_map_gen_t * s, srslte_llr_t * input, srslte_llr_t * parity,
-                  uint32_t long_cb)
+static void print128_num(__m128i var)
 {
-  srslte_llr_t m_b[8], new[8], old[8];
-  srslte_llr_t x, y, xy;
-  int k;
-  uint32_t end = long_cb + SRSLTE_TCOD_RATE;
-  srslte_llr_t *beta = s->beta;
-  uint32_t i;
-
-  for (i = 0; i < 8; i++) {
-    old[i] = beta[8 * (end) + i];
-  }
+    int16_t *val = (int16_t*) &var;//can also use uint32_t instead of 16_t
+    printf("[%d %d %d %d %d %d %d %d]\n", 
+           val[0], val[1], val[2], val[3], val[4], val[5], 
+           val[6], val[7]);
+}

-  for (k = end - 1; k >= 0; k--) {
-    x = input[k];
-    y = parity[k];
+void print128f_num(__m128 var) // debug helper: prints the four float lanes of var to stdout
+{
+    float *val = (float*) &var; // view the vector as four consecutive floats, lane 0 first
+    printf("[%f %f %f %f]\n", 
+           val[0], val[1], val[2], val[3]);
+}

-    xy = x + y;

-    m_b[0] = old[4] + xy;
-    m_b[1] = old[4];
-    m_b[2] = old[5] + y;
-    m_b[3] = old[5] + x;
-    m_b[4] = old[6] + x;
-    m_b[5] = old[6] + y;
-    m_b[6] = old[7];
-    m_b[7] = old[7] + xy;
+/************************************************
+ *
+ *  MAP_GEN is the MAX-LOG-MAP generic implementation 
+ *
+ ************************************************/

-    new[0] = old[0];
-    new[1] = old[0] + xy;
-    new[2] = old[1] + x;
-    new[3] = old[1] + y;
-    new[4] = old[2] + y;
-    new[5] = old[2] + x;
-    new[6] = old[3] + xy;
-    new[7] = old[3];
+static inline int16_t hMax(__m128i buffer) // horizontal reduction over the eight int16 lanes; NOTE: yields 0x7FFF - max(lane), not the maximum itself
+{
+    __m128i tmp1 = _mm_sub_epi8(_mm_set1_epi16(0x7FFF), buffer); // 0x7FFF - lane turns the signed maximum into the unsigned minimum; the byte-wise subtract equals a 16-bit one here because the 0xFF low byte never borrows
+    __m128i tmp3 = _mm_minpos_epu16(tmp1); // unsigned minimum is placed in bits 15:0 (its lane index in bits 18:16)
+    return (int16_t)(_mm_cvtsi128_si32(tmp3)); // truncation drops the lane index; BETA_STEP_CNT only uses the difference of two hMax() results, where the 0x7FFF offset cancels
+}

-    for (i = 0; i < 8; i++) {
-      if (m_b[i] > new[i])
-        new[i] = m_b[i];
-      beta[8 * k + i] = new[i];
-      old[i] = new[i];
-    }
+void srslte_map_gen_beta(srslte_map_gen_t * s, llr_t * output, uint32_t long_cb) // backward recursion over s->branch; combines each step with the stored s->alpha vector and writes output[0..long_cb-1]
+{
+  int k;
+  uint32_t end = long_cb + 3; // the first loop below consumes the 3 steps that follow the long_cb data steps
+  const __m128i *alphaPtr = (const __m128i*) s->alpha;
+ 
+  __m128i beta_k = _mm_set_epi16(-INF, -INF, -INF, -INF, -INF, -INF, -INF, 0); // _mm_set_epi16 lists lanes 7..0: lane 0 starts at 0, the other seven lanes at -INF
+  __m128i g, bp, bn, alpha_k; 
+  
+  __m128i shuf_bp = _mm_set_epi8( // rows list destination lanes 7..0; the number in each row comment is the source lane copied there
+    15, 14, // 7
+    7,  6,  // 3
+    5,  4,  // 2
+    13, 12, // 6
+    11, 10, // 5
+    3,  2,  // 1
+    1,  0,  // 0
+    9,  8   // 4
+  );
+
+  __m128i shuf_bn = _mm_set_epi8( // same layout as shuf_bp, applied to the (beta_k - g) candidates
+    7,   6, // 3
+    15, 14, // 7
+    13, 12, // 6
+    5,  4,  // 2
+    3,  2,  // 1
+    11, 10, // 5
+    9,  8,  // 4
+    1,  0   // 0
+  );
+ 
+  alphaPtr += long_cb-1; // start at alpha vector index long_cb-1; BETA_STEP_CNT reads it backwards, one vector per step
+
+    __m128i shuf_g[4]; // shuf_g[c] spreads one (g0,g1) pair of gv over the lanes as g1,g0,g0,g1,g1,g0,g0,g1 (lanes 0..7); c=0 selects the highest pair, c=3 the lowest
+  shuf_g[3] = _mm_set_epi8(3,2,1,0,1,0,3,2,3,2,1,0,1,0,3,2);
+  shuf_g[2] = _mm_set_epi8(7,6,5,4,5,4,7,6,7,6,5,4,5,4,7,6);
+  shuf_g[1] = _mm_set_epi8(11,10,9,8,9,8,11,10,11,10,9,8,9,8,11,10);
+  shuf_g[0] = _mm_set_epi8(15,14,13,12,13,12,15,14,15,14,13,12,13,12,15,14);
+  __m128i gv;
+  llr_t *b = &s->branch[2*long_cb-8]; // last 8 shorts (4 pairs) of the data part of branch[]; 16-byte aligned only if branch is aligned and long_cb is a multiple of 4
+  __m128i *gPtr = (__m128i*) b;
+  __m128i shuf_norm = _mm_set_epi8(1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0); // broadcasts lane 0 to all eight lanes
+  
+#define BETA_STEP(g)     bp = _mm_add_epi16(beta_k, g);\
+    bn = _mm_sub_epi16(beta_k, g);\
+    bp = _mm_shuffle_epi8(bp, shuf_bp);\
+    bn = _mm_shuffle_epi8(bn, shuf_bn);\
+    beta_k = _mm_max_epi16(bp, bn);    
+
+#define BETA_STEP_CNT(c,d) g = _mm_shuffle_epi8(gv, shuf_g[c]);\
+    BETA_STEP(g)\
+    alpha_k = _mm_load_si128(alphaPtr);\
+    alphaPtr--;\
+    bp = _mm_add_epi16(bp, alpha_k);\
+    bn = _mm_add_epi16(bn, alpha_k); output[k-d] = hMax(bn) - hMax(bp); /* hMax() returns 0x7FFF - max, so this difference is max(bp) - max(bn) modulo 2^16 */
+  
+  for (k=end-1; k>=long_cb; k--) { // scalar-fed steps for indices long_cb+2 .. long_cb; no output is produced for them
+    llr_t g0 = s->branch[2*k];
+    llr_t g1 = s->branch[2*k+1];
+    g = _mm_set_epi16(g1, g0, g0, g1, g1, g0, g0, g1);
+  
+    BETA_STEP(g);
+  }  
+  
+  for (; k >= 0; k-=8) {    // 8 steps per iteration (two loads of 4 pairs); assumes long_cb is a multiple of 8 -- TODO confirm with callers
+    gv = _mm_load_si128(gPtr);
+    gPtr--;
+    BETA_STEP_CNT(0,0);
+    BETA_STEP_CNT(1,1);
+    BETA_STEP_CNT(2,2);
+    BETA_STEP_CNT(3,3);
+    gv = _mm_load_si128(gPtr);
+    gPtr--;
+    BETA_STEP_CNT(0,4);
+    BETA_STEP_CNT(1,5);
+    BETA_STEP_CNT(2,6);
+    BETA_STEP_CNT(3,7);
+  __m128i norm = _mm_shuffle_epi8(beta_k, shuf_norm); 
+    beta_k = _mm_sub_epi16(beta_k, norm); // subtract lane 0 from every lane once per 8 steps; presumably keeps the int16 metrics from overflowing
   }  
 }

-void srslte_map_gen_alpha(srslte_map_gen_t * s, srslte_llr_t * input, srslte_llr_t * parity, srslte_llr_t * output,
-                   uint32_t long_cb)
+void srslte_map_gen_alpha(srslte_map_gen_t * s, uint32_t long_cb)
 {
-  srslte_llr_t m_b[8], new[8], old[8], max1[8], max0[8];
-  srslte_llr_t m1, m0;
-  srslte_llr_t x, y, xy;
-  srslte_llr_t out;
  uint32_t k;
-  uint32_t end = long_cb;
-  srslte_llr_t *beta = s->beta;
+  llr_t *alpha = s->alpha;
  uint32_t i;

-  old[0] = 0;
+  alpha[0] = 0; 
  for (i = 1; i < 8; i++) {
-    old[i] = -INF;
+    alpha[i] = -INF;
+  }
+  
+  __m128i shuf_ap = _mm_set_epi8(
+    15, 14, // 7
+    9,  8,  // 4
+    7,  6,  // 3
+    1,  0,  // 0
+    13, 12, // 6
+    11, 10, // 5
+    5,  4,  // 2
+    3,  2   // 1
+  );
+
+  __m128i shuf_an = _mm_set_epi8(
+    13, 12, // 6
+    11, 10, // 5
+    5,  4,  // 2
+    3,  2,  // 1
+    15, 14, // 7
+    9,  8,  // 4
+    7,  6,  // 3
+    1,  0   // 0
+  );
+  
+  __m128i shuf_g[4];
+  shuf_g[0] = _mm_set_epi8(3,2,3,2,1,0,1,0,1,0,1,0,3,2,3,2);
+  shuf_g[1] = _mm_set_epi8(7,6,7,6,5,4,5,4,5,4,5,4,7,6,7,6);
+  shuf_g[2] = _mm_set_epi8(11,10,11,10,9,8,9,8,9,8,9,8,11,10,11,10);
+  shuf_g[3] = _mm_set_epi8(15,14,15,14,13,12,13,12,13,12,13,12,15,14,15,14);
+
+  __m128i shuf_norm = _mm_set_epi8(1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0);
+  
+  __m128i* alphaPtr = (__m128i*) alpha;
+  alphaPtr++;
+
+  __m128i gv; 
+  __m128i *gPtr = (__m128i*) s->branch;
+  __m128i g, ap, an; 
+  
+  __m128i alpha_k = _mm_set_epi16(-INF, -INF, -INF, -INF, -INF, -INF, -INF, 0);
+  
+#define ALPHA_STEP(c)  g = _mm_shuffle_epi8(gv, shuf_g[c]); \
+  ap = _mm_add_epi16(alpha_k, g);\
+  an = _mm_sub_epi16(alpha_k, g);\
+  ap = _mm_shuffle_epi8(ap, shuf_ap);\
+  an = _mm_shuffle_epi8(an, shuf_an);\
+  alpha_k = _mm_max_epi16(ap, an);\
+  _mm_store_si128(alphaPtr, alpha_k);\
+  alphaPtr++;    \
+  
+  for (k = 0; k < long_cb/8; k++) {
+    gv = _mm_load_si128(gPtr);
+    gPtr++;
+    ALPHA_STEP(0);
+    ALPHA_STEP(1);
+    ALPHA_STEP(2);
+    ALPHA_STEP(3);
+    gv = _mm_load_si128(gPtr);
+    gPtr++;
+    ALPHA_STEP(0);
+    ALPHA_STEP(1);
+    ALPHA_STEP(2);
+    ALPHA_STEP(3);
+    __m128i norm = _mm_shuffle_epi8(alpha_k, shuf_norm); 
+    alpha_k = _mm_sub_epi16(alpha_k, norm);
  }  
+}

-  for (k = 1; k < end + 1; k++) {
-    x = input[k - 1];
-    y = parity[k - 1];
+void srslte_map_gen_gamma(srslte_map_gen_t * h, llr_t *input, llr_t *app, llr_t *parity, uint32_t long_cb) 
+{
+  __m128i res10, res20, res11, res21, res1, res2; 
+  __m128i in, ap, pa, g1, g0;

-    xy = x + y;
+  __m128i *inPtr  = (__m128i*) input;
+  __m128i *appPtr = (__m128i*) app;
+  __m128i *paPtr  = (__m128i*) parity;
+  __m128i *resPtr = (__m128i*) h->branch;
  
-    m_b[0] = old[0];
-    m_b[1] = old[3] + y;
-    m_b[2] = old[4] + y;
-    m_b[3] = old[7];
-    m_b[4] = old[1];
-    m_b[5] = old[2] + y;
-    m_b[6] = old[5] + y;
-    m_b[7] = old[6];
+  __m128i res10_mask = _mm_set_epi8(0xff,0xff,7,6,0xff,0xff,5,4,0xff,0xff,3,2,0xff,0xff,1,0);
+  __m128i res20_mask = _mm_set_epi8(0xff,0xff,15,14,0xff,0xff,13,12,0xff,0xff,11,10,0xff,0xff,9,8);
+  __m128i res11_mask = _mm_set_epi8(7,6,0xff,0xff,5,4,0xff,0xff,3,2,0xff,0xff,1,0,0xff,0xff);
+  __m128i res21_mask = _mm_set_epi8(15,14,0xff,0xff,13,12,0xff,0xff,11,10,0xff,0xff,9,8,0xff,0xff);
  
-    new[0] = old[1] + xy;
-    new[1] = old[2] + x;
-    new[2] = old[5] + x;
-    new[3] = old[6] + xy;
-    new[4] = old[0] + xy;
-    new[5] = old[3] + x;
-    new[6] = old[4] + x;
-    new[7] = old[7] + xy;
+  for (int i=0;i<long_cb/8;i++) {
+    in = _mm_load_si128(inPtr);
+    inPtr++;
+    pa = _mm_load_si128(paPtr);
+    paPtr++;
    
-    for (i = 0; i < 8; i++) {
-      max0[i] = m_b[i] + beta[8 * k + i];
-      max1[i] = new[i] + beta[8 * k + i];
+    if (appPtr) {
+      ap = _mm_load_si128(appPtr);
+      appPtr++;
+      in = _mm_add_epi16(ap, in);
    }
    
-    m1 = max1[0];
-    m0 = max0[0];
+    g1 = _mm_add_epi16(in, pa);
+    g0 = _mm_sub_epi16(in, pa);

-    for (i = 1; i < 8; i++) {
-      if (max1[i] > m1)
-        m1 = max1[i];
-      if (max0[i] > m0)
-        m0 = max0[i];
-    }
+    g1 = _mm_srai_epi16(g1, 1);
+    g0 = _mm_srai_epi16(g0, 1);
+    
+    res10 = _mm_shuffle_epi8(g0, res10_mask);
+    res20 = _mm_shuffle_epi8(g0, res20_mask);
+    res11 = _mm_shuffle_epi8(g1, res11_mask);
+    res21 = _mm_shuffle_epi8(g1, res21_mask);

-    for (i = 0; i < 8; i++) {
-      if (m_b[i] > new[i])
-        new[i] = m_b[i];
-      old[i] = new[i];
+    res1  = _mm_or_si128(res10, res11);
+    res2  = _mm_or_si128(res20, res21);
+
+    _mm_store_si128(resPtr, res1);
+    resPtr++;
+    _mm_store_si128(resPtr, res2);    
+    resPtr++;
  }

-    out = m1 - m0;
-    output[k - 1] = out;
+  for (int i=long_cb;i<long_cb+3;i++) {
+    h->branch[2*i]   = (input[i] - parity[i])/2;
+    h->branch[2*i+1] = (input[i] + parity[i])/2;
  }
 }

+
 int srslte_map_gen_init(srslte_map_gen_t * h, int max_long_cb)
 {
  bzero(h, sizeof(srslte_map_gen_t));
-  h->beta = srslte_vec_malloc(sizeof(srslte_llr_t) * (max_long_cb + SRSLTE_TCOD_TOTALTAIL + 1) * NUMSTATES);
-  if (!h->beta) {
+  h->alpha = srslte_vec_malloc(sizeof(llr_t) * (max_long_cb + SRSLTE_TCOD_TOTALTAIL + 1) * NUMSTATES);
+  if (!h->alpha) {
+    perror("srslte_vec_malloc");
+    return -1;
+  }
+  h->branch = srslte_vec_malloc(sizeof(llr_t) * (max_long_cb + SRSLTE_TCOD_TOTALTAIL + 1) * NUMSTATES);
+  if (!h->branch) {
    perror("srslte_vec_malloc");
    return -1;
  }
@ -177,23 +310,28 @@ int srslte_map_gen_init(srslte_map_gen_t * h, int max_long_cb)

 void srslte_map_gen_free(srslte_map_gen_t * h)
 {
-  if (h->beta) {
-    free(h->beta);
+  if (h->alpha) {
+    free(h->alpha);
+  }
+  if (h->branch) {
+    free(h->branch);
  }
  bzero(h, sizeof(srslte_map_gen_t));
 }

-void srslte_map_gen_dec(srslte_map_gen_t * h, srslte_llr_t * input, srslte_llr_t * parity, srslte_llr_t * output,
+void srslte_map_gen_dec(srslte_map_gen_t * h, llr_t * input, llr_t *app, llr_t * parity, llr_t * output,
                 uint32_t long_cb)
 {
-  uint32_t k;
 
-  h->beta[(long_cb + TAIL) * NUMSTATES] = 0;
-  for (k = 1; k < NUMSTATES; k++)
-    h->beta[(long_cb + TAIL) * NUMSTATES + k] = -INF;
+  // Compute branch metrics
+  srslte_map_gen_gamma(h, input, app, parity, long_cb);
+
+  // Forward recursion
+  srslte_map_gen_alpha(h, long_cb);
+
+  // Backwards recursion + LLR computation
+  srslte_map_gen_beta(h, output, long_cb);
  
-  srslte_map_gen_beta(h, input, parity, long_cb);
-  srslte_map_gen_alpha(h, input, parity, output, long_cb);
 }

 /************************************************
@ -209,28 +347,38 @@ int srslte_tdec_init(srslte_tdec_t * h, uint32_t max_long_cb)

  h->max_long_cb = max_long_cb;

-  h->llr1 = srslte_vec_malloc(sizeof(srslte_llr_t) * len);
-  if (!h->llr1) {
+  h->app1 = srslte_vec_malloc(sizeof(llr_t) * len);
+  if (!h->app1) {
+    perror("srslte_vec_malloc");
+    goto clean_and_exit;
+  }
+  h->app2 = srslte_vec_malloc(sizeof(llr_t) * len);
+  if (!h->app2) {
    perror("srslte_vec_malloc");
    goto clean_and_exit;
  }
-  h->llr2 = srslte_vec_malloc(sizeof(srslte_llr_t) * len);
-  if (!h->llr2) {
+  h->ext1 = srslte_vec_malloc(sizeof(llr_t) * len);
+  if (!h->ext1) {
    perror("srslte_vec_malloc");
    goto clean_and_exit;
  }
-  h->w = srslte_vec_malloc(sizeof(srslte_llr_t) * len);
-  if (!h->w) {
+  h->ext2 = srslte_vec_malloc(sizeof(llr_t) * len);
+  if (!h->ext2) {
    perror("srslte_vec_malloc");
    goto clean_and_exit;
  }
-  h->syst = srslte_vec_malloc(sizeof(srslte_llr_t) * len);
+  h->syst = srslte_vec_malloc(sizeof(llr_t) * len);
  if (!h->syst) {
    perror("srslte_vec_malloc");
    goto clean_and_exit;
  }
-  h->parity = srslte_vec_malloc(sizeof(srslte_llr_t) * len);
-  if (!h->parity) {
+  h->parity0 = srslte_vec_malloc(sizeof(llr_t) * len);
+  if (!h->parity0) {
+    perror("srslte_vec_malloc");
+    goto clean_and_exit;
+  }
+  h->parity1 = srslte_vec_malloc(sizeof(llr_t) * len);
+  if (!h->parity1) {
    perror("srslte_vec_malloc");
    goto clean_and_exit;
  }
@ -255,20 +403,26 @@ clean_and_exit:if (ret == -1) {

 void srslte_tdec_free(srslte_tdec_t * h)
 {
-  if (h->llr1) {
-    free(h->llr1);
+  if (h->app1) {
+    free(h->app1);
  }
-  if (h->llr2) {
-    free(h->llr2);
+  if (h->app2) {
+    free(h->app2);
  }
-  if (h->w) {
-    free(h->w);
+  if (h->ext1) {
+    free(h->ext1);
+  }
+  if (h->ext2) {
+    free(h->ext2);
  }
  if (h->syst) {
    free(h->syst);
  }
-  if (h->parity) {
-    free(h->parity);
+  if (h->parity0) {
+    free(h->parity0);
+  }
+  if (h->parity1) {
+    free(h->parity1);
  }

  srslte_map_gen_free(&h->dec);
@ -280,48 +434,145 @@ void srslte_tdec_free(srslte_tdec_t * h)
  bzero(h, sizeof(srslte_tdec_t));
 }

-void srslte_tdec_iteration(srslte_tdec_t * h, srslte_llr_t * input, uint32_t long_cb)
-{
+void deinterleave_input(srslte_tdec_t *h, float *input, uint32_t long_cb) {
  uint32_t i;
 
-  if (h->current_cbidx >= 0) {
+  float *inputPtr = input; 
+  __m128 inf0, inf1, inf2, inf3, inf4, inf5;
+  __m128i in0, in1, in2;
+  __m128i s0, s1, s2, s;
+  __m128i p00, p01, p02, p0;
+  __m128i p10, p11, p12, p1;
+  
+  __m128i *sysPtr = (__m128i*) h->syst; 
+  __m128i *pa0Ptr = (__m128i*) h->parity0; 
+  __m128i *pa1Ptr = (__m128i*) h->parity1; 
+  
+  // pick bits 0, 3, 6 from 1st word
+  __m128i s0_mask = _mm_set_epi8(0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,13,12,7,6,1,0);
+  // pick bits 1, 4, 7 from 2nd word
+  __m128i s1_mask = _mm_set_epi8(0xff,0xff,0xff,0xff,15,14,9,8,3,2,0xff,0xff,0xff,0xff,0xff,0xff);
+  // pick bits 2, 5 from 3rd word
+  __m128i s2_mask = _mm_set_epi8(11,10,5,4,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff);
+
+  // pick bits 1, 4, 7 from 1st word
+  __m128i p00_mask = _mm_set_epi8(0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,15,14,9,8,3,2);
+  // pick bits 2, 5 from 2nd word
+  __m128i p01_mask = _mm_set_epi8(0xff,0xff,0xff,0xff,0xff,0xff,11,10,5,4,0xff,0xff,0xff,0xff,0xff,0xff);
+  // pick bits 0, 3, 6 from 3rd word
+  __m128i p02_mask = _mm_set_epi8(13,12,7,6,1,0,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff);
+  
+  // pick bits 2, 5 from 1st word
+  __m128i p10_mask = _mm_set_epi8(0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,11,10,5,4);
+  // pick bits 0, 3, 6 from 2nd word
+  __m128i p11_mask = _mm_set_epi8(0xff,0xff,0xff,0xff,0xff,0xff,13,12,7,6,1,0,0xff,0xff,0xff,0xff);
+  // pick bits 1, 4, 7 from 3rd word
+  __m128i p12_mask = _mm_set_epi8(15,14,9,8,3,2,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff);
+  
+  __m128 vScalar = _mm_set1_ps(SCALE);
+    
+  // Split systematic and parity bits
+  for (i = 0; i < long_cb/8; i++) {
+        
+    inf0 = _mm_load_ps(inputPtr); inputPtr+=4; 
+    inf1 = _mm_load_ps(inputPtr); inputPtr+=4;   
+    inf2 = _mm_load_ps(inputPtr); inputPtr+=4;
+    inf3 = _mm_load_ps(inputPtr); inputPtr+=4;
+    inf4 = _mm_load_ps(inputPtr); inputPtr+=4;
+    inf5 = _mm_load_ps(inputPtr); inputPtr+=4;
+
+    inf0 = _mm_mul_ps(inf0, vScalar);
+    inf1 = _mm_mul_ps(inf1, vScalar);
+    inf2 = _mm_mul_ps(inf2, vScalar);
+    inf3 = _mm_mul_ps(inf3, vScalar);
+    inf4 = _mm_mul_ps(inf4, vScalar);
+    inf5 = _mm_mul_ps(inf5, vScalar);
    
-    uint32_t *inter = h->interleaver[h->current_cbidx].forward;
-    uint32_t *deinter = h->interleaver[h->current_cbidx].reverse;
+    in0 = _mm_packs_epi32(_mm_cvtps_epi32(inf0), _mm_cvtps_epi32(inf1));
+    in1 = _mm_packs_epi32(_mm_cvtps_epi32(inf2), _mm_cvtps_epi32(inf3));
+    in2 = _mm_packs_epi32(_mm_cvtps_epi32(inf4), _mm_cvtps_epi32(inf5));

-    // Prepare systematic and parity bits for MAP DEC #1
-    for (i = 0; i < long_cb; i++) {
-      h->syst[i] = input[SRSLTE_TCOD_RATE * i] + h->w[i];
-      h->parity[i] = input[SRSLTE_TCOD_RATE * i + 1];
+    /* Deinterleave Systematic bits */
+    s0 = _mm_shuffle_epi8(in0, s0_mask);
+    s1 = _mm_shuffle_epi8(in1, s1_mask);
+    s2 = _mm_shuffle_epi8(in2, s2_mask);    
+    s = _mm_or_si128(s0, s1);
+    s = _mm_or_si128(s, s2);
+
+    _mm_store_si128(sysPtr, s);
+    sysPtr++;
+
+    /* Deinterleave parity 0 bits */
+    p00 = _mm_shuffle_epi8(in0, p00_mask);
+    p01 = _mm_shuffle_epi8(in1, p01_mask);
+    p02 = _mm_shuffle_epi8(in2, p02_mask);    
+    p0 = _mm_or_si128(p00, p01);
+    p0 = _mm_or_si128(p0, p02);
+    
+    _mm_store_si128(pa0Ptr, p0);
+    pa0Ptr++;
+
+    /* Deinterleave parity 1 bits */
+    p10 = _mm_shuffle_epi8(in0, p10_mask);
+    p11 = _mm_shuffle_epi8(in1, p11_mask);
+    p12 = _mm_shuffle_epi8(in2, p12_mask);    
+    p1 = _mm_or_si128(p10, p11);
+    p1 = _mm_or_si128(p1, p12);
+
+    _mm_store_si128(pa1Ptr, p1);
+    pa1Ptr++;    
+
+  }
+  
+  for (i = 0; i < 3; i++) {
+    h->syst[i+long_cb]    = (llr_t) SCALE*input[3*long_cb + 2*i];
+    h->parity0[i+long_cb] = (llr_t) SCALE*input[3*long_cb + 2*i + 1];
  }
-    for (i = long_cb; i < long_cb + SRSLTE_TCOD_RATE; i++) {
-      h->syst[i] = input[SRSLTE_TCOD_RATE * long_cb + NINPUTS * (i - long_cb)];
-      h->parity[i] = input[SRSLTE_TCOD_RATE * long_cb + NINPUTS * (i - long_cb) + 1];
+  for (i = 0; i < 3; i++) {
+    h->app2[i+long_cb]    = (llr_t) SCALE*input[3*long_cb + 6 + 2*i];
+    h->parity1[i+long_cb] = (llr_t) SCALE*input[3*long_cb + 6 + 2*i + 1];
  }

-    // Run MAP DEC #1
-    srslte_map_gen_dec(&h->dec, h->syst, h->parity, h->llr1, long_cb);
+}

-    // Prepare systematic and parity bits for MAP DEC #1
-    for (i = 0; i < long_cb; i++) {
-      h->syst[i] = h->llr1[inter[i]]
-        - h->w[inter[i]];
-      h->parity[i] = input[SRSLTE_TCOD_RATE * i + 2];
+void srslte_tdec_iteration(srslte_tdec_t * h, float * input, uint32_t long_cb)
+{
+
+  if (h->current_cbidx >= 0) {
+    uint16_t *inter   = h->interleaver[h->current_cbidx].forward;
+    uint16_t *deinter = h->interleaver[h->current_cbidx].reverse;
+    
+    if (h->n_iter == 0) {
+      deinterleave_input(h, input, long_cb);
    }
-    for (i = long_cb; i < long_cb + SRSLTE_TCOD_RATE; i++) {
-      h->syst[i] =
-        input[SRSLTE_TCOD_RATE * long_cb + NINPUTS * SRSLTE_TCOD_RATE + NINPUTS * (i - long_cb)];
-      h->parity[i] = input[SRSLTE_TCOD_RATE * long_cb + NINPUTS * SRSLTE_TCOD_RATE
-                          + NINPUTS * (i - long_cb) + 1];
+    
+    // Add apriori information to decoder 1 
+    if (h->n_iter > 0) {
+      srslte_vec_sub_sss(h->app1, h->ext1, h->app1, long_cb);
    }
        
    // Run MAP DEC #1
-    srslte_map_gen_dec(&h->dec, h->syst, h->parity, h->llr2, long_cb);
+    if (h->n_iter == 0) {
+      srslte_map_gen_dec(&h->dec, h->syst, NULL, h->parity0, h->ext1, long_cb);            
+    } else {
+      srslte_map_gen_dec(&h->dec, h->syst, h->app1, h->parity0, h->ext1, long_cb);      
+    }

-    // Update a-priori LLR from the last iteration
-    for (i = 0; i < long_cb; i++) {
-      h->w[i] += h->llr2[deinter[i]] - h->llr1[i];
+    // Convert aposteriori information into extrinsic information    
+    if (h->n_iter > 0) {
+      srslte_vec_sub_sss(h->ext1, h->app1, h->ext1, long_cb);
    }
+    
+    // Interleave extrinsic output of DEC1 to form apriori info for decoder 2
+    srslte_vec_lut_sss(h->ext1, inter, h->app2, long_cb);
+
+    // Run MAP DEC #2. 2nd decoder uses apriori information as systematic bits
+    srslte_map_gen_dec(&h->dec, h->app2, NULL, h->parity1, h->ext2, long_cb);
+
+    // Deinterleaved extrinsic bits become apriori info for decoder 1 
+    srslte_vec_lut_sss(h->ext2, deinter, h->app1, long_cb);
+    
+    h->n_iter++;
  } else {
    fprintf(stderr, "Error CB index not set (call srslte_tdec_reset() first\n");    
  }
@ -334,7 +585,7 @@ int srslte_tdec_reset(srslte_tdec_t * h, uint32_t long_cb)
            h->max_long_cb);
    return -1;
  }
-  memset(h->w, 0, sizeof(srslte_llr_t) * long_cb);
+  h->n_iter = 0; 
  h->current_cbidx = srslte_cbsegm_cbindex(long_cb);
  if (h->current_cbidx < 0) {
    fprintf(stderr, "Invalid CB length %d\n", long_cb);
@ -345,47 +596,59 @@ int srslte_tdec_reset(srslte_tdec_t * h, uint32_t long_cb)

 void srslte_tdec_decision(srslte_tdec_t * h, uint8_t *output, uint32_t long_cb)
 {
-  uint32_t *deinter = h->interleaver[h->current_cbidx].reverse;
-  uint32_t i;
-  for (i = 0; i < long_cb; i++) {
-    output[i] = (h->llr2[deinter[i]] > 0) ? 1 : 0;    
+  // Hard decision: output[i] is 1 when h->app1[i] > 0 and 0 otherwise, for
+  // i in [0, long_cb). One byte is written per decided bit.
+  // NOTE(review): h->app1 is read with aligned loads; this assumes the decoder
+  // allocated it 16-byte aligned -- confirm against srslte_tdec_init().
+  __m128i zero     = _mm_set1_epi16(0);
+  __m128i lsb_mask = _mm_set1_epi16(1);
+  
+  __m128i *appPtr = (__m128i*) h->app1;
+  __m128i *outPtr = (__m128i*) output;
+  __m128i ap, out, out0, out1; 
+  
+  // Vector part: two 8x16-bit compares are packed into 16 output bytes per pass
+  for (uint32_t i = 0; i < long_cb/16; i++) {
+    ap   = _mm_load_si128(appPtr); appPtr++;    
+    out0 = _mm_and_si128(_mm_cmpgt_epi16(ap, zero), lsb_mask);
+    ap   = _mm_load_si128(appPtr); appPtr++;
+    out1 = _mm_and_si128(_mm_cmpgt_epi16(ap, zero), lsb_mask);
+    
+    out  = _mm_packs_epi16(out0, out1);
+    // output is caller-supplied, so no alignment is assumed for the store
+    _mm_storeu_si128(outPtr, out);
+    outPtr++;
+  }
+  // Scalar tail: covers whatever the 16-wide loop left over (0..15 samples),
+  // instead of assuming the remainder is exactly 8
+  for (uint32_t i = 16*(long_cb/16); i < long_cb; i++) {
+    output[i] = h->app1[i]>0?1:0;
   }
 }

 void srslte_tdec_decision_byte(srslte_tdec_t * h, uint8_t *output, uint32_t long_cb)
 {
-  uint32_t i;
   uint8_t mask[8] = {0x80, 0x40, 0x20, 0x10, 0x8, 0x4, 0x2, 0x1};
-  uint32_t *deinter = h->interleaver[h->current_cbidx].reverse;
   
   // long_cb is always byte aligned
-  for (i = 0; i < long_cb/8; i++) {
-    uint8_t out0 = h->llr2[deinter[8*i+0]]>0?mask[0]:0;
-    uint8_t out1 = h->llr2[deinter[8*i+1]]>0?mask[1]:0;
-    uint8_t out2 = h->llr2[deinter[8*i+2]]>0?mask[2]:0;
-    uint8_t out3 = h->llr2[deinter[8*i+3]]>0?mask[3]:0;
-    uint8_t out4 = h->llr2[deinter[8*i+4]]>0?mask[4]:0;
-    uint8_t out5 = h->llr2[deinter[8*i+5]]>0?mask[5]:0;
-    uint8_t out6 = h->llr2[deinter[8*i+6]]>0?mask[6]:0;
-    uint8_t out7 = h->llr2[deinter[8*i+7]]>0?mask[7]:0;
+  // Output byte i packs the hard decisions of app1[8*i .. 8*i+7], most
+  // significant bit first. The sample index must advance by 8 per output
+  // byte, as the removed llr2/deinter version did; indexing with i+k would
+  // reuse overlapping samples and never read most of the block.
+  for (uint32_t i = 0; i < long_cb/8; i++) {
+    uint8_t out0 = h->app1[8*i+0]>0?mask[0]:0;
+    uint8_t out1 = h->app1[8*i+1]>0?mask[1]:0;
+    uint8_t out2 = h->app1[8*i+2]>0?mask[2]:0;
+    uint8_t out3 = h->app1[8*i+3]>0?mask[3]:0;
+    uint8_t out4 = h->app1[8*i+4]>0?mask[4]:0;
+    uint8_t out5 = h->app1[8*i+5]>0?mask[5]:0;
+    uint8_t out6 = h->app1[8*i+6]>0?mask[6]:0;
+    uint8_t out7 = h->app1[8*i+7]>0?mask[7]:0;
     
     output[i] = out0 | out1 | out2 | out3 | out4 | out5 | out6 | out7; 
   }
 }

-int srslte_tdec_run_all(srslte_tdec_t * h, srslte_llr_t * input, uint8_t *output,
+int srslte_tdec_run_all(srslte_tdec_t * h, float * input, uint8_t *output,
                  uint32_t nof_iterations, uint32_t long_cb)
 {
-  uint32_t iter = 0;
-
  if (srslte_tdec_reset(h, long_cb)) {
    return SRSLTE_ERROR; 
  }

  do {
    srslte_tdec_iteration(h, input, long_cb);
-    iter++;
-  } while (iter < nof_iterations);
+  } while (h->n_iter < nof_iterations);

  srslte_tdec_decision(h, output, long_cb);
  
--- a/srslte/lib/fec/src/turbodecoder_vl.c
+++ b/srslte/lib/fec/src/turbodecoder_vl.c
@ -0,0 +1,393 @@
+/**
+ *
+ * \section COPYRIGHT
+ *
+ * Copyright 2013-2015 The srsLTE Developers. See the
+ * COPYRIGHT file at the top-level directory of this distribution.
+ *
+ * \section LICENSE
+ *
+ * This file is part of the srsLTE library.
+ *
+ * srsLTE is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as
+ * published by the Free Software Foundation, either version 3 of
+ * the License, or (at your option) any later version.
+ *
+ * srsLTE is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Affero General Public License for more details.
+ *
+ * A copy of the GNU Affero General Public License can be found in
+ * the LICENSE file in the top-level directory of this distribution
+ * and at http://www.gnu.org/licenses/.
+ *
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <string.h>
+#include <strings.h>
+#include <math.h>
+
+#include "srslte/fec/turbodecoder_vl.h"
+#include "srslte/utils/vector.h"
+
+// Number of metrics stored per trellis step; used as the stride of the beta
+// buffer in map_gen_init() and map_gen_dec().
+#define NUMSTATES       8
+// Stride between consecutive tail samples in the decoder input
+// (see the tail loops in srslte_tdec_vl_iteration()).
+#define NINPUTS         2
+// Number of trellis steps after long_cb; map_gen_dec() seeds the backward
+// recursion at step long_cb + TAIL.
+#define TAIL            3
+// NOTE(review): not referenced in this file as shown; buffer sizes use
+// SRSLTE_TCOD_TOTALTAIL instead -- confirm whether this is still needed.
+#define TOTALTAIL       12
+
+// Large magnitude; -INF is the initial metric of every state other than state 0.
+#define INF 9e4
+// NOTE(review): not referenced in this file as shown.
+#define ZERO 9e-4
+
+/************************************************
+ *
+ *  MAP_GEN is the MAX-LOG-MAP generic implementation of the
+ *  Decoder
+ *
+ ************************************************/
+/* Backward recursion.
+ *
+ * Starting from the 8 metrics already stored at s->beta[8*end .. 8*end+7]
+ * (end = long_cb + SRSLTE_TCOD_RATE), walks k from end-1 down to 0 and writes
+ * 8 metrics per step into s->beta[8*k .. 8*k+7]. Each new metric is the larger
+ * of two candidates built from the previous step's metrics plus combinations
+ * of input[k] and parity[k].
+ *
+ * s       decoder state; s->beta must hold at least 8*(end+1) values
+ * input   systematic values, read at indices 0 .. end-1
+ * parity  parity values, read at indices 0 .. end-1
+ * long_cb code block length (tail steps excluded)
+ *
+ * NOTE(review): the start index uses SRSLTE_TCOD_RATE while map_gen_dec()
+ * seeds beta at long_cb + TAIL; the two only agree if SRSLTE_TCOD_RATE == TAIL
+ * -- confirm against the header.
+ */
+static void map_gen_beta(srslte_map_gen_vl_t * s, srslte_llr_t * input, srslte_llr_t * parity,
+                  uint32_t long_cb)
+{
+  srslte_llr_t m_b[8], new[8], old[8];
+  srslte_llr_t x, y, xy;
+  int k;
+  uint32_t end = long_cb + SRSLTE_TCOD_RATE;
+  srslte_llr_t *beta = s->beta;
+  uint32_t i;
+
+  // Load the terminating metrics written by the caller
+  for (i = 0; i < 8; i++) {
+    old[i] = beta[8 * (end) + i];
+  }
+
+  // k is signed so that the loop can terminate after processing k == 0
+  for (k = end - 1; k >= 0; k--) {
+    x = input[k];
+    y = parity[k];
+
+    xy = x + y;
+
+    // First candidate for each state
+    m_b[0] = old[4] + xy;
+    m_b[1] = old[4];
+    m_b[2] = old[5] + y;
+    m_b[3] = old[5] + x;
+    m_b[4] = old[6] + x;
+    m_b[5] = old[6] + y;
+    m_b[6] = old[7];
+    m_b[7] = old[7] + xy;
+
+    // Second candidate for each state
+    new[0] = old[0];
+    new[1] = old[0] + xy;
+    new[2] = old[1] + x;
+    new[3] = old[1] + y;
+    new[4] = old[2] + y;
+    new[5] = old[2] + x;
+    new[6] = old[3] + xy;
+    new[7] = old[3];
+
+    // Keep the larger candidate, carry it to the next step and store it
+    for (i = 0; i < 8; i++) {
+      if (m_b[i] > new[i])
+        new[i] = m_b[i];
+      old[i] = new[i];
+      beta[8 * k + i] = old[i];
+    }
+  }
+}
+
+/* Forward recursion and output computation.
+ *
+ * Starts with metric 0 for state 0 and -INF for the other seven states, then
+ * for k = 1 .. long_cb combines the running forward metrics with
+ * s->beta[8*k .. 8*k+7] (filled beforehand by map_gen_beta) and writes
+ *
+ *   output[k-1] = max_i(new[i] + beta[8k+i]) - max_i(m_b[i] + beta[8k+i])
+ *
+ * where m_b[] and new[] are the two per-state candidate sets built from
+ * input[k-1] and parity[k-1].
+ *
+ * s       decoder state holding the beta metrics
+ * input   systematic values, read at indices 0 .. long_cb-1
+ * parity  parity values, read at indices 0 .. long_cb-1
+ * output  receives long_cb values
+ * long_cb code block length
+ */
+static void map_gen_alpha(srslte_map_gen_vl_t * s, srslte_llr_t * input, srslte_llr_t * parity, srslte_llr_t * output,
+                   uint32_t long_cb)
+{
+  srslte_llr_t m_b[8], new[8], old[8], max1[8], max0[8];
+  srslte_llr_t m1, m0;
+  srslte_llr_t x, y, xy;
+  srslte_llr_t out;
+  uint32_t k;
+  uint32_t end = long_cb;
+  srslte_llr_t *beta = s->beta;
+  uint32_t i;
+
+  // Initial forward metrics: only state 0 is allowed
+  old[0] = 0;
+  for (i = 1; i < 8; i++) {
+    old[i] = -INF;
+  }
+
+  for (k = 1; k < end + 1; k++) {
+    x = input[k - 1];
+    y = parity[k - 1];
+
+    xy = x + y;
+
+    // Candidate set that feeds m0 (only the parity term is ever added)
+    m_b[0] = old[0];
+    m_b[1] = old[3] + y;
+    m_b[2] = old[4] + y;
+    m_b[3] = old[7];
+    m_b[4] = old[1];
+    m_b[5] = old[2] + y;
+    m_b[6] = old[5] + y;
+    m_b[7] = old[6];
+
+    // Candidate set that feeds m1 (the systematic term is always added)
+    new[0] = old[1] + xy;
+    new[1] = old[2] + x;
+    new[2] = old[5] + x;
+    new[3] = old[6] + xy;
+    new[4] = old[0] + xy;
+    new[5] = old[3] + x;
+    new[6] = old[4] + x;
+    new[7] = old[7] + xy;
+
+    // Add the backward metric of the destination step to both sets
+    for (i = 0; i < 8; i++) {
+      max0[i] = m_b[i] + beta[8 * k + i];
+      max1[i] = new[i] + beta[8 * k + i];
+    }
+
+    m1 = max1[0];
+    m0 = max0[0];
+
+    // Maximum over the 8 states for each set
+    for (i = 1; i < 8; i++) {
+      if (max1[i] > m1)
+        m1 = max1[i];
+      if (max0[i] > m0)
+        m0 = max0[i];
+    }
+
+    // Forward metric update: keep the larger candidate per state
+    for (i = 0; i < 8; i++) {
+      if (m_b[i] > new[i])
+        new[i] = m_b[i];
+      old[i] = new[i];
+    }
+
+    out = m1 - m0;
+    output[k - 1] = out;
+  }
+}
+
+/* Clears the MAP decoder state and allocates its beta buffer, sized for
+ * (max_long_cb + SRSLTE_TCOD_TOTALTAIL + 1) steps of NUMSTATES metrics each.
+ * Returns 0 on success, -1 if the allocation fails. */
+static int map_gen_init(srslte_map_gen_vl_t * h, int max_long_cb)
+{
+  memset(h, 0, sizeof(*h));
+
+  size_t nof_steps = max_long_cb + SRSLTE_TCOD_TOTALTAIL + 1;
+  h->beta = srslte_vec_malloc(sizeof(srslte_llr_t) * nof_steps * NUMSTATES);
+  if (h->beta == NULL) {
+    perror("srslte_vec_malloc");
+    return -1;
+  }
+
+  h->max_long_cb = max_long_cb;
+  return 0;
+}
+
+/* Releases the beta buffer and zeroes the whole MAP decoder state. */
+static void map_gen_free(srslte_map_gen_vl_t * h)
+{
+  // free(NULL) is a no-op, so no guard is needed
+  free(h->beta);
+  memset(h, 0, sizeof(*h));
+}
+
+/* Runs one MAP decoding pass: seeds the terminating backward metrics at step
+ * long_cb + TAIL (0 for state 0, -INF for the rest), then runs the backward
+ * and the forward recursions. The result is written to output. */
+static void map_gen_dec(srslte_map_gen_vl_t * h, srslte_llr_t * input, srslte_llr_t * parity, srslte_llr_t * output,
+                 uint32_t long_cb)
+{
+  srslte_llr_t *last_step = &h->beta[(long_cb + TAIL) * NUMSTATES];
+
+  last_step[0] = 0;
+  for (uint32_t state = 1; state < NUMSTATES; state++) {
+    last_step[state] = -INF;
+  }
+
+  map_gen_beta(h, input, parity, long_cb);
+  map_gen_alpha(h, input, parity, output, long_cb);
+}
+
+/************************************************
+ *
+ *  TURBO DECODER INTERFACE
+ *
+ ************************************************/
+/* Initialises a turbo decoder able to handle code blocks of up to max_long_cb
+ * bits: allocates the five working buffers, the MAP decoder state and one
+ * interleaver per supported code block size.
+ *
+ * Returns 0 on success. On any failure everything allocated so far is
+ * released through srslte_tdec_vl_free() and -1 is returned. */
+int srslte_tdec_vl_init(srslte_tdec_vl_t * h, uint32_t max_long_cb)
+{
+  memset(h, 0, sizeof(*h));
+  h->max_long_cb = max_long_cb;
+
+  // Every working buffer holds the code block plus the tail samples
+  uint32_t nof_samples = max_long_cb + SRSLTE_TCOD_TOTALTAIL;
+
+  h->llr1 = srslte_vec_malloc(sizeof(srslte_llr_t) * nof_samples);
+  if (h->llr1 == NULL) {
+    perror("srslte_vec_malloc");
+    goto fail;
+  }
+
+  h->llr2 = srslte_vec_malloc(sizeof(srslte_llr_t) * nof_samples);
+  if (h->llr2 == NULL) {
+    perror("srslte_vec_malloc");
+    goto fail;
+  }
+
+  h->w = srslte_vec_malloc(sizeof(srslte_llr_t) * nof_samples);
+  if (h->w == NULL) {
+    perror("srslte_vec_malloc");
+    goto fail;
+  }
+
+  h->syst = srslte_vec_malloc(sizeof(srslte_llr_t) * nof_samples);
+  if (h->syst == NULL) {
+    perror("srslte_vec_malloc");
+    goto fail;
+  }
+
+  h->parity = srslte_vec_malloc(sizeof(srslte_llr_t) * nof_samples);
+  if (h->parity == NULL) {
+    perror("srslte_vec_malloc");
+    goto fail;
+  }
+
+  if (map_gen_init(&h->dec, h->max_long_cb) != 0) {
+    goto fail;
+  }
+
+  for (int cb = 0; cb < SRSLTE_NOF_TC_CB_SIZES; cb++) {
+    if (srslte_tc_interl_init(&h->interleaver[cb], srslte_cbsegm_cbsize(cb)) < 0) {
+      goto fail;
+    }
+    srslte_tc_interl_LTE_gen(&h->interleaver[cb], srslte_cbsegm_cbsize(cb));
+  }
+
+  // No code block size selected until srslte_tdec_vl_reset() is called
+  h->current_cbidx = -1;
+  return 0;
+
+fail:
+  srslte_tdec_vl_free(h);
+  return -1;
+}
+
+/* Releases every buffer owned by the decoder, its MAP state and all
+ * interleavers, then zeroes the structure. */
+void srslte_tdec_vl_free(srslte_tdec_vl_t * h)
+{
+  // free(NULL) is a no-op, so buffers that were never allocated are harmless
+  free(h->llr1);
+  free(h->llr2);
+  free(h->w);
+  free(h->syst);
+  free(h->parity);
+
+  map_gen_free(&h->dec);
+
+  for (int cb = 0; cb < SRSLTE_NOF_TC_CB_SIZES; cb++) {
+    srslte_tc_interl_free(&h->interleaver[cb]);
+  }
+
+  memset(h, 0, sizeof(*h));
+}
+
+/* Runs one full turbo iteration (MAP decoder #1 followed by MAP decoder #2)
+ * and updates the a-priori buffer h->w for the next call.
+ *
+ * h        decoder; srslte_tdec_vl_reset() must have selected a code block
+ *          size, otherwise an error is printed and nothing is done
+ * input    received values: SRSLTE_TCOD_RATE values per bit for the first
+ *          long_cb bits, followed by the tail samples
+ * long_cb  code block length
+ *
+ * h->syst and h->parity are reused for both decoders, so the statement order
+ * below matters.
+ */
+void srslte_tdec_vl_iteration(srslte_tdec_vl_t * h, srslte_llr_t * input, uint32_t long_cb)
+{
+  uint32_t i;
+
+  if (h->current_cbidx >= 0) {
+
+    uint16_t *inter = h->interleaver[h->current_cbidx].forward;
+    uint16_t *deinter = h->interleaver[h->current_cbidx].reverse;
+    
+    // Prepare systematic and parity bits for MAP DEC #1
+    // (systematic input is biased with the a-priori term w)
+    for (i = 0; i < long_cb; i++) {
+      h->syst[i] = input[SRSLTE_TCOD_RATE * i] + h->w[i];
+      h->parity[i] = input[SRSLTE_TCOD_RATE * i + 1];
+    }
+    // Tail samples for decoder #1 start right after the long_cb payload triplets
+    for (i = long_cb; i < long_cb + SRSLTE_TCOD_RATE; i++) {
+      h->syst[i] = input[SRSLTE_TCOD_RATE * long_cb + NINPUTS * (i - long_cb)];
+      h->parity[i] = input[SRSLTE_TCOD_RATE * long_cb + NINPUTS * (i - long_cb) + 1];
+    }
+
+    // Run MAP DEC #1
+    map_gen_dec(&h->dec, h->syst, h->parity, h->llr1, long_cb);
+
+    // Prepare systematic and parity bits for MAP DEC #2
+    // (interleaved output of decoder #1 with the a-priori term removed)
+    for (i = 0; i < long_cb; i++) {
+      h->syst[i] = h->llr1[inter[i]]
+        - h->w[inter[i]];
+      h->parity[i] = input[SRSLTE_TCOD_RATE * i + 2];
+    }
+    // Tail samples for decoder #2 follow the decoder #1 tail in the input
+    for (i = long_cb; i < long_cb + SRSLTE_TCOD_RATE; i++) {
+      h->syst[i] =
+        input[SRSLTE_TCOD_RATE * long_cb + NINPUTS * SRSLTE_TCOD_RATE + NINPUTS * (i - long_cb)];
+      h->parity[i] = input[SRSLTE_TCOD_RATE * long_cb + NINPUTS * SRSLTE_TCOD_RATE
+                          + NINPUTS * (i - long_cb) + 1];
+    }
+
+    // Run MAP DEC #2
+    map_gen_dec(&h->dec, h->syst, h->parity, h->llr2, long_cb);
+
+    // Update a-priori LLR from the last iteration
+    // (llr2 is in interleaved order, so it is read through the reverse table)
+    for (i = 0; i < long_cb; i++) {
+      h->w[i] += h->llr2[deinter[i]] - h->llr1[i];
+    }
+  } else {
+    fprintf(stderr, "Error CB index not set (call srslte_tdec_vl_reset() first\n");    
+  }
+}
+
+/* Prepares the decoder for a new code block of long_cb bits: clears the first
+ * long_cb entries of the a-priori buffer and selects the matching interleaver.
+ * Returns 0 on success, -1 if long_cb exceeds the size given at init time or
+ * is not a valid code block length. */
+int srslte_tdec_vl_reset(srslte_tdec_vl_t * h, uint32_t long_cb)
+{
+  if (h->max_long_cb < long_cb) {
+    fprintf(stderr, "TDEC was initialized for max_long_cb=%d\n", h->max_long_cb);
+    return -1;
+  }
+
+  memset(h->w, 0, long_cb * sizeof(srslte_llr_t));
+
+  h->current_cbidx = srslte_cbsegm_cbindex(long_cb);
+  if (h->current_cbidx >= 0) {
+    return 0;
+  }
+
+  fprintf(stderr, "Invalid CB length %d\n", long_cb);
+  return -1;
+}
+
+/* Hard decision, one byte per bit: output[n] is 1 when the de-interleaved
+ * decoder #2 output is positive and 0 otherwise. */
+void srslte_tdec_vl_decision(srslte_tdec_vl_t * h, uint8_t *output, uint32_t long_cb)
+{
+  const uint16_t *reverse_lut = h->interleaver[h->current_cbidx].reverse;
+
+  for (uint32_t n = 0; n < long_cb; n++) {
+    if (h->llr2[reverse_lut[n]] > 0) {
+      output[n] = 1;
+    } else {
+      output[n] = 0;
+    }
+  }
+}
+
+/* Hard decision packed 8 bits per output byte, most significant bit first.
+ * Writes long_cb/8 bytes. */
+void srslte_tdec_vl_decision_byte(srslte_tdec_vl_t * h, uint8_t *output, uint32_t long_cb)
+{
+  static const uint8_t bit_mask[8] = {0x80, 0x40, 0x20, 0x10, 0x8, 0x4, 0x2, 0x1};
+  const uint16_t *reverse_lut = h->interleaver[h->current_cbidx].reverse;
+
+  // long_cb is always byte aligned
+  const uint32_t nof_bytes = long_cb / 8;
+  for (uint32_t byte = 0; byte < nof_bytes; byte++) {
+    uint8_t packed = 0;
+    for (uint32_t bit = 0; bit < 8; bit++) {
+      if (h->llr2[reverse_lut[8 * byte + bit]] > 0) {
+        packed |= bit_mask[bit];
+      }
+    }
+    output[byte] = packed;
+  }
+}
+
+/* Convenience wrapper: resets the decoder for long_cb, runs nof_iterations
+ * turbo iterations (at least one is always executed) and writes the hard
+ * decisions, one byte per bit, to output.
+ * Returns SRSLTE_SUCCESS, or SRSLTE_ERROR if the reset fails. */
+int srslte_tdec_vl_run_all(srslte_tdec_vl_t * h, srslte_llr_t * input, uint8_t *output,
+                  uint32_t nof_iterations, uint32_t long_cb)
+{
+  if (srslte_tdec_vl_reset(h, long_cb) != 0) {
+    return SRSLTE_ERROR;
+  }
+
+  uint32_t done = 0;
+  do {
+    srslte_tdec_vl_iteration(h, input, long_cb);
+  } while (++done < nof_iterations);
+
+  srslte_tdec_vl_decision(h, output, long_cb);
+
+  return SRSLTE_SUCCESS;
+}
--- a/srslte/lib/fec/test/turbodecoder_test.c
+++ b/srslte/lib/fec/test/turbodecoder_test.c
@ -36,6 +36,7 @@
 #include <sys/time.h>
 #include <time.h>
 #include "srslte/srslte.h"
+#include "srslte/fec/turbodecoder_vl.h"

 #include "turbodecoder_test.h"

@ -46,14 +47,15 @@ float ebno_db = 100.0;
 uint32_t seed = 0;
 int K = -1;

-#define MAX_ITERATIONS  4
+#define MAX_ITERATIONS  10
 int nof_iterations = MAX_ITERATIONS;
 int test_known_data = 0;
 int test_errors = 0;
+int nof_repetitions = 1; 

-#define SNR_POINTS      8
-#define SNR_MIN         0.0
-#define SNR_MAX         4.0
+#define SNR_POINTS      4
+#define SNR_MIN         1.0
+#define SNR_MAX         8.0

 void usage(char *prog) {
  printf("Usage: %s [nlesv]\n", prog);
@ -61,6 +63,7 @@ void usage(char *prog) {
      "\t-k Test with known data (ignores frame_length) [Default disabled]\n");
  printf("\t-i nof_iterations [Default %d]\n", nof_iterations);
  printf("\t-n nof_frames [Default %d]\n", nof_frames);
+  printf("\t-N nof_repetitions [Default %d]\n", nof_repetitions);
  printf("\t-l frame_length [Default %d]\n", frame_length);
  printf("\t-e ebno in dB [Default scan]\n");
  printf("\t-t test: check errors on exit [Default disabled]\n");
@ -69,11 +72,14 @@ void usage(char *prog) {

 void parse_args(int argc, char **argv) {
  int opt;
-  while ((opt = getopt(argc, argv, "inlstvekt")) != -1) {
+  while ((opt = getopt(argc, argv, "inNlstvekt")) != -1) {
    switch (opt) {
    case 'n':
      nof_frames = atoi(argv[optind]);
      break;
+    case 'N':
+      nof_repetitions = atoi(argv[optind]);
+      break;
    case 'k':
      test_known_data = 1;
      break;
@ -102,29 +108,6 @@ void parse_args(int argc, char **argv) {
  }
 }

-void output_matlab(float ber[MAX_ITERATIONS][SNR_POINTS], int snr_points) {
-  int i, j;
-  FILE *f = fopen("turbocoder_snr.m", "w");
-  if (!f) {
-    perror("fopen");
-    exit(-1);
-  }
-  fprintf(f, "ber=[");
-  for (j = 0; j < MAX_ITERATIONS; j++) {
-    for (i = 0; i < snr_points; i++) {
-      fprintf(f, "%g ", ber[j][i]);
-    }
-    fprintf(f, ";\n");
-  }
-  fprintf(f, "];\n");
-  fprintf(f, "snr=linspace(%g,%g-%g/%d,%d);\n", SNR_MIN, SNR_MAX, SNR_MAX,
-      snr_points, snr_points);
-  fprintf(f, "semilogy(snr,ber,snr,0.5*erfc(sqrt(10.^(snr/10))));\n");
-  fprintf(f,
-      "legend('1 iter','2 iter', '3 iter', '4 iter', 'theory-uncoded');");
-  fprintf(f, "grid on;\n");
-  fclose(f);
-}

 int main(int argc, char **argv) {
  uint32_t frame_cnt;
@ -134,12 +117,13 @@ int main(int argc, char **argv) {
  uint32_t i, j;
  float var[SNR_POINTS];
  uint32_t snr_points;
-  float ber[MAX_ITERATIONS][SNR_POINTS];
-  uint32_t errors[100];
+  uint32_t errors;
+  uint32_t errors_vl;
  uint32_t coded_length;
  struct timeval tdata[3];
-  float mean_usec;
+  float mean_usec, mean_usec_vl;
  srslte_tdec_t tdec;
+  srslte_tdec_vl_t tdec_vl;
  srslte_tcod_t tcod;
  
  parse_args(argc, argv);
@ -200,6 +184,11 @@ int main(int argc, char **argv) {
    exit(-1);
  }

+  if (srslte_tdec_vl_init(&tdec_vl, frame_length)) {
+    fprintf(stderr, "Error initiating Turbo decoder\n");
+    exit(-1);
+  }
+
  float ebno_inc, esno_db;
  ebno_inc = (SNR_MAX - SNR_MIN) / SNR_POINTS;
  if (ebno_db == 100.0) {
@ -215,11 +204,13 @@ int main(int argc, char **argv) {
    snr_points = 1;
  }
  for (i = 0; i < snr_points; i++) {
+
    mean_usec = 0;
+    mean_usec_vl = 0;
+    errors = 0; 
+    errors_vl = 0; 
    frame_cnt = 0;
-    bzero(errors, sizeof(int) * MAX_ITERATIONS);
    while (frame_cnt < nof_frames) {
-
      /* generate data_tx */
      for (j = 0; j < frame_length; j++) {
        if (test_known_data) {
@ -239,13 +230,14 @@ int main(int argc, char **argv) {
      }

      for (j = 0; j < coded_length; j++) {
-        llr[j] = symbols[j] ? sqrt(2) : -sqrt(2);
+        llr[j] = symbols[j] ? 1 : -1;
      }

      srslte_ch_awgn_f(llr, llr, var[i], coded_length);
      
      /* decoder */
      srslte_tdec_reset(&tdec, frame_length);
+      srslte_tdec_vl_reset(&tdec_vl, frame_length);

      uint32_t t;
      if (nof_iterations == -1) {
@ -253,69 +245,51 @@ int main(int argc, char **argv) {
      } else {
        t = nof_iterations;
      }
-      for (j = 0; j < t; j++) {

-        if (!j)
-          gettimeofday(&tdata[1], NULL); // Only measure 1 iteration
-        srslte_tdec_iteration(&tdec, llr, frame_length);
-        srslte_tdec_decision(&tdec, data_rx, frame_length);
-        if (!j)
+      gettimeofday(&tdata[1], NULL); 
+      for (int k=0;k<nof_repetitions;k++) {     
+        srslte_tdec_run_all(&tdec, llr, data_rx, t, frame_length);        
+      }
      gettimeofday(&tdata[2], NULL);
-        if (!j)
      get_time_interval(tdata);
-        if (!j)
-          mean_usec = (float) mean_usec * 0.9 + (float) tdata[0].tv_usec * 0.1;
+      mean_usec = (float) mean_usec * 0.9 + (float) (tdata[0].tv_usec/nof_repetitions) * 0.1;
      
-        /* check errors */
-        errors[j] += srslte_bit_diff(data_tx, data_rx, frame_length);
-        if (j < MAX_ITERATIONS) {
-          ber[j][i] = (float) errors[j] / (frame_cnt * frame_length);
-        }
+      errors += srslte_bit_diff(data_tx, data_rx, frame_length);
+      
+      gettimeofday(&tdata[1], NULL); 
+      for (int k=0;k<nof_repetitions;k++) {     
+        srslte_tdec_vl_run_all(&tdec_vl, llr, data_rx, t, frame_length);
      }
+      gettimeofday(&tdata[2], NULL);
+      get_time_interval(tdata);
+      mean_usec_vl = (float) mean_usec_vl * 0.9 + (float) (tdata[0].tv_usec/nof_repetitions) * 0.1;
+
+      /* check errors */
+      errors_vl += srslte_bit_diff(data_tx, data_rx, frame_length);
+      
      frame_cnt++;
-      printf("Eb/No: %3.2f %10d/%d   ",
-      SNR_MIN + i * ebno_inc, frame_cnt, nof_frames);
-      printf("BER: %.2e  ", (float) errors[j - 1] / (frame_cnt * frame_length));
-      printf("%3.1f Mbps (%6.2f usec)", (float) frame_length / mean_usec,
-          mean_usec);
+      printf("Eb/No: %2.2f %10d/%d   ", SNR_MIN + i * ebno_inc, frame_cnt, nof_frames);
+      printf("BER: %.2e  ", (float) errors / (frame_cnt * frame_length));
+      printf("BER_vl: %.2e  ", (float) errors_vl / (frame_cnt * frame_length));
+      printf("%3.1f Mbps (%6.2f usec) -- vl: ", (float) frame_length / mean_usec, mean_usec);
+      printf("%3.1f Mbps (%6.2f usec)", (float) frame_length / mean_usec_vl, mean_usec_vl);
      printf("\r");

    }    
    printf("\n");
+  }

+  printf("\n");
  if (snr_points == 1) {
-      if (test_known_data && seed == KNOWN_DATA_SEED
-          && ebno_db == KNOWN_DATA_EBNO && frame_cnt == KNOWN_DATA_NFRAMES) {
-        for (j = 0; j < MAX_ITERATIONS; j++) {
-          if (errors[j] > known_data_errors[j]) {
-            fprintf(stderr, "Expected %d errors but got %d\n",
-                known_data_errors[j], errors[j]);
-            exit(-1);
-          } else {
-            printf("Iter %d ok\n", j + 1);
-          }
-        }
-      } else {
-        for (j = 0; j < MAX_ITERATIONS; j++) {
-          printf("BER: %g\t%u errors\n",
-              (float) errors[j] / (frame_cnt * frame_length), errors[j]);
-          if (test_errors) {
-            if (errors[j]
-                > get_expected_errors(frame_cnt, seed, j + 1, frame_length,
-                    ebno_db)) {
-              fprintf(stderr, "Expected %d errors but got %d\n",
-                  get_expected_errors(frame_cnt, seed, j + 1, frame_length,
-                      ebno_db), errors[j]);
-              exit(-1);
-            } else {
-              printf("Iter %d ok\n", j + 1);
-            }
-          }
-        }
+    if (errors) {
+      printf("%d Errors\n", errors);
    }
+    if (errors_vl) {
+      printf("%d Errors in VL\n", errors_vl);
    }
  }    

+
  free(data_tx);
  free(symbols);
  free(llr);
@ -326,7 +300,6 @@ int main(int argc, char **argv) {
  srslte_tcod_free(&tcod);

  printf("\n");
-  output_matlab(ber, snr_points);
  printf("Done\n");
  exit(0);
 }
--- a/srslte/lib/phch/src/sch.c
+++ b/srslte/lib/phch/src/sch.c
@ -441,6 +441,7 @@ static int decode_tb(srslte_sch_t *q,
          early_stop = true;           
        }
        
+        
      } while (q->nof_iterations < SRSLTE_PDSCH_MAX_TDEC_ITERS && !early_stop);
      q->average_nof_iterations = SRSLTE_VEC_EMA((float) q->nof_iterations, q->average_nof_iterations, 0.2);

--- a/srslte/lib/phch/test/pdsch_test.c
+++ b/srslte/lib/phch/test/pdsch_test.c
@ -228,13 +228,11 @@ int main(int argc, char **argv) {
    int r = srslte_pdsch_decode(&pdsch, &pdsch_cfg, &softbuffer_rx, slot_symbols[0], ce, 0, data);
    gettimeofday(&t[2], NULL);
    get_time_interval(t);
+    // The status is a string ("Error"/"OK"), so it must be printed with %s
+    printf("DECODED %s in %d:%d (%.2f Mbps)\n", r?"Error":"OK",
+             (int) t[0].tv_sec, (int) t[0].tv_usec, (float) grant.mcs.tbs/t[0].tv_usec);                
    if (r) {
-      printf("Error decoding TBS: %d\n", grant.mcs.tbs);
      ret = -1;
      goto quit;
-    } else {
-      printf("DECODED OK in %d:%d (%.2f Mbps)\n", 
-             (int) t[0].tv_sec, (int) t[0].tv_usec, (float) grant.mcs.tbs/t[0].tv_usec);                
    } 
  } 
  ret = 0;
--- a/srslte/lib/utils/src/vector.c
+++ b/srslte/lib/utils/src/vector.c
@ -33,8 +33,11 @@
 #include <string.h>

 #include "srslte/utils/vector.h"
+#include "srslte/utils/vector_simd.h"
 #include "srslte/utils/bit.h"

+#define HAVE_VECTOR_SIMD
+
 #ifdef HAVE_VOLK
 #include "volk/volk.h"
 #endif
@ -102,6 +105,17 @@ void srslte_vec_sub_fff(float *x, float *y, float *z, uint32_t len) {
 #endif 
 }

+// Element-wise 16-bit subtraction: z[n] = x[n] - y[n] for n in [0, len).
+// Dispatches to the SIMD kernel when HAVE_VECTOR_SIMD is defined.
+void srslte_vec_sub_sss(short *x, short *y, short *z, uint32_t len) {
+#ifdef HAVE_VECTOR_SIMD
+  srslte_vec_sub_sss_simd(x, y, z, len);
+#else
+  for (uint32_t n = 0; n < len; n++) {
+    z[n] = x[n] - y[n];
+  }
+#endif
+}
+
 void srslte_vec_sub_ccc(cf_t *x, cf_t *y, cf_t *z, uint32_t len) {
  return srslte_vec_sub_fff((float*) x,(float*) y,(float*) z, 2*len);
 }
@ -117,6 +131,17 @@ void srslte_vec_sum_fff(float *x, float *y, float *z, uint32_t len) {
 #endif
 }

+// Element-wise 16-bit addition: z[n] = x[n] + y[n] for n in [0, len).
+// Dispatches to the SIMD kernel when HAVE_VECTOR_SIMD is defined.
+void srslte_vec_sum_sss(short *x, short *y, short *z, uint32_t len) {
+#ifdef HAVE_VECTOR_SIMD
+  srslte_vec_sum_sss_simd(x, y, z, len);
+#else
+  for (uint32_t n = 0; n < len; n++) {
+    z[n] = x[n] + y[n];
+  }
+#endif
+}
+
 void srslte_vec_sum_ccc(cf_t *x, cf_t *y, cf_t *z, uint32_t len) {
  srslte_vec_sum_fff((float*) x,(float*) y,(float*) z,2*len);
 }
@ -160,6 +185,25 @@ void srslte_vec_sc_prod_fff(float *x, float h, float *z, uint32_t len) {
 #endif
 }

+// Scales a 16-bit vector by a float: z[n] = x[n]*h, converted back to short.
+void srslte_vec_sc_prod_sfs(short *x, float h, short *z, uint32_t len) {
+  uint32_t n = 0;
+  while (n < len) {
+    z[n] = x[n]*h;
+    n++;
+  }
+}
+
+// Divides every element by 2^n_rightshift: z[i] = floor(x[i] / 2^n_rightshift).
+//
+// The SIMD kernel uses an arithmetic right shift, which rounds toward minus
+// infinity. The scalar fallback reproduces exactly that rounding (a plain
+// integer division would round negative values toward zero and make the two
+// builds disagree). n_rightshift must be non-negative; counts above 15 behave
+// like 15, as they do for a 16-bit arithmetic shift.
+void srslte_vec_sc_div2_sss(short *x, int n_rightshift, short *z, uint32_t len) {
+#ifndef HAVE_VECTOR_SIMD
+  const int shift = n_rightshift > 15 ? 15 : n_rightshift;
+  for (uint32_t i = 0; i < len; i++) {
+    int v = x[i];
+    // ~v is non-negative when v is negative, so only non-negative values are
+    // ever shifted; ~((~v) >> s) equals floor(v / 2^s)
+    z[i] = (short) (v >= 0 ? (v >> shift) : ~((~v) >> shift));
+  }
+#else
+  srslte_vec_sc_div2_sss_simd(x, n_rightshift, z, len);
+#endif
+}
+
 // TODO: Improve this implementation
 void srslte_vec_norm_cfc(cf_t *x, float amplitude, cf_t *y, uint32_t len) {
  // We should use fabs() here but is statistically should be similar
@ -207,6 +251,18 @@ void srslte_vec_convert_fi(float *x, int16_t *z, float scale, uint32_t len) {
 #endif
 }

+// Gather through a look-up table: y[n] = x[lut[n]] for n in [0, len).
+void srslte_vec_lut_fuf(float *x, uint32_t *lut, float *y, uint32_t len) {
+  uint32_t n = 0;
+  while (n < len) {
+    y[n] = x[lut[n]];
+    n++;
+  }
+}
+
+// Gather through a 16-bit look-up table: y[n] = x[lut[n]] for n in [0, len).
+void srslte_vec_lut_sss(short *x, unsigned short *lut, short *y, uint32_t len) {
+  uint32_t n = 0;
+  while (n < len) {
+    y[n] = x[lut[n]];
+    n++;
+  }
+}
+
 void srslte_vec_interleave_cf(float *real, float *imag, cf_t *x, uint32_t len) {
 #ifdef HAVE_VOLK_INTERLEAVE_FUNCTION
  volk_32f_x2_interleave_32fc(x, real, imag, len);
@ -316,6 +372,15 @@ void srslte_vec_fprint_i(FILE *stream, int *x, uint32_t len) {
  fprintf(stream, "];\n");
 }

+// Prints a 16-bit vector as "[v0, v1, ..., ];" followed by a newline.
+void srslte_vec_fprint_s(FILE *stream, short *x, uint32_t len) {
+  fputs("[", stream);
+  for (const short *p = x; p != x + len; p++) {
+    fprintf(stream, "%d, ", *p);
+  }
+  fputs("];\n", stream);
+}
+
 void srslte_vec_fprint_hex(FILE *stream, uint8_t *x, uint32_t len) {
  uint32_t i, nbytes; 
  uint8_t byte;
--- a/srslte/lib/utils/src/vector_simd.c
+++ b/srslte/lib/utils/src/vector_simd.c
@ -0,0 +1,137 @@
+/**
+ *
+ * \section COPYRIGHT
+ *
+ * Copyright 2013-2015 The srsLTE Developers. See the
+ * COPYRIGHT file at the top-level directory of this distribution.
+ *
+ * \section LICENSE
+ *
+ * This file is part of the srsLTE library.
+ *
+ * srsLTE is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as
+ * published by the Free Software Foundation, either version 3 of
+ * the License, or (at your option) any later version.
+ *
+ * srsLTE is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Affero General Public License for more details.
+ *
+ * A copy of the GNU Affero General Public License can be found in
+ * the LICENSE file in the top-level directory of this distribution
+ * and at http://www.gnu.org/licenses/.
+ *
+ */
+
+
+#include <float.h>
+#include <complex.h>
+#include <math.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "srslte/utils/vector_simd.h"
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#include <xmmintrin.h>
+
+// Debug helper: prints the eight signed 16-bit lanes of a 128-bit vector.
+void print128_num(__m128i var)
+{
+    // Copy the lanes out with a store instead of reading the vector object
+    // through an int16_t pointer
+    int16_t val[8];
+    _mm_storeu_si128((__m128i*) val, var);
+    printf("Numerical: %d %d %d %d %d %d %d %d \n", 
+           val[0], val[1], val[2], val[3], val[4], val[5], 
+           val[6], val[7]);
+}
+
+// 16-bit element-wise addition, z = x + y, eight samples per SSE2 operation.
+// The vector part uses aligned loads and stores, so x, y and z must be
+// 16-byte aligned; the last len % 8 samples are handled one by one.
+void srslte_vec_sum_sss_simd(short *x, short *y, short *z, uint32_t len)
+{
+  const uint32_t nof_vectors = len / 8;
+
+  for (uint32_t v = 0; v < nof_vectors; v++) {
+    __m128i a = _mm_load_si128((const __m128i*) &x[8 * v]);
+    __m128i b = _mm_load_si128((const __m128i*) &y[8 * v]);
+    _mm_store_si128((__m128i*) &z[8 * v], _mm_add_epi16(a, b));
+  }
+
+  for (uint32_t n = 8 * nof_vectors; n < len; n++) {
+    z[n] = x[n] + y[n];
+  }
+}
+
+// 16-bit element-wise subtraction, z = x - y, eight samples per SSE2 operation.
+// The vector part uses aligned loads and stores, so x, y and z must be
+// 16-byte aligned; the last len % 8 samples are handled one by one.
+void srslte_vec_sub_sss_simd(short *x, short *y, short *z, uint32_t len)
+{
+  const uint32_t nof_vectors = len / 8;
+
+  for (uint32_t v = 0; v < nof_vectors; v++) {
+    __m128i a = _mm_load_si128((const __m128i*) &x[8 * v]);
+    __m128i b = _mm_load_si128((const __m128i*) &y[8 * v]);
+    _mm_store_si128((__m128i*) &z[8 * v], _mm_sub_epi16(a, b));
+  }
+
+  for (uint32_t n = 8 * nof_vectors; n < len; n++) {
+    z[n] = x[n] - y[n];
+  }
+}
+
+// Divides every 16-bit sample by 2^k: z[n] = floor(x[n] / 2^k).
+//
+// The vector part processes eight samples at a time with an arithmetic right
+// shift and uses aligned loads and stores, so x and z must be 16-byte aligned.
+// The scalar tail (last len % 8 samples) applies the same floor rounding; an
+// integer division there would round negative samples toward zero and make
+// the result depend on where in the vector a sample sits.
+// k must be non-negative.
+void srslte_vec_sc_div2_sss_simd(short *x, int k, short *z, uint32_t len)
+{
+  unsigned int number = 0;
+  const unsigned int points = len / 8;
+
+  const __m128i* xPtr = (const __m128i*) x;
+  __m128i* zPtr = (__m128i*) z;
+
+  __m128i xVal, zVal;
+  for(;number < points; number++){
+
+    xVal = _mm_load_si128(xPtr);
+    
+    zVal = _mm_srai_epi16(xVal, k);                 
+      
+    _mm_store_si128(zPtr, zVal); 
+
+    xPtr ++;
+    zPtr ++;
+  }
+
+  // A 16-bit arithmetic shift saturates at 15 positions (pure sign fill)
+  const int shift = k > 15 ? 15 : k;
+  for(number = points * 8; number < len; number++){
+    int v = x[number];
+    // Only non-negative values are shifted; ~((~v) >> s) == floor(v / 2^s)
+    z[number] = (short) (v >= 0 ? (v >> shift) : ~((~v) >> shift));
+  }
+}
+