avx turbo decoder working in tests

8 years ago · c1ef9da32a
parent 79e9bccbd8
commit c1ef9da32a
6 changed files with 1121 additions and 38 deletions
--- a/lib/include/srslte/phy/fec/turbodecoder_simd.h
+++ b/lib/include/srslte/phy/fec/turbodecoder_simd.h
@ -0,0 +1,135 @@
+/**
+ *
+ * \section COPYRIGHT
+ *
+ * Copyright 2013-2015 Software Radio Systems Limited
+ *
+ * \section LICENSE
+ *
+ * This file is part of the srsLTE library.
+ *
+ * srsLTE is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as
+ * published by the Free Software Foundation, either version 3 of
+ * the License, or (at your option) any later version.
+ *
+ * srsLTE is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Affero General Public License for more details.
+ *
+ * A copy of the GNU Affero General Public License can be found in
+ * the LICENSE file in the top-level directory of this distribution
+ * and at http://www.gnu.org/licenses/.
+ *
+ */
+
+/**********************************************************************************************
+ *  File:         turbodecoder.h
+ *
+ *  Description:  Turbo Decoder.
+ *                Parallel Concatenated Convolutional Code (PCCC) with two 8-state constituent
+ *                encoders and one turbo code internal interleaver. The coding rate of turbo
+ *                encoder is 1/3.
+ *                MAP_GEN is the MAX-LOG-MAP generic implementation of the decoder.
+ *
+ *  Reference:    3GPP TS 36.212 version 10.0.0 Release 10 Sec. 5.1.3.2
+ *********************************************************************************************/
+
+#ifndef TURBODECODER_SSE_
+#define TURBODECODER_SSE_
+
+#include "srslte/config.h"
+#include "srslte/phy/fec/tc_interl.h"
+#include "srslte/phy/fec/cbsegm.h"
+
+//#define ENABLE_SIMD_INTER
+
+// The constant SRSLTE_TDEC_NPAR defines the maximum number of parallel CB supported by all SIMD decoders 
+#ifdef ENABLE_SIMD_INTER
+  #include "srslte/phy/fec/turbodecoder_simd_inter.h"
+  #if LV_HAVE_AVX2
+    #define SRSLTE_TDEC_NPAR_INTRA 2
+  #else
+    #define SRSLTE_TDEC_NPAR_INTRA 1
+  #endif
+#else
+  #if LV_HAVE_AVX2
+    #define SRSLTE_TDEC_NPAR 2
+  #else
+    #define SRSLTE_TDEC_NPAR 1
+  #endif
+#endif
+
+#define SRSLTE_TCOD_RATE 3
+#define SRSLTE_TCOD_TOTALTAIL 12
+
+#define SRSLTE_TCOD_MAX_LEN_CB     6144
+#define SRSLTE_TCOD_MAX_LEN_CODED  (SRSLTE_TCOD_RATE*SRSLTE_TCOD_MAX_LEN_CB+SRSLTE_TCOD_TOTALTAIL)
+
+typedef struct SRSLTE_API {
+  uint32_t max_long_cb;
+  uint32_t max_par_cb; 
+  int16_t *alpha;
+  int16_t *branch;
+} map_gen_t;
+
+typedef struct SRSLTE_API {
+  uint32_t max_long_cb;
+  uint32_t max_par_cb; 
+  
+  map_gen_t dec;
+
+  int16_t *app1[SRSLTE_TDEC_NPAR];
+  int16_t *app2[SRSLTE_TDEC_NPAR];
+  int16_t *ext1[SRSLTE_TDEC_NPAR];
+  int16_t *ext2[SRSLTE_TDEC_NPAR];
+  int16_t *syst[SRSLTE_TDEC_NPAR];
+  int16_t *parity0[SRSLTE_TDEC_NPAR];
+  int16_t *parity1[SRSLTE_TDEC_NPAR];
+  
+  int cb_mask; 
+  int current_cbidx; 
+  srslte_tc_interl_t interleaver[SRSLTE_NOF_TC_CB_SIZES];
+  int n_iter[SRSLTE_TDEC_NPAR];
+} srslte_tdec_simd_t;
+
+SRSLTE_API int srslte_tdec_simd_init(srslte_tdec_simd_t * h, 
+                                     uint32_t max_par_cb, 
+                                     uint32_t max_long_cb);
+
+SRSLTE_API void srslte_tdec_simd_free(srslte_tdec_simd_t * h);
+
+SRSLTE_API int srslte_tdec_simd_reset(srslte_tdec_simd_t * h, 
+                                      uint32_t long_cb);
+
+SRSLTE_API int srslte_tdec_simd_get_nof_iterations_cb(srslte_tdec_simd_t * h, 
+                                                      uint32_t cb_idx);
+
+SRSLTE_API int srslte_tdec_simd_reset_cb(srslte_tdec_simd_t * h, 
+                                         uint32_t cb_idx);
+
+SRSLTE_API void srslte_tdec_simd_iteration(srslte_tdec_simd_t * h, 
+                                           int16_t * input[SRSLTE_TDEC_NPAR], 
+                                           uint32_t long_cb);
+
+SRSLTE_API void srslte_tdec_simd_decision(srslte_tdec_simd_t * h, 
+                                          uint8_t *output[SRSLTE_TDEC_NPAR], 
+                                          uint32_t long_cb);
+
+SRSLTE_API void srslte_tdec_simd_decision_byte(srslte_tdec_simd_t * h, 
+                                               uint8_t *output[SRSLTE_TDEC_NPAR], 
+                                               uint32_t long_cb); 
+
+SRSLTE_API void srslte_tdec_simd_decision_byte_cb(srslte_tdec_simd_t * h, 
+                                                  uint8_t *output, 
+                                                  uint32_t cbidx, 
+                                                  uint32_t long_cb); 
+
+SRSLTE_API int srslte_tdec_simd_run_all(srslte_tdec_simd_t * h, 
+                                        int16_t * input[SRSLTE_TDEC_NPAR], 
+                                        uint8_t *output[SRSLTE_TDEC_NPAR],
+                                        uint32_t nof_iterations, 
+                                        uint32_t long_cb);
+
+#endif
--- a/lib/src/phy/fec/test/turbodecoder_test.c
+++ b/lib/src/phy/fec/test/turbodecoder_test.c
@ -46,6 +46,7 @@ uint32_t seed = 0;
 int K = -1;

 #define MAX_ITERATIONS  10
+int nof_cb = 1; 
 int nof_iterations = MAX_ITERATIONS;
 int test_known_data = 0;
 int test_errors = 0;
@ -59,6 +60,7 @@ void usage(char *prog) {
  printf("Usage: %s [nlesv]\n", prog);
  printf(
      "\t-k Test with known data (ignores frame_length) [Default disabled]\n");
+  printf("\t-c nof_cb in parallel [Default %d]\n", nof_cb);
  printf("\t-i nof_iterations [Default %d]\n", nof_iterations);
  printf("\t-n nof_frames [Default %d]\n", nof_frames);
  printf("\t-N nof_repetitions [Default %d]\n", nof_repetitions);
@ -70,8 +72,11 @@ void usage(char *prog) {

 void parse_args(int argc, char **argv) {
  int opt;
-  while ((opt = getopt(argc, argv, "inNlstvekt")) != -1) {
+  while ((opt = getopt(argc, argv, "cinNlstvekt")) != -1) {
    switch (opt) {
+    case 'c':
+      nof_cb = atoi(argv[optind]);
+      break;
    case 'n':
      nof_frames = atoi(argv[optind]);
      break;
@ -112,7 +117,7 @@ int main(int argc, char **argv) {
  float *llr;
  short *llr_s;
  uint8_t *llr_c;
-  uint8_t *data_tx, *data_rx, *data_rx_bytes, *symbols;
+  uint8_t *data_tx, *data_rx, *data_rx_bytes[SRSLTE_TDEC_NPAR], *symbols;
  uint32_t i, j;
  float var[SNR_POINTS];
  uint32_t snr_points;
@ -154,10 +159,12 @@ int main(int argc, char **argv) {
    perror("malloc");
    exit(-1);
  }
-  data_rx_bytes = srslte_vec_malloc(frame_length * sizeof(uint8_t));
-  if (!data_rx_bytes) {
-    perror("malloc");
-    exit(-1);
+  for (int cb=0;cb<SRSLTE_TDEC_NPAR;cb++) {
+    data_rx_bytes[cb] = srslte_vec_malloc(frame_length * sizeof(uint8_t));
+    if (!data_rx_bytes[cb]) {
+      perror("malloc");
+      exit(-1);
+    }    
  }

  symbols = srslte_vec_malloc(coded_length * sizeof(uint8_t));
@ -232,7 +239,6 @@ int main(int argc, char **argv) {
      for (j = 0; j < coded_length; j++) {
        llr[j] = symbols[j] ? 1 : -1;
      }
-
      srslte_ch_awgn_f(llr, llr, var[i], coded_length);

      for (j=0;j<coded_length;j++) {
@ -248,23 +254,38 @@ int main(int argc, char **argv) {
        t = nof_iterations;
      }

+      int16_t *input[SRSLTE_TDEC_NPAR];
+      uint8_t *output[SRSLTE_TDEC_NPAR];        
+      
+      input[0] = llr_s; 
+      if (SRSLTE_TDEC_NPAR == 2)
+        input[1] = llr_s; 
+      
+      output[0] = data_rx_bytes[0]; 
+      if (SRSLTE_TDEC_NPAR == 2)
+        output[1] = data_rx_bytes[1]; 
+
      gettimeofday(&tdata[1], NULL); 
-      for (int k=0;k<nof_repetitions;k++) {     
-        srslte_tdec_run_all(&tdec, llr_s, data_rx_bytes, t, frame_length);        
+      for (int k=0;k<nof_repetitions;k++) { 
+        srslte_tdec_run_all_par(&tdec, input, output, t, nof_cb, frame_length);        
      }
      gettimeofday(&tdata[2], NULL);
      get_time_interval(tdata);
      mean_usec = (float) mean_usec * 0.9 + (float) (tdata[0].tv_usec/nof_repetitions) * 0.1;
      
-      srslte_bit_unpack_vector(data_rx_bytes, data_rx, frame_length);
-
-      errors += srslte_bit_diff(data_tx, data_rx, frame_length);
-      
      frame_cnt++;
+      uint32_t errors_this = 0; 
+      for (int cb=0;cb<nof_cb;cb++) {
+        srslte_bit_unpack_vector(data_rx_bytes[cb], data_rx, frame_length);
+        
+        errors_this=srslte_bit_diff(data_tx, data_rx, frame_length);
+        //printf("error[%d]=%d\n", cb, errors_this);
+        errors += errors_this;
+      }
      printf("Eb/No: %2.2f %10d/%d   ", SNR_MIN + i * ebno_inc, frame_cnt, nof_frames);
-      printf("BER: %.2e  ", (float) errors / (frame_cnt * frame_length));
-      printf("%3.1f Mbps (%6.2f usec)", (float) frame_length / mean_usec, mean_usec);
-      printf("\r");
+      printf("BER: %.2e  ", (float) errors / (nof_cb*frame_cnt * frame_length));
+      printf("%3.1f Mbps (%6.2f usec)", (float) SRSLTE_TDEC_NPAR*frame_length / mean_usec, mean_usec);
+      printf("\r");        

    }    
    printf("\n");
@ -273,7 +294,7 @@ int main(int argc, char **argv) {
  printf("\n");
  if (snr_points == 1) {
    if (errors) {
-      printf("%d Errors\n", errors);
+      printf("%d Errors\n", errors/nof_cb);
    }
  }    

--- a/lib/src/phy/fec/turbodecoder.c
+++ b/lib/src/phy/fec/turbodecoder.c
@ -35,7 +35,7 @@


 #ifdef LV_HAVE_SSE
-#include "srslte/phy/fec/turbodecoder_sse.h"
+#include "srslte/phy/fec/turbodecoder_simd.h"
 #endif

 #include "srslte/phy/utils/vector.h"
@ -43,7 +43,7 @@

 int srslte_tdec_init(srslte_tdec_t * h, uint32_t max_long_cb) {
 #ifdef LV_HAVE_SSE
-  return srslte_tdec_sse_init(&h->tdec_sse, max_long_cb);
+  return srslte_tdec_simd_init(&h->tdec_simd, max_long_cb);
 #else
  h->input_conv = srslte_vec_malloc(sizeof(float) * (3*max_long_cb+12));
  if (!h->input_conv) {
@ -56,7 +56,7 @@ int srslte_tdec_init(srslte_tdec_t * h, uint32_t max_long_cb) {

 void srslte_tdec_free(srslte_tdec_t * h) {
 #ifdef LV_HAVE_SSE
-  srslte_tdec_sse_free(&h->tdec_sse);
+  srslte_tdec_simd_free(&h->tdec_simd);
 #else
  if (h->input_conv) {
    free(h->input_conv);
@ -68,45 +68,74 @@ void srslte_tdec_free(srslte_tdec_t * h) {

 int srslte_tdec_reset(srslte_tdec_t * h, uint32_t long_cb) {
 #ifdef LV_HAVE_SSE
-  return srslte_tdec_sse_reset(&h->tdec_sse, long_cb);
+  return srslte_tdec_simd_reset(&h->tdec_simd, long_cb);
 #else
  return srslte_tdec_gen_reset(&h->tdec_gen, long_cb);
 #endif
 }

-void srslte_tdec_iteration(srslte_tdec_t * h, int16_t* input, uint32_t long_cb) {
+void srslte_tdec_iteration_par(srslte_tdec_t * h, int16_t* input[SRSLTE_TDEC_NPAR], uint32_t nof_cb, uint32_t long_cb) {
 #ifdef LV_HAVE_SSE
-  srslte_tdec_sse_iteration(&h->tdec_sse, input, long_cb);
+  srslte_tdec_simd_iteration(&h->tdec_simd, input, nof_cb, long_cb);
 #else
-  srslte_vec_convert_if(input, h->input_conv, 0.01, 3*long_cb+12);
+  srslte_vec_convert_if(input[0], h->input_conv, 0.01, 3*long_cb+12);
  srslte_tdec_gen_iteration(&h->tdec_gen, h->input_conv, long_cb);
 #endif
 }

-void srslte_tdec_decision(srslte_tdec_t * h, uint8_t *output, uint32_t long_cb) {
+void srslte_tdec_iteration(srslte_tdec_t * h, int16_t* input, uint32_t long_cb) {
+  int16_t *input_par[SRSLTE_TDEC_NPAR]; 
+  input_par[0] = input; 
+  return srslte_tdec_iteration_par(h, input_par, 1, long_cb);
+}
+
+void srslte_tdec_decision_par(srslte_tdec_t * h, uint8_t *output[SRSLTE_TDEC_NPAR], uint32_t nof_cb, uint32_t long_cb) {
 #ifdef LV_HAVE_SSE
-  return srslte_tdec_sse_decision(&h->tdec_sse, output, long_cb);
+  return srslte_tdec_simd_decision(&h->tdec_simd, output, nof_cb, long_cb);
 #else
-  return srslte_tdec_gen_decision(&h->tdec_gen, output, long_cb);
+  return srslte_tdec_gen_decision(&h->tdec_gen, output[0], long_cb);
 #endif
+}
+
+void srslte_tdec_decision(srslte_tdec_t * h, uint8_t *output, uint32_t long_cb) {
+  uint8_t *output_par[SRSLTE_TDEC_NPAR]; 
+  output_par[0] = output; 
+  srslte_tdec_decision_par(h, output_par, 1, long_cb);
+}

+void srslte_tdec_decision_byte_par(srslte_tdec_t * h, uint8_t *output[SRSLTE_TDEC_NPAR], uint32_t nof_cb, uint32_t long_cb) {
+#ifdef LV_HAVE_SSE
+  srslte_tdec_simd_decision_byte(&h->tdec_simd, output, nof_cb, long_cb);
+#else
+  srslte_tdec_gen_decision_byte(&h->tdec_gen, output[0], long_cb);
+#endif  
 }

 void srslte_tdec_decision_byte(srslte_tdec_t * h, uint8_t *output, uint32_t long_cb) {
+  uint8_t *output_par[SRSLTE_TDEC_NPAR]; 
+  output_par[0] = output; 
+  srslte_tdec_decision_byte_par(h, output_par, 1, long_cb);
+}
+
+int srslte_tdec_run_all_par(srslte_tdec_t * h, int16_t * input[SRSLTE_TDEC_NPAR], 
+                            uint8_t *output[SRSLTE_TDEC_NPAR], 
+                            uint32_t nof_iterations, uint32_t nof_cb, uint32_t long_cb) {
 #ifdef LV_HAVE_SSE
-  return srslte_tdec_sse_decision_byte(&h->tdec_sse, output, long_cb);
+  return srslte_tdec_simd_run_all(&h->tdec_simd, input, output, nof_iterations, nof_cb, long_cb);
 #else
-  return srslte_tdec_gen_decision_byte(&h->tdec_gen, output, long_cb);
+  srslte_vec_convert_if(input[0], h->input_conv, 0.01, 3*long_cb+12);
+  return srslte_tdec_gen_run_all(&h->tdec_gen, h->input_conv, output[0], nof_iterations, long_cb);
 #endif
-  
 }

 int srslte_tdec_run_all(srslte_tdec_t * h, int16_t * input, uint8_t *output, uint32_t nof_iterations, uint32_t long_cb)
 {
-#ifdef LV_HAVE_SSE
-  return srslte_tdec_sse_run_all(&h->tdec_sse, input, output, nof_iterations, long_cb);
-#else
-  srslte_vec_convert_if(input, h->input_conv, 0.01, 3*long_cb+12);
-  return srslte_tdec_gen_run_all(&h->tdec_gen, h->input_conv, output, nof_iterations, long_cb);
-#endif
+  uint8_t *output_par[SRSLTE_TDEC_NPAR]; 
+  output_par[0] = output;   
+  int16_t *input_par[SRSLTE_TDEC_NPAR]; 
+  input_par[0] = input; 
+ 
+  return srslte_tdec_run_all_par(h, input_par, output_par, nof_iterations, 1, long_cb);
 }
+
+
--- a/lib/src/phy/fec/turbodecoder_avx.c
+++ b/lib/src/phy/fec/turbodecoder_avx.c
@ -0,0 +1,401 @@
+/**
+ *
+ * \section COPYRIGHT
+ *
+ * Copyright 2013-2015 Software Radio Systems Limited
+ *
+ * \section LICENSE
+ *
+ * This file is part of the srsLTE library.
+ *
+ * srsLTE is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as
+ * published by the Free Software Foundation, either version 3 of
+ * the License, or (at your option) any later version.
+ *
+ * srsLTE is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Affero General Public License for more details.
+ *
+ * A copy of the GNU Affero General Public License can be found in
+ * the LICENSE file in the top-level directory of this distribution
+ * and at http://www.gnu.org/licenses/.
+ *
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <string.h>
+#include <strings.h>
+#include <math.h>
+
+#include "srslte/phy/fec/turbodecoder_simd.h"
+#include "srslte/phy/utils/vector.h"
+
+#include <inttypes.h>
+
+#define NUMSTATES       8
+#define NINPUTS         2
+#define TAIL            3
+#define TOTALTAIL       12
+
+#define INF 10000
+#define ZERO 0
+
+
+#ifdef LV_HAVE_AVX2
+
+#include <smmintrin.h>
+#include <immintrin.h>
+
+
+// Number of CB processed in parllel in AVX
+#define NCB 2 
+
+/*
+static void print_256i(__m256i x) {
+  int16_t *s = (int16_t*) &x; 
+  printf("[%d", s[0]);
+  for (int i=1;i<16;i++) {
+    printf(",%d", s[i]);
+  }
+  printf("]\n");
+}
+*/
+
+/* Computes the horizontal MAX from 8 16-bit integers using the minpos_epu16 SSE4.1 instruction */
+static inline int16_t hMax0(__m256i masked_value)
+{
+  __m128i tmp1 = _mm256_extractf128_si256(masked_value, 0); 
+  __m128i tmp3 = _mm_minpos_epu16(tmp1);
+  return (int16_t)(_mm_cvtsi128_si32(tmp3));
+}
+
+static inline int16_t hMax1(__m256i masked_value)
+{
+  __m128i tmp1 = _mm256_extractf128_si256(masked_value, 1); 
+  __m128i tmp3 = _mm_minpos_epu16(tmp1);
+  return (int16_t)(_mm_cvtsi128_si32(tmp3));
+}
+
+/* Computes beta values */
+void map_avx_beta(map_gen_t * s, int16_t * output[SRSLTE_TDEC_NPAR], uint32_t long_cb)
+{
+  int k;
+  uint32_t end = long_cb + 3;
+  const __m256i *alphaPtr = (const __m256i*) s->alpha;
+ 
+  __m256i beta_k = _mm256_set_epi16(-INF, -INF, -INF, -INF, -INF, -INF, -INF, 0, -INF, -INF, -INF, -INF, -INF, -INF, -INF, 0);
+  __m256i g, bp, bn, alpha_k; 
+  
+  /* Define the shuffle constant for the positive beta */
+  __m256i shuf_bp = _mm256_set_epi8(
+    // 1st CB
+    15+16, 14+16, // 7
+    7+16,  6+16,  // 3
+    5+16,  4+16,  // 2
+    13+16, 12+16, // 6
+    11+16, 10+16, // 5
+    3+16,  2+16,  // 1
+    1+16,  0+16,  // 0
+    9+16,  8+16,  // 4
+
+    // 2nd CB
+    15, 14, // 7
+    7,  6,  // 3
+    5,  4,  // 2
+    13, 12, // 6
+    11, 10, // 5
+    3,  2,  // 1
+    1,  0,  // 0
+    9,  8   // 4
+  );
+
+  /* Define the shuffle constant for the negative beta */
+  __m256i shuf_bn = _mm256_set_epi8(
+    7+16, 6+16,   // 3
+    15+16, 14+16, // 7
+    13+16, 12+16, // 6
+    5+16,  4+16,  // 2
+    3+16,  2+16,  // 1
+    11+16, 10+16, // 5
+    9+16,  8+16,  // 4
+    1+16,  0+16,  // 0
+    
+    7,   6, // 3
+    15, 14, // 7
+    13, 12, // 6
+    5,  4,  // 2
+    3,  2,  // 1
+    11, 10, // 5
+    9,  8,  // 4
+    1,  0   // 0
+  );
+ 
+  alphaPtr += long_cb-1;
+
+  /* Define shuffle for branch costs */
+  __m256i shuf_g[4];
+  shuf_g[3] = _mm256_set_epi8(3+16,2+16,1+16,0+16,1+16,0+16,3+16,2+16,3+16,2+16,1+16,0+16,1+16,0+16,3+16,2+16, 
+                              3,2,1,0,1,0,3,2,3,2,1,0,1,0,3,2);
+  shuf_g[2] = _mm256_set_epi8(7+16,6+16,5+16,4+16,5+16,4+16,7+16,6+16,7+16,6+16,5+16,4+16,5+16,4+16,7+16,6+16,
+                              7,6,5,4,5,4,7,6,7,6,5,4,5,4,7,6);
+  shuf_g[1] = _mm256_set_epi8(11+16,10+16,9+16,8+16,9+16,8+16,11+16,10+16,11+16,10+16,9+16,8+16,9+16,8+16,11+16,10+16,
+                              11,10,9,8,9,8,11,10,11,10,9,8,9,8,11,10);
+  shuf_g[0] = _mm256_set_epi8(15+16,14+16,13+16,12+16,13+16,12+16,15+16,14+16,15+16,14+16,13+16,12+16,13+16,12+16,15+16,14+16,
+                              15,14,13,12,13,12,15,14,15,14,13,12,13,12,15,14);
+
+  /* Define shuffle for beta normalization */
+  __m256i shuf_norm = _mm256_set_epi8(17,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0);
+
+  __m256i gv;
+  int16_t *b = &s->branch[2*NCB*long_cb-16];
+  __m256i *gPtr = (__m256i*) b;
+
+  __m256i bn2, bp2; 
+  
+  /* This defines a beta computation step: 
+   * Adds and substracts the branch metrics to the previous beta step, 
+   * shuffles the states according to the trellis path and selects maximum state 
+   */
+#define BETA_STEP(g)     bp = _mm256_add_epi16(beta_k, g);\
+    bn = _mm256_sub_epi16(beta_k, g);\
+    bp = _mm256_shuffle_epi8(bp, shuf_bp);\
+    bn = _mm256_shuffle_epi8(bn, shuf_bn);\
+    beta_k = _mm256_max_epi16(bp, bn);    
+
+    /* Loads the alpha metrics from memory and adds them to the temporal bn and bp 
+     * metrics. Then computes horizontal maximum of both metrics and computes difference
+     */
+#define BETA_STEP_CNT(c,d) g = _mm256_shuffle_epi8(gv, shuf_g[c]);\
+    BETA_STEP(g)\
+    alpha_k = _mm256_load_si256(alphaPtr);\
+    alphaPtr--;\
+    bp = _mm256_add_epi16(bp, alpha_k);\
+    bn = _mm256_add_epi16(bn, alpha_k);\
+    bn2 = _mm256_sub_epi8(_mm256_set1_epi16(0x7FFF), bn);\
+    bp2 = _mm256_sub_epi8(_mm256_set1_epi16(0x7FFF), bp);\
+    output[0][k-d] = hMax0(bn2) - hMax0(bp2);\
+    output[1][k-d] = hMax1(bn2) - hMax1(bp2);
+
+  /* The tail does not require to load alpha or produce outputs. Only update 
+   * beta metrics accordingly */
+  for (k=end-1; k>=long_cb; k--) {
+    int16_t g0_1 = s->branch[2*NCB*k];
+    int16_t g1_1 = s->branch[2*NCB*k+1];
+    int16_t g0_2 = s->branch[2*NCB*k+6];
+    int16_t g1_2 = s->branch[2*NCB*k+6+1];
+    g = _mm256_set_epi16(g1_2, g0_2, g0_2, g1_2, g1_2, g0_2, g0_2, g1_2, g1_1, g0_1, g0_1, g1_1, g1_1, g0_1, g0_1, g1_1);
+    BETA_STEP(g);
+  }  
+  
+  /* We inline 2 trelis steps for each normalization */
+  __m256i norm;
+  for (; k >= 0; k-=8) {    
+    gv = _mm256_load_si256(gPtr);
+    gPtr--;
+    BETA_STEP_CNT(0,0);
+    BETA_STEP_CNT(1,1);
+    BETA_STEP_CNT(2,2);
+    BETA_STEP_CNT(3,3);
+    norm = _mm256_shuffle_epi8(beta_k, shuf_norm); 
+    beta_k = _mm256_sub_epi16(beta_k, norm);
+    gv = _mm256_load_si256(gPtr);
+    gPtr--;
+    BETA_STEP_CNT(0,4);
+    BETA_STEP_CNT(1,5);
+    BETA_STEP_CNT(2,6);
+    BETA_STEP_CNT(3,7);
+    norm = _mm256_shuffle_epi8(beta_k, shuf_norm); 
+    beta_k = _mm256_sub_epi16(beta_k, norm);
+  }  
+}
+
+/* Computes alpha metrics */
+void map_avx_alpha(map_gen_t * s, uint32_t long_cb)
+{
+  uint32_t k;
+  int16_t *alpha1 = s->alpha;
+  int16_t *alpha2 = &s->alpha[8];
+  uint32_t i;
+
+  alpha1[0] = 0; 
+  alpha2[0] = 0; 
+  for (i = 1; i < 8; i++) {
+    alpha1[i] = -INF;
+    alpha2[i] = -INF;
+  }
+    
+  /* Define the shuffle constant for the positive alpha */
+  __m256i shuf_ap = _mm256_set_epi8(
+
+    // 1st CB 
+    31, 30, // 7
+    25, 24, // 4
+    23, 22, // 3
+    17, 16, // 0
+    29, 28, // 6
+    27, 26, // 5
+    21, 20, // 2
+    19, 18, // 1
+
+    // 2nd CB
+    15, 14, // 7
+    9,  8,  // 4
+    7,  6,  // 3
+    1,  0,  // 0
+    13, 12, // 6
+    11, 10, // 5
+    5,  4,  // 2
+    3,  2   // 1
+  );
+
+  /* Define the shuffle constant for the negative alpha */
+  __m256i shuf_an = _mm256_set_epi8(
+
+    // 1nd CB 
+    29, 28, // 6
+    27, 26, // 5
+    21, 20, // 2
+    19, 18, // 1
+    31, 30, // 7
+    25, 24, // 4
+    23, 22, // 3
+    17, 16, // 0
+
+    // 2nd CB 
+    13, 12, // 6
+    11, 10, // 5
+    5,  4,  // 2
+    3,  2,  // 1
+    15, 14, // 7
+    9,  8,  // 4
+    7,  6,  // 3
+    1,  0   // 0
+  );
+  
+  /* Define shuffle for branch costs */
+  __m256i shuf_g[4];
+  shuf_g[0] = _mm256_set_epi8(3+16,2+16,3+16,2+16,1+16,0+16,1+16,0+16,1+16,0+16,1+16,0+16,3+16,2+16,3+16,2+16,       
+                              3,2,3,2,1,0,1,0,1,0,1,0,3,2,3,2);
+  shuf_g[1] = _mm256_set_epi8(7+16,6+16,7+16,6+16,5+16,4+16,5+16,4+16,5+16,4+16,5+16,4+16,7+16,6+16,7+16,6+16,    
+                              7,6,7,6,5,4,5,4,5,4,5,4,7,6,7,6);
+  shuf_g[2] = _mm256_set_epi8(11+16,10+16,11+16,10+16,9+16,8+16,9+16,8+16,9+16,8+16,9+16,8+16,11+16,10+16,11+16,10+16,       
+                              11,10,11,10,9,8,9,8,9,8,9,8,11,10,11,10);
+  shuf_g[3] = _mm256_set_epi8(15+16,14+16,15+16,14+16,13+16,12+16,13+16,12+16,13+16,12+16,13+16,12+16,15+16,14+16,15+16,14+16,
+                              15,14,15,14,13,12,13,12,13,12,13,12,15,14,15,14);
+
+  __m256i shuf_norm = _mm256_set_epi8(17,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0);
+  
+  __m256i* alphaPtr = (__m256i*) s->alpha;
+  alphaPtr++;
+
+  __m256i gv; 
+  __m256i *gPtr = (__m256i*) s->branch;
+  __m256i g, ap, an; 
+    
+  __m256i alpha_k = _mm256_set_epi16(-INF, -INF, -INF, -INF, -INF, -INF, -INF, 0, -INF, -INF, -INF, -INF, -INF, -INF, -INF, 0);
+  
+  /* This defines a alpha computation step: 
+   * Adds and substracts the branch metrics to the previous alpha step, 
+   * shuffles the states according to the trellis path and selects maximum state 
+   */
+#define ALPHA_STEP(c)  g = _mm256_shuffle_epi8(gv, shuf_g[c]); \
+  ap = _mm256_add_epi16(alpha_k, g);\
+  an = _mm256_sub_epi16(alpha_k, g);\
+  ap = _mm256_shuffle_epi8(ap, shuf_ap);\
+  an = _mm256_shuffle_epi8(an, shuf_an);\
+  alpha_k = _mm256_max_epi16(ap, an);\
+  _mm256_store_si256(alphaPtr, alpha_k);\
+  alphaPtr++;    \
+
+
+  /* In this loop, we compute 8 steps and normalize twice for each branch metrics memory load */
+  __m256i norm;
+  for (k = 0; k < long_cb/8; k++) {
+    gv = _mm256_load_si256(gPtr);
+    
+    gPtr++;
+    ALPHA_STEP(0);
+    ALPHA_STEP(1);
+    ALPHA_STEP(2);
+    ALPHA_STEP(3);
+    norm = _mm256_shuffle_epi8(alpha_k, shuf_norm); 
+    alpha_k = _mm256_sub_epi16(alpha_k, norm);
+    gv = _mm256_load_si256(gPtr);
+    gPtr++;
+    ALPHA_STEP(0);
+    ALPHA_STEP(1);
+    ALPHA_STEP(2);
+    ALPHA_STEP(3);
+    norm = _mm256_shuffle_epi8(alpha_k, shuf_norm); 
+    alpha_k = _mm256_sub_epi16(alpha_k, norm);
+  }  
+}
+
+/* Compute branch metrics (gamma) */
+void map_avx_gamma(map_gen_t * h, int16_t *input, int16_t *app, int16_t *parity, uint32_t cbidx, uint32_t long_cb) 
+{
+  __m128i res10, res20, res11, res21, res1, res2; 
+  __m128i in, ap, pa, g1, g0;
+
+  __m128i *inPtr  = (__m128i*) input;
+  __m128i *appPtr = (__m128i*) app;
+  __m128i *paPtr  = (__m128i*) parity;
+  __m128i *resPtr = (__m128i*) h->branch;
+  
+  if (cbidx) {
+    resPtr++;
+  }
+  
+  __m128i res10_mask = _mm_set_epi8(0xff,0xff,7,6,0xff,0xff,5,4,0xff,0xff,3,2,0xff,0xff,1,0);
+  __m128i res20_mask = _mm_set_epi8(0xff,0xff,15,14,0xff,0xff,13,12,0xff,0xff,11,10,0xff,0xff,9,8);
+  __m128i res11_mask = _mm_set_epi8(7,6,0xff,0xff,5,4,0xff,0xff,3,2,0xff,0xff,1,0,0xff,0xff);
+  __m128i res21_mask = _mm_set_epi8(15,14,0xff,0xff,13,12,0xff,0xff,11,10,0xff,0xff,9,8,0xff,0xff);
+
+  for (int i=0;i<long_cb/8;i++) {
+    in = _mm_load_si128(inPtr);
+    inPtr++;
+    pa = _mm_load_si128(paPtr);
+    paPtr++;
+    
+    if (appPtr) {
+      ap = _mm_load_si128(appPtr);
+      appPtr++;
+      in = _mm_add_epi16(ap, in);
+    }
+    
+    g1 = _mm_add_epi16(in, pa);
+    g0 = _mm_sub_epi16(in, pa);
+
+    g1 = _mm_srai_epi16(g1, 1);
+    g0 = _mm_srai_epi16(g0, 1);
+    
+    res10 = _mm_shuffle_epi8(g0, res10_mask);
+    res20 = _mm_shuffle_epi8(g0, res20_mask);
+    res11 = _mm_shuffle_epi8(g1, res11_mask);
+    res21 = _mm_shuffle_epi8(g1, res21_mask);
+
+    res1  = _mm_or_si128(res10, res11);
+    res2  = _mm_or_si128(res20, res21);
+
+    _mm_store_si128(resPtr, res1);
+    resPtr++;
+    resPtr++;
+    _mm_store_si128(resPtr, res2);    
+    resPtr++;
+    resPtr++;    
+  }
+    
+  for (int i=long_cb;i<long_cb+3;i++) {
+    h->branch[2*i*NCB+cbidx*6]   = (input[i] - parity[i])/2;
+    h->branch[2*i*NCB+cbidx*6+1] = (input[i] + parity[i])/2;
+  }
+}
+
+
+#endif
+
+
--- a/lib/src/phy/fec/turbodecoder_simd.c
+++ b/lib/src/phy/fec/turbodecoder_simd.c
@ -0,0 +1,486 @@
+/**
+ *
+ * \section COPYRIGHT
+ *
+ * Copyright 2013-2015 Software Radio Systems Limited
+ *
+ * \section LICENSE
+ *
+ * This file is part of the srsLTE library.
+ *
+ * srsLTE is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as
+ * published by the Free Software Foundation, either version 3 of
+ * the License, or (at your option) any later version.
+ *
+ * srsLTE is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Affero General Public License for more details.
+ *
+ * A copy of the GNU Affero General Public License can be found in
+ * the LICENSE file in the top-level directory of this distribution
+ * and at http://www.gnu.org/licenses/.
+ *
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <string.h>
+#include <strings.h>
+#include <math.h>
+
+#include "srslte/phy/fec/turbodecoder_simd.h"
+#include "srslte/phy/utils/vector.h"
+
+#include <inttypes.h>
+
+#define NUMSTATES       8
+#define NINPUTS         2
+#define TAIL            3
+#define TOTALTAIL       12
+
+#define INF 10000
+#define ZERO 0
+
+
+#ifdef LV_HAVE_SSE
+#include <smmintrin.h>
+
+// Define SSE/AVX implementations 
+void map_sse_beta(map_gen_t * s, int16_t * output, uint32_t long_cb);
+void map_sse_alpha(map_gen_t * s, uint32_t long_cb);
+void map_sse_gamma(map_gen_t * h, int16_t *input, int16_t *app, int16_t *parity, uint32_t long_cb);
+
+#ifdef LV_HAVE_AVX2
+void map_avx_beta(map_gen_t * s, int16_t * output[SRSLTE_TDEC_NPAR], uint32_t long_cb);
+void map_avx_alpha(map_gen_t * s, uint32_t long_cb);
+void map_avx_gamma(map_gen_t * h, int16_t *input, int16_t *app, int16_t *parity, uint32_t cbidx, uint32_t long_cb);
+#endif
+
+
+void map_simd_beta(map_gen_t * s, int16_t * output[SRSLTE_TDEC_NPAR], uint32_t nof_cb, uint32_t long_cb)
+{
+  if (nof_cb == 1) {
+    map_sse_beta(s, output[0], long_cb);
+  } 
+#ifdef LV_HAVE_AVX2
+  else if (nof_cb == 2) {
+    map_avx_beta(s, output, long_cb);
+  }
+#endif
+}
+
+void map_simd_alpha(map_gen_t * s, uint32_t nof_cb, uint32_t long_cb)
+{
+  if (nof_cb == 1) {
+    map_sse_alpha(s, long_cb);
+  }
+#ifdef LV_HAVE_AVX2
+  else if (nof_cb == 2) {
+    map_avx_alpha(s, long_cb);
+  }
+#endif
+}
+void map_simd_gamma(map_gen_t * s, int16_t *input, int16_t *app, int16_t *parity, uint32_t cbidx, uint32_t nof_cb, uint32_t long_cb) 
+{
+  if (nof_cb == 1) {
+    map_sse_gamma(s, input, app, parity, long_cb);
+  } 
+#ifdef LV_HAVE_AVX2
+  else if (nof_cb == 2) {
+    map_avx_gamma(s, input, app, parity, cbidx, long_cb);
+  }    
+#endif
+}
+
+/* Inititalizes constituent decoder object */
+int map_simd_init(map_gen_t * h, int max_long_cb)
+{
+  bzero(h, sizeof(map_gen_t));
+  h->alpha = srslte_vec_malloc(sizeof(int16_t) * (max_long_cb + SRSLTE_TCOD_TOTALTAIL + 1) * NUMSTATES * SRSLTE_TDEC_NPAR);
+  if (!h->alpha) {
+    perror("srslte_vec_malloc");
+    return -1;
+  }
+  h->branch = srslte_vec_malloc(sizeof(int16_t) * (max_long_cb + SRSLTE_TCOD_TOTALTAIL + 1) * NUMSTATES * SRSLTE_TDEC_NPAR);
+  if (!h->branch) {
+    perror("srslte_vec_malloc");
+    return -1;
+  }
+  h->max_long_cb = max_long_cb;
+  return 0;
+}
+
+void map_simd_free(map_gen_t * h)
+{
+  if (h->alpha) {
+    free(h->alpha);
+  }
+  if (h->branch) {
+    free(h->branch);
+  }
+  bzero(h, sizeof(map_gen_t));
+}
+
+/* Runs one instance of a decoder */
+void map_simd_dec(map_gen_t * h, int16_t * input[SRSLTE_TDEC_NPAR], int16_t *app[SRSLTE_TDEC_NPAR], int16_t * parity[SRSLTE_TDEC_NPAR], 
+                  int16_t *output[SRSLTE_TDEC_NPAR], uint32_t nof_cb, uint32_t long_cb)
+{
+ 
+  // Compute branch metrics
+  for (int i=0;i<nof_cb;i++) {
+    map_simd_gamma(h, input[i], app?app[i]:NULL, parity[i], i, nof_cb, long_cb);    
+  }
+
+  // Forward recursion
+  map_simd_alpha(h, nof_cb, long_cb);
+
+  // Backwards recursion + LLR computation
+  map_simd_beta(h, output, nof_cb, long_cb);
+}
+
+/* Initializes the turbo decoder object */
+int srslte_tdec_simd_init(srslte_tdec_simd_t * h, uint32_t max_long_cb)
+{
+  int ret = -1;
+  bzero(h, sizeof(srslte_tdec_simd_t));
+  uint32_t len = max_long_cb + SRSLTE_TCOD_TOTALTAIL;
+
+  h->max_long_cb = max_long_cb;
+
+  for (int i=0;i<SRSLTE_TDEC_NPAR;i++) {
+    h->app1[i] = srslte_vec_malloc(sizeof(int16_t) * len);
+    if (!h->app1[i]) {
+      perror("srslte_vec_malloc");
+      goto clean_and_exit;
+    }
+    h->app2[i] = srslte_vec_malloc(sizeof(int16_t) * len);
+    if (!h->app2[i]) {
+      perror("srslte_vec_malloc");
+      goto clean_and_exit;
+    }
+    h->ext1[i] = srslte_vec_malloc(sizeof(int16_t) * len);
+    if (!h->ext1[i]) {
+      perror("srslte_vec_malloc");
+      goto clean_and_exit;
+    }
+    h->ext2[i] = srslte_vec_malloc(sizeof(int16_t) * len);
+    if (!h->ext2[i]) {
+      perror("srslte_vec_malloc");
+      goto clean_and_exit;
+    }
+    h->syst[i] = srslte_vec_malloc(sizeof(int16_t) * len);
+    if (!h->syst[i]) {
+      perror("srslte_vec_malloc");
+      goto clean_and_exit;
+    }
+    h->parity0[i] = srslte_vec_malloc(sizeof(int16_t) * len);
+    if (!h->parity0[i]) {
+      perror("srslte_vec_malloc");
+      goto clean_and_exit;
+    }
+    h->parity1[i] = srslte_vec_malloc(sizeof(int16_t) * len);
+    if (!h->parity1[i]) {
+      perror("srslte_vec_malloc");
+      goto clean_and_exit;
+    }
+    
+  }
+
+  if (map_simd_init(&h->dec, h->max_long_cb)) {
+    goto clean_and_exit;
+  }
+
+  for (int i=0;i<SRSLTE_NOF_TC_CB_SIZES;i++) {
+    if (srslte_tc_interl_init(&h->interleaver[i], srslte_cbsegm_cbsize(i)) < 0) {
+      goto clean_and_exit;
+    }
+    srslte_tc_interl_LTE_gen(&h->interleaver[i], srslte_cbsegm_cbsize(i));
+  }
+  h->current_cbidx = -1; 
+  ret = 0;
+clean_and_exit:if (ret == -1) {
+    srslte_tdec_simd_free(h);
+  }
+  return ret;
+}
+
+void srslte_tdec_simd_free(srslte_tdec_simd_t * h)
+{
+  for (int i=0;i<SRSLTE_TDEC_NPAR;i++) {
+    if (h->app1[i]) {
+      free(h->app1[i]);
+    }
+    if (h->app2[i]) {
+      free(h->app2[i]);
+    }
+    if (h->ext1[i]) {
+      free(h->ext1[i]);
+    }
+    if (h->ext2[i]) {
+      free(h->ext2[i]);
+    }
+    if (h->syst[i]) {
+      free(h->syst[i]);
+    }
+    if (h->parity0[i]) {
+      free(h->parity0[i]);
+    }
+    if (h->parity1[i]) {
+      free(h->parity1[i]);
+    }    
+  }
+
+  map_simd_free(&h->dec);
+
+  for (int i=0;i<SRSLTE_NOF_TC_CB_SIZES;i++) {
+    srslte_tc_interl_free(&h->interleaver[i]);    
+  }
+
+  bzero(h, sizeof(srslte_tdec_simd_t));
+}
+
+/* Deinterleaves the 3 streams from the input (systematic and 2 parity bits) into 
+ * 3 buffers ready to be used by compute_gamma() 
+ */
+void deinterleave_input_simd(srslte_tdec_simd_t *h, int16_t *input, uint32_t cbidx, uint32_t long_cb) {
+  uint32_t i;
+ 
+  __m128i *inputPtr = (__m128i*) input; 
+  __m128i in0, in1, in2;
+  __m128i s0, s1, s2, s;
+  __m128i p00, p01, p02, p0;
+  __m128i p10, p11, p12, p1;
+  
+  __m128i *sysPtr = (__m128i*) h->syst[cbidx]; 
+  __m128i *pa0Ptr = (__m128i*) h->parity0[cbidx]; 
+  __m128i *pa1Ptr = (__m128i*) h->parity1[cbidx]; 
+  
+  // pick bits 0, 3, 6 from 1st word
+  __m128i s0_mask = _mm_set_epi8(0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,13,12,7,6,1,0);
+  // pick bits 1, 4, 7 from 2st word
+  __m128i s1_mask = _mm_set_epi8(0xff,0xff,0xff,0xff,15,14,9,8,3,2,0xff,0xff,0xff,0xff,0xff,0xff);
+  // pick bits 2, 5 from 3rd word
+  __m128i s2_mask = _mm_set_epi8(11,10,5,4,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff);
+
+  // pick bits 1, 4, 7 from 1st word
+  __m128i p00_mask = _mm_set_epi8(0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,15,14,9,8,3,2);
+  // pick bits 2, 5, from 2st word
+  __m128i p01_mask = _mm_set_epi8(0xff,0xff,0xff,0xff,0xff,0xff,11,10,5,4,0xff,0xff,0xff,0xff,0xff,0xff);
+  // pick bits 0, 3, 6 from 3rd word
+  __m128i p02_mask = _mm_set_epi8(13,12,7,6,1,0,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff);
+  
+  // pick bits 2, 5 from 1st word
+  __m128i p10_mask = _mm_set_epi8(0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,11,10,5,4);
+  // pick bits 0, 3, 6, from 2st word
+  __m128i p11_mask = _mm_set_epi8(0xff,0xff,0xff,0xff,0xff,0xff,13,12,7,6,1,0,0xff,0xff,0xff,0xff);
+  // pick bits 1, 4, 7 from 3rd word
+  __m128i p12_mask = _mm_set_epi8(15,14,9,8,3,2,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff);
+      
+  // Split systematic and parity bits
+  for (i = 0; i < long_cb/8; i++) {
+        
+    in0 = _mm_load_si128(inputPtr); inputPtr++; 
+    in1 = _mm_load_si128(inputPtr); inputPtr++;   
+    in2 = _mm_load_si128(inputPtr); inputPtr++;
+    
+    /* Deinterleave Systematic bits */
+    s0 = _mm_shuffle_epi8(in0, s0_mask);
+    s1 = _mm_shuffle_epi8(in1, s1_mask);
+    s2 = _mm_shuffle_epi8(in2, s2_mask);    
+    s = _mm_or_si128(s0, s1);
+    s = _mm_or_si128(s, s2);
+
+    _mm_store_si128(sysPtr, s);
+    sysPtr++;
+
+    /* Deinterleave parity 0 bits */
+    p00 = _mm_shuffle_epi8(in0, p00_mask);
+    p01 = _mm_shuffle_epi8(in1, p01_mask);
+    p02 = _mm_shuffle_epi8(in2, p02_mask);    
+    p0 = _mm_or_si128(p00, p01);
+    p0 = _mm_or_si128(p0, p02);
+    
+    _mm_store_si128(pa0Ptr, p0);
+    pa0Ptr++;
+
+    /* Deinterleave parity 1 bits */
+    p10 = _mm_shuffle_epi8(in0, p10_mask);
+    p11 = _mm_shuffle_epi8(in1, p11_mask);
+    p12 = _mm_shuffle_epi8(in2, p12_mask);    
+    p1 = _mm_or_si128(p10, p11);
+    p1 = _mm_or_si128(p1, p12);
+
+    _mm_store_si128(pa1Ptr, p1);
+    pa1Ptr++;    
+
+  }
+  
+  for (i = 0; i < 3; i++) {
+    h->syst[cbidx][i+long_cb]    = input[3*long_cb + 2*i];
+    h->parity0[cbidx][i+long_cb] = input[3*long_cb + 2*i + 1];
+  }
+  for (i = 0; i < 3; i++) {
+    h->app2[cbidx][i+long_cb]    = input[3*long_cb + 6 + 2*i];
+    h->parity1[cbidx][i+long_cb] = input[3*long_cb + 6 + 2*i + 1];
+  }
+
+}
+
+/* Runs 1 turbo decoder iteration */
+void srslte_tdec_simd_iteration(srslte_tdec_simd_t * h, int16_t * input[SRSLTE_TDEC_NPAR], uint32_t nof_cb, uint32_t long_cb)
+{
+
+  if (h->current_cbidx >= 0) {
+    uint16_t *inter   = h->interleaver[h->current_cbidx].forward;
+    uint16_t *deinter = h->interleaver[h->current_cbidx].reverse;
+    
+    if (h->n_iter == 0) {
+      for (int i=0;i<nof_cb;i++) {
+        deinterleave_input_simd(h, input[i], i, long_cb);        
+      }        
+    }
+    
+    // Add apriori information to decoder 1 
+    if (h->n_iter > 0) {
+      for (int i=0;i<nof_cb;i++) {
+        srslte_vec_sub_sss(h->app1[i], h->ext1[i], h->app1[i], long_cb);
+      }
+    }
+
+    // Run MAP DEC #1
+    if (h->n_iter == 0) {
+      map_simd_dec(&h->dec, h->syst, NULL, h->parity0, h->ext1, nof_cb, long_cb);            
+    } else {
+      map_simd_dec(&h->dec, h->syst, h->app1, h->parity0, h->ext1, nof_cb, long_cb);      
+    }
+    
+    // Convert aposteriori information into extrinsic information    
+    if (h->n_iter > 0) {
+      for (int i=0;i<nof_cb;i++) {
+        srslte_vec_sub_sss(h->ext1[i], h->app1[i], h->ext1[i], long_cb);
+      }
+    }
+    
+    // Interleave extrinsic output of DEC1 to form apriori info for decoder 2
+    for (int i=0;i<nof_cb;i++) {
+      srslte_vec_lut_sss(h->ext1[i], deinter, h->app2[i], long_cb);
+    }
+
+    // Run MAP DEC #2. 2nd decoder uses apriori information as systematic bits
+    map_simd_dec(&h->dec, h->app2, NULL, h->parity1, h->ext2, nof_cb, long_cb);
+
+    // Deinterleaved extrinsic bits become apriori info for decoder 1 
+    for (int i=0;i<nof_cb;i++) {
+      srslte_vec_lut_sss(h->ext2[i], inter, h->app1[i], long_cb);
+    }
+
+    h->n_iter++;
+  } else {
+    fprintf(stderr, "Error CB index not set (call srslte_tdec_simd_reset() first\n");    
+  }
+}
+
+/* Resets the decoder and sets the codeblock length */
+int srslte_tdec_simd_reset(srslte_tdec_simd_t * h, uint32_t long_cb)
+{
+  if (long_cb > h->max_long_cb) {
+    fprintf(stderr, "TDEC was initialized for max_long_cb=%d\n",
+            h->max_long_cb);
+    return -1;
+  }
+  h->n_iter = 0; 
+  h->current_cbidx = srslte_cbsegm_cbindex(long_cb);
+  if (h->current_cbidx < 0) {
+    fprintf(stderr, "Invalid CB length %d\n", long_cb);
+    return -1; 
+  }
+  return 0;
+}
+
+void tdec_simd_decision(srslte_tdec_simd_t * h, uint8_t *output, uint32_t cbidx, uint32_t long_cb)
+{
+  __m128i zero     = _mm_set1_epi16(0);
+  __m128i lsb_mask = _mm_set1_epi16(1);
+  
+  __m128i *appPtr = (__m128i*) h->app1[cbidx];
+  __m128i *outPtr = (__m128i*) output;
+  __m128i ap, out, out0, out1; 
+    
+  for (uint32_t i = 0; i < long_cb/16; i++) {
+    ap   = _mm_load_si128(appPtr); appPtr++;    
+    out0 = _mm_and_si128(_mm_cmpgt_epi16(ap, zero), lsb_mask);
+    ap   = _mm_load_si128(appPtr); appPtr++;
+    out1 = _mm_and_si128(_mm_cmpgt_epi16(ap, zero), lsb_mask);
+    
+    out  = _mm_packs_epi16(out0, out1);
+    _mm_store_si128(outPtr, out);
+    outPtr++;
+  }
+  if (long_cb%16) {
+    for (int i=0;i<8;i++) {
+      output[long_cb-8+i] = h->app1[cbidx][long_cb-8+i]>0?1:0;
+    }
+  }
+}
+
+void srslte_tdec_simd_decision(srslte_tdec_simd_t * h, uint8_t *output[SRSLTE_TDEC_NPAR], uint32_t nof_cb, uint32_t long_cb)
+{
+  for (int i=0;i<nof_cb;i++) {
+    tdec_simd_decision(h, output[i], i, long_cb);
+  }
+}
+
+void tdec_simd_decision_byte(srslte_tdec_simd_t * h, uint8_t *output, uint32_t cbidx, uint32_t long_cb)
+{
+  uint8_t mask[8] = {0x80, 0x40, 0x20, 0x10, 0x8, 0x4, 0x2, 0x1};
+  
+  // long_cb is always byte aligned
+  for (uint32_t i = 0; i < long_cb/8; i++) {
+    uint8_t out0 = h->app1[cbidx][8*i+0]>0?mask[0]:0;
+    uint8_t out1 = h->app1[cbidx][8*i+1]>0?mask[1]:0;
+    uint8_t out2 = h->app1[cbidx][8*i+2]>0?mask[2]:0;
+    uint8_t out3 = h->app1[cbidx][8*i+3]>0?mask[3]:0;
+    uint8_t out4 = h->app1[cbidx][8*i+4]>0?mask[4]:0;
+    uint8_t out5 = h->app1[cbidx][8*i+5]>0?mask[5]:0;
+    uint8_t out6 = h->app1[cbidx][8*i+6]>0?mask[6]:0;
+    uint8_t out7 = h->app1[cbidx][8*i+7]>0?mask[7]:0;
+    
+    output[i] = out0 | out1 | out2 | out3 | out4 | out5 | out6 | out7;
+    
+    //if (i<10) {
+    //  printf("output[%d]=%d\n",i,output[i]);
+    //}
+  }
+}
+
+void srslte_tdec_simd_decision_byte(srslte_tdec_simd_t * h, uint8_t *output[SRSLTE_TDEC_NPAR], uint32_t nof_cb, uint32_t long_cb)
+{
+  for (int i=0;i<nof_cb;i++) {
+    tdec_simd_decision_byte(h, output[i], i, long_cb);
+  }
+}
+
+
+/* Runs nof_iterations iterations and decides the output bits */
+int srslte_tdec_simd_run_all(srslte_tdec_simd_t * h, int16_t * input[SRSLTE_TDEC_NPAR], uint8_t *output[SRSLTE_TDEC_NPAR],
+                            uint32_t nof_iterations, uint32_t nof_cb, uint32_t long_cb)
+{
+  if (srslte_tdec_simd_reset(h, long_cb)) {
+    return SRSLTE_ERROR; 
+  }
+
+  do {
+    srslte_tdec_simd_iteration(h, input, nof_cb, long_cb);
+  } while (h->n_iter < nof_iterations);
+
+  srslte_tdec_simd_decision_byte(h, output, nof_cb, long_cb);
+  
+  return SRSLTE_SUCCESS;
+}
+
+#endif
+
+
--- a/lib/src/phy/fec/turbodecoder_sse.c
+++ b/lib/src/phy/fec/turbodecoder_sse.c
@ -52,6 +52,17 @@

 #ifdef LV_HAVE_SSE

+/*
+static void print_128i(__m128i x) {
+  int16_t *s = (int16_t*) &x; 
+  printf("[%d", s[0]);
+  for (int i=1;i<8;i++) {
+    printf(",%d", s[i]);
+  }
+  printf("]\n");
+}
+*/
+
 /* Computes the horizontal MAX from 8 16-bit integers using the minpos_epu16 SSE4.1 instruction */
 static inline int16_t hMax(__m128i buffer)
 {
@ -126,7 +137,8 @@ void map_gen_beta(map_gen_t * s, int16_t * output, uint32_t long_cb)
    alpha_k = _mm_load_si128(alphaPtr);\
    alphaPtr--;\
    bp = _mm_add_epi16(bp, alpha_k);\
-    bn = _mm_add_epi16(bn, alpha_k); output[k-d] = hMax(bn) - hMax(bp);
+    bn = _mm_add_epi16(bn, alpha_k);\
+    output[k-d] = hMax(bn) - hMax(bp);

  /* The tail does not require to load alpha or produce outputs. Only update 
   * beta metrics accordingly */
@ -134,7 +146,6 @@ void map_gen_beta(map_gen_t * s, int16_t * output, uint32_t long_cb)
    int16_t g0 = s->branch[2*k];
    int16_t g1 = s->branch[2*k+1];
    g = _mm_set_epi16(g1, g0, g0, g1, g1, g0, g0, g1);
-  
    BETA_STEP(g);
  }