diff --git a/lib/include/srslte/phy/fec/ldpc/base_graph.h b/lib/include/srslte/phy/fec/ldpc/base_graph.h
index c068dd662..2ccc61bc9 100644
--- a/lib/include/srslte/phy/fec/ldpc/base_graph.h
+++ b/lib/include/srslte/phy/fec/ldpc/base_graph.h
@@ -34,7 +34,9 @@
 
 #define SRSLTE_LDPC_BG1_MAX_LEN_CB 8448 /*!< \brief Maximum code block size for LDPC BG1 */
 #define SRSLTE_LDPC_BG2_MAX_LEN_CB 3840 /*!< \brief Maximum code block size for LDPC BG2 */
-#define SRSLTE_LDPC_MAX_LEN_CB SRSLTE_MAX(SRSLTE_LDPC_BG1_MAX_LEN_CB, SRSLTE_LDPC_BG2_MAX_LEN_CB)
+#define SRSLTE_LDPC_MAX_LEN_CB                                                                                         \
+  SRSLTE_MAX(SRSLTE_LDPC_BG1_MAX_LEN_CB,                                                                               \
+             SRSLTE_LDPC_BG2_MAX_LEN_CB) /*!< \brief Maximum code block size for LDPC BG1 or BG2 */
 
 #define BG1Nfull 68 /*!< \brief Number of variable nodes in BG1. */
 #define BG1N 66     /*!< \brief Number of variable nodes in BG1 after puncturing. */
diff --git a/lib/include/srslte/phy/fec/ldpc/ldpc_rm.h b/lib/include/srslte/phy/fec/ldpc/ldpc_rm.h
index 7cd84ab03..cf5b87c9e 100644
--- a/lib/include/srslte/phy/fec/ldpc/ldpc_rm.h
+++ b/lib/include/srslte/phy/fec/ldpc/ldpc_rm.h
@@ -133,8 +133,7 @@ SRSLTE_API int srslte_ldpc_rm_rx_init_s(srslte_ldpc_rm_t* q);
  * \param[in] rv          Redundancy version 0,1,2,3.
  * \param[in] mod_type    Modulation type.
  * \param[in] Nref        Size of limited buffer.
- * \param[out] output The rate-dematched codeword resulting from the rate-dematching
- *    operation.
+ * \return An integer: 0 if the function executes correctly, -1 otherwise.
  */
 SRSLTE_API int srslte_ldpc_rm_rx_s(srslte_ldpc_rm_t*        q,
                                    const int16_t*           input,
diff --git a/lib/src/phy/fec/ldpc/ldpc_dec_all.h b/lib/src/phy/fec/ldpc/ldpc_dec_all.h
index 145f89db9..9387b8f57 100644
--- a/lib/src/phy/fec/ldpc/ldpc_dec_all.h
+++ b/lib/src/phy/fec/ldpc/ldpc_dec_all.h
@@ -366,7 +366,8 @@ int update_ldpc_soft_bits_c_avx2(void* p, int i_layer, const int8_t (*these_var_
 
 /*!
  * Returns the decoded message (hard bits) from the current soft bits (optimized 8-bit version, LS <= \ref
- * SRSLTE_AVX2_B_SIZE). \param[in]  p       A pointer to the decoder registers (an ldpc_regs_c_avx2 structure).
+ * SRSLTE_AVX2_B_SIZE).
+ * \param[in]  p       A pointer to the decoder registers (an ldpc_regs_c_avx2 structure).
  * \param[out] message A pointer to the decoded message.
  * \param[in]  liftK   The length of the decoded message.
  * \return An integer: 0 if the function executes correctly, -1 otherwise.
@@ -375,7 +376,8 @@ int extract_ldpc_message_c_avx2(void* p, uint8_t* message, uint16_t liftK);
 
 /*!
  * Creates the registers used by the optimized 8-bit-based implementation of the LDPC decoder (LS > \ref
- * SRSLTE_AVX2_B_SIZE). \param[in] bgN          Codeword length. \param[in] bgM          Number of check nodes.
+ * SRSLTE_AVX2_B_SIZE).
+ * \param[in] bgN          Codeword length. \param[in] bgM          Number of check nodes.
  * \param[in] ls           Lifting size. \param[in] scaling_fctr Scaling factor of the normalized min-sum algorithm.
  * \return A pointer to the created registers (an ldpc_regs_c_avx2long structure).
  */
diff --git a/lib/src/phy/fec/ldpc/ldpc_dec_c_avx2.c b/lib/src/phy/fec/ldpc/ldpc_dec_c_avx2.c
index 1bb973feb..e7ac42de3 100644
--- a/lib/src/phy/fec/ldpc/ldpc_dec_c_avx2.c
+++ b/lib/src/phy/fec/ldpc/ldpc_dec_c_avx2.c
@@ -180,7 +180,8 @@ void* create_ldpc_dec_c_avx2(uint8_t bgN, uint8_t bgM, uint16_t ls, float scalin
   vp->hrr = hrr;
   vp->ls  = ls;
 
-  vp->scaling_fctr = _mm256_set1_epi16((uint16_t)(scaling_fctr * F2I));
+  // correction > 1/2^16 to compensate the scaling error (2^16-1)/2^16 incurred in _mm256_scalei_epi8
+  vp->scaling_fctr = _mm256_set1_epi16((uint16_t)((scaling_fctr + 0.00001525879) * F2I));
 
   return vp;
 }
diff --git a/lib/src/phy/fec/ldpc/ldpc_dec_c_avx2_flood.c b/lib/src/phy/fec/ldpc/ldpc_dec_c_avx2_flood.c
index 0ee5f1fdb..0975891f8 100644
--- a/lib/src/phy/fec/ldpc/ldpc_dec_c_avx2_flood.c
+++ b/lib/src/phy/fec/ldpc/ldpc_dec_c_avx2_flood.c
@@ -190,7 +190,8 @@ void* create_ldpc_dec_c_avx2_flood(uint8_t bgN, uint8_t bgM, uint16_t ls, float
   vp->hrr = hrr;
   vp->ls  = ls;
 
-  vp->scaling_fctr = _mm256_set1_epi16((uint16_t)(scaling_fctr * F2I));
+  // correction > 1/2^16 to compensate the scaling error (2^16-1)/2^16 incurred in _mm256_scalei_epi8
+  vp->scaling_fctr = _mm256_set1_epi16((uint16_t)((scaling_fctr + 0.00001525879) * F2I));
 
   return vp;
 }
diff --git a/lib/src/phy/fec/ldpc/ldpc_dec_c_avx2long.c b/lib/src/phy/fec/ldpc/ldpc_dec_c_avx2long.c
index 9626ae3a0..9ccc203d6 100644
--- a/lib/src/phy/fec/ldpc/ldpc_dec_c_avx2long.c
+++ b/lib/src/phy/fec/ldpc/ldpc_dec_c_avx2long.c
@@ -227,7 +227,8 @@ void* create_ldpc_dec_c_avx2long(uint8_t bgN, uint8_t bgM, uint16_t ls, float sc
 
   vp->n_subnodes = n_subnodes;
 
-  vp->scaling_fctr = _mm256_set1_epi16((uint16_t)(scaling_fctr * F2I));
+  // correction > 1/2^16 to compensate the scaling error (2^16-1)/2^16 incurred in _mm256_scalei_epi8
+  vp->scaling_fctr = _mm256_set1_epi16((uint16_t)((scaling_fctr + 0.00001525879) * F2I));
 
   return vp;
 }
diff --git a/lib/src/phy/fec/ldpc/ldpc_dec_c_avx2long_flood.c b/lib/src/phy/fec/ldpc/ldpc_dec_c_avx2long_flood.c
index d50389b36..920b853ba 100644
--- a/lib/src/phy/fec/ldpc/ldpc_dec_c_avx2long_flood.c
+++ b/lib/src/phy/fec/ldpc/ldpc_dec_c_avx2long_flood.c
@@ -240,7 +240,8 @@ void* create_ldpc_dec_c_avx2long_flood(uint8_t bgN, uint8_t bgM, uint16_t ls, fl
 
   vp->n_subnodes = n_subnodes;
 
-  vp->scaling_fctr = _mm256_set1_epi16((uint16_t)(scaling_fctr * F2I));
+  // correction > 1/2^16 to compensate the scaling error (2^16-1)/2^16 incurred in _mm256_scalei_epi8
+  vp->scaling_fctr = _mm256_set1_epi16((uint16_t)((scaling_fctr + 0.00001525879) * F2I));
 
   return vp;
 }
diff --git a/lib/src/phy/fec/ldpc/ldpc_enc_all.h b/lib/src/phy/fec/ldpc/ldpc_enc_all.h
index d4c5c969c..38ce30a60 100644
--- a/lib/src/phy/fec/ldpc/ldpc_enc_all.h
+++ b/lib/src/phy/fec/ldpc/ldpc_enc_all.h
@@ -108,17 +108,17 @@ void preprocess_systematic_bits_avx2(srslte_ldpc_encoder_t* q);
 void encode_high_rate_case1_avx2(void* o);
 
 /*! Computes the high-rate parity bits for BG1 and ls_index in {6} (SIMD-optimized version, LS <= \ref
- * SRSLTE_AVX2_B_SIZE). \param[in,out]  q  A pointer to an encoder.
+ * SRSLTE_AVX2_B_SIZE). \param[in,out]  o  A pointer to an encoder.
  */
 void encode_high_rate_case2_avx2(void* o);
 
 /*! Computes the high-rate parity bits for BG2 and ls_index in {0, 1, 2, 4, 5, 6} (SIMD-optimized version, LS <= \ref
- * SRSLTE_AVX2_B_SIZE). \param[in,out]  q  A pointer to an encoder.
+ * SRSLTE_AVX2_B_SIZE). \param[in,out]  o  A pointer to an encoder.
  */
 void encode_high_rate_case3_avx2(void* o);
 
 /*! Computes the high-rate parity bits for BG2 and ls_index in {3, 7} (SIMD-optimized version, LS <= \ref
- * SRSLTE_AVX2_B_SIZE). \param[in,out]  q  A pointer to an encoder.
+ * SRSLTE_AVX2_B_SIZE). \param[in,out]  o  A pointer to an encoder.
  */
 void encode_high_rate_case4_avx2(void* o);
 
diff --git a/lib/src/phy/fec/ldpc/ldpc_enc_c.c b/lib/src/phy/fec/ldpc/ldpc_enc_c.c
index 1650bdbc4..0b3c8a427 100644
--- a/lib/src/phy/fec/ldpc/ldpc_enc_c.c
+++ b/lib/src/phy/fec/ldpc/ldpc_enc_c.c
@@ -126,9 +126,10 @@ void encode_high_rate_case1(void* q_, uint8_t* output)
   }
 }
 
-void encode_high_rate_case2(srslte_ldpc_encoder_t* q, uint8_t* output)
+void encode_high_rate_case2(void* q_, uint8_t* output)
 {
-  uint8_t(*aux)[q->ls] = q->ptr;
+  srslte_ldpc_encoder_t* q = (srslte_ldpc_encoder_t*)q_;
+  uint8_t(*aux)[q->ls]     = q->ptr;
 
   int ls = q->ls;
   int i  = 0;
@@ -155,9 +156,10 @@ void encode_high_rate_case2(srslte_ldpc_encoder_t* q, uint8_t* output)
   }
 }
 
-void encode_high_rate_case3(srslte_ldpc_encoder_t* q, uint8_t* output)
+void encode_high_rate_case3(void* q_, uint8_t* output)
 {
-  uint8_t(*aux)[q->ls] = q->ptr;
+  srslte_ldpc_encoder_t* q = (srslte_ldpc_encoder_t*)q_;
+  uint8_t(*aux)[q->ls]     = q->ptr;
 
   int ls = q->ls;
   int i  = 0;
@@ -184,9 +186,10 @@ void encode_high_rate_case3(srslte_ldpc_encoder_t* q, uint8_t* output)
   }
 }
 
-void encode_high_rate_case4(srslte_ldpc_encoder_t* q, uint8_t* output)
+void encode_high_rate_case4(void* q_, uint8_t* output)
 {
-  uint8_t(*aux)[q->ls] = q->ptr;
+  srslte_ldpc_encoder_t* q = (srslte_ldpc_encoder_t*)q_;
+  uint8_t(*aux)[q->ls]     = q->ptr;
 
   int ls = q->ls;
   int k  = 0;
diff --git a/lib/src/phy/fec/ldpc/test/ldpc_rm_chain_test.c b/lib/src/phy/fec/ldpc/test/ldpc_rm_chain_test.c
index 3c648466b..1cf7f3340 100644
--- a/lib/src/phy/fec/ldpc/test/ldpc_rm_chain_test.c
+++ b/lib/src/phy/fec/ldpc/test/ldpc_rm_chain_test.c
@@ -35,6 +35,7 @@
  *  - **-B \<number\>** Number of codewords in a batch.(Default 100).
  *  - **-N \<number\>** Max number of simulated batches.(Default 10000).
  *  - **-E \<number\>** Minimum number of errors for a significant simulation.(Default 100).
+ *  - **-w \<number\>** Rate-matching aware encoding/decoding (0 or 1).(Default 1).
  */
 
 #include <stdio.h>
@@ -54,11 +55,12 @@
 static srslte_basegraph_t base_graph = BG1;     /*!< \brief Base Graph (BG1 or BG2). */
 static uint32_t           lift_size  = 2;       /*!< \brief Lifting Size. */
 static uint32_t           rm_length  = 0;       /*!< \brief Codeword length after rate matching. */
-static uint32_t           F          = 22 - 5;  /*!< \brief Number of filler bits in each CBS. */
+static uint32_t           F          = 0;       /*!< \brief Number of filler bits in each CBS. */
 static uint8_t            rv         = 0;       /*!< \brief Redundancy version {0-3}. */
 static srslte_mod_t mod_type = SRSLTE_MOD_BPSK; /*!< \brief Modulation type: BPSK, QPSK, QAM16, QAM64, QAM256 = 4 */
 static uint32_t     Nref     = 0;               /*!< \brief Limited buffer size. */
 static float        snr      = 0;               /*!< \brief Signal-to-Noise Ratio [dB]. */
+static uint8_t            rm_aware = 1; /*!< \brief Flag rate matching aware encoding/decoding (1 to enable). */
 
 static int finalK = 0; /*!< \brief Number of uncoded bits (message length, including punctured and filler bits). */
 static int finalN = 0; /*!< \brief Number of coded bits (codeword length). */
@@ -74,7 +76,7 @@ static int req_errors  = 100;   /*!< \brief Minimum number of errors for a signi
 void usage(char* prog)
 {
 
-  printf("Usage: %s [-bX] [-lX] [-eX] [-fX] [-rX] [-mX] [-MX] [sX]\n", prog);
+  printf("Usage: %s [-bX] [-lX] [-eX] [-fX] [-rX] [-mX] [-MX] [-wX] [-sX]\n", prog);
   printf("\t-b Base Graph [(1 or 2) Default %d]\n", base_graph + 1);
   printf("\t-l Lifting Size [Default %d]\n", lift_size);
   printf("\t-e Word length after rate matching [Default %d (no rate matching i.e. E = N - F)]\n", rm_length);
@@ -86,6 +88,7 @@ void usage(char* prog)
   printf("\t-B Number of codewords in a batch. [Default %d]\n", batch_size);
   printf("\t-N Max number of simulated batches. [Default %d]\n", max_n_batch);
   printf("\t-E Minimum number of errors for a significant simulation. [Default %d]\n", req_errors);
+  printf("\t-w Rate-matching aware encoding/decoding [(0 or 1) Default = %d (normal buffer Nref = N)]\n", rm_aware);
 }
 
 /*!
@@ -94,7 +97,7 @@ void usage(char* prog)
 void parse_args(int argc, char** argv)
 {
   int opt = 0;
-  while ((opt = getopt(argc, argv, "b:l:e:f:r:m:M:s:B:N:E:")) != -1) {
+  while ((opt = getopt(argc, argv, "b:l:e:f:r:m:w:M:s:B:N:E:")) != -1) {
     switch (opt) {
       case 'b':
         base_graph = (int)strtol(optarg, NULL, 10) - 1;
@@ -117,6 +120,9 @@ void parse_args(int argc, char** argv)
       case 'M':
         Nref = (uint32_t)strtol(optarg, NULL, 10);
         break;
+      case 'w':
+        rm_aware = (uint8_t)strtol(optarg, NULL, 10);
+        break;
       case 's':
         snr = (float)strtod(optarg, NULL);
         break;
@@ -338,6 +344,15 @@ int main(int argc, char** argv)
   int8_t inf7   = (1U << 6U) - 1;
   float  gain_c = inf7 * noise_std_dev / 8 / (1 / noise_std_dev + 2);
 
+  // RM aware LDPC Encoding
+  // compute the number of symbols that we need to encode/decode: at least (rm_length + F) if rm_length + F < N, N otherwise
+  unsigned int n_useful_symbols_enc = finalN;
+  unsigned int n_useful_symbols_dec = finalN;
+  if (rm_aware > 0) {
+    n_useful_symbols_enc = (rm_length + F); // if n_useful_symbols > N, the encoder sets n_useful_symbols = finalN
+    n_useful_symbols_dec = (rm_length + F); // if n_useful_symbols > N, the decoder sets n_useful_symbols = finalN
+  }
+
   printf("\nBatch:\n  ");
 
   while (((n_error_words_f < req_errors) || (n_error_words_s < req_errors) || (n_error_words_c < req_errors)) &&
@@ -363,7 +378,8 @@ int main(int argc, char** argv)
 
     gettimeofday(&t[1], NULL);
     for (j = 0; j < batch_size; j++) {
-      srslte_ldpc_encoder_encode(&encoder, messages_true + j * finalK, codewords + j * finalN, finalK);
+      srslte_ldpc_encoder_encode_rm(
+          &encoder, messages_true + j * finalK, codewords + j * finalN, finalK, n_useful_symbols_enc);
     }
     gettimeofday(&t[2], NULL);
     get_time_interval(t);
@@ -425,12 +441,11 @@ int main(int argc, char** argv)
     // Recover messages
     gettimeofday(&t[1], NULL);
     for (j = 0; j < batch_size; j++) {
-      srslte_ldpc_decoder_decode_f(&decoder_f, symbols + j * finalN, messages_sim_f + j * finalK, finalN);
+      srslte_ldpc_decoder_decode_f(&decoder_f, symbols + j * finalN, messages_sim_f + j * finalK, n_useful_symbols_dec);
     }
     gettimeofday(&t[2], NULL);
     get_time_interval(t);
     elapsed_time_dec_f += t[0].tv_sec + 1e-6 * t[0].tv_usec;
-
     for (i = 0; i < batch_size; i++) {
       for (j = 0; j < finalK; j++) {
         i_bit = i * finalK + j;
@@ -465,7 +480,8 @@ int main(int argc, char** argv)
     // Recover messages
     gettimeofday(&t[1], NULL);
     for (j = 0; j < batch_size; j++) {
-      srslte_ldpc_decoder_decode_s(&decoder_s, symbols_s + j * finalN, messages_sim_s + j * finalK, finalN);
+      srslte_ldpc_decoder_decode_s(
+          &decoder_s, symbols_s + j * finalN, messages_sim_s + j * finalK, n_useful_symbols_dec);
     }
     gettimeofday(&t[2], NULL);
     get_time_interval(t);
@@ -504,7 +520,8 @@ int main(int argc, char** argv)
     // Recover messages
     gettimeofday(&t[1], NULL);
     for (j = 0; j < batch_size; j++) {
-      srslte_ldpc_decoder_decode_rm_c(&decoder_c, symbols_c + j * finalN, messages_sim_c + j * finalK, finalN);
+      srslte_ldpc_decoder_decode_rm_c(
+          &decoder_c, symbols_c + j * finalN, messages_sim_c + j * finalK, n_useful_symbols_dec);
     }
     gettimeofday(&t[2], NULL);
     get_time_interval(t);
@@ -526,7 +543,7 @@ int main(int argc, char** argv)
     gettimeofday(&t[1], NULL);
     for (j = 0; j < batch_size; j++) {
       srslte_ldpc_decoder_decode_rm_c(
-          &decoder_c_flood, symbols_c + j * finalN, messages_sim_c_flood + j * finalK, finalN);
+          &decoder_c_flood, symbols_c + j * finalN, messages_sim_c_flood + j * finalK, n_useful_symbols_dec);
     }
     gettimeofday(&t[2], NULL);
     get_time_interval(t);
@@ -548,7 +565,8 @@ int main(int argc, char** argv)
     // Recover messages
     gettimeofday(&t[1], NULL);
     for (j = 0; j < batch_size; j++) {
-      srslte_ldpc_decoder_decode_rm_c(&decoder_avx, symbols_c + j * finalN, messages_sim_avx + j * finalK, finalN);
+      srslte_ldpc_decoder_decode_rm_c(
+          &decoder_avx, symbols_c + j * finalN, messages_sim_avx + j * finalK, n_useful_symbols_dec);
     }
     gettimeofday(&t[2], NULL);
     get_time_interval(t);
@@ -570,7 +588,7 @@ int main(int argc, char** argv)
     gettimeofday(&t[1], NULL);
     for (j = 0; j < batch_size; j++) {
       srslte_ldpc_decoder_decode_rm_c(
-          &decoder_avx_flood, symbols_c + j * finalN, messages_sim_avx_flood + j * finalK, finalN);
+          &decoder_avx_flood, symbols_c + j * finalN, messages_sim_avx_flood + j * finalK, n_useful_symbols_dec);
     }
     gettimeofday(&t[2], NULL);
     get_time_interval(t);