Added AVX512 LDPC

master
Xavier Arteaga 4 years ago committed by Xavier Arteaga
parent 2ed8dceddf
commit d9805f7ba7

@ -36,6 +36,9 @@ typedef enum {
SRSLTE_LDPC_DECODER_C_AVX2, /*!< \brief %Decoder working with 8-bit integer-valued LLRs (AVX2 version). */
SRSLTE_LDPC_DECODER_C_AVX2_FLOOD, /*!< \brief %Decoder working with 8-bit integer-valued LLRs, flooded scheduling
(AVX2 version). */
SRSLTE_LDPC_DECODER_C_AVX512, /*!< \brief %Decoder working with 8-bit integer-valued LLRs (AVX512 version). */
SRSLTE_LDPC_DECODER_C_AVX512_FLOOD, /*!< \brief %Decoder working with 8-bit integer-valued LLRs, flooded scheduling
(AVX512 version). */
} srslte_ldpc_decoder_type_t;
/*!

@ -13,7 +13,7 @@
/*!
* \file ldpc_encoder.h
* \brief Declaration of the LDPC encoder.
* \author David Gregoratti
* \author David Gregoratti and Jesus Gomez
* \date 2020
*
* \copyright Software Radio Systems Limited
@ -33,6 +33,9 @@ typedef enum SRSLTE_API {
#if LV_HAVE_AVX2
SRSLTE_LDPC_ENCODER_AVX2, /*!< \brief SIMD-optimized encoder. */
#endif // LV_HAVE_AVX2
#if LV_HAVE_AVX512
SRSLTE_LDPC_ENCODER_AVX512, /*!< \brief SIMD-optimized encoder. */
#endif // LV_HAVE_AVX512
} srslte_ldpc_encoder_type_t;
/*!
@ -56,6 +59,8 @@ typedef struct SRSLTE_API {
void (*encode_high_rate)(void*, uint8_t*);
/*! \brief Pointer to the encoder for the high-rate region (SIMD-optimized version). */
void (*encode_high_rate_avx2)(void*);
/*! \brief Pointer to the encoder for the high-rate region (SIMD-AVX512-optimized version). */
void (*encode_high_rate_avx512)(void*);
} srslte_ldpc_encoder_t;

@ -17,7 +17,17 @@ if (HAVE_AVX2)
)
endif (HAVE_AVX2)
set(FEC_SOURCES ${FEC_SOURCES} ${AVX2_SOURCES}
if (HAVE_AVX512)
set(AVX512_SOURCES
ldpc/ldpc_dec_c_avx512.c
ldpc/ldpc_dec_c_avx512long.c
ldpc/ldpc_dec_c_avx512long_flood.c
ldpc/ldpc_enc_avx512.c
ldpc/ldpc_enc_avx512long.c
)
endif (HAVE_AVX512)
set(FEC_SOURCES ${FEC_SOURCES} ${AVX2_SOURCES} ${AVX512_SOURCES}
ldpc/base_graph.c
ldpc/ldpc_dec_f.c
ldpc/ldpc_dec_s.c

@ -0,0 +1,106 @@
/**
*
* \section COPYRIGHT
*
* Copyright 2013-2020 Software Radio Systems Limited
*
* By using this file, you agree to the terms and conditions set
* forth in the LICENSE file which can be found at the top level of
* the distribution.
*
*/
/*!
* \file ldpc_avx512_consts.h
* \brief Declaration of constants and masks for the AVX512-based implementation
* of the LDPC encoder and decoder.
*
* \author Jesus Gomez
* \date 2020
*
* \copyright Software Radio Systems Limited
*
*/
#ifndef LDPC_AVX512_CONSTS_H
#define LDPC_AVX512_CONSTS_H
#include <immintrin.h>
#include "../utils_avx512.h"
/*!
* \brief Packed 8-bit zeros.
*/
static const __m512i _mm512_zero_epi8 = {0, 0, 0, 0, 0, 0, 0, 0}; // VECTOR OF 8 0's, is each occupying 64 bits
/*!
* \brief Packed 8-bit ones. // there are 16 x 4 bits = 64 for a LongLong (LL)
*/
static const __m512i _mm512_one_epi8 = {0x0101010101010101LL,
0x0101010101010101LL,
0x0101010101010101LL,
0x0101010101010101LL,
0x0101010101010101LL,
0x0101010101010101LL,
0x0101010101010101LL,
0x0101010101010101LL};
/*!
* \brief Packed 8-bit 127 (that is \f$2^7 - 1\f$, or 0111 1111).
*/
static const __m512i _mm512_infty8_epi8 = {0x7F7F7F7F7F7F7F7FLL,
0x7F7F7F7F7F7F7F7FLL,
0x7F7F7F7F7F7F7F7FLL,
0x7F7F7F7F7F7F7F7FLL,
0x7F7F7F7F7F7F7F7FLL,
0x7F7F7F7F7F7F7F7FLL,
0x7F7F7F7F7F7F7F7FLL,
0x7F7F7F7F7F7F7F7FLL};
/*!
* \brief Packed 8-bit --127 (that is \f$-2^7 + 1\f$, i.e. 1000 0001).
*/
static const __m512i _mm512_neg_infty8_epi8 = {0x8181818181818181LL, // NOLINT
0x8181818181818181LL, // NOLINT
0x8181818181818181LL, // NOLINT
0x8181818181818181LL, // NOLINT
0x8181818181818181LL, // NOLINT
0x8181818181818181LL, // NOLINT
0x8181818181818181LL, // NOLINT
0x8181818181818181LL}; // NOLINT
/*!
* \brief Packed 8-bit 63 (that is \f$2^6 - 1\f$).
*/
static const __m512i _mm512_infty7_epi8 = {0x3F3F3F3F3F3F3F3FLL,
0x3F3F3F3F3F3F3F3FLL,
0x3F3F3F3F3F3F3F3FLL,
0x3F3F3F3F3F3F3F3FLL,
0x3F3F3F3F3F3F3F3FLL,
0x3F3F3F3F3F3F3F3FLL,
0x3F3F3F3F3F3F3F3FLL,
0x3F3F3F3F3F3F3F3FLL};
/*!
* \brief Packed 8-bit --63 (that is \f$-2^6 + 1\f$).
*/
static const __m512i _mm512_neg_infty7_epi8 = {0xC1C1C1C1C1C1C1C1LL, // NOLINT
0xC1C1C1C1C1C1C1C1LL, // NOLINT
0xC1C1C1C1C1C1C1C1LL, // NOLINT
0xC1C1C1C1C1C1C1C1LL, // NOLINT
0xC1C1C1C1C1C1C1C1LL, // NOLINT
0xC1C1C1C1C1C1C1C1LL, // NOLINT
0xC1C1C1C1C1C1C1C1LL, // NOLINT
0xC1C1C1C1C1C1C1C1LL}; // NOLINT
/*!
* \brief Identifies even-indexed 8-bit packets.
*/
static const __m512i _mm512_mask_even_epi8 = {0x00FF00FF00FF00FF,
0x00FF00FF00FF00FF,
0x00FF00FF00FF00FF,
0x00FF00FF00FF00FF,
0x00FF00FF00FF00FF,
0x00FF00FF00FF00FF,
0x00FF00FF00FF00FF,
0x00FF00FF00FF00FF}; // NOLINT
#endif

@ -592,4 +592,216 @@ int update_ldpc_soft_bits_c_avx2long_flood(void* p, const int8_t (*these_var_ind
*/
int extract_ldpc_message_c_avx2long_flood(void* p, uint8_t* message, uint16_t liftK);
/*!
* Creates the registers used by the optimized 8-bit-based implementation of the LDPC decoder (LS > \ref
* SRSLTE_AVX512_B_SIZE). \param[in] bgN Codeword length. \param[in] bgM Number of check nodes.
* \param[in] ls Lifting size. \param[in] scaling_fctr Scaling factor of the normalized min-sum algorithm.
* \return A pointer to the created registers (an ldpc_regs_c_avx512long structure).
*/
void* create_ldpc_dec_c_avx512long(uint8_t bgN, uint8_t bgM, uint16_t ls, float scaling_fctr);
/*!
* Destroys the inner registers of the optimized 8-bit integer-based LDPC decoder (LS > \ref SRSLTE_AVX512_B_SIZE).
* \param[in] p A pointer to the dismantled decoder registers (an ldpc_regs_c_avx512long structure).
*/
void delete_ldpc_dec_c_avx512long(void* p);
/*!
* Initializes the inner registers of the optimized 8-bit integer-based LDPC decoder before
* carrying out the actual decoding (LS > \ref SRSLTE_AVX512_B_SIZE).
* \param[in,out] p A pointer to the decoder registers (an ldpc_regs_c_avx512long structure).
* \param[in] llrs A pointer to the array of LLR values from the channel.
* \param[in] ls The lifting size.
* \return An integer: 0 if the function executes correctly, -1 otherwise.
*/
int init_ldpc_dec_c_avx512long(void* p, const int8_t* llrs, uint16_t ls);
/*!
* Updates the messages from variable nodes to check nodes (optimized 8-bit version, LS > \ref SRSLTE_AVX512_B_SIZE).
* \param[in,out] p A pointer to the decoder registers (an ldpc_regs_c_avx512long structure).
* \param[in] i_layer The index of the variable-to-check layer to update.
* \return An integer: 0 if the function executes correctly, -1 otherwise.
*/
int update_ldpc_var_to_check_c_avx512long(void* p, int i_layer);
/*!
* Updates the messages from check nodes to variable nodes (optimized 8-bit version, LS > \ref SRSLTE_AVX512_B_SIZE).
* \param[in,out] p A pointer to the decoder registers (an ldpc_regs_c_avx512long structure).
* \param[in] i_layer The index of the variable-to-check layer to update.
* \param[in] this_pcm A pointer to the row of the parity check matrix (i.e. base
* graph) corresponding to the selected layer.
* \param[in] these_var_indices
* Contains the indices of the variable nodes connected
* to the current layer.
* \return An integer: 0 if the function executes correctly, -1 otherwise.
*/
int update_ldpc_check_to_var_c_avx512long(void* p,
int i_layer,
const uint16_t* this_pcm,
const int8_t (*these_var_indices)[MAX_CNCT]);
/*!
* Updates the current estimate of the (soft) bits of the codeword (optimized 8-bit version, LS > \ref
* SRSLTE_AVX512_B_SIZE).
* \param[in,out] p A pointer to the decoder registers (an ldpc_regs_c_avx512long structure).
* \param[in] i_layer The index of the variable-to-check layer to update.
* \param[in] these_var_indices Contains the indices of the variable nodes connected to the current layer.
* \return An integer: 0 if the function executes correctly, -1 otherwise.
*/
int update_ldpc_soft_bits_c_avx512long(void* p, int i_layer, const int8_t (*these_var_indices)[MAX_CNCT]);
/*!
* Returns the decoded message (hard bits) from the current soft bits (optimized 8-bit version, LS > \ref
* SRSLTE_AVX512_B_SIZE).
* \param[in] p A pointer to the decoder registers (an ldpc_regs_c_avx512long structure).
* \param[out] message A pointer to the decoded message.
* \param[in] liftK The length of the decoded message.
* \return An integer: 0 if the function executes correctly, -1 otherwise.
*/
int extract_ldpc_message_c_avx512long(void* p, uint8_t* message, uint16_t liftK);
/*!
* Creates the registers used by the optimized 8-bit-based implementation of the LDPC decoder (LS <= \ref
* SRSLTE_AVX512_B_SIZE).
* \param[in] bgN Codeword length. \param[in] bgM Number of check nodes.
* \param[in] ls Lifting size. \param[in] scaling_fctr Scaling factor of the normalized min-sum algorithm.
* \return A pointer to the created registers (an ldpc_regs_c_avx512 structure).
*/
void* create_ldpc_dec_c_avx512(uint8_t bgN, uint8_t bgM, uint16_t ls, float scaling_fctr);
/*!
* Destroys the inner registers of the optimized 8-bit integer-based LDPC decoder (LS <= \ref SRSLTE_AVX512_B_SIZE).
* \param[in] p A pointer to the dismantled decoder registers (an ldpc_regs_c_avx512 structure).
*/
void delete_ldpc_dec_c_avx512(void* p);
/*!
* Initializes the inner registers of the optimized 8-bit integer-based LDPC decoder before
* carrying out the actual decoding (LS <= \ref SRSLTE_AVX512_B_SIZE).
* \param[in,out] p A pointer to the decoder registers (an ldpc_regs_c_avx512 structure).
* \param[in] llrs A pointer to the array of LLR values from the channel.
* \param[in] ls The lifting size.
* \return An integer: 0 if the function executes correctly, -1 otherwise.
*/
int init_ldpc_dec_c_avx512(void* p, const int8_t* llrs, uint16_t ls);
/*!
* Updates the messages from variable nodes to check nodes (optimized 8-bit version, LS <= \ref SRSLTE_AVX512_B_SIZE).
* \param[in,out] p A pointer to the decoder registers (an ldpc_regs_c_avx512 structure).
* \param[in] i_layer The index of the variable-to-check layer to update.
* \return An integer: 0 if the function executes correctly, -1 otherwise.
*/
int update_ldpc_var_to_check_c_avx512(void* p, int i_layer);
/*!
* Updates the messages from check nodes to variable nodes (optimized 8-bit version, LS <= \ref SRSLTE_AVX512_B_SIZE).
* \param[in,out] p A pointer to the decoder registers (an ldpc_regs_c_avx512 structure).
* \param[in] i_layer The index of the variable-to-check layer to update.
* \param[in] this_pcm A pointer to the row of the parity check matrix (i.e. base
* graph) corresponding to the selected layer.
* \param[in] these_var_indices
* Contains the indices of the variable nodes connected
* to the current layer.
* \return An integer: 0 if the function executes correctly, -1 otherwise.
*/
int update_ldpc_check_to_var_c_avx512(void* p,
int i_layer,
const uint16_t* this_pcm,
const int8_t (*these_var_indices)[MAX_CNCT]);
/*!
* Updates the current estimate of the (soft) bits of the codeword (optimized 8-bit version, LS <= \ref
* SRSLTE_AVX512_B_SIZE). \param[in,out] p A pointer to the decoder registers (an ldpc_regs_c_avx512 structure).
* \param[in] i_layer The index of the variable-to-check layer to update.
* \param[in] these_var_indices
* Contains the indices of the variable nodes connected
* to the current layer.
* \return An integer: 0 if the function executes correctly, -1 otherwise.
*/
int update_ldpc_soft_bits_c_avx512(void* p, int i_layer, const int8_t (*these_var_indices)[MAX_CNCT]);
/*!
* Returns the decoded message (hard bits) from the current soft bits (optimized 8-bit version, LS <= \ref
* SRSLTE_AVX512_B_SIZE). \param[in] p A pointer to the decoder registers (an ldpc_regs_c_avx512 structure).
* \param[out] message A pointer to the decoded message.
* \param[in] liftK The length of the decoded message.
* \return An integer: 0 if the function executes correctly, -1 otherwise.
*/
int extract_ldpc_message_c_avx512(void* p, uint8_t* message, uint16_t liftK);
/*!
* Creates the registers used by the optimized 8-bit-based implementation of the LDPC decoder
* (flooded scheduling, LS > \ref SRSLTE_AVX512_B_SIZE).
* \param[in] bgN Codeword length.
* \param[in] bgM Number of check nodes.
* \param[in] ls Lifting size.
* \param[in] scaling_fctr Scaling factor of the normalized min-sum algorithm.
* \return A pointer to the created registers (an ldpc_regs_c_avx512long_flood structure).
*/
void* create_ldpc_dec_c_avx512long_flood(uint8_t bgN, uint8_t bgM, uint16_t ls, float scaling_fctr);
/*!
* Destroys the inner registers of the optimized 8-bit integer-based LDPC decoder (flooded scheduling, LS > \ref
* SRSLTE_AVX512_B_SIZE). \param[in] p A pointer to the dismantled decoder registers (an ldpc_regs_c_avx512long_flood
* structure).
*/
void delete_ldpc_dec_c_avx512long_flood(void* p);
/*!
* Initializes the inner registers of the optimized 8-bit integer-based LDPC decoder before
* carrying out the actual decoding (flooded scheduling, LS > \ref SRSLTE_AVX512_B_SIZE).
* \param[in,out] p A pointer to the decoder registers (an ldpc_regs_c_avx512long_flood structure).
* \param[in] llrs A pointer to the array of LLR values from the channel.
* \param[in] ls The lifting size.
* \return An integer: 0 if the function executes correctly, -1 otherwise.
*/
int init_ldpc_dec_c_avx512long_flood(void* p, const int8_t* llrs, uint16_t ls);
/*!
* Updates the messages from variable nodes to check nodes (optimized 8-bit version,
* flooded scheduling, LS > \ref SRSLTE_AVX512_B_SIZE).
* \param[in,out] p A pointer to the decoder registers (an ldpc_regs_c_avx512long_flood structure).
* \param[in] i_layer The index of the variable-to-check layer to update.
* \return An integer: 0 if the function executes correctly, -1 otherwise.
*/
int update_ldpc_var_to_check_c_avx512long_flood(void* p, int i_layer);
/*!
* Updates the messages from check nodes to variable nodes (optimized 8-bit version,
* flooded scheduling, LS > \ref SRSLTE_AVX512_B_SIZE).
* \param[in,out] p A pointer to the decoder registers (an ldpc_regs_c_avx512long_flood structure).
* \param[in] i_layer The index of the variable-to-check layer to update.
* \param[in] this_pcm A pointer to the row of the parity check matrix (i.e. base
* graph) corresponding to the selected layer.
* \param[in] these_var_indices
* Contains the indices of the variable nodes connected
* to the current layer.
* \return An integer: 0 if the function executes correctly, -1 otherwise.
*/
int update_ldpc_check_to_var_c_avx512long_flood(void* p,
int i_layer,
const uint16_t* this_pcm,
const int8_t (*these_var_indices)[MAX_CNCT]);
/*!
* Updates the current estimate of the (soft) bits of the codeword (optimized 8-bit version,
* flooded scheduling, LS > \ref SRSLTE_AVX512_B_SIZE).
* \param[in,out] p A pointer to the decoder registers (an ldpc_regs_c_avx512long_flood structure).
* \param[in] these_var_indices
* Contains the indices of the variable nodes connected
* to each layer.
* \return An integer: 0 if the function executes correctly, -1 otherwise.
*/
int update_ldpc_soft_bits_c_avx512long_flood(void* p, const int8_t (*these_var_indices)[MAX_CNCT]);
/*!
* Returns the decoded message (hard bits) from the current soft bits (optimized 8-bit version,
* flooded scheduling, LS > \ref SRSLTE_AVX512_B_SIZE).
* \param[in] p A pointer to the decoder registers (an ldpc_regs_c_avx512long_flood structure).
* \param[out] message A pointer to the decoded message.
* \param[in] liftK The length of the decoded message.
* \return An integer: 0 if the function executes correctly, -1 otherwise.
*/
int extract_ldpc_message_c_avx512long_flood(void* p, uint8_t* message, uint16_t liftK);
#endif // SRSLTE_LDPCDEC_ALL_H

@ -268,15 +268,17 @@ int update_ldpc_check_to_var_c_avx2(void* p,
__m256i* this_rotated_v2c = NULL;
__m256i this_abs_v2c_epi8;
__m256i minp_v2c_epi8 = _mm256_set1_epi8(INT8_MAX);
__m256i mins_v2c_epi8 = _mm256_set1_epi8(INT8_MAX);
__m256i prod_v2c_epi8 = _mm256_setzero_si256();
__m256i mask_sign_epi8;
__m256i mask_min_epi8;
__m256i help_min_epi8;
__m256i min_ix_epi8 = _mm256_setzero_si256();
__m256i current_ix_epi8;
__m256i minp_v2c_epi8 = _mm256_set1_epi8(INT8_MAX);
__m256i mins_v2c_epi8 = _mm256_set1_epi8(INT8_MAX);
__m256i prod_v2c_epi8 = _mm256_setzero_si256();
int8_t current_var_index = (*these_var_indices)[0];
for (i = 0; (current_var_index != -1) && (i < MAX_CNCT); i++) {

@ -326,7 +326,9 @@ int update_ldpc_check_to_var_c_avx2long(void* p,
__m256i* this_rotated_v2c = NULL;
__m256i this_abs_v2c_epi8;
#ifndef IMPROVED
__m256i mask_sign_epi8;
#endif // IMPROVED
__m256i mask_min_epi8;
__m256i help_min_epi8;
__m256i current_ix_epi8;
@ -351,8 +353,12 @@ int update_ldpc_check_to_var_c_avx2long(void* p,
for (j = 0; j < vp->n_subnodes; j++) {
// mask_sign is 1 if this_v2c_epi8 is strictly negative
#ifndef IMPROVED
mask_sign_epi8 = _mm256_cmpgt_epi8(zero_epi8, this_rotated_v2c[j]);
vp->prod_v2c_epi8[j] = _mm256_xor_si256(vp->prod_v2c_epi8[j], mask_sign_epi8);
#else
vp->prod_v2c_epi8[j] = _mm256_xor_si256(vp->prod_v2c_epi8[j], this_rotated_v2c[j]);
#endif // IMPROVED
this_abs_v2c_epi8 = _mm256_abs_epi8(this_rotated_v2c[j]);
// mask_min is 1 if this_abs_v2c is strictly smaller tha minp_v2c
@ -385,8 +391,11 @@ int update_ldpc_check_to_var_c_avx2long(void* p,
for (j = 0; j < vp->n_subnodes; j++) {
// mask_sign is 1 if this_v2c_epi8 is strictly negative
#ifndef IMPROVED
final_sign_epi8 = _mm256_cmpgt_epi8(zero_epi8, this_rotated_v2c[j]);
final_sign_epi8 = _mm256_xor_si256(final_sign_epi8, vp->prod_v2c_epi8[j]);
#endif // IMPROVED
final_sign_epi8 = _mm256_xor_si256(this_rotated_v2c[j], vp->prod_v2c_epi8[j]);
current_ix_epi8 = _mm256_set1_epi8((int8_t)i);
mask_is_min_epi8 = _mm256_cmpeq_epi8(current_ix_epi8, vp->min_ix_epi8[j]);

@ -0,0 +1,463 @@
/**
*
* \section COPYRIGHT
*
* Copyright 2013-2020 Software Radio Systems Limited
*
* By using this file, you agree to the terms and conditions set
* forth in the LICENSE file which can be found at the top level of
* the distribution.
*
*/
/*!
* \file ldpc_dec_c_avx512.c
* \brief Definition LDPC decoder inner functions working
* with 8-bit integer-valued LLRs (AVX512 version, lifting size < 64).
*
* Even if the inner representation is based on 8 bits, check-to-variable and
* variable-to-check messages are actually represented with 7 bits, the
* remaining bit is used to represent infinity.
*
* \author Jesus Gomez
* \date 2021
*
* \copyright Software Radio Systems Limited
*
*/
#include <stdint.h>
#include <stdlib.h>
#include <strings.h>
#include "../utils_avx512.h"
#include "ldpc_dec_all.h"
#include "srslte/phy/fec/ldpc/base_graph.h"
#include "srslte/phy/utils/vector.h"
#ifdef LV_HAVE_AVX512
#include <immintrin.h>
#include "ldpc_avx512_consts.h"
#define F2I 65535 /*!< \brief Used for float to int conversion---float f is stored as (int)(f*F2I). */
/*!
* \brief Maximum message magnitude.
* Messages use a 7-bit quantization. Soft bits use the remaining bit to denote infinity.
*/
static const int8_t infinity7 = (1U << 6U) - 1;
/*!
* \brief Represents a node of the base factor graph.
*/
typedef union bg_node_avx512_t {
int8_t c[SRSLTE_AVX512_B_SIZE]; /*!< Each base node may contain up to \ref SRSLTE_AVX512_B_SIZE lifted nodes. */
__m512i v; /*!< All the lifted nodes of the current base node as a 512-bit line. */
} bg_node_avx512_t;
/*!
* \brief Inner registers for the LDPC decoder that works with 8-bit integer-valued LLRs.
*/
struct ldpc_regs_c_avx512 {
__m512i scaling_fctr; /*!< \brief Scaling factor for the normalized min-sum decoding algorithm. */
bg_node_avx512_t* soft_bits; /*!< \brief A-posteriori log-likelihood ratios. */
__m512i* check_to_var; /*!< \brief Check-to-variable messages. */
__m512i* var_to_check; /*!< \brief Variable-to-check messages. */
__m512i* var_to_check_to_free; /*!< \brief the Variable-to-check messages with one extra _mm512 allocated space. */
__m512i* rotated_v2c; /*!< \brief To store a rotated version of the variable-to-check messages. */
__m512i* this_c2v_epi8; /*!< \brief Helper register for the current c2v node. */
__m512i* this_c2v_epi8_to_free; /*!< \brief Helper register for the current c2v node with one extra __m512 allocated
space. */
uint16_t ls; /*!< \brief Lifting size. */
uint8_t hrr; /*!< \brief Number of variable nodes in the high-rate region (before lifting). */
uint8_t bgM; /*!< \brief Number of check nodes (before lifting). */
uint8_t bgN; /*!< \brief Number of variable nodes (before lifting). */
uint16_t finalN; /*!< \brief (bgN-2)*ls */
};
/*!
* Carries out the actual update of the variable-to-check messages. It basically
* consists in \f$ z = x - y \f$ (as vectors). However, first it checks whether
* \f$\lvert x[i] \rvert = 2^{7}-1 \f$ (our representation of infinity) to
* ensure it is properly propagated. Also, the subtraction is saturated between
* \f$- clip\f$ and \f$+ clip\f$.
* \param[in] x Minuend: array we subtract from (in practice, the soft bits).
* \param[in] y Subtrahend: array to be subtracted (in practice, the
* check-to-variable messages).
* \param[out] z Resulting difference array(in practice, the updated
* variable-to-check messages).
* \param[in] clip The saturation value.
* \param[in] len The length of the vectors.
*/
static void inner_var_to_check_c_avx512(const __m512i* x, const __m512i* y, __m512i* z, uint8_t clip, uint32_t len);
/*!
* Rotate the contents of a node towards the right by \b shift chars, that is the
* \b shift * 8 most significant bits become the least significant ones.
* \param[in] mem_addr The node to rotate.
* \param[out] out The rotated node.
* \param[in] shift The order of the rotation in number of chars.
* \param[in] ls The size of the node (lifting size).
* \param[in] n_subnodes The number of subnodes in each node.
*/
static void rotate_node_right(const uint8_t* mem_addr, __m512i* out, uint16_t this_shift, uint16_t ls);
/*!
* Scale packed 8-bit integers in \b a by the scaling factor \b sf / #F2I.
* \param[in] a Vector of packed 8-bit integers.
* \param[in] sf Scaling factor.
* \return Vector of packed 8-bit integers with the scaling result.
*/
static __m512i _mm512_scalei_epi8(__m512i a, __m512i sf);
void* create_ldpc_dec_c_avx512(uint8_t bgN, uint8_t bgM, uint16_t ls, float scaling_fctr)
{
struct ldpc_regs_c_avx512* vp = NULL;
uint8_t bgK = bgN - bgM;
uint16_t hrr = bgK + 4;
if ((vp = srslte_vec_malloc(sizeof(struct ldpc_regs_c_avx512))) == NULL) {
return NULL;
}
if ((vp->soft_bits = srslte_vec_malloc(bgN * sizeof(bg_node_avx512_t))) == NULL) {
free(vp);
return NULL;
}
if ((vp->check_to_var = srslte_vec_malloc((hrr + 1) * bgM * sizeof(__m512i))) == NULL) {
free(vp->soft_bits);
free(vp);
return NULL;
}
if ((vp->var_to_check_to_free = srslte_vec_malloc(((hrr + 1) + 2) * sizeof(__m512i))) == NULL) {
free(vp->check_to_var);
free(vp->soft_bits);
free(vp);
return NULL;
}
vp->var_to_check = &vp->var_to_check_to_free[1];
if ((vp->rotated_v2c = srslte_vec_malloc((hrr + 1) * sizeof(__m512i))) == NULL) {
free(vp->var_to_check_to_free);
free(vp->check_to_var);
free(vp->soft_bits);
free(vp);
return NULL;
}
if ((vp->this_c2v_epi8_to_free = srslte_vec_malloc((1 + 2) * sizeof(__m512i))) == NULL) {
free(vp->rotated_v2c);
free(vp->var_to_check_to_free);
free(vp->check_to_var);
free(vp->soft_bits);
free(vp);
return NULL;
}
vp->this_c2v_epi8 =
&vp->this_c2v_epi8_to_free[1]; //+1 to support reading negative position in this_c2v_epi8 at rotate_node_rigth
vp->bgM = bgM;
vp->bgN = bgN;
vp->hrr = hrr;
vp->ls = ls;
vp->finalN = (bgN - 2) * ls;
// correction > 1/16 to compensate the scaling error (2^16-1)/2^16 incurred in _mm512_scalei_epi8
vp->scaling_fctr = _mm512_set1_epi16((uint16_t)((scaling_fctr + 0.00001525879) * F2I));
return vp;
}
void delete_ldpc_dec_c_avx512(void* p)
{
struct ldpc_regs_c_avx512* vp = p;
if (vp != NULL) {
free(vp->this_c2v_epi8_to_free);
free(vp->rotated_v2c);
free(vp->var_to_check_to_free);
free(vp->check_to_var);
free(vp->soft_bits);
free(vp);
}
}
int init_ldpc_dec_c_avx512(void* p, const int8_t* llrs, uint16_t ls)
{
struct ldpc_regs_c_avx512* vp = p;
if (p == NULL) {
return -1;
}
int i = 0;
int k = 0;
// First 2 punctured bits
int ini = SRSLTE_AVX512_B_SIZE + SRSLTE_AVX512_B_SIZE;
bzero(vp->soft_bits->c, ini);
for (i = 0; i < vp->finalN; i = i + ls) {
for (k = 0; k < ls; k++) {
vp->soft_bits->c[ini + k] = llrs[i + k];
}
// this might be removed
bzero(&vp->soft_bits->c[ini + ls], (SRSLTE_AVX512_B_SIZE - ls) * sizeof(int8_t));
ini = ini + SRSLTE_AVX512_B_SIZE;
}
bzero(vp->check_to_var, (vp->hrr + 1) * vp->bgM * sizeof(__m512i));
bzero(vp->var_to_check, (vp->hrr + 1) * sizeof(__m512i));
return 0;
}
int extract_ldpc_message_c_avx512(void* p, uint8_t* message, uint16_t liftK)
{
if (p == NULL) {
return -1;
}
struct ldpc_regs_c_avx512* vp = p;
int ini = 0;
for (int i = 0; i < liftK; i = i + vp->ls) {
for (int k = 0; k < vp->ls; k++) {
message[i + k] = (vp->soft_bits->c[ini + k] < 0);
}
ini = ini + SRSLTE_AVX512_B_SIZE;
}
return 0;
}
int update_ldpc_var_to_check_c_avx512(void* p, int i_layer)
{
struct ldpc_regs_c_avx512* vp = p;
if (p == NULL) {
return -1;
}
__m512i* this_check_to_var = vp->check_to_var + i_layer * (vp->hrr + 1);
// Update the high-rate region.
inner_var_to_check_c_avx512(&(vp->soft_bits[0].v), this_check_to_var, vp->var_to_check, infinity7, vp->hrr);
if (i_layer >= 4) {
// Update the extension region.
inner_var_to_check_c_avx512(&(vp->soft_bits[0].v) + vp->hrr + i_layer - 4,
this_check_to_var + vp->hrr,
vp->var_to_check + vp->hrr,
infinity7,
1);
}
return 0;
}
int update_ldpc_check_to_var_c_avx512(void* p,
int i_layer,
const uint16_t* this_pcm,
const int8_t (*these_var_indices)[MAX_CNCT])
{
struct ldpc_regs_c_avx512* vp = p;
if (p == NULL) {
return -1;
}
int i = 0;
uint16_t shift = 0;
int i_v2c_base = 0;
__m512i* this_rotated_v2c = NULL;
__m512i this_abs_v2c_epi8;
__mmask64 mask_min_epi8;
__m512i help_min_epi8;
__m512i min_ix_epi8 = _mm512_setzero_si512();
__m512i current_ix_epi8;
__m512i minp_v2c_epi8 = _mm512_set1_epi8(INT8_MAX);
__m512i mins_v2c_epi8 = _mm512_set1_epi8(INT8_MAX);
__m512i prod_v2c_epi8 = _mm512_setzero_si512();
int8_t current_var_index = (*these_var_indices)[0];
for (i = 0; (current_var_index != -1) && (i < MAX_CNCT); i++) {
shift = this_pcm[current_var_index];
i_v2c_base = (current_var_index <= vp->hrr) ? current_var_index : vp->hrr;
current_ix_epi8 = _mm512_set1_epi8((int8_t)i);
this_rotated_v2c = vp->rotated_v2c + i;
rotate_node_right((uint8_t*)(vp->var_to_check + i_v2c_base), this_rotated_v2c, shift, vp->ls);
prod_v2c_epi8 = _mm512_xor_si512(prod_v2c_epi8, *this_rotated_v2c);
this_abs_v2c_epi8 = _mm512_abs_epi8(*this_rotated_v2c);
// mask_min is 1 if this_abs_v2c is strictly smaller tha minp_v2c
mask_min_epi8 = _mm512_cmpgt_epi8_mask(minp_v2c_epi8, this_abs_v2c_epi8);
help_min_epi8 = _mm512_mask_blend_epi8(mask_min_epi8, this_abs_v2c_epi8, minp_v2c_epi8);
minp_v2c_epi8 = _mm512_mask_blend_epi8(mask_min_epi8, minp_v2c_epi8, this_abs_v2c_epi8);
min_ix_epi8 = _mm512_mask_blend_epi8(mask_min_epi8, min_ix_epi8, current_ix_epi8);
// mask_min is 1 if this_abs_v2c is strictly smaller tha mins_v2c
mask_min_epi8 = _mm512_cmpgt_epi8_mask(mins_v2c_epi8, this_abs_v2c_epi8);
mins_v2c_epi8 = _mm512_mask_blend_epi8(mask_min_epi8, mins_v2c_epi8, help_min_epi8);
current_var_index = (*these_var_indices)[i + 1];
}
__m512i* this_check_to_var = vp->check_to_var + i_layer * (vp->hrr + 1);
current_var_index = (*these_var_indices)[0];
__mmask64 mask_is_min_epi8;
__m512i* this_c2v_epi8 = vp->this_c2v_epi8;
__m512i final_sign_epi8;
for (i = 0; (current_var_index != -1) && (i < MAX_CNCT); i++) {
shift = this_pcm[current_var_index];
i_v2c_base = (current_var_index <= vp->hrr) ? current_var_index : vp->hrr;
this_rotated_v2c = vp->rotated_v2c + i;
final_sign_epi8 = _mm512_xor_si512(*this_rotated_v2c, prod_v2c_epi8);
current_ix_epi8 = _mm512_set1_epi8((int8_t)i);
mask_is_min_epi8 = _mm512_cmpeq_epi8_mask(current_ix_epi8, min_ix_epi8);
this_c2v_epi8[0] = _mm512_mask_blend_epi8(mask_is_min_epi8, minp_v2c_epi8, mins_v2c_epi8);
this_c2v_epi8[0] = _mm512_scalei_epi8(this_c2v_epi8[0], vp->scaling_fctr);
// does *not* do anything special for signs[i] == 0, just negative / non-negative
__mmask64 negmask = _mm512_movepi8_mask(final_sign_epi8); // transform final_sing_epi8 into a mask
this_c2v_epi8[0] = _mm512_mask_sub_epi8(this_c2v_epi8[0], negmask, _mm512_setzero_si512(), this_c2v_epi8[0]);
// rotating right LS - shift positions is the same as rotating left shift positions
rotate_node_right((uint8_t*)vp->this_c2v_epi8, this_check_to_var + i_v2c_base, (vp->ls - shift) % vp->ls, vp->ls);
current_var_index = (*these_var_indices)[i + 1];
}
return 0;
}
int update_ldpc_soft_bits_c_avx512(void* p, int i_layer, const int8_t (*these_var_indices)[MAX_CNCT])
{
struct ldpc_regs_c_avx512* vp = p;
if (p == NULL) {
return -1;
}
__m512i* this_check_to_var = vp->check_to_var + i_layer * (vp->hrr + 1);
int i_bit_tmp_base = 0;
__m512i tmp_epi8;
__mmask64 mask_epi8;
int8_t current_var_index = (*these_var_indices)[0];
for (int i = 0; (current_var_index != -1) && (i < MAX_CNCT); i++) {
i_bit_tmp_base = (current_var_index <= vp->hrr) ? current_var_index : vp->hrr;
tmp_epi8 = _mm512_adds_epi8(this_check_to_var[i_bit_tmp_base], vp->var_to_check[i_bit_tmp_base]);
mask_epi8 = _mm512_cmpgt_epi8_mask(tmp_epi8, _mm512_infty7_epi8);
tmp_epi8 = _mm512_mask_blend_epi8(mask_epi8, tmp_epi8, _mm512_infty8_epi8);
mask_epi8 = _mm512_cmpgt_epi8_mask(_mm512_neg_infty7_epi8, tmp_epi8);
vp->soft_bits[current_var_index].v = _mm512_mask_blend_epi8(mask_epi8, tmp_epi8, _mm512_neg_infty8_epi8);
current_var_index = (*these_var_indices)[i + 1];
}
return 0;
}
static void
inner_var_to_check_c_avx512(const __m512i* x, const __m512i* y, __m512i* z, const uint8_t clip, const uint32_t len)
{
unsigned i = 0;
__m512i x_epi8;
__m512i y_epi8;
__m512i z_epi8;
__mmask64 mask_epi8;
__m512i help_sub_epi8;
__m512i clip_epi8 = _mm512_set1_epi8(clip);
__m512i neg_clip_epi8 = _mm512_set1_epi8((char)(-clip));
// len = number of subnodes of size __m512
for (i = 0; i < len; i++) {
x_epi8 = x[i];
y_epi8 = y[i];
help_sub_epi8 = _mm512_subs_epi8(x_epi8, y_epi8); // x-y
mask_epi8 = _mm512_cmpgt_epi8_mask(help_sub_epi8, clip_epi8); // saturate to clip insteaof inifinty8
z_epi8 = _mm512_mask_blend_epi8(mask_epi8, help_sub_epi8, clip_epi8);
mask_epi8 = _mm512_cmpgt_epi8_mask(neg_clip_epi8, z_epi8);
z_epi8 = _mm512_mask_blend_epi8(mask_epi8, z_epi8, neg_clip_epi8);
mask_epi8 = _mm512_cmpgt_epi8_mask(_mm512_infty8_epi8, x_epi8);
z_epi8 = _mm512_mask_blend_epi8(mask_epi8, _mm512_infty8_epi8, z_epi8);
mask_epi8 = _mm512_cmpgt_epi8_mask(x_epi8, _mm512_neg_infty8_epi8);
z[i] = _mm512_mask_blend_epi8(mask_epi8, _mm512_neg_infty8_epi8, z_epi8);
}
}
static void rotate_node_right(const uint8_t* mem_addr, __m512i* out, uint16_t this_shift, uint16_t ls)
{
const __m512i MZERO = _mm512_set1_epi8(0);
uint16_t shift = 0;
uint16_t _shift = 0;
uint64_t mask1 = 0;
uint64_t mask2 = 0;
if (this_shift == 0) {
out[0] = _mm512_loadu_si512(mem_addr);
} else { // if the last is broken, take _shift bits from the end and "shift" bits from the begin.
_shift = ls - this_shift;
shift = SRSLTE_AVX512_B_SIZE - _shift;
mask1 = (1ULL << _shift) - 1; // i.e. 000001111 _shift =4
mask2 = (1ULL << shift) - 1;
mask2 = mask2 << _shift; // i.e. 000110000 shift = 2, _shift = 4
out[0] = _mm512_mask_loadu_epi8(MZERO, mask1, mem_addr + this_shift);
out[0] = _mm512_mask_loadu_epi8(out[0], mask2, mem_addr - _shift);
}
}
static __m512i _mm512_scalei_epi8(__m512i a, __m512i sf)
{
__m512i even_epi16 = _mm512_and_si512(a, _mm512_mask_even_epi8);
__m512i odd_epi16 = _mm512_srli_epi16(a, 8);
__m512i p_even_epi16 = _mm512_mulhi_epu16(even_epi16, sf);
__m512i p_odd_epi16 = _mm512_mulhi_epu16(odd_epi16, sf);
p_odd_epi16 = _mm512_slli_epi16(p_odd_epi16, 8);
return _mm512_xor_si512(p_even_epi16, p_odd_epi16);
}
#endif // LV_HAVE_AVX2

@ -0,0 +1,557 @@
/**
*
* \section COPYRIGHT
*
* Copyright 2013-2020 Software Radio Systems Limited
*
* By using this file, you agree to the terms and conditions set
* forth in the LICENSE file which can be found at the top level of
* the distribution.
*
*/
/*!
* \file ldpc_dec_c_avx512long.c
* \brief Definition LDPC decoder inner functions working
* with 8-bit integer-valued LLRs (AVX512 version, large lifting size).
*
* Even if the inner representation is based on 8 bits, check-to-variable and
* variable-to-check messages are actually represented with 7 bits, the
* remaining bit is used to represent infinity.
*
* \author Jesus Gomez
* \date 2021
*
* \copyright Software Radio Systems Limited
*
*/
#include <stdint.h>
#include <stdlib.h>
#include <strings.h>
#include "../utils_avx512.h"
#include "ldpc_dec_all.h"
#include "srslte/phy/fec/ldpc/base_graph.h"
#include "srslte/phy/utils/vector.h"
#ifdef LV_HAVE_AVX512
#include <immintrin.h>
#include "ldpc_avx512_consts.h"
#define F2I 65535 /*!< \brief Used for float to int conversion---float f is stored as (int)(f*F2I). */
/*!
* \brief Maximum message magnitude.
* Messages use a 7-bit quantization. Soft bits use the remaining bit to denote infinity.
*/
static const int8_t infinity7 = (1U << 6U) - 1;
/*!
* \brief Represents a node of the base factor graph.
*/
typedef union bg_node_avx512_t {
int8_t c[SRSLTE_AVX512_B_SIZE]; /*!< Each base node may contain up to \ref SRSLTE_AVX512_B_SIZE lifted nodes. */
__m512i v; /*!< All the lifted nodes of the current base node as a 512-bit line. */
} bg_node_avx512_t;
/*!
* \brief Inner registers for the LDPC decoder that works with 8-bit integer-valued LLRs.
*/
struct ldpc_regs_c_avx512long {
__m512i scaling_fctr; /*!< \brief Scaling factor for the normalized min-sum decoding algorithm. */
bg_node_avx512_t* soft_bits; /*!< \brief A-posteriori log-likelihood ratios. */
__m512i* check_to_var; /*!< \brief Check-to-variable messages. */
__m512i* var_to_check; /*!< \brief Variable-to-check messages. */
__m512i* var_to_check_to_free; /*!< \brief the Variable-to-check messages with one extra _mm512 allocated space. */
__m512i* rotated_v2c; /*!< \brief To store a rotated version of the variable-to-check messages. */
__m512i* this_c2v_epi8; /*!< \brief Helper register for the current c2v node. */
__m512i* this_c2v_epi8_to_free; /*!< \brief Helper register for the current c2v node with one extra __m512 allocated
space. */
__m512i* minp_v2c_epi8; /*!< \brief Helper register for the minimum v2c message. */
__m512i* mins_v2c_epi8; /*!< \brief Helper register for the second minimum v2c message. */
__m512i* prod_v2c_epi8; /*!< \brief Helper register for the sign of the product of all v2c messages. */
__m512i* min_ix_epi8; /*!< \brief Helper register for the index of the minimum v2c message. */
uint16_t ls; /*!< \brief Lifting size. */
uint8_t hrr; /*!< \brief Number of variable nodes in the high-rate region (before lifting). */
uint8_t bgM; /*!< \brief Number of check nodes (before lifting). */
uint8_t bgN; /*!< \brief Number of variable nodes (before lifting). */
uint16_t node_size; /*!< \brief Size of the node in bytes >ls */
uint16_t finalN; /*!< \brief (bgN-2)*ls */
uint8_t n_subnodes; /*!< \brief Number of subnodes. */
};
/*!
* Carries out the actual update of the variable-to-check messages. It basically
* consists in \f$ z = x - y \f$ (as vectors). However, first it checks whether
* \f$\lvert x[i] \rvert = 2^{7}-1 \f$ (our representation of infinity) to
* ensure it is properly propagated. Also, the subtraction is saturated between
* \f$- clip\f$ and \f$+ clip\f$.
* \param[in] x Minuend: array we subtract from (in practice, the soft bits).
* \param[in] y Subtrahend: array to be subtracted (in practice, the
* check-to-variable messages).
* \param[out] z Resulting difference array(in practice, the updated
* variable-to-check messages).
* \param[in] clip The saturation value.
* \param[in] len The length of the vectors.
*/
static void inner_var_to_check_c_avx512long(const __m512i* x, const __m512i* y, __m512i* z, uint8_t clip, uint32_t len);
/*!
* Rotate the contents of a node towards the right by \b shift chars, that is the
* \b shift * 8 most significant bits become the least significant ones.
* \param[in] mem_addr The node to rotate.
* \param[out] out The rotated node.
* \param[in] shift The order of the rotation in number of chars.
* \param[in] ls The size of the node (lifting size).
* \param[in] n_subnodes The number of subnodes in each node.
*/
static void rotate_node_right(const uint8_t* mem_addr, __m512i* out, uint16_t this_shift, uint16_t ls);
/*!
* Scale packed 8-bit integers in \b a by the scaling factor \b sf / #F2I.
* \param[in] a Vector of packed 8-bit integers.
* \param[in] sf Scaling factor.
* \return Vector of packed 8-bit integers with the scaling result.
*/
static __m512i _mm512_scalei_epi8(__m512i a, __m512i sf);
void* create_ldpc_dec_c_avx512long(uint8_t bgN, uint8_t bgM, uint16_t ls, float scaling_fctr)
{
struct ldpc_regs_c_avx512long* vp = NULL;
uint8_t bgK = bgN - bgM;
uint16_t hrr = bgK + 4;
if ((vp = srslte_vec_malloc(sizeof(struct ldpc_regs_c_avx512long))) == NULL) {
return NULL;
}
// compute number of subnodes
int left_out = ls % SRSLTE_AVX512_B_SIZE;
int n_subnodes = ls / SRSLTE_AVX512_B_SIZE + (left_out > 0);
if ((vp->soft_bits = srslte_vec_malloc(bgN * n_subnodes * sizeof(bg_node_avx512_t))) == NULL) {
free(vp);
return NULL;
}
if ((vp->check_to_var = srslte_vec_malloc((hrr + 1) * bgM * n_subnodes * sizeof(__m512i))) == NULL) {
free(vp->soft_bits);
free(vp);
return NULL;
}
if ((vp->var_to_check_to_free = srslte_vec_malloc(((hrr + 1) * n_subnodes + 2) * sizeof(__m512i))) == NULL) {
free(vp->check_to_var);
free(vp->soft_bits);
free(vp);
return NULL;
}
vp->var_to_check = &vp->var_to_check_to_free[1];
if ((vp->minp_v2c_epi8 = srslte_vec_malloc(n_subnodes * sizeof(__m512i))) == NULL) {
free(vp->var_to_check_to_free);
free(vp->check_to_var);
free(vp->soft_bits);
free(vp);
return NULL;
}
if ((vp->mins_v2c_epi8 = srslte_vec_malloc(n_subnodes * sizeof(__m512i))) == NULL) {
free(vp->minp_v2c_epi8);
free(vp->var_to_check_to_free);
free(vp->check_to_var);
free(vp->soft_bits);
free(vp);
return NULL;
}
if ((vp->prod_v2c_epi8 = srslte_vec_malloc(n_subnodes * sizeof(__m512i))) == NULL) {
free(vp->mins_v2c_epi8);
free(vp->minp_v2c_epi8);
free(vp->var_to_check_to_free);
free(vp->check_to_var);
free(vp->soft_bits);
free(vp);
return NULL;
}
if ((vp->min_ix_epi8 = srslte_vec_malloc(n_subnodes * sizeof(__m512i))) == NULL) {
free(vp->prod_v2c_epi8);
free(vp->mins_v2c_epi8);
free(vp->minp_v2c_epi8);
free(vp->var_to_check_to_free);
free(vp->check_to_var);
free(vp->soft_bits);
free(vp);
return NULL;
}
if ((vp->rotated_v2c = srslte_vec_malloc((hrr + 1) * n_subnodes * sizeof(__m512i))) == NULL) {
free(vp->min_ix_epi8);
free(vp->prod_v2c_epi8);
free(vp->mins_v2c_epi8);
free(vp->minp_v2c_epi8);
free(vp->var_to_check_to_free);
free(vp->check_to_var);
free(vp->soft_bits);
free(vp);
return NULL;
}
if ((vp->this_c2v_epi8_to_free = srslte_vec_malloc((n_subnodes + 2) * sizeof(__m512i))) == NULL) {
free(vp->rotated_v2c);
free(vp->min_ix_epi8);
free(vp->prod_v2c_epi8);
free(vp->mins_v2c_epi8);
free(vp->minp_v2c_epi8);
free(vp->var_to_check_to_free);
free(vp->check_to_var);
free(vp->soft_bits);
free(vp);
return NULL;
}
vp->this_c2v_epi8 =
&vp->this_c2v_epi8_to_free[1]; //+1 to support reading negative position in this_c2v_epi8 at rotate_node_rigth
vp->bgM = bgM;
vp->bgN = bgN;
vp->hrr = hrr;
vp->ls = ls;
vp->n_subnodes = n_subnodes;
vp->node_size = SRSLTE_AVX512_B_SIZE * vp->n_subnodes;
vp->finalN = (bgN - 2) * ls;
// correction > 1/16 to compensate the scaling error (2^16-1)/2^16 incurred in _mm512_scalei_epi8
vp->scaling_fctr = _mm512_set1_epi16((uint16_t)((scaling_fctr + 0.00001525879) * F2I));
return vp;
}
void delete_ldpc_dec_c_avx512long(void* p)
{
struct ldpc_regs_c_avx512long* vp = p;
if (vp != NULL) {
free(vp->this_c2v_epi8_to_free);
free(vp->rotated_v2c);
free(vp->min_ix_epi8);
free(vp->prod_v2c_epi8);
free(vp->mins_v2c_epi8);
free(vp->minp_v2c_epi8);
free(vp->var_to_check_to_free);
free(vp->check_to_var);
free(vp->soft_bits);
free(vp);
}
}
int init_ldpc_dec_c_avx512long(void* p, const int8_t* llrs, uint16_t ls)
{
struct ldpc_regs_c_avx512long* vp = p;
int i = 0;
int k = 0;
if (p == NULL) {
return -1;
}
// First 2 punctured bits
int node_size = vp->node_size;
int ini = node_size + node_size;
bzero(vp->soft_bits->c, ini);
for (i = 0; i < vp->finalN; i = i + ls) {
for (k = 0; k < ls; k++) {
vp->soft_bits->c[ini + k] = llrs[i + k];
}
// this zero padding might be removed
bzero(&vp->soft_bits->c[ini + ls], (node_size - ls) * sizeof(int8_t));
ini = ini + node_size;
}
bzero(vp->check_to_var, (vp->hrr + 1) * vp->bgM * vp->n_subnodes * sizeof(__m512i));
bzero(vp->var_to_check, (vp->hrr + 1) * vp->n_subnodes * sizeof(__m512i));
return 0;
}
int extract_ldpc_message_c_avx512long(void* p, uint8_t* message, uint16_t liftK)
{
if (p == NULL) {
return -1;
}
struct ldpc_regs_c_avx512long* vp = p;
int ini = 0;
for (int i = 0; i < liftK; i = i + vp->ls) {
for (int k = 0; k < vp->ls; k++) {
message[i + k] = (vp->soft_bits->c[ini + k] < 0);
}
ini = ini + vp->node_size;
}
return 0;
}
int update_ldpc_var_to_check_c_avx512long(void* p, int i_layer)
{
struct ldpc_regs_c_avx512long* vp = p;
if (p == NULL) {
return -1;
}
__m512i* this_check_to_var = vp->check_to_var + i_layer * (vp->hrr + 1) * vp->n_subnodes;
// Update the high-rate region.
inner_var_to_check_c_avx512long(
&(vp->soft_bits[0].v), this_check_to_var, vp->var_to_check, infinity7, vp->hrr * vp->n_subnodes);
if (i_layer >= 4) {
// Update the extension region.
inner_var_to_check_c_avx512long(&(vp->soft_bits[0].v) + (vp->hrr + i_layer - 4) * vp->n_subnodes,
this_check_to_var + vp->hrr * vp->n_subnodes,
vp->var_to_check + vp->hrr * vp->n_subnodes,
infinity7,
vp->n_subnodes);
}
return 0;
}
int update_ldpc_check_to_var_c_avx512long(void* p,
int i_layer,
const uint16_t* this_pcm,
const int8_t (*these_var_indices)[MAX_CNCT])
{
struct ldpc_regs_c_avx512long* vp = p;
if (p == NULL) {
return -1;
}
int i = 0;
int j = 0;
uint16_t shift = 0;
int i_v2c_base = 0;
__m512i* this_rotated_v2c = NULL;
__m512i this_abs_v2c_epi8;
__mmask64 mask_min_epi8;
__m512i help_min_epi8;
__m512i current_ix_epi8;
for (j = 0; j < vp->n_subnodes; j++) {
vp->minp_v2c_epi8[j] = _mm512_set1_epi8(INT8_MAX);
vp->mins_v2c_epi8[j] = _mm512_set1_epi8(INT8_MAX);
vp->prod_v2c_epi8[j] = _mm512_set1_epi8(0);
}
int8_t current_var_index = (*these_var_indices)[0];
for (i = 0; (current_var_index != -1) && (i < MAX_CNCT); i++) {
shift = this_pcm[current_var_index];
i_v2c_base = (current_var_index <= vp->hrr) ? current_var_index : vp->hrr;
i_v2c_base *= vp->n_subnodes;
current_ix_epi8 = _mm512_set1_epi8((int8_t)i);
this_rotated_v2c = vp->rotated_v2c + i * vp->n_subnodes;
rotate_node_right((uint8_t*)(vp->var_to_check + i_v2c_base), this_rotated_v2c, shift, vp->ls);
for (j = 0; j < vp->n_subnodes; j++) {
vp->prod_v2c_epi8[j] = _mm512_xor_si512(vp->prod_v2c_epi8[j], this_rotated_v2c[j]);
this_abs_v2c_epi8 = _mm512_abs_epi8(this_rotated_v2c[j]);
// mask_min is 1 if this_abs_v2c is strictly smaller tha minp_v2c
mask_min_epi8 = _mm512_cmpgt_epi8_mask(vp->minp_v2c_epi8[j], this_abs_v2c_epi8);
help_min_epi8 = _mm512_mask_blend_epi8(mask_min_epi8, this_abs_v2c_epi8, vp->minp_v2c_epi8[j]);
vp->minp_v2c_epi8[j] = _mm512_mask_blend_epi8(mask_min_epi8, vp->minp_v2c_epi8[j], this_abs_v2c_epi8);
vp->min_ix_epi8[j] = _mm512_mask_blend_epi8(mask_min_epi8, vp->min_ix_epi8[j], current_ix_epi8);
// mask_min is 1 if this_abs_v2c is strictly smaller tha mins_v2c
mask_min_epi8 = _mm512_cmpgt_epi8_mask(vp->mins_v2c_epi8[j], this_abs_v2c_epi8);
vp->mins_v2c_epi8[j] = _mm512_mask_blend_epi8(mask_min_epi8, vp->mins_v2c_epi8[j], help_min_epi8);
}
current_var_index = (*these_var_indices)[i + 1];
}
__m512i* this_check_to_var = vp->check_to_var + i_layer * (vp->hrr + 1) * vp->n_subnodes;
current_var_index = (*these_var_indices)[0];
__mmask64 mask_is_min_epi8;
__m512i final_sign_epi8;
for (i = 0; (current_var_index != -1) && (i < MAX_CNCT); i++) {
shift = this_pcm[current_var_index];
i_v2c_base = (current_var_index <= vp->hrr) ? current_var_index : vp->hrr;
i_v2c_base *= vp->n_subnodes;
this_rotated_v2c = vp->rotated_v2c + i * vp->n_subnodes;
for (j = 0; j < vp->n_subnodes; j++) {
final_sign_epi8 = _mm512_xor_si512(this_rotated_v2c[j], vp->prod_v2c_epi8[j]);
current_ix_epi8 = _mm512_set1_epi8((int8_t)i);
mask_is_min_epi8 = _mm512_cmpeq_epi8_mask(current_ix_epi8, vp->min_ix_epi8[j]);
vp->this_c2v_epi8[j] = _mm512_mask_blend_epi8(mask_is_min_epi8, vp->minp_v2c_epi8[j], vp->mins_v2c_epi8[j]);
vp->this_c2v_epi8[j] = _mm512_scalei_epi8(vp->this_c2v_epi8[j], vp->scaling_fctr);
// does *not* do anything special for signs[i] == 0, just negative / non-negative
__mmask64 negmask = _mm512_movepi8_mask(final_sign_epi8); // transform final_sing_epi8 into a mask
vp->this_c2v_epi8[j] =
_mm512_mask_sub_epi8(vp->this_c2v_epi8[j], negmask, _mm512_setzero_si512(), vp->this_c2v_epi8[j]);
}
// rotating right LS - shift positions is the same as rotating left shift positions
rotate_node_right((uint8_t*)vp->this_c2v_epi8, this_check_to_var + i_v2c_base, (vp->ls - shift) % vp->ls, vp->ls);
current_var_index = (*these_var_indices)[i + 1];
}
return 0;
}
int update_ldpc_soft_bits_c_avx512long(void* p, int i_layer, const int8_t (*these_var_indices)[MAX_CNCT])
{
struct ldpc_regs_c_avx512long* vp = p;
if (p == NULL) {
return -1;
}
int j = 0;
__m512i* this_check_to_var = vp->check_to_var + i_layer * (vp->hrr + 1) * vp->n_subnodes;
int i_bit_tmp_base = 0;
int i_bit_subnode = 0;
__m512i tmp_epi8;
__mmask64 mask_epi8;
int8_t current_var_index = (*these_var_indices)[0];
int current_var_index_subnode = 0;
for (int i = 0; (current_var_index != -1) && (i < MAX_CNCT); i++) {
current_var_index_subnode = current_var_index * vp->n_subnodes;
for (j = 0; j < vp->n_subnodes; j++) {
i_bit_tmp_base = (current_var_index <= vp->hrr) ? current_var_index : vp->hrr;
i_bit_subnode = i_bit_tmp_base * vp->n_subnodes + j;
tmp_epi8 = _mm512_adds_epi8(this_check_to_var[i_bit_subnode], vp->var_to_check[i_bit_subnode]);
mask_epi8 = _mm512_cmpgt_epi8_mask(tmp_epi8, _mm512_infty7_epi8);
tmp_epi8 = _mm512_mask_blend_epi8(mask_epi8, tmp_epi8, _mm512_infty8_epi8);
mask_epi8 = _mm512_cmpgt_epi8_mask(_mm512_neg_infty7_epi8, tmp_epi8);
vp->soft_bits[current_var_index_subnode + j].v =
_mm512_mask_blend_epi8(mask_epi8, tmp_epi8, _mm512_neg_infty8_epi8);
}
current_var_index = (*these_var_indices)[i + 1];
}
return 0;
}
static void
inner_var_to_check_c_avx512long(const __m512i* x, const __m512i* y, __m512i* z, const uint8_t clip, const uint32_t len)
{
unsigned i = 0;
__m512i x_epi8;
__m512i y_epi8;
__m512i z_epi8;
__mmask64 mask_epi8;
__m512i help_sub_epi8;
__m512i clip_epi8 = _mm512_set1_epi8(clip);
__m512i neg_clip_epi8 = _mm512_set1_epi8((char)(-clip));
for (i = 0; i < len; i++) {
x_epi8 = x[i];
y_epi8 = y[i];
help_sub_epi8 = _mm512_subs_epi8(x_epi8, y_epi8); // x-y
mask_epi8 = _mm512_cmpgt_epi8_mask(help_sub_epi8, clip_epi8); // saturate to clip insteaof inifinty8
z_epi8 = _mm512_mask_blend_epi8(mask_epi8, help_sub_epi8, clip_epi8);
mask_epi8 = _mm512_cmpgt_epi8_mask(neg_clip_epi8, z_epi8);
z_epi8 = _mm512_mask_blend_epi8(mask_epi8, z_epi8, neg_clip_epi8);
mask_epi8 = _mm512_cmpgt_epi8_mask(_mm512_infty8_epi8, x_epi8);
z_epi8 = _mm512_mask_blend_epi8(mask_epi8, _mm512_infty8_epi8, z_epi8);
mask_epi8 = _mm512_cmpgt_epi8_mask(x_epi8, _mm512_neg_infty8_epi8);
z[i] = _mm512_mask_blend_epi8(mask_epi8, _mm512_neg_infty8_epi8, z_epi8);
}
}
static void rotate_node_right(const uint8_t* mem_addr, __m512i* out, uint16_t this_shift, uint16_t ls)
{
uint16_t shift = 0;
uint16_t _shift = 0;
uint64_t mask1 = 0;
uint64_t mask2 = 0;
const __m512i MZERO = _mm512_set1_epi8(0);
// the part in the middle - we simply copy.
int j = 0;
int jj = 0;
// copy full avx512 registers from this_shift_2
for (j = this_shift; j <= ls - SRSLTE_AVX512_B_SIZE; j = j + SRSLTE_AVX512_B_SIZE) {
out[jj] = _mm512_loadu_si512(mem_addr + j);
jj = jj + 1;
}
// if the last is broken, take _shift bits from the end and "shift" bits from the begin.
if (ls > j) {
_shift = ls - j;
shift = SRSLTE_AVX512_B_SIZE - _shift;
mask1 = (1ULL << _shift) - 1; // i.e. 000001111 _shift =4
mask2 = (1ULL << shift) - 1;
mask2 = mask2 << _shift; // i.e. 000110000 shift = 2, _shift = 4
out[jj] = _mm512_mask_loadu_epi8(MZERO, mask1, mem_addr + j);
out[jj] = _mm512_mask_loadu_epi8(out[jj], mask2, mem_addr - _shift);
jj = jj + 1;
}
// copy full avx512 registers from the start of mem_addr
for (j = shift; j < this_shift; j = j + SRSLTE_AVX512_B_SIZE) {
out[jj] = _mm512_loadu_si512(mem_addr + j); // the excess is filled with something arbitrary
jj = jj + 1;
}
}
static __m512i _mm512_scalei_epi8(__m512i a, __m512i sf)
{
__m512i even_epi16 = _mm512_and_si512(a, _mm512_mask_even_epi8);
__m512i odd_epi16 = _mm512_srli_epi16(a, 8);
__m512i p_even_epi16 = _mm512_mulhi_epu16(even_epi16, sf);
__m512i p_odd_epi16 = _mm512_mulhi_epu16(odd_epi16, sf);
p_odd_epi16 = _mm512_slli_epi16(p_odd_epi16, 8);
return _mm512_xor_si512(p_even_epi16, p_odd_epi16);
}
#endif // LV_HAVE_AVX512

@ -0,0 +1,591 @@
/**
*
* \section COPYRIGHT
*
* Copyright 2013-2020 Software Radio Systems Limited
*
* By using this file, you agree to the terms and conditions set
* forth in the LICENSE file which can be found at the top level of
* the distribution.
*
*/
/*!
* \file ldpc_dec_c_avx512long_flood.c
* \brief Definition LDPC decoder inner functions working
* with 8-bit integer-valued LLRs (flooded scheduling, AVX512 version, large lifting size).
*
* Even if the inner representation is based on 8 bits, check-to-variable and
* variable-to-check messages are actually represented with 7 bits, the
* remaining bit is used to represent infinity.
*
* \author Jesus Gomez
* \date 2021
*
* \copyright Software Radio Systems Limited
*
*/
#include <stdint.h>
#include <stdlib.h>
#include <strings.h>
#include "../utils_avx512.h"
#include "ldpc_dec_all.h"
#include "srslte/phy/fec/ldpc/base_graph.h"
#include "srslte/phy/utils/vector.h"
#ifdef LV_HAVE_AVX512
#include <immintrin.h>
#include "ldpc_avx512_consts.h"
#define F2I 65535 /*!< \brief Used for float to int conversion---float f is stored as (int)(f*F2I). */
/*!
* \brief Maximum message magnitude.
* Messages use a 7-bit quantization. Soft bits use the remaining bit to denote infinity.
*/
static const int8_t infinity7 = (1U << 6U) - 1;
/*!
* \brief Represents a node of the base factor graph.
*/
typedef union bg_node_avx512_t {
int8_t c[SRSLTE_AVX512_B_SIZE]; /*!< Each base node may contain up to \ref SRSLTE_AVX512_B_SIZE lifted nodes. */
__m512i v; /*!< All the lifted nodes of the current base node as a 512-bit line. */
} bg_node_avx512_t;
/*!
* \brief Inner registers for the LDPC decoder that works with 8-bit integer-valued LLRs.
*/
struct ldpc_regs_c_avx512long_flood {
__m512i scaling_fctr; /*!< \brief Scaling factor for the normalized min-sum decoding algorithm. */
bg_node_avx512_t* soft_bits; /*!< \brief A-posteriori log-likelihood ratios. */
__m512i* llrs; /*!< \brief A-priori log-likelihood ratios. */
__m512i* check_to_var; /*!< \brief Check-to-variable messages. */
__m512i* var_to_check; /*!< \brief Variable-to-check messages. */
__m512i* var_to_check_to_free; /*!< \brief Auxiliar variable-to-check messages, with 2 extra __m512 space. */
__m512i* rotated_v2c; /*!< \brief To store a rotated version of the variable-to-check messages. */
__m512i* this_c2v_epi8; /*!< \brief Helper register for the current c2v node. */
__m512i*
this_c2v_epi8_to_free; /*!< \brief Auxiliar helper register for the current c2v node, with 2 extra _mm512 space */
__m512i* minp_v2c_epi8; /*!< \brief Helper register for the minimum v2c message. */
__m512i* mins_v2c_epi8; /*!< \brief Helper register for the second minimum v2c message. */
__m512i* prod_v2c_epi8; /*!< \brief Helper register for the sign of the product of all v2c messages. */
__m512i* min_ix_epi8; /*!< \brief Helper register for the index of the minimum v2c message. */
uint16_t ls; /*!< \brief Lifting size. */
uint8_t n_subnodes; /*!< \brief Number of subnodes. */
uint8_t hrr; /*!< \brief Number of variable nodes in the high-rate region (before lifting). */
uint8_t bgM; /*!< \brief Number of check nodes (before lifting). */
uint8_t bgN; /*!< \brief Number of variable nodes (before lifting). */
};
/*!
* Carries out the actual update of the variable-to-check messages. It basically
* consists in \f$ z = x - y \f$ (as vectors). However, first it checks whether
* \f$\lvert x[i] \rvert = 2^{7}-1 \f$ (our representation of infinity) to
* ensure it is properly propagated. Also, the subtraction is saturated between
* \f$- clip\f$ and \f$+ clip\f$.
* \param[in] x Minuend: array we subtract from (in practice, the soft bits).
* \param[in] y Subtrahend: array to be subtracted (in practice, the
* check-to-variable messages).
* \param[out] z Resulting difference array(in practice, the updated
* variable-to-check messages).
* \param[in] clip The saturation value.
* \param[in] len The length of the vectors.
*/
static void inner_var_to_check_c_avx512(const __m512i* x, const __m512i* y, __m512i* z, uint8_t clip, uint32_t len);
/*!
* Rotate the contents of a node towards the right by \b shift chars, that is the
* \b shift * 8 most significant bits become the least significant ones.
* \param[in] mem_addr The node to rotate.
* \param[out] out The rotated node.
* \param[in] shift The order of the rotation in number of chars.
* \param[in] ls The size of the node (lifting size).
* \param[in] n_subnodes The number of subnodes in each node.
*/
static void rotate_node_right(const uint8_t* mem_addr, __m512i* out, uint16_t this_shift, uint16_t ls);
/*!
* Scale packed 8-bit integers in \b a by the scaling factor \b sf / #F2I.
* \param[in] a Vector of packed 8-bit integers.
* \param[in] sf Scaling factor.
* \return Vector of packed 8-bit integers with the scaling result.
*/
static __m512i _mm512_scalei_epi8(__m512i a, __m512i sf);
void* create_ldpc_dec_c_avx512long_flood(uint8_t bgN, uint8_t bgM, uint16_t ls, float scaling_fctr)
{
struct ldpc_regs_c_avx512long_flood* vp = NULL;
uint8_t bgK = bgN - bgM;
uint16_t hrr = bgK + 4;
if ((vp = srslte_vec_malloc(sizeof(struct ldpc_regs_c_avx512long_flood))) == NULL) {
return NULL;
}
// compute number of subnodes
int left_out = ls % SRSLTE_AVX512_B_SIZE;
int n_subnodes = ls / SRSLTE_AVX512_B_SIZE + (left_out > 0);
if ((vp->llrs = srslte_vec_malloc(bgN * n_subnodes * sizeof(__m512i))) == NULL) {
free(vp);
return NULL;
}
if ((vp->soft_bits = srslte_vec_malloc(bgN * n_subnodes * sizeof(bg_node_avx512_t))) == NULL) {
free(vp->llrs);
free(vp);
return NULL;
}
if ((vp->check_to_var = srslte_vec_malloc((hrr + 1) * bgM * n_subnodes * sizeof(__m512i))) == NULL) {
free(vp->soft_bits);
free(vp->llrs);
free(vp);
return NULL;
}
if ((vp->var_to_check_to_free = srslte_vec_malloc(((hrr + 1) * bgM * n_subnodes + 2) * sizeof(__m512i))) == NULL) {
free(vp->check_to_var);
free(vp->soft_bits);
free(vp->llrs);
free(vp);
return NULL;
}
vp->var_to_check = &vp->var_to_check_to_free[1];
if ((vp->minp_v2c_epi8 = srslte_vec_malloc(n_subnodes * sizeof(__m512i))) == NULL) {
free(vp->var_to_check_to_free);
free(vp->check_to_var);
free(vp->soft_bits);
free(vp->llrs);
free(vp);
return NULL;
}
if ((vp->mins_v2c_epi8 = srslte_vec_malloc(n_subnodes * sizeof(__m512i))) == NULL) {
free(vp->minp_v2c_epi8);
free(vp->var_to_check_to_free);
free(vp->check_to_var);
free(vp->soft_bits);
free(vp->llrs);
free(vp);
return NULL;
}
if ((vp->prod_v2c_epi8 = srslte_vec_malloc(n_subnodes * sizeof(__m512i))) == NULL) {
free(vp->mins_v2c_epi8);
free(vp->minp_v2c_epi8);
free(vp->var_to_check_to_free);
free(vp->check_to_var);
free(vp->soft_bits);
free(vp->llrs);
free(vp);
return NULL;
}
if ((vp->min_ix_epi8 = srslte_vec_malloc(n_subnodes * sizeof(__m512i))) == NULL) {
free(vp->prod_v2c_epi8);
free(vp->mins_v2c_epi8);
free(vp->minp_v2c_epi8);
free(vp->var_to_check_to_free);
free(vp->check_to_var);
free(vp->soft_bits);
free(vp->llrs);
free(vp);
return NULL;
}
if ((vp->rotated_v2c = srslte_vec_malloc((hrr + 1) * n_subnodes * sizeof(__m512i))) == NULL) {
free(vp->min_ix_epi8);
free(vp->prod_v2c_epi8);
free(vp->mins_v2c_epi8);
free(vp->minp_v2c_epi8);
free(vp->var_to_check_to_free);
free(vp->check_to_var);
free(vp->soft_bits);
free(vp->llrs);
free(vp);
return NULL;
}
if ((vp->this_c2v_epi8_to_free = srslte_vec_malloc((n_subnodes + 2) * sizeof(__m512i))) == NULL) {
free(vp->rotated_v2c);
free(vp->min_ix_epi8);
free(vp->prod_v2c_epi8);
free(vp->mins_v2c_epi8);
free(vp->minp_v2c_epi8);
free(vp->var_to_check_to_free);
free(vp->check_to_var);
free(vp->soft_bits);
free(vp->llrs);
free(vp);
return NULL;
}
vp->this_c2v_epi8 = &vp->this_c2v_epi8_to_free[1];
vp->bgM = bgM;
vp->bgN = bgN;
vp->hrr = hrr;
vp->ls = ls;
vp->n_subnodes = n_subnodes;
// correction > 1/16 to compensate the scaling error (2^16-1)/2^16 incurred in _mm512_scalei_epi8
vp->scaling_fctr = _mm512_set1_epi16((uint16_t)((scaling_fctr + 0.00001525879) * F2I));
return vp;
}
void delete_ldpc_dec_c_avx512long_flood(void* p)
{
struct ldpc_regs_c_avx512long_flood* vp = p;
if (vp != NULL) {
free(vp->this_c2v_epi8_to_free);
free(vp->rotated_v2c);
free(vp->min_ix_epi8);
free(vp->prod_v2c_epi8);
free(vp->mins_v2c_epi8);
free(vp->minp_v2c_epi8);
free(vp->var_to_check_to_free);
free(vp->check_to_var);
free(vp->soft_bits);
free(vp->llrs);
free(vp);
}
}
int init_ldpc_dec_c_avx512long_flood(void* p, const int8_t* llrs, uint16_t ls)
{
struct ldpc_regs_c_avx512long_flood* vp = p;
int i = 0;
int j = 0;
int k = 0;
if (p == NULL) {
return -1;
}
for (k = 0; k < vp->n_subnodes; k++) {
vp->soft_bits[k].v = _mm512_set1_epi8(0);
vp->soft_bits[vp->n_subnodes + k].v = _mm512_set1_epi8(0);
vp->llrs[k] = _mm512_set1_epi8(0);
vp->llrs[vp->n_subnodes + k] = _mm512_set1_epi8(0);
}
for (i = 2; i < vp->bgN; i++) {
for (j = 0; j < vp->n_subnodes; j++) {
for (k = 0; (k < SRSLTE_AVX512_B_SIZE) && (j * SRSLTE_AVX512_B_SIZE + k < ls); k++) {
vp->soft_bits[i * vp->n_subnodes + j].c[k] = llrs[(i - 2) * ls + j * SRSLTE_AVX512_B_SIZE + k];
}
vp->llrs[i * vp->n_subnodes + j] = vp->soft_bits[i * vp->n_subnodes + j].v;
}
bzero(&(vp->soft_bits[i * vp->n_subnodes + j - 1].c[k]), (SRSLTE_AVX512_B_SIZE - k) * sizeof(int8_t));
bzero((int8_t*)(vp->llrs + i * vp->n_subnodes + j - 1) + k, (SRSLTE_AVX512_B_SIZE - k) * sizeof(int8_t));
}
bzero(vp->check_to_var, (vp->hrr + 1) * vp->bgM * vp->n_subnodes * sizeof(__m512i));
bzero(vp->var_to_check, (vp->hrr + 1) * vp->bgM * vp->n_subnodes * sizeof(__m512i));
return 0;
}
int update_ldpc_var_to_check_c_avx512long_flood(void* p, int i_layer)
{
struct ldpc_regs_c_avx512long_flood* vp = p;
if (p == NULL) {
return -1;
}
__m512i* this_check_to_var = vp->check_to_var + i_layer * (vp->hrr + 1) * vp->n_subnodes;
__m512i* this_var_to_check = vp->var_to_check + i_layer * (vp->hrr + 1) * vp->n_subnodes;
// Update the high-rate region.
inner_var_to_check_c_avx512(
&(vp->soft_bits[0].v), this_check_to_var, this_var_to_check, infinity7, vp->hrr * vp->n_subnodes);
if (i_layer >= 4) {
// Update the extension region.
inner_var_to_check_c_avx512(&(vp->soft_bits[0].v) + (vp->hrr + i_layer - 4) * vp->n_subnodes,
this_check_to_var + vp->hrr * vp->n_subnodes,
this_var_to_check + vp->hrr * vp->n_subnodes,
infinity7,
vp->n_subnodes);
}
return 0;
}
int update_ldpc_check_to_var_c_avx512long_flood(void* p,
int i_layer,
const uint16_t* this_pcm,
const int8_t (*these_var_indices)[MAX_CNCT])
{
struct ldpc_regs_c_avx512long_flood* vp = p;
if (p == NULL) {
return -1;
}
int i = 0;
int j = 0;
uint16_t shift = 0;
int i_v2c_base = 0;
__m512i* this_rotated_v2c = NULL;
__m512i* this_var_to_check = (vp->var_to_check + i_layer * (vp->hrr + 1) * vp->n_subnodes);
__m512i this_abs_v2c_epi8;
__mmask64 mask_min_epi8;
__m512i help_min_epi8;
__m512i current_ix_epi8;
for (j = 0; j < vp->n_subnodes; j++) {
vp->minp_v2c_epi8[j] = _mm512_set1_epi8(INT8_MAX);
vp->mins_v2c_epi8[j] = _mm512_set1_epi8(INT8_MAX);
vp->prod_v2c_epi8[j] = _mm512_set1_epi8(0);
}
int8_t current_var_index = (*these_var_indices)[0];
for (i = 0; (current_var_index != -1) && (i < MAX_CNCT); i++) {
shift = this_pcm[current_var_index];
i_v2c_base = (current_var_index <= vp->hrr) ? current_var_index : vp->hrr;
i_v2c_base *= vp->n_subnodes;
current_ix_epi8 = _mm512_set1_epi8((int8_t)i);
this_rotated_v2c = vp->rotated_v2c + i * vp->n_subnodes;
rotate_node_right((uint8_t*)(this_var_to_check + i_v2c_base), this_rotated_v2c, shift, vp->ls);
for (j = 0; j < vp->n_subnodes; j++) {
vp->prod_v2c_epi8[j] = _mm512_xor_si512(vp->prod_v2c_epi8[j], this_rotated_v2c[j]);
this_abs_v2c_epi8 = _mm512_abs_epi8(this_rotated_v2c[j]);
// mask_min is 1 if this_abs_v2c is strictly smaller tha minp_v2c
mask_min_epi8 = _mm512_cmpgt_epi8_mask(vp->minp_v2c_epi8[j], this_abs_v2c_epi8);
help_min_epi8 = _mm512_mask_blend_epi8(mask_min_epi8, this_abs_v2c_epi8, vp->minp_v2c_epi8[j]);
vp->minp_v2c_epi8[j] = _mm512_mask_blend_epi8(mask_min_epi8, vp->minp_v2c_epi8[j], this_abs_v2c_epi8);
vp->min_ix_epi8[j] = _mm512_mask_blend_epi8(mask_min_epi8, vp->min_ix_epi8[j], current_ix_epi8);
// mask_min is 1 if this_abs_v2c is strictly smaller tha mins_v2c
mask_min_epi8 = _mm512_cmpgt_epi8_mask(vp->mins_v2c_epi8[j], this_abs_v2c_epi8);
vp->mins_v2c_epi8[j] = _mm512_mask_blend_epi8(mask_min_epi8, vp->mins_v2c_epi8[j], help_min_epi8);
}
current_var_index = (*these_var_indices)[i + 1];
}
__m512i* this_check_to_var = vp->check_to_var + i_layer * (vp->hrr + 1) * vp->n_subnodes;
current_var_index = (*these_var_indices)[0];
__mmask64 mask_is_min_epi8;
__m512i final_sign_epi8;
for (i = 0; (current_var_index != -1) && (i < MAX_CNCT); i++) {
shift = this_pcm[current_var_index];
i_v2c_base = (current_var_index <= vp->hrr) ? current_var_index : vp->hrr;
i_v2c_base *= vp->n_subnodes;
this_rotated_v2c = vp->rotated_v2c + i * vp->n_subnodes;
for (j = 0; j < vp->n_subnodes; j++) {
// mask_sign is 1 if this_v2c_epi8 is strictly negative
final_sign_epi8 = _mm512_xor_si512(this_rotated_v2c[j], vp->prod_v2c_epi8[j]);
current_ix_epi8 = _mm512_set1_epi8((int8_t)i);
mask_is_min_epi8 = _mm512_cmpeq_epi8_mask(current_ix_epi8, vp->min_ix_epi8[j]);
vp->this_c2v_epi8[j] = _mm512_mask_blend_epi8(mask_is_min_epi8, vp->minp_v2c_epi8[j], vp->mins_v2c_epi8[j]);
vp->this_c2v_epi8[j] = _mm512_scalei_epi8(vp->this_c2v_epi8[j], vp->scaling_fctr);
// does *not* do anything special for signs[i] == 0, just negative / non-negative
__mmask64 negmask = _mm512_movepi8_mask(final_sign_epi8); // transform final_sing_epi8 into a mask
vp->this_c2v_epi8[j] =
_mm512_mask_sub_epi8(vp->this_c2v_epi8[j], negmask, _mm512_setzero_si512(), vp->this_c2v_epi8[j]);
}
// rotating right LS - shift positions is the same as rotating left shift positions
rotate_node_right((uint8_t*)vp->this_c2v_epi8, this_check_to_var + i_v2c_base, (vp->ls - shift) % vp->ls, vp->ls);
current_var_index = (*these_var_indices)[i + 1];
}
return 0;
}
int update_ldpc_soft_bits_c_avx512long_flood(void* p, const int8_t (*these_var_indices)[MAX_CNCT])
{
struct ldpc_regs_c_avx512long_flood* vp = p;
if (p == NULL) {
return -1;
}
int i_layer = 0;
int i = 0;
int j = 0;
__m512i* this_check_to_var = NULL;
int i_bit_tmp_base = 0;
int i_bit_subnode = 0;
__m512i tmp_epi8;
__mmask64 mask_epi8;
int8_t current_var_index = 0;
int current_var_index_subnode = 0;
for (i = 0; i < vp->bgN; i++) {
for (j = 0; j < vp->n_subnodes; j++) {
vp->soft_bits[i * vp->n_subnodes + j].v = vp->llrs[i * vp->n_subnodes + j];
}
}
for (i_layer = 0; i_layer < vp->bgM; i_layer++) {
current_var_index = these_var_indices[i_layer][0];
this_check_to_var = vp->check_to_var + i_layer * (vp->hrr + 1) * vp->n_subnodes;
for (i = 0; (current_var_index != -1) && (i < MAX_CNCT); i++) {
current_var_index_subnode = current_var_index * vp->n_subnodes;
for (j = 0; j < vp->n_subnodes; j++) {
i_bit_tmp_base = (current_var_index <= vp->hrr) ? current_var_index : vp->hrr;
i_bit_subnode = i_bit_tmp_base * vp->n_subnodes + j;
tmp_epi8 = _mm512_adds_epi8(this_check_to_var[i_bit_subnode], vp->soft_bits[current_var_index_subnode + j].v);
mask_epi8 = _mm512_cmpgt_epi8_mask(tmp_epi8, _mm512_infty7_epi8);
tmp_epi8 = _mm512_mask_blend_epi8(mask_epi8, tmp_epi8, _mm512_infty8_epi8);
mask_epi8 = _mm512_cmpgt_epi8_mask(_mm512_neg_infty7_epi8, tmp_epi8);
vp->soft_bits[current_var_index_subnode + j].v =
_mm512_mask_blend_epi8(mask_epi8, tmp_epi8, _mm512_neg_infty8_epi8);
}
current_var_index = these_var_indices[i_layer][i + 1];
}
}
return 0;
}
int extract_ldpc_message_c_avx512long_flood(void* p, uint8_t* message, uint16_t liftK)
{
if (p == NULL) {
return -1;
}
struct ldpc_regs_c_avx512long_flood* vp = p;
int j = 0;
int k = 0;
for (int i = 0; i < liftK / vp->ls; i++) {
for (j = 0; j < vp->n_subnodes; j++) {
for (k = 0; (k < SRSLTE_AVX512_B_SIZE) && (j * SRSLTE_AVX512_B_SIZE + k < vp->ls); k++) {
message[i * vp->ls + j * SRSLTE_AVX512_B_SIZE + k] = (vp->soft_bits[i * vp->n_subnodes + j].c[k] < 0);
}
}
}
return 0;
}
static void
inner_var_to_check_c_avx512(const __m512i* x, const __m512i* y, __m512i* z, const uint8_t clip, const uint32_t len)
{
unsigned i = 0;
__m512i x_epi8;
__m512i y_epi8;
__m512i z_epi8;
__mmask64 mask_epi8;
__m512i help_sub_epi8;
__m512i clip_epi8 = _mm512_set1_epi8(clip);
__m512i neg_clip_epi8 = _mm512_set1_epi8((char)(-clip));
// len = number of subnodes of size __m512
for (i = 0; i < len; i++) {
x_epi8 = x[i];
y_epi8 = y[i];
help_sub_epi8 = _mm512_subs_epi8(x_epi8, y_epi8); // x-y
mask_epi8 = _mm512_cmpgt_epi8_mask(help_sub_epi8, clip_epi8); // saturate to clip insteaof inifinty8
z_epi8 = _mm512_mask_blend_epi8(mask_epi8, help_sub_epi8, clip_epi8);
mask_epi8 = _mm512_cmpgt_epi8_mask(neg_clip_epi8, z_epi8);
z_epi8 = _mm512_mask_blend_epi8(mask_epi8, z_epi8, neg_clip_epi8);
mask_epi8 = _mm512_cmpgt_epi8_mask(_mm512_infty8_epi8, x_epi8);
z_epi8 = _mm512_mask_blend_epi8(mask_epi8, _mm512_infty8_epi8, z_epi8);
mask_epi8 = _mm512_cmpgt_epi8_mask(x_epi8, _mm512_neg_infty8_epi8);
z[i] = _mm512_mask_blend_epi8(mask_epi8, _mm512_neg_infty8_epi8, z_epi8);
}
}
static void rotate_node_right(const uint8_t* mem_addr, __m512i* out, uint16_t this_shift, uint16_t ls)
{
uint16_t shift = 0;
uint16_t _shift = 0;
uint64_t mask1 = 0;
uint64_t mask2 = 0;
const __m512i MZERO = _mm512_set1_epi8(0);
// the part in the middle - we simply copy.
int j = 0;
int jj = 0;
// copy full avx512 registers from this_shift_2
for (j = this_shift; j <= ls - SRSLTE_AVX512_B_SIZE; j = j + SRSLTE_AVX512_B_SIZE) {
out[jj] = _mm512_loadu_si512(mem_addr + j);
jj = jj + 1;
}
// if the last is broken, take _shift bits from the end and "shift" bits from the begin.
if (ls > j) {
_shift = ls - j;
shift = SRSLTE_AVX512_B_SIZE - _shift;
mask1 = (1ULL << _shift) - 1; // i.e. 000001111 _shift =4
mask2 = (1ULL << shift) - 1;
mask2 = mask2 << _shift; // i.e. 000110000 shift = 2, _shift = 4
out[jj] = _mm512_mask_loadu_epi8(MZERO, mask1, mem_addr + j);
out[jj] = _mm512_mask_loadu_epi8(out[jj], mask2, mem_addr - _shift);
jj = jj + 1;
}
// copy full avx512 registers from the start of mem_addr,
for (j = shift; j < this_shift; j = j + SRSLTE_AVX512_B_SIZE) {
out[jj] = _mm512_loadu_si512(mem_addr + j); // the excess is filled with something arbitrary
jj = jj + 1;
}
}
static __m512i _mm512_scalei_epi8(__m512i a, __m512i sf)
{
__m512i even_epi16 = _mm512_and_si512(a, _mm512_mask_even_epi8);
__m512i odd_epi16 = _mm512_srli_epi16(a, 8);
__m512i p_even_epi16 = _mm512_mulhi_epu16(even_epi16, sf);
__m512i p_odd_epi16 = _mm512_mulhi_epu16(odd_epi16, sf);
p_odd_epi16 = _mm512_slli_epi16(p_odd_epi16, 8);
return _mm512_xor_si512(p_even_epi16, p_odd_epi16);
}
#endif // LV_HAVE_AVX512

@ -23,6 +23,7 @@
#include <stdint.h>
#include "../utils_avx2.h"
#include "../utils_avx512.h"
#include "ldpc_dec_all.h"
#include "srslte/phy/fec/ldpc/base_graph.h"
#include "srslte/phy/fec/ldpc/ldpc_decoder.h"
@ -660,6 +661,250 @@ static int init_c_avx2long_flood(srslte_ldpc_decoder_t* q)
}
#endif // LV_HAVE_AVX2
// AVX512 Declarations
#ifdef LV_HAVE_AVX512
/*! Carries out the actual destruction of the memory allocated to the decoder, 8-bit-LLR case (AVX512 implementation).
*/
static void free_dec_c_avx512(void* o)
{
srslte_ldpc_decoder_t* q = o;
if (q->var_indices) {
free(q->var_indices);
}
if (q->pcm) {
free(q->pcm);
}
delete_ldpc_dec_c_avx512(q->ptr);
}
/*! Carries out the decoding with 8-bit integer-valued LLRs (AVX512 implementation). */
static int decode_c_avx512(void* o, const int8_t* llrs, uint8_t* message, uint32_t cdwd_rm_length)
{
srslte_ldpc_decoder_t* q = o;
// it must be smaller than the codeword size
if (cdwd_rm_length > q->liftN - 2 * q->ls) {
cdwd_rm_length = q->liftN - 2 * q->ls;
}
// We need at least q->bgK + 4 variable nodes to cover the high-rate region. However,
// 2 variable nodes are systematically punctured by the encoder.
if (cdwd_rm_length < (q->bgK + 2) * q->ls) {
// ERROR("The rate-matched codeword should have a length at least equal to the high-rate region.");
cdwd_rm_length = (q->bgK + 2) * q->ls;
// return -1;
}
if (cdwd_rm_length % q->ls) {
cdwd_rm_length = (cdwd_rm_length / q->ls + 1) * q->ls;
// ERROR("The rate-matched codeword length should be a multiple of the lifting size.");
// return -1;
}
init_ldpc_dec_c_avx512(q->ptr, llrs, q->ls);
uint16_t* this_pcm = NULL;
int8_t(*these_var_indices)[MAX_CNCT] = NULL;
// When computing the number of layers, we need to recall that the standard always removes
// the first two variable nodes from the final codeword.
uint8_t n_layers = cdwd_rm_length / q->ls - q->bgK + 2;
for (int i_iteration = 0; i_iteration < MAX_ITERATIONS; i_iteration++) {
for (int i_layer = 0; i_layer < n_layers; i_layer++) {
update_ldpc_var_to_check_c_avx512(q->ptr, i_layer);
this_pcm = q->pcm + i_layer * q->bgN;
these_var_indices = q->var_indices + i_layer;
update_ldpc_check_to_var_c_avx512(q->ptr, i_layer, this_pcm, these_var_indices);
update_ldpc_soft_bits_c_avx512(q->ptr, i_layer, these_var_indices);
}
}
extract_ldpc_message_c_avx512(q->ptr, message, q->liftK);
return 0;
}
/*! Initializes the decoder to work with 8-bit integer-valued LLRs (AVX512 implementation). */
static int init_c_avx512(srslte_ldpc_decoder_t* q)
{
q->free = free_dec_c_avx512;
if ((q->ptr = create_ldpc_dec_c_avx512(q->bgN, q->bgM, q->ls, q->scaling_fctr)) == NULL) {
ERROR("Create_ldpc_dec failed");
free_dec_c_avx512(q);
return -1;
}
q->decode_c = decode_c_avx512;
return 0;
}
/*! Carries out the actual destruction of the memory allocated to the decoder, 8-bit-LLR case (AVX512 implementation,
* large lifting size). */
static void free_dec_c_avx512long(void* o)
{
srslte_ldpc_decoder_t* q = o;
if (q->var_indices) {
free(q->var_indices);
}
if (q->pcm) {
free(q->pcm);
}
delete_ldpc_dec_c_avx512long(q->ptr);
}
/*! Carries out the decoding with 8-bit integer-valued LLRs (AVX512 implementation, large lifting size). */
static int decode_c_avx512long(void* o, const int8_t* llrs, uint8_t* message, uint32_t cdwd_rm_length)
{
srslte_ldpc_decoder_t* q = o;
// it must be smaller than the codeword size
if (cdwd_rm_length > q->liftN - 2 * q->ls) {
cdwd_rm_length = q->liftN - 2 * q->ls;
}
// We need at least q->bgK + 4 variable nodes to cover the high-rate region. However,
// 2 variable nodes are systematically punctured by the encoder.
if (cdwd_rm_length < (q->bgK + 2) * q->ls) {
// ERROR("The rate-matched codeword should have a length at least equal to the high-rate region.");
cdwd_rm_length = (q->bgK + 2) * q->ls;
// return -1;
}
if (cdwd_rm_length % q->ls) {
cdwd_rm_length = (cdwd_rm_length / q->ls + 1) * q->ls;
// ERROR("The rate-matched codeword length should be a multiple of the lifting size.");
// return -1;
}
init_ldpc_dec_c_avx512long(q->ptr, llrs, q->ls);
uint16_t* this_pcm = NULL;
int8_t(*these_var_indices)[MAX_CNCT] = NULL;
// When computing the number of layers, we need to recall that the standard always removes
// the first two variable nodes from the final codeword.
uint8_t n_layers = cdwd_rm_length / q->ls - q->bgK + 2;
for (int i_iteration = 0; i_iteration < MAX_ITERATIONS; i_iteration++) {
for (int i_layer = 0; i_layer < n_layers; i_layer++) {
update_ldpc_var_to_check_c_avx512long(q->ptr, i_layer);
this_pcm = q->pcm + i_layer * q->bgN;
these_var_indices = q->var_indices + i_layer;
update_ldpc_check_to_var_c_avx512long(q->ptr, i_layer, this_pcm, these_var_indices);
update_ldpc_soft_bits_c_avx512long(q->ptr, i_layer, these_var_indices);
}
}
extract_ldpc_message_c_avx512long(q->ptr, message, q->liftK);
return 0;
}
/*! Initializes the decoder to work with 8-bit integer-valued LLRs (AVX512 implementation, large lifting size). */
static int init_c_avx512long(srslte_ldpc_decoder_t* q)
{
q->free = free_dec_c_avx512long;
if ((q->ptr = create_ldpc_dec_c_avx512long(q->bgN, q->bgM, q->ls, q->scaling_fctr)) == NULL) {
ERROR("Create_ldpc_dec failed\n");
free_dec_c_avx512long(q);
return -1;
}
q->decode_c = decode_c_avx512long;
return 0;
}
/*! Carries out the actual destruction of the memory allocated to the decoder, 8-bit-LLR case
* (flooded scheduling, AVX512 implementation, large lifting size). */
static void free_dec_c_avx512long_flood(void* o)
{
srslte_ldpc_decoder_t* q = o;
if (q->var_indices) {
free(q->var_indices);
}
if (q->pcm) {
free(q->pcm);
}
delete_ldpc_dec_c_avx512long_flood(q->ptr);
}
/*! Carries out the decoding with 8-bit integer-valued LLRs (flooded scheduling, AVX512 implementation, large lifting
* size). */
static int decode_c_avx512long_flood(void* o, const int8_t* llrs, uint8_t* message, uint32_t cdwd_rm_length)
{
srslte_ldpc_decoder_t* q = o;
// it must be smaller than the codeword size
if (cdwd_rm_length > q->liftN - 2 * q->ls) {
cdwd_rm_length = q->liftN - 2 * q->ls;
}
// We need at least q->bgK + 4 variable nodes to cover the high-rate region. However,
// 2 variable nodes are systematically punctured by the encoder.
if (cdwd_rm_length < (q->bgK + 2) * q->ls) {
// ERROR("The rate-matched codeword should have a length at least equal to the high-rate region.");
cdwd_rm_length = (q->bgK + 2) * q->ls;
// return -1;
}
if (cdwd_rm_length % q->ls) {
cdwd_rm_length = (cdwd_rm_length / q->ls + 1) * q->ls;
// ERROR("The rate-matched codeword length should be a multiple of the lifting size.");
// return -1;
}
init_ldpc_dec_c_avx512long_flood(q->ptr, llrs, q->ls);
uint16_t* this_pcm = NULL;
int8_t(*these_var_indices)[MAX_CNCT] = NULL;
// When computing the number of layers, we need to recall that the standard always removes
// the first two variable nodes from the final codeword.
uint8_t n_layers = cdwd_rm_length / q->ls - q->bgK + 2;
for (int i_iteration = 0; i_iteration < 2 * MAX_ITERATIONS; i_iteration++) {
for (int i_layer = 0; i_layer < n_layers; i_layer++) {
update_ldpc_var_to_check_c_avx512long_flood(q->ptr, i_layer);
}
for (int i_layer = 0; i_layer < n_layers; i_layer++) {
this_pcm = q->pcm + i_layer * q->bgN;
these_var_indices = q->var_indices + i_layer;
update_ldpc_check_to_var_c_avx512long_flood(q->ptr, i_layer, this_pcm, these_var_indices);
}
update_ldpc_soft_bits_c_avx512long_flood(q->ptr, q->var_indices);
}
extract_ldpc_message_c_avx512long_flood(q->ptr, message, q->liftK);
return 0;
}
/*! Initializes the decoder to work with 8-bit integer-valued LLRs
* (flooded scheduling, AVX512 implementation, large lifting size). */
static int init_c_avx512long_flood(srslte_ldpc_decoder_t* q)
{
q->free = free_dec_c_avx512long_flood;
if ((q->ptr = create_ldpc_dec_c_avx512long_flood(q->bgN, q->bgM, q->ls, q->scaling_fctr)) == NULL) {
ERROR("Create_ldpc_dec failed");
free_dec_c_avx512long_flood(q);
return -1;
}
q->decode_c = decode_c_avx512long_flood;
return 0;
}
#endif // LV_HAVE_AVX512
int srslte_ldpc_decoder_init(srslte_ldpc_decoder_t* q,
srslte_ldpc_decoder_type_t type,
srslte_basegraph_t bg,
@ -745,6 +990,17 @@ int srslte_ldpc_decoder_init(srslte_ldpc_decoder_t* q,
return init_c_avx2long_flood(q);
}
#endif // LV_HAVE_AVX2
#ifdef LV_HAVE_AVX512
case SRSLTE_LDPC_DECODER_C_AVX512:
if (ls <= SRSLTE_AVX512_B_SIZE) {
return init_c_avx512(q);
} else {
return init_c_avx512long(q);
}
case SRSLTE_LDPC_DECODER_C_AVX512_FLOOD:
return init_c_avx512long_flood(q);
#endif // LV_HAVE_AVX2
default:
ERROR("Unknown decoder.");
return -1;

@ -63,7 +63,7 @@ void encode_high_rate_case4(void* o, uint8_t* output);
void encode_ext_region(srslte_ldpc_encoder_t* q, uint8_t* output, uint8_t n_layers);
/*!
* Creates the inner registers required by the optimized LDPC encoder (LS <= \ref SRSLTE_AVX2_B_SIZE).
* Creates the inner registers required by the optimized LDPC encoder (LS <= \ref SRSLTE_AVX512_B_SIZE).
* \param[in,out] q A pointer to an encoder.
* \return A pointer to the newly created structure of registers.
*/
@ -118,7 +118,8 @@ void encode_high_rate_case2_avx2(void* o);
void encode_high_rate_case3_avx2(void* o);
/*! Computes the high-rate parity bits for BG2 and ls_index in {3, 7} (SIMD-optimized version, LS <= \ref
* SRSLTE_AVX2_B_SIZE). \param[in,out] o A pointer to an encoder.
* SRSLTE_AVX2_B_SIZE).
* \param[in,out] o A pointer to an encoder.
*/
void encode_high_rate_case4_avx2(void* o);
@ -199,4 +200,212 @@ void encode_high_rate_case4_avx2long(void* o);
*/
void encode_ext_region_avx2long(srslte_ldpc_encoder_t* q, uint8_t n_layers);
/*!
* Creates the inner registers required by the optimized LDPC encoder (LS <= \ref SRSLTE_AVX2_B_SIZE).
* \param[in,out] q A pointer to an encoder.
* \return A pointer to the newly created structure of registers.
*/
void* create_ldpc_enc_avx2(srslte_ldpc_encoder_t* q);
/*!
* Deletes the inner registers of an optimized LDPC encoder (LS <= \ref SRSLTE_AVX2_B_SIZE).
* \param[in] p A pointer to the register structure.
*/
void delete_ldpc_enc_avx2(void* p);
/*!
* Loads the message in the opimized encoder registers (LS <= \ref SRSLTE_AVX2_B_SIZE).
* \param[in] p The register structure.
* \param[in] input The message to encode.
* \param[in] msg_len Number of variable nodes in one message.
* \param[in] cdwd_len Number of variable nodes in one message.
* \param[in] ls The lifting size.
* \return Error code: 0 if correct, -1 otherwise.
*/
int load_avx2(void* p, const uint8_t* input, uint8_t msg_len, uint8_t cdwd_len, uint16_t ls);
/*! Extracts the final codeword from the optimized encoder registers (LS <= \ref SRSLTE_AVX2_B_SIZE).
* \param[in] p The register structure.
* \param[out] output The output codeword.
* \param[in] cdwd_len The number of variable nodes (after rate-matching, if enabled).
* \param[in] ls The lifting size.
* \return Error code: 0 if correct, -1 otherwise.
*/
int return_codeword_avx2(void* p, uint8_t* output, uint8_t cdwd_len, uint16_t ls);
/*! Computes the product between the first (K - 2) columns of the PCM and the
* systematic bits (SIMD-optimized version, LS <= \ref SRSLTE_AVX2_B_SIZE).
* \param[in,out] q A pointer to an encoder.
*/
void preprocess_systematic_bits_avx2(srslte_ldpc_encoder_t* q);
/*! Computes the high-rate parity bits for BG1 and ls_index in {0, 1, 2, 3, 4, 5, 7}
* (SIMD-optimized version, LS <= \ref SRSLTE_AVX2_B_SIZE).
* \param[in,out] o A pointer to an encoder.
*/
void encode_high_rate_case1_avx2(void* o);
/*! Computes the high-rate parity bits for BG1 and ls_index in {6} (SIMD-optimized version, LS <= \ref
* SRSLTE_AVX2_B_SIZE). \param[in,out] o A pointer to an encoder.
*/
void encode_high_rate_case2_avx2(void* o);
/*! Computes the high-rate parity bits for BG2 and ls_index in {0, 1, 2, 4, 5, 6} (SIMD-optimized version, LS <= \ref
* SRSLTE_AVX2_B_SIZE). \param[in,out] o A pointer to an encoder.
*/
void encode_high_rate_case3_avx2(void* o);
/*! Computes the high-rate parity bits for BG2 and ls_index in {3, 7} (SIMD-optimized version, LS <= \ref
* SRSLTE_AVX2_B_SIZE).
\param[in,out] o A pointer to an encoder.
*/
void encode_high_rate_case4_avx2(void* o);
/*! Computes the extended-region parity bits (SIMD-optimized version, LS <= \ref SRSLTE_AVX2_B_SIZE).
* \param[in,out] q A pointer to an encoder.
* \param[in] n_layers The number of layers to process (when doing rate matching not all
* layers are needed).
*/
void encode_ext_region_avx2(srslte_ldpc_encoder_t* q, uint8_t n_layers);
/*!
* Creates the inner registers required by the optimized LDPC encoder (for LS > \ref SRSLTE_avx512_B_SIZE).
* \param[in,out] q A pointer to an encoder.
* \return A pointer to the newly created structure of registers.
*/
void* create_ldpc_enc_avx512long(srslte_ldpc_encoder_t* q);
/*!
* Deletes the inner registers of an optimized LDPC encoder (LS > \ref SRSLTE_avx512_B_SIZE).
* \param[in] p A pointer to the register structure.
*/
void delete_ldpc_enc_avx512long(void* p);
/*!
* Loads the message in the optimized encoder registers (LS > \ref SRSLTE_avx512_B_SIZE).
* \param[in] p The register structure.
* \param[in] input The message to encode.
* \param[in] msg_len Number of variable nodes in one message.
* \param[in] cdwd_len Number of variable nodes in one message.
* \param[in] ls The lifting size.
* \return Error code: 0 if correct, -1 otherwise.
*/
int load_avx512long(void* p, const uint8_t* input, uint8_t msg_len, uint8_t cdwd_len, uint16_t ls);
/*! Extracts the final codeword from the optimized encoder registers (LS > \ref SRSLTE_avx512_B_SIZE).
* \param[in] p The register structure.
* \param[out] output The output codeword.
* \param[in] cdwd_len The number of variable nodes (after rate-matching, if enabled).
* \param[in] ls The lifting size.
* \return Error code: 0 if correct, -1 otherwise.
*/
int return_codeword_avx512long(void* p, uint8_t* output, uint8_t cdwd_len, uint16_t ls);
/*! Computes the product between the first (K - 2) columns of the PCM and the
* systematic bits (SIMD-optimized version, LS > \ref SRSLTE_avx512_B_SIZE).
* \param[in,out] q A pointer to an encoder.
*/
void preprocess_systematic_bits_avx512long(srslte_ldpc_encoder_t* q);
/*! Computes the high-rate parity bits for BG1 and ls_index in {0, 1, 2, 3, 4, 5, 7}
* (SIMD-optimized version, LS > \ref SRSLTE_avx512_B_SIZE).
* \param[in,out] o A pointer to an encoder.
*/
void encode_high_rate_case1_avx512long(void* o);
/*! Computes the high-rate parity bits for BG1 and ls_index in {6} (SIMD-optimized version, LS > \ref
* SRSLTE_avx512_B_SIZE).
* \param[in,out] o A pointer to an encoder.
*/
void encode_high_rate_case2_avx512long(void* o);
/*! Computes the high-rate parity bits for BG2 and ls_index in {0, 1, 2, 4, 5, 6} (SIMD-optimized version, LS > \ref
* SRSLTE_avx512_B_SIZE).
* \param[in,out] o A pointer to an encoder.
*/
void encode_high_rate_case3_avx512long(void* o);
/*! Computes the high-rate parity bits for BG2 and ls_index in {3, 7} (SIMD-optimized version, LS > \ref
* SRSLTE_avx512_B_SIZE).
* \param[in,out] o A pointer to an encoder.
*/
void encode_high_rate_case4_avx512long(void* o);
/*! Computes the extended-region parity bits (SIMD-optimized version, LS > \ref SRSLTE_AVX512_B_SIZE).
* \param[in,out] q A pointer to an encoder.
* \param[in] n_layers The number of layers to process (when doing rate matching not all
* layers are needed).
*/
void encode_ext_region_avx512long(srslte_ldpc_encoder_t* q, uint8_t n_layers);
/*!
* Creates the inner registers required by the optimized LDPC encoder (for LS <= \ref SRSLTE_AVX512_B_SIZE).
* \param[in,out] q A pointer to an encoder.
* \return A pointer to the newly created structure of registers.
*/
void* create_ldpc_enc_avx512(srslte_ldpc_encoder_t* q);
/*!
* Deletes the inner registers of an optimized LDPC encoder (LS <= \ref SRSLTE_AVX512_B_SIZE).
* \param[in] p A pointer to the register structure.
*/
void delete_ldpc_enc_avx512(void* p);
/*!
* Loads the message in the optimized encoder registers (LS <= \ref SRSLTE_AVX512_B_SIZE).
* \param[in] p The register structure.
* \param[in] input The message to encode.
* \param[in] msg_len Number of variable nodes in one message.
* \param[in] cdwd_len Number of variable nodes in one message.
* \param[in] ls The lifting size.
* \return Error code: 0 if correct, -1 otherwise.
*/
int load_avx512(void* p, const uint8_t* input, uint8_t msg_len, uint8_t cdwd_len, uint16_t ls);
/*! Extracts the final codeword from the optimized encoder registers (LS <= \ref SRSLTE_AVX512_B_SIZE).
* \param[in] p The register structure.
* \param[out] output The output codeword.
* \param[in] cdwd_len The number of variable nodes (after rate-matching, if enabled).
* \param[in] ls The lifting size.
* \return Error code: 0 if correct, -1 otherwise.
*/
int return_codeword_avx512(void* p, uint8_t* output, uint8_t cdwd_len, uint16_t ls);
/*! Computes the product between the first (K - 2) columns of the PCM and the
* systematic bits (SIMD-optimized version, LS <= \ref SRSLTE_AVX512_B_SIZE).
* \param[in,out] q A pointer to an encoder.
*/
void preprocess_systematic_bits_avx512(srslte_ldpc_encoder_t* q);
/*! Computes the high-rate parity bits for BG1 and ls_index in {0, 1, 2, 3, 4, 5, 7}
* (SIMD-optimized version, LS <= \ref SRSLTE_AVX512_B_SIZE).
* \param[in,out] o A pointer to an encoder.
*/
void encode_high_rate_case1_avx512(void* o);
/*! Computes the high-rate parity bits for BG1 and ls_index in {6} (SIMD-optimized version, LS <= \ref
* SRSLTE_AVX512_B_SIZE).
* \param[in,out] o A pointer to an encoder.
*/
void encode_high_rate_case2_avx512(void* o);
/*! Computes the high-rate parity bits for BG2 and ls_index in {0, 1, 2, 4, 5, 6} (SIMD-optimized version, LS > \ref
* SRSLTE_AVX512_B_SIZE).
* \param[in,out] o A pointer to an encoder.
*/
void encode_high_rate_case3_avx512(void* o);
/*! Computes the high-rate parity bits for BG2 and ls_index in {3, 7} (SIMD-optimized version, LS <= \ref
* SRSLTE_AVX512_B_SIZE).
* \param[in,out] o A pointer to an encoder.
*/
void encode_high_rate_case4_avx512(void* o);
/*! Computes the extended-region parity bits (SIMD-optimized version, LS <= \ref SRSLTE_AVX512_B_SIZE).
* \param[in,out] q A pointer to an encoder.
* \param[in] n_layers The number of layers to process (when doing rate matching not all
* layers are needed).
*/
void encode_ext_region_avx512(srslte_ldpc_encoder_t* q, uint8_t n_layers);
#endif // SRSLTE_LDPCENC_ALL_H

@ -132,16 +132,18 @@ int load_avx2(void* p, const uint8_t* input, const uint8_t msg_len, const uint8_
return -1;
}
int i = 0;
int k = 0;
for (; i < msg_len; i++) {
for (k = 0; k < ls; k++) {
vp->codeword[i].c[k] = input[i * ls + k];
int ini = 0;
int node_size = SRSLTE_AVX2_B_SIZE;
for (int i = 0; i < msg_len * ls; i = i + ls) {
for (int k = 0; k < ls; k++) {
vp->codeword->c[ini + k] = input[i + k];
}
bzero(&(vp->codeword[i].c[k]), (SRSLTE_AVX2_B_SIZE - k) * sizeof(uint8_t));
// this zero padding can be removed
bzero(&(vp->codeword->c[ini + ls]), (node_size - ls) * sizeof(uint8_t));
ini = ini + node_size;
}
bzero(vp->codeword + i, (cdwd_len - msg_len) * sizeof(__m256i));
bzero(vp->codeword + msg_len, (cdwd_len - msg_len) * sizeof(__m256i));
return 0;
}
@ -154,11 +156,12 @@ int return_codeword_avx2(void* p, uint8_t* output, const uint8_t cdwd_len, const
return -1;
}
int k = 0;
for (int i = 0; i < cdwd_len - 2; i++) {
for (k = 0; k < ls; k++) {
output[i * ls + k] = vp->codeword[i + 2].c[k];
int ini = SRSLTE_AVX2_B_SIZE + SRSLTE_AVX2_B_SIZE;
for (int i = 0; i < (cdwd_len - 2) * ls; i = i + ls) {
for (int k = 0; k < ls; k++) {
output[i + k] = vp->codeword->c[ini + k];
}
ini = ini + SRSLTE_AVX2_B_SIZE;
}
return 0;
}

@ -13,7 +13,7 @@
/*!
* \file ldpc_enc_avx2long.c
* \brief Definition of the LDPC encoder inner functions (AVX2 version, large lifting size).
* \author David Gregoratti
* \author David Gregoratti and Jesus Gómez
* \date 2020
*
* \copyright Software Radio Systems Limited
@ -55,6 +55,7 @@ struct ldpc_enc_avx2long {
__m256i* rotated_node_to_free; /*!< \brief Auxiliary pointer to store rotated versions of the nodes with extra free
memory of size SRSLTE_AVX2_B_SIZE previous to rotated_node */
uint8_t n_subnodes; /*!< \brief Number of subnodes. */
uint16_t node_size; /*!< \brief Size of a node in bytes. */
};
/*!
@ -100,6 +101,7 @@ void* create_ldpc_enc_avx2long(srslte_ldpc_encoder_t* q)
}
vp->rotated_node = &vp->rotated_node_to_free[1];
vp->node_size = SRSLTE_AVX2_B_SIZE * vp->n_subnodes;
return vp;
}
@ -123,23 +125,19 @@ int load_avx2long(void* p, const uint8_t* input, const uint8_t msg_len, const ui
return -1;
}
int k = 0;
int j = 0;
int i = 0;
for (; i < msg_len; i++) {
for (j = 0; j < vp->n_subnodes - 1; j++) {
for (k = 0; k < SRSLTE_AVX2_B_SIZE; k++) {
vp->codeword[i * vp->n_subnodes + j].c[k] = input[i * ls + j * SRSLTE_AVX2_B_SIZE + k];
}
int ini = 0;
int node_size = vp->node_size;
for (int i = 0; i < msg_len * ls; i = i + ls) {
for (int k = 0; k < ls; k++) {
vp->codeword->c[ini + k] = input[i + k];
}
// j is now equal to (vp->n_subnodes - 1)
for (k = 0; k < ls - j * SRSLTE_AVX2_B_SIZE; k++) {
vp->codeword[i * vp->n_subnodes + j].c[k] = input[i * ls + j * SRSLTE_AVX2_B_SIZE + k];
}
bzero(&(vp->codeword[i * vp->n_subnodes + j].c[k]), (SRSLTE_AVX2_B_SIZE - k) * sizeof(uint8_t));
// this zero padding can be removed
bzero(&(vp->codeword->c[ini + ls]), (node_size - ls) * sizeof(uint8_t));
ini = ini + node_size;
}
bzero(vp->codeword + i * vp->n_subnodes, (cdwd_len - msg_len) * vp->n_subnodes * sizeof(__m256i));
bzero(vp->codeword + msg_len * vp->n_subnodes, (cdwd_len - msg_len) * vp->n_subnodes * sizeof(__m256i));
return 0;
}
@ -151,18 +149,12 @@ int return_codeword_avx2long(void* p, uint8_t* output, const uint8_t cdwd_len, c
return -1;
}
int k = 0;
int j = 0;
for (int i = 0; i < cdwd_len - 2; i++) {
for (j = 0; j < vp->n_subnodes - 1; j++) {
for (k = 0; k < SRSLTE_AVX2_B_SIZE; k++) {
output[i * ls + j * SRSLTE_AVX2_B_SIZE + k] = vp->codeword[(i + 2) * vp->n_subnodes + j].c[k];
}
}
// j is now equal to vp->n_subndes-1
for (k = 0; k < ls - j * SRSLTE_AVX2_B_SIZE; k++) {
output[i * ls + j * SRSLTE_AVX2_B_SIZE + k] = vp->codeword[(i + 2) * vp->n_subnodes + j].c[k];
int ini = vp->node_size + vp->node_size;
for (int i = 0; i < (cdwd_len - 2) * ls; i = i + ls) {
for (int k = 0; k < ls; k++) {
output[i + k] = vp->codeword->c[ini + k];
}
ini = ini + vp->node_size;
}
return 0;
}

@ -0,0 +1,361 @@
/**
*
* \section COPYRIGHT
*
* Copyright 2013-2020 Software Radio Systems Limited
*
* By using this file, you agree to the terms and conditions set
* forth in the LICENSE file which can be found at the top level of
* the distribution.
*
*/
/*!
* \file ldpc_enc_avx512.c
* \brief Definition of the LDPC encoder inner functions (AVX512 version, small (<64) lifting size).
* \author Jesus Gomez
* \date 2021
*
* \copyright Software Radio Systems Limited
*
*/
#include <stdint.h>
#include "../utils_avx512.h"
#include "ldpc_enc_all.h"
#include "srslte/phy/fec/ldpc/base_graph.h"
#include "srslte/phy/fec/ldpc/ldpc_encoder.h"
#include "srslte/phy/utils/debug.h"
#include "srslte/phy/utils/vector.h"
#ifdef LV_HAVE_AVX512
#include <immintrin.h>
#include "ldpc_avx512_consts.h"
/*!
* \brief Represents a node of the base factor graph.
*/
typedef union bg_node_avx512_t {
uint8_t c[SRSLTE_AVX512_B_SIZE]; /*!< Each base node may contain up to \ref SRSLTE_AVX512_B_SIZE lifted nodes. */
__m512i v; /*!< All the lifted nodes of the current base node as a 512-bit line. */
} bg_node_avx512_t;
/*!
* \brief Inner registers for the optimized LDPC encoder.
*/
struct ldpc_enc_avx512 {
bg_node_avx512_t* codeword; /*!< \brief Contains the entire codeword, before puncturing. */
bg_node_avx512_t* codeword_to_free; /*!< \brief Auxiliary pointer with a free memory of size SRSLTE_AVX512_B_SIZE
previous to codeword. */
__m512i* aux; /*!< \brief Auxiliary register. */
__m512i* rotated_node; /*!< \brief To store rotated versions of the nodes. */
__m512i* rotated_node_to_free; /*!< \brief Auxiliary pointer to store rotated versions of the nodes with extra free
memory of size SRSLTE_AVX512_B_SIZE previous to rotated_node */
};
/*!
* Rotate the contents of a node towards the right by \b shift chars, that is the
* \b shift * 8 most significant bits become the least significant ones.
* \param[in] mem_addr Address to the node to rotate.
* \param[out] out The rotated node.
* \param[in] shift The order of the rotation in number of chars.
* \param[in] ls The size of the node (lifting size).
*/
static void rotate_node_right(const uint8_t* mem_addr, __m512i* out, uint16_t this_shift2, uint16_t ls);
void* create_ldpc_enc_avx512(srslte_ldpc_encoder_t* q)
{
struct ldpc_enc_avx512* vp = NULL;
if ((vp = malloc(sizeof(struct ldpc_enc_avx512))) == NULL) {
return NULL;
}
if ((vp->codeword_to_free = srslte_vec_malloc((q->bgN + 1) * sizeof(bg_node_avx512_t))) == NULL) {
free(vp);
return NULL;
}
vp->codeword = &vp->codeword_to_free[1];
if ((vp->aux = srslte_vec_malloc(q->bgM * sizeof(__m512i))) == NULL) {
free(vp->codeword_to_free);
free(vp);
return NULL;
}
if ((vp->rotated_node_to_free = srslte_vec_malloc((1 + 2) * sizeof(__m512i))) == NULL) {
free(vp->aux);
free(vp->codeword_to_free);
free(vp);
return NULL;
}
vp->rotated_node = &vp->rotated_node_to_free[1];
return vp;
}
void delete_ldpc_enc_avx512(void* p)
{
struct ldpc_enc_avx512* vp = p;
if (vp != NULL) {
free(vp->rotated_node_to_free);
free(vp->aux);
free(vp->codeword_to_free);
free(vp);
}
}
int load_avx512(void* p, const uint8_t* input, const uint8_t msg_len, const uint8_t cdwd_len, const uint16_t ls)
{
struct ldpc_enc_avx512* vp = p;
if (p == NULL) {
return -1;
}
int i = 0;
int k = 0;
for (; i < msg_len; i++) {
for (k = 0; k < ls; k++) {
vp->codeword[i].c[k] = input[i * ls + k];
}
// This zero padding might be remove
bzero(&(vp->codeword[i].c[k]), (SRSLTE_AVX512_B_SIZE - k) * sizeof(uint8_t));
}
bzero(vp->codeword + i, (cdwd_len - msg_len) * sizeof(__m512i));
return 0;
}
int return_codeword_avx512(void* p, uint8_t* output, const uint8_t cdwd_len, const uint16_t ls)
{
struct ldpc_enc_avx512* vp = p;
if (p == NULL) {
return -1;
}
int k = 0;
for (int i = 0; i < cdwd_len - 2; i++) {
for (k = 0; k < ls; k++) {
output[i * ls + k] = vp->codeword[i + 2].c[k];
}
}
return 0;
}
void encode_ext_region_avx512(srslte_ldpc_encoder_t* q, uint8_t n_layers)
{
struct ldpc_enc_avx512* vp = q->ptr;
int m = 0;
int skip = 0;
int k = 0;
uint16_t* this_shift = NULL;
__m512i tmp_epi8;
// Encode the extended region. In case of puncturing or IR-HARQ, we could focus on
// specific check nodes instead of processing all of them from m = 4 to m = M - 1.
for (m = 4; m < n_layers; m++) {
skip = q->bgK + m;
// the systematic part has already been computed
vp->codeword[skip].v = vp->aux[m];
// sum the contribution due to the high-rate region, with the proper circular shifts
for (k = 0; k < 4; k++) {
this_shift = q->pcm + q->bgK + k + m * q->bgN;
if (*this_shift != NO_CNCT) {
rotate_node_right(vp->codeword[q->bgK + k].c, &tmp_epi8, *this_shift, q->ls);
vp->codeword[skip].v = _mm512_xor_si512(vp->codeword[skip].v, tmp_epi8);
}
}
}
}
void preprocess_systematic_bits_avx512(srslte_ldpc_encoder_t* q)
{
struct ldpc_enc_avx512* vp = q->ptr;
int N = q->bgN;
int K = q->bgK;
int M = q->bgM;
int ls = q->ls;
uint16_t* pcm = q->pcm;
int k = 0;
int m = 0;
uint16_t* this_shift = NULL;
__m512i tmp_epi8;
bzero(vp->aux, M * sizeof(__m512i));
// split the input message into K chunks of ls bits each and, for all chunks
for (k = 0; k < K; k++) {
// for all check nodes
// NB: if looking for performance you can do the following loop only over the high-rate
// region of the PCM (m=0,1,2,3) and over the check nodes that result in a transmitted
// coded bit after puncturing or IR-HARQ (see Deliverable D1 Section 3.4).
for (m = 0; m < M; m++) {
// entry of pcm corresponding to the current input chunk and the current check node
this_shift = pcm + k + m * N;
// xor array aux[m] with a circularly shifted version of the current input chunk, unless
// the current check node and variable node are not connected.
if (*this_shift != NO_CNCT) {
rotate_node_right(vp->codeword[k].c, &tmp_epi8, *this_shift, ls);
tmp_epi8 = _mm512_and_si512(tmp_epi8, _mm512_one_epi8);
vp->aux[m] = _mm512_xor_si512(vp->aux[m], tmp_epi8);
}
}
}
}
void encode_high_rate_case1_avx512(void* o)
{
srslte_ldpc_encoder_t* q = o;
struct ldpc_enc_avx512* vp = q->ptr;
int ls = q->ls;
int skip0 = q->bgK;
int skip1 = q->bgK + 1;
int skip2 = q->bgK + 2;
int skip3 = q->bgK + 3;
// first chunk of parity bits
vp->codeword[skip0].v = _mm512_xor_si512(vp->aux[0], vp->aux[1]);
vp->codeword[skip0].v = _mm512_xor_si512(vp->codeword[skip0].v, vp->aux[2]);
vp->codeword[skip0].v = _mm512_xor_si512(vp->codeword[skip0].v, vp->aux[3]);
__m512i tmp_epi8;
rotate_node_right(vp->codeword[skip0].c, &tmp_epi8, 1, ls);
// second chunk of parity bits
vp->codeword[skip1].v = _mm512_xor_si512(vp->aux[0], tmp_epi8);
// fourth chunk of parity bits
vp->codeword[skip3].v = _mm512_xor_si512(vp->aux[3], tmp_epi8);
// third chunk of parity bits
vp->codeword[skip2].v = _mm512_xor_si512(vp->aux[2], vp->codeword[skip3].v);
}
void encode_high_rate_case2_avx512(void* o)
{
srslte_ldpc_encoder_t* q = o;
struct ldpc_enc_avx512* vp = q->ptr;
int ls = q->ls;
int skip0 = q->bgK;
int skip1 = q->bgK + 1;
int skip2 = q->bgK + 2;
int skip3 = q->bgK + 3;
// first chunk of parity bits
__m512i* tmp_epi8 = vp->rotated_node;
*tmp_epi8 = _mm512_xor_si512(vp->aux[0], vp->aux[1]);
*tmp_epi8 = _mm512_xor_si512(*tmp_epi8, vp->aux[2]);
*tmp_epi8 = _mm512_xor_si512(*tmp_epi8, vp->aux[3]);
rotate_node_right((uint8_t*)tmp_epi8, &(vp->codeword[skip0].v), ls - 105 % ls, ls);
// second chunk of parity bits
vp->codeword[skip1].v = _mm512_xor_si512(vp->aux[0], vp->codeword[skip0].v);
// fourth chunk of parity bits
vp->codeword[skip3].v = _mm512_xor_si512(vp->aux[3], vp->codeword[skip0].v);
// third chunk of parity bits
vp->codeword[skip2].v = _mm512_xor_si512(vp->aux[2], vp->codeword[skip3].v);
}
void encode_high_rate_case3_avx512(void* o)
{
srslte_ldpc_encoder_t* q = o;
struct ldpc_enc_avx512* vp = q->ptr;
int ls = q->ls;
int skip0 = q->bgK;
int skip1 = q->bgK + 1;
int skip2 = q->bgK + 2;
int skip3 = q->bgK + 3;
// first chunk of parity bits
__m512i* tmp_epi8 = vp->rotated_node;
*tmp_epi8 = _mm512_xor_si512(vp->aux[0], vp->aux[1]);
*tmp_epi8 = _mm512_xor_si512(*tmp_epi8, vp->aux[2]);
*tmp_epi8 = _mm512_xor_si512(*tmp_epi8, vp->aux[3]);
rotate_node_right((uint8_t*)tmp_epi8, &(vp->codeword[skip0].v), ls - 1, ls);
// second chunk of parity bits
vp->codeword[skip1].v = _mm512_xor_si512(vp->aux[0], vp->codeword[skip0].v);
// third chunk of parity bits
vp->codeword[skip2].v = _mm512_xor_si512(vp->aux[1], vp->codeword[skip1].v);
// fourth chunk of parity bits
vp->codeword[skip3].v = _mm512_xor_si512(vp->aux[3], vp->codeword[skip0].v);
}
void encode_high_rate_case4_avx512(void* o)
{
srslte_ldpc_encoder_t* q = o;
struct ldpc_enc_avx512* vp = q->ptr;
int ls = q->ls;
int skip0 = q->bgK;
int skip1 = q->bgK + 1;
int skip2 = q->bgK + 2;
int skip3 = q->bgK + 3;
// first chunk of parity bits
vp->codeword[skip0].v = _mm512_xor_si512(vp->aux[0], vp->aux[1]);
vp->codeword[skip0].v = _mm512_xor_si512(vp->codeword[skip0].v, vp->aux[2]);
vp->codeword[skip0].v = _mm512_xor_si512(vp->codeword[skip0].v, vp->aux[3]);
__m512i tmp_epi8;
rotate_node_right(vp->codeword[skip0].c, &tmp_epi8, 1, ls);
// second chunk of parity bits
vp->codeword[skip1].v = _mm512_xor_si512(vp->aux[0], tmp_epi8);
// third chunk of parity bits
vp->codeword[skip2].v = _mm512_xor_si512(vp->aux[1], vp->codeword[skip1].v);
// fourth chunk of parity bits
vp->codeword[skip3].v = _mm512_xor_si512(vp->aux[3], tmp_epi8);
}
static void rotate_node_right(const uint8_t* mem_addr, __m512i* out, uint16_t this_shift2, uint16_t ls)
{
const __m512i MZERO = _mm512_set1_epi8(0);
uint16_t shift = 0;
uint16_t _shift = 0;
uint64_t mask1 = 0;
uint64_t mask2 = 0;
if (this_shift2 == 0) {
out[0] = _mm512_loadu_si512(mem_addr);
} else { // if the last is broken, take _shift bits from the end and "shift" bits from the begin.
_shift = ls - this_shift2;
shift = SRSLTE_AVX512_B_SIZE - _shift;
mask1 = (1ULL << _shift) - 1; // i.e. 000001111 _shift =4
mask2 = (1ULL << shift) - 1;
mask2 = mask2 << _shift; // i.e. 000110000 shift = 2, _shift = 4
out[0] = _mm512_mask_loadu_epi8(MZERO, mask1, mem_addr + this_shift2);
out[0] = _mm512_mask_loadu_epi8(out[0], mask2, mem_addr - _shift);
}
}
#endif // LV_HAVE_AVX512

@ -0,0 +1,407 @@
/**
*
* \section COPYRIGHT
*
* Copyright 2013-2020 Software Radio Systems Limited
*
* By using this file, you agree to the terms and conditions set
* forth in the LICENSE file which can be found at the top level of
* the distribution.
*
*/
/*!
* \file ldpc_enc_avx512long.c
* \brief Definition of the LDPC encoder inner functions (AVX512 version, large (>64) lifting size).
* \author Jesus Gomez
* \date 2021
*
* \copyright Software Radio Systems Limited
*
*/
#include <stdint.h>
#include "../utils_avx512.h"
#include "ldpc_enc_all.h"
#include "srslte/phy/fec/ldpc/base_graph.h"
#include "srslte/phy/fec/ldpc/ldpc_encoder.h"
#include "srslte/phy/utils/debug.h"
#include "srslte/phy/utils/vector.h"
#ifdef LV_HAVE_AVX512
#include <immintrin.h>
#include "ldpc_avx512_consts.h"
/*!
* \brief Represents a node of the base factor graph.
*/
typedef union bg_node_avx512_t {
uint8_t c[SRSLTE_AVX512_B_SIZE]; /*!< Each base node may contain up to \ref SRSLTE_AVX512_B_SIZE lifted nodes. */
__m512i v; /*!< All the lifted nodes of the current base node as a 512-bit line. */
} bg_node_avx512_t;
/*!
* \brief Inner registers for the optimized LDPC encoder.
*/
struct ldpc_enc_avx512long {
bg_node_avx512_t* codeword; /*!< \brief Contains the entire codeword, before puncturing. */
bg_node_avx512_t* codeword_to_free; /*!< \brief Auxiliary pointer with a free memory of size SRSLTE_AVX512_B_SIZE
previous to codeword. */
__m512i* aux; /*!< \brief Auxiliary register. */
__m512i* rotated_node; /*!< \brief To store rotated versions of the nodes. */
__m512i* rotated_node_to_free; /*!< \brief Auxiliary pointer to store rotated versions of the nodes with extra free
memory of size SRSLTE_AVX512_B_SIZE previous to rotated_node */
uint8_t n_subnodes; /*!< \brief Number of subnodes. */
uint16_t node_size; /*!> \brief Size of a node in bytes. */
};
/*!
* Rotate the contents of a node towards the right by \b shift chars, that is the
* \b shift * 8 most significant bits become the least significant ones.
* \param[in] mem_addr The address to the node to rotate.
* \param[out] out The rotated node.
* \param[in] shift The order of the rotation in number of chars.
* \param[in] ls The size of the node (lifting size).
*/
static void rotate_node_right(const uint8_t* mem_addr, __m512i* out, uint16_t this_shift2, uint16_t ls);
void* create_ldpc_enc_avx512long(srslte_ldpc_encoder_t* q)
{
struct ldpc_enc_avx512long* vp = NULL;
if ((vp = malloc(sizeof(struct ldpc_enc_avx512long))) == NULL) {
return NULL;
}
int left_out = q->ls % SRSLTE_AVX512_B_SIZE;
vp->n_subnodes = q->ls / SRSLTE_AVX512_B_SIZE + (left_out > 0);
if ((vp->codeword_to_free = srslte_vec_malloc((q->bgN * vp->n_subnodes + 1) * sizeof(bg_node_avx512_t))) == NULL) {
free(vp);
return NULL;
}
vp->codeword = &vp->codeword_to_free[1];
if ((vp->aux = srslte_vec_malloc(q->bgM * vp->n_subnodes * sizeof(__m512i))) == NULL) {
free(vp->codeword_to_free);
free(vp);
return NULL;
}
if ((vp->rotated_node_to_free = srslte_vec_malloc((vp->n_subnodes + 2) * sizeof(__m512i))) == NULL) {
free(vp->aux);
free(vp->codeword_to_free);
free(vp);
return NULL;
}
vp->rotated_node = &vp->rotated_node_to_free[1];
vp->node_size = SRSLTE_AVX512_B_SIZE * vp->n_subnodes;
return vp;
}
void delete_ldpc_enc_avx512long(void* p)
{
struct ldpc_enc_avx512long* vp = p;
if (vp != NULL) {
free(vp->rotated_node_to_free);
free(vp->aux);
free(vp->codeword_to_free);
free(vp);
}
}
int load_avx512long(void* p, const uint8_t* input, const uint8_t msg_len, const uint8_t cdwd_len, const uint16_t ls)
{
struct ldpc_enc_avx512long* vp = p;
if (p == NULL) {
return -1;
}
int ini = 0;
int node_size = vp->node_size;
for (int i = 0; i < msg_len * ls; i = i + ls) {
for (int k = 0; k < ls; k++) {
vp->codeword->c[ini + k] = input[i + k];
}
// this zero padding is not really necessary
bzero(&(vp->codeword->c[ini + ls]), (node_size - ls) * sizeof(uint8_t));
ini = ini + node_size;
}
bzero(vp->codeword + msg_len * vp->n_subnodes, (cdwd_len - msg_len) * vp->n_subnodes * sizeof(__m512i));
return 0;
}
int return_codeword_avx512long(void* p, uint8_t* output, const uint8_t cdwd_len, const uint16_t ls)
{
struct ldpc_enc_avx512long* vp = p;
if (p == NULL) {
return -1;
}
int ini = vp->node_size + vp->node_size;
for (int i = 0; i < (cdwd_len - 2) * ls; i = i + ls) {
for (int k = 0; k < ls; k++) {
output[i + k] = vp->codeword->c[ini + k];
}
ini = ini + vp->node_size;
}
return 0;
}
void encode_ext_region_avx512long(srslte_ldpc_encoder_t* q, uint8_t n_layers)
{
struct ldpc_enc_avx512long* vp = q->ptr;
int m = 0;
int skip = 0;
int k = 0;
int j = 0;
uint16_t* this_shift = NULL;
// Encode the extended region. In case of puncturing or IR-HARQ, we could focus on
// specific check nodes instead of processing all of them from m = 4 to m = M - 1.
for (m = 4; m < n_layers; m++) {
skip = (q->bgK + m) * vp->n_subnodes;
// the systematic part has already been computed
for (j = 0; j < vp->n_subnodes; j++) {
vp->codeword[skip + j].v = vp->aux[m * vp->n_subnodes + j];
}
// sum the contribution due to the high-rate region, with the proper circular shifts
for (k = 0; k < 4; k++) {
this_shift = q->pcm + q->bgK + k + m * q->bgN;
// xor array aux[m] with a circularly shifted version of the current input chunk, unless
// the current check node and variable node are not connected.
if (*this_shift != NO_CNCT) {
rotate_node_right(vp->codeword[(q->bgK + k) * vp->n_subnodes].c, vp->rotated_node, *this_shift, q->ls);
for (j = 0; j < vp->n_subnodes; j++) {
vp->codeword[skip + j].v = _mm512_xor_si512(vp->codeword[skip + j].v, vp->rotated_node[j]);
}
}
}
}
}
void preprocess_systematic_bits_avx512long(srslte_ldpc_encoder_t* q)
{
struct ldpc_enc_avx512long* vp = q->ptr;
int N = q->bgN;
int K = q->bgK;
int M = q->bgM;
int ls = q->ls;
uint16_t* pcm = q->pcm;
int k = 0;
int m = 0;
int j = 0;
uint16_t* this_shift = NULL;
__m512i tmp_epi8_avx512;
bzero(vp->aux, M * vp->n_subnodes * sizeof(__m512i));
// split the input message into K chunks of ls bits each and, for all chunks
for (k = 0; k < K; k++) {
// for all check nodes
// NB: if looking for performance you can do the following loop only over the high-rate
// region of the PCM (m=0,1,2,3) and over the check nodes that result in a transmitted
// coded bit after puncturing or IR-HARQ (see Deliverable D1 Section 3.4).
for (m = 0; m < M; m++) {
// entry of pcm corresponding to the current input chunk and the current check node
this_shift = pcm + k + m * N;
// xor array aux[m] with a circularly shifted version of the current input chunk, unless
// the current check node and variable node are not connected.
if (*this_shift != NO_CNCT) {
rotate_node_right(vp->codeword[k * vp->n_subnodes].c, vp->rotated_node, *this_shift, ls);
for (j = 0; j < vp->n_subnodes; j++) {
tmp_epi8_avx512 = _mm512_and_si512(vp->rotated_node[j], _mm512_one_epi8);
vp->aux[m * vp->n_subnodes + j] = _mm512_xor_si512(vp->aux[m * vp->n_subnodes + j], tmp_epi8_avx512);
}
}
} // m
} // k
}
void encode_high_rate_case1_avx512long(void* o)
{
srslte_ldpc_encoder_t* q = o;
struct ldpc_enc_avx512long* vp = q->ptr;
int ls = q->ls;
int j = 0;
int skip0 = q->bgK * vp->n_subnodes;
int skip1 = (q->bgK + 1) * vp->n_subnodes;
int skip2 = (q->bgK + 2) * vp->n_subnodes;
int skip3 = (q->bgK + 3) * vp->n_subnodes;
// first chunk of parity bits
for (j = 0; j < vp->n_subnodes; j++) {
vp->codeword[skip0 + j].v = _mm512_xor_si512(vp->aux[j], vp->aux[vp->n_subnodes + j]);
vp->codeword[skip0 + j].v = _mm512_xor_si512(vp->codeword[skip0 + j].v, vp->aux[2 * vp->n_subnodes + j]);
vp->codeword[skip0 + j].v = _mm512_xor_si512(vp->codeword[skip0 + j].v, vp->aux[3 * vp->n_subnodes + j]);
}
rotate_node_right(vp->codeword[skip0].c, vp->rotated_node, 1, ls);
for (j = 0; j < vp->n_subnodes; j++) {
// second chunk of parity bits
vp->codeword[skip1 + j].v = _mm512_xor_si512(vp->aux[j], vp->rotated_node[j]);
// fourth chunk of parity bits
vp->codeword[skip3 + j].v = _mm512_xor_si512(vp->aux[3 * vp->n_subnodes + j], vp->rotated_node[j]);
// third chunk of parity bits
vp->codeword[skip2 + j].v = _mm512_xor_si512(vp->aux[2 * vp->n_subnodes + j], vp->codeword[skip3 + j].v);
}
}
void encode_high_rate_case2_avx512long(void* o)
{
srslte_ldpc_encoder_t* q = o;
struct ldpc_enc_avx512long* vp = q->ptr;
int ls = q->ls;
int j = 0;
int skip0 = q->bgK * vp->n_subnodes;
int skip1 = (q->bgK + 1) * vp->n_subnodes;
int skip2 = (q->bgK + 2) * vp->n_subnodes;
int skip3 = (q->bgK + 3) * vp->n_subnodes;
// first chunk of parity bits
for (j = 0; j < vp->n_subnodes; j++) {
vp->rotated_node[j] = _mm512_xor_si512(vp->aux[j], vp->aux[vp->n_subnodes + j]);
vp->rotated_node[j] = _mm512_xor_si512(vp->rotated_node[j], vp->aux[2 * vp->n_subnodes + j]);
vp->rotated_node[j] = _mm512_xor_si512(vp->rotated_node[j], vp->aux[3 * vp->n_subnodes + j]);
}
rotate_node_right((uint8_t*)vp->rotated_node, &(vp->codeword[skip0].v), ls - 105 % ls, ls);
for (j = 0; j < vp->n_subnodes; j++) {
// second chunk of parity bits
vp->codeword[skip1 + j].v = _mm512_xor_si512(vp->aux[j], vp->codeword[skip0 + j].v);
// fourth chunk of parity bits
vp->codeword[skip3 + j].v = _mm512_xor_si512(vp->aux[3 * vp->n_subnodes + j], vp->codeword[skip0 + j].v);
// third chunk of parity bits
vp->codeword[skip2 + j].v = _mm512_xor_si512(vp->aux[2 * vp->n_subnodes + j], vp->codeword[skip3 + j].v);
}
}
void encode_high_rate_case3_avx512long(void* o)
{
srslte_ldpc_encoder_t* q = o;
struct ldpc_enc_avx512long* vp = q->ptr;
int ls = q->ls;
int j = 0;
int skip0 = q->bgK * vp->n_subnodes;
int skip1 = (q->bgK + 1) * vp->n_subnodes;
int skip2 = (q->bgK + 2) * vp->n_subnodes;
int skip3 = (q->bgK + 3) * vp->n_subnodes;
// first chunk of parity bits
for (j = 0; j < vp->n_subnodes; j++) {
vp->rotated_node[j] = _mm512_xor_si512(vp->aux[j], vp->aux[vp->n_subnodes + j]);
vp->rotated_node[j] = _mm512_xor_si512(vp->rotated_node[j], vp->aux[2 * vp->n_subnodes + j]);
vp->rotated_node[j] = _mm512_xor_si512(vp->rotated_node[j], vp->aux[3 * vp->n_subnodes + j]);
}
rotate_node_right((uint8_t*)vp->rotated_node, &(vp->codeword[skip0].v), ls - 1, ls);
for (j = 0; j < vp->n_subnodes; j++) {
// second chunk of parity bits
vp->codeword[skip1 + j].v = _mm512_xor_si512(vp->aux[j], vp->codeword[skip0 + j].v);
// third chunk of parity bits
vp->codeword[skip2 + j].v = _mm512_xor_si512(vp->aux[vp->n_subnodes + j], vp->codeword[skip1 + j].v);
// fourth chunk of parity bits
vp->codeword[skip3 + j].v = _mm512_xor_si512(vp->aux[3 * vp->n_subnodes + j], vp->codeword[skip0 + j].v);
}
}
void encode_high_rate_case4_avx512long(void* o)
{
srslte_ldpc_encoder_t* q = o;
struct ldpc_enc_avx512long* vp = q->ptr;
int ls = q->ls;
int j = 0;
int skip0 = q->bgK * vp->n_subnodes;
int skip1 = (q->bgK + 1) * vp->n_subnodes;
int skip2 = (q->bgK + 2) * vp->n_subnodes;
int skip3 = (q->bgK + 3) * vp->n_subnodes;
// first chunk of parity bits
for (j = 0; j < vp->n_subnodes; j++) {
vp->codeword[skip0 + j].v = _mm512_xor_si512(vp->aux[j], vp->aux[vp->n_subnodes + j]);
vp->codeword[skip0 + j].v = _mm512_xor_si512(vp->codeword[skip0 + j].v, vp->aux[2 * vp->n_subnodes + j]);
vp->codeword[skip0 + j].v = _mm512_xor_si512(vp->codeword[skip0 + j].v, vp->aux[3 * vp->n_subnodes + j]);
}
rotate_node_right(vp->codeword[skip0].c, vp->rotated_node, 1, ls);
for (j = 0; j < vp->n_subnodes; j++) {
// second chunk of parity bits
vp->codeword[skip1 + j].v = _mm512_xor_si512(vp->aux[j], vp->rotated_node[j]);
// third chunk of parity bits
vp->codeword[skip2 + j].v = _mm512_xor_si512(vp->aux[vp->n_subnodes + j], vp->codeword[skip1 + j].v);
// fourth chunk of parity bits
vp->codeword[skip3 + j].v = _mm512_xor_si512(vp->aux[3 * vp->n_subnodes + j], vp->rotated_node[j]);
}
}
static void rotate_node_right(const uint8_t* mem_addr, __m512i* out, uint16_t this_shift2, uint16_t ls)
{
uint16_t shift = 0;
uint16_t _shift = 0;
uint64_t mask1 = 0;
uint64_t mask2 = 0;
const __m512i MZERO = _mm512_set1_epi8(0);
// the part in the middle - we simply copy.
int j = 0;
int jj = 0;
// copy full avx512 registers from this_shift_2
for (j = this_shift2; j <= ls - SRSLTE_AVX512_B_SIZE; j = j + SRSLTE_AVX512_B_SIZE) {
out[jj] = _mm512_loadu_si512(mem_addr + j);
jj = jj + 1;
}
// if the last is broken, take _shift bits from the end and "shift" bits from the begin.
if (ls > j) {
_shift = ls - j;
shift = SRSLTE_AVX512_B_SIZE - _shift;
mask1 = (1ULL << _shift) - 1; // i.e. 000001111 _shift =4
mask2 = (1ULL << shift) - 1;
mask2 = mask2 << _shift; // i.e. 000110000 shift = 2, _shift = 4
out[jj] = _mm512_mask_loadu_epi8(MZERO, mask1, mem_addr + j);
out[jj] = _mm512_mask_loadu_epi8(out[jj], mask2, mem_addr - _shift);
jj = jj + 1;
}
// copy full avx512 registers from the start of mem_addr, fill with zeros after out is filled with ls
for (j = shift; j < this_shift2; j = j + SRSLTE_AVX512_B_SIZE) {
out[jj] = _mm512_loadu_si512(mem_addr + j); // the exes is filled with something arbitrary
jj = jj + 1;
}
}
#endif // LV_HAVE_AVX512

@ -13,7 +13,7 @@
/*!
* \file ldpc_encoder.c
* \brief Definition of the LDPC encoder.
* \author David Gregoratti
* \author David Gregoratti and Jesus Gomez
* \date 2020
*
* \copyright Software Radio Systems Limited
@ -23,6 +23,7 @@
#include <stdint.h>
#include "../utils_avx2.h"
#include "../utils_avx512.h"
#include "ldpc_enc_all.h"
#include "srslte/phy/fec/ldpc/base_graph.h"
#include "srslte/phy/fec/ldpc/ldpc_encoder.h"
@ -310,6 +311,194 @@ static int init_avx2long(srslte_ldpc_encoder_t* q)
#endif
#ifdef LV_HAVE_AVX512
/*! Carries out the actual destruction of the memory allocated to the encoder. */
static void free_enc_avx512(void* o)
{
srslte_ldpc_encoder_t* q = o;
if (q->pcm) {
free(q->pcm);
}
if (q->ptr) {
delete_ldpc_enc_avx512(q->ptr);
}
}
/*! Carries out the actual encoding with an optimized encoder. */
static int encode_avx512(void* o, const uint8_t* input, uint8_t* output, uint32_t input_length, uint32_t cdwd_rm_length)
{
srslte_ldpc_encoder_t* q = o;
if (input_length / q->bgK != q->ls) {
perror("Dimension mismatch.\n");
return -1;
}
// it must be smaller than the codeword size
if (cdwd_rm_length > q->liftN - 2 * q->ls) {
cdwd_rm_length = q->liftN - 2 * q->ls;
}
// We need at least q->bgK + 4 variable nodes to cover the high-rate region. However,
// 2 variable nodes are systematically punctured by the encoder.
if (cdwd_rm_length < (q->bgK + 2) * q->ls) {
// ERROR("The rate-matched codeword should have a length at least equal to the high-rate region.");
cdwd_rm_length = (q->bgK + 2) * q->ls;
// return -1;
}
if (cdwd_rm_length % q->ls) {
cdwd_rm_length = (cdwd_rm_length / q->ls + 1) * q->ls;
// ERROR("The rate-matched codeword length should be a multiple of the lifting size.");
// return -1;
}
load_avx512(q->ptr, input, q->bgK, q->bgN, q->ls);
preprocess_systematic_bits_avx512(q);
q->encode_high_rate_avx512(q);
// When computing the number of layers, we need to recall that the standard always removes
// the first two variable nodes from the final codeword.
uint8_t n_layers = cdwd_rm_length / q->ls - q->bgK + 2;
encode_ext_region_avx512(q, n_layers);
return_codeword_avx512(q->ptr, output, n_layers + q->bgK, q->ls);
return 0;
}
/*! Initializes an optimized encoder. */
static int init_avx512(srslte_ldpc_encoder_t* q)
{
int ls_index = get_ls_index(q->ls);
if (ls_index == VOID_LIFTSIZE) {
ERROR("Invalid lifting size %d\n", q->ls);
return -1;
}
if (q->bg == BG1 && ls_index != 6) {
q->encode_high_rate_avx512 = encode_high_rate_case1_avx512;
} else if (q->bg == BG1 && ls_index == 6) {
q->encode_high_rate_avx512 = encode_high_rate_case2_avx512;
} else if (q->bg == BG2 && ls_index != 3 && ls_index != 7) {
q->encode_high_rate_avx512 = encode_high_rate_case3_avx512;
} else if (q->bg == BG2 && (ls_index == 3 || ls_index == 7)) {
q->encode_high_rate_avx512 = encode_high_rate_case4_avx512;
} else {
ERROR("Invalid lifting size %d and/or Base Graph %d\n", q->ls, q->bg + 1);
return -1;
}
q->free = free_enc_avx512;
if ((q->ptr = create_ldpc_enc_avx512(q)) == NULL) {
perror("Create_ldpc_enc\n");
free_enc_avx512(q);
return -1;
}
q->encode = encode_avx512;
return 0;
}
/*! Carries out the actual destruction of the memory allocated to the encoder. */
static void free_enc_avx512long(void* o)
{
srslte_ldpc_encoder_t* q = o;
if (q->pcm) {
free(q->pcm);
}
if (q->ptr) {
delete_ldpc_enc_avx512long(q->ptr);
}
}
/*! Carries out the actual encoding with an optimized encoder. */
static int
encode_avx512long(void* o, const uint8_t* input, uint8_t* output, uint32_t input_length, uint32_t cdwd_rm_length)
{
srslte_ldpc_encoder_t* q = o;
if (input_length / q->bgK != q->ls) {
perror("Dimension mismatch.\n");
return -1;
}
// it must be smaller than the codeword size
if (cdwd_rm_length > q->liftN - 2 * q->ls) {
cdwd_rm_length = q->liftN - 2 * q->ls;
}
// We need at least q->bgK + 4 variable nodes to cover the high-rate region. However,
// 2 variable nodes are systematically punctured by the encoder.
if (cdwd_rm_length < (q->bgK + 2) * q->ls) {
// ERROR("The rate-matched codeword should have a length at least equal to the high-rate region.\n");
cdwd_rm_length = (q->bgK + 2) * q->ls;
// return -1;
}
if (cdwd_rm_length % q->ls) {
cdwd_rm_length = (cdwd_rm_length / q->ls + 1) * q->ls;
// ERROR("The rate-matched codeword length should be a multiple of the lifting size.\n");
// return -1;
}
load_avx512long(q->ptr, input, q->bgK, q->bgN, q->ls);
preprocess_systematic_bits_avx512long(q);
q->encode_high_rate_avx512(q);
// When computing the number of layers, we need to recall that the standard always removes
// the first two variable nodes from the final codeword.
uint8_t n_layers = cdwd_rm_length / q->ls - q->bgK + 2;
encode_ext_region_avx512long(q, n_layers);
return_codeword_avx512long(q->ptr, output, n_layers + q->bgK, q->ls);
return 0;
}
/*! Initializes an optimized encoder. */
static int init_avx512long(srslte_ldpc_encoder_t* q)
{
int ls_index = get_ls_index(q->ls);
if (ls_index == VOID_LIFTSIZE) {
ERROR("Invalid lifting size %d\n", q->ls);
return -1;
}
if (q->bg == BG1 && ls_index != 6) {
q->encode_high_rate_avx512 = encode_high_rate_case1_avx512long;
} else if (q->bg == BG1 && ls_index == 6) {
q->encode_high_rate_avx512 = encode_high_rate_case2_avx512long;
} else if (q->bg == BG2 && ls_index != 3 && ls_index != 7) {
q->encode_high_rate_avx512 = encode_high_rate_case3_avx512long;
} else if (q->bg == BG2 && (ls_index == 3 || ls_index == 7)) {
q->encode_high_rate_avx512 = encode_high_rate_case4_avx512long;
} else {
ERROR("Invalid lifting size %d and/or Base Graph %d\n", q->ls, q->bg + 1);
return -1;
}
q->free = free_enc_avx512long;
if ((q->ptr = create_ldpc_enc_avx512long(q)) == NULL) {
perror("Create_ldpc_enc\n");
free_enc_avx512long(q);
return -1;
}
q->encode = encode_avx512long;
return 0;
}
#endif
int srslte_ldpc_encoder_init(srslte_ldpc_encoder_t* q,
srslte_ldpc_encoder_type_t type,
srslte_basegraph_t bg,
@ -357,6 +546,14 @@ int srslte_ldpc_encoder_init(srslte_ldpc_encoder_t* q,
return init_avx2long(q);
}
#endif // LV_HAVE_AVX2
#ifdef LV_HAVE_AVX512
case SRSLTE_LDPC_ENCODER_AVX512:
if (ls <= SRSLTE_AVX512_B_SIZE) {
return init_avx512(q);
} else {
return init_avx512long(q);
}
#endif // LV_HAVE_AVX512
default:
return -1;
}

@ -37,6 +37,14 @@ if(HAVE_AVX2)
target_link_libraries(ldpc_dec_avx2_test srslte_phy)
endif(HAVE_AVX2)
if(HAVE_AVX512)
add_executable(ldpc_enc_avx512_test ldpc_enc_avx512_test.c)
target_link_libraries(ldpc_enc_avx512_test srslte_phy)
add_executable(ldpc_dec_avx512_test ldpc_dec_avx512_test.c)
target_link_libraries(ldpc_dec_avx512_test srslte_phy)
endif(HAVE_AVX512)
### Test LDPC libs
function(ldpc_unit_tests)
foreach(i IN LISTS ARGN)
@ -73,7 +81,56 @@ set(test_name LDPC-DEC-BG2)
set(test_command ldpc_dec_test -b2)
ldpc_unit_tests(${lifting_sizes})
add_nr_test(NAME LDPC-chain COMMAND ldpc_chain_test)
if (HAVE_AVX2)
set(test_name LDPC-ENC-AVX2-BG1)
set(test_command ldpc_enc_avx2_test -b1)
ldpc_unit_tests(${lifting_sizes})
set(test_name LDPC-ENC-AVX2-BG2)
set(test_command ldpc_enc_avx2_test -b2)
ldpc_unit_tests(${lifting_sizes})
set(test_name LDPC-DEC-AVX2-BG1)
set(test_command ldpc_dec_avx2_test -b1)
ldpc_unit_tests(${lifting_sizes})
set(test_name LDPC-DEC-AVX2-BG2)
set(test_command ldpc_enc_avx2_test -b2)
ldpc_unit_tests(${lifting_sizes})
endif (HAVE_AVX2)
if (HAVE_AVX512)
set(test_name LDPC-ENC-AVX512-BG1)
set(test_command ldpc_enc_avx512_test -b1)
ldpc_unit_tests(${lifting_sizes})
set(test_name LDPC-ENC-AVX512-BG2)
set(test_command ldpc_enc_avx512_test -b2)
ldpc_unit_tests(${lifting_sizes})
set(test_name LDPC-DEC-AVX512-BG1)
set(test_command ldpc_dec_avx512_test -b1)
ldpc_unit_tests(${lifting_sizes})
set(test_name LDPC-DEC-AVX512-BG2)
set(test_command ldpc_dec_avx512_test -b2)
ldpc_unit_tests(${lifting_sizes})
set(test_name LDPC-DEC-AVX512-FLOOD-BG1)
set(test_command ldpc_dec_avx512_test -x1 -b1)
ldpc_unit_tests(${lifting_sizes})
set(test_name LDPC-DEC-AVX512-FLOOD-BG2)
set(test_command ldpc_dec_avx512_test -x1 -b2)
ldpc_unit_tests(${lifting_sizes})
endif (HAVE_AVX512)
add_test(NAME LDPC-chain COMMAND ldpc_chain_test)
### Test LDPC Rate Matching UNIT tests
set(mod_order

@ -134,6 +134,8 @@ int main(int argc, char** argv)
uint8_t* messages_sim_c_flood = NULL;
uint8_t* messages_sim_avx = NULL;
uint8_t* messages_sim_avx_flood = NULL;
uint8_t* messages_sim_avx512 = NULL;
uint8_t* messages_sim_avx512_flood = NULL;
uint8_t* codewords = NULL;
float* symbols_rm = NULL;
float* symbols = NULL;
@ -147,6 +149,12 @@ int main(int argc, char** argv)
// create an LDPC encoder
srslte_ldpc_encoder_t encoder;
#ifdef LV_HAVE_AVX512
if (srslte_ldpc_encoder_init(&encoder, SRSLTE_LDPC_ENCODER_AVX512, base_graph, lift_size) != 0) {
perror("encoder init");
exit(-1);
}
#else
#ifdef LV_HAVE_AVX2
if (srslte_ldpc_encoder_init(&encoder, SRSLTE_LDPC_ENCODER_AVX2, base_graph, lift_size) != 0) {
perror("encoder init");
@ -158,6 +166,7 @@ int main(int argc, char** argv)
exit(-1);
}
#endif // LV_HAVE_AVX2
#endif // LV_HAVE_AVX512
// create an LDPC decoder (float)
srslte_ldpc_decoder_t decoder_f;
@ -200,11 +209,30 @@ int main(int argc, char** argv)
}
#endif // LV_HAVE_AVX2
#ifdef LV_HAVE_AVX512
// create an LDPC decoder (8 bit, AVX512 version)
srslte_ldpc_decoder_t decoder_avx512;
if (srslte_ldpc_decoder_init(&decoder_avx512, SRSLTE_LDPC_DECODER_C_AVX512, base_graph, lift_size, MS_SF) != 0) {
perror("decoder init");
exit(-1);
}
// create an LDPC decoder (8 bit, flooded scheduling, AVX512 version)
srslte_ldpc_decoder_t decoder_avx512_flood;
if (srslte_ldpc_decoder_init(
&decoder_avx512_flood, SRSLTE_LDPC_DECODER_C_AVX512_FLOOD, base_graph, lift_size, MS_SF) != 0) {
perror("decoder init");
exit(-1);
}
#endif // LV_HAVE_AVX512
// create a random generator
srslte_random_t random_gen = srslte_random_init(0);
uint32_t F = encoder.bgK - 5; // This value is arbitrary
finalK = encoder.liftK;
finalN = encoder.liftN - 2 * lift_size;
if (rm_length == 0) {
rm_length = finalN - F;
}
@ -226,9 +254,6 @@ int main(int argc, char** argv)
1.0 * (encoder.liftK - F) / rm_length);
printf("\n Signal-to-Noise Ratio -> %.2f dB\n", snr);
finalK = encoder.liftK;
finalN = encoder.liftN - 2 * lift_size;
messages_true = srslte_vec_u8_malloc(finalK * batch_size);
messages_sim_f = srslte_vec_u8_malloc(finalK * batch_size);
messages_sim_s = srslte_vec_u8_malloc(finalK * batch_size);
@ -236,13 +261,16 @@ int main(int argc, char** argv)
messages_sim_c_flood = srslte_vec_u8_malloc(finalK * batch_size);
messages_sim_avx = srslte_vec_u8_malloc(finalK * batch_size);
messages_sim_avx_flood = srslte_vec_u8_malloc(finalK * batch_size);
messages_sim_avx512 = srslte_vec_u8_malloc(finalK * batch_size);
messages_sim_avx512_flood = srslte_vec_u8_malloc(finalK * batch_size);
codewords = srslte_vec_u8_malloc(finalN * batch_size);
symbols_rm = srslte_vec_f_malloc((rm_length + F) * batch_size);
symbols = srslte_vec_f_malloc(finalN * batch_size);
symbols_s = srslte_vec_i16_malloc(finalN * batch_size);
symbols_c = srslte_vec_i8_malloc(finalN * batch_size);
if (!messages_true || !messages_sim_f || !messages_sim_s || !messages_sim_c || //
!messages_sim_avx || !messages_sim_c_flood || !messages_sim_avx_flood || //
!messages_sim_avx512 || !messages_sim_avx || !messages_sim_c_flood || !messages_sim_avx512_flood ||
!messages_sim_avx_flood || //
!codewords || !symbols || !symbols_s || !symbols_c) {
perror("malloc");
exit(-1);
@ -267,6 +295,12 @@ int main(int argc, char** argv)
int n_error_words_avx = 0;
int n_error_words_avx_flood = 0;
#endif // LV_HAVE_AVX2
#ifdef LV_HAVE_AVX512
double elapsed_time_dec_avx512 = 0;
int n_error_words_avx512 = 0;
double elapsed_time_dec_avx512_flood = 0;
int n_error_words_avx512_flood = 0;
#endif // lV_HAVE_AVX512
float noise_std_dev = srslte_convert_dB_to_amplitude(-snr);
@ -472,6 +506,51 @@ int main(int argc, char** argv)
}
}
#endif // LV_HAVE_AVX2
#ifdef LV_HAVE_AVX512
//////// Fixed point - 8 bit - AVX512 version
// Recover messages
gettimeofday(&t[1], NULL);
for (j = 0; j < batch_size; j++) {
srslte_ldpc_decoder_decode_rm_c(
&decoder_avx512, symbols_c + j * finalN, messages_sim_avx512 + j * finalK, n_useful_symbols);
}
gettimeofday(&t[2], NULL);
get_time_interval(t);
elapsed_time_dec_avx512 += t[0].tv_sec + 1e-6 * t[0].tv_usec;
for (i = 0; i < batch_size; i++) {
for (j = 0; j < finalK; j++) {
i_bit = i * finalK + j;
if (messages_sim_avx512[i_bit] != (1U & messages_true[i_bit])) {
n_error_words_avx512++;
break;
}
}
}
//////// Fixed point - 8 bit, flooded scheduling - AVX512 version
// Recover messages
gettimeofday(&t[1], NULL);
for (j = 0; j < batch_size; j++) {
srslte_ldpc_decoder_decode_rm_c(
&decoder_avx512_flood, symbols_c + j * finalN, messages_sim_avx512_flood + j * finalK, n_useful_symbols);
}
gettimeofday(&t[2], NULL);
get_time_interval(t);
elapsed_time_dec_avx512_flood += t[0].tv_sec + 1e-6 * t[0].tv_usec;
for (i = 0; i < batch_size; i++) {
for (j = 0; j < finalK; j++) {
i_bit = i * finalK + j;
if (messages_sim_avx512_flood[i_bit] != (1U & messages_true[i_bit])) {
n_error_words_avx512_flood++;
break;
}
}
}
#endif // LV_HAVE_AVX512
}
printf("\nEstimated throughput encoder:\n %e word/s\n %e bit/s (information)\n %e bit/s (encoded)\n",
@ -490,6 +569,15 @@ int main(int argc, char** argv)
"FIXED POINT (8 bits, flooded scheduling - AVX2)", i_batch, n_error_words_avx_flood, elapsed_time_dec_avx_flood);
#endif // LV_HAVE_AVX2
#ifdef LV_HAVE_AVX512
print_decoder("FIXED POINT (8 bits - AVX512)", i_batch, n_error_words_avx512, elapsed_time_dec_avx512);
print_decoder("FIXED POINT (8 bits, flooded scheduling - AVX512)",
i_batch,
n_error_words_avx512_flood,
elapsed_time_dec_avx512_flood);
#endif // LV_HAVE_AVX512
if (n_error_words_s > 10 * n_error_words_f) {
perror("16-bit performance too low!");
exit(-1);
@ -498,6 +586,17 @@ int main(int argc, char** argv)
perror("8-bit performance too low!");
exit(-1);
}
#ifdef LV_HAVE_AVX512
if (n_error_words_avx512 != n_error_words_avx) {
perror("The number of errors AVX512 and AVX2 differs !");
exit(-1);
}
if (n_error_words_avx512_flood != n_error_words_avx_flood) {
perror("The number of errors of flood AVX512 and AVX2 differs !");
exit(-1);
}
#endif // LV_HAVE_AVX512
printf("\nTest completed successfully!\n\n");
free(symbols_c);
@ -506,7 +605,9 @@ int main(int argc, char** argv)
free(codewords);
free(symbols_rm);
free(messages_sim_avx);
free(messages_sim_avx512);
free(messages_sim_avx_flood);
free(messages_sim_avx512_flood);
free(messages_sim_c_flood);
free(messages_sim_c);
free(messages_sim_s);
@ -516,6 +617,10 @@ int main(int argc, char** argv)
#ifdef LV_HAVE_AVX2
srslte_ldpc_decoder_free(&decoder_avx);
srslte_ldpc_decoder_free(&decoder_avx_flood);
#endif // LV_HAVE_AVX2
#ifdef LV_HAVE_AVX512
srslte_ldpc_decoder_free(&decoder_avx512);
srslte_ldpc_decoder_free(&decoder_avx512_flood);
#endif // LV_HAVE_AVX2
srslte_ldpc_decoder_free(&decoder_c_flood);
srslte_ldpc_decoder_free(&decoder_c);

@ -42,6 +42,7 @@ int finalN; /*!< \brief Number of coded bits (codeword
int scheduling = 0; /*!< \brief Message scheduling (0 for layered, 1 for flooded). */
#define NOF_MESSAGES 10 /*!< \brief Number of codewords in the test. */
static int nof_reps = 1; /*!< \brief Number of times tests are repeated (for computing throughput). */
/*!
* \brief Prints test help when a wrong parameter is passed as input.
@ -52,6 +53,7 @@ void usage(char* prog)
printf("\t-b Base Graph [(1 or 2) Default %d]\n", base_graph + 1);
printf("\t-l Lifting Size [Default %d]\n", lift_size);
printf("\t-x Scheduling [Default %c]\n", scheduling);
printf("\t-R Number of times tests are repeated (for computing throughput). [Default %d]\n", nof_reps);
}
/*!
@ -60,7 +62,7 @@ void usage(char* prog)
void parse_args(int argc, char** argv)
{
int opt = 0;
while ((opt = getopt(argc, argv, "b:l:x:")) != -1) {
while ((opt = getopt(argc, argv, "b:l:x:R:")) != -1) {
switch (opt) {
case 'b':
base_graph = (int)strtol(optarg, NULL, 10) - 1;
@ -71,6 +73,9 @@ void parse_args(int argc, char** argv)
case 'x':
scheduling = (int)strtol(optarg, NULL, 10);
break;
case 'R':
nof_reps = (int)strtol(optarg, NULL, 10);
break;
default:
usage(argv[0]);
exit(-1);
@ -138,6 +143,7 @@ int main(int argc, char** argv)
int8_t* symbols = NULL;
int i = 0;
int j = 0;
int l = 0;
FILE* ex_file = NULL;
char file_name[1000];
@ -195,14 +201,19 @@ int main(int argc, char** argv)
printf("\nDecoding test messages...\n");
struct timeval t[3];
gettimeofday(&t[1], NULL);
double elapsed_time = 0;
for (j = 0; j < NOF_MESSAGES; j++) {
printf(" codeword %d\n", j);
gettimeofday(&t[1], NULL);
for (l = 0; l < nof_reps; l++) {
srslte_ldpc_decoder_decode_rm_c(&decoder, symbols + j * finalN, messages_sim + j * finalK, finalN);
}
gettimeofday(&t[2], NULL);
get_time_interval(t);
double elapsed_time = t[0].tv_sec + 1e-6 * t[0].tv_usec;
elapsed_time += t[0].tv_sec + 1e-6 * t[0].tv_usec;
}
printf("Elapsed time: %e s\n", elapsed_time);
printf("\nVerifing results...\n");

@ -0,0 +1,238 @@
/**
*
* \section COPYRIGHT
*
* Copyright 2013-2020 Software Radio Systems Limited
*
* By using this file, you agree to the terms and conditions set
* forth in the LICENSE file which can be found at the top level of
* the distribution.
*
*/
/*!
* \file ldpc_dec_avx512_test.c
* \brief Unit test for the LDPC decoder working with 8-bit integer-valued LLRs (AVX512 implementation).
*
* It decodes a batch of example codewords and compares the resulting messages
* with the expected ones. Reference messages and codewords are provided in
* files **examplesBG1.dat** and **examplesBG2.dat**.
*
* Synopsis: **ldpc_dec_c_test [options]**
*
* Options:
* - **-b \<number\>** Base Graph (1 or 2. Default 1).
* - **-l \<number\>** Lifting Size (according to 5GNR standard. Default 2).
*/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include "srslte/phy/fec/ldpc/ldpc_common.h"
#include "srslte/phy/fec/ldpc/ldpc_decoder.h"
#include "srslte/phy/utils/debug.h"
srslte_basegraph_t base_graph = BG1; /*!< \brief Base Graph (BG1 or BG2). */
int lift_size = 2; /*!< \brief Lifting Size. */
int finalK; /*!< \brief Number of uncoded bits (message length). */
int finalN; /*!< \brief Number of coded bits (codeword length). */
int scheduling = 0; /*!< \brief Message scheduling (0 for layered, 1 for flooded). */
#define NOF_MESSAGES 10 /*!< \brief Number of codewords in the test. */
static int nof_reps = 1; /*!< \brief Number of times tests are repeated (for computing throughput). */
/*!
* \brief Prints test help when a wrong parameter is passed as input.
*/
void usage(char* prog)
{
printf("Usage: %s [-bX] [-lX]\n", prog);
printf("\t-b Base Graph [(1 or 2) Default %d]\n", base_graph + 1);
printf("\t-l Lifting Size [Default %d]\n", lift_size);
printf("\t-x Scheduling [Default %c]\n", scheduling);
printf("\t-R Number of times tests are repeated (for computing throughput). [Default %d]\n", nof_reps);
}
/*!
* \brief Parses the input line.
*/
void parse_args(int argc, char** argv)
{
int opt = 0;
while ((opt = getopt(argc, argv, "b:l:x:R:")) != -1) {
switch (opt) {
case 'b':
base_graph = (int)strtol(optarg, NULL, 10) - 1;
break;
case 'l':
lift_size = (int)strtol(optarg, NULL, 10);
break;
case 'x':
scheduling = (int)strtol(optarg, NULL, 10);
break;
case 'R':
nof_reps = (int)strtol(optarg, NULL, 10);
break;
default:
usage(argv[0]);
exit(-1);
}
}
}
/*!
* \brief Reads the example file.
*/
void get_examples(uint8_t* messages, //
uint8_t* codewords,
FILE* ex_file)
{
char mstr[15]; // message string
char cstr[15]; // codeword string
char tmp[15];
int i = 0;
int j = 0;
sprintf(mstr, "ls%dmsgs", lift_size);
sprintf(cstr, "ls%dcwds", lift_size);
do {
do {
tmp[0] = fgetc(ex_file);
} while (tmp[0] != 'l');
fscanf(ex_file, "%[^\n]", tmp + 1);
fgetc(ex_file); // discard newline
} while (strcmp(tmp, mstr) != 0);
// read messages
for (j = 0; j < NOF_MESSAGES; j++) {
for (i = 0; i < finalK; i++) {
int rc = fgetc(ex_file);
messages[j * finalK + i] = (uint8_t)(rc == '-' ? FILLER_BIT : rc - '0');
}
fgetc(ex_file); // discard newline
}
fscanf(ex_file, "%[^\n]", tmp);
if (strcmp(tmp, cstr) != 0) {
printf("Something went wrong while reading example file.\n");
exit(-1);
}
fgetc(ex_file); // discard newline
// read codewords
for (j = 0; j < NOF_MESSAGES; j++) {
for (i = 0; i < finalN; i++) {
int rc = fgetc(ex_file);
codewords[j * finalN + i] = (uint8_t)(rc == '-' ? FILLER_BIT : rc - '0');
}
fgetc(ex_file); // discard newline
}
}
/*!
* \brief Main test function.
*/
int main(int argc, char** argv)
{
uint8_t* messages_true = NULL;
uint8_t* messages_sim = NULL;
uint8_t* codewords = NULL;
int8_t* symbols = NULL;
int i = 0;
int j = 0;
int l = 0;
FILE* ex_file = NULL;
char file_name[1000];
parse_args(argc, argv);
srslte_ldpc_decoder_type_t dectype =
(scheduling == 0) ? SRSLTE_LDPC_DECODER_C_AVX512 : SRSLTE_LDPC_DECODER_C_AVX512_FLOOD;
// create an LDPC decoder
srslte_ldpc_decoder_t decoder;
if (srslte_ldpc_decoder_init(&decoder, dectype, base_graph, lift_size, 1) != 0) {
perror("decoder init");
exit(-1);
}
printf("Test LDPC decoder:\n");
printf(" Base Graph -> BG%d\n", decoder.bg + 1);
printf(" Lifting Size -> %d\n", decoder.ls);
printf(" Protograph -> M = %d, N = %d, K = %d\n", decoder.bgM, decoder.bgN, decoder.bgK);
printf(" Lifted graph -> M = %d, N = %d, K = %d\n", decoder.liftM, decoder.liftN, decoder.liftK);
printf(" Final code rate -> K/(N-2) = %d/%d = 1/%d\n",
decoder.liftK,
decoder.liftN - 2 * lift_size,
decoder.bg == BG1 ? 3 : 5);
printf(" Scheduling: %s\n", scheduling ? "flooded" : "layered");
finalK = decoder.liftK;
finalN = decoder.liftN - 2 * lift_size;
messages_true = malloc(finalK * NOF_MESSAGES * sizeof(uint8_t));
messages_sim = malloc(finalK * NOF_MESSAGES * sizeof(uint8_t));
codewords = malloc(finalN * NOF_MESSAGES * sizeof(uint8_t));
symbols = malloc(finalN * NOF_MESSAGES * sizeof(int8_t));
if (!messages_true || !messages_sim || !codewords || !symbols) {
perror("malloc");
exit(-1);
}
sprintf(file_name, "examplesBG%d.dat", base_graph + 1);
printf("\nReading example file %s...\n", file_name);
ex_file = fopen(file_name, "re");
if (ex_file == NULL) {
perror("fopen");
exit(-1);
}
get_examples(messages_true, codewords, ex_file);
fclose(ex_file);
for (i = 0; i < NOF_MESSAGES * finalN; i++) {
symbols[i] = codewords[i] == 1 ? -2 : 2;
}
printf("\nDecoding test messages...\n");
struct timeval t[3];
double elapsed_time = 0;
for (j = 0; j < NOF_MESSAGES; j++) {
printf(" codeword %d\n", j);
gettimeofday(&t[1], NULL);
for (l = 0; l < nof_reps; l++) {
srslte_ldpc_decoder_decode_rm_c(&decoder, symbols + j * finalN, messages_sim + j * finalK, finalN);
}
gettimeofday(&t[2], NULL);
get_time_interval(t);
elapsed_time += t[0].tv_sec + 1e-6 * t[0].tv_usec;
}
printf("Elapsed time: %e s\n", elapsed_time);
printf("\nVerifing results...\n");
for (i = 0; i < NOF_MESSAGES * finalK; i++) {
if ((1U & messages_sim[i]) != (1U & messages_true[i])) {
perror("wrong!!");
exit(-1);
}
}
printf("Estimated throughput:\n %e word/s\n %.3f Mbit/s (information)\n %.3f Mbit/s (encoded)\n",
NOF_MESSAGES / (elapsed_time / nof_reps),
NOF_MESSAGES * finalK / (elapsed_time / nof_reps) / 1e6,
NOF_MESSAGES * finalN / (elapsed_time / nof_reps) / 1e6);
printf("\nTest completed successfully!\n\n");
free(symbols);
free(codewords);
free(messages_sim);
free(messages_true);
srslte_ldpc_decoder_free(&decoder);
}

@ -0,0 +1,221 @@
/**
*
* \section COPYRIGHT
*
* Copyright 2013-2020 Software Radio Systems Limited
*
* By using this file, you agree to the terms and conditions set
* forth in the LICENSE file which can be found at the top level of
* the distribution.
*
*/
/*!
* \file ldpc_enc_avx512_test.c
* \brief Unit test for the LDPC encoder (SIMD-optimized version).
*
* It encodes a batch of example messages and compares the resulting codewords
* with the expected ones. Reference messages and codewords are provided in
* files **examplesBG1.dat** and **examplesBG2.dat**.
*
* Synopsis: **ldpc_enc_test [options]**
*
* Options:
* - **-b \<number\>** Base Graph (1 or 2. Default 1).
* - **-l \<number\>** Lifting Size (according to 5GNR standard. Default 2).
* - **-R \<number\>** Number of times tests are repeated (for computing throughput).
*/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include "srslte/phy/fec/ldpc/ldpc_common.h"
#include "srslte/phy/fec/ldpc/ldpc_encoder.h"
#include "srslte/phy/utils/debug.h"
srslte_basegraph_t base_graph = BG1; /*!< \brief Base Graph (BG1 or BG2). */
int lift_size = 2; /*!< \brief Lifting Size. */
int finalK; /*!< \brief Number of uncoded bits (message length). */
int finalN; /*!< \brief Number of coded bits (codeword length). */
#define NOF_MESSAGES 10 /*!< \brief Number of codewords in the test. */
static int nof_reps = 1; /*!< \brief Number of times tests are repeated (for computing throughput). */
/*!
* \brief Prints test help when a wrong parameter is passed as input.
*/
void usage(char* prog)
{
printf("Usage: %s [-bX] [-lX]\n", prog);
printf("\t-b Base Graph [(1 or 2) Default %d]\n", base_graph + 1);
printf("\t-l Lifting Size [Default %d]\n", lift_size);
printf("\t-R Number of times tests are repeated (for computing throughput). [Default %d]\n", nof_reps);
}
/*!
* \brief Parses the input line.
*/
void parse_args(int argc, char** argv)
{
int opt = 0;
while ((opt = getopt(argc, argv, "b:l:R:")) != -1) {
switch (opt) {
case 'b':
base_graph = (int)strtol(optarg, NULL, 10) - 1;
break;
case 'l':
lift_size = (int)strtol(optarg, NULL, 10);
break;
case 'R':
nof_reps = (int)strtol(optarg, NULL, 10);
break;
default:
usage(argv[0]);
exit(-1);
}
}
}
/*!
* \brief Reads the example file.
*/
void get_examples(uint8_t* messages, //
uint8_t* codewords,
FILE* ex_file)
{
char mstr[15]; // message string
char cstr[15]; // codeword string
char tmp[15];
int i = 0;
int j = 0;
sprintf(mstr, "ls%dmsgs", lift_size);
sprintf(cstr, "ls%dcwds", lift_size);
do {
do {
tmp[0] = fgetc(ex_file);
} while (tmp[0] != 'l');
fscanf(ex_file, "%[^\n]", tmp + 1);
fgetc(ex_file); // discard newline
} while (strcmp(tmp, mstr) != 0);
// read messages
for (j = 0; j < NOF_MESSAGES; j++) {
for (i = 0; i < finalK; i++) {
int rc = fgetc(ex_file);
messages[j * finalK + i] = (uint8_t)(rc == '-' ? FILLER_BIT : rc - '0');
}
fgetc(ex_file); // discard newline
}
fscanf(ex_file, "%[^\n]", tmp);
if (strcmp(tmp, cstr) != 0) {
printf("Something went wrong while reading example file.\n");
exit(-1);
}
fgetc(ex_file); // discard newline
// read codewords
for (j = 0; j < NOF_MESSAGES; j++) {
for (i = 0; i < finalN; i++) {
int rc = fgetc(ex_file);
codewords[j * finalN + i] = (uint8_t)(rc == '-' ? FILLER_BIT : rc - '0');
}
fgetc(ex_file); // discard newline
}
}
/*!
* \brief Main test function.
*/
int main(int argc, char** argv)
{
uint8_t* messages = NULL;
uint8_t* codewords_true = NULL;
uint8_t* codewords_sim = NULL;
int i = 0;
int j = 0;
int l = 0;
FILE* ex_file = NULL;
char file_name[1000];
parse_args(argc, argv);
// create an LDPC encoder
srslte_ldpc_encoder_t encoder;
if (srslte_ldpc_encoder_init(&encoder, SRSLTE_LDPC_ENCODER_AVX512, base_graph, lift_size) != 0) {
perror("encoder init");
exit(-1);
}
printf("Test LDPC encoder:\n");
printf(" Base Graph -> BG%d\n", encoder.bg + 1);
printf(" Lifting Size -> %d\n", encoder.ls);
printf(" Protograph -> M = %d, N = %d, K = %d\n", encoder.bgM, encoder.bgN, encoder.bgK);
printf(" Lifted graph -> M = %d, N = %d, K = %d\n", encoder.liftM, encoder.liftN, encoder.liftK);
printf(" Final code rate -> K/(N-2) = %d/%d = 1/%d\n",
encoder.liftK,
encoder.liftN - 2 * lift_size,
encoder.bg == BG1 ? 3 : 5);
finalK = encoder.liftK;
finalN = encoder.liftN - 2 * lift_size;
messages = malloc(finalK * NOF_MESSAGES * sizeof(uint8_t));
codewords_true = malloc(finalN * NOF_MESSAGES * sizeof(uint8_t));
codewords_sim = malloc(finalN * NOF_MESSAGES * sizeof(uint8_t));
if (!messages || !codewords_true || !codewords_sim) {
perror("malloc");
exit(-1);
}
sprintf(file_name, "examplesBG%d.dat", base_graph + 1);
printf("\nReading example file %s...\n", file_name);
ex_file = fopen(file_name, "re");
if (ex_file == NULL) {
perror("fopen");
exit(-1);
}
get_examples(messages, codewords_true, ex_file);
fclose(ex_file);
printf("\nEncoding test messages...\n");
struct timeval t[3];
double elapsed_time = 0;
for (j = 0; j < NOF_MESSAGES; j++) {
printf(" codeword %d\n", j);
gettimeofday(&t[1], NULL);
for (l = 0; l < nof_reps; l++) {
srslte_ldpc_encoder_encode_rm(&encoder, messages + j * finalK, codewords_sim + j * finalN, finalK, finalN);
}
gettimeofday(&t[2], NULL);
get_time_interval(t);
elapsed_time += t[0].tv_sec + 1e-6 * t[0].tv_usec;
}
printf("Elapsed time: %e s\n", elapsed_time / nof_reps);
printf("Estimated throughput:\n %e word/s\n %.3f Mbit/s (information)\n %.3f Mbit/s (encoded)\n",
NOF_MESSAGES / (elapsed_time / nof_reps),
NOF_MESSAGES * finalK / (elapsed_time / nof_reps) / 1e6,
NOF_MESSAGES * finalN / (elapsed_time / nof_reps) / 1e6);
printf("\nVerifing results...\n");
for (i = 0; i < NOF_MESSAGES * finalN; i++) {
if (codewords_sim[i] != codewords_true[i]) {
perror("wrong!!");
exit(-1);
}
}
printf("\nTest completed successfully!\n\n");
free(codewords_sim);
free(codewords_true);
free(messages);
srslte_ldpc_encoder_free(&encoder);
}

@ -75,7 +75,6 @@ static int req_errors = 100; /*!< \brief Minimum number of errors for a signi
*/
void usage(char* prog)
{
printf("Usage: %s [-bX] [-lX] [-eX] [-fX] [-rX] [-mX] [-MX] [-wX] [-sX]\n", prog);
printf("\t-b Base Graph [(1 or 2) Default %d]\n", base_graph + 1);
printf("\t-l Lifting Size [Default %d]\n", lift_size);
@ -152,7 +151,6 @@ void print_decoder(char* title, int n_batches, int n_errors, double elapsed_time
*/
int main(int argc, char** argv)
{
uint8_t* messages_true = NULL;
uint8_t* messages_sim_f = NULL;
uint8_t* messages_sim_s = NULL;
@ -160,6 +158,8 @@ int main(int argc, char** argv)
uint8_t* messages_sim_c_flood = NULL;
uint8_t* messages_sim_avx = NULL;
uint8_t* messages_sim_avx_flood = NULL;
uint8_t* messages_sim_avx512 = NULL;
uint8_t* messages_sim_avx512_flood = NULL;
uint8_t* codewords = NULL;
uint8_t* rm_codewords = NULL;
float* rm_symbols = NULL;
@ -176,6 +176,13 @@ int main(int argc, char** argv)
// create an LDPC encoder
srslte_ldpc_encoder_t encoder;
#ifdef LV_HAVE_AVX512
if (srslte_ldpc_encoder_init(&encoder, SRSLTE_LDPC_ENCODER_AVX512, base_graph, lift_size) != 0) {
perror("encoder init");
exit(-1);
}
#else // no AVX512
#ifdef LV_HAVE_AVX2
if (srslte_ldpc_encoder_init(&encoder, SRSLTE_LDPC_ENCODER_AVX2, base_graph, lift_size) != 0) {
perror("encoder init");
@ -187,6 +194,7 @@ int main(int argc, char** argv)
exit(-1);
}
#endif // LV_HAVE_AVX2
#endif // LV_HAVE_AVX512
// create a LDPC rate DeMatcher
finalK = encoder.liftK;
@ -267,6 +275,23 @@ int main(int argc, char** argv)
}
#endif // LV_HAVE_AVX2
#ifdef LV_HAVE_AVX512
// create an LDPC decoder (8 bit, AVX2 version)
srslte_ldpc_decoder_t decoder_avx512;
if (srslte_ldpc_decoder_init(&decoder_avx512, SRSLTE_LDPC_DECODER_C_AVX512, base_graph, lift_size, MS_SF) != 0) {
perror("decoder init");
exit(-1);
}
// create an LDPC decoder (8 bit, flooded scheduling, AVX512 version)
srslte_ldpc_decoder_t decoder_avx512_flood;
if (srslte_ldpc_decoder_init(
&decoder_avx512_flood, SRSLTE_LDPC_DECODER_C_AVX512_FLOOD, base_graph, lift_size, MS_SF) != 0) {
perror("decoder init");
exit(-1);
}
#endif // LV_HAVE_AVX512
// create a random generator
srslte_random_t random_gen = srslte_random_init(0);
@ -299,6 +324,8 @@ int main(int argc, char** argv)
messages_sim_c_flood = srslte_vec_u8_malloc(finalK * batch_size);
messages_sim_avx = srslte_vec_u8_malloc(finalK * batch_size);
messages_sim_avx_flood = srslte_vec_u8_malloc(finalK * batch_size);
messages_sim_avx512 = srslte_vec_u8_malloc(finalK * batch_size);
messages_sim_avx512_flood = srslte_vec_u8_malloc(finalK * batch_size);
codewords = srslte_vec_u8_malloc(finalN * batch_size);
rm_codewords = srslte_vec_u8_malloc(rm_length * batch_size);
rm_symbols = srslte_vec_f_malloc(rm_length * batch_size);
@ -309,9 +336,9 @@ int main(int argc, char** argv)
symbols_s = srslte_vec_i16_malloc(finalN * batch_size);
symbols_c = srslte_vec_i8_malloc(finalN * batch_size);
if (!messages_true || !messages_sim_f || !messages_sim_s || !messages_sim_c || //
!messages_sim_avx || !messages_sim_c_flood || !messages_sim_avx_flood || //
!codewords || !rm_codewords || !rm_symbols || !rm_symbols_s || !rm_symbols_c || !symbols || !symbols_s ||
!symbols_c) {
!messages_sim_avx || !messages_sim_avx || !messages_sim_c_flood || !messages_sim_avx_flood || //
!messages_sim_avx512_flood || !codewords || !rm_codewords || !rm_symbols || !rm_symbols_s || !rm_symbols_c ||
!symbols || !symbols_s || !symbols_c) {
perror("malloc");
exit(-1);
}
@ -336,6 +363,13 @@ int main(int argc, char** argv)
int n_error_words_avx_flood = 0;
#endif // LV_HAVE_AVX2
#ifdef LV_HAVE_AVX512
double elapsed_time_dec_avx512 = 0;
int n_error_words_avx512 = 0;
double elapsed_time_dec_avx512_flood = 0;
int n_error_words_avx512_flood = 0;
#endif // LV_HAVE_AVX512
float noise_std_dev = srslte_convert_dB_to_amplitude(-snr);
int16_t inf15 = (1U << 14U) - 1;
@ -604,6 +638,52 @@ int main(int argc, char** argv)
}
}
#endif // LV_HAVE_AVX2
#ifdef LV_HAVE_AVX512
//////// Fixed point - 8 bit - AVX512 version
// Recover messages
gettimeofday(&t[1], NULL);
for (j = 0; j < batch_size; j++) {
srslte_ldpc_decoder_decode_rm_c(
&decoder_avx512, symbols_c + j * finalN, messages_sim_avx512 + j * finalK, finalN);
}
gettimeofday(&t[2], NULL);
get_time_interval(t);
elapsed_time_dec_avx512 += t[0].tv_sec + 1e-6 * t[0].tv_usec;
for (i = 0; i < batch_size; i++) {
for (j = 0; j < finalK; j++) {
i_bit = i * finalK + j;
if (messages_sim_avx512[i_bit] != (1U & messages_true[i_bit])) {
n_error_words_avx512++;
break;
}
}
}
//////// Fixed point - 8 bit, flooded scheduling - AVX512 version
// Recover messages
gettimeofday(&t[1], NULL);
for (j = 0; j < batch_size; j++) {
srslte_ldpc_decoder_decode_rm_c(
&decoder_avx512_flood, symbols_c + j * finalN, messages_sim_avx512_flood + j * finalK, finalN);
}
gettimeofday(&t[2], NULL);
get_time_interval(t);
elapsed_time_dec_avx512_flood += t[0].tv_sec + 1e-6 * t[0].tv_usec;
for (i = 0; i < batch_size; i++) {
for (j = 0; j < finalK; j++) {
i_bit = i * finalK + j;
if (messages_sim_avx512_flood[i_bit] != (1U & messages_true[i_bit])) {
n_error_words_avx512_flood++;
break;
}
}
}
#endif // LV_HAVE_AVX512
}
printf("\nEstimated throughput encoder:\n %e word/s\n %e bit/s (information)\n %e bit/s (encoded)\n",
@ -622,6 +702,14 @@ int main(int argc, char** argv)
"FIXED POINT (8 bits, flooded scheduling - AVX2)", i_batch, n_error_words_avx_flood, elapsed_time_dec_avx_flood);
#endif // LV_HAVE_AVX2
#ifdef LV_HAVE_AVX512
print_decoder("FIXED POINT (8 bits - AVX512)", i_batch, n_error_words_avx512, elapsed_time_dec_avx512);
print_decoder("FIXED POINT (8 bits, flooded scheduling - AVX512)",
i_batch,
n_error_words_avx512_flood,
elapsed_time_dec_avx512_flood);
#endif // LV_HAVE_AVX512
if (n_error_words_s > 10 * n_error_words_f) {
perror("16-bit performance too low!");
exit(-1);
@ -630,6 +718,17 @@ int main(int argc, char** argv)
perror("8-bit performance too low!");
exit(-1);
}
#ifdef LV_HAVE_AVX512
if (n_error_words_avx512 != n_error_words_avx) {
perror("The number of errors AVX512 and AVX2 differs !");
exit(-1);
}
if (n_error_words_avx512_flood != n_error_words_avx_flood) {
perror("The number of errors of flooded AVX512 and AVX2 differs !");
exit(-1);
}
#endif // LV_HAVE_AVX512
printf("\nTest completed successfully!\n\n");
free(symbols);
@ -642,6 +741,8 @@ int main(int argc, char** argv)
free(codewords);
free(messages_sim_avx);
free(messages_sim_avx_flood);
free(messages_sim_avx512);
free(messages_sim_avx512_flood);
free(messages_sim_c_flood);
free(messages_sim_c);
free(messages_sim_s);
@ -652,6 +753,10 @@ int main(int argc, char** argv)
srslte_ldpc_decoder_free(&decoder_avx);
srslte_ldpc_decoder_free(&decoder_avx_flood);
#endif // LV_HAVE_AVX2
#ifdef LV_HAVE_AVX512
srslte_ldpc_decoder_free(&decoder_avx512);
srslte_ldpc_decoder_free(&decoder_avx512_flood);
#endif // LV_HAVE_AVX512
srslte_ldpc_decoder_free(&decoder_c_flood);
srslte_ldpc_decoder_free(&decoder_c);
srslte_ldpc_decoder_free(&decoder_s);

@ -1,12 +1,21 @@
/**
/*
* Copyright 2013-2020 Software Radio Systems Limited
*
* \section COPYRIGHT
* This file is part of srsLTE.
*
* Copyright 2013-2020 Software Radio Systems Limited
* srsLTE is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as
* published by the Free Software Foundation, either version 3 of
* the License, or (at your option) any later version.
*
* srsLTE is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* By using this file, you agree to the terms and conditions set
* forth in the LICENSE file which can be found at the top level of
* the distribution.
* A copy of the GNU Affero General Public License can be found in
* the LICENSE file in the top-level directory of this distribution
* and at http://www.gnu.org/licenses/.
*
*/

@ -0,0 +1,29 @@
/**
*
* \section COPYRIGHT
*
* Copyright 2013-2020 Software Radio Systems Limited
*
* By using this file, you agree to the terms and conditions set
* forth in the LICENSE file which can be found at the top level of
* the distribution.
*
*/
/*!
* \file utils_avx512.h
* \brief Declarations of AVX512-related quantities and functions.
* \author Jesus Gomez
* \date 2020
*
* \copyright Software Radio Systems Limited
*
*/
#ifndef SRSLTE_UTILS_AVX512_H
#define SRSLTE_UTILS_AVX512_H
#define SRSLTE_AVX512_B_SIZE 64 /*!< \brief Number of packed bytes in an AVX512 instruction. */
#define SRSLTE_AVX512_B_SIZE_LOG 6 /*!< \brief \f$\log_2\f$ of \ref SRSLTE_AVX512_B_SIZE. */
#endif // SRSLTE_UTILS_AVX512_H

@ -192,11 +192,18 @@ int srslte_sch_nr_init_tx(srslte_sch_nr_t* q, const srslte_sch_nr_args_t* args)
}
srslte_ldpc_encoder_type_t encoder_type = SRSLTE_LDPC_ENCODER_C;
#ifdef LV_HAVE_AVX512
if (!args->disable_simd) {
encoder_type = SRSLTE_LDPC_ENCODER_AVX512;
}
#else // LV_HAVE_AVX512
#ifdef LV_HAVE_AVX2
if (!args->disable_simd) {
encoder_type = SRSLTE_LDPC_ENCODER_AVX2;
}
#endif // LV_HAVE_AVX2
#endif // LV_HAVE_AVX612
// Iterate over all possible lifting sizes
for (uint16_t ls = 0; ls <= MAX_LIFTSIZE; ls++) {
@ -246,24 +253,20 @@ int srslte_sch_nr_init_rx(srslte_sch_nr_t* q, const srslte_sch_nr_args_t* args)
return ret;
}
srslte_ldpc_decoder_type_t decoder_type = SRSLTE_LDPC_DECODER_C;
if (args->decoder_use_flooded) {
#ifdef LV_HAVE_AVX2
if (args->disable_simd) {
decoder_type = SRSLTE_LDPC_DECODER_C_FLOOD;
} else {
decoder_type = SRSLTE_LDPC_DECODER_C_AVX2_FLOOD;
srslte_ldpc_decoder_type_t decoder_type =
args->decoder_use_flooded ? SRSLTE_LDPC_DECODER_C_FLOOD : SRSLTE_LDPC_DECODER_C;
#ifdef LV_HAVE_AVX512
if (!args->disable_simd) {
decoder_type = args->decoder_use_flooded ? SRSLTE_LDPC_DECODER_C_AVX512_FLOOD : SRSLTE_LDPC_DECODER_C_AVX512;
}
#else // LV_HAVE_AVX2
decoder_type = SRSLTE_LDPC_DECODER_C_FLOOD;
#endif // LV_HAVE_AVX2
} else {
#else // LV_HAVE_AVX512
#ifdef LV_HAVE_AVX2
if (!args->disable_simd) {
decoder_type = SRSLTE_LDPC_DECODER_C_AVX2;
decoder_type = args->decoder_use_flooded ? SRSLTE_LDPC_DECODER_C_AVX2_FLOOD : SRSLTE_LDPC_DECODER_C_AVX2;
}
#endif // LV_HAVE_AVX2
}
#endif // LV_HAVE_AVX512
// If the scaling factor is not provided use a default value that allows decoding all possible combinations of nPRB
// and MCS indexes for all possible MCS tables

Loading…
Cancel
Save