Optimized PR sequence generator

master
Xavier Arteaga 5 years ago committed by Xavier Arteaga
parent 375ac1388a
commit d76c77c188

@ -19,124 +19,288 @@
* *
*/ */
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <strings.h>
#include "srslte/phy/common/sequence.h" #include "srslte/phy/common/sequence.h"
#include "srslte/phy/utils/bit.h" #include "srslte/phy/utils/bit.h"
#include "srslte/phy/utils/debug.h" #include "srslte/phy/utils/debug.h"
#include "srslte/phy/utils/vector.h" #include "srslte/phy/utils/vector.h"
#define Nc 1600 #ifdef LV_HAVE_SSE
#include <immintrin.h>
#include <srslte/phy/common/sequence.h>
#endif /* LV_HAVE_SSE */
#define MAX_SEQ_LEN (256 * 1024) /**
* Length of the seed, used for the feedback delay. Do not change.
*/
#define SEQUENCE_SEED_LEN (31)
#define static_memory /**
* Nc parameter defined in 3GPP. Do not change.
*/
#define SEQUENCE_NC (1600)
/* /*
* Pseudo Random Sequence generation. * Pseudo Random Sequence generation.
* It follows the 3GPP Release 8 (LTE) 36.211 * It follows the 3GPP Release 8 (LTE) 36.211
* Section 7.2 * Section 7.2
*/ */
#ifdef static_memory
static uint8_t x1[Nc + MAX_SEQ_LEN + 31];
static uint8_t x2[Nc + MAX_SEQ_LEN + 31];
static pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER; /**
int srslte_sequence_set_LTE_pr(srslte_sequence_t* q, uint32_t len, uint32_t seed) * Parallel bit generation for x1/x2 sequences parameters. Exploits the fact that the sequence generation is 31 chips
* ahead and the maximum register shift is 3 (for x2).
*/
#define SEQUENCE_PAR_BITS (28U)
#define SEQUENCE_MASK ((1U << SEQUENCE_PAR_BITS) - 1U)
/**
* Computes one step of the X1 sequence for SEQUENCE_PAR_BITS simultaneously
* @param state 32 bit current state
* @return new 32 bit state
*/
static inline uint32_t sequence_gen_LTE_pr_memless_step_par_x1(uint32_t state)
{ {
int n; // Perform XOR
uint32_t f = state ^ (state >> 3U);
if (len > q->max_len) { // Prepare feedback
ERROR("Error generating pseudo-random sequence: len %d exceeds maximum len %d\n", len, MAX_SEQ_LEN); f = ((f & SEQUENCE_MASK) << (SEQUENCE_SEED_LEN - SEQUENCE_PAR_BITS));
return -1;
// Insert feedback
state = (state >> SEQUENCE_PAR_BITS) ^ f;
return state;
}
/**
* Computes one step of the X1 sequence for 1bit
* @param state 32 bit current state
* @return new 32 bit state
*/
static inline uint32_t sequence_gen_LTE_pr_memless_step_x1(uint32_t state)
{
// Perform XOR
uint32_t f = state ^ (state >> 3U);
// Prepare feedback
f = ((f & 1U) << (SEQUENCE_SEED_LEN - 1U));
// Insert feedback
state = (state >> 1U) ^ f;
return state;
}
/**
* Computes one step of the X2 sequence for SEQUENCE_PAR_BITS simultaneously
* @param state 32 bit current state
* @return new 32 bit state
*/
static inline uint32_t sequence_gen_LTE_pr_memless_step_par_x2(uint32_t state)
{
// Perform XOR
uint32_t f = state ^ (state >> 1U) ^ (state >> 2U) ^ (state >> 3U);
// Prepare feedback
f = ((f & SEQUENCE_MASK) << (SEQUENCE_SEED_LEN - SEQUENCE_PAR_BITS));
// Insert feedback
state = (state >> SEQUENCE_PAR_BITS) ^ f;
return state;
}
/**
* Computes one step of the X2 sequence for 1bit
* @param state 32 bit current state
* @return new 32 bit state
*/
static inline uint32_t sequence_gen_LTE_pr_memless_step_x2(uint32_t state)
{
// Perform XOR
uint32_t f = state ^ (state >> 1U) ^ (state >> 2U) ^ (state >> 3U);
// Prepare feedback
f = ((f & 1U) << (SEQUENCE_SEED_LEN - 1U));
// Insert feedback
state = (state >> 1U) ^ f;
return state;
}
/**
* Static precomputed array x1 and x2
* ----------------------------------
*
* The pre-computation of the Pseudo-Random sequences is based in their linearity properties.
*
* Having two seeds seed_1 and seed_2 generate x2_1 and x2_2 respectively:
* seed_1 -> x2_1
* seed_2 -> x2_2
*
* Then, the linearity property satisfies:
* seed_1 ^ seed_2 -> x2_1 ^ x2_2
*
* Because of this, a different x2 can be pre-computed for each bit of the seed.
*
*/
static uint32_t sequence_x1_init = 0;
static uint32_t sequence_x2_init[SEQUENCE_SEED_LEN] = {};
/**
* C constructor, pre-computes X1 and X2 initial states and sequences
*/
__attribute__((constructor)) __attribute__((unused)) static void srslte_lte_pr_pregen()
{
// Compute transition step
sequence_x1_init = 1;
for (uint32_t n = 0; n < SEQUENCE_NC; n++) {
sequence_x1_init = sequence_gen_LTE_pr_memless_step_x1(sequence_x1_init);
} }
if (len > q->max_len) { // For each bit of the seed
ERROR("Error generating pseudo-random sequence: len %d is greater than allocated len %d\n", len, q->max_len); for (uint32_t i = 0; i < SEQUENCE_SEED_LEN; i++) {
return -1; // Compute transition step
sequence_x2_init[i] = 1U << i;
for (uint32_t n = 0; n < SEQUENCE_NC; n++) {
sequence_x2_init[i] = sequence_gen_LTE_pr_memless_step_x2(sequence_x2_init[i]);
}
} }
pthread_mutex_lock(&mutex); }
for (n = 0; n < 31; n++) { static void sequence_gen_LTE_pr_memless(uint8_t* pr, uint32_t len, uint32_t seed)
x2[n] = (seed >> n) & 0x1; {
int n = 0;
uint32_t x1 = sequence_x1_init; // X1 initial state is fix
uint32_t x2 = 0;
// Load X2 state
for (uint32_t i = 0; i < SEQUENCE_SEED_LEN; i++) {
if ((seed >> i) & 1U) {
x2 ^= sequence_x2_init[i];
}
} }
x1[0] = 1;
for (n = 0; n < Nc + len; n++) { // Parallel stage
x1[n + 31] = (x1[n + 3] + x1[n]) & 0x1; if (len > SEQUENCE_PAR_BITS) {
x2[n + 31] = (x2[n + 3] + x2[n + 2] + x2[n + 1] + x2[n]) & 0x1; for (; n < len - (SEQUENCE_PAR_BITS - 1); n += SEQUENCE_PAR_BITS) {
// XOR x1 and x2
uint32_t c = (uint32_t)(x1 ^ x2);
// Save state
for (uint32_t i = 0; i < SEQUENCE_PAR_BITS; i++) {
pr[n + i] = (uint8_t)((c >> i) & 1U);
} }
for (n = 0; n < len; n++) { // Parallel step
q->c[n] = (x1[n + Nc] + x2[n + Nc]) & 0x1; x1 = sequence_gen_LTE_pr_memless_step_par_x1(x1);
x2 = sequence_gen_LTE_pr_memless_step_par_x2(x2);
} }
pthread_mutex_unlock(&mutex); }
// Single step
for (; n < len; n++) {
// Save current state
pr[n] = (uint8_t)((x1 ^ x2) & 1U);
return 0; // Single step
x1 = sequence_gen_LTE_pr_memless_step_x1(x1);
x2 = sequence_gen_LTE_pr_memless_step_x2(x2);
}
} }
#else // static pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER;
int srslte_sequence_set_LTE_pr(srslte_sequence_t* q, uint32_t len, uint32_t seed) int srslte_sequence_set_LTE_pr(srslte_sequence_t* q, uint32_t len, uint32_t seed)
{ {
int n;
uint32_t *x1, *x2;
if (len > q->max_len) { if (len > q->max_len) {
ERROR("Error generating pseudo-random sequence: len %d is greater than allocated len %d\n", len, q->max_len); ERROR("Error generating pseudo-random sequence: len %d is greater than allocated len %d\n", len, q->max_len);
return -1; return SRSLTE_ERROR;
} }
x1 = calloc(Nc + len + 31, sizeof(uint32_t)); sequence_gen_LTE_pr_memless(q->c, len, seed);
if (!x1) {
perror("calloc");
return -1;
}
x2 = calloc(Nc + len + 31, sizeof(uint32_t));
if (!x2) {
free(x1);
perror("calloc");
return -1;
}
for (n = 0; n < 31; n++) { return SRSLTE_SUCCESS;
x2[n] = (seed >> n) & 0x1; }
}
x1[0] = 1;
for (n = 0; n < Nc + len; n++) { static inline void
x1[n + 31] = (x1[n + 3] + x1[n]) & 0x1; sequence_generate_signed(const uint8_t* c_unpacked, int8_t* c_char, int16_t* c_short, float* c_float, uint32_t len)
x2[n + 31] = (x2[n + 3] + x2[n + 2] + +x2[n + 1] + x2[n]) & 0x1; {
}
int i = 0;
#ifdef LV_HAVE_SSE
__m128i* sse_c = (__m128i*)c_unpacked;
__m128i* sse_c_char = (__m128i*)c_char;
__m128i* sse_c_short = (__m128i*)c_short;
float* sse_c_float = c_float;
for (; i < ((int)len) - 15; i += 16) {
// Get bit mask
__m128i m8 = _mm_cmpgt_epi8(_mm_load_si128(sse_c), _mm_set1_epi8(0));
sse_c++;
// Generate blend masks
__m128i m16_1 = _mm_unpacklo_epi8(m8, m8);
__m128i m16_2 = _mm_unpackhi_epi8(m8, m8);
__m128 m32_1 = (__m128)_mm_unpacklo_epi8(m16_1, m16_1);
__m128 m32_2 = (__m128)_mm_unpackhi_epi8(m16_1, m16_1);
__m128 m32_3 = (__m128)_mm_unpacklo_epi8(m16_2, m16_2);
__m128 m32_4 = (__m128)_mm_unpackhi_epi8(m16_2, m16_2);
for (n = 0; n < len; n++) { // Generate int8 values
q->c[n] = (x1[n + Nc] + x2[n + Nc]) & 0x1; const __m128i bp = _mm_set1_epi8(+1);
const __m128i bn = _mm_set1_epi8(-1);
_mm_storeu_si128(sse_c_char, _mm_blendv_epi8(bp, bn, m8));
sse_c_char++;
// Generate int16 values
const __m128i sp = _mm_set1_epi16(+1);
const __m128i sn = _mm_set1_epi16(-1);
_mm_store_si128(sse_c_short++, _mm_blendv_epi8(sp, sn, m16_1));
_mm_store_si128(sse_c_short++, _mm_blendv_epi8(sp, sn, m16_2));
// Generate float values
const __m128 fp = _mm_set1_ps(+1);
const __m128 fn = _mm_set1_ps(-1);
_mm_store_ps(sse_c_float, _mm_blendv_ps(fp, fn, (__m128)m32_1));
sse_c_float += 4;
_mm_store_ps(sse_c_float, _mm_blendv_ps(fp, fn, (__m128)m32_2));
sse_c_float += 4;
_mm_store_ps(sse_c_float, _mm_blendv_ps(fp, fn, (__m128)m32_3));
sse_c_float += 4;
_mm_store_ps(sse_c_float, _mm_blendv_ps(fp, fn, (__m128)m32_4));
sse_c_float += 4;
} }
#endif /* LV_HAVE_SSE */
free(x1); for (; i < len; i++) {
free(x2); // Load signed
int8_t tt = (int8_t)(c_unpacked[i] ? -1 : +1);
return 0; // TYpecast conversion for each type
c_char[i] = tt;
c_short[i] = (int16_t)tt;
c_float[i] = (float)tt;
}
} }
#endif
int srslte_sequence_LTE_pr(srslte_sequence_t* q, uint32_t len, uint32_t seed) int srslte_sequence_LTE_pr(srslte_sequence_t* q, uint32_t len, uint32_t seed)
{ {
if (srslte_sequence_init(q, len)) { if (srslte_sequence_init(q, len)) {
return SRSLTE_ERROR; return SRSLTE_ERROR;
} }
q->cur_len = len; q->cur_len = len;
// Generate sequence
srslte_sequence_set_LTE_pr(q, len, seed); srslte_sequence_set_LTE_pr(q, len, seed);
// Pack PR sequence
srslte_bit_pack_vector(q->c, q->c_bytes, len); srslte_bit_pack_vector(q->c, q->c_bytes, len);
for (int i = 0; i < len; i++) {
q->c_float[i] = (1 - 2 * q->c[i]); // Generate signed type values
q->c_short[i] = (int16_t)q->c_float[i]; sequence_generate_signed(q->c, q->c_char, q->c_short, q->c_float, len);
q->c_char[i] = (int8_t)q->c_float[i];
;
}
return SRSLTE_SUCCESS; return SRSLTE_SUCCESS;
} }

Loading…
Cancel
Save