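Fixed some issues with AVX machines: AVX2-only code paths are now guarded by LV_HAVE_AVX2 instead of LV_HAVE_AVX, and the corresponding functions are renamed from *_avx to *_avx2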

master
Ismael Gomez 8 years ago
parent 3cbf403c54
commit 215dac6662

@ -37,19 +37,19 @@ extern "C" {
SRSLTE_API int srslte_vec_dot_prod_sss_sse(short *x, short *y, uint32_t len); SRSLTE_API int srslte_vec_dot_prod_sss_sse(short *x, short *y, uint32_t len);
SRSLTE_API int srslte_vec_dot_prod_sss_avx(short *x, short *y, uint32_t len); SRSLTE_API int srslte_vec_dot_prod_sss_avx2(short *x, short *y, uint32_t len);
SRSLTE_API void srslte_vec_sum_sss_sse(short *x, short *y, short *z, uint32_t len); SRSLTE_API void srslte_vec_sum_sss_sse(short *x, short *y, short *z, uint32_t len);
SRSLTE_API void srslte_vec_sum_sss_avx(short *x, short *y, short *z, uint32_t len); SRSLTE_API void srslte_vec_sum_sss_avx2(short *x, short *y, short *z, uint32_t len);
SRSLTE_API void srslte_vec_sub_sss_sse(short *x, short *y, short *z, uint32_t len); SRSLTE_API void srslte_vec_sub_sss_sse(short *x, short *y, short *z, uint32_t len);
SRSLTE_API void srslte_vec_sub_sss_avx(short *x, short *y, short *z, uint32_t len); SRSLTE_API void srslte_vec_sub_sss_avx2(short *x, short *y, short *z, uint32_t len);
@ -57,12 +57,12 @@ SRSLTE_API void srslte_vec_sub_sss_avx(short *x, short *y, short *z, uint32_t le
SRSLTE_API void srslte_vec_prod_sss_sse(short *x, short *y, short *z, uint32_t len); SRSLTE_API void srslte_vec_prod_sss_sse(short *x, short *y, short *z, uint32_t len);
SRSLTE_API void srslte_vec_prod_sss_avx(short *x, short *y, short *z, uint32_t len); SRSLTE_API void srslte_vec_prod_sss_avx2(short *x, short *y, short *z, uint32_t len);
SRSLTE_API void srslte_vec_sc_div2_sss_sse(short *x, int n_rightshift, short *z, uint32_t len); SRSLTE_API void srslte_vec_sc_div2_sss_sse(short *x, int n_rightshift, short *z, uint32_t len);
SRSLTE_API void srslte_vec_sc_div2_sss_avx(short *x, int k, short *z, uint32_t len); SRSLTE_API void srslte_vec_sc_div2_sss_avx2(short *x, int k, short *z, uint32_t len);

@ -122,7 +122,7 @@ void free37_sse(void *o) {
#ifdef LV_HAVE_AVX #ifdef LV_HAVE_AVX2
int decode37_avx2(void *o, uint8_t *symbols, uint8_t *data, uint32_t frame_length) { int decode37_avx2(void *o, uint8_t *symbols, uint8_t *data, uint32_t frame_length) {
srslte_viterbi_t *q = o; srslte_viterbi_t *q = o;
@ -333,7 +333,7 @@ int init37_neon(srslte_viterbi_t *q, int poly[3], uint32_t framebits, bool tail_
#endif #endif
#ifdef LV_HAVE_AVX #ifdef LV_HAVE_AVX2
int init37_avx2(srslte_viterbi_t *q, int poly[3], uint32_t framebits, bool tail_biting) { int init37_avx2(srslte_viterbi_t *q, int poly[3], uint32_t framebits, bool tail_biting) {
q->K = 7; q->K = 7;
q->R = 3; q->R = 3;
@ -383,7 +383,7 @@ int srslte_viterbi_init(srslte_viterbi_t *q, srslte_viterbi_type_t type, int pol
switch (type) { switch (type) {
case SRSLTE_VITERBI_37: case SRSLTE_VITERBI_37:
#ifdef LV_HAVE_SSE #ifdef LV_HAVE_SSE
#ifdef LV_HAVE_AVX #ifdef LV_HAVE_AVX2
return init37_avx2(q, poly, max_frame_length, tail_bitting); return init37_avx2(q, poly, max_frame_length, tail_bitting);
#else #else
return init37_sse(q, poly, max_frame_length, tail_bitting); return init37_sse(q, poly, max_frame_length, tail_bitting);
@ -408,7 +408,7 @@ int srslte_viterbi_init_sse(srslte_viterbi_t *q, srslte_viterbi_type_t type, int
} }
#endif #endif
#ifdef LV_HAVE_AVX #ifdef LV_HAVE_AVX2
int srslte_viterbi_init_avx2(srslte_viterbi_t *q, srslte_viterbi_type_t type, int poly[3], uint32_t max_frame_length, bool tail_bitting) int srslte_viterbi_init_avx2(srslte_viterbi_t *q, srslte_viterbi_type_t type, int poly[3], uint32_t max_frame_length, bool tail_bitting)
{ {
return init37_avx2(q, poly, max_frame_length, tail_bitting); return init37_avx2(q, poly, max_frame_length, tail_bitting);

@ -14,7 +14,7 @@
//#define DEBUG //#define DEBUG
#ifdef LV_HAVE_SSE #ifdef LV_HAVE_AVX2
#include <emmintrin.h> #include <emmintrin.h>
#include <tmmintrin.h> #include <tmmintrin.h>

@ -450,11 +450,11 @@ int srslte_pdsch_decode_multi(srslte_pdsch_t *q,
if (SRSLTE_VERBOSE_ISDEBUG()) { if (SRSLTE_VERBOSE_ISDEBUG()) {
DEBUG("SAVED FILE subframe.dat: received subframe symbols\n",0); DEBUG("SAVED FILE subframe.dat: received subframe symbols\n",0);
srslte_vec_save_file("subframe.dat", sf_symbols, SRSLTE_SF_LEN_RE(q->cell.nof_prb, q->cell.cp)*sizeof(cf_t)); srslte_vec_save_file("subframe.dat", sf_symbols[0], SRSLTE_SF_LEN_RE(q->cell.nof_prb, q->cell.cp)*sizeof(cf_t));
DEBUG("SAVED FILE hest0.dat and hest1.dat: channel estimates for port 0 and port 1\n",0); DEBUG("SAVED FILE hest0.dat and hest1.dat: channel estimates for port 0 and port 1\n",0);
srslte_vec_save_file("hest0.dat", ce[0], SRSLTE_SF_LEN_RE(q->cell.nof_prb, q->cell.cp)*sizeof(cf_t)); srslte_vec_save_file("hest0.dat", ce[0][0], SRSLTE_SF_LEN_RE(q->cell.nof_prb, q->cell.cp)*sizeof(cf_t));
if (q->cell.nof_ports > 1) { if (q->cell.nof_ports > 1) {
srslte_vec_save_file("hest1.dat", ce[1], SRSLTE_SF_LEN_RE(q->cell.nof_prb, q->cell.cp)*sizeof(cf_t)); srslte_vec_save_file("hest1.dat", ce[1][0], SRSLTE_SF_LEN_RE(q->cell.nof_prb, q->cell.cp)*sizeof(cf_t));
} }
DEBUG("SAVED FILE pdsch_symbols.dat: symbols after equalization\n",0); DEBUG("SAVED FILE pdsch_symbols.dat: symbols after equalization\n",0);
srslte_vec_save_file("pdsch_symbols.dat", q->d, cfg->nbits.nof_re*sizeof(cf_t)); srslte_vec_save_file("pdsch_symbols.dat", q->d, cfg->nbits.nof_re*sizeof(cf_t));

@ -202,10 +202,12 @@ int main(int argc, char **argv) {
fprintf(stderr, "Error initiating soft buffer\n"); fprintf(stderr, "Error initiating soft buffer\n");
goto quit; goto quit;
} }
srslte_softbuffer_tx_reset(&softbuffer_tx);
if (srslte_softbuffer_rx_init(&softbuffer_rx, 100)) { if (srslte_softbuffer_rx_init(&softbuffer_rx, 100)) {
fprintf(stderr, "Error initiating soft buffer\n"); fprintf(stderr, "Error initiating soft buffer\n");
goto quit; goto quit;
} }
srslte_softbuffer_rx_reset(&softbuffer_rx);
uint32_t ntrials = 100; uint32_t ntrials = 100;

@ -110,8 +110,8 @@ void srslte_vec_sub_fff(float *x, float *y, float *z, uint32_t len) {
} }
void srslte_vec_sub_sss(short *x, short *y, short *z, uint32_t len) { void srslte_vec_sub_sss(short *x, short *y, short *z, uint32_t len) {
#ifdef LV_HAVE_AVX #ifdef LV_HAVE_AVX2
srslte_vec_sub_sss_avx(x, y, z, len); srslte_vec_sub_sss_avx2(x, y, z, len);
#else #else
#ifdef LV_HAVE_SSE #ifdef LV_HAVE_SSE
srslte_vec_sub_sss_sse(x, y, z, len); srslte_vec_sub_sss_sse(x, y, z, len);
@ -140,8 +140,8 @@ void srslte_vec_sum_fff(float *x, float *y, float *z, uint32_t len) {
} }
void srslte_vec_sum_sss(short *x, short *y, short *z, uint32_t len) { void srslte_vec_sum_sss(short *x, short *y, short *z, uint32_t len) {
#ifdef LV_HAVE_AVX #ifdef LV_HAVE_AVX2
srslte_vec_sum_sss_avx(x, y, z, len); srslte_vec_sum_sss_avx2(x, y, z, len);
#else #else
#ifdef LV_HAVE_SSE #ifdef LV_HAVE_SSE
srslte_vec_sum_sss_sse(x, y, z, len); srslte_vec_sum_sss_sse(x, y, z, len);
@ -212,8 +212,8 @@ void srslte_vec_sc_prod_sfs(short *x, float h, short *z, uint32_t len) {
} }
void srslte_vec_sc_div2_sss(short *x, int n_rightshift, short *z, uint32_t len) { void srslte_vec_sc_div2_sss(short *x, int n_rightshift, short *z, uint32_t len) {
#ifdef LV_HAVE_AVX #ifdef LV_HAVE_AVX2
srslte_vec_sc_div2_sss_avx(x, n_rightshift, z, len); srslte_vec_sc_div2_sss_avx2(x, n_rightshift, z, len);
#else #else
#ifdef LV_HAVE_SSE #ifdef LV_HAVE_SSE
srslte_vec_sc_div2_sss_sse(x, n_rightshift, z, len); srslte_vec_sc_div2_sss_sse(x, n_rightshift, z, len);
@ -345,14 +345,14 @@ void srslte_vec_deinterleave_real_cf(cf_t *x, float *real, uint32_t len) {
#endif #endif
} }
/* Note: We align memory to 32 bytes (for AVX compatibility) /* Note: We align memory to 256 bytes (a multiple of the 32 bytes AVX2 needs)
* because in some cases volk can incorrectly detect the architecture. * because in some cases volk can incorrectly detect the architecture.
* This could be inefficient for SSE or non-SIMD platforms but shouldn't * This could be inefficient for SSE or non-SIMD platforms but shouldn't
* be a huge problem. * be a huge problem.
*/ */
void *srslte_vec_malloc(uint32_t size) { void *srslte_vec_malloc(uint32_t size) {
void *ptr; void *ptr;
if (posix_memalign(&ptr,32,size)) { if (posix_memalign(&ptr,256,size)) {
return NULL; return NULL;
} else { } else {
return ptr; return ptr;
@ -364,7 +364,7 @@ void *srslte_vec_realloc(void *ptr, uint32_t old_size, uint32_t new_size) {
return realloc(ptr, new_size); return realloc(ptr, new_size);
#else #else
void *new_ptr; void *new_ptr;
if (posix_memalign(&new_ptr,volk_get_alignment(),new_size)) { if (posix_memalign(&new_ptr,256,new_size)) {
return NULL; return NULL;
} else { } else {
memcpy(new_ptr, ptr, old_size); memcpy(new_ptr, ptr, old_size);
@ -520,8 +520,8 @@ void srslte_vec_prod_fff(float *x, float *y, float *z, uint32_t len) {
} }
void srslte_vec_prod_sss(short *x, short *y, short *z, uint32_t len) { void srslte_vec_prod_sss(short *x, short *y, short *z, uint32_t len) {
#ifdef LV_HAVE_AVX #ifdef LV_HAVE_AVX2
srslte_vec_prod_sss_avx(x,y,z,len); srslte_vec_prod_sss_avx2(x,y,z,len);
#else #else
#ifdef LV_HAVE_SSE #ifdef LV_HAVE_SSE
srslte_vec_prod_sss_sse(x,y,z,len); srslte_vec_prod_sss_sse(x,y,z,len);
@ -661,8 +661,8 @@ float srslte_vec_dot_prod_fff(float *x, float *y, uint32_t len) {
} }
int32_t srslte_vec_dot_prod_sss(int16_t *x, int16_t *y, uint32_t len) { int32_t srslte_vec_dot_prod_sss(int16_t *x, int16_t *y, uint32_t len) {
#ifdef LV_HAVE_AVX #ifdef LV_HAVE_AVX2
return srslte_vec_dot_prod_sss_avx(x, y, len); return srslte_vec_dot_prod_sss_avx2(x, y, len);
#else #else
#ifdef LV_HAVE_SSE #ifdef LV_HAVE_SSE
return srslte_vec_dot_prod_sss_sse(x, y, len); return srslte_vec_dot_prod_sss_sse(x, y, len);

@ -87,10 +87,10 @@ int srslte_vec_dot_prod_sss_sse(short *x, short *y, uint32_t len)
} }
int srslte_vec_dot_prod_sss_avx(short *x, short *y, uint32_t len) int srslte_vec_dot_prod_sss_avx2(short *x, short *y, uint32_t len)
{ {
int result = 0; int result = 0;
#ifdef LV_HAVE_AVX #ifdef LV_HAVE_AVX2
unsigned int number = 0; unsigned int number = 0;
const unsigned int points = len / 16; const unsigned int points = len / 16;
@ -160,9 +160,9 @@ void srslte_vec_sum_sss_sse(short *x, short *y, short *z, uint32_t len)
} }
void srslte_vec_sum_sss_avx(short *x, short *y, short *z, uint32_t len) void srslte_vec_sum_sss_avx2(short *x, short *y, short *z, uint32_t len)
{ {
#ifdef LV_HAVE_SSE #ifdef LV_HAVE_AVX2
unsigned int number = 0; unsigned int number = 0;
const unsigned int points = len / 16; const unsigned int points = len / 16;
@ -225,9 +225,9 @@ void srslte_vec_sub_sss_sse(short *x, short *y, short *z, uint32_t len)
#endif #endif
} }
void srslte_vec_sub_sss_avx(short *x, short *y, short *z, uint32_t len) void srslte_vec_sub_sss_avx2(short *x, short *y, short *z, uint32_t len)
{ {
#ifdef LV_HAVE_AVX #ifdef LV_HAVE_AVX2
unsigned int number = 0; unsigned int number = 0;
const unsigned int points = len / 16; const unsigned int points = len / 16;
@ -292,9 +292,9 @@ void srslte_vec_prod_sss_sse(short *x, short *y, short *z, uint32_t len)
#endif #endif
} }
void srslte_vec_prod_sss_avx(short *x, short *y, short *z, uint32_t len) void srslte_vec_prod_sss_avx2(short *x, short *y, short *z, uint32_t len)
{ {
#ifdef LV_HAVE_SSE #ifdef LV_HAVE_AVX2
unsigned int number = 0; unsigned int number = 0;
const unsigned int points = len / 16; const unsigned int points = len / 16;
@ -359,9 +359,9 @@ void srslte_vec_sc_div2_sss_sse(short *x, int k, short *z, uint32_t len)
#endif #endif
} }
void srslte_vec_sc_div2_sss_avx(short *x, int k, short *z, uint32_t len) void srslte_vec_sc_div2_sss_avx2(short *x, int k, short *z, uint32_t len)
{ {
#ifdef LV_HAVE_AVX #ifdef LV_HAVE_AVX2
unsigned int number = 0; unsigned int number = 0;
const unsigned int points = len / 16; const unsigned int points = len / 16;
@ -394,7 +394,11 @@ void srslte_vec_sc_div2_sss_avx(short *x, int k, short *z, uint32_t len)
/* No improvement with AVX */ /* No improvement with AVX */
void srslte_vec_lut_sss_sse(short *x, unsigned short *lut, short *y, uint32_t len) void srslte_vec_lut_sss_sse(short *x, unsigned short *lut, short *y, uint32_t len)
{ {
#ifndef DEBUG_MODE #ifdef DEBUG_MODE
for (int i=0;i<len;i++) {
y[lut[i]] = x[i];
}
#else
#ifdef LV_HAVE_SSE #ifdef LV_HAVE_SSE
unsigned int number = 0; unsigned int number = 0;
const unsigned int points = len / 8; const unsigned int points = len / 8;

@ -46,7 +46,7 @@ endif (RPATH)
######################################################################## ########################################################################
if (NOT ${BUILDUE_CMD} STREQUAL "") if (NOT ${BUILDUE_CMD} STREQUAL "")
message(STATUS "Added custom post-build-UE command: ${BUILDUE_CMD}") message(STATUS "Added custom post-build-UE command: ${BUILDUE_CMD}")
add_custom_command(TARGET ue POST_BUILD COMMAND ${BUILDUE_CMD}) add_custom_command(TARGET srsue POST_BUILD COMMAND ${BUILDUE_CMD})
else(NOT ${BUILDUE_CMD} STREQUAL "") else(NOT ${BUILDUE_CMD} STREQUAL "")
message(STATUS "No post-build-UE command defined") message(STATUS "No post-build-UE command defined")
endif (NOT ${BUILDUE_CMD} STREQUAL "") endif (NOT ${BUILDUE_CMD} STREQUAL "")

Loading…
Cancel
Save