Fixed bugs and compilation errors in simd and vector_simd

master
Xavier Arteaga 7 years ago
parent 0c52c5651d
commit c41ad5453c

@ -226,7 +226,7 @@ static inline simd_f_t srslte_simd_f_mul(simd_f_t a, simd_f_t b) {
static inline simd_f_t srslte_simd_f_addsub(simd_f_t a, simd_f_t b) { static inline simd_f_t srslte_simd_f_addsub(simd_f_t a, simd_f_t b) {
#ifdef LV_HAVE_AVX512 #ifdef LV_HAVE_AVX512
__m512 r = _mm512_add_ps(a, b); __m512 r = _mm512_add_ps(a, b);
return _mm512_mask_sub_ps(r, 0b1010101010101010, a, b); return _mm512_mask_sub_ps(r, 0b0101010101010101, a, b);
#else /* LV_HAVE_AVX512 */ #else /* LV_HAVE_AVX512 */
#ifdef LV_HAVE_AVX2 #ifdef LV_HAVE_AVX2
return _mm256_addsub_ps(a, b); return _mm256_addsub_ps(a, b);
@ -642,10 +642,10 @@ static inline simd_s_t srslte_simd_s_load(int16_t *ptr) {
return _mm512_load_si512(ptr); return _mm512_load_si512(ptr);
#else /* LV_HAVE_AVX512 */ #else /* LV_HAVE_AVX512 */
#ifdef LV_HAVE_AVX2 #ifdef LV_HAVE_AVX2
return _mm256_load_si256(ptr); return _mm256_load_si256((__m256i*) ptr);
#else /* LV_HAVE_AVX2 */ #else /* LV_HAVE_AVX2 */
#ifdef LV_HAVE_SSE #ifdef LV_HAVE_SSE
return _mm_load_si128(ptr); return _mm_load_si128((__m128i*) ptr);
#endif /* LV_HAVE_SSE */ #endif /* LV_HAVE_SSE */
#endif /* LV_HAVE_AVX2 */ #endif /* LV_HAVE_AVX2 */
#endif /* LV_HAVE_AVX512 */ #endif /* LV_HAVE_AVX512 */
@ -653,13 +653,13 @@ static inline simd_s_t srslte_simd_s_load(int16_t *ptr) {
static inline simd_s_t srslte_simd_s_loadu(int16_t *ptr) { static inline simd_s_t srslte_simd_s_loadu(int16_t *ptr) {
#ifdef LV_HAVE_AVX512 #ifdef LV_HAVE_AVX512
return _mm512_load_si512(ptr); return _mm512_loadu_si512(ptr);
#else /* LV_HAVE_AVX512 */ #else /* LV_HAVE_AVX512 */
#ifdef LV_HAVE_AVX2 #ifdef LV_HAVE_AVX2
return _mm256_load_si256(ptr); return _mm256_loadu_si256((__m256i*) ptr);
#else /* LV_HAVE_AVX2 */ #else /* LV_HAVE_AVX2 */
#ifdef LV_HAVE_SSE #ifdef LV_HAVE_SSE
return _mm_load_si128(ptr); return _mm_loadu_si128((__m128i*) ptr);
#endif /* LV_HAVE_SSE */ #endif /* LV_HAVE_SSE */
#endif /* LV_HAVE_AVX2 */ #endif /* LV_HAVE_AVX2 */
#endif /* LV_HAVE_AVX512 */ #endif /* LV_HAVE_AVX512 */
@ -670,10 +670,10 @@ static inline void srslte_simd_s_store(int16_t *ptr, simd_s_t simdreg) {
_mm512_store_si512(ptr, simdreg); _mm512_store_si512(ptr, simdreg);
#else /* LV_HAVE_AVX512 */ #else /* LV_HAVE_AVX512 */
#ifdef LV_HAVE_AVX2 #ifdef LV_HAVE_AVX2
_mm256_store_si256(ptr, simdreg); _mm256_store_si256((__m256i*) ptr, simdreg);
#else /* LV_HAVE_AVX2 */ #else /* LV_HAVE_AVX2 */
#ifdef LV_HAVE_SSE #ifdef LV_HAVE_SSE
_mm_store_si128(ptr, simdreg); _mm_store_si128((__m128i*) ptr, simdreg);
#endif /* LV_HAVE_SSE */ #endif /* LV_HAVE_SSE */
#endif /* LV_HAVE_AVX2 */ #endif /* LV_HAVE_AVX2 */
#endif /* LV_HAVE_AVX512 */ #endif /* LV_HAVE_AVX512 */
@ -684,10 +684,10 @@ static inline void srslte_simd_s_storeu(int16_t *ptr, simd_s_t simdreg) {
_mm512_storeu_si512(ptr, simdreg); _mm512_storeu_si512(ptr, simdreg);
#else /* LV_HAVE_AVX512 */ #else /* LV_HAVE_AVX512 */
#ifdef LV_HAVE_AVX2 #ifdef LV_HAVE_AVX2
_mm256_storeu_si256(ptr, simdreg); _mm256_storeu_si256((__m256i*) ptr, simdreg);
#else /* LV_HAVE_AVX2 */ #else /* LV_HAVE_AVX2 */
#ifdef LV_HAVE_SSE #ifdef LV_HAVE_SSE
_mm_storeu_si128(ptr, simdreg); _mm_storeu_si128((__m128i*) ptr, simdreg);
#endif /* LV_HAVE_SSE */ #endif /* LV_HAVE_SSE */
#endif /* LV_HAVE_AVX2 */ #endif /* LV_HAVE_AVX2 */
#endif /* LV_HAVE_AVX512 */ #endif /* LV_HAVE_AVX512 */

@ -44,7 +44,7 @@ extern "C" {
#ifdef LV_HAVE_SSE #ifdef LV_HAVE_SSE
#define SRSLTE_IS_ALIGNED(PTR) (((size_t)(PTR) & 0x0F) == 0) #define SRSLTE_IS_ALIGNED(PTR) (((size_t)(PTR) & 0x0F) == 0)
#else /* LV_HAVE_SSE */ #else /* LV_HAVE_SSE */
#define SRSLTE_IS_ALIGNED(PTR) (true) #define SRSLTE_IS_ALIGNED(PTR) (1)
#endif /* LV_HAVE_SSE */ #endif /* LV_HAVE_SSE */
#endif /* LV_HAVE_AVX */ #endif /* LV_HAVE_AVX */
#endif /* LV_HAVE_AVX512 */ #endif /* LV_HAVE_AVX512 */

@ -45,7 +45,7 @@ bool mmse_solver = false;
bool verbose = false; bool verbose = false;
#define MAX_MSE (1e-3) #define MAX_MSE (1e-3)
#define NOF_REPETITIONS (1024*128) #define NOF_REPETITIONS (1024)
#define MAX_FUNCTIONS (64) #define MAX_FUNCTIONS (64)
#define MAX_BLOCKS (16) #define MAX_BLOCKS (16)
@ -70,7 +70,7 @@ bool verbose = false;
return passed;\ return passed;\
} }
#define MALLOC(TYPE, NAME) TYPE *NAME = srslte_vec_malloc(sizeof(TYPE)*block_size) #define MALLOC(TYPE, NAME) TYPE *NAME = malloc(sizeof(TYPE)*block_size)
static double elapsed_us(struct timeval *ts_start, struct timeval *ts_end) { static double elapsed_us(struct timeval *ts_start, struct timeval *ts_end) {
@ -339,7 +339,7 @@ TEST(srslte_vec_prod_conj_ccc,
TEST(srslte_vec_sc_prod_ccc, TEST(srslte_vec_sc_prod_ccc,
MALLOC(cf_t, x); MALLOC(cf_t, x);
MALLOC(cf_t, z); MALLOC(cf_t, z);
cf_t y = RANDOM_F(); cf_t y = RANDOM_CF();
cf_t gold; cf_t gold;
for (int i = 0; i < block_size; i++) { for (int i = 0; i < block_size; i++) {
@ -469,7 +469,7 @@ int main(int argc, char **argv) {
uint32_t func_count = 0; uint32_t func_count = 0;
bool passed = true; bool passed = true;
for (uint32_t block_size = 1; block_size <= 1024*16; block_size *= 2) { for (uint32_t block_size = 1; block_size <= 1024*8; block_size *= 2) {
func_count = 0; func_count = 0;
passed &= test_srslte_vec_dot_prod_sss(func_names[func_count], &timmings[func_count][size_count], block_size); passed &= test_srslte_vec_dot_prod_sss(func_names[func_count], &timmings[func_count][size_count], block_size);

@ -77,7 +77,7 @@ int srslte_vec_dot_prod_sss_simd(int16_t *x, int16_t *y, int len) {
void srslte_vec_sum_sss_simd(int16_t *x, int16_t *y, int16_t *z, int len) { void srslte_vec_sum_sss_simd(int16_t *x, int16_t *y, int16_t *z, int len) {
int i = 0; int i = 0;
#ifdef SRSLTE_SIMD_S_SIZE #if SRSLTE_SIMD_S_SIZE
if (SRSLTE_IS_ALIGNED(x) && SRSLTE_IS_ALIGNED(y) && SRSLTE_IS_ALIGNED(z)) { if (SRSLTE_IS_ALIGNED(x) && SRSLTE_IS_ALIGNED(y) && SRSLTE_IS_ALIGNED(z)) {
for (; i < len - SRSLTE_SIMD_S_SIZE + 1; i += SRSLTE_SIMD_S_SIZE) { for (; i < len - SRSLTE_SIMD_S_SIZE + 1; i += SRSLTE_SIMD_S_SIZE) {
simd_s_t a = srslte_simd_s_load(&x[i]); simd_s_t a = srslte_simd_s_load(&x[i]);
@ -106,7 +106,7 @@ void srslte_vec_sum_sss_simd(int16_t *x, int16_t *y, int16_t *z, int len) {
void srslte_vec_sub_sss_simd(int16_t *x, int16_t *y, int16_t *z, int len) { void srslte_vec_sub_sss_simd(int16_t *x, int16_t *y, int16_t *z, int len) {
int i = 0; int i = 0;
#ifdef SRSLTE_SIMD_S_SIZE #if SRSLTE_SIMD_S_SIZE
if (SRSLTE_IS_ALIGNED(x) && SRSLTE_IS_ALIGNED(y) && SRSLTE_IS_ALIGNED(z)) { if (SRSLTE_IS_ALIGNED(x) && SRSLTE_IS_ALIGNED(y) && SRSLTE_IS_ALIGNED(z)) {
for (; i < len - SRSLTE_SIMD_S_SIZE + 1; i += SRSLTE_SIMD_S_SIZE) { for (; i < len - SRSLTE_SIMD_S_SIZE + 1; i += SRSLTE_SIMD_S_SIZE) {
simd_s_t a = srslte_simd_s_load(&x[i]); simd_s_t a = srslte_simd_s_load(&x[i]);
@ -135,7 +135,7 @@ void srslte_vec_sub_sss_simd(int16_t *x, int16_t *y, int16_t *z, int len) {
void srslte_vec_prod_sss_simd(int16_t *x, int16_t *y, int16_t *z, int len) { void srslte_vec_prod_sss_simd(int16_t *x, int16_t *y, int16_t *z, int len) {
int i = 0; int i = 0;
#ifdef SRSLTE_SIMD_S_SIZE #if SRSLTE_SIMD_S_SIZE
if (SRSLTE_IS_ALIGNED(x) && SRSLTE_IS_ALIGNED(y) && SRSLTE_IS_ALIGNED(z)) { if (SRSLTE_IS_ALIGNED(x) && SRSLTE_IS_ALIGNED(y) && SRSLTE_IS_ALIGNED(z)) {
for (; i < len - SRSLTE_SIMD_S_SIZE + 1; i += SRSLTE_SIMD_S_SIZE) { for (; i < len - SRSLTE_SIMD_S_SIZE + 1; i += SRSLTE_SIMD_S_SIZE) {
simd_s_t a = srslte_simd_s_load(&x[i]); simd_s_t a = srslte_simd_s_load(&x[i]);
@ -721,14 +721,14 @@ void srslte_vec_sc_prod_ccc_simd(cf_t *x, cf_t h, cf_t *z, int len) {
} }
} else { } else {
for (; i < len - SRSLTE_SIMD_F_SIZE / 2 + 1; i += SRSLTE_SIMD_F_SIZE / 2) { for (; i < len - SRSLTE_SIMD_F_SIZE / 2 + 1; i += SRSLTE_SIMD_F_SIZE / 2) {
simd_f_t temp = srslte_simd_f_load((float *) &x[i]); simd_f_t temp = srslte_simd_f_loadu((float *) &x[i]);
simd_f_t m1 = srslte_simd_f_mul(hre, temp); simd_f_t m1 = srslte_simd_f_mul(hre, temp);
simd_f_t sw = srslte_simd_f_swap(temp); simd_f_t sw = srslte_simd_f_swap(temp);
simd_f_t m2 = srslte_simd_f_mul(him, sw); simd_f_t m2 = srslte_simd_f_mul(him, sw);
simd_f_t r = srslte_simd_f_addsub(m1, m2); simd_f_t r = srslte_simd_f_addsub(m1, m2);
srslte_simd_f_store((float *) &z[i], r); srslte_simd_f_storeu((float *) &z[i], r);
} }
} }
#endif #endif

Loading…
Cancel
Save