From c9f6bfccd47d6dfcda2930e7cb81465e38fb3b9d Mon Sep 17 00:00:00 2001 From: Xavier Arteaga Date: Mon, 25 Sep 2017 13:19:34 +0200 Subject: [PATCH] Refactored vector library with SIMD independent architecture inline functions test-benchmark --- lib/include/srslte/phy/utils/vector.h | 8 +- lib/include/srslte/phy/utils/vector_simd.h | 65 +- lib/src/phy/utils/test/CMakeLists.txt | 3 + lib/src/phy/utils/test/vector_test.c | 555 +++++++++++ lib/src/phy/utils/vector.c | 228 +---- lib/src/phy/utils/vector_simd.c | 1018 ++++++++++---------- 6 files changed, 1138 insertions(+), 739 deletions(-) create mode 100644 lib/src/phy/utils/test/vector_test.c diff --git a/lib/include/srslte/phy/utils/vector.h b/lib/include/srslte/phy/utils/vector.h index 4a55d18b6..0fadfb334 100644 --- a/lib/include/srslte/phy/utils/vector.h +++ b/lib/include/srslte/phy/utils/vector.h @@ -80,8 +80,8 @@ SRSLTE_API void srslte_vec_load_file(char *filename, void *buffer, uint32_t len) SRSLTE_API void srslte_vec_sum_ch(uint8_t *x, uint8_t *y, char *z, uint32_t len); SRSLTE_API void srslte_vec_sum_fff(float *x, float *y, float *z, uint32_t len); SRSLTE_API void srslte_vec_sum_ccc(cf_t *x, cf_t *y, cf_t *z, uint32_t len); -SRSLTE_API void srslte_vec_sub_sss(short *x, short *y, short *z, uint32_t len); -SRSLTE_API void srslte_vec_sum_sss(short *x, short *y, short *z, uint32_t len); +SRSLTE_API void srslte_vec_sub_sss(int16_t *x, int16_t *y, int16_t *z, uint32_t len); +SRSLTE_API void srslte_vec_sum_sss(int16_t *x, int16_t *y, int16_t *z, uint32_t len); /* substract two vectors z=x-y */ SRSLTE_API void srslte_vec_sub_fff(float *x, float *y, float *z, uint32_t len); @@ -91,7 +91,7 @@ SRSLTE_API void srslte_vec_sub_ccc(cf_t *x, cf_t *y, cf_t *z, uint32_t len); SRSLTE_API void srslte_vec_ema_filter(cf_t *new_data, cf_t *average, cf_t *output, float coeff, uint32_t len); /* Square distance */ -SRSLTE_API void srslte_vec_square_dist(cf_t symbol, cf_t *points, float *distance, uint32_t npoints); +//SRSLTE_API void srslte_vec_square_dist(cf_t symbol, cf_t *points, float *distance, uint32_t npoints); /* scalar addition */ SRSLTE_API void srslte_vec_sc_add_fff(float *x, float h, float *z, uint32_t len); @@ -132,7 +132,7 @@ SRSLTE_API void srslte_vec_prod_conj_ccc(cf_t *x, cf_t *y, cf_t *z, uint32_t len /* real vector product (element-wise) */ SRSLTE_API void srslte_vec_prod_fff(float *x, float *y, float *z, uint32_t len); -SRSLTE_API void srslte_vec_prod_sss(short *x, short *y, short *z, uint32_t len); +SRSLTE_API void srslte_vec_prod_sss(int16_t *x, int16_t *y, int16_t *z, uint32_t len); /* Dot-product */ SRSLTE_API cf_t srslte_vec_dot_prod_cfc(cf_t *x, float *y, uint32_t len); diff --git a/lib/include/srslte/phy/utils/vector_simd.h b/lib/include/srslte/phy/utils/vector_simd.h index 1010cbed6..8ea2ce9bc 100644 --- a/lib/include/srslte/phy/utils/vector_simd.h +++ b/lib/include/srslte/phy/utils/vector_simd.h @@ -35,47 +35,66 @@ extern "C" { #include #include "srslte/config.h" -SRSLTE_API int srslte_vec_dot_prod_sss_sse(short *x, short *y, uint32_t len); +#ifdef LV_HAVE_AVX512 +#define SRSLTE_IS_ALIGNED(PTR) (((size_t)(PTR) & 0x3F) == 0) +#else /* LV_HAVE_AVX512 */ +#ifdef LV_HAVE_AVX +#define SRSLTE_IS_ALIGNED(PTR) (((size_t)(PTR) & 0x1F) == 0) +#else /* LV_HAVE_AVX */ +#ifdef LV_HAVE_SSE +#define SRSLTE_IS_ALIGNED(PTR) (((size_t)(PTR) & 0x0F) == 0) +#else /* LV_HAVE_SSE */ +#define SRSLTE_IS_ALIGNED(PTR) (true) +#endif /* LV_HAVE_SSE */ +#endif /* LV_HAVE_AVX */ +#endif /* LV_HAVE_AVX512 */ + +SRSLTE_API int srslte_vec_dot_prod_sss_simd(int16_t *x, int16_t *y, int len); + +SRSLTE_API void srslte_vec_sum_sss_simd(int16_t *x, int16_t *y, int16_t *z, int len); + +SRSLTE_API void srslte_vec_sub_sss_simd(int16_t *x, int16_t *y, int16_t *z, int len); -SRSLTE_API int srslte_vec_dot_prod_sss_avx2(short *x, short *y, uint32_t len); - -SRSLTE_API void srslte_vec_sum_sss_sse(short *x, short *y, short *z, uint32_t len); +SRSLTE_API void srslte_vec_sub_sss_avx2(short *x, short *y, short *z, uint32_t len); -SRSLTE_API void srslte_vec_sum_sss_avx2(short *x, short *y, short *z, uint32_t len); +SRSLTE_API cf_t srslte_vec_acc_cc_simd(cf_t *x, int len); -SRSLTE_API void srslte_vec_sub_sss_sse(short *x, short *y, short *z, uint32_t len); +SRSLTE_API void srslte_vec_add_fff_simd(float *x, float *y, float *z, int len); -SRSLTE_API void srslte_vec_sub_sss_avx2(short *x, short *y, short *z, uint32_t len); +SRSLTE_API void srslte_vec_sub_fff_simd(float *x, float *y, float *z, int len); -SRSLTE_API void srslte_vec_sum_fff_sse(float *x, float *y, float *z, uint32_t len); +SRSLTE_API void srslte_vec_sc_prod_fff_simd(float *x, float h, float *z, int len); -SRSLTE_API void srslte_vec_sum_fff_avx(float *x, float *y, float *z, uint32_t len); +SRSLTE_API void srslte_vec_sc_prod_ccc_simd(cf_t *x, cf_t h, cf_t *z, int len); -SRSLTE_API void srslte_vec_sub_fff_sse(float *x, float *y, float *z, uint32_t len); +SRSLTE_API void srslte_vec_prod_fff_simd(float *x, float *y, float *z, int len); -SRSLTE_API void srslte_vec_sub_fff_avx(float *x, float *y, float *z, uint32_t len); +SRSLTE_API void srslte_vec_prod_ccc_simd(cf_t *x,cf_t *y, cf_t *z, int len); -SRSLTE_API void srslte_vec_sc_prod_fff_sse(float *x, float h, float *z, uint32_t len); +SRSLTE_API void srslte_vec_prod_conj_ccc_simd(cf_t *x,cf_t *y, cf_t *z, int len); -SRSLTE_API void srslte_vec_sc_prod_ccc_sse(cf_t *x, cf_t h, cf_t *z, uint32_t len); +SRSLTE_API void srslte_vec_prod_ccc_cf_simd(float *a_re, float *a_im, float *b_re, float *b_im, float *r_re, float *r_im, int len); -SRSLTE_API void srslte_vec_prod_ccc_sse(cf_t *x,cf_t *y, cf_t *z, uint32_t len); - -SRSLTE_API void srslte_vec_prod_sss_sse(short *x, short *y, short *z, uint32_t len); +SRSLTE_API void srslte_vec_prod_ccc_c16_simd(int16_t *a_re, int16_t *a_im, int16_t *b_re, int16_t *b_im, int16_t *r_re, + int16_t *r_im, int len); -SRSLTE_API void srslte_vec_prod_sss_avx2(short *x, short *y, short *z, uint32_t len); +SRSLTE_API void srslte_vec_prod_sss_simd(int16_t *x, int16_t *y, int16_t *z, int len); -SRSLTE_API cf_t srslte_vec_dot_prod_conj_ccc_sse(cf_t *x, cf_t *y, uint32_t len); +SRSLTE_API cf_t srslte_vec_dot_prod_conj_ccc_simd(cf_t *x, cf_t *y, int len); -SRSLTE_API void srslte_vec_prod_conj_ccc_sse(cf_t *x,cf_t *y, cf_t *z, uint32_t len); +SRSLTE_API cf_t srslte_vec_dot_prod_ccc_simd(cf_t *x, cf_t *y, int len); SRSLTE_API cf_t srslte_vec_dot_prod_ccc_sse(cf_t *x, cf_t *y, uint32_t len); +SRSLTE_API c16_t srslte_vec_dot_prod_ccc_c16i_simd(c16_t *x, c16_t *y, int len); + SRSLTE_API void srslte_vec_sc_div2_sss_avx2(short *x, int k, short *z, uint32_t len); -SRSLTE_API void srslte_vec_abs_square_cf_sse(cf_t *x, float *z, uint32_t len); +SRSLTE_API void srslte_vec_abs_cf_simd(cf_t *x, float *z, int len); + +SRSLTE_API void srslte_vec_abs_square_cf_simd(cf_t *x, float *z, int len); -SRSLTE_API void srslte_vec_prod_sss_sse(short *x, short *y, short *z, uint32_t len); +SRSLTE_API void srslte_vec_prod_sss_sse(short *x, short *y, short *z, uint32_t len); SRSLTE_API void srslte_vec_prod_sss_avx(short *x, short *y, short *z, uint32_t len); @@ -93,7 +112,9 @@ SRSLTE_API void srslte_vec_lut_sss_sse(short *x, unsigned short *lut, short *y, SRSLTE_API void srslte_vec_convert_fi_sse(float *x, int16_t *z, float scale, uint32_t len); -SRSLTE_API void srslte_vec_sc_prod_cfc_avx(const cf_t *x,const float h,cf_t *y,const uint32_t len); +SRSLTE_API void srslte_vec_sc_prod_cfc_simd(const cf_t *x,const float h,cf_t *y,const int len); + +SRSLTE_API void srslte_vec_cp_simd(cf_t *src, cf_t *dst, int len); #ifdef __cplusplus } diff --git a/lib/src/phy/utils/test/CMakeLists.txt b/lib/src/phy/utils/test/CMakeLists.txt index 4dccbf2a0..76df7ac59 100644 --- a/lib/src/phy/utils/test/CMakeLists.txt +++ b/lib/src/phy/utils/test/CMakeLists.txt @@ -42,3 +42,6 @@ target_link_libraries(algebra_test srslte_phy) add_test(algebra_2x2_zf_solver_test algebra_test -z) add_test(algebra_2x2_mmse_solver_test algebra_test -m) + +add_executable(vector_test vector_test.c) +target_link_libraries(vector_test srslte_phy) \ No newline at end of file diff --git a/lib/src/phy/utils/test/vector_test.c b/lib/src/phy/utils/test/vector_test.c new file mode 100644 index 000000000..e781d05b9 --- /dev/null +++ b/lib/src/phy/utils/test/vector_test.c @@ -0,0 +1,555 @@ +/** + * + * \section COPYRIGHT + * + * Copyright 2013-2015 Software Radio Systems Limited + * + * \section LICENSE + * + * This file is part of the srsLTE library. + * + * srsLTE is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License as + * published by the Free Software Foundation, either version 3 of + * the License, or (at your option) any later version. + * + * srsLTE is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * A copy of the GNU Affero General Public License can be found in + * the LICENSE file in the top-level directory of this distribution + * and at http://www.gnu.org/licenses/. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "srslte/phy/utils/mat.h" +#include "srslte/phy/utils/simd.h" +#include "srslte/phy/utils/vector.h" + + +bool zf_solver = false; +bool mmse_solver = false; +bool verbose = false; + +#define MAX_MSE (1e-3) +#define NOF_REPETITIONS (1024*128) +#define MAX_FUNCTIONS (64) +#define MAX_BLOCKS (16) + +#define RANDOM_F() ((float)rand())/((float)RAND_MAX) +#define RANDOM_S() ((int16_t)(rand() && 0x800F)) +#define RANDOM_CF() (RANDOM_F() + _Complex_I*RANDOM_F()) + +#define TEST_CALL(TEST_CODE) gettimeofday(&start, NULL);\ + for (int i = 0; i < NOF_REPETITIONS; i++){TEST_CODE;}\ + gettimeofday(&end, NULL); \ + *timing = elapsed_us(&start, &end); + +#define TEST(X, CODE) static bool test_##X (char *func_name, double *timing, uint32_t block_size) {\ + struct timeval start, end;\ + float mse = 0.0f;\ + bool passed;\ + strncpy(func_name, #X, 32);\ + CODE;\ + passed = (mse < MAX_MSE);\ + printf("%32s (%5d) ... %7.1f MSamp/s ... %3s Passed\n", func_name, block_size, \ + (double) block_size*NOF_REPETITIONS/ *timing, passed?"":"Not");\ + return passed;\ +} + +#define MALLOC(TYPE, NAME) TYPE *NAME = srslte_vec_malloc(sizeof(TYPE)*block_size) + + +static double elapsed_us(struct timeval *ts_start, struct timeval *ts_end) { + if (ts_end->tv_usec > ts_start->tv_usec) { + return ((double) ts_end->tv_sec - (double) ts_start->tv_sec) * 1000000 + + (double) ts_end->tv_usec - (double) ts_start->tv_usec; + } else { + return ((double) ts_end->tv_sec - (double) ts_start->tv_sec - 1) * 1000000 + + ((double) ts_end->tv_usec + 1000000) - (double) ts_start->tv_usec; + } +} + +float squared_error (cf_t a, cf_t b) { + float diff_re = __real__ a - __real__ b; + float diff_im = __imag__ a - __imag__ b; + return diff_re*diff_re + diff_im*diff_im; +} + +TEST(srslte_vec_dot_prod_sss, + MALLOC(int16_t, x); + MALLOC(int16_t, y); + int16_t z; + + cf_t gold = 0.0f; + for (int i = 0; i < block_size; i++) { + x[i] = RANDOM_S(); + y[i] = RANDOM_S(); + } + + TEST_CALL(z = srslte_vec_dot_prod_sss(x, y, block_size)) + + for (int i = 0; i < block_size; i++) { + gold += x[i] * y[i]; + } + + mse += cabsf(gold - z) / cabsf(gold); + + free(x); + free(y); +) + +TEST(srslte_vec_sum_sss, + MALLOC(int16_t, x); + MALLOC(int16_t, y); + MALLOC(int16_t, z); + + cf_t gold = 0.0f; + for (int i = 0; i < block_size; i++) { + x[i] = RANDOM_S(); + y[i] = RANDOM_S(); + } + + TEST_CALL(srslte_vec_sum_sss(x, y, z, block_size)) + + for (int i = 0; i < block_size; i++) { + gold = x[i] + y[i]; + mse += cabsf(gold - z[i]); + } + + free(x); + free(y); + free(z); +) + +TEST(srslte_vec_sub_sss, + MALLOC(int16_t, x); + MALLOC(int16_t, y); + MALLOC(int16_t, z); + + cf_t gold = 0.0f; + for (int i = 0; i < block_size; i++) { + x[i] = RANDOM_S(); + y[i] = RANDOM_S(); + } + + TEST_CALL(srslte_vec_sub_sss(x, y, z, block_size)) + + for (int i = 0; i < block_size; i++) { + gold = x[i] - y[i]; + mse += cabsf(gold - z[i]); + } + + free(x); + free(y); + free(z); +) + +TEST(srslte_vec_prod_sss, + MALLOC(int16_t, x); + MALLOC(int16_t, y); + MALLOC(int16_t, z); + + cf_t gold = 0.0f; + for (int i = 0; i < block_size; i++) { + x[i] = RANDOM_S(); + y[i] = RANDOM_S(); + } + + TEST_CALL(srslte_vec_prod_sss(x, y, z, block_size)) + + for (int i = 0; i < block_size; i++) { + gold = x[i] * y[i]; + mse += cabsf(gold - z[i]); + } + + free(x); + free(y); + free(z); +) + +TEST(srslte_vec_acc_cc, + MALLOC(cf_t, x); + cf_t z; + + cf_t gold = 0.0f; + for (int i = 0; i < block_size; i++) { + x[i] = RANDOM_F(); + } + + TEST_CALL(z = srslte_vec_acc_cc(x, block_size)) + + for (int i = 0; i < block_size; i++) { + gold += x[i]; + } + + mse += cabsf(gold - z)/cabsf(gold); + + free(x); +) + + +TEST(srslte_vec_sum_fff, + MALLOC(float, x); + MALLOC(float, y); + MALLOC(float, z); + + cf_t gold = 0.0f; + for (int i = 0; i < block_size; i++) { + x[i] = RANDOM_F(); + y[i] = RANDOM_F(); + } + + TEST_CALL(srslte_vec_sum_fff(x, y, z, block_size)) + + for (int i = 0; i < block_size; i++) { + gold = x[i] + y[i]; + mse += cabsf(gold - z[i]); + } + + free(x); + free(y); +) + +TEST(srslte_vec_sub_fff, + MALLOC(float, x); + MALLOC(float, y); + MALLOC(float, z); + + cf_t gold = 0.0f; + for (int i = 0; i < block_size; i++) { + x[i] = RANDOM_F(); + y[i] = RANDOM_F(); + } + + TEST_CALL(srslte_vec_sub_fff(x, y, z, block_size)) + + for (int i = 0; i < block_size; i++) { + gold = x[i] - y[i]; + mse += cabsf(gold - z[i]); + } + + free(x); + free(y); +) + +TEST(srslte_vec_dot_prod_ccc, + MALLOC(cf_t, x); + MALLOC(cf_t, y); + cf_t z; + + cf_t gold = 0.0f; + for (int i = 0; i < block_size; i++) { + x[i] = RANDOM_CF(); + y[i] = RANDOM_CF(); + } + + TEST_CALL(z = srslte_vec_dot_prod_ccc(x, y, block_size)) + + for (int i = 0; i < block_size; i++) { + gold += x[i] * y[i]; + } + + mse = cabsf(gold - z) / cabsf(gold); + + free(x); + free(y); +) + +TEST(srslte_vec_dot_prod_conj_ccc, + MALLOC(cf_t, x); + MALLOC(cf_t, y); + cf_t z; + + cf_t gold = 0.0f; + for (int i = 0; i < block_size; i++) { + x[i] = RANDOM_CF(); + y[i] = RANDOM_CF(); + } + + TEST_CALL(z = srslte_vec_dot_prod_conj_ccc(x, y, block_size)) + + for (int i = 0; i < block_size; i++) { + gold += x[i] * conjf(y[i]); + } + + mse = cabsf(gold - z) / cabsf(gold); + + free(x); + free(y); +) + +TEST(srslte_vec_prod_ccc, + MALLOC(cf_t, x); + MALLOC(cf_t, y); + MALLOC(cf_t, z); + + cf_t gold; + for (int i = 0; i < block_size; i++) { + x[i] = RANDOM_CF(); + y[i] = RANDOM_CF(); + } + + TEST_CALL(srslte_vec_prod_ccc(x, y, z, block_size)) + + for (int i = 0; i < block_size; i++) { + gold = x[i] * y[i]; + mse += cabsf(gold - z[i]); + } + + free(x); + free(z); +) + +TEST(srslte_vec_prod_conj_ccc, + MALLOC(cf_t, x); + MALLOC(cf_t, y); + MALLOC(cf_t, z); + + cf_t gold; + for (int i = 0; i < block_size; i++) { + x[i] = RANDOM_CF(); + y[i] = RANDOM_CF(); + } + + TEST_CALL(srslte_vec_prod_conj_ccc(x, y, z, block_size)) + + for (int i = 0; i < block_size; i++) { + gold = x[i] * conjf(y[i]); + mse += cabsf(gold - z[i]); + } + + free(x); + free(z); +) + +TEST(srslte_vec_sc_prod_ccc, + MALLOC(cf_t, x); + MALLOC(cf_t, z); + cf_t y = RANDOM_F(); + + cf_t gold; + for (int i = 0; i < block_size; i++) { + x[i] = RANDOM_CF(); + } + + TEST_CALL(srslte_vec_sc_prod_ccc(x, y, z, block_size)) + + for (int i = 0; i < block_size; i++) { + gold = x[i] * y; + mse += cabsf(gold - z[i]); + } + + free(x); + free(z); +) + +TEST(srslte_vec_prod_fff, + MALLOC(float, x); + MALLOC(float, y); + MALLOC(float, z); + + cf_t gold; + for (int i = 0; i < block_size; i++) { + x[i] = RANDOM_CF(); + y[i] = RANDOM_CF(); + } + + TEST_CALL(srslte_vec_prod_fff(x, y, z, block_size)) + + for (int i = 0; i < block_size; i++) { + gold = x[i] * y[i]; + mse += cabsf(gold - z[i]); + } + + free(x); + free(z); +) + +TEST(srslte_vec_sc_prod_fff, + MALLOC(float, x); + MALLOC(float, z); + float y = RANDOM_F(); + + float gold; + for (int i = 0; i < block_size; i++) { + x[i] = RANDOM_CF(); + } + + TEST_CALL(srslte_vec_sc_prod_fff(x, y, z, block_size)) + + for (int i = 0; i < block_size; i++) { + gold = x[i] * y; + mse += cabsf(gold - z[i]); + } + + free(x); + free(z); +) + +TEST(srslte_vec_abs_cf, + MALLOC(cf_t, x); + MALLOC(float, z); + float gold; + + for (int i = 0; i < block_size; i++) { + x[i] = RANDOM_CF(); + } + + TEST_CALL(srslte_vec_abs_cf(x, z, block_size)) + + for (int i = 0; i < block_size; i++) { + gold = sqrtf(crealf(x[i]) * crealf(x[i]) + cimagf(x[i])*cimagf(x[i])); + mse += cabsf(gold - z[i]); + } + + free(x); + free(z); +) + +TEST(srslte_vec_abs_square_cf, + MALLOC(cf_t, x); + MALLOC(float, z); + float gold; + + for (int i = 0; i < block_size; i++) { + x[i] = RANDOM_CF(); + } + + TEST_CALL(srslte_vec_abs_square_cf(x, z, block_size)) + + for (int i = 0; i < block_size; i++) { + gold = crealf(x[i]) * crealf(x[i]) + cimagf(x[i])*cimagf(x[i]); + mse += cabsf(gold - z[i]); + } + + free(x); + free(z); +) + +TEST(srslte_vec_sc_prod_cfc, + MALLOC(cf_t, x); + MALLOC(cf_t, z); + cf_t gold; + float h = RANDOM_F(); + + for (int i = 0; i < block_size; i++) { + x[i] = RANDOM_CF(); + } + + TEST_CALL(srslte_vec_sc_prod_cfc(x, h, z, block_size)) + + for (int i = 0; i < block_size; i++) { + gold = x[i] * h; + mse += cabsf(gold - z[i]); + } + + free(x); + free(z); +) + +int main(int argc, char **argv) { + char func_names[MAX_FUNCTIONS][32]; + double timmings[MAX_FUNCTIONS][MAX_BLOCKS]; + uint32_t sizes[32]; + uint32_t size_count = 0; + uint32_t func_count = 0; + bool passed = true; + + for (uint32_t block_size = 1; block_size <= 1024*16; block_size *= 2) { + func_count = 0; + + passed &= test_srslte_vec_dot_prod_sss(func_names[func_count], &timmings[func_count][size_count], block_size); + func_count++; + + passed &= test_srslte_vec_sum_sss(func_names[func_count], &timmings[func_count][size_count], block_size); + func_count++; + + passed &= test_srslte_vec_sub_sss(func_names[func_count], &timmings[func_count][size_count], block_size); + func_count++; + + passed &= test_srslte_vec_prod_sss(func_names[func_count], &timmings[func_count][size_count], block_size); + func_count++; + + passed &= test_srslte_vec_acc_cc(func_names[func_count], &timmings[func_count][size_count], block_size); + func_count++; + + passed &= test_srslte_vec_sum_fff(func_names[func_count], &timmings[func_count][size_count], block_size); + func_count++; + + passed &= test_srslte_vec_sub_fff(func_names[func_count], &timmings[func_count][size_count], block_size); + func_count++; + + passed &= test_srslte_vec_dot_prod_ccc(func_names[func_count], &timmings[func_count][size_count], block_size); + func_count++; + + passed &= test_srslte_vec_dot_prod_conj_ccc(func_names[func_count], &timmings[func_count][size_count], block_size); + func_count++; + + passed &= test_srslte_vec_prod_fff(func_names[func_count], &timmings[func_count][size_count], block_size); + func_count++; + + passed &= test_srslte_vec_prod_ccc(func_names[func_count], &timmings[func_count][size_count], block_size); + func_count++; + + passed &= test_srslte_vec_prod_conj_ccc(func_names[func_count], &timmings[func_count][size_count], block_size); + func_count++; + + passed &= test_srslte_vec_sc_prod_ccc(func_names[func_count], &timmings[func_count][size_count], block_size); + func_count++; + + passed &= test_srslte_vec_sc_prod_fff(func_names[func_count], &timmings[func_count][size_count], block_size); + func_count++; + + passed &= test_srslte_vec_abs_cf(func_names[func_count], &timmings[func_count][size_count], block_size); + func_count++; + + passed &= test_srslte_vec_abs_square_cf(func_names[func_count], &timmings[func_count][size_count], block_size); + func_count++; + + passed &= test_srslte_vec_sc_prod_cfc(func_names[func_count], &timmings[func_count][size_count], block_size); + func_count++; + + sizes[size_count] = block_size; + size_count++; + } + + printf("\n"); + printf("%32s |", "Subroutine/MSps"); + for (int i = 0; i < size_count; i++) { + printf(" %7d", sizes[i]); + } + printf(" |\n"); + + for (int j = 0; j < 32; j++) { + printf("-"); + } + printf("-+-"); + for (int j = 0; j < size_count; j++) { + printf("--------"); + } + printf("-|\n"); + + for (int i = 0; i < func_count; i++) { + printf("%32s | ", func_names[i]); + for (int j = 0; j < size_count; j++) { + printf(" %7.1f", (double) NOF_REPETITIONS*(double)sizes[j]/timmings[i][j]); + } + printf(" |\n"); + } + + return (passed)?SRSLTE_SUCCESS:SRSLTE_ERROR; +} diff --git a/lib/src/phy/utils/vector.c b/lib/src/phy/utils/vector.c index 917810e92..cb21f24f1 100644 --- a/lib/src/phy/utils/vector.c +++ b/lib/src/phy/utils/vector.c @@ -36,25 +36,6 @@ #include "srslte/phy/utils/bit.h" -#ifdef LV_HAVE_SSE -#include -#endif - -#ifdef LV_HAVE_AVX -#include -#endif - - -#ifdef HAVE_VOLK -#include "volk/volk.h" -#endif - -#ifdef DEBUG_MODE -#warning FIXME: Disabling SSE/AVX vector code -#undef LV_HAVE_SSE -#undef LV_HAVE_AVX -#endif - int srslte_vec_acc_ii(int *x, uint32_t len) { int i; @@ -88,51 +69,25 @@ void srslte_vec_ema_filter(cf_t *new_data, cf_t *average, cf_t *output, float co } cf_t srslte_vec_acc_cc(cf_t *x, uint32_t len) { - int i; - cf_t z=0; - for (i=0;i #include #include #include #include - -#include "srslte/phy/utils/vector_simd.h" - #include #include -#ifdef LV_HAVE_SSE -#include -#endif - -#ifdef LV_HAVE_AVX -#include -#endif - - -int srslte_vec_dot_prod_sss_sse(short *x, short *y, uint32_t len) -{ - int result = 0; -#ifdef LV_HAVE_SSE - unsigned int number = 0; - const unsigned int points = len / 8; +#include +#include "srslte/phy/utils/vector_simd.h" +#include "srslte/phy/utils/simd.h" - const __m128i* xPtr = (const __m128i*) x; - const __m128i* yPtr = (const __m128i*) y; - - __m128i dotProdVal = _mm_setzero_si128(); - __m128i xVal, yVal, zVal; - for(;number < points; number++){ +int srslte_vec_dot_prod_sss_simd(int16_t *x, int16_t *y, int len) { + int i = 0; + int result = 0; +#if SRSLTE_SIMD_S_SIZE + simd_s_t simd_dotProdVal = srslte_simd_s_zero(); + if (SRSLTE_IS_ALIGNED(x) && SRSLTE_IS_ALIGNED(y)) { + for (; i < len - SRSLTE_SIMD_S_SIZE + 1; i += SRSLTE_SIMD_S_SIZE) { + simd_s_t a = srslte_simd_s_load(&x[i]); + simd_s_t b = srslte_simd_s_load(&y[i]); - xVal = _mm_load_si128(xPtr); - yVal = _mm_loadu_si128(yPtr); + simd_s_t z = srslte_simd_s_mul(a, b); - zVal = _mm_mullo_epi16(xVal, yVal); + simd_dotProdVal = srslte_simd_s_add(simd_dotProdVal, z); + } + } else { + for (; i < len - SRSLTE_SIMD_S_SIZE + 1; i += SRSLTE_SIMD_S_SIZE) { + simd_s_t a = srslte_simd_s_loadu(&x[i]); + simd_s_t b = srslte_simd_s_loadu(&y[i]); - dotProdVal = _mm_add_epi16(dotProdVal, zVal); + simd_s_t z = srslte_simd_s_mul(a, b); - xPtr ++; - yPtr ++; + simd_dotProdVal = srslte_simd_s_add(simd_dotProdVal, z); + } } - - short dotProdVector[8]; - _mm_store_si128((__m128i*) dotProdVector, dotProdVal); - for (int i=0;i<8;i++) { - result += dotProdVector[i]; + __attribute__ ((aligned (SRSLTE_SIMD_S_SIZE*2))) short dotProdVector[SRSLTE_SIMD_S_SIZE]; + srslte_simd_s_store(dotProdVector, simd_dotProdVal); + for (int k = 0; k < SRSLTE_SIMD_S_SIZE; k++) { + result += dotProdVector[k]; } +#endif /* SRSLTE_SIMD_S_SIZE */ - number = points * 8; - for(;number < len; number++){ - result += (x[number] * y[number]); + for(; i < len; i++){ + result += (x[i] * y[i]); } - -#endif - return result; -} - -int srslte_vec_dot_prod_sss_avx2(short *x, short *y, uint32_t len) -{ - int result = 0; -#ifdef LV_HAVE_AVX2 - unsigned int number = 0; - const unsigned int points = len / 16; - - const __m256i* xPtr = (const __m256i*) x; - const __m256i* yPtr = (const __m256i*) y; - - __m256i dotProdVal = _mm256_setzero_si256(); - - __m256i xVal, yVal, zVal; - for(;number < points; number++){ - - xVal = _mm256_load_si256(xPtr); - yVal = _mm256_loadu_si256(yPtr); - zVal = _mm256_mullo_epi16(xVal, yVal); - dotProdVal = _mm256_add_epi16(dotProdVal, zVal); - xPtr ++; - yPtr ++; - } - - __attribute__ ((aligned (256))) short dotProdVector[16]; - _mm256_store_si256((__m256i*) dotProdVector, dotProdVal); - for (int i=0;i<16;i++) { - result += dotProdVector[i]; - } - - number = points * 16; - for(;number < len; number++){ - result += (x[number] * y[number]); - } - -#endif return result; } +void srslte_vec_sum_sss_simd(int16_t *x, int16_t *y, int16_t *z, int len) { + int i = 0; +#ifdef SRSLTE_SIMD_S_SIZE + if (SRSLTE_IS_ALIGNED(x) && SRSLTE_IS_ALIGNED(y) && SRSLTE_IS_ALIGNED(z)) { + for (; i < len - SRSLTE_SIMD_S_SIZE + 1; i += SRSLTE_SIMD_S_SIZE) { + simd_s_t a = srslte_simd_s_load(&x[i]); + simd_s_t b = srslte_simd_s_load(&y[i]); + simd_s_t r = srslte_simd_s_add(a, b); -void srslte_vec_sum_sss_sse(short *x, short *y, short *z, uint32_t len) -{ -#ifdef LV_HAVE_SSE - unsigned int number = 0; - const unsigned int points = len / 8; - - const __m128i* xPtr = (const __m128i*) x; - const __m128i* yPtr = (const __m128i*) y; - __m128i* zPtr = (__m128i*) z; - - __m128i xVal, yVal, zVal; - for(;number < points; number++){ - - xVal = _mm_load_si128(xPtr); - yVal = _mm_load_si128(yPtr); - - zVal = _mm_add_epi16(xVal, yVal); - - _mm_store_si128(zPtr, zVal); - - xPtr ++; - yPtr ++; - zPtr ++; - } - - number = points * 8; - for(;number < len; number++){ - z[number] = x[number] + y[number]; - } -#endif - -} - -void srslte_vec_sum_sss_avx2(short *x, short *y, short *z, uint32_t len) -{ -#ifdef LV_HAVE_AVX2 - unsigned int number = 0; - const unsigned int points = len / 16; - - const __m256i* xPtr = (const __m256i*) x; - const __m256i* yPtr = (const __m256i*) y; - __m256i* zPtr = (__m256i*) z; - - __m256i xVal, yVal, zVal; - for(;number < points; number++){ - - xVal = _mm256_load_si256(xPtr); - yVal = _mm256_loadu_si256(yPtr); - - zVal = _mm256_add_epi16(xVal, yVal); - _mm256_store_si256(zPtr, zVal); - - xPtr ++; - yPtr ++; - zPtr ++; - } - - number = points * 16; - for(;number < len; number++){ - z[number] = x[number] + y[number]; - } -#endif - -} - - -void srslte_vec_sub_sss_sse(short *x, short *y, short *z, uint32_t len) -{ -#ifdef LV_HAVE_SSE - unsigned int number = 0; - const unsigned int points = len / 8; - - const __m128i* xPtr = (const __m128i*) x; - const __m128i* yPtr = (const __m128i*) y; - __m128i* zPtr = (__m128i*) z; - - __m128i xVal, yVal, zVal; - for(;number < points; number++){ - - xVal = _mm_load_si128(xPtr); - yVal = _mm_load_si128(yPtr); - - zVal = _mm_sub_epi16(xVal, yVal); - - _mm_store_si128(zPtr, zVal); - - xPtr ++; - yPtr ++; - zPtr ++; - } - - number = points * 8; - for(;number < len; number++){ - z[number] = x[number] - y[number]; - } -#endif -} - -void srslte_vec_sub_sss_avx2(short *x, short *y, short *z, uint32_t len) -{ -#ifdef LV_HAVE_AVX2 - unsigned int number = 0; - const unsigned int points = len / 16; - - const __m256i* xPtr = (const __m256i*) x; - const __m256i* yPtr = (const __m256i*) y; - __m256i* zPtr = (__m256i*) z; - - __m256i xVal, yVal, zVal; - for(;number < points; number++){ - - xVal = _mm256_load_si256(xPtr); - yVal = _mm256_loadu_si256(yPtr); - - zVal = _mm256_sub_epi16(xVal, yVal); + srslte_simd_s_store(&z[i], r); + } + } else { + for (; i < len - SRSLTE_SIMD_S_SIZE + 1; i += SRSLTE_SIMD_S_SIZE) { + simd_s_t a = srslte_simd_s_loadu(&x[i]); + simd_s_t b = srslte_simd_s_loadu(&y[i]); - _mm256_store_si256(zPtr, zVal); + simd_s_t r = srslte_simd_s_add(a, b); - xPtr ++; - yPtr ++; - zPtr ++; + srslte_simd_s_storeu(&z[i], r); + } } +#endif /* SRSLTE_SIMD_S_SIZE */ - number = points * 16; - for(;number < len; number++){ - z[number] = x[number] - y[number]; + for(; i < len; i++){ + z[i] = x[i] + y[i]; } - #endif } +void srslte_vec_sub_sss_simd(int16_t *x, int16_t *y, int16_t *z, int len) { + int i = 0; +#ifdef SRSLTE_SIMD_S_SIZE + if (SRSLTE_IS_ALIGNED(x) && SRSLTE_IS_ALIGNED(y) && SRSLTE_IS_ALIGNED(z)) { + for (; i < len - SRSLTE_SIMD_S_SIZE + 1; i += SRSLTE_SIMD_S_SIZE) { + simd_s_t a = srslte_simd_s_load(&x[i]); + simd_s_t b = srslte_simd_s_load(&y[i]); + simd_s_t r = srslte_simd_s_sub(a, b); + srslte_simd_s_store(&z[i], r); + } + } else { + for (; i < len - SRSLTE_SIMD_S_SIZE + 1; i += SRSLTE_SIMD_S_SIZE) { + simd_s_t a = srslte_simd_s_loadu(&x[i]); + simd_s_t b = srslte_simd_s_loadu(&y[i]); -void srslte_vec_prod_sss_sse(short *x, short *y, short *z, uint32_t len) -{ -#ifdef LV_HAVE_SSE - unsigned int number = 0; - const unsigned int points = len / 8; - - const __m128i* xPtr = (const __m128i*) x; - const __m128i* yPtr = (const __m128i*) y; - __m128i* zPtr = (__m128i*) z; - - __m128i xVal, yVal, zVal; - for(;number < points; number++){ - - xVal = _mm_load_si128(xPtr); - yVal = _mm_load_si128(yPtr); - - zVal = _mm_mullo_epi16(xVal, yVal); - - _mm_store_si128(zPtr, zVal); + simd_s_t r = srslte_simd_s_sub(a, b); - xPtr ++; - yPtr ++; - zPtr ++; + srslte_simd_s_storeu(&z[i], r); + } } +#endif /* SRSLTE_SIMD_S_SIZE */ - number = points * 8; - for(;number < len; number++){ - z[number] = x[number] * y[number]; + for(; i < len; i++){ + z[i] = x[i] - y[i]; } -#endif } -void srslte_vec_prod_sss_avx2(short *x, short *y, short *z, uint32_t len) -{ -#ifdef LV_HAVE_AVX2 - unsigned int number = 0; - const unsigned int points = len / 16; - - const __m256i* xPtr = (const __m256i*) x; - const __m256i* yPtr = (const __m256i*) y; - __m256i* zPtr = (__m256i*) z; +void srslte_vec_prod_sss_simd(int16_t *x, int16_t *y, int16_t *z, int len) { + int i = 0; +#ifdef SRSLTE_SIMD_S_SIZE + if (SRSLTE_IS_ALIGNED(x) && SRSLTE_IS_ALIGNED(y) && SRSLTE_IS_ALIGNED(z)) { + for (; i < len - SRSLTE_SIMD_S_SIZE + 1; i += SRSLTE_SIMD_S_SIZE) { + simd_s_t a = srslte_simd_s_load(&x[i]); + simd_s_t b = srslte_simd_s_load(&y[i]); - __m256i xVal, yVal, zVal; - for(;number < points; number++){ - - xVal = _mm256_loadu_si256(xPtr); - yVal = _mm256_loadu_si256(yPtr); + simd_s_t r = srslte_simd_s_mul(a, b); - zVal = _mm256_mullo_epi16(xVal, yVal); + srslte_simd_s_store(&z[i], r); + } + } else { + for (; i < len - SRSLTE_SIMD_S_SIZE + 1; i += SRSLTE_SIMD_S_SIZE) { + simd_s_t a = srslte_simd_s_loadu(&x[i]); + simd_s_t b = srslte_simd_s_loadu(&y[i]); - _mm256_storeu_si256(zPtr, zVal); + simd_s_t r = srslte_simd_s_mul(a, b); - xPtr ++; - yPtr ++; - zPtr ++; + srslte_simd_s_storeu(&z[i], r); + } } +#endif /* SRSLTE_SIMD_S_SIZE */ - number = points * 16; - for(;number < len; number++){ - z[number] = x[number] * y[number]; + for(; i < len; i++){ + z[i] = x[i] * y[i]; } -#endif } - - +#warning remove function if it is not used +/* void srslte_vec_sc_div2_sss_sse(short *x, int k, short *z, uint32_t len) { #ifdef LV_HAVE_SSE @@ -357,8 +195,10 @@ void srslte_vec_sc_div2_sss_sse(short *x, int k, short *z, uint32_t len) z[number] = x[number] / divn; } #endif -} +}*/ +#warning remove function if it is not used +/* void srslte_vec_sc_div2_sss_avx2(short *x, int k, short *z, uint32_t len) { #ifdef LV_HAVE_AVX2 @@ -387,7 +227,7 @@ void srslte_vec_sc_div2_sss_avx2(short *x, int k, short *z, uint32_t len) z[number] = x[number] / divn; } #endif -} +}*/ @@ -531,379 +371,527 @@ void srslte_vec_sum_fff_avx(float *x, float *y, float *z, uint32_t len) { #endif } -void srslte_vec_sub_fff_sse(float *x, float *y, float *z, uint32_t len) { -#ifdef LV_HAVE_SSE - unsigned int number = 0; - const unsigned int points = len / 4; +cf_t srslte_vec_acc_cc_simd(cf_t *x, int len) { + int i = 0; + cf_t acc_sum = 0.0f; - const float* xPtr = (const float*) x; - const float* yPtr = (const float*) y; - float* zPtr = (float*) z; +#if SRSLTE_SIMD_F_SIZE + simd_f_t simd_sum = srslte_simd_f_zero(); - __m128 xVal, yVal, zVal; - for(;number < points; number++){ + if (SRSLTE_IS_ALIGNED(x)) { + for (; i < len - SRSLTE_SIMD_F_SIZE / 2 + 1; i += SRSLTE_SIMD_F_SIZE / 2) { + simd_f_t a = srslte_simd_f_load((float *) &x[i]); - xVal = _mm_loadu_ps(xPtr); - yVal = _mm_loadu_ps(yPtr); + simd_sum = srslte_simd_f_add(simd_sum, a); + } + } else { + for (; i < len - SRSLTE_SIMD_F_SIZE / 2 + 1; i += SRSLTE_SIMD_F_SIZE / 2) { + simd_f_t a = srslte_simd_f_loadu((float *) &x[i]); - zVal = _mm_sub_ps(xVal, yVal); + simd_sum = srslte_simd_f_add(simd_sum, a); + } + } - _mm_storeu_ps(zPtr, zVal); + __attribute__((aligned(64))) cf_t sum[SRSLTE_SIMD_F_SIZE/2]; + srslte_simd_f_store((float*)&sum, simd_sum); + for (int k = 0; k < SRSLTE_SIMD_F_SIZE/2; k++) { + acc_sum += sum[k]; + } +#endif - xPtr += 4; - yPtr += 4; - zPtr += 4; + for (; i