/** * * \section COPYRIGHT * * Copyright 2013-2015 Software Radio Systems Limited * * \section LICENSE * * This file is part of the srsLTE library. * * srsLTE is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as * published by the Free Software Foundation, either version 3 of * the License, or (at your option) any later version. * * srsLTE is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Affero General Public License for more details. * * A copy of the GNU Affero General Public License can be found in * the LICENSE file in the top-level directory of this distribution * and at http://www.gnu.org/licenses/. * */ #include #include #include #include #include #include "srslte/utils/vector_simd.h" #include #include #ifdef LV_HAVE_SSE #include #endif #ifdef LV_HAVE_AVX #include #endif int srslte_vec_dot_prod_sss_sse(short *x, short *y, uint32_t len) { int result = 0; #ifdef LV_HAVE_SSE unsigned int number = 0; const unsigned int points = len / 8; const __m128i* xPtr = (const __m128i*) x; const __m128i* yPtr = (const __m128i*) y; __m128i dotProdVal = _mm_setzero_si128(); __m128i xVal, yVal, zVal; for(;number < points; number++){ xVal = _mm_load_si128(xPtr); yVal = _mm_loadu_si128(yPtr); zVal = _mm_mullo_epi16(xVal, yVal); dotProdVal = _mm_add_epi16(dotProdVal, zVal); xPtr ++; yPtr ++; } short dotProdVector[8]; _mm_store_si128((__m128i*) dotProdVector, dotProdVal); for (int i=0;i<8;i++) { result += dotProdVector[i]; } number = points * 8; for(;number < len; number++){ result += (x[number] * y[number]); } #endif return result; } int srslte_vec_dot_prod_sss_avx(short *x, short *y, uint32_t len) { int result = 0; #ifdef LV_HAVE_AVX unsigned int number = 0; const unsigned int points = len / 16; const __m256i* xPtr = (const __m256i*) x; const __m256i* yPtr = (const __m256*) y; __m256i dotProdVal = _mm256_setzero_si256(); __m256i xVal, yVal, zVal; for(;number < points; number++){ xVal = _mm256_load_si256(xPtr); yVal = _mm256_loadu_si256(yPtr); zVal = _mm256_mullo_epi16(xVal, yVal); dotProdVal = _mm256_add_epi16(dotProdVal, zVal); xPtr ++; yPtr ++; } short dotProdVector[16]; _mm256_store_si256((__m256i*) dotProdVector, dotProdVal); for (int i=0;i<16;i++) { result += dotProdVector[i]; } number = points * 16; for(;number < len; number++){ result += (x[number] * y[number]); } #endif return result; } void srslte_vec_sum_sss_sse(short *x, short *y, short *z, uint32_t len) { #ifdef LV_HAVE_SSE unsigned int number = 0; const unsigned int points = len / 8; const __m128i* xPtr = (const __m128i*) x; const __m128i* yPtr = (const __m128i*) y; __m128i* zPtr = (__m128i*) z; __m128i xVal, yVal, zVal; for(;number < points; number++){ xVal = _mm_load_si128(xPtr); yVal = _mm_load_si128(yPtr); zVal = _mm_add_epi16(xVal, yVal); _mm_store_si128(zPtr, zVal); xPtr ++; yPtr ++; zPtr ++; } number = points * 8; for(;number < len; number++){ z[number] = x[number] + y[number]; } #endif } void srslte_vec_sum_sss_avx(short *x, short *y, short *z, uint32_t len) { #ifdef LV_HAVE_SSE unsigned int number = 0; const unsigned int points = len / 16; const __m256i* xPtr = (const __m256i*) x; const __m256i* yPtr = (const __m256i*) y; __m256i* zPtr = (__m256i*) z; __m256i xVal, yVal, zVal; for(;number < points; number++){ xVal = _mm256_load_si256(xPtr); yVal = _mm256_loadu_si256(yPtr); zVal = _mm256_add_epi16(xVal, yVal); _mm256_store_si256(zPtr, zVal); xPtr ++; yPtr ++; zPtr ++; } number = points * 16; for(;number < len; number++){ z[number] = x[number] + y[number]; } #endif } void srslte_vec_sub_sss_sse(short *x, short *y, short *z, uint32_t len) { #ifdef LV_HAVE_SSE unsigned int number = 0; const unsigned int points = len / 8; const __m128i* xPtr = (const __m128i*) x; const __m128i* yPtr = (const __m128i*) y; __m128i* zPtr = (__m128i*) z; __m128i xVal, yVal, zVal; for(;number < points; number++){ xVal = _mm_load_si128(xPtr); yVal = _mm_load_si128(yPtr); zVal = _mm_sub_epi16(xVal, yVal); _mm_store_si128(zPtr, zVal); xPtr ++; yPtr ++; zPtr ++; } number = points * 8; for(;number < len; number++){ z[number] = x[number] - y[number]; } #endif } void srslte_vec_sub_sss_avx(short *x, short *y, short *z, uint32_t len) { #ifdef LV_HAVE_AVX unsigned int number = 0; const unsigned int points = len / 16; const __m256i* xPtr = (const __m256i*) x; const __m256i* yPtr = (const __m256i*) y; __m256i* zPtr = (__m256i*) z; __m256i xVal, yVal, zVal; for(;number < points; number++){ xVal = _mm256_load_si256(xPtr); yVal = _mm256_loadu_si256(yPtr); zVal = _mm256_sub_epi16(xVal, yVal); _mm256_store_si256(zPtr, zVal); xPtr ++; yPtr ++; zPtr ++; } number = points * 16; for(;number < len; number++){ z[number] = x[number] - y[number]; } #endif } void srslte_vec_prod_sss_sse(short *x, short *y, short *z, uint32_t len) { #ifdef LV_HAVE_SSE unsigned int number = 0; const unsigned int points = len / 8; const __m128i* xPtr = (const __m128i*) x; const __m128i* yPtr = (const __m128i*) y; __m128i* zPtr = (__m128i*) z; __m128i xVal, yVal, zVal; for(;number < points; number++){ xVal = _mm_load_si128(xPtr); yVal = _mm_load_si128(yPtr); zVal = _mm_mullo_epi16(xVal, yVal); _mm_store_si128(zPtr, zVal); xPtr ++; yPtr ++; zPtr ++; } number = points * 8; for(;number < len; number++){ z[number] = x[number] * y[number]; } #endif } void srslte_vec_prod_sss_avx(short *x, short *y, short *z, uint32_t len) { #ifdef LV_HAVE_SSE unsigned int number = 0; const unsigned int points = len / 16; const __m256i* xPtr = (const __m256i*) x; const __m256i* yPtr = (const __m256i*) y; __m256i* zPtr = (__m256i*) z; __m256i xVal, yVal, zVal; for(;number < points; number++){ xVal = _mm256_load_si256(xPtr); yVal = _mm256_loadu_si256(yPtr); zVal = _mm256_mullo_epi16(xVal, yVal); _mm256_store_si256(zPtr, zVal); xPtr ++; yPtr ++; zPtr ++; } number = points * 16; for(;number < len; number++){ z[number] = x[number] * y[number]; } #endif } void srslte_vec_sc_div2_sss_sse(short *x, int k, short *z, uint32_t len) { #ifdef LV_HAVE_SSE unsigned int number = 0; const unsigned int points = len / 8; const __m128i* xPtr = (const __m128i*) x; __m128i* zPtr = (__m128i*) z; __m128i xVal, zVal; for(;number < points; number++){ xVal = _mm_load_si128(xPtr); zVal = _mm_srai_epi16(xVal, k); _mm_store_si128(zPtr, zVal); xPtr ++; zPtr ++; } number = points * 8; short divn = (1<