|
|
|
@ -232,143 +232,113 @@ void srslte_vec_sc_div2_sss_avx2(short *x, int k, short *z, uint32_t len)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
/* No improvement with AVX */
|
|
|
|
|
void srslte_vec_lut_sss_sse(short *x, unsigned short *lut, short *y, uint32_t len)
|
|
|
|
|
{
|
|
|
|
|
#ifdef DEBUG_MODE
|
|
|
|
|
for (int i=0;i<len;i++) {
|
|
|
|
|
y[lut[i]] = x[i];
|
|
|
|
|
}
|
|
|
|
|
#else
|
|
|
|
|
void srslte_vec_lut_sss_simd(short *x, unsigned short *lut, short *y, int len) {
|
|
|
|
|
int i = 0;
|
|
|
|
|
#ifdef LV_HAVE_SSE
|
|
|
|
|
unsigned int number = 0;
|
|
|
|
|
const unsigned int points = len / 8;
|
|
|
|
|
|
|
|
|
|
const __m128i* xPtr = (const __m128i*) x;
|
|
|
|
|
const __m128i* lutPtr = (__m128i*) lut;
|
|
|
|
|
#if CMAKE_BUILD_TYPE!=Debug
|
|
|
|
|
|
|
|
|
|
__m128i xVal, lutVal;
|
|
|
|
|
for(;number < points; number++){
|
|
|
|
|
|
|
|
|
|
xVal = _mm_loadu_si128(xPtr);
|
|
|
|
|
lutVal = _mm_loadu_si128(lutPtr);
|
|
|
|
|
if (SRSLTE_IS_ALIGNED(x) && SRSLTE_IS_ALIGNED(lut)) {
|
|
|
|
|
for (; i < len - 7; i += 8) {
|
|
|
|
|
__m128i xVal = _mm_load_si128((__m128i *) &x[i]);
|
|
|
|
|
__m128i lutVal = _mm_load_si128((__m128i *) &lut[i]);
|
|
|
|
|
|
|
|
|
|
for (int i=0;i<8;i++) {
|
|
|
|
|
int16_t x = (int16_t) _mm_extract_epi16(xVal, i);
|
|
|
|
|
uint16_t l = (uint16_t) _mm_extract_epi16(lutVal, i);
|
|
|
|
|
y[l] = x;
|
|
|
|
|
for (int k = 0; k < 8; k++) {
|
|
|
|
|
int16_t x = (int16_t) _mm_extract_epi16(xVal, k);
|
|
|
|
|
uint16_t l = (uint16_t) _mm_extract_epi16(lutVal, k);
|
|
|
|
|
y[l] = (short) x;
|
|
|
|
|
}
|
|
|
|
|
xPtr ++;
|
|
|
|
|
lutPtr ++;
|
|
|
|
|
}
|
|
|
|
|
} else {
|
|
|
|
|
for (; i < len - 7; i += 8) {
|
|
|
|
|
__m128i xVal = _mm_loadu_si128((__m128i *) &x[i]);
|
|
|
|
|
__m128i lutVal = _mm_loadu_si128((__m128i *) &lut[i]);
|
|
|
|
|
|
|
|
|
|
number = points * 8;
|
|
|
|
|
for(;number < len; number++){
|
|
|
|
|
y[lut[number]] = x[number];
|
|
|
|
|
for (int k = 0; k < 8; k++) {
|
|
|
|
|
int16_t x = (int16_t) _mm_extract_epi16(xVal, k);
|
|
|
|
|
uint16_t l = (uint16_t) _mm_extract_epi16(lutVal, k);
|
|
|
|
|
y[l] = (short) x;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
#endif
|
|
|
|
|
#endif
|
|
|
|
|
|
|
|
|
|
for (; i < len; i++) {
|
|
|
|
|
y[lut[i]] = x[i];
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/* Modified from volk_32f_s32f_convert_16i_a_simd2. Removed clipping */
|
|
|
|
|
void srslte_vec_convert_fi_sse(float *x, int16_t *z, float scale, uint32_t len)
|
|
|
|
|
{
|
|
|
|
|
#ifdef LV_HAVE_SSE
|
|
|
|
|
unsigned int number = 0;
|
|
|
|
|
|
|
|
|
|
const unsigned int eighthPoints = len / 8;
|
|
|
|
|
void srslte_vec_convert_fi_simd(float *x, int16_t *z, float scale, int len) {
|
|
|
|
|
int i = 0;
|
|
|
|
|
|
|
|
|
|
const float* inputVectorPtr = (const float*)x;
|
|
|
|
|
int16_t* outputVectorPtr = z;
|
|
|
|
|
#if SRSLTE_SIMD_F_SIZE && SRSLTE_SIMD_S_SIZE
|
|
|
|
|
simd_f_t s = srslte_simd_f_set1(scale);
|
|
|
|
|
if (SRSLTE_IS_ALIGNED(x) && SRSLTE_IS_ALIGNED(z)) {
|
|
|
|
|
for (; i < len - SRSLTE_SIMD_S_SIZE + 1; i += SRSLTE_SIMD_S_SIZE) {
|
|
|
|
|
simd_f_t a = srslte_simd_f_load(&x[i]);
|
|
|
|
|
simd_f_t b = srslte_simd_f_load(&x[i + SRSLTE_SIMD_F_SIZE]);
|
|
|
|
|
|
|
|
|
|
__m128 vScalar = _mm_set_ps1(scale);
|
|
|
|
|
__m128 inputVal1, inputVal2;
|
|
|
|
|
__m128i intInputVal1, intInputVal2;
|
|
|
|
|
__m128 ret1, ret2;
|
|
|
|
|
simd_f_t sa = srslte_simd_f_mul(a, s);
|
|
|
|
|
simd_f_t sb = srslte_simd_f_mul(b, s);
|
|
|
|
|
|
|
|
|
|
for(;number < eighthPoints; number++){
|
|
|
|
|
inputVal1 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4;
|
|
|
|
|
inputVal2 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4;
|
|
|
|
|
simd_s_t i16 = srslte_simd_convert_2f_s(sa, sb);
|
|
|
|
|
|
|
|
|
|
ret1 = _mm_mul_ps(inputVal1, vScalar);
|
|
|
|
|
ret2 = _mm_mul_ps(inputVal2, vScalar);
|
|
|
|
|
srslte_simd_s_store(&z[i], i16);
|
|
|
|
|
}
|
|
|
|
|
} else {
|
|
|
|
|
for (; i < len - SRSLTE_SIMD_S_SIZE + 1; i += SRSLTE_SIMD_S_SIZE) {
|
|
|
|
|
simd_f_t a = srslte_simd_f_loadu(&x[i]);
|
|
|
|
|
simd_f_t b = srslte_simd_f_loadu(&x[i + SRSLTE_SIMD_F_SIZE]);
|
|
|
|
|
|
|
|
|
|
intInputVal1 = _mm_cvtps_epi32(ret1);
|
|
|
|
|
intInputVal2 = _mm_cvtps_epi32(ret2);
|
|
|
|
|
simd_f_t sa = srslte_simd_f_mul(a, s);
|
|
|
|
|
simd_f_t sb = srslte_simd_f_mul(b, s);
|
|
|
|
|
|
|
|
|
|
intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2);
|
|
|
|
|
simd_s_t i16 = srslte_simd_convert_2f_s(sa, sb);
|
|
|
|
|
|
|
|
|
|
_mm_storeu_si128((__m128i*)outputVectorPtr, intInputVal1);
|
|
|
|
|
outputVectorPtr += 8;
|
|
|
|
|
srslte_simd_s_storeu(&z[i], i16);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
#endif /* SRSLTE_SIMD_F_SIZE && SRSLTE_SIMD_S_SIZE */
|
|
|
|
|
|
|
|
|
|
number = eighthPoints * 8;
|
|
|
|
|
for(; number < len; number++){
|
|
|
|
|
z[number] = (int16_t) (x[number] * scale);
|
|
|
|
|
for(; i < len; i++){
|
|
|
|
|
z[i] = (int16_t) (x[i] * scale);
|
|
|
|
|
}
|
|
|
|
|
#endif
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
float srslte_vec_acc_ff_simd(float *x, int len) {
|
|
|
|
|
int i = 0;
|
|
|
|
|
float acc_sum = 0.0f;
|
|
|
|
|
|
|
|
|
|
// for enb no-volk
|
|
|
|
|
void srslte_vec_sum_fff_sse(float *x, float *y, float *z, uint32_t len) {
|
|
|
|
|
#ifdef LV_HAVE_SSE
|
|
|
|
|
unsigned int number = 0;
|
|
|
|
|
const unsigned int points = len / 4;
|
|
|
|
|
|
|
|
|
|
const float* xPtr = (const float*) x;
|
|
|
|
|
const float* yPtr = (const float*) y;
|
|
|
|
|
float* zPtr = (float*) z;
|
|
|
|
|
|
|
|
|
|
__m128 xVal, yVal, zVal;
|
|
|
|
|
for(;number < points; number++){
|
|
|
|
|
|
|
|
|
|
xVal = _mm_loadu_ps(xPtr);
|
|
|
|
|
yVal = _mm_loadu_ps(yPtr);
|
|
|
|
|
|
|
|
|
|
zVal = _mm_add_ps(xVal, yVal);
|
|
|
|
|
#if SRSLTE_SIMD_F_SIZE
|
|
|
|
|
simd_f_t simd_sum = srslte_simd_f_zero();
|
|
|
|
|
|
|
|
|
|
_mm_storeu_ps(zPtr, zVal);
|
|
|
|
|
if (SRSLTE_IS_ALIGNED(x)) {
|
|
|
|
|
for (; i < len - SRSLTE_SIMD_F_SIZE + 1; i += SRSLTE_SIMD_F_SIZE) {
|
|
|
|
|
simd_f_t a = srslte_simd_f_load(&x[i]);
|
|
|
|
|
|
|
|
|
|
xPtr += 4;
|
|
|
|
|
yPtr += 4;
|
|
|
|
|
zPtr += 4;
|
|
|
|
|
simd_sum = srslte_simd_f_add(simd_sum, a);
|
|
|
|
|
}
|
|
|
|
|
} else {
|
|
|
|
|
for (; i < len - SRSLTE_SIMD_F_SIZE + 1; i += SRSLTE_SIMD_F_SIZE) {
|
|
|
|
|
simd_f_t a = srslte_simd_f_loadu(&x[i]);
|
|
|
|
|
|
|
|
|
|
number = points * 4;
|
|
|
|
|
for(;number < len; number++){
|
|
|
|
|
z[number] = x[number] + y[number];
|
|
|
|
|
simd_sum = srslte_simd_f_add(simd_sum, a);
|
|
|
|
|
}
|
|
|
|
|
#endif
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
void srslte_vec_sum_fff_avx(float *x, float *y, float *z, uint32_t len) {
|
|
|
|
|
#ifdef LV_HAVE_AVX
|
|
|
|
|
unsigned int number = 0;
|
|
|
|
|
const unsigned int points = len / 8;
|
|
|
|
|
|
|
|
|
|
const float* xPtr = (const float*) x;
|
|
|
|
|
const float* yPtr = (const float*) y;
|
|
|
|
|
float* zPtr = (float*) z;
|
|
|
|
|
|
|
|
|
|
__m256 xVal, yVal, zVal;
|
|
|
|
|
for(;number < points; number++){
|
|
|
|
|
|
|
|
|
|
xVal = _mm256_loadu_ps(xPtr);
|
|
|
|
|
yVal = _mm256_loadu_ps(yPtr);
|
|
|
|
|
|
|
|
|
|
zVal = _mm256_add_ps(xVal, yVal);
|
|
|
|
|
|
|
|
|
|
_mm256_storeu_ps(zPtr, zVal);
|
|
|
|
|
|
|
|
|
|
xPtr += 8;
|
|
|
|
|
yPtr += 8;
|
|
|
|
|
zPtr += 8;
|
|
|
|
|
__attribute__((aligned(SRSLTE_SIMD_F_SIZE*4))) float sum[SRSLTE_SIMD_F_SIZE];
|
|
|
|
|
srslte_simd_f_store(sum, simd_sum);
|
|
|
|
|
for (int k = 0; k < SRSLTE_SIMD_F_SIZE; k++) {
|
|
|
|
|
acc_sum += sum[k];
|
|
|
|
|
}
|
|
|
|
|
#endif
|
|
|
|
|
|
|
|
|
|
for(number = points * 8;number < len; number++){
|
|
|
|
|
z[number] = x[number] + y[number];
|
|
|
|
|
for (; i<len; i++) {
|
|
|
|
|
acc_sum += x[i];
|
|
|
|
|
}
|
|
|
|
|
#endif
|
|
|
|
|
|
|
|
|
|
return acc_sum;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
cf_t srslte_vec_acc_cc_simd(cf_t *x, int len) {
|
|
|
|
@ -570,6 +540,34 @@ cf_t srslte_vec_dot_prod_conj_ccc_simd(cf_t *x, cf_t *y, int len)
|
|
|
|
|
return result;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/*
 * Element-wise product of a complex vector with a real vector:
 *   z[i] = x[i] * y[i]   for i in [0, len)
 *
 * x   : complex input vector
 * y   : real input vector
 * z   : complex output vector
 * len : number of elements to process
 *
 * When SRSLTE_SIMD_CF_SIZE is non-zero the bulk of the vector is processed
 * SRSLTE_SIMD_CF_SIZE elements at a time; whatever is left over is handled by
 * the scalar loop at the end.
 */
void srslte_vec_prod_cfc_simd(cf_t *x, float *y, cf_t *z, int len) {
  int i = 0;

#if SRSLTE_SIMD_CF_SIZE
  if (SRSLTE_IS_ALIGNED(x) && SRSLTE_IS_ALIGNED(y) && SRSLTE_IS_ALIGNED(z)) {
    /* Every buffer passed the alignment check: aligned load/store variants. */
    for (; i < len - SRSLTE_SIMD_CF_SIZE + 1; i += SRSLTE_SIMD_CF_SIZE) {
      simd_f_t s = srslte_simd_f_load(&y[i]);

      simd_cf_t a = srslte_simd_cfi_load(&x[i]);
      simd_cf_t r = srslte_simd_cf_mul(a, s);
      srslte_simd_cfi_store(&z[i], r);
    }
  } else {
    /*
     * At least one buffer failed the alignment check, so x must be read with
     * the unaligned variant as well (the previous code used the aligned
     * srslte_simd_cfi_load here). The stride is SRSLTE_SIMD_CF_SIZE so that it
     * matches the aligned branch and the #if guard above.
     */
    for (; i < len - SRSLTE_SIMD_CF_SIZE + 1; i += SRSLTE_SIMD_CF_SIZE) {
      simd_f_t s = srslte_simd_f_loadu(&y[i]);

      simd_cf_t a = srslte_simd_cfi_loadu(&x[i]);
      simd_cf_t r = srslte_simd_cf_mul(a, s);
      srslte_simd_cfi_storeu(&z[i], r);
    }
  }
#endif

  /* Scalar tail (or the whole vector when no SIMD support is compiled in). */
  for (; i < len; i++) {
    z[i] = x[i] * y[i];
  }
}
|
|
|
|
|
|
|
|
|
|
void srslte_vec_prod_fff_simd(float *x, float *y, float *z, int len) {
|
|
|
|
|
int i = 0;
|
|
|
|
|
|
|
|
|
@ -630,10 +628,12 @@ void srslte_vec_prod_ccc_simd(cf_t *x,cf_t *y, cf_t *z, int len) {
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
void srslte_vec_prod_ccc_cf_simd(float *a_re, float *a_im, float *b_re, float *b_im, float *r_re, float *r_im, int len) {
|
|
|
|
|
void srslte_vec_prod_ccc_split_simd(float *a_re, float *a_im, float *b_re, float *b_im, float *r_re, float *r_im, int len) {
|
|
|
|
|
int i = 0;
|
|
|
|
|
|
|
|
|
|
#if SRSLTE_SIMD_F_SIZE
|
|
|
|
|
if (SRSLTE_IS_ALIGNED(a_re) && SRSLTE_IS_ALIGNED(a_im) && SRSLTE_IS_ALIGNED(b_re) && SRSLTE_IS_ALIGNED(b_im) &&
|
|
|
|
|
SRSLTE_IS_ALIGNED(r_re) && SRSLTE_IS_ALIGNED(r_im)) {
|
|
|
|
|
for (; i < len - SRSLTE_SIMD_CF_SIZE + 1; i += SRSLTE_SIMD_CF_SIZE) {
|
|
|
|
|
simd_cf_t a = srslte_simd_cf_load(&a_re[i], &a_im[i]);
|
|
|
|
|
simd_cf_t b = srslte_simd_cf_load(&b_re[i], &b_im[i]);
|
|
|
|
@ -642,6 +642,16 @@ void srslte_vec_prod_ccc_cf_simd(float *a_re, float *a_im, float *b_re, float *b
|
|
|
|
|
|
|
|
|
|
srslte_simd_cf_store(&r_re[i], &r_im[i], r);
|
|
|
|
|
}
|
|
|
|
|
} else {
|
|
|
|
|
for (; i < len - SRSLTE_SIMD_CF_SIZE + 1; i += SRSLTE_SIMD_CF_SIZE) {
|
|
|
|
|
simd_cf_t a = srslte_simd_cf_loadu(&a_re[i], &a_im[i]);
|
|
|
|
|
simd_cf_t b = srslte_simd_cf_loadu(&b_re[i], &b_im[i]);
|
|
|
|
|
|
|
|
|
|
simd_cf_t r = srslte_simd_cf_prod(a, b);
|
|
|
|
|
|
|
|
|
|
srslte_simd_cf_storeu(&r_re[i], &r_im[i], r);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
#endif
|
|
|
|
|
|
|
|
|
|
for (; i<len; i++) {
|
|
|
|
@ -655,6 +665,8 @@ void srslte_vec_prod_ccc_c16_simd(int16_t *a_re, int16_t *a_im, int16_t *b_re, i
|
|
|
|
|
int i = 0;
|
|
|
|
|
|
|
|
|
|
#if SRSLTE_SIMD_C16_SIZE
|
|
|
|
|
if (SRSLTE_IS_ALIGNED(a_re) && SRSLTE_IS_ALIGNED(a_im) && SRSLTE_IS_ALIGNED(b_re) && SRSLTE_IS_ALIGNED(b_im) &&
|
|
|
|
|
SRSLTE_IS_ALIGNED(r_re) && SRSLTE_IS_ALIGNED(r_im)) {
|
|
|
|
|
for (; i < len - SRSLTE_SIMD_C16_SIZE + 1; i += SRSLTE_SIMD_C16_SIZE) {
|
|
|
|
|
simd_c16_t a = srslte_simd_c16_load(&a_re[i], &a_im[i]);
|
|
|
|
|
simd_c16_t b = srslte_simd_c16_load(&b_re[i], &b_im[i]);
|
|
|
|
@ -663,6 +675,16 @@ void srslte_vec_prod_ccc_c16_simd(int16_t *a_re, int16_t *a_im, int16_t *b_re, i
|
|
|
|
|
|
|
|
|
|
srslte_simd_c16_store(&r_re[i], &r_im[i], r);
|
|
|
|
|
}
|
|
|
|
|
} else {
|
|
|
|
|
for (; i < len - SRSLTE_SIMD_C16_SIZE + 1; i += SRSLTE_SIMD_C16_SIZE) {
|
|
|
|
|
simd_c16_t a = srslte_simd_c16_loadu(&a_re[i], &a_im[i]);
|
|
|
|
|
simd_c16_t b = srslte_simd_c16_loadu(&b_re[i], &b_im[i]);
|
|
|
|
|
|
|
|
|
|
simd_c16_t r = srslte_simd_c16_prod(a, b);
|
|
|
|
|
|
|
|
|
|
srslte_simd_c16_storeu(&r_re[i], &r_im[i], r);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
#endif
|
|
|
|
|
|
|
|
|
|
for (; i<len; i++) {
|
|
|
|
@ -701,6 +723,103 @@ void srslte_vec_prod_conj_ccc_simd(cf_t *x,cf_t *y, cf_t *z, int len) {
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/*
 * Element-wise complex division: z[i] = x[i] / y[i] for i in [0, len).
 *
 * x   : complex numerators
 * y   : complex denominators
 * z   : complex quotients (output)
 * len : number of elements to process
 *
 * With SRSLTE_SIMD_CF_SIZE non-zero, full SIMD-width chunks are computed as
 * x * srslte_simd_cf_rcp(y); remaining elements use the plain C division.
 * NOTE(review): srslte_simd_cf_rcp is presumably a (possibly approximate)
 * complex reciprocal - confirm against its definition.
 */
void srslte_vec_div_ccc_simd(cf_t *x,cf_t *y, cf_t *z, int len) {
  int i = 0;

#if SRSLTE_SIMD_CF_SIZE
  if (!SRSLTE_IS_ALIGNED(x) || !SRSLTE_IS_ALIGNED(y) || !SRSLTE_IS_ALIGNED(z)) {
    /* Some buffer failed the alignment check: unaligned loads and stores. */
    while (i < len - SRSLTE_SIMD_CF_SIZE + 1) {
      simd_cf_t num = srslte_simd_cfi_loadu(&x[i]);
      simd_cf_t den = srslte_simd_cfi_loadu(&y[i]);

      simd_cf_t quot = srslte_simd_cf_prod(num, srslte_simd_cf_rcp(den));

      srslte_simd_cfi_storeu(&z[i], quot);
      i += SRSLTE_SIMD_CF_SIZE;
    }
  } else {
    /* All three buffers passed the alignment check. */
    while (i < len - SRSLTE_SIMD_CF_SIZE + 1) {
      simd_cf_t num = srslte_simd_cfi_load(&x[i]);
      simd_cf_t den = srslte_simd_cfi_load(&y[i]);

      simd_cf_t quot = srslte_simd_cf_prod(num, srslte_simd_cf_rcp(den));

      srslte_simd_cfi_store(&z[i], quot);
      i += SRSLTE_SIMD_CF_SIZE;
    }
  }
#endif

  /* Scalar remainder. */
  while (i < len) {
    z[i] = x[i] / y[i];
    i++;
  }
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
/*
 * Element-wise division of a complex vector by a real vector:
 *   z[i] = x[i] / y[i]   for i in [0, len)
 *
 * x   : complex numerators
 * y   : real denominators
 * z   : complex quotients (output)
 * len : number of elements to process
 *
 * The SIMD section is compiled only when the complex and float SIMD widths
 * are non-zero and equal; it multiplies x by srslte_simd_f_rcp(y). Elements
 * not covered by a full SIMD chunk use the plain C division.
 * NOTE(review): srslte_simd_f_rcp is presumably a (possibly approximate)
 * reciprocal - confirm against its definition.
 */
void srslte_vec_div_cfc_simd(cf_t *x,float *y, cf_t *z, int len) {
  int i = 0;

#if SRSLTE_SIMD_CF_SIZE && SRSLTE_SIMD_CF_SIZE == SRSLTE_SIMD_F_SIZE
  if (!SRSLTE_IS_ALIGNED(x) || !SRSLTE_IS_ALIGNED(y) || !SRSLTE_IS_ALIGNED(z)) {
    /* Some buffer failed the alignment check: unaligned loads and stores. */
    while (i < len - SRSLTE_SIMD_CF_SIZE + 1) {
      simd_cf_t num = srslte_simd_cfi_loadu(&x[i]);
      simd_f_t den = srslte_simd_f_loadu(&y[i]);

      simd_cf_t quot = srslte_simd_cf_mul(num, srslte_simd_f_rcp(den));

      srslte_simd_cfi_storeu(&z[i], quot);
      i += SRSLTE_SIMD_CF_SIZE;
    }
  } else {
    /* All three buffers passed the alignment check. */
    while (i < len - SRSLTE_SIMD_CF_SIZE + 1) {
      simd_cf_t num = srslte_simd_cfi_load(&x[i]);
      simd_f_t den = srslte_simd_f_load(&y[i]);

      simd_cf_t quot = srslte_simd_cf_mul(num, srslte_simd_f_rcp(den));

      srslte_simd_cfi_store(&z[i], quot);
      i += SRSLTE_SIMD_CF_SIZE;
    }
  }
#endif

  /* Scalar remainder. */
  while (i < len) {
    z[i] = x[i] / y[i];
    i++;
  }
}
|
|
|
|
|
|
|
|
|
|
/*
 * Element-wise real division: z[i] = x[i] / y[i] for i in [0, len).
 *
 * x   : numerators
 * y   : denominators
 * z   : quotients (output)
 * len : number of elements to process
 *
 * With SRSLTE_SIMD_F_SIZE non-zero, full SIMD-width chunks are computed as
 * x * srslte_simd_f_rcp(y); the remaining elements use the plain C division.
 * NOTE(review): srslte_simd_f_rcp is presumably a (possibly approximate)
 * reciprocal - confirm against its definition.
 */
void srslte_vec_div_fff_simd(float *x, float *y, float *z, int len) {
  int i = 0;

#if SRSLTE_SIMD_F_SIZE
  if (!SRSLTE_IS_ALIGNED(x) || !SRSLTE_IS_ALIGNED(y) || !SRSLTE_IS_ALIGNED(z)) {
    /* Some buffer failed the alignment check: unaligned loads and stores. */
    while (i < len - SRSLTE_SIMD_F_SIZE + 1) {
      simd_f_t num = srslte_simd_f_loadu(&x[i]);
      simd_f_t den = srslte_simd_f_loadu(&y[i]);

      simd_f_t quot = srslte_simd_f_mul(num, srslte_simd_f_rcp(den));

      srslte_simd_f_storeu(&z[i], quot);
      i += SRSLTE_SIMD_F_SIZE;
    }
  } else {
    /* All three buffers passed the alignment check. */
    while (i < len - SRSLTE_SIMD_F_SIZE + 1) {
      simd_f_t num = srslte_simd_f_load(&x[i]);
      simd_f_t den = srslte_simd_f_load(&y[i]);

      simd_f_t quot = srslte_simd_f_mul(num, srslte_simd_f_rcp(den));

      srslte_simd_f_store(&z[i], quot);
      i += SRSLTE_SIMD_F_SIZE;
    }
  }
#endif

  /* Scalar remainder. */
  while (i < len) {
    z[i] = x[i] / y[i];
    i++;
  }
}
|
|
|
|
|
|
|
|
|
|
void srslte_vec_sc_prod_ccc_simd(cf_t *x, cf_t h, cf_t *z, int len) {
|
|
|
|
|
int i = 0;
|
|
|
|
|
|
|
|
|
@ -895,3 +1014,137 @@ void srslte_vec_cp_simd(cf_t *src, cf_t *dst, int len) {
|
|
|
|
|
dst[i] = src[i];
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/*
 * Returns the index of the largest value in x[0..len).
 *
 * x   : input vector
 * len : number of elements to scan
 *
 * Returns 0 when no element compares greater than -INFINITY (this includes
 * len <= 0). In the scalar loop the comparison is strict, so the first
 * occurrence of the maximum wins there.
 *
 * With SRSLTE_SIMD_I_SIZE non-zero, a running per-lane maximum and the index
 * it came from are kept in SIMD registers, then reduced lane by lane before
 * the scalar loop handles the remaining elements.
 */
uint32_t srslte_vec_max_fi_simd(float *x, int len) {
  int i = 0;

  float best_value = -INFINITY;
  uint32_t best_index = 0;

#if SRSLTE_SIMD_I_SIZE
  __attribute__ ((aligned (SRSLTE_SIMD_I_SIZE*sizeof(int)))) int lane_index[SRSLTE_SIMD_I_SIZE] = {0};
  __attribute__ ((aligned (SRSLTE_SIMD_I_SIZE*sizeof(float)))) float lane_value[SRSLTE_SIMD_I_SIZE] = {0};

  /* Lane k starts at element index k and advances by the SIMD width. */
  for (int lane = 0; lane < SRSLTE_SIMD_I_SIZE; lane++) {
    lane_index[lane] = lane;
  }
  simd_i_t simd_step = srslte_simd_i_set1(SRSLTE_SIMD_I_SIZE);
  simd_i_t simd_cur_index = srslte_simd_i_load(lane_index);
  simd_i_t simd_best_index = srslte_simd_i_set1(0);

  simd_f_t simd_best_value = srslte_simd_f_set1(-INFINITY);

  if (!SRSLTE_IS_ALIGNED(x)) {
    while (i < len - SRSLTE_SIMD_I_SIZE + 1) {
      simd_f_t cur = srslte_simd_f_loadu(&x[i]);

      /* NOTE(review): srslte_simd_f_max appears to yield a per-lane selector
       * consumed by srslte_simd_i_select - confirm against its definition. */
      simd_i_t sel = srslte_simd_f_max(cur, simd_best_value);

      simd_best_index = srslte_simd_i_select(simd_best_index, simd_cur_index, sel);
      simd_best_value = (simd_f_t) srslte_simd_i_select((simd_i_t) simd_best_value, (simd_i_t) cur, sel);
      simd_cur_index = srslte_simd_i_add(simd_cur_index, simd_step);

      i += SRSLTE_SIMD_I_SIZE;
    }
  } else {
    while (i < len - SRSLTE_SIMD_I_SIZE + 1) {
      simd_f_t cur = srslte_simd_f_load(&x[i]);

      simd_i_t sel = srslte_simd_f_max(cur, simd_best_value);

      simd_best_index = srslte_simd_i_select(simd_best_index, simd_cur_index, sel);
      simd_best_value = (simd_f_t) srslte_simd_i_select((simd_i_t) simd_best_value, (simd_i_t) cur, sel);
      simd_cur_index = srslte_simd_i_add(simd_cur_index, simd_step);

      i += SRSLTE_SIMD_I_SIZE;
    }
  }

  /* Spill the per-lane results and reduce them to a single candidate. */
  srslte_simd_i_store(lane_index, simd_best_index);
  srslte_simd_f_store(lane_value, simd_best_value);

  for (int lane = 0; lane < SRSLTE_SIMD_I_SIZE; lane++) {
    if (lane_value[lane] > best_value) {
      best_value = lane_value[lane];
      best_index = (uint32_t) lane_index[lane];
    }
  }
#endif /* SRSLTE_SIMD_I_SIZE */

  /* Scalar remainder. */
  while (i < len) {
    if (x[i] > best_value) {
      best_value = x[i];
      best_index = (uint32_t)i;
    }
    i++;
  }

  return best_index;
}
|
|
|
|
|
|
|
|
|
|
/*
 * Returns the index of the element of x[0..len) with the largest squared
 * magnitude (re*re + im*im).
 *
 * x   : complex input vector
 * len : number of elements to scan
 *
 * Returns 0 when no squared magnitude compares greater than -INFINITY (this
 * includes len <= 0). In the scalar loop the comparison is strict, so the
 * first occurrence of the maximum wins there.
 *
 * With SRSLTE_SIMD_I_SIZE non-zero, two float registers are loaded per
 * iteration, squared, and combined with srslte_simd_f_hadd; a running
 * per-lane maximum and its index are tracked and reduced afterwards.
 * NOTE(review): this relies on srslte_simd_f_hadd producing one re^2+im^2
 * value per complex element in element order - confirm against its definition.
 */
uint32_t srslte_vec_max_ci_simd(cf_t *x, int len) {
  int i = 0;

  float best_value = -INFINITY;
  uint32_t best_index = 0;

#if SRSLTE_SIMD_I_SIZE
  __attribute__ ((aligned (SRSLTE_SIMD_I_SIZE*sizeof(int)))) int lane_index[SRSLTE_SIMD_I_SIZE] = {0};
  __attribute__ ((aligned (SRSLTE_SIMD_I_SIZE*sizeof(float)))) float lane_value[SRSLTE_SIMD_I_SIZE] = {0};

  /* Lane k starts at element index k and advances by the SIMD width. */
  for (int lane = 0; lane < SRSLTE_SIMD_I_SIZE; lane++) {
    lane_index[lane] = lane;
  }
  simd_i_t simd_step = srslte_simd_i_set1(SRSLTE_SIMD_I_SIZE);
  simd_i_t simd_cur_index = srslte_simd_i_load(lane_index);
  simd_i_t simd_best_index = srslte_simd_i_set1(0);

  simd_f_t simd_best_value = srslte_simd_f_set1(-INFINITY);

  if (!SRSLTE_IS_ALIGNED(x)) {
    while (i < len - SRSLTE_SIMD_I_SIZE + 1) {
      simd_f_t lo = srslte_simd_f_loadu((float *) &x[i]);
      simd_f_t hi = srslte_simd_f_loadu((float *) &x[i + SRSLTE_SIMD_F_SIZE / 2]);

      simd_f_t lo_sq = srslte_simd_f_mul(lo, lo);
      simd_f_t hi_sq = srslte_simd_f_mul(hi, hi);

      simd_f_t abs2 = srslte_simd_f_hadd(lo_sq, hi_sq);

      simd_i_t sel = srslte_simd_f_max(abs2, simd_best_value);

      simd_best_index = srslte_simd_i_select(simd_best_index, simd_cur_index, sel);
      simd_best_value = (simd_f_t) srslte_simd_i_select((simd_i_t) simd_best_value, (simd_i_t) abs2, sel);
      simd_cur_index = srslte_simd_i_add(simd_cur_index, simd_step);

      i += SRSLTE_SIMD_I_SIZE;
    }
  } else {
    while (i < len - SRSLTE_SIMD_I_SIZE + 1) {
      simd_f_t lo = srslte_simd_f_load((float *) &x[i]);
      simd_f_t hi = srslte_simd_f_load((float *) &x[i + SRSLTE_SIMD_F_SIZE / 2]);

      simd_f_t lo_sq = srslte_simd_f_mul(lo, lo);
      simd_f_t hi_sq = srslte_simd_f_mul(hi, hi);

      simd_f_t abs2 = srslte_simd_f_hadd(lo_sq, hi_sq);

      simd_i_t sel = srslte_simd_f_max(abs2, simd_best_value);

      simd_best_index = srslte_simd_i_select(simd_best_index, simd_cur_index, sel);
      simd_best_value = (simd_f_t) srslte_simd_i_select((simd_i_t) simd_best_value, (simd_i_t) abs2, sel);
      simd_cur_index = srslte_simd_i_add(simd_cur_index, simd_step);

      i += SRSLTE_SIMD_I_SIZE;
    }
  }

  /* Spill the per-lane results and reduce them to a single candidate. */
  srslte_simd_i_store(lane_index, simd_best_index);
  srslte_simd_f_store(lane_value, simd_best_value);

  for (int lane = 0; lane < SRSLTE_SIMD_I_SIZE; lane++) {
    if (lane_value[lane] > best_value) {
      best_value = lane_value[lane];
      best_index = (uint32_t) lane_index[lane];
    }
  }
#endif /* SRSLTE_SIMD_I_SIZE */

  /* Scalar remainder. */
  while (i < len) {
    cf_t sample = x[i];
    float abs2 = __real__ sample * __real__ sample + __imag__ sample * __imag__ sample;
    if (abs2 > best_value) {
      best_value = abs2;
      best_index = (uint32_t)i;
    }
    i++;
  }

  return best_index;
}
|
|
|
|
|