adding support for neon soft demodulation, adding cmake checks for aarch64

* adding support for neon soft demodulation, adding cmake checks for aarch64
* rearranging some arm cmake flags
Justin Tallon authored 5 years ago, committed by Andre Puschmann
parent e196939041
commit 4a1b8a5952

@@ -85,8 +85,12 @@ option(ENABLE_TIDY "Enable clang tidy" OFF)
 option(USE_LTE_RATES "Use standard LTE sampling rates" OFF)
 option(USE_GLIBC_IPV6 "Use glibc's own ipv6.h" ON)
-set(GCC_ARCH native CACHE STRING "GCC compile for specific architecture.")
+if(${CMAKE_SYSTEM_PROCESSOR} MATCHES "aarch64")
+  set(GCC_ARCH armv8-a CACHE STRING "GCC compile for specific architecture.")
+  message(STATUS "Detected aarch64 processor")
+else(${CMAKE_SYSTEM_PROCESSOR} MATCHES "aarch64")
+  set(GCC_ARCH native CACHE STRING "GCC compile for specific architecture.")
+endif(${CMAKE_SYSTEM_PROCESSOR} MATCHES "aarch64")

 ########################################################################
 # Find dependencies
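
A note on this first hunk: -march=native was a relatively late addition for aarch64 GCC and is not understood by many ARM cross toolchains, hence the architecturally safe armv8-a default on that platform; GCC_ARCH remains a CACHE variable, so either default can still be overridden with -DGCC_ARCH=... at configure time. A standalone probe of what the resulting compiler invocation targets (not part of the commit; __aarch64__, __arm__, and __ARM_ARCH are standard GCC/Clang predefines):

    #include <stdio.h>

    int main(void)
    {
    #if defined(__aarch64__)
      /* built with e.g. -march=armv8-a */
      printf("aarch64 target, __ARM_ARCH=%d\n", __ARM_ARCH);
    #elif defined(__arm__)
      printf("32-bit ARM target, __ARM_ARCH=%d\n", __ARM_ARCH);
    #else
      printf("non-ARM target\n");
    #endif
      return 0;
    }
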
@@ -360,14 +364,23 @@ if(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang")
   endif(HAVE_SSE)
 endif(NOT ${CMAKE_BUILD_TYPE} STREQUAL "Debug")

-if(${CMAKE_SYSTEM_PROCESSOR} MATCHES "arm" OR ${CMAKE_SYSTEM_PROCESSOR} MATCHES "aarch")
-  set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DIS_ARM -DHAVE_NEON -mfloat-abi=hard -mfpu=neon")
-  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DIS_ARM -mfloat-abi=hard -mfpu=neon")
+if(${CMAKE_SYSTEM_PROCESSOR} MATCHES "arm" OR ${CMAKE_SYSTEM_PROCESSOR} MATCHES "aarch" OR ${CMAKE_SYSTEM_PROCESSOR} MATCHES "aarch64")
+  set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DIS_ARM -DHAVE_NEON")
+  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DIS_ARM")
+  if(${CMAKE_SYSTEM_PROCESSOR} MATCHES "aarch64")
+    set(HAVE_NEONv8 "True")
+    set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DHAVE_NEONv8")
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DHAVE_NEONv8")
+  endif(${CMAKE_SYSTEM_PROCESSOR} MATCHES "aarch64")
+  if(${CMAKE_SYSTEM_PROCESSOR} MATCHES "arm")
+    set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mfloat-abi=hard -mfpu=neon")
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mfloat-abi=hard -mfpu=neon")
+  endif(${CMAKE_SYSTEM_PROCESSOR} MATCHES "arm")
   message(STATUS "Detected ARM processor")
   set(HAVE_NEON "True")
-else(${CMAKE_SYSTEM_PROCESSOR} MATCHES "arm" OR ${CMAKE_SYSTEM_PROCESSOR} MATCHES "aarch")
+else(${CMAKE_SYSTEM_PROCESSOR} MATCHES "arm" OR ${CMAKE_SYSTEM_PROCESSOR} MATCHES "aarch" OR ${CMAKE_SYSTEM_PROCESSOR} MATCHES "aarch64")
   set(HAVE_NEON "False")
-endif(${CMAKE_SYSTEM_PROCESSOR} MATCHES "arm" OR ${CMAKE_SYSTEM_PROCESSOR} MATCHES "aarch")
+endif(${CMAKE_SYSTEM_PROCESSOR} MATCHES "arm" OR ${CMAKE_SYSTEM_PROCESSOR} MATCHES "aarch" OR ${CMAKE_SYSTEM_PROCESSOR} MATCHES "aarch64")
 set(CMAKE_REQUIRED_FLAGS ${CMAKE_C_FLAGS})

 if(NOT HAVE_SSE AND NOT HAVE_NEON AND NOT DISABLE_SIMD)
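
This hunk splits the old combined flag line: -DIS_ARM and -DHAVE_NEON still apply to every ARM target, -DHAVE_NEONv8 is added only on aarch64, and the -mfloat-abi=hard -mfpu=neon pair is now restricted to 32-bit arm, where NEON has to be switched on explicitly (aarch64 compilers reject -mfpu, since AdvSIMD is part of the baseline ISA there). A compile-time probe of the resulting macro set, illustrative only; it has to be built with the project's flags to report anything meaningful:

    #include <stdio.h>

    int main(void)
    {
    #if defined(IS_ARM) && defined(HAVE_NEONv8)
      puts("aarch64 build: NEONv8 demodulator paths enabled");
    #elif defined(IS_ARM) && defined(HAVE_NEON)
      puts("arm32 build: NEON enabled via -mfpu=neon");
    #else
      puts("non-ARM build: SSE or generic paths");
    #endif
      return 0;
    }
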

@@ -29,6 +29,7 @@
 #include <stdio.h>
 #include <sys/auxv.h>
 #include <asm/hwcap.h>
+#define USER_HWCAP_NEON (1 << 12)
 #else
 #include <cpuid.h>
 #define X86_CPUID_BASIC_LEAF 1

@@ -86,7 +87,11 @@ const char* x86_get_isa()
 #ifdef IS_ARM
 const char* arm_get_isa()
 {
+#ifdef HAVE_NEONv8
+  if (getauxval(AT_HWCAP) & USER_HWCAP_NEON) {
+#else
   if (getauxval(AT_HWCAP) & HWCAP_NEON) {
+#endif
     return "neon";
   } else {
     return "generic";

@@ -28,6 +28,56 @@
 #include "srslte/phy/utils/debug.h"
 #include "srslte/phy/utils/vector.h"

+#ifdef HAVE_NEONv8
+#include <arm_neon.h>
+
+inline static uint8x16_t v_load_s8(int i15, int i14, int i13, int i12, int i11, int i10, int i9, int i8,
+                                   int i7, int i6, int i5, int i4, int i3, int i2, int i1, int i0) {
+  uint8_t __attribute__((aligned(16))) data[16] = {
+      i0, i1, i2, i3, i4, i5, i6, i7, i8, i9, i10, i11, i12, i13, i14, i15};
+  return vld1q_u8(data);
+}
+
+#define int8x16_to_8x8x2(v) ((int8x8x2_t){{vget_low_s8(v), vget_high_s8(v)}})
+
+inline static void vshuff_s32_even(int32x4_t a, int imm, int32x4_t *res) {
+  *res = vsetq_lane_s32(vgetq_lane_s32((a), ((imm) >> 2) & 0x3), *res, 1);
+  *res = vsetq_lane_s32(vgetq_lane_s32((a), ((imm) >> 6) & 0x3), *res, 3);
+}
+
+inline static void vshuff_s32_odd(int32x4_t a, int imm, int32x4_t *res) {
+  *res = vsetq_lane_s32(vgetq_lane_s32((a), (imm)&0x3), *res, 0);
+  *res = vsetq_lane_s32(vgetq_lane_s32((a), ((imm) >> 4) & 0x3), *res, 2);
+}
+
+inline static void vshuff_s32_idx(int32x4_t a, int imm, int32x4_t *res, int idx) {
+  *res = vsetq_lane_s32(vgetq_lane_s32((a), ((imm) >> idx * 2) & 0x3), *res, idx);
+}
+
+inline static void vshuff_s16_idx(int16x8_t a, int imm, int16x8_t *res, int idx) {
+  *res = vsetq_lane_s16(vgetq_lane_s16((a), ((imm) >> (idx * 4)) & 0xF), *res, idx);
+}
+
+inline static void vshuff_s16_even(int16x8_t a, int imm, int16x8_t *res) {
+  *res = vsetq_lane_s16(vgetq_lane_s16((a), ((imm) >> 4) & 0xF), *res, 1);
+  *res = vsetq_lane_s16(vgetq_lane_s16((a), ((imm) >> 12) & 0xF), *res, 3);
+  *res = vsetq_lane_s16(vgetq_lane_s16((a), ((imm) >> 20) & 0xF), *res, 5);
+  *res = vsetq_lane_s16(vgetq_lane_s16((a), ((imm) >> 28) & 0xF), *res, 7);
+}
+
+inline static void vshuff_s16_odd(int16x8_t a, int imm, int16x8_t *res) {
+  *res = vsetq_lane_s16(vgetq_lane_s16((a), ((imm)) & 0xF), *res, 0);
+  *res = vsetq_lane_s16(vgetq_lane_s16((a), ((imm) >> 8) & 0xF), *res, 2);
+  *res = vsetq_lane_s16(vgetq_lane_s16((a), ((imm) >> 16) & 0xF), *res, 4);
+  *res = vsetq_lane_s16(vgetq_lane_s16((a), ((imm) >> 24) & 0xF), *res, 6);
+}
+#endif
+
 #ifdef LV_HAVE_SSE
 #include <smmintrin.h>
 void demod_16qam_lte_s_sse(const cf_t *symbols, short *llr, int nsymbols);
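
The vshuff_* helpers are scalar emulations of the fixed-pattern SSE shuffles (_mm_shuffle_epi8 with precomputed masks) used by the existing *_sse routines: the imm selector packs one source-lane index per destination lane (2 bits per lane for s32, 4 bits per lane for s16), and the odd/even variants each fill half of the destination so that negated and absolute-value vectors can be interleaved into one result register. (v_load_s8 and int8x16_to_8x8x2 are brought in alongside but are not used by the routines in this commit.) A standalone sketch of how the s32 pair composes, using the imm values 16 and 64 from demod_16qam_lte_s_neon below; the data values are hypothetical, purely for illustration:

    #include <arm_neon.h>
    #include <stdio.h>

    int main(void)
    {
      const int32_t a_[4] = {10, 11, 12, 13}; /* stands in for symbol_i   */
      const int32_t b_[4] = {20, 21, 22, 23}; /* stands in for symbol_abs */
      int32x4_t a = vld1q_s32(a_), b = vld1q_s32(b_);
      int32x4_t r = vdupq_n_s32(0);

      /* vshuff_s32_odd(a, 16, &r): fills lanes 0 and 2 from a */
      r = vsetq_lane_s32(vgetq_lane_s32(a, (16) & 0x3), r, 0);      /* a[0] */
      r = vsetq_lane_s32(vgetq_lane_s32(a, (16 >> 4) & 0x3), r, 2); /* a[1] */
      /* vshuff_s32_even(b, 64, &r): fills lanes 1 and 3 from b */
      r = vsetq_lane_s32(vgetq_lane_s32(b, (64 >> 2) & 0x3), r, 1); /* b[0] */
      r = vsetq_lane_s32(vgetq_lane_s32(b, (64 >> 6) & 0x3), r, 3); /* b[1] */

      int32_t out[4];
      vst1q_s32(out, r);
      printf("%d %d %d %d\n", out[0], out[1], out[2], out[3]); /* 10 20 11 21 */
      return 0;
    }

Each 32-bit lane of symbol_i holds one symbol's (re, im) pair of int16 LLRs, so this merge yields the output order {-y0re, -y0im, |y0re|-off, |y0im|-off, ...} in one register.
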
@@ -85,6 +135,118 @@ void demod_16qam_lte(const cf_t *symbols, float *llr, int nsymbols) {
   }
 }

+#ifdef HAVE_NEONv8
+void demod_16qam_lte_s_neon(const cf_t *symbols, short *llr, int nsymbols) {
+  float *symbolsPtr = (float *)symbols;
+  int16x8_t *resultPtr = (int16x8_t *)llr;
+  float32x4_t symbol1, symbol2;
+  int32x4_t symbol_i1, symbol_i2;
+  int16x8_t symbol_i, symbol_abs;
+  int8x16_t result11, result21;
+  result11 = vdupq_n_s8(0);
+  result21 = vdupq_n_s8(0);
+  int16x8_t offset = vdupq_n_s16(2 * SCALE_SHORT_CONV_QAM16 / sqrt(10));
+  float32x4_t scale_v = vdupq_n_f32(-SCALE_SHORT_CONV_QAM16);
+
+  for (int i = 0; i < nsymbols / 4; i++) {
+    symbol1 = vld1q_f32(symbolsPtr);
+    symbolsPtr += 4;
+    symbol2 = vld1q_f32(symbolsPtr);
+    symbolsPtr += 4;
+    symbol_i1 = vcvtnq_s32_f32(vmulq_f32(symbol1, scale_v));
+    symbol_i2 = vcvtnq_s32_f32(vmulq_f32(symbol2, scale_v));
+    symbol_i = vcombine_s16(vqmovn_s32(symbol_i1), vqmovn_s32(symbol_i2));
+    symbol_abs = vqabsq_s16(symbol_i);
+    symbol_abs = vsubq_s16(symbol_abs, offset);
+    vshuff_s32_odd((int32x4_t)symbol_i, 16, (int32x4_t *)&result11);
+    vshuff_s32_even((int32x4_t)symbol_abs, 64, (int32x4_t *)&result11);
+    vshuff_s32_odd((int32x4_t)symbol_i, 50, (int32x4_t *)&result21);
+    vshuff_s32_even((int32x4_t)symbol_abs, 200, (int32x4_t *)&result21);
+    vst1q_s8((int8_t *)resultPtr, result11);
+    resultPtr++;
+    vst1q_s8((int8_t *)resultPtr, result21);
+    resultPtr++;
+  }
+  // Demodulate last symbols
+  for (int i = 4 * (nsymbols / 4); i < nsymbols; i++) {
+    short yre = (short)(SCALE_SHORT_CONV_QAM16 * crealf(symbols[i]));
+    short yim = (short)(SCALE_SHORT_CONV_QAM16 * cimagf(symbols[i]));
+
+    llr[4 * i + 0] = -yre;
+    llr[4 * i + 1] = -yim;
+    llr[4 * i + 2] = abs(yre) - 2 * SCALE_SHORT_CONV_QAM16 / sqrt(10);
+    llr[4 * i + 3] = abs(yim) - 2 * SCALE_SHORT_CONV_QAM16 / sqrt(10);
+  }
+}
+
+void demod_16qam_lte_b_neon(const cf_t *symbols, int8_t *llr, int nsymbols) {
+  float *symbolsPtr = (float *)symbols;
+  int8x16_t *resultPtr = (int8x16_t *)llr;
+  float32x4_t symbol1, symbol2, symbol3, symbol4;
+  int8x16_t symbol_i, symbol_abs;
+  int16x8_t symbol_12, symbol_34;
+  int32x4_t symbol_i1, symbol_i2, symbol_i3, symbol_i4;
+  int8x16_t offset = vdupq_n_s8(2 * SCALE_BYTE_CONV_QAM16 / sqrt(10));
+  int8x16_t result1n, result2n;
+  float32x4_t scale_v = vdupq_n_f32(-SCALE_BYTE_CONV_QAM16);
+  result1n = vdupq_n_s8(0);
+  result2n = vdupq_n_s8(0);
+
+  for (int i = 0; i < nsymbols / 8; i++) {
+    symbol1 = vld1q_f32(symbolsPtr);
+    symbolsPtr += 4;
+    symbol2 = vld1q_f32(symbolsPtr);
+    symbolsPtr += 4;
+    symbol3 = vld1q_f32(symbolsPtr);
+    symbolsPtr += 4;
+    symbol4 = vld1q_f32(symbolsPtr);
+    symbolsPtr += 4;
+    symbol_i1 = vcvtnq_s32_f32(vmulq_f32(symbol1, scale_v));
+    symbol_i2 = vcvtnq_s32_f32(vmulq_f32(symbol2, scale_v));
+    symbol_i3 = vcvtnq_s32_f32(vmulq_f32(symbol3, scale_v));
+    symbol_i4 = vcvtnq_s32_f32(vmulq_f32(symbol4, scale_v));
+    symbol_12 = (int16x8_t)vcombine_s16(vqmovn_s32(symbol_i1), vqmovn_s32(symbol_i2));
+    symbol_34 = (int16x8_t)vcombine_s16(vqmovn_s32(symbol_i3), vqmovn_s32(symbol_i4));
+    symbol_i = (int8x16_t)vcombine_s8(vqmovn_s16(symbol_12), vqmovn_s16(symbol_34));
+    symbol_abs = vqabsq_s8(symbol_i);
+    symbol_abs = vsubq_s8(symbol_abs, offset);
+    vshuff_s16_odd((int16x8_t)symbol_i, 0x3020100, (int16x8_t *)&result1n);
+    vshuff_s16_even((int16x8_t)symbol_abs, 0x30201000, (int16x8_t *)&result1n);
+    vshuff_s16_odd((int16x8_t)symbol_i, 0x07060504, (int16x8_t *)&result2n);
+    vshuff_s16_even((int16x8_t)symbol_abs, 0x70605040, (int16x8_t *)&result2n);
+    vst1q_s8((int8_t *)resultPtr, result1n);
+    resultPtr++;
+    vst1q_s8((int8_t *)resultPtr, result2n);
+    resultPtr++;
+  }
+  // Demodulate last symbols
+  for (int i = 8 * (nsymbols / 8); i < nsymbols; i++) {
+    short yre = (int8_t)(SCALE_BYTE_CONV_QAM16 * crealf(symbols[i]));
+    short yim = (int8_t)(SCALE_BYTE_CONV_QAM16 * cimagf(symbols[i]));
+
+    llr[4 * i + 0] = -yre;
+    llr[4 * i + 1] = -yim;
+    llr[4 * i + 2] = abs(yre) - 2 * SCALE_BYTE_CONV_QAM16 / sqrt(10);
+    llr[4 * i + 3] = abs(yim) - 2 * SCALE_BYTE_CONV_QAM16 / sqrt(10);
+  }
+}
+#endif
+
 #ifdef LV_HAVE_SSE

 void demod_16qam_lte_s_sse(const cf_t *symbols, short *llr, int nsymbols) {
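
Both the vector body and the scalar tail implement the usual max-log LLR shortcut for Gray-mapped 16QAM (amplitudes ±1/√10, ±3/√10 at unit average energy): b0 ≈ -Re(y), b1 ≈ -Im(y), b2 ≈ |Re(y)| - 2/√10, b3 ≈ |Im(y)| - 2/√10, in fixed point via SCALE_SHORT_CONV_QAM16. A hypothetical self-check against that formula, not part of the commit; it would have to live in this translation unit (the _neon routine is file-local) with <stdlib.h> and <complex.h> in scope:

    static int check_demod_16qam_neon(void)
    {
      enum { N = 64 };                       /* multiple of 4, so no tail */
      cf_t  sym[N];
      short got[4 * N], want[4 * N];

      for (int i = 0; i < N; i++)            /* random symbols in the unit square */
        sym[i] = (2.0f * rand() / RAND_MAX - 1.0f) +
                 (2.0f * rand() / RAND_MAX - 1.0f) * I;

      demod_16qam_lte_s_neon(sym, got, N);
      for (int i = 0; i < N; i++) {          /* scalar reference formula */
        short yre = (short)(SCALE_SHORT_CONV_QAM16 * crealf(sym[i]));
        short yim = (short)(SCALE_SHORT_CONV_QAM16 * cimagf(sym[i]));
        want[4 * i + 0] = -yre;
        want[4 * i + 1] = -yim;
        want[4 * i + 2] = abs(yre) - 2 * SCALE_SHORT_CONV_QAM16 / sqrt(10);
        want[4 * i + 3] = abs(yim) - 2 * SCALE_SHORT_CONV_QAM16 / sqrt(10);
      }
      for (int i = 0; i < 4 * N; i++)
        if (abs(got[i] - want[i]) > 2)       /* slack: vcvtnq rounds to nearest,
                                                the casts above truncate */
          return -1;
      return 0;
    }
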
@@ -110,12 +272,12 @@ void demod_16qam_lte_s_sse(const cf_t *symbols, short *llr, int nsymbols) {
     symbol_abs = _mm_abs_epi16(symbol_i);
     symbol_abs = _mm_sub_epi16(symbol_abs, offset);
     result11 = _mm_shuffle_epi8(symbol_i, shuffle_negated_1);
     result12 = _mm_shuffle_epi8(symbol_abs, shuffle_abs_1);
     result21 = _mm_shuffle_epi8(symbol_i, shuffle_negated_2);
     result22 = _mm_shuffle_epi8(symbol_abs, shuffle_abs_2);
     _mm_store_si128(resultPtr, _mm_or_si128(result11, result12)); resultPtr++;
     _mm_store_si128(resultPtr, _mm_or_si128(result21, result22)); resultPtr++;
@@ -190,6 +352,9 @@ void demod_16qam_lte_b_sse(const cf_t *symbols, int8_t *llr, int nsymbols) {
 void demod_16qam_lte_s(const cf_t *symbols, short *llr, int nsymbols) {
 #ifdef LV_HAVE_SSE
   demod_16qam_lte_s_sse(symbols, llr, nsymbols);
+#else
+#ifdef HAVE_NEONv8
+  demod_16qam_lte_s_neon(symbols, llr, nsymbols);
 #else
   for (int i=0;i<nsymbols;i++) {
     short yre = (short) (SCALE_SHORT_CONV_QAM16*crealf(symbols[i]));

@@ -201,11 +366,15 @@ void demod_16qam_lte_s(const cf_t *symbols, short *llr, int nsymbols) {
     llr[4*i+3] = abs(yim)-2*SCALE_SHORT_CONV_QAM16/sqrt(10);
   }
 #endif
+#endif
 }

 void demod_16qam_lte_b(const cf_t *symbols, int8_t *llr, int nsymbols) {
 #ifdef LV_HAVE_SSE
   demod_16qam_lte_b_sse(symbols, llr, nsymbols);
+#else
+#ifdef HAVE_NEONv8
+  demod_16qam_lte_b_neon(symbols, llr, nsymbols);
 #else
   for (int i=0;i<nsymbols;i++) {
     int8_t yre = (int8_t) (SCALE_BYTE_CONV_QAM16*crealf(symbols[i]));

@@ -217,6 +386,7 @@ void demod_16qam_lte_b(const cf_t *symbols, int8_t *llr, int nsymbols) {
     llr[4*i+3] = abs(yim)-2*SCALE_BYTE_CONV_QAM16/sqrt(10);
   }
 #endif
+#endif
 }

 void demod_64qam_lte(const cf_t *symbols, float *llr, int nsymbols)
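
Worth noting in the dispatchers above: the new branch nests #ifdef HAVE_NEONv8 inside the #else of LV_HAVE_SSE, so SSE still wins if both were somehow defined and the plain loop remains the final fallback. The chain flattens to the sketch below (same behavior restated with #elif purely for readability, shown out of context; not a proposed change):

    void demod_16qam_lte_s(const cf_t *symbols, short *llr, int nsymbols) {
    #ifdef LV_HAVE_SSE
      demod_16qam_lte_s_sse(symbols, llr, nsymbols);    /* x86 SSE path */
    #elif defined(HAVE_NEONv8)
      demod_16qam_lte_s_neon(symbols, llr, nsymbols);   /* aarch64 NEON path */
    #else
      for (int i = 0; i < nsymbols; i++) {              /* portable fallback */
        short yre = (short)(SCALE_SHORT_CONV_QAM16 * crealf(symbols[i]));
        short yim = (short)(SCALE_SHORT_CONV_QAM16 * cimagf(symbols[i]));
        llr[4 * i + 0] = -yre;
        llr[4 * i + 1] = -yim;
        llr[4 * i + 2] = abs(yre) - 2 * SCALE_SHORT_CONV_QAM16 / sqrt(10);
        llr[4 * i + 3] = abs(yim) - 2 * SCALE_SHORT_CONV_QAM16 / sqrt(10);
      }
    #endif
    }
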
@@ -234,6 +404,171 @@ void demod_64qam_lte(const cf_t *symbols, float *llr, int nsymbols)
   }
 }

+#ifdef HAVE_NEONv8
+void demod_64qam_lte_s_neon(const cf_t *symbols, short *llr, int nsymbols) {
+  float *symbolsPtr = (float *)symbols;
+  uint16x8_t *resultPtr = (uint16x8_t *)llr;
+  float32x4_t symbol1, symbol2;
+  int16x8_t symbol_i, symbol_abs, symbol_abs2;
+  int32x4_t symbol_i1, symbol_i2;
+  int16x8_t offset1 = vdupq_n_s16(4 * SCALE_SHORT_CONV_QAM64 / sqrt(42));
+  int16x8_t offset2 = vdupq_n_s16(2 * SCALE_SHORT_CONV_QAM64 / sqrt(42));
+  float32x4_t scale_v = vdupq_n_f32(-SCALE_SHORT_CONV_QAM64);
+  int16x8_t result11 = vdupq_n_s16(0);
+  int16x8_t result21 = vdupq_n_s16(0);
+  int16x8_t result31 = vdupq_n_s16(0);
+
+  for (int i = 0; i < nsymbols / 4; i++) {
+    symbol1 = vld1q_f32(symbolsPtr);
+    symbolsPtr += 4;
+    symbol2 = vld1q_f32(symbolsPtr);
+    symbolsPtr += 4;
+    symbol_i1 = vcvtnq_s32_f32(vmulq_f32(symbol1, scale_v));
+    symbol_i2 = vcvtnq_s32_f32(vmulq_f32(symbol2, scale_v));
+    symbol_i = vcombine_s16(vqmovn_s32(symbol_i1), vqmovn_s32(symbol_i2));
+    symbol_abs = vqabsq_s16(symbol_i);
+    symbol_abs = vsubq_s16(symbol_abs, offset1);
+    symbol_abs2 = vsubq_s16(vqabsq_s16(symbol_abs), offset2);
+    vshuff_s32_idx((int32x4_t)symbol_i, 64, (int32x4_t *)&result11, 0);
+    vshuff_s32_idx((int32x4_t)symbol_abs, 64, (int32x4_t *)&result11, 1);
+    vshuff_s32_idx((int32x4_t)symbol_abs2, 64, (int32x4_t *)&result11, 2);
+    vshuff_s32_idx((int32x4_t)symbol_i, 64, (int32x4_t *)&result11, 3);
+    vshuff_s32_idx((int32x4_t)symbol_abs, 165, (int32x4_t *)&result21, 0);
+    vshuff_s32_idx((int32x4_t)symbol_abs2, 165, (int32x4_t *)&result21, 1);
+    vshuff_s32_idx((int32x4_t)symbol_i, 165, (int32x4_t *)&result21, 2);
+    vshuff_s32_idx((int32x4_t)symbol_abs, 165, (int32x4_t *)&result21, 3);
+    vshuff_s32_idx((int32x4_t)symbol_abs2, 254, (int32x4_t *)&result31, 0);
+    vshuff_s32_idx((int32x4_t)symbol_i, 254, (int32x4_t *)&result31, 1);
+    vshuff_s32_idx((int32x4_t)symbol_abs, 254, (int32x4_t *)&result31, 2);
+    vshuff_s32_idx((int32x4_t)symbol_abs2, 254, (int32x4_t *)&result31, 3);
+    vst1q_s16((int16_t *)resultPtr, result11);
+    resultPtr++;
+    vst1q_s16((int16_t *)resultPtr, result21);
+    resultPtr++;
+    vst1q_s16((int16_t *)resultPtr, result31);
+    resultPtr++;
+  }
+  for (int i = 4 * (nsymbols / 4); i < nsymbols; i++) {
+    float yre = (short)(SCALE_SHORT_CONV_QAM64 * crealf(symbols[i]));
+    float yim = (short)(SCALE_SHORT_CONV_QAM64 * cimagf(symbols[i]));
+
+    llr[6 * i + 0] = -yre;
+    llr[6 * i + 1] = -yim;
+    llr[6 * i + 2] = abs(yre) - 4 * SCALE_SHORT_CONV_QAM64 / sqrt(42);
+    llr[6 * i + 3] = abs(yim) - 4 * SCALE_SHORT_CONV_QAM64 / sqrt(42);
+    llr[6 * i + 4] = abs(llr[6 * i + 2]) - 2 * SCALE_SHORT_CONV_QAM64 / sqrt(42);
+    llr[6 * i + 5] = abs(llr[6 * i + 3]) - 2 * SCALE_SHORT_CONV_QAM64 / sqrt(42);
+  }
+}
+
+void demod_64qam_lte_b_neon(const cf_t *symbols, int8_t *llr, int nsymbols) {
+  float *symbolsPtr = (float *)symbols;
+  uint8x16_t *resultPtr = (uint8x16_t *)llr;
+  float32x4_t symbol1, symbol2, symbol3, symbol4;
+  int8x16_t symbol_i, symbol_abs, symbol_abs2;
+  int16x8_t symbol_12, symbol_34;
+  int32x4_t symbol_i1, symbol_i2, symbol_i3, symbol_i4;
+  int8x16_t offset1 = vdupq_n_s8(4 * SCALE_BYTE_CONV_QAM64 / sqrt(42));
+  int8x16_t offset2 = vdupq_n_s8(2 * SCALE_BYTE_CONV_QAM64 / sqrt(42));
+  float32x4_t scale_v = vdupq_n_f32(-SCALE_BYTE_CONV_QAM64);
+  int8x16_t result11 = vdupq_n_s8(0);
+  int8x16_t result21 = vdupq_n_s8(0);
+  int8x16_t result31 = vdupq_n_s8(0);
+
+  for (int i = 0; i < nsymbols / 8; i++) {
+    symbol1 = vld1q_f32(symbolsPtr);
+    symbolsPtr += 4;
+    symbol2 = vld1q_f32(symbolsPtr);
+    symbolsPtr += 4;
+    symbol3 = vld1q_f32(symbolsPtr);
+    symbolsPtr += 4;
+    symbol4 = vld1q_f32(symbolsPtr);
+    symbolsPtr += 4;
+    symbol_i1 = vcvtnq_s32_f32(vmulq_f32(symbol1, scale_v));
+    symbol_i2 = vcvtnq_s32_f32(vmulq_f32(symbol2, scale_v));
+    symbol_i3 = vcvtnq_s32_f32(vmulq_f32(symbol3, scale_v));
+    symbol_i4 = vcvtnq_s32_f32(vmulq_f32(symbol4, scale_v));
+    symbol_12 = vcombine_s16(vqmovn_s32(symbol_i1), vqmovn_s32(symbol_i2));
+    symbol_34 = vcombine_s16(vqmovn_s32(symbol_i3), vqmovn_s32(symbol_i4));
+    symbol_i = vcombine_s8(vqmovn_s16(symbol_12), vqmovn_s16(symbol_34));
+    symbol_abs = vqabsq_s8(symbol_i);
+    symbol_abs = vsubq_s8(symbol_abs, offset1);
+    symbol_abs2 = vsubq_s8(vqabsq_s8(symbol_abs), offset2);
+    vshuff_s16_idx((int16x8_t)symbol_i, 0x22111000, (int16x8_t *)&result11, 0);
+    vshuff_s16_idx((int16x8_t)symbol_abs, 0x22111000, (int16x8_t *)&result11, 1);
+    vshuff_s16_idx((int16x8_t)symbol_abs2, 0x22111000, (int16x8_t *)&result11, 2);
+    vshuff_s16_idx((int16x8_t)symbol_i, 0x22111000, (int16x8_t *)&result11, 3);
+    vshuff_s16_idx((int16x8_t)symbol_abs, 0x22111000, (int16x8_t *)&result11, 4);
+    vshuff_s16_idx((int16x8_t)symbol_abs2, 0x22111000, (int16x8_t *)&result11, 5);
+    vshuff_s16_idx((int16x8_t)symbol_i, 0x22111000, (int16x8_t *)&result11, 6);
+    vshuff_s16_idx((int16x8_t)symbol_abs, 0x22111000, (int16x8_t *)&result11, 7);
+    vshuff_s16_idx((int16x8_t)symbol_abs2, 0x54443332, (int16x8_t *)&result21, 0);
+    vshuff_s16_idx((int16x8_t)symbol_i, 0x54443332, (int16x8_t *)&result21, 1);
+    vshuff_s16_idx((int16x8_t)symbol_abs, 0x54443332, (int16x8_t *)&result21, 2);
+    vshuff_s16_idx((int16x8_t)symbol_abs2, 0x54443332, (int16x8_t *)&result21, 3);
+    vshuff_s16_idx((int16x8_t)symbol_i, 0x54443332, (int16x8_t *)&result21, 4);
+    vshuff_s16_idx((int16x8_t)symbol_abs, 0x54443332, (int16x8_t *)&result21, 5);
+    vshuff_s16_idx((int16x8_t)symbol_abs2, 0x54443332, (int16x8_t *)&result21, 6);
+    vshuff_s16_idx((int16x8_t)symbol_i, 0x54443332, (int16x8_t *)&result21, 7);
+    vshuff_s16_idx((int16x8_t)symbol_abs, 0x77766655, (int16x8_t *)&result31, 0);
+    vshuff_s16_idx((int16x8_t)symbol_abs2, 0x77766655, (int16x8_t *)&result31, 1);
+    vshuff_s16_idx((int16x8_t)symbol_i, 0x77766655, (int16x8_t *)&result31, 2);
+    vshuff_s16_idx((int16x8_t)symbol_abs, 0x77766655, (int16x8_t *)&result31, 3);
+    vshuff_s16_idx((int16x8_t)symbol_abs2, 0x77766655, (int16x8_t *)&result31, 4);
+    vshuff_s16_idx((int16x8_t)symbol_i, 0x77766655, (int16x8_t *)&result31, 5);
+    vshuff_s16_idx((int16x8_t)symbol_abs, 0x77766655, (int16x8_t *)&result31, 6);
+    vshuff_s16_idx((int16x8_t)symbol_abs2, 0x77766655, (int16x8_t *)&result31, 7);
+    vst1q_s8((int8_t *)resultPtr, result11);
+    resultPtr++;
+    vst1q_s8((int8_t *)resultPtr, result21);
+    resultPtr++;
+    vst1q_s8((int8_t *)resultPtr, result31);
+    resultPtr++;
+  }
+  for (int i = 8 * (nsymbols / 8); i < nsymbols; i++) {
+    float yre = (int8_t)(SCALE_BYTE_CONV_QAM64 * crealf(symbols[i]));
+    float yim = (int8_t)(SCALE_BYTE_CONV_QAM64 * cimagf(symbols[i]));
+
+    llr[6 * i + 0] = -yre;
+    llr[6 * i + 1] = -yim;
+    llr[6 * i + 2] = abs(yre) - 4 * SCALE_BYTE_CONV_QAM64 / sqrt(42);
+    llr[6 * i + 3] = abs(yim) - 4 * SCALE_BYTE_CONV_QAM64 / sqrt(42);
+    llr[6 * i + 4] = abs(llr[6 * i + 2]) - 2 * SCALE_BYTE_CONV_QAM64 / sqrt(42);
+    llr[6 * i + 5] = abs(llr[6 * i + 3]) - 2 * SCALE_BYTE_CONV_QAM64 / sqrt(42);
+  }
+}
+#endif

 #ifdef LV_HAVE_SSE
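
The 64QAM variants extend the same max-log recursion one level: with amplitudes {±1, ±3, ±5, ±7}/√42, the six soft bits are b0 ≈ -Re(y), b1 ≈ -Im(y), b2 ≈ |Re(y)| - 4/√42, b3 ≈ |Im(y)| - 4/√42, and b4/b5 fold once more as |b2| - 2/√42 and |b3| - 2/√42, which is why symbol_abs2 is derived from symbol_abs rather than from the raw symbol. A standalone scalar restatement of that mapping (mirrors the tail loops above; `scale` stands in for SCALE_SHORT_CONV_QAM64):

    #include <complex.h>
    #include <math.h>
    #include <stdlib.h>

    typedef float complex cf_t;

    static void demod_64qam_ref(const cf_t *sym, short *llr, int n, short scale)
    {
      for (int i = 0; i < n; i++) {
        short yre = (short)(scale * crealf(sym[i]));
        short yim = (short)(scale * cimagf(sym[i]));
        llr[6 * i + 0] = -yre;                                        /* b0 */
        llr[6 * i + 1] = -yim;                                        /* b1 */
        llr[6 * i + 2] = abs(yre) - 4 * scale / sqrt(42);             /* b2 */
        llr[6 * i + 3] = abs(yim) - 4 * scale / sqrt(42);             /* b3 */
        llr[6 * i + 4] = abs(llr[6 * i + 2]) - 2 * scale / sqrt(42);  /* b4 */
        llr[6 * i + 5] = abs(llr[6 * i + 3]) - 2 * scale / sqrt(42);  /* b5 */
      }
    }

Three result registers per loop iteration fall out of the geometry: 4 symbols × 6 short LLRs = 24 shorts, i.e. three 128-bit stores, which is what the imm selectors 64, 165, and 254 distribute across result11/21/31.
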
@@ -355,7 +690,6 @@ void demod_64qam_lte_b_sse(const cf_t *symbols, int8_t *llr, int nsymbols)
     _mm_store_si128(resultPtr, _mm_or_si128(_mm_or_si128(result11, result12),result13)); resultPtr++;
     _mm_store_si128(resultPtr, _mm_or_si128(_mm_or_si128(result21, result22),result23)); resultPtr++;
     _mm_store_si128(resultPtr, _mm_or_si128(_mm_or_si128(result31, result32),result33)); resultPtr++;
-
   }
   for (int i=8*(nsymbols/8);i<nsymbols;i++) {
     float yre = (int8_t) (SCALE_BYTE_CONV_QAM64*crealf(symbols[i]));
@@ -376,6 +710,9 @@ void demod_64qam_lte_s(const cf_t *symbols, short *llr, int nsymbols)
 {
 #ifdef LV_HAVE_SSE
   demod_64qam_lte_s_sse(symbols, llr, nsymbols);
+#else
+#ifdef HAVE_NEONv8
+  demod_64qam_lte_s_neon(symbols, llr, nsymbols);
 #else
   for (int i=0;i<nsymbols;i++) {
     float yre = (short) (SCALE_SHORT_CONV_QAM64*crealf(symbols[i]));

@@ -389,12 +726,16 @@ void demod_64qam_lte_s(const cf_t *symbols, short *llr, int nsymbols)
     llr[6*i+5] = abs(llr[6*i+3])-2*SCALE_SHORT_CONV_QAM64/sqrt(42);
   }
 #endif
+#endif
 }

 void demod_64qam_lte_b(const cf_t *symbols, int8_t *llr, int nsymbols)
 {
 #ifdef LV_HAVE_SSE
   demod_64qam_lte_b_sse(symbols, llr, nsymbols);
+#else
+#ifdef HAVE_NEONv8
+  demod_64qam_lte_b_neon(symbols, llr, nsymbols);
 #else
   for (int i=0;i<nsymbols;i++) {
     float yre = (int8_t) (SCALE_BYTE_CONV_QAM64*crealf(symbols[i]));

@@ -408,6 +749,7 @@ void demod_64qam_lte_b(const cf_t *symbols, int8_t *llr, int nsymbols)
     llr[6*i+5] = abs(llr[6*i+3])-2*SCALE_BYTE_CONV_QAM64/sqrt(42);
   }
 #endif
+#endif
 }

 void demod_256qam_lte(const cf_t* symbols, float* llr, int nsymbols)
