From 4a1b8a595261a6f180a9dcb4a8a83a9af3278bf3 Mon Sep 17 00:00:00 2001 From: Justin Tallon Date: Wed, 6 Nov 2019 12:11:26 +0100 Subject: [PATCH] adding support for neon soft demodulation, adding cmake checks for aarch64 * adding support for neon soft demodulation, adding cmake checks for aarch64 * rearranging some arm cmake flags --- CMakeLists.txt | 27 ++- lib/src/common/arch_select.cc | 5 + lib/src/phy/modem/demod_soft.c | 350 ++++++++++++++++++++++++++++++++- 3 files changed, 371 insertions(+), 11 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 23e27d754..24a6ca965 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -85,8 +85,12 @@ option(ENABLE_TIDY "Enable clang tidy" OFF) option(USE_LTE_RATES "Use standard LTE sampling rates" OFF) option(USE_GLIBC_IPV6 "Use glibc's own ipv6.h" ON) -set(GCC_ARCH native CACHE STRING "GCC compile for specific architecture.") - +if(${CMAKE_SYSTEM_PROCESSOR} MATCHES "aarch64") + set(GCC_ARCH armv8-a CACHE STRING "GCC compile for specific architecture.") + message(STATUS "Detected aarch64 processor") +else(${CMAKE_SYSTEM_PROCESSOR} MATCHES "aarch64") + set(GCC_ARCH native CACHE STRING "GCC compile for specific architecture.") +endif(${CMAKE_SYSTEM_PROCESSOR} MATCHES "aarch64") ######################################################################## # Find dependencies @@ -360,14 +364,23 @@ if(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang") endif(HAVE_SSE) endif(NOT ${CMAKE_BUILD_TYPE} STREQUAL "Debug") - if(${CMAKE_SYSTEM_PROCESSOR} MATCHES "arm" OR ${CMAKE_SYSTEM_PROCESSOR} MATCHES "aarch") - set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DIS_ARM -DHAVE_NEON -mfloat-abi=hard -mfpu=neon") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DIS_ARM -mfloat-abi=hard -mfpu=neon") + if(${CMAKE_SYSTEM_PROCESSOR} MATCHES "arm" OR ${CMAKE_SYSTEM_PROCESSOR} MATCHES "aarch" OR ${CMAKE_SYSTEM_PROCESSOR} MATCHES "aarch64") + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DIS_ARM -DHAVE_NEON") + set(CMAKE_CXX_FLAGS 
"${CMAKE_CXX_FLAGS} -DIS_ARM") + if(${CMAKE_SYSTEM_PROCESSOR} MATCHES "aarch64") + set(HAVE_NEONv8 "True") + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DHAVE_NEONv8") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DHAVE_NEONv8") + endif(${CMAKE_SYSTEM_PROCESSOR} MATCHES "aarch64") + if(${CMAKE_SYSTEM_PROCESSOR} MATCHES "arm") + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mfloat-abi=hard -mfpu=neon") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mfloat-abi=hard -mfpu=neon") + endif(${CMAKE_SYSTEM_PROCESSOR} MATCHES "arm") message(STATUS "Detected ARM processor") set(HAVE_NEON "True") - else(${CMAKE_SYSTEM_PROCESSOR} MATCHES "arm" OR ${CMAKE_SYSTEM_PROCESSOR} MATCHES "aarch") + else(${CMAKE_SYSTEM_PROCESSOR} MATCHES "arm" OR ${CMAKE_SYSTEM_PROCESSOR} MATCHES "aarch" OR ${CMAKE_SYSTEM_PROCESSOR} MATCHES "aarch64") set(HAVE_NEON "False") - endif(${CMAKE_SYSTEM_PROCESSOR} MATCHES "arm" OR ${CMAKE_SYSTEM_PROCESSOR} MATCHES "aarch") + endif(${CMAKE_SYSTEM_PROCESSOR} MATCHES "arm" OR ${CMAKE_SYSTEM_PROCESSOR} MATCHES "aarch" OR ${CMAKE_SYSTEM_PROCESSOR} MATCHES "aarch64") set(CMAKE_REQUIRED_FLAGS ${CMAKE_C_FLAGS}) if(NOT HAVE_SSE AND NOT HAVE_NEON AND NOT DISABLE_SIMD) diff --git a/lib/src/common/arch_select.cc b/lib/src/common/arch_select.cc index 148dff8f9..d4c5e9e38 100644 --- a/lib/src/common/arch_select.cc +++ b/lib/src/common/arch_select.cc @@ -29,6 +29,7 @@ #include #include #include +#define USER_HWCAP_NEON (1 << 12) #else #include #define X86_CPUID_BASIC_LEAF 1 @@ -86,7 +87,11 @@ const char* x86_get_isa() #ifdef IS_ARM const char* arm_get_isa() { +#ifdef HAVE_NEONv8 + if (getauxval(AT_HWCAP) & USER_HWCAP_NEON) { +#else if (getauxval(AT_HWCAP) & HWCAP_NEON) { +#endif return "neon"; } else { return "generic"; diff --git a/lib/src/phy/modem/demod_soft.c b/lib/src/phy/modem/demod_soft.c index f63f6f9e3..b2269ca46 100644 --- a/lib/src/phy/modem/demod_soft.c +++ b/lib/src/phy/modem/demod_soft.c @@ -28,6 +28,56 @@ #include "srslte/phy/utils/debug.h" #include "srslte/phy/utils/vector.h" 
+#ifdef HAVE_NEONv8 +#include <arm_neon.h> +inline static uint8x16_t v_load_s8(int i15, int i14, int i13, int i12, int i11, + int i10, int i9, int i8, int i7, int i6, + int i5, int i4, int i3, int i2, int i1, + int i0) { + uint8_t __attribute__((aligned(16))) data[16] = { + i0, i1, i2, i3, i4, i5, i6, i7, i8, i9, i10, i11, i12, i13, i14, i15}; + return vld1q_u8(data); +} + +#define int8x16_to_8x8x2(v) ((int8x8x2_t){{vget_low_s8(v), vget_high_s8(v)}}) + +inline static void vshuff_s32_even(int32x4_t a, int imm, int32x4_t *res) { + *res = vsetq_lane_s32(vgetq_lane_s32((a), ((imm) >> 2) & 0x3), *res, 1); + *res = vsetq_lane_s32(vgetq_lane_s32((a), ((imm) >> 6) & 0x3), *res, 3); +} +inline static void vshuff_s32_odd(int32x4_t a, int imm, int32x4_t *res) { + *res = vsetq_lane_s32(vgetq_lane_s32((a), (imm)&0x3), *res, 0); + *res = vsetq_lane_s32(vgetq_lane_s32((a), ((imm) >> 4) & 0x3), *res, 2); +} + +inline static void vshuff_s32_idx(int32x4_t a, int imm, int32x4_t *res, + int idx) { + *res = + vsetq_lane_s32(vgetq_lane_s32((a), ((imm) >> idx * 2) & 0x3), *res, idx); +} + +inline static void vshuff_s16_idx(int16x8_t a, int imm, int16x8_t *res, + int idx) { + *res = vsetq_lane_s16(vgetq_lane_s16((a), ((imm) >> (idx * 4)) & 0xF), *res, + idx); +} + +inline static void vshuff_s16_even(int16x8_t a, int imm, int16x8_t *res) { + *res = vsetq_lane_s16(vgetq_lane_s16((a), ((imm) >> 4) & 0xF), *res, 1); + *res = vsetq_lane_s16(vgetq_lane_s16((a), ((imm) >> 12) & 0xF), *res, 3); + *res = vsetq_lane_s16(vgetq_lane_s16((a), ((imm) >> 20) & 0xF), *res, 5); + *res = vsetq_lane_s16(vgetq_lane_s16((a), ((imm) >> 28) & 0xF), *res, 7); +} + +inline static void vshuff_s16_odd(int16x8_t a, int imm, int16x8_t *res) { + *res = vsetq_lane_s16(vgetq_lane_s16((a), ((imm)) & 0xF), *res, 0); + *res = vsetq_lane_s16(vgetq_lane_s16((a), ((imm) >> 8) & 0xF), *res, 2); + *res = vsetq_lane_s16(vgetq_lane_s16((a), ((imm) >> 16) & 0xF), *res, 4); + *res = vsetq_lane_s16(vgetq_lane_s16((a), ((imm) >> 24) & 0xF), *res, 
6); +} + +#endif + #ifdef LV_HAVE_SSE #include void demod_16qam_lte_s_sse(const cf_t *symbols, short *llr, int nsymbols); @@ -85,6 +135,118 @@ void demod_16qam_lte(const cf_t *symbols, float *llr, int nsymbols) { } } +#ifdef HAVE_NEONv8 + +void demod_16qam_lte_s_neon(const cf_t *symbols, short *llr, int nsymbols) { + float *symbolsPtr = (float *)symbols; + int16x8_t *resultPtr = (int16x8_t *)llr; + float32x4_t symbol1, symbol2; + int32x4_t symbol_i1, symbol_i2; + int16x8_t symbol_i, symbol_abs; + int8x16_t result11, result21; + result11 = vdupq_n_s8(0); + result21 = vdupq_n_s8(0); + int16x8_t offset = vdupq_n_s16(2 * SCALE_SHORT_CONV_QAM16 / sqrt(10)); + float32x4_t scale_v = vdupq_n_f32(-SCALE_SHORT_CONV_QAM16); + + for (int i = 0; i < nsymbols / 4; i++) { + symbol1 = vld1q_f32(symbolsPtr); + symbolsPtr += 4; + symbol2 = vld1q_f32(symbolsPtr); + symbolsPtr += 4; + + symbol_i1 = vcvtnq_s32_f32(vmulq_f32(symbol1, scale_v)); + symbol_i2 = vcvtnq_s32_f32(vmulq_f32(symbol2, scale_v)); + symbol_i = vcombine_s16(vqmovn_s32(symbol_i1), vqmovn_s32(symbol_i2)); + + symbol_abs = vqabsq_s16(symbol_i); + symbol_abs = vsubq_s16(symbol_abs, offset); + + vshuff_s32_odd((int32x4_t)symbol_i, 16, (int32x4_t *)&result11); + vshuff_s32_even((int32x4_t)symbol_abs, 64, (int32x4_t *)&result11); + + vshuff_s32_odd((int32x4_t)symbol_i, 50, (int32x4_t *)&result21); + vshuff_s32_even((int32x4_t)symbol_abs, 200, (int32x4_t *)&result21); + + vst1q_s8((int8_t *)resultPtr, result11); + resultPtr++; + vst1q_s8((int8_t *)resultPtr, result21); + resultPtr++; + } + // Demodulate last symbols + for (int i = 4 * (nsymbols / 4); i < nsymbols; i++) { + short yre = (short)(SCALE_SHORT_CONV_QAM16 * crealf(symbols[i])); + short yim = (short)(SCALE_SHORT_CONV_QAM16 * cimagf(symbols[i])); + + llr[4 * i + 0] = -yre; + llr[4 * i + 1] = -yim; + llr[4 * i + 2] = abs(yre) - 2 * SCALE_SHORT_CONV_QAM16 / sqrt(10); + llr[4 * i + 3] = abs(yim) - 2 * SCALE_SHORT_CONV_QAM16 / sqrt(10); + } +} + +void 
demod_16qam_lte_b_neon(const cf_t *symbols, int8_t *llr, int nsymbols) { + float *symbolsPtr = (float *)symbols; + int8x16_t *resultPtr = (int8x16_t *)llr; + float32x4_t symbol1, symbol2, symbol3, symbol4; + int8x16_t symbol_i, symbol_abs; + int16x8_t symbol_12, symbol_34; + int32x4_t symbol_i1, symbol_i2, symbol_i3, symbol_i4; + int8x16_t offset = vdupq_n_s8(2 * SCALE_BYTE_CONV_QAM16 / sqrt(10)); + int8x16_t result1n, result2n; + float32x4_t scale_v = vdupq_n_f32(-SCALE_BYTE_CONV_QAM16); + + result1n = vdupq_n_s8(0); + result2n = vdupq_n_s8(0); + for (int i = 0; i < nsymbols / 8; i++) { + + symbol1 = vld1q_f32(symbolsPtr); + symbolsPtr += 4; + symbol2 = vld1q_f32(symbolsPtr); + symbolsPtr += 4; + symbol3 = vld1q_f32(symbolsPtr); + symbolsPtr += 4; + symbol4 = vld1q_f32(symbolsPtr); + symbolsPtr += 4; + symbol_i1 = vcvtnq_s32_f32(vmulq_f32(symbol1, scale_v)); + symbol_i2 = vcvtnq_s32_f32(vmulq_f32(symbol2, scale_v)); + symbol_i3 = vcvtnq_s32_f32(vmulq_f32(symbol3, scale_v)); + symbol_i4 = vcvtnq_s32_f32(vmulq_f32(symbol4, scale_v)); + + symbol_12 = + (int16x8_t)vcombine_s16(vqmovn_s32(symbol_i1), vqmovn_s32(symbol_i2)); + symbol_34 = + (int16x8_t)vcombine_s16(vqmovn_s32(symbol_i3), vqmovn_s32(symbol_i4)); + symbol_i = + (int8x16_t)vcombine_s8(vqmovn_s16(symbol_12), vqmovn_s16(symbol_34)); + symbol_abs = vqabsq_s8(symbol_i); + symbol_abs = vsubq_s8(symbol_abs, offset); + + vshuff_s16_odd((int16x8_t)symbol_i, 0x3020100, (int16x8_t *)&result1n); + vshuff_s16_even((int16x8_t)symbol_abs, 0x30201000, (int16x8_t *)&result1n); + + vshuff_s16_odd((int16x8_t)symbol_i, 0x07060504, (int16x8_t *)&result2n); + vshuff_s16_even((int16x8_t)symbol_abs, 0x70605040, (int16x8_t *)&result2n); + + vst1q_s8((int8_t *)resultPtr, result1n); + resultPtr++; + vst1q_s8((int8_t *)resultPtr, result2n); + resultPtr++; + } + // Demodulate last symbols + for (int i = 8 * (nsymbols / 8); i < nsymbols; i++) { + short yre = (int8_t)(SCALE_BYTE_CONV_QAM16 * crealf(symbols[i])); + short yim = 
(int8_t)(SCALE_BYTE_CONV_QAM16 * cimagf(symbols[i])); + + llr[4 * i + 0] = -yre; + llr[4 * i + 1] = -yim; + llr[4 * i + 2] = abs(yre) - 2 * SCALE_BYTE_CONV_QAM16 / sqrt(10); + llr[4 * i + 3] = abs(yim) - 2 * SCALE_BYTE_CONV_QAM16 / sqrt(10); + } +} + +#endif + #ifdef LV_HAVE_SSE void demod_16qam_lte_s_sse(const cf_t *symbols, short *llr, int nsymbols) { @@ -110,12 +272,12 @@ void demod_16qam_lte_s_sse(const cf_t *symbols, short *llr, int nsymbols) { symbol_abs = _mm_abs_epi16(symbol_i); symbol_abs = _mm_sub_epi16(symbol_abs, offset); - + result11 = _mm_shuffle_epi8(symbol_i, shuffle_negated_1); result12 = _mm_shuffle_epi8(symbol_abs, shuffle_abs_1); - result21 = _mm_shuffle_epi8(symbol_i, shuffle_negated_2); - result22 = _mm_shuffle_epi8(symbol_abs, shuffle_abs_2); + result21 = _mm_shuffle_epi8(symbol_i, shuffle_negated_2); + result22 = _mm_shuffle_epi8(symbol_abs, shuffle_abs_2); _mm_store_si128(resultPtr, _mm_or_si128(result11, result12)); resultPtr++; _mm_store_si128(resultPtr, _mm_or_si128(result21, result22)); resultPtr++; @@ -190,6 +352,9 @@ void demod_16qam_lte_b_sse(const cf_t *symbols, int8_t *llr, int nsymbols) { void demod_16qam_lte_s(const cf_t *symbols, short *llr, int nsymbols) { #ifdef LV_HAVE_SSE demod_16qam_lte_s_sse(symbols, llr, nsymbols); +#else +#ifdef HAVE_NEONv8 + demod_16qam_lte_s_neon(symbols, llr, nsymbols); #else for (int i=0;i