|
|
@ -1551,22 +1551,19 @@ static inline simd_s_t srslte_simd_s_neg(simd_s_t a, simd_s_t b)
|
|
|
|
/* Taken and modified from sse2neon.h licensed under MIT
|
|
|
|
/* Taken and modified from sse2neon.h licensed under MIT
|
|
|
|
* Source: https://github.com/DLTcollab/sse2neon
|
|
|
|
* Source: https://github.com/DLTcollab/sse2neon
|
|
|
|
*/
|
|
|
|
*/
|
|
|
|
int16x8_t _a = vreinterpretq_s16_s32(a);
|
|
|
|
|
|
|
|
int16x8_t _b = vreinterpretq_s16_s32(b);
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
int16x8_t zero = vdupq_n_s16(0);
|
|
|
|
int16x8_t zero = vdupq_n_s16(0);
|
|
|
|
// signed shift right: faster than vclt
|
|
|
|
// signed shift right: faster than vclt
|
|
|
|
// (b < 0) ? 0xFFFF : 0
|
|
|
|
// (b < 0) ? 0xFFFF : 0
|
|
|
|
uint16x8_t ltMask = vreinterpretq_u16_s16(vshrq_n_s16(_b, 15));
|
|
|
|
uint16x8_t ltMask = vreinterpretq_u16_s16(vshrq_n_s16(b, 15));
|
|
|
|
// (b == 0) ? 0xFFFF : 0
|
|
|
|
// (b == 0) ? 0xFFFF : 0
|
|
|
|
int16x8_t zeroMask = vreinterpretq_s16_u16(vceqq_s16(_b, zero));
|
|
|
|
int16x8_t zeroMask = vreinterpretq_s16_u16(vceqq_s16(b, zero));
|
|
|
|
// -a
|
|
|
|
// -a
|
|
|
|
int16x8_t neg = vnegq_s16(_a);
|
|
|
|
int16x8_t neg = vnegq_s16(a);
|
|
|
|
// bitwise select either a or neg based on ltMask
|
|
|
|
// bitwise select either a or neg based on ltMask
|
|
|
|
int16x8_t masked = vbslq_s16(ltMask, _a, neg);
|
|
|
|
int16x8_t masked = vbslq_s16(ltMask, a, neg);
|
|
|
|
// res = masked & (~zeroMask)
|
|
|
|
// res = masked & (~zeroMask)
|
|
|
|
int16x8_t res = vbicq_s16(masked, zeroMask);
|
|
|
|
int16x8_t res = vbicq_s16(masked, zeroMask);
|
|
|
|
return vreinterpretq_s32_s16(res);
|
|
|
|
return res;
|
|
|
|
#endif /* HAVE_NEON */
|
|
|
|
#endif /* HAVE_NEON */
|
|
|
|
#endif /* LV_HAVE_SSE */
|
|
|
|
#endif /* LV_HAVE_SSE */
|
|
|
|
#endif /* LV_HAVE_AVX2 */
|
|
|
|
#endif /* LV_HAVE_AVX2 */
|
|
|
@ -2068,22 +2065,19 @@ static inline simd_s_t srslte_simd_b_neg(simd_b_t a, simd_b_t b)
|
|
|
|
/* Taken and modified from sse2neon.h licensed under MIT
|
|
|
|
/* Taken and modified from sse2neon.h licensed under MIT
|
|
|
|
* Source: https://github.com/DLTcollab/sse2neon
|
|
|
|
* Source: https://github.com/DLTcollab/sse2neon
|
|
|
|
*/
|
|
|
|
*/
|
|
|
|
int8x16_t _a = vreinterpretq_s8_s64(a);
|
|
|
|
|
|
|
|
int8x16_t _b = vreinterpretq_s8_s64(b);
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
int8x16_t zero = vdupq_n_s8(0);
|
|
|
|
int8x16_t zero = vdupq_n_s8(0);
|
|
|
|
// signed shift right: faster than vclt
|
|
|
|
// signed shift right: faster than vclt
|
|
|
|
// (b < 0) ? 0xFF : 0
|
|
|
|
// (b < 0) ? 0xFF : 0
|
|
|
|
uint8x16_t ltMask = vreinterpretq_u8_s8(vshrq_n_s8(_b, 7));
|
|
|
|
uint8x16_t ltMask = vreinterpretq_u8_s8(vshrq_n_s8(b, 7));
|
|
|
|
// (b == 0) ? 0xFF : 0
|
|
|
|
// (b == 0) ? 0xFF : 0
|
|
|
|
int8x16_t zeroMask = vreinterpretq_s8_u8(vceqq_s8(_b, zero));
|
|
|
|
int8x16_t zeroMask = vreinterpretq_s8_u8(vceqq_s8(b, zero));
|
|
|
|
// -a
|
|
|
|
// -a
|
|
|
|
int8x16_t neg = vnegq_s8(_a);
|
|
|
|
int8x16_t neg = vnegq_s8(a);
|
|
|
|
// bitwise select either a or neg based on ltMask
|
|
|
|
// bitwise select either a or neg based on ltMask
|
|
|
|
int8x16_t masked = vbslq_s8(ltMask, _a, neg);
|
|
|
|
int8x16_t masked = vbslq_s8(ltMask, a, neg);
|
|
|
|
// res = masked & (~zeroMask)
|
|
|
|
// res = masked & (~zeroMask)
|
|
|
|
int8x16_t res = vbicq_s8(masked, zeroMask);
|
|
|
|
int8x16_t res = vbicq_s8(masked, zeroMask);
|
|
|
|
return vreinterpretq_s64_s8(res);
|
|
|
|
return res;
|
|
|
|
#endif /* HAVE_NEON */
|
|
|
|
#endif /* HAVE_NEON */
|
|
|
|
#endif /* LV_HAVE_SSE */
|
|
|
|
#endif /* LV_HAVE_SSE */
|
|
|
|
#endif /* LV_HAVE_AVX2 */
|
|
|
|
#endif /* LV_HAVE_AVX2 */
|
|
|
|