mirror of https://github.com/pvnis/srsRAN_4G.git
Integrated AVX2 decoder in PDSCH object. Added inter-frame SSE decoder (not working and not integrated)
parent
c1ef9da32a
commit
f00ea8c8ed
@ -0,0 +1,119 @@
|
||||
/**
|
||||
*
|
||||
* \section COPYRIGHT
|
||||
*
|
||||
* Copyright 2013-2015 Software Radio Systems Limited
|
||||
*
|
||||
* \section LICENSE
|
||||
*
|
||||
* This file is part of the srsLTE library.
|
||||
*
|
||||
* srsLTE is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU Affero General Public License as
|
||||
* published by the Free Software Foundation, either version 3 of
|
||||
* the License, or (at your option) any later version.
|
||||
*
|
||||
* srsLTE is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU Affero General Public License for more details.
|
||||
*
|
||||
* A copy of the GNU Affero General Public License can be found in
|
||||
* the LICENSE file in the top-level directory of this distribution
|
||||
* and at http://www.gnu.org/licenses/.
|
||||
*
|
||||
*/
|
||||
|
||||
/**********************************************************************************************
|
||||
* File: turbodecoder_simd_inter.h
|
||||
*
|
||||
* Description: Turbo Decoder.
|
||||
* Parallel Concatenated Convolutional Code (PCCC) with two 8-state constituent
|
||||
* encoders and one turbo code internal interleaver. The coding rate of turbo
|
||||
* encoder is 1/3.
|
||||
* MAP_GEN is the MAX-LOG-MAP generic implementation of the decoder.
|
||||
*
|
||||
* Reference: 3GPP TS 36.212 version 10.0.0 Release 10 Sec. 5.1.3.2
|
||||
*********************************************************************************************/
|
||||
|
||||
#ifndef TURBODECODER_SSE_INTER_
|
||||
#define TURBODECODER_SSE_INTER_
|
||||
|
||||
|
||||
/** This is a SIMD inter-frame parallel turbo decoder. Parallelizes 8 code-blocks using SSE
|
||||
* This implementation is currently not functional and not used by the rest of the code
|
||||
*/
|
||||
|
||||
#include "srslte/config.h"
|
||||
#include "srslte/phy/fec/tc_interl.h"
|
||||
#include "srslte/phy/fec/cbsegm.h"
|
||||
|
||||
/* Size of the per-call code-block pointer arrays (input[], output[]) and of
 * the n_iter[] table: 16 entries when LV_HAVE_AVX2 is set, 8 otherwise. */
#if !LV_HAVE_AVX2
#define SRSLTE_TDEC_NPAR 8
#else
#define SRSLTE_TDEC_NPAR 16
#endif
|
||||
|
||||
|
||||
typedef struct SRSLTE_API {
|
||||
int max_long_cb;
|
||||
|
||||
int16_t *syst0;
|
||||
int16_t *parity0;
|
||||
int16_t *syst1;
|
||||
int16_t *parity1;
|
||||
int16_t *llr1;
|
||||
int16_t *llr2;
|
||||
int16_t *w;
|
||||
int16_t *alpha;
|
||||
|
||||
uint32_t max_par_cb;
|
||||
int current_cbidx;
|
||||
uint32_t current_long_cb;
|
||||
srslte_tc_interl_t interleaver[SRSLTE_NOF_TC_CB_SIZES];
|
||||
int n_iter[SRSLTE_TDEC_NPAR];
|
||||
} srslte_tdec_simd_inter_t;
|
||||
|
||||
SRSLTE_API int srslte_tdec_simd_inter_init(srslte_tdec_simd_inter_t * h,
|
||||
uint32_t max_par_cb,
|
||||
uint32_t max_long_cb);
|
||||
|
||||
SRSLTE_API void srslte_tdec_simd_inter_free(srslte_tdec_simd_inter_t * h);
|
||||
|
||||
SRSLTE_API int srslte_tdec_simd_inter_reset(srslte_tdec_simd_inter_t * h,
|
||||
uint32_t long_cb);
|
||||
|
||||
SRSLTE_API int srslte_tdec_simd_inter_get_nof_iterations_cb(srslte_tdec_simd_inter_t * h,
|
||||
uint32_t cb_idx);
|
||||
|
||||
SRSLTE_API int srslte_tdec_simd_inter_reset_cb(srslte_tdec_simd_inter_t * h,
|
||||
uint32_t cb_idx);
|
||||
|
||||
SRSLTE_API void srslte_tdec_simd_inter_iteration(srslte_tdec_simd_inter_t * h,
|
||||
int16_t * input[SRSLTE_TDEC_NPAR],
|
||||
uint32_t nof_cb,
|
||||
uint32_t long_cb);
|
||||
|
||||
SRSLTE_API void srslte_tdec_simd_inter_decision(srslte_tdec_simd_inter_t * h,
|
||||
uint8_t *output[SRSLTE_TDEC_NPAR],
|
||||
uint32_t nof_cb,
|
||||
uint32_t long_cb);
|
||||
|
||||
SRSLTE_API void srslte_tdec_simd_inter_decision_byte(srslte_tdec_simd_inter_t * h,
|
||||
uint8_t *output[SRSLTE_TDEC_NPAR],
|
||||
uint32_t nof_cb,
|
||||
uint32_t long_cb);
|
||||
|
||||
SRSLTE_API void srslte_tdec_simd_inter_decision_byte_cb(srslte_tdec_simd_inter_t * h,
|
||||
uint8_t *output,
|
||||
uint32_t cbidx,
|
||||
uint32_t long_cb);
|
||||
|
||||
SRSLTE_API int srslte_tdec_simd_inter_run_all(srslte_tdec_simd_inter_t * h,
|
||||
int16_t *input[SRSLTE_TDEC_NPAR],
|
||||
uint8_t *output[SRSLTE_TDEC_NPAR],
|
||||
uint32_t nof_iterations,
|
||||
uint32_t nof_cb,
|
||||
uint32_t long_cb);
|
||||
|
||||
#endif
|
@ -0,0 +1,299 @@
|
||||
/**
|
||||
*
|
||||
* \section COPYRIGHT
|
||||
*
|
||||
* Copyright 2013-2015 Software Radio Systems Limited
|
||||
*
|
||||
* \section LICENSE
|
||||
*
|
||||
* This file is part of the srsLTE library.
|
||||
*
|
||||
* srsLTE is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU Affero General Public License as
|
||||
* published by the Free Software Foundation, either version 3 of
|
||||
* the License, or (at your option) any later version.
|
||||
*
|
||||
* srsLTE is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU Affero General Public License for more details.
|
||||
*
|
||||
* A copy of the GNU Affero General Public License can be found in
|
||||
* the LICENSE file in the top-level directory of this distribution
|
||||
* and at http://www.gnu.org/licenses/.
|
||||
*
|
||||
*/
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
#include <strings.h>
|
||||
#include <math.h>
|
||||
|
||||
#include "srslte/phy/fec/turbodecoder_simd_inter.h"
|
||||
#include "srslte/phy/utils/vector.h"
|
||||
|
||||
#define TOTALTAIL 12
|
||||
|
||||
#ifdef LV_HAVE_SSE
|
||||
#include <smmintrin.h>
|
||||
|
||||
void map_see_inter_alpha(srslte_tdec_simd_inter_t * s, int16_t *input, int16_t *parity, uint32_t long_cb);
|
||||
void map_sse_inter_beta(srslte_tdec_simd_inter_t * s, int16_t *input, int16_t *parity, int16_t * output, uint32_t long_cb);
|
||||
void sse_inter_update_w(srslte_tdec_simd_inter_t *h, uint16_t *deinter, uint32_t long_cb);
|
||||
void sse_inter_extract_syst1(srslte_tdec_simd_inter_t *h, uint16_t *inter, uint32_t long_cb);
|
||||
|
||||
|
||||
/* One constituent MAP decoder pass over the interleaved buffers: the alpha
 * pass fills h->alpha, then the beta pass consumes it and writes output. */
static void map_sse_inter_dec(srslte_tdec_simd_inter_t *h,
                              int16_t *input,
                              int16_t *parity,
                              int16_t *output,
                              uint32_t long_cb)
{
  /* The beta pass reads the metrics stored by the alpha pass, so order matters. */
  map_see_inter_alpha(h, input, parity, long_cb);
  map_sse_inter_beta(h, input, parity, output, long_cb);
}
|
||||
|
||||
/************************************************
|
||||
*
|
||||
* TURBO DECODER INTERFACE
|
||||
*
|
||||
************************************************/
|
||||
/*
 * Initialises the decoder object: zeroes it, allocates every work buffer and
 * generates one interleaver per supported code-block size.
 *
 * Returns 0 on success. On any failure everything allocated so far is released
 * through srslte_tdec_simd_inter_free() and -1 is returned.
 */
int srslte_tdec_simd_inter_init(srslte_tdec_simd_inter_t * h, uint32_t max_par_cb, uint32_t max_long_cb)
{
  bzero(h, sizeof(srslte_tdec_simd_inter_t));

  h->max_long_cb = max_long_cb;
  h->max_par_cb  = max_par_cb;

  /* Rows per buffer: the code block itself plus 12 extra rows. */
  const uint32_t len = max_long_cb + 12;

  /* These seven buffers all share the same size; allocate them in order. */
  int16_t **work_bufs[] = {
    &h->llr1, &h->llr2, &h->w, &h->syst0, &h->syst1, &h->parity0, &h->parity1
  };
  const size_t nof_work_bufs = sizeof(work_bufs) / sizeof(work_bufs[0]);

  for (size_t b = 0; b < nof_work_bufs; b++) {
    *work_bufs[b] = srslte_vec_malloc(sizeof(int16_t) * len * h->max_par_cb);
    if (*work_bufs[b] == NULL) {
      perror("srslte_vec_malloc");
      srslte_tdec_simd_inter_free(h);
      return -1;
    }
  }

  /* The alpha buffer keeps 8 state metrics per row. */
  h->alpha = srslte_vec_malloc(sizeof(int16_t) * 8*(len+12) * h->max_par_cb);
  if (h->alpha == NULL) {
    perror("srslte_vec_malloc");
    srslte_tdec_simd_inter_free(h);
    return -1;
  }

  for (int size_idx = 0; size_idx < SRSLTE_NOF_TC_CB_SIZES; size_idx++) {
    srslte_tc_interl_t *il = &h->interleaver[size_idx];
    if (srslte_tc_interl_init(il, srslte_cbsegm_cbsize(size_idx)) < 0) {
      srslte_tdec_simd_inter_free(h);
      return -1;
    }
    srslte_tc_interl_LTE_gen(il, srslte_cbsegm_cbsize(size_idx));
  }

  /* No code-block size is selected until srslte_tdec_simd_inter_reset() runs. */
  h->current_cbidx = -1;
  return 0;
}
|
||||
|
||||
/*
 * Releases every buffer and interleaver owned by the decoder and leaves the
 * object zeroed, so that it is safe to call on a partially initialised object
 * that was zeroed beforehand.
 */
void srslte_tdec_simd_inter_free(srslte_tdec_simd_inter_t * h)
{
  /* free(NULL) is a no-op, so buffers that were never allocated need no guard. */
  free(h->llr1);
  free(h->llr2);
  free(h->w);
  free(h->syst0);
  free(h->syst1);
  free(h->parity0);
  free(h->parity1);
  free(h->alpha);

  for (int size_idx = 0; size_idx < SRSLTE_NOF_TC_CB_SIZES; size_idx++) {
    srslte_tc_interl_free(&h->interleaver[size_idx]);
  }

  /* Clear all pointers and counters so a later free/init starts from scratch. */
  bzero(h, sizeof(srslte_tdec_simd_inter_t));
}
|
||||
|
||||
|
||||
/* Deinterleave for inter-frame parallelization */
|
||||
void extract_input(srslte_tdec_simd_inter_t *h, int16_t *input, uint32_t cbidx, uint32_t long_cb)
|
||||
{
|
||||
for (int i=0;i<long_cb;i++) {
|
||||
h->syst0[h->max_par_cb*i+cbidx] = input[3*i+0];
|
||||
h->parity0[h->max_par_cb*i+cbidx] = input[3*i+1];
|
||||
h->parity1[h->max_par_cb*i+cbidx] = input[3*i+2];
|
||||
}
|
||||
for (int i = long_cb; i < long_cb + 3; i++) {
|
||||
h->syst0[h->max_par_cb*i+cbidx] = input[3*long_cb + 2*(i - long_cb)];
|
||||
h->syst1[h->max_par_cb*i+cbidx] = input[3*long_cb + 2*(i - long_cb)];
|
||||
h->parity0[h->max_par_cb*i+cbidx] = input[3*long_cb + 2*(i - long_cb) + 1];
|
||||
h->parity0[h->max_par_cb*i+cbidx] = input[3*long_cb + 2*(i - long_cb) + 2];
|
||||
}
|
||||
}
|
||||
|
||||
/*
 * Runs one turbo iteration over nof_cb code blocks of length long_cb:
 * MAP decoder #1, MAP decoder #2 on the interleaved data, then the a-priori
 * update. input[i] is the soft-bit buffer of code block i.
 *
 * Prints an error and does nothing if srslte_tdec_simd_inter_reset() has not
 * selected a code-block size yet.
 */
void srslte_tdec_simd_inter_iteration(srslte_tdec_simd_inter_t * h, int16_t *input[SRSLTE_TDEC_NPAR], uint32_t nof_cb, uint32_t long_cb)
{
  if (h->current_cbidx < 0) {
    fprintf(stderr, "Error CB index not set (call srslte_tdec_simd_inter_reset() first\n");
    return;
  }

  uint16_t *inter   = h->interleaver[h->current_cbidx].forward;
  uint16_t *deinter = h->interleaver[h->current_cbidx].reverse;

  // Prepare systematic and parity bits for MAP DEC #1
  for (uint32_t i = 0; i < nof_cb; i++) {
    if (h->n_iter[i] == 0) {
      extract_input(h, input[i], i, long_cb);
    }
  }

  // Add the a-priori LLRs exactly once. The call covers every lane of every
  // row, so it must not sit inside the per-code-block loop, where it added w
  // up to nof_cb times.
  // NOTE(review): n_iter[] is not incremented anywhere in view, so the input is
  // re-extracted on every iteration; if that changes, syst0 would accumulate w.
  srslte_vec_sum_sss(h->syst0, h->w, h->syst0, long_cb*h->max_par_cb);

  // Run MAP DEC #1
  map_sse_inter_dec(h, h->syst0, h->parity0, h->llr1, long_cb);

  // Prepare systematic and parity bits for MAP DEC #2
  sse_inter_extract_syst1(h, inter, long_cb);

  // Run MAP DEC #2
  map_sse_inter_dec(h, h->syst1, h->parity1, h->llr2, long_cb);

  // Update a-priori LLR from the last iteration
  sse_inter_update_w(h, deinter, long_cb);
}
|
||||
|
||||
/*
 * Clears the a-priori LLRs of a single code block, i.e. lane cb_idx of the
 * first current_long_cb rows of h->w.
 *
 * Returns 0 on success, -1 if cb_idx is not a valid lane (the write would fall
 * outside the lane layout used by the work buffers).
 */
int srslte_tdec_simd_inter_reset_cb(srslte_tdec_simd_inter_t * h, uint32_t cb_idx)
{
  if (cb_idx >= h->max_par_cb) {
    fprintf(stderr, "Invalid CB index %u (TDEC was initialized for max_par_cb=%u)\n",
            (unsigned) cb_idx, (unsigned) h->max_par_cb);
    return -1;
  }

  for (uint32_t i = 0; i < h->current_long_cb; i++) {
    h->w[h->max_par_cb*i+cb_idx] = 0;
  }
  return 0;
}
|
||||
|
||||
/*
 * Selects the code-block length for the following iterations and clears the
 * a-priori LLRs of the first long_cb rows.
 *
 * Returns 0 on success, -1 if long_cb is larger than the length the decoder
 * was initialised for or is not a valid code-block size. In the latter case
 * current_cbidx is left negative, so the iteration step refuses to run.
 */
int srslte_tdec_simd_inter_reset(srslte_tdec_simd_inter_t * h, uint32_t long_cb)
{
  /* max_long_cb is stored as int; make the unsigned comparison explicit. */
  if (long_cb > (uint32_t) h->max_long_cb) {
    fprintf(stderr, "TDEC was initialized for max_long_cb=%d\n",
            h->max_long_cb);
    return -1;
  }
  h->current_long_cb = long_cb;
  h->current_cbidx = srslte_cbsegm_cbindex(long_cb);
  if (h->current_cbidx < 0) {
    /* long_cb is unsigned: print it with a matching conversion specifier. */
    fprintf(stderr, "Invalid CB length %u\n", (unsigned) long_cb);
    return -1;
  }
  memset(h->w, 0, sizeof(int16_t) * long_cb * h->max_par_cb);
  return 0;
}
|
||||
|
||||
/*
 * Hard decision for one code block, one bit per output byte: output[i] is 1
 * when the de-interleaved LLR of decoder #2 for bit i is positive, else 0.
 */
void srslte_tdec_simd_inter_decision_cb(srslte_tdec_simd_inter_t * h, uint8_t *output, uint32_t cb_idx, uint32_t long_cb)
{
  const uint16_t *deinter = h->interleaver[h->current_cbidx].reverse;
  const uint32_t  stride  = h->max_par_cb;

  for (uint32_t bit = 0; bit < long_cb; bit++) {
    /* llr2 is stored in interleaved order; deinter[] maps it back. */
    const int16_t llr = h->llr2[stride * deinter[bit] + cb_idx];
    if (llr > 0) {
      output[bit] = 1;
    } else {
      output[bit] = 0;
    }
  }
}
|
||||
|
||||
/* Unpacked hard decision for code blocks 0..nof_cb-1; output[i] receives the
 * bits of code block i. */
void srslte_tdec_simd_inter_decision(srslte_tdec_simd_inter_t * h, uint8_t *output[SRSLTE_TDEC_NPAR], uint32_t nof_cb, uint32_t long_cb)
{
  uint32_t cb = 0;
  while (cb < nof_cb) {
    srslte_tdec_simd_inter_decision_cb(h, output[cb], cb, long_cb);
    cb++;
  }
}
|
||||
|
||||
/*
 * Byte-packed hard decision for one code block: bit 8*i+j of the code block
 * lands in output[i] at mask 0x80 >> j (MSB first) and is set when the
 * de-interleaved LLR of decoder #2 is positive. Only long_cb/8 whole bytes
 * are written.
 */
void srslte_tdec_simd_inter_decision_byte_cb(srslte_tdec_simd_inter_t * h, uint8_t *output, uint32_t cb_idx, uint32_t long_cb)
{
  const uint16_t *deinter   = h->interleaver[h->current_cbidx].reverse;
  const uint32_t  stride    = h->max_par_cb;
  const uint32_t  nof_bytes = long_cb / 8;

  // long_cb is always byte aligned
  for (uint32_t byte = 0; byte < nof_bytes; byte++) {
    uint8_t packed = 0;
    for (uint32_t bit = 0; bit < 8; bit++) {
      if (h->llr2[stride * deinter[8*byte + bit] + cb_idx] > 0) {
        packed |= (uint8_t) (0x80 >> bit);
      }
    }
    output[byte] = packed;
  }
}
|
||||
|
||||
/* Byte-packed hard decision for code blocks 0..nof_cb-1; output[i] receives
 * the packed bits of code block i. */
void srslte_tdec_simd_inter_decision_byte(srslte_tdec_simd_inter_t * h, uint8_t *output[SRSLTE_TDEC_NPAR], uint32_t nof_cb, uint32_t long_cb)
{
  uint32_t cb = 0;
  while (cb < nof_cb) {
    srslte_tdec_simd_inter_decision_byte_cb(h, output[cb], cb, long_cb);
    cb++;
  }
}
|
||||
|
||||
/*
 * Complete decoding run: resets the decoder for long_cb, executes the
 * iteration step until the counter reaches nof_iterations (the step always
 * runs at least once, even for nof_iterations == 0) and writes the
 * byte-packed hard decisions to output.
 *
 * Returns SRSLTE_SUCCESS, or SRSLTE_ERROR when the reset fails.
 */
int srslte_tdec_simd_inter_run_all(srslte_tdec_simd_inter_t * h,
                                   int16_t *input[SRSLTE_TDEC_NPAR], uint8_t *output[SRSLTE_TDEC_NPAR],
                                   uint32_t nof_iterations, uint32_t nof_cb, uint32_t long_cb)
{
  if (srslte_tdec_simd_inter_reset(h, long_cb) != 0) {
    return SRSLTE_ERROR;
  }

  uint32_t done = 0;
  for (;;) {
    srslte_tdec_simd_inter_iteration(h, input, nof_cb, long_cb);
    done++;
    if (done >= nof_iterations) {
      break;
    }
  }

  srslte_tdec_simd_inter_decision_byte(h, output, nof_cb, long_cb);

  return SRSLTE_SUCCESS;
}
|
||||
|
||||
#endif
|
@ -0,0 +1,198 @@
|
||||
/**
|
||||
*
|
||||
* \section COPYRIGHT
|
||||
*
|
||||
* Copyright 2013-2015 Software Radio Systems Limited
|
||||
*
|
||||
* \section LICENSE
|
||||
*
|
||||
* This file is part of the srsLTE library.
|
||||
*
|
||||
* srsLTE is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU Affero General Public License as
|
||||
* published by the Free Software Foundation, either version 3 of
|
||||
* the License, or (at your option) any later version.
|
||||
*
|
||||
* srsLTE is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU Affero General Public License for more details.
|
||||
*
|
||||
* A copy of the GNU Affero General Public License can be found in
|
||||
* the LICENSE file in the top-level directory of this distribution
|
||||
* and at http://www.gnu.org/licenses/.
|
||||
*
|
||||
*/
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
#include <strings.h>
|
||||
#include <math.h>
|
||||
|
||||
#include "srslte/phy/fec/turbodecoder_simd_inter.h"
|
||||
#include "srslte/phy/utils/vector.h"
|
||||
|
||||
|
||||
#define NCB 8
|
||||
|
||||
#define INF 10000
|
||||
|
||||
#ifdef LV_HAVE_SSE
|
||||
#include <smmintrin.h>
|
||||
|
||||
/*
 * Builds the systematic input of MAP decoder #2: for every step i,
 * syst1[i] = llr1[inter[i]] - w[inter[i]], computed on one 128-bit row at a time.
 *
 * NOTE(review): rows are addressed as __m128i, i.e. 8 int16 lanes per row;
 * this presumably requires max_par_cb == 8 -- confirm.
 */
void sse_inter_extract_syst1(srslte_tdec_simd_inter_t *h, uint16_t *inter, uint32_t long_cb)
{
  const __m128i *llr1_rows  = (const __m128i*) h->llr1;
  const __m128i *w_rows     = (const __m128i*) h->w;
  __m128i       *syst1_rows = (__m128i*) h->syst1;

  for (uint32_t step = 0; step < long_cb; step++) {
    const uint16_t src = inter[step];
    const __m128i llr1 = _mm_load_si128(&llr1_rows[src]);
    const __m128i w    = _mm_load_si128(&w_rows[src]);
    _mm_store_si128(&syst1_rows[step], _mm_sub_epi16(llr1, w));
  }
}
|
||||
|
||||
/*
 * Updates the a-priori LLRs after both MAP decoders have run:
 * w[i] += llr2[deinter[i]] - llr1[i], one 128-bit row per step.
 *
 * The result is written back to h->w. It used to be stored into h->syst1,
 * which left w unchanged from one iteration to the next.
 *
 * NOTE(review): rows are addressed as __m128i, i.e. 8 int16 lanes per row;
 * this presumably requires max_par_cb == 8 -- confirm.
 */
void sse_inter_update_w(srslte_tdec_simd_inter_t *h, uint16_t *deinter, uint32_t long_cb)
{
  const __m128i *llr1_rows = (const __m128i*) h->llr1;
  const __m128i *llr2_rows = (const __m128i*) h->llr2;
  __m128i       *w_rows    = (__m128i*) h->w;

  for (uint32_t i = 0; i < long_cb; i++) {
    const __m128i llr1 = _mm_load_si128(&llr1_rows[i]);
    const __m128i w    = _mm_load_si128(&w_rows[i]);
    const __m128i llr2 = _mm_load_si128(&llr2_rows[deinter[i]]);

    _mm_store_si128(&w_rows[i], _mm_add_epi16(w, _mm_sub_epi16(llr2, llr1)));
  }
}
|
||||
|
||||
/* Computes beta values */
|
||||
void map_sse_inter_beta(srslte_tdec_simd_inter_t * s, int16_t *input, int16_t *parity, int16_t * output, uint32_t long_cb)
|
||||
{
|
||||
__m128i m_b[8], new[8], old[8], max1[8], max0[8];
|
||||
__m128i x, y, xy;
|
||||
__m128i m1, m0;
|
||||
uint32_t end = long_cb + 3;
|
||||
uint32_t i;
|
||||
|
||||
__m128i *inputPtr = (__m128i*) input;
|
||||
__m128i *parityPtr = (__m128i*) parity;
|
||||
__m128i *outputPtr = (__m128i*) output;
|
||||
__m128i *alphaPtr = (__m128i*) s->alpha;
|
||||
|
||||
for (int k = end - 1; k >= 0; k--) {
|
||||
x = _mm_load_si128(inputPtr++);
|
||||
y = _mm_load_si128(parityPtr++);
|
||||
|
||||
xy = _mm_add_epi16(x,y);
|
||||
|
||||
m_b[0] = _mm_add_epi16(old[4], xy);
|
||||
m_b[1] = old[4];
|
||||
m_b[2] = _mm_add_epi16(old[5], y);
|
||||
m_b[3] = _mm_add_epi16(old[5], x);
|
||||
m_b[4] = _mm_add_epi16(old[6], x);
|
||||
m_b[5] = _mm_add_epi16(old[6], y);
|
||||
m_b[6] = old[7];
|
||||
m_b[7] = _mm_add_epi16(old[7], xy);
|
||||
|
||||
new[0] = old[0];
|
||||
new[1] = _mm_add_epi16(old[0], xy);
|
||||
new[2] = _mm_add_epi16(old[1], x);
|
||||
new[3] = _mm_add_epi16(old[1], y);
|
||||
new[4] = _mm_add_epi16(old[2], y);
|
||||
new[5] = _mm_add_epi16(old[2], x);
|
||||
new[6] = _mm_add_epi16(old[3], xy);
|
||||
new[7] = old[3];
|
||||
|
||||
for (i = 0; i < 8; i++) {
|
||||
__m128i alpha = _mm_load_si128(alphaPtr++);
|
||||
max0[i] = _mm_add_epi16(alpha, m_b[i]);
|
||||
max1[i] = _mm_add_epi16(alpha, new[i]);
|
||||
}
|
||||
|
||||
m1 = _mm_max_epi16(max1[0], max1[1]);
|
||||
m0 = _mm_max_epi16(max0[0], max0[1]);
|
||||
|
||||
for (i = 2; i < 8; i++) {
|
||||
m1 = _mm_max_epi16(m1, max1[i]);
|
||||
m0 = _mm_max_epi16(m0, max0[i]);
|
||||
}
|
||||
|
||||
for (i = 0; i < 8; i++) {
|
||||
new[i] = _mm_max_epi16(m_b[i], new[i]);
|
||||
old[i] = new[i];
|
||||
}
|
||||
|
||||
__m128i out = _mm_sub_epi16(m1, m0);
|
||||
_mm_store_si128(outputPtr++, out);
|
||||
|
||||
// normalize
|
||||
if ((k%4)==0) {
|
||||
for (int i=1;i<8;i++) {
|
||||
_mm_sub_epi16(old[i], old[0]);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* Computes alpha metrics */
|
||||
void map_see_inter_alpha(srslte_tdec_simd_inter_t * s, int16_t *input, int16_t *parity, uint32_t long_cb)
|
||||
{
|
||||
__m128i m_b[8], new[8], old[8];
|
||||
__m128i x, y, xy;
|
||||
uint32_t k;
|
||||
|
||||
__m128i *inputPtr = (__m128i*) input;
|
||||
__m128i *parityPtr = (__m128i*) parity;
|
||||
__m128i *alphaPtr = (__m128i*) s->alpha;
|
||||
|
||||
old[0] = _mm_set1_epi16(0);
|
||||
for (int i = 1; i < 8; i++) {
|
||||
old[i] = _mm_set1_epi16(-INF);
|
||||
}
|
||||
|
||||
for (k = 0; k < long_cb; k++) {
|
||||
x = _mm_load_si128(inputPtr++);
|
||||
y = _mm_load_si128(parityPtr++);
|
||||
|
||||
xy = _mm_add_epi16(x,y);
|
||||
|
||||
m_b[0] = old[0];
|
||||
m_b[1] = _mm_add_epi16(old[3], y);
|
||||
m_b[2] = _mm_add_epi16(old[4], y);
|
||||
m_b[3] = old[7];
|
||||
m_b[4] = old[1];
|
||||
m_b[5] = _mm_add_epi16(old[2], y);
|
||||
m_b[6] = _mm_add_epi16(old[5], y);
|
||||
m_b[7] = old[6];
|
||||
|
||||
new[0] = _mm_add_epi16(old[1], xy);
|
||||
new[1] = _mm_add_epi16(old[2], x);
|
||||
new[2] = _mm_add_epi16(old[5], x);
|
||||
new[3] = _mm_add_epi16(old[6], xy);
|
||||
new[4] = _mm_add_epi16(old[0], xy);
|
||||
new[5] = _mm_add_epi16(old[3], x);
|
||||
new[6] = _mm_add_epi16(old[4], x);
|
||||
new[7] = _mm_add_epi16(old[7], xy);
|
||||
|
||||
for (int i = 0; i < 8; i++) {
|
||||
new[i] = _mm_max_epi16(m_b[i], new[i]);
|
||||
old[i] = new[i];
|
||||
_mm_store_si128(alphaPtr++, old[i]);
|
||||
}
|
||||
|
||||
// normalize
|
||||
if ((k%4)==0) {
|
||||
for (int i=1;i<8;i++) {
|
||||
_mm_sub_epi16(old[i], old[0]);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
Loading…
Reference in New Issue