/* -*- c++ -*- */
/*
 * Copyright 2015,2016 Free Software Foundation, Inc.
 *
 * SPDX-License-Identifier: GPL-3.0-or-later
 *
 */

#ifdef HAVE_CONFIG_H
#include "config.h"
#endif

#include "dvbt_viterbi_decoder_impl.h"
#include <gnuradio/io_signature.h>

namespace gr {
namespace dtv {

/*
 * Puncturing patterns, one table per code rate handled by the constructor.
 * Each table holds 2*k entries and is indexed cyclically (modulo 2*k) by the
 * depuncturing loop in general_work():
 *   1 -> a received bit is taken from the input stream for this position
 *   0 -> the position was punctured; an erasure marker (value 2) is inserted
 */
const unsigned char dvbt_viterbi_decoder_impl::d_puncture_1_2[2] = { 1, 1 };
const unsigned char dvbt_viterbi_decoder_impl::d_puncture_2_3[4] = { 1, 1, 0, 1 };
const unsigned char dvbt_viterbi_decoder_impl::d_puncture_3_4[6] = { 1, 1, 0, 1, 1, 0 };
const unsigned char dvbt_viterbi_decoder_impl::d_puncture_5_6[10] = { 1, 1, 0, 1, 1,
                                                                      0, 0, 1, 1, 0 };
const unsigned char dvbt_viterbi_decoder_impl::d_puncture_7_8[14] = {
    1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0
};
/*
 * 8-bit parity lookup table, generated by partab.c.
 * The chunk-init routine indexes it with a polynomial-masked state value to
 * fill the branch tables.
 */
const unsigned char dvbt_viterbi_decoder_impl::d_Partab[] = {
    0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1,
    0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0,
    0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0,
    1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1,
    0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0,
    1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1,
    1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0,
    1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0,
    0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0,
};

/*
 * Storage for the decoder's working state (static data members).
 * NOTE(review): because these are static, every decoder instance in the
 * process shares the same metrics, paths and result buffers - confirm that
 * only one instance is expected to run at a time.
 */
#ifdef DTV_SSE2
// Path metrics and survivor paths, 4 x 128 bit = 64 byte lanes each.
// The butterfly routine ping-pongs between set 0 and set 1.
__GR_ATTR_ALIGNED(16) __m128i dvbt_viterbi_decoder_impl::d_metric0[4];
__GR_ATTR_ALIGNED(16) __m128i dvbt_viterbi_decoder_impl::d_metric1[4];
__GR_ATTR_ALIGNED(16) __m128i dvbt_viterbi_decoder_impl::d_path0[4];
__GR_ATTR_ALIGNED(16) __m128i dvbt_viterbi_decoder_impl::d_path1[4];
#else
// Byte-array equivalents of the vectors above for the non-SSE2 build.
__GR_ATTR_ALIGNED(16) unsigned char dvbt_viterbi_decoder_impl::d_metric0_generic[64];
__GR_ATTR_ALIGNED(16) unsigned char dvbt_viterbi_decoder_impl::d_metric1_generic[64];
__GR_ATTR_ALIGNED(16) unsigned char dvbt_viterbi_decoder_impl::d_path0_generic[64];
__GR_ATTR_ALIGNED(16) unsigned char dvbt_viterbi_decoder_impl::d_path1_generic[64];
#endif

// Branch tables, one per generator polynomial (POLYA, POLYB); filled by the
// chunk-init routine and read by the butterfly routine.
#ifdef DTV_SSE2
__GR_ATTR_ALIGNED(16) branchtab27 dvbt_viterbi_decoder_impl::Branchtab27_sse2[2];
#else
__GR_ATTR_ALIGNED(16) branchtab27 dvbt_viterbi_decoder_impl::Branchtab27_generic[2];
#endif

// Byte-addressable copy of the current metrics, used to search for the best
// and the smallest metric when an output byte is produced.
__GR_ATTR_ALIGNED(16) unsigned char dvbt_viterbi_decoder_impl::mmresult[64];
// Circular history of path bytes, one row of 64 entries per stored step.
__GR_ATTR_ALIGNED(16)
unsigned char dvbt_viterbi_decoder_impl::ppresult[TRACEBACK_MAX][64];

#ifdef DTV_SSE2
void dvbt_viterbi_decoder_impl::dvbt_viterbi_chunks_init_sse2(__m128i* mm0, __m128i* pp0)
{
#else
void dvbt_viterbi_decoder_impl::dvbt_viterbi_chunks_init_generic(unsigned char* mm0,
                                                                 unsigned char* pp0)
{
#endif
    // Initialize starting metrics to prefer 0 state
    int i, j;

#ifdef DTV_SSE2
    for (i = 0; i < 4; i++) {
        mm0[i] = _mm_setzero_si128();
        pp0[i] = _mm_setzero_si128();
    }

    int polys[2] = { POLYA, POLYB };
    for (i = 0; i < 32; i++) {
        Branchtab27_sse2[0].c[i] =
            (polys[0] < 0) ^ d_Partab[(2 * i) & abs(polys[0])] ? 1 : 0;
        Branchtab27_sse2[1].c[i] =
            (polys[1] < 0) ^ d_Partab[(2 * i) & abs(polys[1])] ? 1 : 0;
    }
#else
    for (i = 0; i < 64; i++) {
        mm0[i] = 0;
        pp0[i] = 0;
    }

    int polys[2] = { POLYA, POLYB };
    for (i = 0; i < 32; i++) {
        Branchtab27_generic[0].c[i] =
            (polys[0] < 0) ^ d_Partab[(2 * i) & abs(polys[0])] ? 1 : 0;
        Branchtab27_generic[1].c[i] =
            (polys[1] < 0) ^ d_Partab[(2 * i) & abs(polys[1])] ? 1 : 0;
    }
#endif

    for (i = 0; i < 64; i++) {
        mmresult[i] = 0;
        for (j = 0; j < TRACEBACK_MAX; j++) {
            ppresult[j][i] = 0;
        }
    }
}

#ifdef DTV_SSE2
/*
 * Run two add-compare-select passes over all 64 states (SSE2 build).
 *
 * symbols: four input values; symbols[0..1] drive the first pass and
 *          symbols[2..3] the second. The value 2 is tested for explicitly and
 *          makes the pass ignore that symbol (general_work() writes 2 at
 *          depunctured positions).
 * mm0/pp0: metrics and paths read by the first pass and overwritten by the
 *          second, so the final result ends up here.
 * mm1/pp1: scratch set written by the first pass and read by the second.
 */
void dvbt_viterbi_decoder_impl::dvbt_viterbi_butterfly2_sse2(
    unsigned char* symbols, __m128i* mm0, __m128i* mm1, __m128i* pp0, __m128i* pp1)
{
    int i;

    __m128i *metric0, *metric1;
    __m128i *path0, *path1;

    // First pass: read set 0, write set 1
    metric0 = mm0;
    path0 = pp0;
    metric1 = mm1;
    path1 = pp1;

    // Operate on 4 symbols (2 bits) at a time

    __m128i m0, m1, m2, m3, decision0, decision1, survivor0, survivor1;
    __m128i metsv, metsvm;
    __m128i shift0, shift1;
    __m128i tmp0, tmp1;
    __m128i sym0v, sym1v;

    // Broadcast the two symbols of this pass to all 16 byte lanes
    sym0v = _mm_set1_epi8(symbols[0]);
    sym1v = _mm_set1_epi8(symbols[1]);

    for (i = 0; i < 2; i++) {
        // metsvm counts the mismatches between the branch table and the
        // symbols; metsv is the complementary count (maximum minus metsvm).
        if (symbols[0] == 2) {
            // First symbol skipped: only the second one contributes
            metsvm = _mm_xor_si128(Branchtab27_sse2[1].v[i], sym1v);
            metsv = _mm_sub_epi8(_mm_set1_epi8(1), metsvm);
        } else if (symbols[1] == 2) {
            // Second symbol skipped: only the first one contributes
            metsvm = _mm_xor_si128(Branchtab27_sse2[0].v[i], sym0v);
            metsv = _mm_sub_epi8(_mm_set1_epi8(1), metsvm);
        } else {
            metsvm = _mm_add_epi8(_mm_xor_si128(Branchtab27_sse2[0].v[i], sym0v),
                                  _mm_xor_si128(Branchtab27_sse2[1].v[i], sym1v));
            metsv = _mm_sub_epi8(_mm_set1_epi8(2), metsvm);
        }

        // Candidate metrics: vector i and vector i + 2 are the two
        // predecessors; m0/m1 compete for one successor, m2/m3 for the other.
        m0 = _mm_add_epi8(metric0[i], metsv);
        m1 = _mm_add_epi8(metric0[i + 2], metsvm);
        m2 = _mm_add_epi8(metric0[i], metsvm);
        m3 = _mm_add_epi8(metric0[i + 2], metsv);

        // decision lanes are all-ones where the 8-bit signed difference is
        // positive, i.e. where the first candidate is kept.
        decision0 = _mm_cmpgt_epi8(_mm_sub_epi8(m0, m1), _mm_setzero_si128());
        decision1 = _mm_cmpgt_epi8(_mm_sub_epi8(m2, m3), _mm_setzero_si128());
        survivor0 =
            _mm_or_si128(_mm_and_si128(decision0, m0), _mm_andnot_si128(decision0, m1));
        survivor1 =
            _mm_or_si128(_mm_and_si128(decision1, m2), _mm_andnot_si128(decision1, m3));

        // Shift the predecessor paths left by one bit (performed on 16-bit
        // lanes); the path coming from vector i + 2 additionally gets +1.
        shift0 = _mm_slli_epi16(path0[i], 1);
        shift1 = _mm_slli_epi16(path0[2 + i], 1);
        shift1 = _mm_add_epi8(shift1, _mm_set1_epi8(1));

        // Interleave both survivor vectors byte-wise into two output vectors
        metric1[2 * i] = _mm_unpacklo_epi8(survivor0, survivor1);
        tmp0 = _mm_or_si128(_mm_and_si128(decision0, shift0),
                            _mm_andnot_si128(decision0, shift1));

        metric1[2 * i + 1] = _mm_unpackhi_epi8(survivor0, survivor1);
        tmp1 = _mm_or_si128(_mm_and_si128(decision1, shift0),
                            _mm_andnot_si128(decision1, shift1));

        // Same interleaving for the selected paths
        path1[2 * i] = _mm_unpacklo_epi8(tmp0, tmp1);
        path1[2 * i + 1] = _mm_unpackhi_epi8(tmp0, tmp1);
    }

    // Second pass: read set 1, write back into set 0
    metric0 = mm1;
    path0 = pp1;
    metric1 = mm0;
    path1 = pp0;

    sym0v = _mm_set1_epi8(symbols[2]);
    sym1v = _mm_set1_epi8(symbols[3]);

    // Identical add-compare-select sequence, now driven by symbols[2..3]
    for (i = 0; i < 2; i++) {
        if (symbols[2] == 2) {
            metsvm = _mm_xor_si128(Branchtab27_sse2[1].v[i], sym1v);
            metsv = _mm_sub_epi8(_mm_set1_epi8(1), metsvm);
        } else if (symbols[3] == 2) {
            metsvm = _mm_xor_si128(Branchtab27_sse2[0].v[i], sym0v);
            metsv = _mm_sub_epi8(_mm_set1_epi8(1), metsvm);
        } else {
            metsvm = _mm_add_epi8(_mm_xor_si128(Branchtab27_sse2[0].v[i], sym0v),
                                  _mm_xor_si128(Branchtab27_sse2[1].v[i], sym1v));
            metsv = _mm_sub_epi8(_mm_set1_epi8(2), metsvm);
        }

        m0 = _mm_add_epi8(metric0[i], metsv);
        m1 = _mm_add_epi8(metric0[i + 2], metsvm);
        m2 = _mm_add_epi8(metric0[i], metsvm);
        m3 = _mm_add_epi8(metric0[i + 2], metsv);

        decision0 = _mm_cmpgt_epi8(_mm_sub_epi8(m0, m1), _mm_setzero_si128());
        decision1 = _mm_cmpgt_epi8(_mm_sub_epi8(m2, m3), _mm_setzero_si128());
        survivor0 =
            _mm_or_si128(_mm_and_si128(decision0, m0), _mm_andnot_si128(decision0, m1));
        survivor1 =
            _mm_or_si128(_mm_and_si128(decision1, m2), _mm_andnot_si128(decision1, m3));

        shift0 = _mm_slli_epi16(path0[i], 1);
        shift1 = _mm_slli_epi16(path0[2 + i], 1);
        shift1 = _mm_add_epi8(shift1, _mm_set1_epi8(1));

        metric1[2 * i] = _mm_unpacklo_epi8(survivor0, survivor1);
        tmp0 = _mm_or_si128(_mm_and_si128(decision0, shift0),
                            _mm_andnot_si128(decision0, shift1));

        metric1[2 * i + 1] = _mm_unpackhi_epi8(survivor0, survivor1);
        tmp1 = _mm_or_si128(_mm_and_si128(decision1, shift0),
                            _mm_andnot_si128(decision1, shift1));

        path1[2 * i] = _mm_unpacklo_epi8(tmp0, tmp1);
        path1[2 * i + 1] = _mm_unpackhi_epi8(tmp0, tmp1);
    }
}
#else
/*
 * Run two add-compare-select passes over all 64 states (non-SSE2 build).
 * The 64-byte buffers are processed as four groups of 16 bytes, with plain
 * byte loops standing in for 128-bit vector operations.
 *
 * symbols: four input values; symbols[0..1] drive the first pass and
 *          symbols[2..3] the second. The value 2 is tested for explicitly and
 *          makes the pass ignore that symbol (general_work() writes 2 at
 *          depunctured positions).
 * mm0/pp0: metrics and paths read by the first pass and overwritten by the
 *          second, so the final result ends up here.
 * mm1/pp1: scratch set written by the first pass and read by the second.
 */
void dvbt_viterbi_decoder_impl::dvbt_viterbi_butterfly2_generic(unsigned char* symbols,
                                                                unsigned char* mm0,
                                                                unsigned char* mm1,
                                                                unsigned char* pp0,
                                                                unsigned char* pp1)
{
    int i, j, k;

    unsigned char *metric0, *metric1;
    unsigned char *path0, *path1;

    // First pass: read set 0, write set 1
    metric0 = mm0;
    path0 = pp0;
    metric1 = mm1;
    path1 = pp1;

    // Operate on 4 symbols (2 bits) at a time

    unsigned char m0[16], m1[16], m2[16], m3[16], decision0[16], decision1[16],
        survivor0[16], survivor1[16];
    unsigned char metsv[16], metsvm[16];
    unsigned char shift0[16], shift1[16];
    unsigned char tmp0[16], tmp1[16];
    unsigned char sym0v[16], sym1v[16];
    // Scratch value used to shift two adjacent path bytes as one 16-bit word
    unsigned short simd_epi16;

    // Replicate the two symbols of this pass across all 16 lanes
    for (j = 0; j < 16; j++) {
        sym0v[j] = symbols[0];
        sym1v[j] = symbols[1];
    }

    for (i = 0; i < 2; i++) {
        // metsvm counts the mismatches between the branch table and the
        // symbols; metsv is the complementary count (maximum minus metsvm).
        if (symbols[0] == 2) {
            // First symbol skipped: only the second one contributes
            for (j = 0; j < 16; j++) {
                metsvm[j] = Branchtab27_generic[1].c[(i * 16) + j] ^ sym1v[j];
                metsv[j] = 1 - metsvm[j];
            }
        } else if (symbols[1] == 2) {
            // Second symbol skipped: only the first one contributes
            for (j = 0; j < 16; j++) {
                metsvm[j] = Branchtab27_generic[0].c[(i * 16) + j] ^ sym0v[j];
                metsv[j] = 1 - metsvm[j];
            }
        } else {
            for (j = 0; j < 16; j++) {
                metsvm[j] = (Branchtab27_generic[0].c[(i * 16) + j] ^ sym0v[j]) +
                            (Branchtab27_generic[1].c[(i * 16) + j] ^ sym1v[j]);
                metsv[j] = 2 - metsvm[j];
            }
        }

        // Candidate metrics: group i and group i + 2 are the two
        // predecessors; m0/m1 compete for one successor, m2/m3 for the other.
        for (j = 0; j < 16; j++) {
            m0[j] = metric0[(i * 16) + j] + metsv[j];
            m1[j] = metric0[((i + 2) * 16) + j] + metsvm[j];
            m2[j] = metric0[(i * 16) + j] + metsvm[j];
            m3[j] = metric0[((i + 2) * 16) + j] + metsv[j];
        }

        // decision is 0xff where the first candidate is kept, 0x00 otherwise.
        // NOTE(review): the operands are promoted to int, so this is a plain
        // numeric comparison rather than an 8-bit wrapped signed one - confirm
        // it matches the SSE2 build when metrics are large.
        for (j = 0; j < 16; j++) {
            decision0[j] = ((m0[j] - m1[j]) > 0) ? 0xff : 0x0;
            decision1[j] = ((m2[j] - m3[j]) > 0) ? 0xff : 0x0;
            survivor0[j] = (decision0[j] & m0[j]) | ((~decision0[j]) & m1[j]);
            survivor1[j] = (decision1[j] & m2[j]) | ((~decision1[j]) & m3[j]);
        }

        // Shift the predecessor paths left by one bit, treating each pair of
        // bytes as a little-endian 16-bit word so the carry crosses the byte
        // boundary inside a pair.
        for (j = 0; j < 16; j += 2) {
            simd_epi16 = path0[(i * 16) + j];
            simd_epi16 |= path0[(i * 16) + (j + 1)] << 8;
            simd_epi16 <<= 1;
            shift0[j] = simd_epi16;
            shift0[j + 1] = simd_epi16 >> 8;

            simd_epi16 = path0[((i + 2) * 16) + j];
            simd_epi16 |= path0[((i + 2) * 16) + (j + 1)] << 8;
            simd_epi16 <<= 1;
            shift1[j] = simd_epi16;
            shift1[j + 1] = simd_epi16 >> 8;
        }
        // The path coming from group i + 2 additionally gets +1 per byte
        for (j = 0; j < 16; j++) {
            shift1[j] = shift1[j] + 1;
        }

        // Interleave the low halves of both survivor arrays into group 2*i
        for (j = 0, k = 0; j < 16; j += 2, k++) {
            metric1[(2 * i * 16) + j] = survivor0[k];
            metric1[(2 * i * 16) + (j + 1)] = survivor1[k];
        }
        for (j = 0; j < 16; j++) {
            tmp0[j] = (decision0[j] & shift0[j]) | ((~decision0[j]) & shift1[j]);
        }

        // Interleave the high halves into group 2*i + 1
        for (j = 0, k = 8; j < 16; j += 2, k++) {
            metric1[((2 * i + 1) * 16) + j] = survivor0[k];
            metric1[((2 * i + 1) * 16) + (j + 1)] = survivor1[k];
        }
        for (j = 0; j < 16; j++) {
            tmp1[j] = (decision1[j] & shift0[j]) | ((~decision1[j]) & shift1[j]);
        }

        // Same interleaving for the selected paths
        for (j = 0, k = 0; j < 16; j += 2, k++) {
            path1[(2 * i * 16) + j] = tmp0[k];
            path1[(2 * i * 16) + (j + 1)] = tmp1[k];
        }
        for (j = 0, k = 8; j < 16; j += 2, k++) {
            path1[((2 * i + 1) * 16) + j] = tmp0[k];
            path1[((2 * i + 1) * 16) + (j + 1)] = tmp1[k];
        }
    }

    // Second pass: read set 1, write back into set 0
    metric0 = mm1;
    path0 = pp1;
    metric1 = mm0;
    path1 = pp0;

    for (j = 0; j < 16; j++) {
        sym0v[j] = symbols[2];
        sym1v[j] = symbols[3];
    }

    // Identical add-compare-select sequence, now driven by symbols[2..3]
    for (i = 0; i < 2; i++) {
        if (symbols[2] == 2) {
            for (j = 0; j < 16; j++) {
                metsvm[j] = Branchtab27_generic[1].c[(i * 16) + j] ^ sym1v[j];
                metsv[j] = 1 - metsvm[j];
            }
        } else if (symbols[3] == 2) {
            for (j = 0; j < 16; j++) {
                metsvm[j] = Branchtab27_generic[0].c[(i * 16) + j] ^ sym0v[j];
                metsv[j] = 1 - metsvm[j];
            }
        } else {
            for (j = 0; j < 16; j++) {
                metsvm[j] = (Branchtab27_generic[0].c[(i * 16) + j] ^ sym0v[j]) +
                            (Branchtab27_generic[1].c[(i * 16) + j] ^ sym1v[j]);
                metsv[j] = 2 - metsvm[j];
            }
        }

        for (j = 0; j < 16; j++) {
            m0[j] = metric0[(i * 16) + j] + metsv[j];
            m1[j] = metric0[((i + 2) * 16) + j] + metsvm[j];
            m2[j] = metric0[(i * 16) + j] + metsvm[j];
            m3[j] = metric0[((i + 2) * 16) + j] + metsv[j];
        }

        for (j = 0; j < 16; j++) {
            decision0[j] = ((m0[j] - m1[j]) > 0) ? 0xff : 0x0;
            decision1[j] = ((m2[j] - m3[j]) > 0) ? 0xff : 0x0;
            survivor0[j] = (decision0[j] & m0[j]) | ((~decision0[j]) & m1[j]);
            survivor1[j] = (decision1[j] & m2[j]) | ((~decision1[j]) & m3[j]);
        }

        for (j = 0; j < 16; j += 2) {
            simd_epi16 = path0[(i * 16) + j];
            simd_epi16 |= path0[(i * 16) + (j + 1)] << 8;
            simd_epi16 <<= 1;
            shift0[j] = simd_epi16;
            shift0[j + 1] = simd_epi16 >> 8;

            simd_epi16 = path0[((i + 2) * 16) + j];
            simd_epi16 |= path0[((i + 2) * 16) + (j + 1)] << 8;
            simd_epi16 <<= 1;
            shift1[j] = simd_epi16;
            shift1[j + 1] = simd_epi16 >> 8;
        }
        for (j = 0; j < 16; j++) {
            shift1[j] = shift1[j] + 1;
        }

        for (j = 0, k = 0; j < 16; j += 2, k++) {
            metric1[(2 * i * 16) + j] = survivor0[k];
            metric1[(2 * i * 16) + (j + 1)] = survivor1[k];
        }
        for (j = 0; j < 16; j++) {
            tmp0[j] = (decision0[j] & shift0[j]) | ((~decision0[j]) & shift1[j]);
        }

        for (j = 0, k = 8; j < 16; j += 2, k++) {
            metric1[((2 * i + 1) * 16) + j] = survivor0[k];
            metric1[((2 * i + 1) * 16) + (j + 1)] = survivor1[k];
        }
        for (j = 0; j < 16; j++) {
            tmp1[j] = (decision1[j] & shift0[j]) | ((~decision1[j]) & shift1[j]);
        }

        for (j = 0, k = 0; j < 16; j += 2, k++) {
            path1[(2 * i * 16) + j] = tmp0[k];
            path1[(2 * i * 16) + (j + 1)] = tmp1[k];
        }
        for (j = 0, k = 8; j < 16; j += 2, k++) {
            path1[((2 * i + 1) * 16) + j] = tmp0[k];
            path1[((2 * i + 1) * 16) + (j + 1)] = tmp1[k];
        }
    }
}
#endif

/*
 * Produce one decoded byte from the current decoder state.
 *
 * mm0:        current metrics; on return the smallest metric has been
 *             subtracted from every entry.
 * pp0:        current paths; stored into the history and then zeroed.
 * ntraceback: number of history rows used by the circular buffer.
 * outbuf:     receives the decoded byte.
 *
 * Returns the largest metric found (narrowed to unsigned char).
 */
#ifdef DTV_SSE2
unsigned char dvbt_viterbi_decoder_impl::dvbt_viterbi_get_output_sse2(
    __m128i* mm0, __m128i* pp0, int ntraceback, unsigned char* outbuf)
{
#else
unsigned char dvbt_viterbi_decoder_impl::dvbt_viterbi_get_output_generic(
    unsigned char* mm0, unsigned char* pp0, int ntraceback, unsigned char* outbuf)
{
#endif
    // Advance the write slot of the circular path history
    store_pos = (store_pos + 1) % ntraceback;

    // Snapshot metrics and paths into byte-addressable buffers
#ifdef DTV_SSE2
    // TODO - find another way to extract the value
    for (int lane = 0; lane < 4; lane++) {
        _mm_store_si128(reinterpret_cast<__m128i*>(&mmresult[lane * 16]), mm0[lane]);
        _mm_store_si128(reinterpret_cast<__m128i*>(&ppresult[store_pos][lane * 16]),
                        pp0[lane]);
    }
#else
    for (int state = 0; state < 64; state++) {
        mmresult[state] = mm0[state];
        ppresult[store_pos][state] = pp0[state];
    }
#endif

    // Scan for the state with the largest metric (first one wins on ties)
    // and remember the smallest metric for the normalisation below.
    int beststate = 0;
    int bestmetric = mmresult[0];
    int minmetric = mmresult[0];
    for (int state = 1; state < 64; state++) {
        const int metric = mmresult[state];
        if (metric > bestmetric) {
            bestmetric = metric;
            beststate = state;
        }
        if (metric < minmetric) {
            minmetric = metric;
        }
    }

    // Walk back through the history: each stored path byte yields the
    // previous state in its upper six bits.
    int pos = store_pos;
    for (int remaining = ntraceback - 1; remaining > 0; remaining--) {
        beststate = ppresult[pos][beststate] >> 2;
        pos = (pos - 1 + ntraceback) % ntraceback;
    }

    // The path byte reached at the end of the walk is the decoded output
    *outbuf = ppresult[pos][beststate];

    // Clear the paths and rebase the metrics on the smallest one so they
    // do not overflow.
#ifdef DTV_SSE2
    for (int lane = 0; lane < 4; lane++) {
        pp0[lane] = _mm_setzero_si128();
        mm0[lane] = _mm_sub_epi8(mm0[lane], _mm_set1_epi8(minmetric));
    }
#else
    for (int state = 0; state < 64; state++) {
        pp0[state] = 0;
        mm0[state] = mm0[state] - minmetric;
    }
#endif

    return bestmetric;
}

/*
 * Public factory: builds the implementation object from the given
 * parameters and hands it back as a dvbt_viterbi_decoder::sptr.
 */
dvbt_viterbi_decoder::sptr dvbt_viterbi_decoder::make(dvb_constellation_t constellation,
                                                      dvbt_hierarchy_t hierarchy,
                                                      dvb_code_rate_t coderate,
                                                      int bsize)
{
    dvbt_viterbi_decoder::sptr decoder =
        gnuradio::make_block_sptr<dvbt_viterbi_decoder_impl>(
            constellation, hierarchy, coderate, bsize);
    return decoder;
}

/*
 * Private constructor; instances are created through make().
 *
 * Derives k, n and m from the configuration built out of the constellation,
 * hierarchy and code rate, sizes the bit buffer for one block of bsize,
 * selects the puncturing pattern and traceback length for the code rate,
 * and resets the decoder state.
 */
dvbt_viterbi_decoder_impl::dvbt_viterbi_decoder_impl(dvb_constellation_t constellation,
                                                     dvbt_hierarchy_t hierarchy,
                                                     dvb_code_rate_t coderate,
                                                     int bsize)
    : block("dvbt_viterbi_decoder",
            io_signature::make(1, 1, sizeof(unsigned char)),
            io_signature::make(1, 1, sizeof(unsigned char))),
      config(constellation, hierarchy, coderate, coderate),
      d_k(config.d_cr_k),
      d_n(config.d_cr_n),
      d_m(config.d_m),
      d_bsize(bsize),
      d_nsymbols(d_bsize * d_n / d_m),
      d_nbits(2 * d_k * d_bsize),
      d_inbits(d_nbits)
{
    // Puncturing pattern and traceback length per code rate; any rate not
    // listed explicitly is treated like rate 1/2.
    switch (config.d_code_rate_HP) {
    case C2_3:
        d_puncture = d_puncture_2_3;
        d_ntraceback = 9;
        break;
    case C3_4:
        d_puncture = d_puncture_3_4;
        d_ntraceback = 10;
        break;
    case C5_6:
        d_puncture = d_puncture_5_6;
        d_ntraceback = 15;
        break;
    case C7_8:
        d_puncture = d_puncture_7_8;
        d_ntraceback = 24;
        break;
    case C1_2:
    default:
        d_puncture = d_puncture_1_2;
        d_ntraceback = 5;
        break;
    }

    /*
     * One block consumes d_bsize * d_n / d_m input bytes (m bits each) and
     * yields d_bsize * d_k / 8 output bytes, so the output is produced in
     * multiples of that amount. The input bit count of a block must divide
     * evenly into m-bit symbols.
     */
    assert((d_bsize * d_n) % d_m == 0);
    set_output_multiple(d_bsize * d_k / 8);

    // 2x2 corner of the metric table: 1 on the diagonal, 0 elsewhere
    for (int row = 0; row < 2; row++) {
        for (int col = 0; col < 2; col++) {
            mettab[row][col] = (row == col) ? 1 : 0;
        }
    }

    // Start from a clean decoder state
#ifdef DTV_SSE2
    dvbt_viterbi_chunks_init_sse2(d_metric0, d_path0);
#else
    dvbt_viterbi_chunks_init_generic(d_metric0_generic, d_path0_generic);
#endif
}

/*
 * Destructor: nothing to release explicitly, members clean up themselves.
 */
dvbt_viterbi_decoder_impl::~dvbt_viterbi_decoder_impl() = default;

void dvbt_viterbi_decoder_impl::forecast(int noutput_items,
                                         gr_vector_int& ninput_items_required)
{
    int input_required = noutput_items * 8 * d_n / (d_k * d_m);

    unsigned ninputs = ninput_items_required.size();
    for (unsigned int i = 0; i < ninputs; i++) {
        ninput_items_required[i] = input_required;
    }
}

int dvbt_viterbi_decoder_impl::general_work(int noutput_items,
                                            gr_vector_int& ninput_items,
                                            gr_vector_const_void_star& input_items,
                                            gr_vector_void_star& output_items)
{
    int nstreams = input_items.size();
    int nblocks = 8 * noutput_items / (d_bsize * d_k);
    int out_count = 0;

    for (int m = 0; m < nstreams; m++) {
        const unsigned char* in = (const unsigned char*)input_items[m];
        unsigned char* out = (unsigned char*)output_items[m];

        /*
         * Look for a tag that signals superframe_start and consume all input items
         * that are in input buffer so far.
         * This will actually reset the viterbi decoder.
         */
        std::vector<tag_t> tags;
        const uint64_t nread = this->nitems_read(0); // number of items read on port 0
        this->get_tags_in_range(tags,
                                0,
                                nread,
                                nread + (nblocks * d_nsymbols),
                                pmt::string_to_symbol("superframe_start"));

        if (!tags.empty()) {
            d_init = 0;

#ifdef DTV_SSE2
            dvbt_viterbi_chunks_init_sse2(d_metric0, d_path0);
#else
            dvbt_viterbi_chunks_init_generic(d_metric0_generic, d_path0_generic);
#endif

            if (tags[0].offset - nread) {
                consume_each(tags[0].offset - nread);
                return (0);
            }
        }

        // This is actually the Viterbi decoder
        for (int n = 0; n < nblocks; n++) {
            /*
             * Depuncture and unpack a block.
             * We receive the symbol (d_m bits/byte) in one byte (e.g. for QAM16
             * 00001111). Create a buffer of bytes containing just one bit/byte. Also
             * depuncture according to the puncture vector.
             * TODO - reduce the number of branches while depuncturing.
             */
            for (int count = 0, i = 0; i < d_nsymbols; i++) {
                for (int j = (d_m - 1); j >= 0; j--) {
                    // Depuncture
                    while (d_puncture[count % (2 * d_k)] == 0) {
                        d_inbits[count++] = 2;
                    }

                    // Insert received bits
                    d_inbits[count++] = (in[(n * d_nsymbols) + i] >> j) & 1;

                    // Depuncture
                    while (d_puncture[count % (2 * d_k)] == 0) {
                        d_inbits[count++] = 2;
                    }
                }
            }

            /*
             * Decode a block.
             */
            for (int in_count = 0; in_count < d_nbits; in_count++) {
                if ((in_count % 4) == 0) { // 0 or 3

#ifdef DTV_SSE2
                    dvbt_viterbi_butterfly2_sse2(&d_inbits[in_count & 0xfffffffc],
                                                 d_metric0,
                                                 d_metric1,
                                                 d_path0,
                                                 d_path1);
#else
                    dvbt_viterbi_butterfly2_generic(&d_inbits[in_count & 0xfffffffc],
                                                    d_metric0_generic,
                                                    d_metric1_generic,
                                                    d_path0_generic,
                                                    d_path1_generic);
#endif

                    if ((in_count > 0) && (in_count % 16) == 8) { // 8 or 11
                        unsigned char c;

#ifdef DTV_SSE2
                        dvbt_viterbi_get_output_sse2(
                            d_metric0, d_path0, d_ntraceback, &c);
#else
                        dvbt_viterbi_get_output_generic(
                            d_metric0_generic, d_path0_generic, d_ntraceback, &c);
#endif

                        if (d_init == 0) {
                            if (out_count >= d_ntraceback) {
                                out[out_count - d_ntraceback] = c;
                            }
                        } else {
                            out[out_count] = c;
                        }
                        out_count++;
                    }
                }
            }
        }
    }

    int to_out = noutput_items;

    if (d_init == 0) {
        /*
         * Send superframe_start to signal this situation
         * downstream
         */
        const uint64_t offset = this->nitems_written(0);
        pmt::pmt_t key = pmt::string_to_symbol("superframe_start");
        pmt::pmt_t value = pmt::from_long(1);
        this->add_item_tag(0, offset, key, value);

        // Take in consideration the traceback length
        to_out = to_out - d_ntraceback;
        d_init = 1;
    }

    // Tell runtime system how many input items we consumed on
    // each input stream.
    consume_each(nblocks * d_nsymbols);

    // Tell runtime system how many output items we produced.
    return (to_out);
}

} /* namespace dtv */
} /* namespace gr */