summaryrefslogtreecommitdiff
path: root/gr-vocoder/lib/codec2/nlp.c
diff options
context:
space:
mode:
Diffstat (limited to 'gr-vocoder/lib/codec2/nlp.c')
-rw-r--r--gr-vocoder/lib/codec2/nlp.c285
1 files changed, 255 insertions, 30 deletions
diff --git a/gr-vocoder/lib/codec2/nlp.c b/gr-vocoder/lib/codec2/nlp.c
index 0d5e530ce6..cca835b46f 100644
--- a/gr-vocoder/lib/codec2/nlp.c
+++ b/gr-vocoder/lib/codec2/nlp.c
@@ -28,7 +28,9 @@
#include "defines.h"
#include "nlp.h"
#include "dump.h"
-#include "fft.h"
+#include "kiss_fft.h"
+#undef TIMER
+#include "machdep.h"
#include <assert.h>
#include <math.h>
@@ -51,6 +53,8 @@
#define CNLP 0.3 /* post processor constant */
#define NLP_NTAP 48 /* Decimation LPF order */
+#undef DUMP
+
/*---------------------------------------------------------------------------*\
GLOBALS
@@ -111,12 +115,16 @@ const float nlp_fir[] = {
};
typedef struct {
- float sq[PMAX_M]; /* squared speech samples */
- float mem_x,mem_y; /* memory for notch filter */
- float mem_fir[NLP_NTAP]; /* decimation FIR filter memory */
+ int m;
+ float w[PMAX_M/DEC]; /* DFT window */
+ float sq[PMAX_M]; /* squared speech samples */
+ float mem_x,mem_y; /* memory for notch filter */
+ float mem_fir[NLP_NTAP]; /* decimation FIR filter memory */
+ kiss_fft_cfg fft_cfg; /* kiss FFT config */
} NLP;
-float post_process_mbe(COMP Fw[], int pmin, int pmax, float gmax);
+float test_candidate_mbe(COMP Sw[], COMP W[], float f0);
+float post_process_mbe(COMP Fw[], int pmin, int pmax, float gmax, COMP Sw[], COMP W[], float *prev_Wo);
float post_process_sub_multiples(COMP Fw[],
int pmin, int pmax, float gmax, int gmax_bin,
float *prev_Wo);
@@ -129,15 +137,24 @@ float post_process_sub_multiples(COMP Fw[],
\*---------------------------------------------------------------------------*/
-void *nlp_create()
+void *nlp_create(
+int m /* analysis window size */
+)
{
NLP *nlp;
int i;
+ assert(m <= PMAX_M);
+
nlp = (NLP*)malloc(sizeof(NLP));
if (nlp == NULL)
return NULL;
+ nlp->m = m;
+ for(i=0; i<m/DEC; i++) {
+ nlp->w[i] = 0.5 - 0.5*cosf(2*PI*i/(m/DEC-1));
+ }
+
for(i=0; i<PMAX_M; i++)
nlp->sq[i] = 0.0;
nlp->mem_x = 0.0;
@@ -145,20 +162,27 @@ void *nlp_create()
for(i=0; i<NLP_NTAP; i++)
nlp->mem_fir[i] = 0.0;
+ nlp->fft_cfg = kiss_fft_alloc (PE_FFT_SIZE, 0, NULL, NULL);
+ assert(nlp->fft_cfg != NULL);
+
return (void*)nlp;
}
/*---------------------------------------------------------------------------*\
- nlp_destory()
+ nlp_destroy()
- Initialisation function for NLP pitch estimator.
+ Shut down function for NLP pitch estimator.
\*---------------------------------------------------------------------------*/
void nlp_destroy(void *nlp_state)
{
+ NLP *nlp;
assert(nlp_state != NULL);
+ nlp = (NLP*)nlp_state;
+
+ KISS_FFT_FREE(nlp->fft_cfg);
free(nlp_state);
}
@@ -196,28 +220,33 @@ float nlp(
void *nlp_state,
float Sn[], /* input speech vector */
int n, /* frames shift (no. new samples in Sn[]) */
- int m, /* analysis window size */
- int pmin, /* minimum pitch value */
+ int pmin, /* minimum pitch value */
int pmax, /* maximum pitch value */
float *pitch, /* estimated pitch period in samples */
COMP Sw[], /* Freq domain version of Sn[] */
+ COMP W[], /* Freq domain window */
float *prev_Wo
)
{
NLP *nlp;
- float notch; /* current notch filter output */
- COMP Fw[PE_FFT_SIZE]; /* DFT of squared signal */
+ float notch; /* current notch filter output */
+ COMP fw[PE_FFT_SIZE]; /* DFT of squared signal (input) */
+ COMP Fw[PE_FFT_SIZE]; /* DFT of squared signal (output) */
float gmax;
int gmax_bin;
- int i,j;
- float best_f0;
+ int m, i,j;
+ float best_f0;
+ TIMER_VAR(start, tnotch, filter, peakpick, window, fft, magsq, shiftmem);
assert(nlp_state != NULL);
nlp = (NLP*)nlp_state;
+ m = nlp->m;
+
+ TIMER_SAMPLE(start);
/* Square, notch filter at DC, and LP filter vector */
- for(i=m-n; i<M; i++) /* square latest speech samples */
+ for(i=m-n; i<m; i++) /* square latest speech samples */
nlp->sq[i] = Sn[i]*Sn[i];
for(i=m-n; i<m; i++) { /* notch filter at DC */
@@ -225,9 +254,18 @@ float nlp(
notch += COEFF*nlp->mem_y;
nlp->mem_x = nlp->sq[i];
nlp->mem_y = notch;
- nlp->sq[i] = notch;
+ nlp->sq[i] = notch + 1.0; /* With 0 input vectors to codec,
+ kiss_fft() would take a long
+ time to execute when running in
+ real time. Problem was traced
+ to kiss_fft function call in
+ this function. Adding this small
+ constant fixed problem. Not
+ exactly sure why. */
}
+ TIMER_SAMPLE_AND_LOG(tnotch, start, " square and notch");
+
for(i=m-n; i<m; i++) { /* FIR filter vector */
for(j=0; j<NLP_NTAP-1; j++)
@@ -239,26 +277,33 @@ float nlp(
nlp->sq[i] += nlp->mem_fir[j]*nlp_fir[j];
}
+ TIMER_SAMPLE_AND_LOG(filter, tnotch, " filter");
+
/* Decimate and DFT */
for(i=0; i<PE_FFT_SIZE; i++) {
- Fw[i].real = 0.0;
- Fw[i].imag = 0.0;
+ fw[i].real = 0.0;
+ fw[i].imag = 0.0;
}
for(i=0; i<m/DEC; i++) {
- Fw[i].real = nlp->sq[i*DEC]*(0.5 - 0.5*cos(2*PI*i/(m/DEC-1)));
+ fw[i].real = nlp->sq[i*DEC]*nlp->w[i];
}
-#ifdef DUMP
+ TIMER_SAMPLE_AND_LOG(window, filter, " window");
+ #ifdef DUMP
dump_dec(Fw);
-#endif
- fft(&Fw[0].real,PE_FFT_SIZE,1);
+ #endif
+
+ kiss_fft(nlp->fft_cfg, (kiss_fft_cpx *)fw, (kiss_fft_cpx *)Fw);
+ TIMER_SAMPLE_AND_LOG(fft, window, " fft");
+
for(i=0; i<PE_FFT_SIZE; i++)
Fw[i].real = Fw[i].real*Fw[i].real + Fw[i].imag*Fw[i].imag;
-#ifdef DUMP
+ TIMER_SAMPLE_AND_LOG(magsq, fft, " mag sq");
+ #ifdef DUMP
dump_sq(nlp->sq);
dump_Fw(Fw);
-#endif
+ #endif
/* find global peak */
@@ -271,8 +316,16 @@ float nlp(
}
}
- best_f0 = post_process_sub_multiples(Fw, pmin, pmax, gmax, gmax_bin,
- prev_Wo);
+ TIMER_SAMPLE_AND_LOG(peakpick, magsq, " peak pick");
+
+ //#define POST_PROCESS_MBE
+ #ifdef POST_PROCESS_MBE
+ best_f0 = post_process_mbe(Fw, pmin, pmax, gmax, Sw, W, prev_Wo);
+ #else
+ best_f0 = post_process_sub_multiples(Fw, pmin, pmax, gmax, gmax_bin, prev_Wo);
+ #endif
+
+ TIMER_SAMPLE_AND_LOG(shiftmem, peakpick, " post process");
/* Shift samples in buffer to make room for new samples */
@@ -282,6 +335,11 @@ float nlp(
/* return pitch and F0 estimate */
*pitch = (float)SAMPLE_RATE/best_f0;
+
+ TIMER_SAMPLE_AND_LOG2(shiftmem, " shift mem");
+
+ TIMER_SAMPLE_AND_LOG2(start, " nlp int");
+
return(best_f0);
}
@@ -289,7 +347,7 @@ float nlp(
post_process_sub_multiples()
- Given the global maximma of Fw[] we search interger submultiples for
+ Given the global maximma of Fw[] we search integer submultiples for
local maxima. If local maxima exist and they are above an
experimentally derived threshold (OK a magic number I pulled out of
the air) we choose the submultiple as the F0 estimate.
@@ -314,7 +372,7 @@ float post_process_sub_multiples(COMP Fw[],
int mult;
float thresh, best_f0;
int b, bmin, bmax, lmax_bin;
- float lmax, cmax;
+ float lmax;
int prev_f0_bin;
/* post process estimate by searching submultiples */
@@ -342,7 +400,7 @@ float post_process_sub_multiples(COMP Fw[],
lmax = 0;
lmax_bin = bmin;
- for (b=bmin; b<=bmax; b++) /* look for maximum in interval */
+ for (b=bmin; b<=bmax; b++) /* look for maximum in interval */
if (Fw[b].real > lmax) {
lmax = Fw[b].real;
lmax_bin = b;
@@ -350,7 +408,6 @@ float post_process_sub_multiples(COMP Fw[],
if (lmax > thresh)
if ((lmax > Fw[lmax_bin-1].real) && (lmax > Fw[lmax_bin+1].real)) {
- cmax = lmax;
cmax_bin = lmax_bin;
}
@@ -362,3 +419,171 @@ float post_process_sub_multiples(COMP Fw[],
return best_f0;
}
+/*---------------------------------------------------------------------------*\
+
+ post_process_mbe()
+
+ Use the MBE pitch estimation algorithm to evaluate pitch candidates. This
+ works OK but the accuracy at low F0 is affected by NW, the analysis window
+ size used for the DFT of the input speech Sw[]. Also favours high F0 in
+ the presence of background noise which causes periodic artifacts in the
+ synthesised speech.
+
+\*---------------------------------------------------------------------------*/
+
+float post_process_mbe(COMP Fw[], int pmin, int pmax, float gmax, COMP Sw[], COMP W[], float *prev_Wo)
+{
+ float candidate_f0;
+ float f0,best_f0; /* fundamental frequency */
+ float e,e_min; /* MBE cost function */
+ int i;
+ #ifdef DUMP
+ float e_hz[F0_MAX];
+ #endif
+ #if !defined(NDEBUG) || defined(DUMP)
+ int bin;
+ #endif
+ float f0_min, f0_max;
+ float f0_start, f0_end;
+
+ f0_min = (float)SAMPLE_RATE/pmax;
+ f0_max = (float)SAMPLE_RATE/pmin;
+
+ /* Now look for local maxima. Each local maxima is a candidate
+ that we test using the MBE pitch estimation algotithm */
+
+ #ifdef DUMP
+ for(i=0; i<F0_MAX; i++)
+ e_hz[i] = -1;
+ #endif
+ e_min = 1E32;
+ best_f0 = 50;
+ for(i=PE_FFT_SIZE*DEC/pmax; i<=PE_FFT_SIZE*DEC/pmin; i++) {
+ if ((Fw[i].real > Fw[i-1].real) && (Fw[i].real > Fw[i+1].real)) {
+
+ /* local maxima found, lets test if it's big enough */
+
+ if (Fw[i].real > T*gmax) {
+
+ /* OK, sample MBE cost function over +/- 10Hz range in 2.5Hz steps */
+
+ candidate_f0 = (float)i*SAMPLE_RATE/(PE_FFT_SIZE*DEC);
+ f0_start = candidate_f0-20;
+ f0_end = candidate_f0+20;
+ if (f0_start < f0_min) f0_start = f0_min;
+ if (f0_end > f0_max) f0_end = f0_max;
+
+ for(f0=f0_start; f0<=f0_end; f0+= 2.5) {
+ e = test_candidate_mbe(Sw, W, f0);
+ #if !defined(NDEBUG) || defined(DUMP)
+ bin = floor(f0); assert((bin > 0) && (bin < F0_MAX));
+ #endif
+ #ifdef DUMP
+ e_hz[bin] = e;
+ #endif
+ if (e < e_min) {
+ e_min = e;
+ best_f0 = f0;
+ }
+ }
+
+ }
+ }
+ }
+
+ /* finally sample MBE cost function around previous pitch estimate
+ (form of pitch tracking) */
+
+ candidate_f0 = *prev_Wo * SAMPLE_RATE/TWO_PI;
+ f0_start = candidate_f0-20;
+ f0_end = candidate_f0+20;
+ if (f0_start < f0_min) f0_start = f0_min;
+ if (f0_end > f0_max) f0_end = f0_max;
+
+ for(f0=f0_start; f0<=f0_end; f0+= 2.5) {
+ e = test_candidate_mbe(Sw, W, f0);
+ #if !defined(NDEBUG) || defined(DUMP)
+ bin = floor(f0); assert((bin > 0) && (bin < F0_MAX));
+ #endif
+ #ifdef DUMP
+ e_hz[bin] = e;
+ #endif
+ if (e < e_min) {
+ e_min = e;
+ best_f0 = f0;
+ }
+ }
+
+ #ifdef DUMP
+ dump_e(e_hz);
+ #endif
+
+ return best_f0;
+}
+
+/*---------------------------------------------------------------------------*\
+
+ test_candidate_mbe()
+
+ Returns the error of the MBE cost function for the input f0.
+
+ Note: I think a lot of the operations below can be simplified as
+ W[].imag = 0 and has been normalised such that den always equals 1.
+
+\*---------------------------------------------------------------------------*/
+
+float test_candidate_mbe(
+ COMP Sw[],
+ COMP W[],
+ float f0
+)
+{
+ COMP Sw_[FFT_ENC]; /* DFT of all voiced synthesised signal */
+ int l,al,bl,m; /* loop variables */
+ COMP Am; /* amplitude sample for this band */
+ int offset; /* centers Hw[] about current harmonic */
+ float den; /* denominator of Am expression */
+ float error; /* accumulated error between originl and synthesised */
+ float Wo; /* current "test" fundamental freq. */
+ int L;
+
+ L = floor((SAMPLE_RATE/2.0)/f0);
+ Wo = f0*(2*PI/SAMPLE_RATE);
+
+ error = 0.0;
+
+ /* Just test across the harmonics in the first 1000 Hz (L/4) */
+
+ for(l=1; l<L/4; l++) {
+ Am.real = 0.0;
+ Am.imag = 0.0;
+ den = 0.0;
+ al = ceil((l - 0.5)*Wo*FFT_ENC/TWO_PI);
+ bl = ceil((l + 0.5)*Wo*FFT_ENC/TWO_PI);
+
+ /* Estimate amplitude of harmonic assuming harmonic is totally voiced */
+
+ for(m=al; m<bl; m++) {
+ offset = FFT_ENC/2 + m - l*Wo*FFT_ENC/TWO_PI + 0.5;
+ Am.real += Sw[m].real*W[offset].real + Sw[m].imag*W[offset].imag;
+ Am.imag += Sw[m].imag*W[offset].real - Sw[m].real*W[offset].imag;
+ den += W[offset].real*W[offset].real + W[offset].imag*W[offset].imag;
+ }
+
+ Am.real = Am.real/den;
+ Am.imag = Am.imag/den;
+
+ /* Determine error between estimated harmonic and original */
+
+ for(m=al; m<bl; m++) {
+ offset = FFT_ENC/2 + m - l*Wo*FFT_ENC/TWO_PI + 0.5;
+ Sw_[m].real = Am.real*W[offset].real - Am.imag*W[offset].imag;
+ Sw_[m].imag = Am.real*W[offset].imag + Am.imag*W[offset].real;
+ error += (Sw[m].real - Sw_[m].real)*(Sw[m].real - Sw_[m].real);
+ error += (Sw[m].imag - Sw_[m].imag)*(Sw[m].imag - Sw_[m].imag);
+ }
+ }
+
+ return error;
+}
+