/*---------------------------------------------------------------------------*\
FILE........: codec2.c
AUTHOR......: David Rowe
DATE CREATED: 21/8/2010
Codec2 fully quantised encoder and decoder functions. If you want to use
codec2, the codec2_xxx functions are for you.
\*---------------------------------------------------------------------------*/
/*
Copyright (C) 2010 David Rowe
All rights reserved.
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU Lesser General Public License version 2.1, as
published by the Free Software Foundation. This program is
distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public
License for more details.
You should have received a copy of the GNU Lesser General Public License
along with this program; if not, see <http://www.gnu.org/licenses/>.
*/
#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
#include <stdbool.h>
#include <string.h>
#include <math.h>
#include "nlp.h"
#include "lpc.h"
#include "quantise.h"
#include "codec2_api.h"
#include "codec2_internal.h"
#define HPF_BETA 0.125
#define BPF_N 101
CKissFFT kiss;
/*---------------------------------------------------------------------------*\
FUNCTION HEADERS
\*---------------------------------------------------------------------------*/
/*---------------------------------------------------------------------------*\
FUNCTIONS
\*---------------------------------------------------------------------------*/
/*---------------------------------------------------------------------------*\
FUNCTION....: codec2_create
AUTHOR......: David Rowe
DATE CREATED: 21/8/2010
Create and initialise an instance of the codec. One set of states is
sufficient for a full duplex codec (i.e. an encoder and decoder).
You don't need separate states for encoders and decoders. See
c2enc.c and c2dec.c for examples.
\*---------------------------------------------------------------------------*/
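/*
   A minimal usage sketch (added for illustration; see c2enc.c and c2dec.c
   for the full applications). In 3200 bit/s mode a frame is 160 input
   samples (20ms at 8 kHz) and codec2_bits_per_frame() is 64 bits, i.e. 8
   bytes:

     CCodec2 codec(true);                    // true selects 3200 bit/s
     short speech_in[160], speech_out[160];
     unsigned char frame[8];                 // 64 bits packed into bytes
     // ... fill speech_in with one frame of 8 kHz speech ...
     codec.codec2_encode(frame, speech_in);
     codec.codec2_decode(speech_out, frame);
*/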
CCodec2::CCodec2(bool is_3200)
{
c2.mode = is_3200 ? 3200 : 1600;
/* store constants in a few places for convenience */
c2.c2const = c2const_create(8000, N_S);
c2.Fs = c2.c2const.Fs;
int n_samp = c2.n_samp = c2.c2const.n_samp;
int m_pitch = c2.m_pitch = c2.c2const.m_pitch;
c2.Pn.resize(2*n_samp);
c2.Sn_.resize(2*n_samp);
c2.w.resize(m_pitch);
c2.Sn.resize(m_pitch);
for(int i=0; i<m_pitch; i++)
c2.Sn[i] = 1.0;
c2.hpf_states[0] = c2.hpf_states[1] = 0.0;
for(int i=0; i<2*n_samp; i++)
c2.Sn_[i] = 0;
kiss.fft_alloc(c2.fft_fwd_cfg, FFT_ENC, false);
kiss.fftr_alloc(c2.fftr_fwd_cfg, FFT_ENC, false);
make_analysis_window(&c2.c2const, &c2.fft_fwd_cfg, c2.w.data(), c2.W);
make_synthesis_window(&c2.c2const, c2.Pn.data());
kiss.fftr_alloc(c2.fftr_inv_cfg, FFT_DEC, true);
c2.prev_f0_enc = 1/P_MAX_S;
c2.bg_est = 0.0;
c2.ex_phase = 0.0;
for(int l=1; l<=MAX_AMP; l++)
c2.prev_model_dec.A[l] = 0.0;
c2.prev_model_dec.Wo = TWO_PI/c2.c2const.p_max;
c2.prev_model_dec.L = PI/c2.prev_model_dec.Wo;
c2.prev_model_dec.voiced = 0;
for(int i=0; i<LPC_ORD; i++)
{
c2.prev_lsps_dec[i] = i*PI/(LPC_ORD+1);
}
c2.prev_e_dec = 1;
nlp.nlp_create(&c2.c2const);
c2.lpc_pf = 1;
c2.bass_boost = 1;
c2.beta = LPCPF_BETA;
c2.gamma = LPCPF_GAMMA;
c2.xq_enc[0] = c2.xq_enc[1] = 0.0;
c2.xq_dec[0] = c2.xq_dec[1] = 0.0;
c2.smoothing = 0;
c2.bpf_buf.resize(BPF_N+4*c2.n_samp);
for(int i=0; i<BPF_N+4*c2.n_samp; i++)
c2.bpf_buf[i] = 0.0;
c2.softdec = NULL;
c2.gray = 1;
// make sure the decode function pointer starts out empty; the encode
// function pointer doesn't need this since we always set it to a
// meaningful value below
decode = NULL;
if ( 3200 == c2.mode)
{
encode = &CCodec2::codec2_encode_3200;
decode = &CCodec2::codec2_decode_3200;
}
else
{
encode = &CCodec2::codec2_encode_1600;
decode = &CCodec2::codec2_decode_1600;
}
}
/*---------------------------------------------------------------------------*\
FUNCTION....: codec2_destroy
AUTHOR......: David Rowe
DATE CREATED: 21/8/2010
Destroy an instance of the codec.
\*---------------------------------------------------------------------------*/
CCodec2::~CCodec2()
{
c2.bpf_buf.clear();
nlp.nlp_destroy();
c2.fft_fwd_cfg.twiddles.clear();
c2.fftr_fwd_cfg.substate.twiddles.clear();
c2.fftr_fwd_cfg.tmpbuf.clear();
c2.fftr_fwd_cfg.super_twiddles.clear();
c2.fftr_inv_cfg.substate.twiddles.clear();
c2.fftr_inv_cfg.tmpbuf.clear();
c2.fftr_inv_cfg.super_twiddles.clear();
c2.Pn.clear();
c2.Sn.clear();
c2.w.clear();
c2.Sn_.clear();
}
void CCodec2::codec2_set_mode(bool m)
{
c2.mode = m ? 3200 : 1600;
if (c2.mode == 3200){
encode = &CCodec2::codec2_encode_3200;
decode = &CCodec2::codec2_decode_3200;
}
else{
encode = &CCodec2::codec2_encode_1600;
decode = &CCodec2::codec2_decode_1600;
}
}
/*---------------------------------------------------------------------------*\
FUNCTION....: codec2_bits_per_frame
AUTHOR......: David Rowe
DATE CREATED: Nov 14 2011
Returns the number of bits per frame.
\*---------------------------------------------------------------------------*/
int CCodec2::codec2_bits_per_frame()
{
return 64;
}
/*---------------------------------------------------------------------------*\
FUNCTION....: codec2_samples_per_frame
AUTHOR......: David Rowe
DATE CREATED: Nov 14 2011
Returns the number of speech samples per frame.
\*---------------------------------------------------------------------------*/
int CCodec2::codec2_samples_per_frame()
{
if (3200 == c2.mode)
return 160;
else
return 320;
return 0; /* shouldn't get here */
}
void CCodec2::codec2_encode(unsigned char *bits, const short *speech)
{
assert(encode != NULL);
(*this.*encode)(bits, speech);
}
void CCodec2::codec2_decode(short *speech, const unsigned char *bits)
{
assert(decode != NULL);
(*this.*decode)(speech, bits);
}
/*---------------------------------------------------------------------------*\
FUNCTION....: codec2_encode_3200
AUTHOR......: David Rowe
DATE CREATED: 13 Sep 2012
Encodes 160 speech samples (20ms of speech) into 64 bits.
The codec2 algorithm actually operates internally on 10ms (80
sample) frames, so we run the encoding algorithm twice. On the
first frame we just send the voicing bits. On the second frame we
send all model parameters. Compared to 2400 we use a larger number
of bits for the LSPs and non-VQ pitch and energy.
The bit allocation is:
Parameter bits/frame
--------------------------------------
Harmonic magnitudes (LSPs) 50
Pitch (Wo) 7
Energy 5
Voicing (10ms update) 2
TOTAL 64
\*---------------------------------------------------------------------------*/
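/* For reference, the packing order of the 64 bits (summarised from the
   qt.pack() calls below) is:
   voiced f1 (1) | voiced f2 (1) | Wo index (7) | energy index (5) |
   LSP delta indexes (50) */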
void CCodec2::codec2_encode_3200(unsigned char *bits, const short *speech)
{
MODEL model;
float ak[LPC_ORD+1];
float lsps[LPC_ORD];
float e;
int Wo_index, e_index;
int lspd_indexes[LPC_ORD];
int i;
unsigned int nbit = 0;
memset(bits, '\0', ((codec2_bits_per_frame() + 7) / 8));
/* first 10ms analysis frame - we just want voicing */
analyse_one_frame(&model, speech);
qt.pack(bits, &nbit, model.voiced, 1);
/* second 10ms analysis frame */
analyse_one_frame(&model, &speech[c2.n_samp]);
qt.pack(bits, &nbit, model.voiced, 1);
Wo_index = qt.encode_Wo(&c2.c2const, model.Wo, WO_BITS);
qt.pack(bits, &nbit, Wo_index, WO_BITS);
e = qt.speech_to_uq_lsps(lsps, ak, c2.Sn.data(), c2.w.data(), c2.m_pitch, LPC_ORD);
e_index = qt.encode_energy(e, E_BITS);
qt.pack(bits, &nbit, e_index, E_BITS);
qt.encode_lspds_scalar(lspd_indexes, lsps, LPC_ORD);
for(i=0; i<LSPD_SCALAR_INDEXES; i++)
{
qt.pack(bits, &nbit, lspd_indexes[i], qt.lspd_bits(i));
}
assert(nbit == (unsigned)codec2_bits_per_frame());
}
/*---------------------------------------------------------------------------*\
FUNCTION....: codec2_decode_3200
AUTHOR......: David Rowe
DATE CREATED: 13 Sep 2012
Decodes a frame of 64 bits into 160 samples (20ms) of speech.
\*---------------------------------------------------------------------------*/
void CCodec2::codec2_decode_3200(short speech[], const unsigned char * bits)
{
MODEL model[2];
int lspd_indexes[LPC_ORD];
float lsps[2][LPC_ORD];
int Wo_index, e_index;
float e[2];
float snr;
float ak[2][LPC_ORD+1];
int i,j;
unsigned int nbit = 0;
std::complex<float> Aw[FFT_ENC];
/* only need to zero these out due to (unused) snr calculation */
for(i=0; i<2; i++)
for(j=1; j<=MAX_AMP; j++)
model[i].A[j] = 0.0;
/* unpack bits from channel ------------------------------------*/
/* this will partially fill the model params for the 2 x 10ms
frames */
model[0].voiced = qt.unpack(bits, &nbit, 1);
model[1].voiced = qt.unpack(bits, &nbit, 1);
Wo_index = qt.unpack(bits, &nbit, WO_BITS);
model[1].Wo = qt.decode_Wo(&c2.c2const, Wo_index, WO_BITS);
model[1].L = PI/model[1].Wo;
e_index = qt.unpack(bits, &nbit, E_BITS);
e[1] = qt.decode_energy(e_index, E_BITS);
for(i=0; i<LSPD_SCALAR_INDEXES; i++)
{
lspd_indexes[i] = qt.unpack(bits, &nbit, qt.lspd_bits(i));
}
qt.decode_lspds_scalar(&lsps[1][0], lspd_indexes, LPC_ORD);
/* interpolate ------------------------------------------------*/
/* Wo and energy are sampled every 20ms, so we interpolate just 1
10ms frame between 20ms samples */
interp_Wo(&model[0], &c2.prev_model_dec, &model[1], c2.c2const.Wo_min);
e[0] = interp_energy(c2.prev_e_dec, e[1]);
/* LSPs are sampled every 20ms so we interpolate the frame in
between, then recover spectral amplitudes */
interpolate_lsp_ver2(&lsps[0][0], c2.prev_lsps_dec, &lsps[1][0], 0.5, LPC_ORD);
for(i=0; i<2; i++)
{
lsp_to_lpc(&lsps[i][0], &ak[i][0], LPC_ORD);
qt.aks_to_M2(&(c2.fftr_fwd_cfg), &ak[i][0], LPC_ORD, &model[i], e[i], &snr, 0, c2.lpc_pf, c2.bass_boost, c2.beta, c2.gamma, Aw);
qt.apply_lpc_correction(&model[i]);
synthesise_one_frame(&speech[c2.n_samp*i], &model[i], Aw, 3.0);
}
/* update memories for next frame ----------------------------*/
c2.prev_model_dec = model[1];
c2.prev_e_dec = e[1];
for(i=0; i<LPC_ORD; i++)
c2.prev_lsps_dec[i] = lsps[1][i];
}
/*---------------------------------------------------------------------------*\
FUNCTION....: codec2_encode_1600
AUTHOR......: David Rowe
DATE CREATED: Feb 28 2013
Encodes 320 speech samples (40ms of speech) into 64 bits.
The codec2 algorithm actually operates internally on 10ms (80
sample) frames, so we run the encoding algorithm 4 times:
frame 1: voicing bit
frame 2: voicing bit, Wo and E
frame 3: voicing bit
frame 4: voicing bit, Wo and E, scalar LSPs
The bit allocation is:
Parameter frame 2 frame 4 Total
-------------------------------------------------------
Harmonic magnitudes (LSPs) 0 36 36
Pitch (Wo) 7 7 14
Energy 5 5 10
Voicing (10ms update) 2 2 4
TOTAL 14 50 64
\*---------------------------------------------------------------------------*/
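/* For reference, the packing order of the 64 bits (summarised from the
   qt.pack() calls below) is:
   voiced f1 (1) | voiced f2 (1) | Wo f2 (7) | energy f2 (5) |
   voiced f3 (1) | voiced f4 (1) | Wo f4 (7) | energy f4 (5) |
   LSP indexes f4 (36) */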
void CCodec2::codec2_encode_1600(unsigned char * bits, const short speech[])
{
MODEL model;
float lsps[LPC_ORD];
float ak[LPC_ORD+1];
float e;
int lsp_indexes[LPC_ORD];
int Wo_index, e_index;
int i;
unsigned int nbit = 0;
memset(bits, '\0', ((codec2_bits_per_frame() + 7) / 8));
/* frame 1: - voicing ---------------------------------------------*/
analyse_one_frame(&model, speech);
qt.pack(bits, &nbit, model.voiced, 1);
/* frame 2: - voicing, scalar Wo & E -------------------------------*/
analyse_one_frame(&model, &speech[c2.n_samp]);
qt.pack(bits, &nbit, model.voiced, 1);
Wo_index = qt.encode_Wo(&c2.c2const, model.Wo, WO_BITS);
qt.pack(bits, &nbit, Wo_index, WO_BITS);
/* need to run this just to get LPC energy */
e = qt.speech_to_uq_lsps(lsps, ak, c2.Sn.data(), c2.w.data(), c2.m_pitch, LPC_ORD);
e_index = qt.encode_energy(e, E_BITS);
qt.pack(bits, &nbit, e_index, E_BITS);
/* frame 3: - voicing ---------------------------------------------*/
analyse_one_frame(&model, &speech[2*c2.n_samp]);
qt.pack(bits, &nbit, model.voiced, 1);
/* frame 4: - voicing, scalar Wo & E, scalar LSPs ------------------*/
analyse_one_frame(&model, &speech[3*c2.n_samp]);
qt.pack(bits, &nbit, model.voiced, 1);
Wo_index = qt.encode_Wo(&c2.c2const, model.Wo, WO_BITS);
qt.pack(bits, &nbit, Wo_index, WO_BITS);
e = qt.speech_to_uq_lsps(lsps, ak, c2.Sn.data(), c2.w.data(), c2.m_pitch, LPC_ORD);
e_index = qt.encode_energy(e, E_BITS);
qt.pack(bits, &nbit, e_index, E_BITS);
qt.encode_lsps_scalar(lsp_indexes, lsps, LPC_ORD);
for(i=0; i<LSP_SCALAR_INDEXES; i++)
{
qt.pack(bits, &nbit, lsp_indexes[i], qt.lsp_bits(i));
}
assert(nbit == (unsigned)codec2_bits_per_frame());
}
/*---------------------------------------------------------------------------*\
FUNCTION....: codec2_decode_1600
AUTHOR......: David Rowe
DATE CREATED: 11 May 2012
Decodes frames of 64 bits into 320 samples (40ms) of speech.
\*---------------------------------------------------------------------------*/
void CCodec2::codec2_decode_1600(short speech[], const unsigned char * bits)
{
MODEL model[4];
int lsp_indexes[LPC_ORD];
float lsps[4][LPC_ORD];
int Wo_index, e_index;
float e[4];
float snr;
float ak[4][LPC_ORD+1];
int i,j;
unsigned int nbit = 0;
float weight;
std::complex<float> Aw[FFT_ENC];
/* only need to zero these out due to (unused) snr calculation */
for(i=0; i<4; i++)
for(j=1; j<=MAX_AMP; j++)
model[i].A[j] = 0.0;
/* unpack bits from channel ------------------------------------*/
/* this will partially fill the model params for the 4 x 10ms
frames */
model[0].voiced = qt.unpack(bits, &nbit, 1);
model[1].voiced = qt.unpack(bits, &nbit, 1);
Wo_index = qt.unpack(bits, &nbit, WO_BITS);
model[1].Wo = qt.decode_Wo(&c2.c2const, Wo_index, WO_BITS);
model[1].L = PI/model[1].Wo;
e_index = qt.unpack(bits, &nbit, E_BITS);
e[1] = qt.decode_energy(e_index, E_BITS);
model[2].voiced = qt.unpack(bits, &nbit, 1);
model[3].voiced = qt.unpack(bits, &nbit, 1);
Wo_index = qt.unpack(bits, &nbit, WO_BITS);
model[3].Wo = qt.decode_Wo(&c2.c2const, Wo_index, WO_BITS);
model[3].L = PI/model[3].Wo;
e_index = qt.unpack(bits, &nbit, E_BITS);
e[3] = qt.decode_energy(e_index, E_BITS);
for(i=0; i<LSP_SCALAR_INDEXES; i++)
{
lsp_indexes[i] = qt.unpack(bits, &nbit, qt.lsp_bits(i));
}
qt.decode_lsps_scalar(&lsps[3][0], lsp_indexes, LPC_ORD);
qt.check_lsp_order(&lsps[3][0], LPC_ORD);
qt.bw_expand_lsps(&lsps[3][0], LPC_ORD, 50.0, 100.0);
/* interpolate ------------------------------------------------*/
/* Wo and energy are sampled every 20ms, so we interpolate just 1
10ms frame between 20ms samples */
interp_Wo(&model[0], &c2.prev_model_dec, &model[1], c2.c2const.Wo_min);
e[0] = interp_energy(c2.prev_e_dec, e[1]);
interp_Wo(&model[2], &model[1], &model[3], c2.c2const.Wo_min);
e[2] = interp_energy(e[1], e[3]);
/* LSPs are sampled every 40ms so we interpolate the 3 frames in
between, then recover spectral amplitudes */
for(i=0, weight=0.25; i<3; i++, weight += 0.25)
{
interpolate_lsp_ver2(&lsps[i][0], c2.prev_lsps_dec, &lsps[3][0], weight, LPC_ORD);
}
for(i=0; i<4; i++)
{
lsp_to_lpc(&lsps[i][0], &ak[i][0], LPC_ORD);
qt.aks_to_M2(&(c2.fftr_fwd_cfg), &ak[i][0], LPC_ORD, &model[i], e[i], &snr, 0, c2.lpc_pf, c2.bass_boost, c2.beta, c2.gamma, Aw);
qt.apply_lpc_correction(&model[i]);
synthesise_one_frame(&speech[c2.n_samp*i], &model[i], Aw, 3.0);
}
/* update memories for next frame ----------------------------*/
c2.prev_model_dec = model[3];
c2.prev_e_dec = e[3];
for(i=0; i<LPC_ORD; i++)
c2.prev_lsps_dec[i] = lsps[3][i];
}
/*---------------------------------------------------------------------------*\
FUNCTION....: synthesise_one_frame()
AUTHOR......: David Rowe
DATE CREATED: 23/8/2010
Synthesise 80 speech samples (10ms) from model parameters.
\*---------------------------------------------------------------------------*/
void CCodec2::synthesise_one_frame(short speech[], MODEL *model, std::complex<float> Aw[], float gain)
{
int i;
/* LPC based phase synthesis */
std::complex<float> H[MAX_AMP+1];
sample_phase(model, H, Aw);
phase_synth_zero_order(c2.n_samp, model, &c2.ex_phase, H);
postfilter(model, &c2.bg_est);
synthesise(c2.n_samp, &(c2.fftr_inv_cfg), c2.Sn_.data(), model, c2.Pn.data(), 1);
for(i=0; i<c2.n_samp; i++)
{
c2.Sn_[i] *= gain;
}
ear_protection(c2.Sn_.data(), c2.n_samp);
for(i=0; i<c2.n_samp; i++)
{
if (c2.Sn_[i] > 32767.0)
speech[i] = 32767;
else if (c2.Sn_[i] < -32767.0)
speech[i] = -32767;
else
speech[i] = c2.Sn_[i];
}
}
/*---------------------------------------------------------------------------*\
FUNCTION....: analyse_one_frame()
AUTHOR......: David Rowe
DATE CREATED: 23/8/2010
Extract sinusoidal model parameters from 80 speech samples (10ms of
speech).
\*---------------------------------------------------------------------------*/
void CCodec2::analyse_one_frame(MODEL *model, const short *speech)
{
std::complex<float> Sw[FFT_ENC];
float pitch;
int i;
int n_samp = c2.n_samp;
int m_pitch = c2.m_pitch;
/* Read input speech */
for(i=0; i<m_pitch-n_samp; i++)
c2.Sn[i] = c2.Sn[i+n_samp];
for(i=0; i<n_samp; i++)
c2.Sn[i+m_pitch-n_samp] = speech[i];
dft_speech(&c2.c2const, c2.fft_fwd_cfg, Sw, c2.Sn.data(), c2.w.data());
/* Estimate pitch */
nlp.nlp(c2.Sn.data(), n_samp, &pitch, &c2.prev_f0_enc);
model->Wo = TWO_PI/pitch;
model->L = PI/model->Wo;
/* estimate model parameters */
two_stage_pitch_refinement(&c2.c2const, model, Sw);
/* estimate phases when doing ML experiments */
estimate_amplitudes(model, Sw, 0);
est_voicing_mbe(&c2.c2const, model, Sw, c2.W);
}
/*---------------------------------------------------------------------------*\
FUNCTION....: ear_protection()
AUTHOR......: David Rowe
DATE CREATED: Nov 7 2012
Limits output level to protect ears when there are bit errors or the input
is overdriven. This doesn't correct or mask bit errors, just reduces the
worst of their damage.
\*---------------------------------------------------------------------------*/
void CCodec2::ear_protection(float in_out[], int n)
{
float max_sample, over, gain;
int i;
/* find maximum sample in frame */
max_sample = 0.0;
for(i=0; i<n; i++)
if (in_out[i] > max_sample)
max_sample = in_out[i];
/* determine how far above set point */
over = max_sample/30000.0;
/* If we are x dB over set point we reduce level by 2x dB, this
attenuates major excursions in amplitude (likely to be caused
by bit errors) more than smaller ones */
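/* Worked example (added): if max_sample = 60000 then over = 2.0, i.e.
   about 6dB above the 30000 set point, so gain = 1/(2*2) = 0.25 and the
   frame is attenuated by about 12dB. */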
if (over > 1.0)
{
gain = 1.0/(over*over);
for(i=0; i<n; i++)
in_out[i] *= gain;
}
}
/*---------------------------------------------------------------------------*\
sample_phase()
Samples the phase at the centre of each harmonic from an array of FFT_ENC
DFT samples.
\*---------------------------------------------------------------------------*/
void CCodec2::sample_phase(MODEL *model,
std::complex<float> H[],
std::complex<float> A[] /* LPC analysis filter in freq domain */
)
{
int m, b;
float r;
r = TWO_PI/(FFT_ENC);
/* Sample phase at harmonics */
for(m=1; m<=model->L; m++)
{
b = (int)(m*model->Wo/r + 0.5);
H[m] = std::conj(A[b]);
}
}
/*---------------------------------------------------------------------------*\
phase_synth_zero_order()
Synthesises phases based on SNR and a rule based approach. No phase
parameters are required apart from the SNR (which can be reduced to a
1 bit V/UV decision per frame).
The phase of each harmonic is modelled as the phase of a synthesis
filter excited by an impulse. In many Codec 2 modes the synthesis
filter is an LPC filter. Unlike the first order model, the position
of the impulse is not transmitted, so we create an excitation pulse
train using a rule based approach.
Consider a pulse train with a pulse starting time n=0, with pulses
repeated at a rate of Wo, the fundamental frequency. A pulse train
in the time domain is equivalent to harmonics in the frequency
domain. We can make an excitation pulse train using a sum of
sinusoids:
for(m=1; m<=L; m++)
ex[n] = cos(m*Wo*n)
Note: the Octave script ../octave/phase.m is an example of this if
you would like to try making a pulse train.
The phase of each excitation harmonic is:
arg(E[m]) = mWo
where E[m] are the complex excitation (freq domain) samples and
arg(x) just returns the phase of a complex sample x.
As we don't transmit the pulse position for this model, we need to
synthesise it. Now the excitation pulses occur at a rate of Wo.
This means the phase of the first harmonic advances by Wo*N_SAMP radians
over a synthesis frame of N_SAMP samples. For example if Wo is pi/20
(200 Hz), then over a 10ms frame (N_SAMP=80 samples), the phase of the
first harmonic would advance (pi/20)*80 = 4*pi or two complete
cycles.
We generate the excitation phase of the fundamental (first
harmonic):
arg[E[1]] = Wo*N_SAMP;
We then relate the phase of the m-th excitation harmonic to the
phase of the fundamental as:
arg(E[m]) = m*arg(E[1])
This E[m] then gets passed through the LPC synthesis filter to
determine the final harmonic phase.
Comparing to speech synthesised using original phases:
- Through headphones speech synthesised with this model is not as
good. Through a loudspeaker it is very close to original phases.
- If there are voicing errors, the speech can sound clicky or
staticky. If UV speech is mistakenly declared V, this model tends to
synthesise impulses or clicks, as there is usually very little shift or
dispersion through the LPC synthesis filter.
- When combined with LPC amplitude modelling there is an additional
drop in quality. I am not sure why; the theory is that inter-formant
energy is raised, making any phase errors more obvious.
NOTES:
1/ This synthesis model is effectively the same as a simple LPC-10
vocoder, and yet sounds much better. Why? Conventional wisdom
(AMBE, MELP) says mixed voicing is required for high quality
speech.
2/ I am pretty sure the Lincoln Lab sinusoidal coding guys (like xMBE
also from MIT) first described this zero phase model, I need to look
up the paper.
3/ Note that this approach could cause some discontinuities in
the phase at the edge of synthesis frames, as no attempt is made
to make sure that the phase tracks are continuous (the excitation
phases are continuous, but not the final phases after filtering
by the LPC spectra). Technically this is a bad thing. However
this may actually be a good thing, disturbing the phase tracks a
bit. More research needed, e.g. test a synthesis model that adds
a small delta-W to make phase tracks line up for voiced
harmonics.
\*---------------------------------------------------------------------------*/
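/*
   Added summary of the loop below: for a voiced frame the excitation phase
   of harmonic m is just m times the fundamental's phase track,

     ex_phase += Wo*n_samp;                      // advance fundamental
     for (int m = 1; m <= L; m++)
         Ex[m] = std::polar(1.0f, m*ex_phase);   // m-th excitation harmonic

   while for an unvoiced frame each Ex[m] gets an independent random phase.
   Each Ex[m] is then filtered by the LPC synthesis filter sample H[m].
*/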
void CCodec2::phase_synth_zero_order(
int n_samp,
MODEL *model,
float *ex_phase, /* excitation phase of fundamental */
std::complex<float> H[] /* L synthesis filter freq domain samples */
)
{
int m;
float new_phi;
std::complex<float> Ex[MAX_AMP+1]; /* excitation samples */
std::complex<float> A_[MAX_AMP+1]; /* synthesised harmonic samples */
/*
Update excitation fundamental phase track, this sets the position
of each pitch pulse during voiced speech. After much experiment
I found that using just this frame's Wo improved quality for UV
sounds compared to interpolating two frames Wo like this:
ex_phase[0] += (*prev_Wo+model->Wo)*N_SAMP/2;
*/
ex_phase[0] += (model->Wo)*n_samp;
ex_phase[0] -= TWO_PI*floorf(ex_phase[0]/TWO_PI + 0.5);
for(m=1; m<=model->L; m++)
{
/* generate excitation */
if (model->voiced)
{
Ex[m] = std::polar(1.0f, ex_phase[0] * m);
}
else
{
/* When a few samples were tested I found that LPC filter
phase is not needed in the unvoiced case, but no harm in
keeping it.
*/
float phi = TWO_PI*(float)codec2_rand()/CODEC2_RAND_MAX;
Ex[m] = std::polar(1.0f, phi);
}
/* filter using LPC filter */
A_[m].real(H[m].real() * Ex[m].real() - H[m].imag() * Ex[m].imag());
A_[m].imag(H[m].imag() * Ex[m].real() + H[m].real() * Ex[m].imag());
/* modify sinusoidal phase */
new_phi = atan2f(A_[m].imag(), A_[m].real()+1E-12);
model->phi[m] = new_phi;
}
}
/*---------------------------------------------------------------------------*\
postfilter()
The post filter is designed to help with speech corrupted by
background noise. The zero phase model tends to make speech with
background noise sound "clicky". With high levels of background
noise the low level inter-formant parts of the spectrum will contain
noise rather than speech harmonics, so modelling them as voiced
(i.e. a continuous, non-random phase track) is inaccurate.
Some codecs (like MBE) have a mixed voicing model that breaks the
spectrum into voiced and unvoiced regions. Several bits/frame
(5-12) are required to transmit the frequency selective voicing
information. Mixed excitation also requires accurate voicing
estimation (parameter estimators always break occasionally under
exceptional conditions).
In our case we use a post filter approach which requires no
additional bits to be transmitted. The decoder measures the average
level of the background noise during unvoiced frames. If a harmonic
is less than this level it is made unvoiced by randomising its
phases.
This idea is rather experimental. Some potential problems that may
happen:
1/ If someone says "aaaaaaaahhhhhhhhh" will the background estimator track
up to speech level? This would be a bad thing.
2/ If background noise suddenly disappears from the source speech does
the estimate drop quickly? What if noise suddenly re-appears?
3/ Background noise with a non-flat spectrum. The current algorithm just
considers the spectrum as a whole, but it could be broken up into
bands, each with its own estimator.
4/ Males and females with the same level of background noise. Check
performance is the same. Changing Wo affects the width of each band and may
affect bg energy estimates.
5/ Not sure what happens during long periods of voiced speech
e.g. "sshhhhhhh"
\*---------------------------------------------------------------------------*/
#define BG_THRESH 40.0 // only consider low level signals for bg_est
#define BG_BETA 0.1 // averaging filter constant
#define BG_MARGIN 6.0 // harmonics this far above BG noise are
// randomised. Helped make bg noise less
// spikey (impulsive) for mmt1, but speech was
// perhaps a little rougher.
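// Note (added): with BG_BETA = 0.1 the update below is a one-pole IIR
// average, bg_est = (1-BG_BETA)*bg_est + BG_BETA*e, so the estimate settles
// over roughly 1/BG_BETA = 10 qualifying (low level, unvoiced) frames.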
void CCodec2::postfilter( MODEL *model, float *bg_est )
{
int m, uv;
float e, thresh;
/* determine average energy across spectrum */
e = 1E-12;
for(m=1; m<=model->L; m++)
e += model->A[m]*model->A[m];
assert(e > 0.0);
e = 10.0*log10f(e/model->L);
/* If beneath threshold, update bg estimate. The idea
of the threshold is to prevent updating during high level
speech. */
if ((e < BG_THRESH) && !model->voiced)
*bg_est = *bg_est*(1.0 - BG_BETA) + e*BG_BETA;
/* now mess with phases during voiced frames to make any harmonics
less than our background estimate unvoiced.
*/
uv = 0;
thresh = exp10f((*bg_est + BG_MARGIN)/20.0);
if (model->voiced)
for(m=1; m<=model->L; m++)
if (model->A[m] < thresh)
{
model->phi[m] = (TWO_PI/CODEC2_RAND_MAX)*(float)codec2_rand();
uv++;
}
}
C2CONST CCodec2::c2const_create(int Fs, float framelength_s)
{
C2CONST c2const;
assert((Fs == 8000) || (Fs == 16000));
c2const.Fs = Fs;
c2const.n_samp = round(Fs*framelength_s);
c2const.max_amp = floor(Fs*P_MAX_S/2);
c2const.p_min = floor(Fs*P_MIN_S);
c2const.p_max = floor(Fs*P_MAX_S);
c2const.m_pitch = floor(Fs*M_PITCH_S);
c2const.Wo_min = TWO_PI/c2const.p_max;
c2const.Wo_max = TWO_PI/c2const.p_min;
if (Fs == 8000)
{
c2const.nw = 279;
}
else
{
c2const.nw = 511; /* actually a bit shorter in time but lets us maintain constant FFT size */
}
c2const.tw = Fs*TW_S;
/*
fprintf(stderr, "max_amp: %d m_pitch: %d\n", c2const.n_samp, c2const.m_pitch);
fprintf(stderr, "p_min: %d p_max: %d\n", c2const.p_min, c2const.p_max);
fprintf(stderr, "Wo_min: %f Wo_max: %f\n", c2const.Wo_min, c2const.Wo_max);
fprintf(stderr, "nw: %d tw: %d\n", c2const.nw, c2const.tw);
*/
return c2const;
}
/*---------------------------------------------------------------------------*\
FUNCTION....: make_analysis_window
AUTHOR......: David Rowe
DATE CREATED: 11/5/94
Init function that generates the time domain analysis window and its DFT.
\*---------------------------------------------------------------------------*/
void CCodec2::make_analysis_window(C2CONST *c2const, FFT_STATE *fft_fwd_cfg, float w[], float W[])
{
float m;
std::complex<float> wshift[FFT_ENC];
int i,j;
int m_pitch = c2const->m_pitch;
int nw = c2const->nw;
/*
Generate Hamming window centered on M-sample pitch analysis window
0 M/2 M-1
|-------------|-------------|
|-------|-------|
nw samples
All our analysis/synthesis is centred on the M/2 sample.
*/
m = 0.0;
for(i=0; i<m_pitch/2-nw/2; i++)
w[i] = 0.0;
for(i=m_pitch/2-nw/2,j=0; i<m_pitch/2+nw/2; i++,j++)
{
w[i] = 0.5 - 0.5*cosf(TWO_PI*j/(nw-1));
m += w[i]*w[i];
}
for(i=m_pitch/2+nw/2; i<m_pitch; i++)
w[i] = 0.0;
/* Normalise - makes freq domain amplitude estimation
straightforward */
m = 1.0/sqrtf(m*FFT_ENC);
for(i=0; i<m_pitch; i++)
{
w[i] *= m;
}
/*
Generate DFT of analysis window, used for later processing. Note
we modulo FFT_ENC shift the time domain window w[], this makes the
imaginary part of the DFT W[] equal to zero as the shifted w[] is
even about the n=0 time axis if nw is odd. Having the imag part
of the DFT W[] equal to zero makes computation easier.
0 FFT_ENC-1
|-------------------------|
----\ /----
\ /
\ / <- shifted version of window w[n]
\ /
\ /
-------
|---------| |---------|
nw/2 nw/2
*/
std::complex<float> temp[FFT_ENC];
for(i=0; i<FFT_ENC; i++)
{
wshift[i] = std::complex<float>(0.0f, 0.0f);
}
for(i=0; i<nw/2; i++)
wshift[i].real(w[i+m_pitch/2]);
for(i=FFT_ENC-nw/2,j=m_pitch/2-nw/2; i<FFT_ENC; i++,j++)
wshift[i].real(w[j]);
kiss.fft(*fft_fwd_cfg, wshift, temp);
/*
Re-arrange W[] to be symmetrical about FFT_ENC/2. Makes later
analysis convenient.
Before:
0 FFT_ENC-1
|----------|---------|
__ _
\ /
\_______________/
After:
0 FFT_ENC-1
|----------|---------|
___
/ \
________/ \_______
*/
for(i=0; i<FFT_ENC/2; i++)
{
W[i] = temp[i + FFT_ENC / 2].real();
W[i + FFT_ENC / 2] = temp[i].real();
}
}
/*---------------------------------------------------------------------------*\
FUNCTION....: dft_speech
AUTHOR......: David Rowe
DATE CREATED: 27/5/94
Finds the DFT of the current input speech frame.
\*---------------------------------------------------------------------------*/
void CCodec2::dft_speech(C2CONST *c2const, FFT_STATE &fft_fwd_cfg, std::complex<float> Sw[], float Sn[], float w[])
{
int i;
int m_pitch = c2const->m_pitch;
int nw = c2const->nw;
for(i=0; i<FFT_ENC; i++) {
Sw[i] = std::complex<float>(0.0f, 0.0f);
}
/* Centre analysis window on time axis, we need to arrange input
to FFT this way to make FFT phases correct */
/* move 2nd half to start of FFT input vector */
for(i=0; i<nw/2; i++)
Sw[i].real(Sn[i+m_pitch/2]*w[i+m_pitch/2]);
/* move 1st half to end of FFT input vector */
for(i=0; i<nw/2; i++)
Sw[FFT_ENC-nw/2+i].real(Sn[i+m_pitch/2-nw/2]*w[i+m_pitch/2-nw/2]);
nlp.codec2_fft_inplace(fft_fwd_cfg, Sw);
}
/*---------------------------------------------------------------------------*\
FUNCTION....: two_stage_pitch_refinement
AUTHOR......: David Rowe
DATE CREATED: 27/5/94
Refines the current pitch estimate using the harmonic sum pitch
estimation technique.
\*---------------------------------------------------------------------------*/
void CCodec2::two_stage_pitch_refinement(C2CONST *c2const, MODEL *model, std::complex<float> Sw[])
{
float pmin,pmax,pstep; /* pitch refinement minimum, maximum and step */
/* Coarse refinement */
pmax = TWO_PI/model->Wo + 5;
pmin = TWO_PI/model->Wo - 5;
pstep = 1.0;
hs_pitch_refinement(model, Sw, pmin, pmax, pstep);
/* Fine refinement */
pmax = TWO_PI/model->Wo + 1;
pmin = TWO_PI/model->Wo - 1;
pstep = 0.25;
hs_pitch_refinement(model,Sw,pmin,pmax,pstep);
/* Limit range */
if (model->Wo < TWO_PI/c2const->p_max)
model->Wo = TWO_PI/c2const->p_max;
if (model->Wo > TWO_PI/c2const->p_min)
model->Wo = TWO_PI/c2const->p_min;
model->L = floorf(PI/model->Wo);
/* trap occasional round off issues with floorf() */
if (model->Wo*model->L >= 0.95*PI)
{
model->L--;
}
assert(model->Wo*model->L < PI);
}
/*---------------------------------------------------------------------------*\
FUNCTION....: hs_pitch_refinement
AUTHOR......: David Rowe
DATE CREATED: 27/5/94
Harmonic sum pitch refinement function.
pmin pitch search range minimum
pmax pitch search range maximum
step pitch search step size
model current pitch estimate in model.Wo
model refined pitch estimate in model.Wo
\*---------------------------------------------------------------------------*/
void CCodec2::hs_pitch_refinement(MODEL *model, std::complex<float> Sw[], float pmin, float pmax, float pstep)
{
int m; /* loop variable */
int b; /* bin for current harmonic centre */
float E; /* energy for current pitch*/
float Wo; /* current "test" fundamental freq. */
float Wom; /* Wo that maximises E */
float Em; /* maximum energy */
float r, one_on_r; /* number of rads/bin */
float p; /* current pitch */
/* Initialisation */
model->L = PI/model->Wo; /* use initial pitch est. for L */
Wom = model->Wo;
Em = 0.0;
r = TWO_PI/FFT_ENC;
one_on_r = 1.0/r;
/* Determine harmonic sum for a range of Wo values */
for(p=pmin; p<=pmax; p+=pstep)
{
E = 0.0;
Wo = TWO_PI/p;
/* Sum harmonic magnitudes */
for(m=1; m<=model->L; m++)
{
b = (int)(m*Wo*one_on_r + 0.5);
E += Sw[b].real() * Sw[b].real() + Sw[b].imag() * Sw[b].imag();
}
/* Compare to see if this is a maximum */
if (E > Em)
{
Em = E;
Wom = Wo;
}
}
model->Wo = Wom;
}
/*---------------------------------------------------------------------------*\
FUNCTION....: estimate_amplitudes
AUTHOR......: David Rowe
DATE CREATED: 27/5/94
Estimates the complex amplitudes of the harmonics.
\*---------------------------------------------------------------------------*/
void CCodec2::estimate_amplitudes(MODEL *model, std::complex<float> Sw[], int est_phase)
{
int i,m; /* loop variables */
int am,bm; /* bounds of current harmonic */
float den; /* denominator of amplitude expression */
float r = TWO_PI/FFT_ENC;
float one_on_r = 1.0/r;
for(m=1; m<=model->L; m++)
{
/* Estimate amplitude of harmonic */
den = 0.0;
am = (int)((m - 0.5)*model->Wo*one_on_r + 0.5);
bm = (int)((m + 0.5)*model->Wo*one_on_r + 0.5);
for(i=am; i<bm; i++)
{
den += Sw[i].real() * Sw[i].real() + Sw[i].imag() * Sw[i].imag();
}
model->A[m] = sqrtf(den);
if (est_phase)
{
int b = (int)(m*model->Wo/r + 0.5); /* DFT bin of centre of current harmonic */
/* Estimate phase of harmonic, this is expensive in CPU for
embedded devices so we make it an option */
model->phi[m] = atan2f(Sw[b].imag(), Sw[b].real());
}
}
}
/*---------------------------------------------------------------------------*\
est_voicing_mbe()
Returns the error of the MBE cost function for a given F0.
Note: I think a lot of the operations below can be simplified as
W[].imag = 0 and W[] has been normalised such that den always equals 1.
\*---------------------------------------------------------------------------*/
float CCodec2::est_voicing_mbe( C2CONST *c2const, MODEL *model, std::complex<float> Sw[], float W[])
{
int l,al,bl,m; /* loop variables */
std::complex<float> Am; /* amplitude sample for this band */
int offset; /* centers Hw[] about current harmonic */
float den; /* denominator of Am expression */
float error; /* accumulated error between original and synthesised */
float Wo;
float sig, snr;
float elow, ehigh, eratio;
float sixty;
std::complex<float> Ew(0, 0);
int l_1000hz = model->L*1000.0/(c2const->Fs/2);
sig = 1E-4;
for(l=1; l<=l_1000hz; l++)
{
sig += model->A[l]*model->A[l];
}
Wo = model->Wo;
error = 1E-4;
/* Just test across the harmonics in the first 1000 Hz */
for(l=1; l<=l_1000hz; l++)
{
Am = std::complex<float>(0.0f, 0.0f);
den = 0.0;
al = ceilf((l - 0.5)*Wo*FFT_ENC/TWO_PI);
bl = ceilf((l + 0.5)*Wo*FFT_ENC/TWO_PI);
/* Estimate amplitude of harmonic assuming harmonic is totally voiced */
offset = FFT_ENC/2 - l*Wo*FFT_ENC/TWO_PI + 0.5;
for(m=al; m<bl; m++)
{
Am += W[offset+m] * Sw[m];
den += W[offset+m]*W[offset+m];
}
Am /= den;
/* Determine error between estimated harmonic and original */
for(m=al; m<bl; m++)
{
Ew = Sw[m] - (W[offset+m] * Am);
error += Ew.real() * Ew.real() + Ew.imag() * Ew.imag();
}
}
snr = 10.0*log10f(sig/error);
if (snr > V_THRESH)
model->voiced = 1;
else
model->voiced = 0;
/* post processing, helps clean up some voicing errors ------------------*/
/*
Determine the ratio of low frequency to high frequency energy,
voiced speech tends to be dominated by low frequency energy,
unvoiced by high frequency. This measure can be used to
determine if we have made any gross errors.
*/
int l_2000hz = model->L*2000.0/(c2const->Fs/2);
int l_4000hz = model->L*4000.0/(c2const->Fs/2);
elow = ehigh = 1E-4;
for(l=1; l<=l_2000hz; l++)
{
elow += model->A[l]*model->A[l];
}
for(l=l_2000hz; l<=l_4000hz; l++)
{
ehigh += model->A[l]*model->A[l];
}
eratio = 10.0*log10f(elow/ehigh);
/* Look for Type 1 errors, strongly V speech that has been
accidentally declared UV */
if (model->voiced == 0)
if (eratio > 10.0)
model->voiced = 1;
/* Look for Type 2 errors, strongly UV speech that has been
accidentally declared V */
if (model->voiced == 1)
{
if (eratio < -10.0)
model->voiced = 0;
/* A common source of Type 2 errors is the pitch estimator
gives a low (50Hz) estimate for UV speech, which gives a
good match with noise due to the close harmonic spacing.
These errors are much more common than people with 50Hz
pitch, so we have just a small eratio threshold. */
sixty = 60.0*TWO_PI/c2const->Fs;
if ((eratio < -4.0) && (model->Wo <= sixty))
model->voiced = 0;
}
//printf(" v: %d snr: %f eratio: %3.2f %f\n",model->voiced,snr,eratio,dF0);
return snr;
}
/*---------------------------------------------------------------------------*\
FUNCTION....: make_synthesis_window
AUTHOR......: David Rowe
DATE CREATED: 11/5/94
Init function that generates the trapezoidal (Parzen) synthesis window.
\*---------------------------------------------------------------------------*/
void CCodec2::make_synthesis_window(C2CONST *c2const, float Pn[])
{
int i;
float win;
int n_samp = c2const->n_samp;
int tw = c2const->tw;
/* Generate Parzen window in time domain */
win = 0.0;
for(i=0; i<n_samp/2-tw; i++)
Pn[i] = 0.0;
win = 0.0;
for(i=n_samp/2-tw; i<n_samp/2+tw; win+=1.0/(2*tw), i++ )
Pn[i] = win;
for(i=n_samp/2+tw; i<3*n_samp/2-tw; i++)
Pn[i] = 1.0;
win = 1.0;
for(i=3*n_samp/2-tw; i<3*n_samp/2+tw; win-=1.0/(2*tw), i++)
Pn[i] = win;
for(i=3*n_samp/2+tw; i<2*n_samp; i++)
Pn[i] = 0.0;
}
/*---------------------------------------------------------------------------*\
FUNCTION....: synthesise
AUTHOR......: David Rowe
DATE CREATED: 20/2/95
Synthesise a speech signal in the frequency domain from the
sinusoidal model parameters. Uses overlap-add with a trapezoidal
window to smoothly interpolate between frames.
\*---------------------------------------------------------------------------*/
void CCodec2::synthesise(
int n_samp,
FFTR_STATE *fftr_inv_cfg,
float Sn_[], /* time domain synthesised signal */
MODEL *model, /* ptr to model parameters for this frame */
float Pn[], /* time domain Parzen window */
int shift /* flag used to handle transition frames */
)
{
int i,l,j,b; /* loop variables */
std::complex<float> Sw_[FFT_DEC/2+1]; /* DFT of synthesised signal */
float sw_[FFT_DEC]; /* synthesised signal */
if (shift)
{
/* Update memories */
for(i=0; i<n_samp-1; i++)
{
Sn_[i] = Sn_[i+n_samp];
}
Sn_[n_samp-1] = 0.0;
}
for(i=0; i<FFT_DEC/2+1; i++)
{
Sw_[i].real(0);
Sw_[i].imag(0);
}
/* Now set up frequency domain synthesised speech */
for(l=1; l<=model->L; l++)
{
b = (int)(l*model->Wo*FFT_DEC/TWO_PI + 0.5);
if (b > ((FFT_DEC/2)-1))
{
b = (FFT_DEC/2)-1;
}
Sw_[b] = std::polar(model->A[l], model->phi[l]);
}
/* Perform inverse DFT */
kiss.fftri(*fftr_inv_cfg, Sw_,sw_);
/* Overlap add to previous samples */
for(i=0; i<n_samp-1; i++)
{
Sn_[i] += sw_[FFT_DEC-n_samp+1+i]*Pn[i];
}
if (shift)
for(i=n_samp-1,j=0; i<2*n_samp; i++,j++)
Sn_[i] = sw_[j]*Pn[i];
else
for(i=n_samp-1,j=0; i<2*n_samp; i++,j++)
Sn_[i] += sw_[j]*Pn[i];
}
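/* codec2_rand() below is a minimal linear congruential generator (the
   constants 1103515245 and 12345 are those of the classic ANSI C reference
   rand()). Presumably kept local so that unvoiced phase randomisation is
   repeatable across platforms. */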
int CCodec2::codec2_rand(void)
{
static unsigned long next = 1;
next = next * 1103515245 + 12345;
return((unsigned)(next/65536) % 32768);
}
/*---------------------------------------------------------------------------*\
FUNCTION....: interp_Wo()
AUTHOR......: David Rowe
DATE CREATED: 22 May 2012
Interpolates centre 10ms sample of Wo and L samples given two
samples 20ms apart. Assumes voicing is available for centre
(interpolated) frame.
\*---------------------------------------------------------------------------*/
void CCodec2::interp_Wo(
MODEL *interp, /* interpolated model params */
MODEL *prev, /* previous frames model params */
MODEL *next, /* next frames model params */
float Wo_min
)
{
interp_Wo2(interp, prev, next, 0.5, Wo_min);
}
/*---------------------------------------------------------------------------*\
FUNCTION....: interp_Wo2()
AUTHOR......: David Rowe
DATE CREATED: 22 May 2012
Weighted interpolation of two Wo samples.
\*---------------------------------------------------------------------------*/
void CCodec2::interp_Wo2(
MODEL *interp, /* interpolated model params */
MODEL *prev, /* previous frames model params */
MODEL *next, /* next frames model params */
float weight,
float Wo_min
)
{
/* trap corner case where voicing est is probably wrong */
if (interp->voiced && !prev->voiced && !next->voiced)
{
interp->voiced = 0;
}
/* Wo depends on voicing of this and adjacent frames */
if (interp->voiced)
{
if (prev->voiced && next->voiced)
interp->Wo = (1.0 - weight)*prev->Wo + weight*next->Wo;
if (!prev->voiced && next->voiced)
interp->Wo = next->Wo;
if (prev->voiced && !next->voiced)
interp->Wo = prev->Wo;
}
else
{
interp->Wo = Wo_min;
}
interp->L = PI/interp->Wo;
}
/*---------------------------------------------------------------------------*\
FUNCTION....: interp_energy()
AUTHOR......: David Rowe
DATE CREATED: 22 May 2012
Interpolates centre 10ms sample of energy given two samples 20ms
apart.
\*---------------------------------------------------------------------------*/
float CCodec2::interp_energy(float prev_e, float next_e)
{
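// Example (added): interp_energy(100.0, 400.0) returns 200.0, the same
// value as averaging the two energies in the log (dB) domain.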
//return powf(10.0, (log10f(prev_e) + log10f(next_e))/2.0);
return sqrtf(prev_e * next_e); // geometric mean: mathematically identical to the log-domain average above, but faster
}
/*---------------------------------------------------------------------------*\
FUNCTION....: interpolate_lsp_ver2()
AUTHOR......: David Rowe
DATE CREATED: 22 May 2012
Weighted interpolation of LSPs.
\*---------------------------------------------------------------------------*/
void CCodec2::interpolate_lsp_ver2(float interp[], float prev[], float next[], float weight, int order)
{
int i;
for(i=0; i<order; i++)
interp[i] = (1.0 - weight)*prev[i] + weight*next[i];
}
/*---------------------------------------------------------------------------*\
Introduction to Line Spectrum Pairs (LSPs)
------------------------------------------
LSPs are used to encode the LPC filter coefficients {ak} for
transmission over the channel. LSPs have several properties (like
less sensitivity to quantisation noise) that make them superior to
direct quantisation of {ak}.
A(z) is a polynomial of order lpcrdr with {ak} as the coefficients.
A(z) is transformed to P(z) and Q(z) (using a substitution and some
algebra), to obtain something like:
A(z) = 0.5[P(z)(z+z^-1) + Q(z)(z-z^-1)] (1)
As you can imagine A(z) has complex zeros all over the z-plane. P(z)
and Q(z) have the very neat property of only having zeros _on_ the
unit circle. So to find them we take a test point z=exp(jw) and
evaluate P (exp(jw)) and Q(exp(jw)) using a grid of points between 0
and pi.
The zeros (roots) of P(z) also happen to alternate, which is why we
swap coefficients as we find roots. So the process of finding the
LSP frequencies is basically finding the roots of 5th order
polynomials.
The roots of P(z) and Q(z) occur in symmetrical pairs at +/-w, hence
the name Line Spectrum Pairs (LSPs).
To convert back to ak we just evaluate (1): "clocking" an impulse
through it lpcrdr times gives us the impulse response of A(z), which is
{ak}.
\*---------------------------------------------------------------------------*/
/*---------------------------------------------------------------------------*\
FUNCTION....: lsp_to_lpc()
AUTHOR......: David Rowe
DATE CREATED: 24/2/93
This function converts LSP coefficients to LPC coefficients. In the
Speex code we worked out a way to simplify this significantly.
\*---------------------------------------------------------------------------*/
void CCodec2::lsp_to_lpc(float *lsp, float *ak, int order)
/* float *freq array of LSP frequencies in radians */
/* float *ak array of LPC coefficients */
/* int order order of LPC coefficients */
{
int i,j;
float xout1,xout2,xin1,xin2;
float *pw,*n1,*n2,*n3,*n4 = 0;
float freq[order];
float Wp[(order * 4) + 2];
/* convert from radians to the x=cos(w) domain */
for(i=0; i<order; i++)
freq[i] = cosf(lsp[i]);
pw = Wp;
/* initialise contents of array */
for(i=0; i<=4*(order/2)+1; i++) /* set contents of buffer to 0 */
{
*pw++ = 0.0;
}
/* Set pointers up */
pw = Wp;
xin1 = 1.0;
xin2 = 1.0;
/* reconstruct P(z) and Q(z) by cascading second order polynomials
in the form 1 - 2xz^(-1) + z^(-2), where x is the LSP coefficient */
for(j=0; j<=order; j++)
{
for(i=0; i<(order/2); i++)
{
n1 = pw+(i*4);
n2 = n1 + 1;
n3 = n2 + 1;
n4 = n3 + 1;
xout1 = xin1 - 2*(freq[2*i]) * *n1 + *n2;
xout2 = xin2 - 2*(freq[2*i+1]) * *n3 + *n4;
*n2 = *n1;
*n4 = *n3;
*n1 = xin1;
*n3 = xin2;
xin1 = xout1;
xin2 = xout2;
}
xout1 = xin1 + *(n4+1);
xout2 = xin2 - *(n4+2);
ak[j] = (xout1 + xout2)*0.5;
*(n4+1) = xin1;
*(n4+2) = xin2;
xin1 = 0.0;
xin2 = 0.0;
}
}