Spaces:
Running
Running
/* | |
Copyright 2015 Google Inc. All rights reserved. | |
Licensed under the Apache License, Version 2.0 (the "License"); | |
you may not use this file except in compliance with the License. | |
You may obtain a copy of the License at | |
http://www.apache.org/licenses/LICENSE-2.0 | |
Unless required by applicable law or agreed to in writing, software | |
distributed under the License is distributed on an "AS IS" BASIS, | |
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
See the License for the specific language governing permissions and | |
limitations under the License. | |
*/ | |
// Usage and help text. main() prints it when too few arguments are given
// and when the -h option is used.
const char* kHelp = "Usage: <bin> -i <input_file> "
    "[-f <f0_output> -p <pitchmarks_output> \\\n"
    "-c <correlations_output> "
    "-t "
    "-s "
    "-e <float> "
    "-x <float> "
    "-m <float> \\\n"
    "-u <float> "
    "-w <float> "
    "-a "
    "-d <debug_output_basename>] "
    "\n\n Help:\n"
    "-h print this help message and exit\n"
    "-t enables a Hilbert transform that may reduce phase distortion\n"
    "-s suppress applying high pass filter at 80Hz "
    "(rumble-removal highpass filter)\n"
    "-e specifies the output frame interval for F0\n"
    "-x maximum f0 to look for\n"
    "-m minimum f0 to look for\n"
    "-u regular inter-mark interval to use in UV pitchmark regions\n"
    "-w set the cost for unvoiced segments\n"
    " (def. 0.9, the higher the value the more f0 estimates in noise)\n"
    "-a saves F0 and PM output in ascii mode\n"
    "-d write diagnostic output to this file pattern\n"
    "\nOutputs:\n"
    "The output files specified with -f and -p are Edinburgh Speech Tool\n"
    "(EST) style files. These are output as binary by default, ASCII with\n"
    "the -a option. The F0 values in the f0_output file are resampled with\n"
    "the frame interval specified by option -e (default .005 s).\n"
    "The unvoiced regions of the pitchmark file are filled with marks spaced\n"
    "by the interval specified with -u (default .01 s).\n"
    "\nIt is strongly recommended that the default high-pass filter be\n"
    "applied to all input signals to remove DC and low-frequency\n"
    "components. For signals that have been recorded using close-talking\n"
    "microphones, or those that have been subjected to various other\n"
    "non-linear phase distortions in the studio, or in post-production, it\n"
    "is often helpful to apply a Hilbert transform (-t option). The .resid\n"
    "file output when -d is specified can be examined to determine if the\n"
    "voice pulses look anything like the classical glottal-flow derivative,\n"
    "and the Hilbert transform enabled, or not.\n"
    "\n"
    "In the discussion below, the following notation is used:\n"
    "Fs: sample rate\n"
    "Fs0: sample rate of input file\n"
    "nd: n-dimensional vector signal\n"
    "ts: time-stamped vector signal; 1st ele. is the time of the sample in sec.\n"
    "float, int16, etc.: atomic type of the data in the file\n"
    "\nIf -d <name> is specified, the following raw binary output files are produced,\n"
    "with the base path as specified in <name>, and the indicated extensions:\n"
    ".bestcorr 2d float ts the highest NCC value found at each residual peak\n"
    ".bprms 1d float Fs=500 RMS of 100-1000 Hz bandpassed input signal\n"
    ".f0ap 3d float ts F0 and NCC value found for each period\n"
    ".resid 1d float Fs=Fs0 LPC residual of conditioned input signal\n"
    ".nresid 1d float Fs=Fs0 LPC residual with local gain normalization\n"
    ".offsetp 1d float Fs=500 pseudo-probability that voicing is terminating\n"
    ".onsetp 1d float Fs=500 pseudo-probability that voicing is starting\n"
    ".pvoiced 1d float Fs=500 pseudo-probability that voicing is occurring\n"
    ".pcm 1d float Fs=Fs0 conditioned input signal\n"
    ".pmlab ASCII 'xlabel' format file of epoch marks\n"
    ".pvals 1d float Fs=Fs0 graded residual peak candidates\n";
// Builds a pitchmark Track from the epochs reported by the tracker.
//
// Asks `et` for its filled epoch list, passing `unvoiced_pm_interval`
// through to GetFilledEpochs(), then copies each returned time and voicing
// value into a newly allocated Track.
//
// Returns a heap-allocated Track; the caller owns it and must delete it.
Track* MakeEpochOutput(EpochTracker &et, float unvoiced_pm_interval) {
  std::vector<float> times;
  std::vector<int16_t> voicing;
  et.GetFilledEpochs(unvoiced_pm_interval, &times, &voicing);

  Track* pm_track = new Track;
  pm_track->resize(times.size());
  // Take the count once as a signed value so the loop index and the bound
  // have the same signedness.
  // NOTE(review): assumes GetFilledEpochs() fills `voicing` with at least as
  // many entries as `times` -- confirm against EpochTracker.
  const int32_t num_marks = static_cast<int32_t>(times.size());
  for (int32_t i = 0; i < num_marks; ++i) {
    pm_track->t(i) = times[i];
    pm_track->set_v(i, voicing[i]);
  }
  return pm_track;
}
// Builds the F0 and correlation Tracks from the tracker's resampled results.
//
// Calls et.ResampleAndReturnResults() with `resample_interval` and, on
// success, creates two Tracks whose frame i is stamped with time
// resample_interval * i. A frame is marked voiced when its F0 value is
// positive; unvoiced frames store -1.0 as the F0 value. The correlation
// Track stores the returned correlation value for every frame.
//
// On success returns the new F0 Track and stores the new correlation Track
// in *cor; the caller owns both and must delete them. On failure returns
// NULL and sets *cor to NULL. `cor` itself must not be NULL.
Track* MakeF0Output(EpochTracker &et, float resample_interval, Track** cor) {
  // Give the out-parameter a defined value on every path, including failure.
  *cor = NULL;

  std::vector<float> f0;
  std::vector<float> corr;
  if (!et.ResampleAndReturnResults(resample_interval, &f0, &corr)) {
    return NULL;
  }

  Track* f0_track = new Track;
  Track* cor_track = new Track;
  f0_track->resize(f0.size());
  cor_track->resize(corr.size());

  // NOTE(review): the loop indexes `corr` and `cor_track` with F0 frame
  // indices, so it assumes `corr` is at least as long as `f0` -- confirm
  // against ResampleAndReturnResults().
  const int32_t num_frames = static_cast<int32_t>(f0.size());
  for (int32_t i = 0; i < num_frames; ++i) {
    const float t = resample_interval * i;
    const bool voiced = f0[i] > 0.0f;
    f0_track->t(i) = t;
    cor_track->t(i) = t;
    f0_track->set_v(i, voiced);
    cor_track->set_v(i, voiced);
    f0_track->a(i) = voiced ? f0[i] : -1.0f;
    cor_track->a(i) = corr[i];
  }

  *cor = cor_track;
  return f0_track;
}
// Runs feature computation and epoch tracking on `et`, then builds the
// pitchmark, F0 and correlation Tracks.
//
// Returns true on success, with *pm and *f0 pointing at newly allocated
// Tracks and *corr set by MakeF0Output(); all three must be freed by the
// caller. Returns false if feature computation, tracking or F0 output
// creation fails; in that case *pm and *f0 are left unmodified.
bool ComputeEpochsAndF0(EpochTracker &et, float unvoiced_pulse_interval,
                        float external_frame_interval,
                        Track** pm, Track** f0, Track** corr) {
  if (!et.ComputeFeatures()) {
    return false;
  }
  const bool tr_result = et.TrackEpochs();
  et.WriteDiagnostics("");  // Try to save them here, even after tracking failure.
  if (!tr_result) {
    fprintf(stderr, "Problems in TrackEpochs\n");
    return false;
  }

  // Create pm and f0 objects; these need to be freed in the calling client.
  Track* pm_track = MakeEpochOutput(et, unvoiced_pulse_interval);
  Track* f0_track = MakeF0Output(et, external_frame_interval, corr);
  if (f0_track == NULL) {
    // MakeF0Output() returns NULL when resampling fails. Report failure
    // rather than handing the caller a NULL F0 track along with "true".
    fprintf(stderr, "Problems creating F0 output\n");
    delete pm_track;
    return false;
  }
  *pm = pm_track;
  *f0 = f0_track;
  return true;
}
int main(int argc, char* argv[]) { | |
int opt = 0; | |
std::string filename; | |
std::string f0_output; | |
std::string pm_output; | |
std::string corr_output; | |
bool do_hilbert_transform = kDoHilbertTransform; | |
bool do_high_pass = kDoHighpass; | |
float external_frame_interval = kExternalFrameInterval; | |
float max_f0 = kMaxF0Search; | |
float min_f0 = kMinF0Search; | |
float inter_pulse = kUnvoicedPulseInterval; | |
float unvoiced_cost = kUnvoicedCost; | |
bool ascii = false; | |
std::string debug_output; | |
if (argc < 3) { | |
fprintf(stdout, "\n%s\n", kHelp); | |
return 1; | |
} | |
while ((opt = getopt(argc, argv, "i:f:p:c:htse:x:m:u:w:ad:")) != -1) { | |
switch(opt) { | |
case 'i': | |
filename = optarg; | |
break; | |
case 'f': | |
f0_output = optarg; | |
break; | |
case 'p': | |
pm_output = optarg; | |
break; | |
case 'c': | |
corr_output = optarg; | |
break; | |
case 't': | |
do_hilbert_transform = true; | |
break; | |
case 's': | |
do_high_pass = false; | |
break; | |
case 'e': | |
external_frame_interval = atof(optarg); | |
break; | |
case 'x': | |
max_f0 = atof(optarg); | |
break; | |
case 'm': | |
min_f0 = atof(optarg); | |
break; | |
case 'u': | |
inter_pulse = atof(optarg); | |
break; | |
case 'w': | |
unvoiced_cost = atof(optarg); | |
break; | |
case 'a': | |
ascii = true; | |
break; | |
case 'd': | |
debug_output = optarg; | |
break; | |
case 'h': | |
fprintf(stdout, "\n%s\n", kHelp); | |
return 0; | |
} | |
} | |
// Load input. | |
Wave wav; | |
if (!wav.Load(filename)) { | |
fprintf(stderr, "Failed to load waveform '%s'\n", filename.c_str()); | |
return 1; | |
} | |
EpochTracker et; | |
et.set_unvoiced_cost(unvoiced_cost); | |
int16_t* wave_datap = const_cast<int16_t *>(wav.data()->data()); | |
int32_t n_samples = wav.num_samples(); | |
float sample_rate = wav.sample_rate(); | |
if (!et.Init(wave_datap, n_samples, sample_rate, | |
min_f0, max_f0, do_high_pass, do_hilbert_transform)) { | |
return 1; | |
} | |
if (!debug_output.empty()) { | |
et.set_debug_name(debug_output); | |
} | |
// Compute f0 and pitchmarks. | |
Track *f0 = NULL; | |
Track *pm = NULL; | |
Track *corr = NULL; | |
if (!ComputeEpochsAndF0(et, inter_pulse, external_frame_interval, &pm, &f0, &corr)) { | |
fprintf(stderr, "Failed to compute epochs\n"); | |
return 1; | |
} | |
// Save outputs. | |
if (!f0_output.empty() && !f0->Save(f0_output, ascii)) { | |
delete f0; | |
fprintf(stderr, "Failed to save f0 to '%s'\n", f0_output.c_str()); | |
return 1; | |
} | |
if (!pm_output.empty() && !pm->Save(pm_output, ascii)) { | |
delete pm; | |
fprintf(stderr, "Failed to save pitchmarks to '%s'\n", pm_output.c_str()); | |
return 1; | |
} | |
if (!corr_output.empty() && !corr->Save(corr_output, ascii)) { | |
delete corr; | |
fprintf(stderr, "Failed to save correlations to '%s'\n", corr_output.c_str()); | |
return 1; | |
} | |
delete f0; | |
delete pm; | |
delete corr; | |
return 0; | |
} | |