# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
During inference, we perform frame-level prediction by two approaches:
1) shift the window of length window_length_in_sec (e.g. 0.63s) by shift_length_in_sec (e.g. 10ms) to generate the frame and use the prediction of the window to represent the label for the frame;
[this script demonstrate how to do this approach]
2) generate predictions with overlapping input segments. Then a smoothing filter is applied to decide the label for a frame spanned by multiple segments.
[get frame level prediction by this script and use vad_overlap_posterior.py in NeMo/scripts/voice_activity_detection
One can also find posterior about converting frame level prediction
to speech/no-speech segment in start and end times format in that script.]
Image https://raw.githubusercontent.com/NVIDIA/NeMo/main/tutorials/asr/images/vad_post_overlap_diagram.png
will help you understand this method.
This script will also help you perform postprocessing and generate speech segments if needed
Usage:
python vad_infer.py --config-path="../conf/vad" --config-name="vad_inference_postprocessing.yaml" dataset=<Path of json file of evaluation data. Audio files should have unique names>
"""
import json
import os
import torch
from nemo.collections.asr.parts.utils.speaker_utils import write_rttm2manifest
from nemo.collections.asr.parts.utils.vad_utils import (
    generate_overlap_vad_seq,
    generate_vad_frame_pred,
    generate_vad_segment_table,
    init_vad_model,
    prepare_manifest,
)
from nemo.core.config import hydra_runner
from nemo.utils import logging
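
# Run inference on the first GPU when available; otherwise fall back to CPU.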
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
@hydra_runner(config_path="../conf/vad", config_name="vad_inference_postprocessing.yaml")
def main(cfg):
    if not cfg.dataset:
        raise ValueError("You must provide the path to a JSON manifest of evaluation data")
    # Each line of the dataset manifest should have a different audio_filepath and a unique
    # file name, to simplify the edge cases handled below.
    key_meta_map = {}
    with open(cfg.dataset, 'r') as manifest:
        for line in manifest.readlines():
            audio_filepath = json.loads(line.strip())['audio_filepath']
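            # The unique key is the base name of the audio file without its extension,
            # e.g. "/data/audio/session_01.wav" -> "session_01" (illustrative path, not from this repo).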
            uniq_audio_name = audio_filepath.split('/')[-1].rsplit('.', 1)[0]
            if uniq_audio_name in key_meta_map:
                raise ValueError("Please make sure each line has a different audio_filepath!")
            key_meta_map[uniq_audio_name] = {'audio_filepath': audio_filepath}

    # Prepare manifest for streaming VAD
    manifest_vad_input = cfg.dataset
    if cfg.prepare_manifest.auto_split:
        logging.info("Splitting long audio files to avoid CUDA memory issues")
        logging.debug("Try a smaller split_duration if you still run into CUDA memory issues")
        config = {
            'input': manifest_vad_input,
            'window_length_in_sec': cfg.vad.parameters.window_length_in_sec,
            'split_duration': cfg.prepare_manifest.split_duration,
            'num_workers': cfg.num_workers,
            'prepared_manifest_vad_input': cfg.prepared_manifest_vad_input,
        }
        manifest_vad_input = prepare_manifest(config)
    else:
        logging.warning(
            "If you encounter CUDA memory issues, try splitting manifest entries by split_duration to avoid them."
        )
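
    # Inference only: disable autograd to save memory and speed up the forward pass.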
    torch.set_grad_enabled(False)
    vad_model = init_vad_model(cfg.vad.model_path)

    # Set up the streaming test data loader
    vad_model.setup_test_data(
        test_data_config={
            'vad_stream': True,
            'sample_rate': 16000,
            'manifest_filepath': manifest_vad_input,
            'labels': ['infer',],
            'num_workers': cfg.num_workers,
            'shuffle': False,
            'window_length_in_sec': cfg.vad.parameters.window_length_in_sec,
            'shift_length_in_sec': cfg.vad.parameters.shift_length_in_sec,
            'trim_silence': False,
            'normalize_audio': cfg.vad.parameters.normalize_audio,
        }
    )
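
    # Move the model to the selected device and switch to evaluation mode.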
    vad_model = vad_model.to(device)
    vad_model.eval()

    if not os.path.exists(cfg.frame_out_dir):
        os.mkdir(cfg.frame_out_dir)
    else:
        logging.warning(
            "frame_out_dir already exists. If a new file has the same name as a file in the existing folder, "
            "results will be appended to that file, which may cause errors in the following steps."
        )
logging.info("Generating frame level prediction ")
pred_dir = generate_vad_frame_pred(
vad_model=vad_model,
window_length_in_sec=cfg.vad.parameters.window_length_in_sec,
shift_length_in_sec=cfg.vad.parameters.shift_length_in_sec,
manifest_vad_input=manifest_vad_input,
out_dir=cfg.frame_out_dir,
)
logging.info(
f"Finish generating VAD frame level prediction with window_length_in_sec={cfg.vad.parameters.window_length_in_sec} and shift_length_in_sec={cfg.vad.parameters.shift_length_in_sec}"
)
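
    # Without smoothing, each frame-level prediction covers one window shift,
    # so the effective frame length equals shift_length_in_sec.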
    frame_length_in_sec = cfg.vad.parameters.shift_length_in_sec

    # Overlap smoothing filter
    if cfg.vad.parameters.smoothing:
        # Generate predictions with overlapping input segments, then apply a smoothing filter
        # to decide the label for a frame spanned by multiple segments.
        # smoothing_method is either majority vote (median) or average (mean).
        logging.info("Generating predictions with overlapping input segments")
        smoothing_pred_dir = generate_overlap_vad_seq(
            frame_pred_dir=pred_dir,
            smoothing_method=cfg.vad.parameters.smoothing,
            overlap=cfg.vad.parameters.overlap,
            window_length_in_sec=cfg.vad.parameters.window_length_in_sec,
            shift_length_in_sec=cfg.vad.parameters.shift_length_in_sec,
            num_workers=cfg.num_workers,
            out_dir=cfg.smoothing_out_dir,
        )
        logging.info(
            f"Finished generating predictions with overlapping input segments, "
            f"with smoothing_method={cfg.vad.parameters.smoothing} and overlap={cfg.vad.parameters.overlap}"
        )
        pred_dir = smoothing_pred_dir
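        # The smoothed predictions are written at a 0.01 s frame resolution,
        # hence the fixed frame length below.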
        frame_length_in_sec = 0.01

    # Postprocessing: generate speech segments
    if cfg.gen_seg_table:
        logging.info("Converting frame-level predictions to speech/no-speech segments in start and end times format.")
        table_out_dir = generate_vad_segment_table(
            vad_pred_dir=pred_dir,
            postprocessing_params=cfg.vad.parameters.postprocessing,
            frame_length_in_sec=frame_length_in_sec,
            num_workers=cfg.num_workers,
            out_dir=cfg.table_out_dir,
        )
        logging.info(
            f"Finished generating speech segments table with postprocessing_params: {cfg.vad.parameters.postprocessing}"
        )
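
    # Attach each file's segment table (start/end times) to its manifest entry and write
    # the combined output manifest.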
    if cfg.write_to_manifest:
        for i in key_meta_map:
            key_meta_map[i]['rttm_filepath'] = os.path.join(table_out_dir, i + ".txt")

        if not cfg.out_manifest_filepath:
            out_manifest_filepath = "vad_out.json"
        else:
            out_manifest_filepath = cfg.out_manifest_filepath
        out_manifest_filepath = write_rttm2manifest(key_meta_map, out_manifest_filepath)
        logging.info(f"Writing VAD output to manifest: {out_manifest_filepath}")


if __name__ == '__main__':
    main()