|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import argparse |
|
|
|
import numpy as np |
|
|
|
from nemo.collections.asr.parts.utils.vad_utils import vad_tune_threshold_on_dev |
|
from nemo.utils import logging |
|
|
|
""" |
|
This script tunes the thresholds used for postprocessing of VAD predictions.

See the binarization and filtering functions in nemo/collections/asr/parts/utils/vad_utils for details.
|
|
|
Usage: |
|
python vad_tune_threshold.py \ |
|
--onset_range="0,1,0.2" --offset_range="0,1,0.2" --min_duration_on_range="0.1,0.8,0.05" --min_duration_off_range="0.1,0.8,0.05" --not_filter_speech_first \ |
|
--vad_pred=<FULL PATH OF FOLDER OF FRAME LEVEL PREDICTION FILES> \ |
|
--groundtruth_RTTM=<DIRECTORY OF GROUNDTRUTH RTTM FILES OR A FILE CONTAINING THE PATHS OF THEM> \
|
--vad_pred_method="median" |
|
|
|
""" |
|
# Postprocessing parameters that can be grid-searched. Each one is exposed on
# the command line as ``--<name>_range`` and stored in ``params`` under <name>.
RANGE_PARAMS = ("onset", "offset", "pad_onset", "pad_offset", "min_duration_on", "min_duration_off")


def build_parser():
    """Create the command-line parser for the threshold tuning script.

    Returns:
        argparse.ArgumentParser: parser exposing one optional ``--<name>_range``
        option per entry of RANGE_PARAMS, the ``--not_filter_speech_first``
        flag, the required ``--vad_pred``, ``--groundtruth_RTTM`` and
        ``--vad_pred_method`` options, and the optional ``--result_file``,
        ``--focus_metric`` and ``--frame_length_in_sec`` options.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--onset_range", help="range of onset in list 'START,END,STEP' to be tuned on", type=str)
    parser.add_argument("--offset_range", help="range of offset in list 'START,END,STEP' to be tuned on", type=str)
    parser.add_argument(
        "--pad_onset_range",
        help="range of pad_onset in list 'START,END,STEP' to be tuned on. pad_onset could be negative float",
        type=str,
    )
    parser.add_argument(
        "--pad_offset_range",
        help="range of pad_offset in list 'START,END,STEP' to be tuned on. pad_offset could be negative float",
        type=str,
    )
    parser.add_argument(
        "--min_duration_on_range", help="range of min_duration_on in list 'START,END,STEP' to be tuned on", type=str
    )
    parser.add_argument(
        "--min_duration_off_range", help="range of min_duration_off in list 'START,END,STEP' to be tuned on", type=str
    )
    parser.add_argument(
        "--not_filter_speech_first",
        # This is a value-less flag (store_true), so the help must not ask for True/False.
        help="Pass this flag to set filter_speech_first to False, i.e. do not filter short speech first during filtering.",
        action='store_true',
    )
    parser.add_argument(
        "--vad_pred", help="Directory of vad predictions or a file contains the paths of them.", required=True
    )
    parser.add_argument(
        "--groundtruth_RTTM",
        help="Directory of groundtruch rttm files or a file contains the paths of them",
        type=str,
        required=True,
    )
    parser.add_argument(
        "--result_file", help="Filename of txt to store results", default="res",
    )
    parser.add_argument(
        "--vad_pred_method",
        help="suffix of prediction file. Should be either in 'frame', 'mean' or 'median'",
        required=True,
    )
    parser.add_argument(
        "--focus_metric",
        help="metrics we care most when tuning threshold. Should be either in 'DetER', 'FA', 'MISS' ",
        type=str,
        default='DetER',
    )
    parser.add_argument(
        "--frame_length_in_sec", help="frame_length_in_sec ", type=float, default=0.01,
    )
    return parser


def parse_range(spec):
    """Expand a 'START,END,STEP' string into the grid of values to search.

    Args:
        spec (str): three comma-separated numbers, e.g. "0.1,0.8,0.05".
            Negative values are accepted.

    Returns:
        numpy.ndarray: ``np.arange(START, END, STEP)``. As with np.arange the
        interval is half-open, so END is normally not part of the grid.

    Raises:
        ValueError: if ``spec`` does not hold exactly three numbers, or if
            STEP is zero.
    """
    fields = spec.split(",")
    if len(fields) != 3:
        raise ValueError(f"expected 3 comma-separated values, got {len(fields)}: {spec!r}")
    start, stop, step = (float(field) for field in fields)
    if step == 0:
        # A zero step cannot describe a grid; reject it here with a clear
        # message instead of relying on whatever np.arange would raise.
        raise ValueError(f"STEP must be non-zero: {spec!r}")
    return np.arange(start, stop, step)


def build_params(args):
    """Collect the postprocessing search space from parsed arguments.

    Args:
        args (argparse.Namespace): result of ``build_parser().parse_args()``.

    Returns:
        dict: maps each name in RANGE_PARAMS whose ``--<name>_range`` option
        was given to its numpy grid, plus ``'filter_speech_first': False``
        when ``--not_filter_speech_first`` was passed. Options that were not
        given are left out of the dict.

    Raises:
        ValueError: if a range option is malformed. The message names the
            offending option and the original error is chained as the cause.
    """
    params = {}
    for name in RANGE_PARAMS:
        spec = getattr(args, f"{name}_range")
        if not spec:
            continue
        try:
            params[name] = parse_range(spec)
        except ValueError as err:
            raise ValueError(
                f"Threshold input for --{name}_range is invalid: {spec!r}. "
                "Please enter it as 'START,STOP,STEP' (three comma-separated numbers with a non-zero STEP)."
            ) from err

    if args.not_filter_speech_first:
        params['filter_speech_first'] = False
    return params


def main():
    """Parse the command line, run the threshold search and log the result."""
    args = build_parser().parse_args()
    params = build_params(args)

    best_threshold, optimal_scores = vad_tune_threshold_on_dev(
        params,
        args.vad_pred,
        args.groundtruth_RTTM,
        args.result_file,
        args.vad_pred_method,
        args.focus_metric,
        args.frame_length_in_sec,
    )
    logging.info(
        f"Best combination of thresholds for binarization selected from input ranges is {best_threshold}, and the optimal score is {optimal_scores}"
    )


if __name__ == "__main__":
    main()
|
|