# Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import argparse

import numpy as np

from nemo.collections.asr.parts.utils.vad_utils import vad_tune_threshold_on_dev
from nemo.utils import logging

r"""
This script is designed for thresholds tuning for postprocessing of VAD
See details about it in nemo/collections/asr/parts/utils/vad_utils/binarization and filtering

Usage:
python vad_tune_threshold.py \
    --onset_range="0,1,0.2" --offset_range="0,1,0.2" \
    --min_duration_on_range="0.1,0.8,0.05" --min_duration_off_range="0.1,0.8,0.05" \
    --not_filter_speech_first \
    --vad_pred=<DIRECTORY OF VAD PREDICTIONS, OR A FILE LISTING THEIR PATHS> \
    --groundtruth_RTTM=<DIRECTORY OF GROUNDTRUTH RTTM FILES, OR A FILE LISTING THEIR PATHS> \
    --vad_pred_method="median"
"""

# (argparse attribute holding a 'START,END,STEP' string, key used in the params dict)
_RANGE_ARGS = (
    ("onset_range", "onset"),
    ("offset_range", "offset"),
    ("pad_onset_range", "pad_onset"),
    ("pad_offset_range", "pad_offset"),
    ("min_duration_on_range", "min_duration_on"),
    ("min_duration_off_range", "min_duration_off"),
)


def _parse_range(spec, arg_name):
    """Turn a 'START,END,STEP' string into the array of candidate values.

    Args:
        spec: comma separated string with exactly three float fields.
        arg_name: name of the command line option (without leading dashes),
            only used to build the error message.

    Returns:
        ``np.arange(START, END, STEP)``; END itself is excluded, as usual for
        ``np.arange``.

    Raises:
        ValueError: if ``spec`` does not consist of three floats, if STEP is 0,
            or if the resulting range contains no value at all.
    """
    try:
        # Unpacking raises ValueError as well when there are not exactly three fields.
        start, stop, step = (float(field) for field in spec.split(","))
        if step == 0:
            raise ValueError("STEP must be non-zero")
        values = np.arange(start, stop, step)
    except ValueError as err:
        raise ValueError(
            f"Threshold input for --{arg_name} is invalid: {spec!r} ({err}). "
            "Please enter it as 'START,STOP,STEP'."
        ) from err

    if values.size == 0:
        # e.g. START >= STOP with a positive STEP: there would be nothing to tune on.
        raise ValueError(
            f"Threshold input for --{arg_name} is invalid: {spec!r} yields an empty range. "
            "Please enter it as 'START,STOP,STEP'."
        )
    return values


def _build_params(args):
    """Collect the tuning grid requested on the command line.

    Only options that were actually given end up in the returned dict; per the
    note in the original script, missing ones fall back to the defaults of the
    binarization and filtering functions in
    nemo/collections/asr/parts/utils/vad_utils.py.

    Args:
        args: namespace produced by the parser from ``_build_parser``.

    Returns:
        dict mapping parameter name to an array of candidate values, plus
        ``'filter_speech_first': False`` when ``--not_filter_speech_first`` was passed.

    Raises:
        ValueError: if any of the range options is malformed.
    """
    params = {}
    for arg_name, param_name in _RANGE_ARGS:
        spec = getattr(args, arg_name)
        if spec:
            params[param_name] = _parse_range(spec, arg_name)
    if args.not_filter_speech_first:
        params['filter_speech_first'] = False
    return params


def _build_parser():
    """Create the command line parser of this script."""
    parser = argparse.ArgumentParser()
    parser.add_argument("--onset_range", help="range of onset in list 'START,END,STEP' to be tuned on", type=str)
    parser.add_argument("--offset_range", help="range of offset in list 'START,END,STEP' to be tuned on", type=str)
    parser.add_argument(
        "--pad_onset_range",
        help="range of pad_onset in list 'START,END,STEP' to be tuned on. pad_onset could be negative float",
        type=str,
    )
    parser.add_argument(
        "--pad_offset_range",
        help="range of pad_offset in list 'START,END,STEP' to be tuned on. pad_offset could be negative float",
        type=str,
    )
    parser.add_argument(
        "--min_duration_on_range", help="range of min_duration_on in list 'START,END,STEP' to be tuned on", type=str
    )
    parser.add_argument(
        "--min_duration_off_range", help="range of min_duration_off in list 'START,END,STEP' to be tuned on", type=str
    )
    parser.add_argument(
        "--not_filter_speech_first",
        # This is a plain flag: it takes no True/False value on the command line.
        help="Pass this flag to set filter_speech_first to False, "
        "i.e. do not filter short speech first during filtering.",
        action='store_true',
    )
    parser.add_argument(
        "--vad_pred", help="Directory of vad predictions or a file contains the paths of them.", required=True
    )
    parser.add_argument(
        "--groundtruth_RTTM",
        help="Directory of groundtruch rttm files or a file contains the paths of them",
        type=str,
        required=True,
    )
    parser.add_argument(
        "--result_file", help="Filename of txt to store results", default="res",
    )
    parser.add_argument(
        "--vad_pred_method",
        help="suffix of prediction file. Should be either in 'frame', 'mean' or 'median'",
        required=True,
    )
    parser.add_argument(
        "--focus_metric",
        help="metrics we care most when tuning threshold. Should be either in 'DetER', 'FA', 'MISS' ",
        type=str,
        default='DetER',
    )
    parser.add_argument(
        "--frame_length_in_sec", help="frame_length_in_sec ", type=float, default=0.01,
    )
    return parser


def main():
    """Parse the command line, run the threshold tuning and log the best result."""
    args = _build_parser().parse_args()
    params = _build_params(args)

    best_threshold, optimal_scores = vad_tune_threshold_on_dev(
        params,
        args.vad_pred,
        args.groundtruth_RTTM,
        args.result_file,
        args.vad_pred_method,
        args.focus_metric,
        args.frame_length_in_sec,
    )
    logging.info(
        f"Best combination of thresholds for binarization selected from input ranges is {best_threshold}, and the optimal score is {optimal_scores}"
    )


if __name__ == "__main__":
    main()