import gradio as gr

from vad_utils import (
    get_speech_probs,
    make_visualization,
    probs2speech_timestamps,
    read_audio,
)

SAMPLING_RATE = 16_000

# Cached across the two button callbacks: the probabilities are computed once,
# then re-thresholded with different parameters without re-running the model.
probs = None
audio_length_samples = None


def process_audio(audio_input):
    """Load the audio file, compute per-window speech probabilities, and plot them."""
    global probs, audio_length_samples
    wav = read_audio(audio_input, sampling_rate=SAMPLING_RATE)
    audio_length_samples = len(wav)
    probs = get_speech_probs(wav, sampling_rate=SAMPLING_RATE)
    # One probability per 512-sample window, so each step spans 512 / 16_000 s.
    return make_visualization(probs, 512 / SAMPLING_RATE)


def process_parameters(threshold, min_speech_duration_ms, min_silence_duration_ms,
                       window_size_samples, speech_pad_ms):
    """Convert the cached probabilities into speech timestamps using the given parameters."""
    if probs is None:
        return "Compute probabilities first."
    return probs2speech_timestamps(
        probs,
        audio_length_samples,
        threshold=threshold,
        min_speech_duration_ms=min_speech_duration_ms,
        min_silence_duration_ms=min_silence_duration_ms,
        window_size_samples=window_size_samples,
        speech_pad_ms=speech_pad_ms,
    )


def main():
    with gr.Blocks() as demo:
        with gr.Row():
            audio_input = gr.Audio(type="filepath")
            button1 = gr.Button("Compute Probabilities")
        figure = gr.Plot()
        button1.click(process_audio, inputs=[audio_input], outputs=figure)
        with gr.Row():
            threshold = gr.Number(label="Threshold", value=0.5, minimum=0.0, maximum=1.0)
            min_speech_duration_ms = gr.Number(label="Min Speech Duration (ms)", value=250)
            min_silence_duration_ms = gr.Number(label="Min Silence Duration (ms)", value=100)
            # Default to 512 so the value matches the 512-sample window assumed
            # by the probability plot above; a mismatch skews the timestamps.
            window_size_samples = gr.Dropdown(label="Window Size Samples",
                                              choices=[512, 1024, 1536], value=512)
            speech_pad_ms = gr.Number(label="Speech Pad (ms)", value=30)
        button2 = gr.Button("Process Parameters")
        output_text = gr.Textbox()
        button2.click(
            process_parameters,
            inputs=[threshold, min_speech_duration_ms, min_silence_duration_ms,
                    window_size_samples, speech_pad_ms],
            outputs=output_text,
        )
    demo.launch()


if __name__ == "__main__":
    main()
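
# ---------------------------------------------------------------------------
# Appendix (assumption): vad_utils is not shown in this file. The helper below
# is a minimal, hypothetical sketch of what probs2speech_timestamps could do,
# modeled on Silero VAD's post-processing: threshold the per-window speech
# probabilities, merge segments separated by short silences, discard speech
# runs shorter than min_speech_duration_ms, then pad each surviving segment by
# speech_pad_ms. It is illustrative only and is not wired into the demo above.
# ---------------------------------------------------------------------------
def sketch_probs2speech_timestamps(probs, audio_length_samples, threshold=0.5,
                                   min_speech_duration_ms=250,
                                   min_silence_duration_ms=100,
                                   window_size_samples=512, speech_pad_ms=30,
                                   sampling_rate=16_000):
    min_speech = sampling_rate * min_speech_duration_ms / 1000
    min_silence = sampling_rate * min_silence_duration_ms / 1000
    pad = int(sampling_rate * speech_pad_ms / 1000)

    # 1. Threshold: collect raw [start, end) sample ranges covered by
    #    consecutive windows whose probability reaches the threshold.
    segments = []
    start = None
    for i, p in enumerate(probs):
        if p >= threshold and start is None:
            start = i * window_size_samples
        elif p < threshold and start is not None:
            segments.append({"start": start, "end": i * window_size_samples})
            start = None
    if start is not None:
        segments.append({"start": start, "end": audio_length_samples})

    # 2. Merge segments separated by a silence shorter than min_silence.
    merged = []
    for seg in segments:
        if merged and seg["start"] - merged[-1]["end"] < min_silence:
            merged[-1]["end"] = seg["end"]
        else:
            merged.append(seg)

    # 3. Drop segments shorter than min_speech, then pad the survivors,
    #    clamping to the bounds of the audio.
    return [{"start": max(0, seg["start"] - pad),
             "end": min(audio_length_samples, seg["end"] + pad)}
            for seg in merged if seg["end"] - seg["start"] >= min_speech]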