import ast

import gradio as gr

# Load the hosted pyannote voice-activity-detection model through the
# Gradio model-hub wrapper.  NOTE(review): `gr.Interface.load` and the
# `gr.inputs` / `gr.outputs` namespaces are deprecated in Gradio >= 3;
# kept as-is to match the API version this script was written against.
model = gr.Interface.load("huggingface/pyannote/voice-activity-detection")


def format_inference(output):
    """Render the model's speech segments as one line of text per segment.

    Parameters
    ----------
    output : list | None
        Segments as parsed from the model's response; each element is a
        mapping with ``'start'`` and ``'stop'`` keys (times in seconds).
        Falsy input (``None`` or an empty list) means no speech was found.

    Returns
    -------
    str
        One ``"Start: <t>s; Stop: <t>s"`` line per segment joined with
        newlines, or a fallback message when no voice activity was detected.
    """
    if not output:
        return "No voice activity detected."
    return "\n".join(
        f"Start: {seg['start']}s; Stop: {seg['stop']}s" for seg in output
    )


def inference(audio_file):
    """Run voice-activity detection on *audio_file* and format the result.

    The remote model returns its segment list serialized as a Python
    literal string, so it is parsed with ``ast.literal_eval`` (safe, unlike
    ``eval``) before being formatted for display.
    """
    output = model(audio_file)
    output_list = ast.literal_eval(output)
    return format_inference(output_list)


inputs = gr.inputs.Audio(label="Input Audio", type="filepath", source="upload")
outputs = gr.outputs.Textbox(label="Voice timestamps", type="auto")

title = "Voice Activity Detection"
# The original description/article literals contained raw newlines inside
# double quotes (invalid syntax as pasted); reconstructed here as
# single-line strings from the visible text.
description = "Upload an audio file and detected voices will be timestamped."
article = "Model by pyannote, https://github.com/pyannote/pyannote-audio"
examples = [["talk.wav"], ["talk2.wav"], ["silence.wav"]]

# Guard the launch so importing this module (e.g. from tests) does not
# start the web server; running the script directly behaves as before.
if __name__ == "__main__":
    gr.Interface(
        inference,
        inputs,
        outputs,
        title=title,
        description=description,
        article=article,
        examples=examples,
        theme="grass",
        allow_flagging=False,
    ).launch(debug=True)