from nemo.collections.asr.models import NeuralDiarizer
from nemo.collections.asr.parts.utils.speaker_utils import rttm_to_labels
import gradio as gr
import torch
import pandas as pd

device = "cuda" if torch.cuda.is_available() else "cpu"
model = NeuralDiarizer.from_pretrained("diar_msdd_telephonic").to(device)


def _rttm_to_dataframe(rttm):
    """Parse RTTM text into a DataFrame of diarization segments.

    Each RTTM line has the fields:
        SPEAKER <file> <chan> <onset> <duration> <NA> <NA> <speaker> <NA> <NA>
    Field 4 is the segment *duration* (not the end time), so the end time is
    onset + duration.

    Args:
        rttm: RTTM-formatted string (one segment per line).

    Returns:
        pd.DataFrame with float columns 'start_time' and 'end_time' and a
        string column 'speaker'. Empty lines are skipped.
    """
    rows = []
    for line in rttm.splitlines():
        fields = line.split()
        if not fields:
            # Skip blank lines so a trailing newline doesn't raise IndexError.
            continue
        start = float(fields[3])
        duration = float(fields[4])
        rows.append((start, start + duration, fields[7]))
    return pd.DataFrame(rows, columns=["start_time", "end_time", "speaker"])


def run_diarization(path1):
    """Run offline speaker diarization on an audio file.

    Args:
        path1: Filesystem path to the input audio file (as provided by the
            Gradio Audio component with type="filepath").

    Returns:
        pd.DataFrame with one row per speech segment: start_time, end_time,
        speaker.
    """
    annotation = model(path1)
    rttm = annotation.to_rttm()
    return _rttm_to_dataframe(rttm)


inputs = [
    gr.inputs.Audio(source="microphone", type="filepath", optional=True, label="Input Audio"),
]
output = gr.outputs.Dataframe()

description = (
    "This demonstration will perform offline speaker diarization on an audio file using nemo"
)

# NOTE(review): the HTML markup of this footer was stripped in the source;
# reconstructed from the surviving link texts — verify the URLs against the
# original demo before shipping.
article = (
    "<p style='text-align: center'>"
    "<a href='https://huggingface.co/nvidia/speakerverification_en_titanet_large' target='_blank'>"
    "🎙️ Learn more about TitaNet model</a> | "
    "<a href='https://arxiv.org/pdf/2110.04410.pdf' target='_blank'>📚 TitaNet paper</a> | "
    "<a href='https://github.com/NVIDIA/NeMo' target='_blank'>🧑‍💻 Repository</a>"
    "</p>"
)

examples = [
    "data/id10270_5r0dWxy17C8-00001.wav",
]

interface = gr.Interface(
    fn=run_diarization,
    inputs=inputs,
    outputs=output,
    title="Offline Speaker Diarization with NeMo",
    description=description,
    article=article,
    layout="horizontal",
    theme="huggingface",
    allow_flagging=False,
    live=False,
    examples=examples,
)
interface.launch(enable_queue=True)