from nemo.collections.asr.models.msdd_models import NeuralDiarizer
from nemo.collections.asr.models import EncDecRNNTBPEModel
from nemo.collections.asr.models import EncDecSpeakerLabelModel
import gradio as gr
import pandas as pd
import torch
import json
from omegaconf import OmegaConf
import uuid

device = "cuda" if torch.cuda.is_available() else "cpu"

# Multi-scale diarization decoder (MSDD) model trained on telephonic speech.
model = NeuralDiarizer.from_pretrained("diar_msdd_telephonic").to(device)
# TitaNet speaker-verification model (loaded here but not used by the demo below).
speaker_model = EncDecSpeakerLabelModel.from_pretrained(
    "nvidia/speakerverification_en_titanet_large"
).to(device)
model.eval()


def run_diarization(path1):
    annotation = model(path1, num_workers=0, batch_size=16)
    rttm = annotation.to_rttm()
    df = pd.DataFrame(columns=['start_time', 'end_time', 'speaker', 'text'])
    lines = rttm.splitlines()
    if len(lines) == 0:
        df.loc[0] = 0, 0, 'No speaker found', ''
        return df
    # RTTM fields: type file chan start dur <NA> <NA> speaker <NA> <NA>,
    # so field 3 is the start time, field 4 the duration, field 7 the speaker label.
    start_time, duration, prev_speaker = float(lines[0].split()[3]), float(lines[0].split()[4]), lines[0].split()[7]
    end_time = start_time + duration
    df.loc[0] = start_time, end_time, prev_speaker, ''
    for line in lines[1:]:
        split = line.split()
        start_time, duration, cur_speaker = float(split[3]), float(split[4]), split[7]
        end_time = start_time + duration
        if cur_speaker == prev_speaker:
            # Same speaker continues: extend the current segment instead of adding a row.
            df.loc[df.index[-1], 'end_time'] = end_time
        else:
            df.loc[len(df)] = start_time, end_time, cur_speaker, ''
        prev_speaker = cur_speaker
    hyp = get_transcripts(df, path1)
    assert len(hyp) == len(df)
    for i in range(len(df)):
        df.loc[i, 'text'] = hyp[i]
    return df


def create_manifest(df, audio_path):
    # Write one NeMo manifest entry per diarized segment; offset/duration make the
    # ASR model transcribe each segment of the original file separately.
    filename = '/tmp/' + str(uuid.uuid4()) + '.json'
    with open(filename, 'w') as f:
        for i in range(len(df)):
            start_time = df.iloc[i]['start_time']
            end_time = df.iloc[i]['end_time']
            speaker = df.iloc[i]['speaker']
            dic = {
                "audio_filepath": audio_path,
                "duration": end_time - start_time,
                "label": speaker,
                "offset": start_time,
            }
            json.dump(dic, f)
            f.write('\n')
    return filename


def get_transcripts(df, audio_path):
    filename = create_manifest(df, audio_path)
    # Loaded on every call; caching this at module level would avoid re-downloading.
    asr_model = EncDecRNNTBPEModel.from_pretrained(
        model_name="nvidia/stt_en_fastconformer_transducer_large"
    ).to(device)
    asr_model.eval()
    config = OmegaConf.create({"manifest_filepath": filename, 'batch_size': 2})
    dataloader = asr_model._setup_transcribe_dataloader(config)
    hypotheses = []
    all_hypotheses = []
    for test_batch in dataloader:
        encoded, encoded_len = asr_model.forward(
            input_signal=test_batch[0].to(device),
            input_signal_length=test_batch[1].to(device),
        )
        best_hyp, all_hyp = asr_model.decoding.rnnt_decoder_predictions_tensor(
            encoded, encoded_len, return_hypotheses=False, partial_hypotheses=None,
        )
        hypotheses += best_hyp
        if all_hyp is not None:
            all_hypotheses += all_hyp
        else:
            all_hypotheses += best_hyp
        del encoded
        del test_batch
    return hypotheses
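# Note: `speaker_model` (TitaNet) is loaded above but never used by the demo itself.
# A minimal sketch of what it could be used for, based on NeMo's standard
# EncDecSpeakerLabelModel helpers (the file paths below are hypothetical):
#
#   # Decide whether two recordings contain the same speaker.
#   same_speaker = speaker_model.verify_speakers("speaker_a.wav", "speaker_b.wav")
#   # Extract a fixed-size speaker embedding from a single recording.
#   embedding = speaker_model.get_embedding("speaker_a.wav")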

" "🎙️ Learn more about MSDD model | " "📚 MSDD paper | " "🧑‍💻 Repository" "

" ) examples = [ ["data/conversation.wav"], ["data/id10270_5r0dWxy17C8-00001.wav"], ] microphone_interface = gr.Interface( fn=run_diarization, inputs=[gr.Audio(source="microphone", type="filepath", label="Mic Audio")], outputs=[gr.components.Dataframe(wrap=True, label='Speaker Diariazation with Speech Recognition', row_count=(1, "dynamic"), headers=['start_time', 'end_time', 'speaker', 'text'])], title="Offline Speaker Diarization with NeMo", description="This demonstration will perform offline speaker diarization on an audio file using nemo", article=article, layout="vertical", theme="huggingface", allow_flagging=False, live=False, examples=examples, ) upload_interface = gr.Interface( fn=run_diarization, inputs=[gr.Audio(source="upload", type='filepath', label='Upload File')], outputs=[gr.components.Dataframe(wrap=True, label='Speaker Diariazation with Speech Recognition', row_count=(1, "dynamic"), headers=['start_time', 'end_time', 'speaker', 'text'])], title="Offline Speaker Diarization with NeMo", description="This demonstration will perform offline speaker diarization on an audio file using nemo", article=article, layout="vertical", theme="huggingface", allow_flagging=False, live=False, examples=examples, ) demo = gr.TabbedInterface([microphone_interface, upload_interface], ["Microphone", "Upload File"]) demo.launch(enable_queue=True)