from nemo.collections.asr.models.msdd_models import NeuralDiarizer
from nemo.collections.asr.models import EncDecRNNTBPEModel
from nemo.collections.asr.models import EncDecSpeakerLabelModel
import gradio as gr
import pandas as pd 
import torch
import json
from omegaconf import OmegaConf
import uuid

device = "cuda" if torch.cuda.is_available() else "cpu"

model = NeuralDiarizer.from_pretrained("diar_msdd_telephonic").to(device)
speaker_model = EncDecSpeakerLabelModel.from_pretrained("nvidia/speakerverification_en_titanet_large").to(device)
model.eval()

def run_diarization(path1):
    annotation = model(path1, num_workers=0, batch_size=16)
    rttm=annotation.to_rttm()
    df = pd.DataFrame(columns=['start_time', 'end_time', 'speaker', 'text'])
    lines = rttm.splitlines()
    if len(lines) == 0:
        df.loc[0] = 0, 0, 'No speaker found'
        return df
    start_time, duration, prev_speaker = float(lines[0].split()[3]), float(lines[0].split()[4]), lines[0].split()[7]
    end_time = float(start_time) + float(duration)
    df.loc[0] = start_time, end_time, prev_speaker, ''

    for line in lines[1:]:
        split = line.split()
        start_time, duration, cur_speaker = float(split[3]), float(split[4]), split[7]
        end_time = float(start_time) + float(duration)
        if cur_speaker == prev_speaker:
            df.loc[df.index[-1], 'end_time'] = end_time
        else:
            df.loc[len(df)] = start_time, end_time, cur_speaker, ''
        prev_speaker = cur_speaker
    
    hyp = get_transcripts(df, path1)

    assert len(hyp) == len(df)

    for i in range(len(df)):
        df.loc[i, 'text'] = hyp[i]

    return df

def create_manifest(df,audio_path):

    filename = '/tmp/' + str(uuid.uuid4()) + '.json'
    with open(filename, 'w') as f:
        for i in range(len(df)):
            start_time = df.iloc[i]['start_time']
            end_time = df.iloc[i]['end_time']
            speaker = df.iloc[i]['speaker']
            dic = {"audio_filepath": audio_path, "duration": end_time-start_time, "label": speaker, "offset": start_time}
            json.dump(dic, f)
            f.write('\n')

    return filename

def get_transcripts(df, audio_path):
    
    filename = create_manifest(df,audio_path)
    model = EncDecRNNTBPEModel.from_pretrained(model_name="nvidia/stt_en_fastconformer_transducer_large").to(device)
    model.eval()
    config = OmegaConf.create({"manifest_filepath": filename, 'batch_size': 2})
    dataloader = model._setup_transcribe_dataloader(config)
    
    hypotheses = []
    all_hypotheses = []

    for test_batch in (dataloader):
        encoded, encoded_len = model.forward(
            input_signal=test_batch[0].to(device), input_signal_length=test_batch[1].to(device)
            )
        best_hyp, all_hyp = model.decoding.rnnt_decoder_predictions_tensor(
            encoded,
            encoded_len,
            return_hypotheses=False,
            partial_hypotheses=None,)

        hypotheses += best_hyp
        if all_hyp is not None:
            all_hypotheses += all_hyp
        else:
            all_hypotheses += best_hyp

        del encoded
        del test_batch

    return hypotheses

article = (
    "<p style='text-align: center'>"
    "<a href='https://catalog.ngc.nvidia.com/orgs/nvidia/teams/nemo/models/diar_msdd_telephonic' target='_blank'>🎙️ Learn more about MSDD model</a> | "
    "<a href='https://arxiv.org/abs/2203.15974' target='_blank'>📚 MSDD paper</a> | "
    "<a href='https://github.com/NVIDIA/NeMo' target='_blank'>🧑‍💻 Repository</a>"
    "</p>"
)
examples = [
    ["data/conversation.wav"],
    ["data/id10270_5r0dWxy17C8-00001.wav"],
]

microphone_interface = gr.Interface(
    fn=run_diarization,
    inputs=[gr.Audio(source="microphone", type="filepath", label="Mic Audio")],
    outputs=[gr.components.Dataframe(wrap=True, label='Speaker Diariazation with Speech Recognition',
        row_count=(1, "dynamic"), headers=['start_time', 'end_time', 'speaker', 'text'])],
    title="Offline Speaker Diarization with NeMo",
    description="This demonstration will perform offline speaker diarization on an audio file using nemo",
    article=article,
    layout="vertical",
    theme="huggingface",
    allow_flagging=False,
    live=False,
    examples=examples,
    )   

upload_interface = gr.Interface(
    fn=run_diarization,
    inputs=[gr.Audio(source="upload", type='filepath', label='Upload File')],
    outputs=[gr.components.Dataframe(wrap=True, label='Speaker Diariazation with Speech Recognition',
        row_count=(1, "dynamic"), headers=['start_time', 'end_time', 'speaker', 'text'])],
    title="Offline Speaker Diarization with NeMo",
    description="This demonstration will perform offline speaker diarization on an audio file using nemo",
    article=article,
    layout="vertical",
    theme="huggingface",
    allow_flagging=False,
    live=False,
    examples=examples,
    )

demo = gr.TabbedInterface([microphone_interface, upload_interface], ["Microphone", "Upload File"])

demo.launch(enable_queue=True)