# import whisper
import gradio as gr
import datetime
import subprocess
import wave
import contextlib
# import torch
# import pyannote.audio
# from pyannote.audio.pipelines.speaker_verification import PretrainedSpeakerEmbedding
# from pyannote.audio import Audio
# from pyannote.core import Segment
# from sklearn.cluster import AgglomerativeClustering
# import numpy as np

# model = whisper.load_model("large-v2")
# embedding_model = PretrainedSpeakerEmbedding(
#     "speechbrain/spkrec-ecapa-voxceleb",
#     device=torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# )

# def transcribe(audio, num_speakers):
#     path, error = convert_to_wav(audio)
#     if error is not None:
#         return error
#     duration = get_duration(path)
#     if duration > 4 * 60 * 60:
#         return "Audio duration too long"
#     result = model.transcribe(path)
#     segments = result["segments"]
#     num_speakers = min(max(round(num_speakers), 1), len(segments))
#     if len(segments) == 1:
#         segments[0]['speaker'] = 'SPEAKER 1'
#     else:
#         embeddings = make_embeddings(path, segments, duration)
#         add_speaker_labels(segments, embeddings, num_speakers)
#     output = get_output(segments)
#     return output

# def convert_to_wav(path):
#     if path[-3:] != 'wav':
#         new_path = '.'.join(path.split('.')[:-1]) + '.wav'
#         try:
#             subprocess.call(['ffmpeg', '-i', path, new_path, '-y'])
#         except:
#             return path, 'Error: Could not convert file to .wav'
#         path = new_path
#     return path, None

# def get_duration(path):
#     with contextlib.closing(wave.open(path, 'r')) as f:
#         frames = f.getnframes()
#         rate = f.getframerate()
#     return frames / float(rate)

# def make_embeddings(path, segments, duration):
#     embeddings = np.zeros(shape=(len(segments), 192))
#     for i, segment in enumerate(segments):
#         embeddings[i] = segment_embedding(path, segment, duration)
#     return np.nan_to_num(embeddings)

# audio = Audio()

# def segment_embedding(path, segment, duration):
#     start = segment["start"]
#     # Whisper overshoots the end timestamp in the last segment
#     end = min(duration, segment["end"])
#     clip = Segment(start, end)
#     waveform, sample_rate = audio.crop(path, clip)
#     return embedding_model(waveform[None])

# def add_speaker_labels(segments, embeddings, num_speakers):
#     clustering = AgglomerativeClustering(num_speakers).fit(embeddings)
#     labels = clustering.labels_
#     for i in range(len(segments)):
#         segments[i]["speaker"] = 'SPEAKER ' + str(labels[i] + 1)

# def time(secs):
#     return datetime.timedelta(seconds=round(secs))

# def get_output(segments):
#     output = ''
#     for i, segment in enumerate(segments):
#         if i == 0 or segments[i - 1]["speaker"] != segment["speaker"]:
#             if i != 0:
#                 output += '\n\n'
#             output += segment["speaker"] + ' ' + str(time(segment["start"])) + '\n\n'
#         output += segment["text"][1:] + ' '
#     return output


# Module-level string shared by the placeholder handlers below;
# greet1 mutates it so greet2 can observe the change.
s = ""


def greet1(name):
    global s
    s = "modified"
    return "Hello " + name + "!"


def greet2(name):
    return "Hi " + name + "!" + " " + s


def greet3(name):
    return "Hola " + name + "!"
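
# --- Hedged usage sketch (not part of the original file) ---
# Assuming the commented-out Whisper/pyannote pipeline above is re-enabled
# (uncomment the imports, `model`, `embedding_model`, and the helper functions),
# the diarized transcription could be exercised directly; "sample.wav" is a
# hypothetical local file used only for illustration.
#
# if __name__ == "__main__":
#     print(transcribe("sample.wav", num_speakers=2))
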
with gr.Blocks() as demo:
    with gr.Box():
        with gr.Row():
            with gr.Column():
                audio_file = gr.File(label="Upload an audio file (.wav)", file_count="single")
                # name = gr.Textbox(label="Name", placeholder="Name")  # TODO: remove
                number_of_speakers = gr.Number(label="Number of Speakers", value=2)
                with gr.Row():
                    btn_clear = gr.Button(value="Clear")
                    btn_submit = gr.Button(value="Submit")
            with gr.Column():
                title = gr.Textbox(label="Title", placeholder="Title for Conversation")
                short_summary = gr.Textbox(label="Short Summary", placeholder="Short Summary for Conversation")
                sentiment_analysis = gr.Textbox(label="Sentiment Analysis", placeholder="Sentiment Analysis for Conversation")
                quality = gr.Textbox(label="Quality of Conversation", placeholder="Quality of Conversation")
                detailed_summary = gr.Textbox(label="Detailed Summary", placeholder="Detailed Summary for Conversation")

    gr.Markdown("## Examples")
    gr.Examples(
        # Each example supplies only the `title` input, matching greet1(name).
        examples=[
            ["Harsh"],
            ["Rahul"],
        ],
        inputs=[title],
        outputs=[short_summary],
        fn=greet1,
        cache_examples=True,
    )
    gr.Markdown(
        """
        See [github.com/facebookresearch/audiocraft](https://github.com/facebookresearch/audiocraft)
        for more details.
        """
    )

demo.launch()
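
# --- Hedged wiring sketch (not part of the original file) ---
# The Clear/Submit buttons above are declared but never connected to handlers.
# Assuming greet1 (or, later, the re-enabled transcribe pipeline) is the intended
# callback, the event listeners would be attached inside the `with gr.Blocks()`
# block, for example:
#
#     btn_submit.click(fn=greet1, inputs=[title], outputs=[short_summary])
#     btn_clear.click(fn=lambda: "", inputs=None, outputs=[title])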