add UI interface
app.py
CHANGED
@@ -1,4 +1,4 @@
-import whisper
+# import whisper
 import gradio as gr
 import datetime
 
@@ -6,96 +6,144 @@ import subprocess
 import wave
 import contextlib
 
-import torch
-import pyannote.audio
-from pyannote.audio.pipelines.speaker_verification import PretrainedSpeakerEmbedding
-from pyannote.audio import Audio
-from pyannote.core import Segment
-from sklearn.cluster import AgglomerativeClustering
-import numpy as np
-
-model = whisper.load_model("large-v2")
-embedding_model = PretrainedSpeakerEmbedding(
-    "speechbrain/spkrec-ecapa-voxceleb",
-    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
-)
-
-def transcribe(audio, num_speakers):
-  path, error = convert_to_wav(audio)
-  if error is not None:
-    return error
-
-  duration = get_duration(path)
-  if duration > 4 * 60 * 60:
-    return "Audio duration too long"
-
-  result = model.transcribe(path)
-  segments = result["segments"]
-
-  num_speakers = min(max(round(num_speakers), 1), len(segments))
-  if len(segments) == 1:
-    segments[0]['speaker'] = 'SPEAKER 1'
-  else:
-    embeddings = make_embeddings(path, segments, duration)
-    add_speaker_labels(segments, embeddings, num_speakers)
-  output = get_output(segments)
-  return output
-
-def convert_to_wav(path):
-  if path[-3:] != 'wav':
-    new_path = '.'.join(path.split('.')[:-1]) + '.wav'
-    try:
-      subprocess.call(['ffmpeg', '-i', path, new_path, '-y'])
-    except:
-      return path, 'Error: Could not convert file to .wav'
-    path = new_path
-  return path, None
-
-def get_duration(path):
-  with contextlib.closing(wave.open(path,'r')) as f:
-    frames = f.getnframes()
-    rate = f.getframerate()
-    return frames / float(rate)
-
-def make_embeddings(path, segments, duration):
-  embeddings = np.zeros(shape=(len(segments), 192))
-  for i, segment in enumerate(segments):
-    embeddings[i] = segment_embedding(path, segment, duration)
-  return np.nan_to_num(embeddings)
-
-audio = Audio()
-
-def segment_embedding(path, segment, duration):
-  start = segment["start"]
-  # Whisper overshoots the end timestamp in the last segment
-  end = min(duration, segment["end"])
-  clip = Segment(start, end)
-  waveform, sample_rate = audio.crop(path, clip)
-  return embedding_model(waveform[None])
-
-def add_speaker_labels(segments, embeddings, num_speakers):
-  clustering = AgglomerativeClustering(num_speakers).fit(embeddings)
-  labels = clustering.labels_
-  for i in range(len(segments)):
-    segments[i]["speaker"] = 'SPEAKER ' + str(labels[i] + 1)
-
-def time(secs):
-  return datetime.timedelta(seconds=round(secs))
-
-def get_output(segments):
-  output = ''
-  for (i, segment) in enumerate(segments):
-    if i == 0 or segments[i - 1]["speaker"] != segment["speaker"]:
-      if i != 0:
-        output += '\n\n'
-      output += segment["speaker"] + ' ' + str(time(segment["start"])) + '\n\n'
-    output += segment["text"][1:] + ' '
-  return output
-
+# import torch
+# import pyannote.audio
+# from pyannote.audio.pipelines.speaker_verification import PretrainedSpeakerEmbedding
+# from pyannote.audio import Audio
+# from pyannote.core import Segment
+# from sklearn.cluster import AgglomerativeClustering
+# import numpy as np
+
+# model = whisper.load_model("large-v2")
+# embedding_model = PretrainedSpeakerEmbedding(
+#     "speechbrain/spkrec-ecapa-voxceleb",
+#     device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+# )
+
+# def transcribe(audio, num_speakers):
+#   path, error = convert_to_wav(audio)
+#   if error is not None:
+#     return error
+
+#   duration = get_duration(path)
+#   if duration > 4 * 60 * 60:
+#     return "Audio duration too long"
+
+#   result = model.transcribe(path)
+#   segments = result["segments"]
+
+#   num_speakers = min(max(round(num_speakers), 1), len(segments))
+#   if len(segments) == 1:
+#     segments[0]['speaker'] = 'SPEAKER 1'
+#   else:
+#     embeddings = make_embeddings(path, segments, duration)
+#     add_speaker_labels(segments, embeddings, num_speakers)
+#   output = get_output(segments)
+#   return output
+
+# def convert_to_wav(path):
+#   if path[-3:] != 'wav':
+#     new_path = '.'.join(path.split('.')[:-1]) + '.wav'
+#     try:
+#       subprocess.call(['ffmpeg', '-i', path, new_path, '-y'])
+#     except:
+#       return path, 'Error: Could not convert file to .wav'
+#     path = new_path
+#   return path, None
+
+# def get_duration(path):
+#   with contextlib.closing(wave.open(path,'r')) as f:
+#     frames = f.getnframes()
+#     rate = f.getframerate()
+#     return frames / float(rate)
+
+# def make_embeddings(path, segments, duration):
+#   embeddings = np.zeros(shape=(len(segments), 192))
+#   for i, segment in enumerate(segments):
+#     embeddings[i] = segment_embedding(path, segment, duration)
+#   return np.nan_to_num(embeddings)
+
+# audio = Audio()
+
+# def segment_embedding(path, segment, duration):
+#   start = segment["start"]
+#   # Whisper overshoots the end timestamp in the last segment
+#   end = min(duration, segment["end"])
+#   clip = Segment(start, end)
+#   waveform, sample_rate = audio.crop(path, clip)
+#   return embedding_model(waveform[None])
+
+# def add_speaker_labels(segments, embeddings, num_speakers):
+#   clustering = AgglomerativeClustering(num_speakers).fit(embeddings)
+#   labels = clustering.labels_
+#   for i in range(len(segments)):
+#     segments[i]["speaker"] = 'SPEAKER ' + str(labels[i] + 1)
+
+# def time(secs):
+#   return datetime.timedelta(seconds=round(secs))
+
+# def get_output(segments):
+#   output = ''
+#   for (i, segment) in enumerate(segments):
+#     if i == 0 or segments[i - 1]["speaker"] != segment["speaker"]:
+#       if i != 0:
+#         output += '\n\n'
+#       output += segment["speaker"] + ' ' + str(time(segment["start"])) + '\n\n'
+#     output += segment["text"][1:] + ' '
+#   return output
+
+s = ""
+
+def greet1(name):
+    global s
+    s = "modified"
+    return "Hello " + name + "!"
+
+
+def greet2(name):
+    return "Hi " + name + "!" + " " + s
+
+
+def greet3(name):
+    return "Hola " + name + "!"
 
 with gr.Blocks() as demo:
-    with gr.
-    gr.
-
+    with gr.Row():
+        with gr.Column():
+            audio_file = gr.UploadButton(label="Upload a Audio file (.wav)")
+            # name = gr.Textbox(label="Name", placeholder="Name") # TODO: remove
+            number_of_speakers = gr.Number(label="Number of Speakers", value=2)
+            with gr.Row():
+                btn_clear = gr.Button(value="Clear")
+                btn_submit = gr.Button(value="Submit")
+        with gr.Column():
+            title = gr.Textbox(label="Title", placeholder="Title for Conversation")
+            short_summary = gr.Textbox(label="Short Summary", placeholder="Short Summary for Conversation")
+            sentiment_analysis = gr.Textbox(label="Sentiment Analysis", placeholder="Sentiment Analysis for Conversation")
+            quality = gr.Textbox(label="Quality of Conversation", placeholder="Quality of Conversation")
+            detailed_summary = gr.Textbox(label="Detailed Summary", placeholder="Detailed Summary for Conversation")
+    gr.Markdown("## Examples")
+    gr.Examples(
+        examples=[
+            [
+                "Harsh",
+                2,
+            ],
+            [
+                "Rahul",
+                2,
+            ],
+        ],
+        inputs=[title],
+        outputs=[short_summary],
+        fn=greet1,
+        cache_examples=True,
+    )
+    gr.Markdown(
+        """
+        See [github.com/facebookresearch/audiocraft](https://github.com/facebookresearch/audiocraft)
+        for more details.
+        """
+    )
 
 demo.launch()
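The new Blocks layout defines the inputs and outputs, but this commit does not yet connect the Clear/Submit buttons to any callback; the greet* helpers are only placeholders used by gr.Examples. A minimal sketch of how the Submit button could be wired up, assuming the component names from the layout above and a hypothetical analyze() placeholder (the real transcription/summarization logic is not part of this commit):

# Hypothetical wiring, not in the commit. `analyze` is a placeholder name; the
# component variables (btn_submit, audio_file, ...) are the ones defined in the
# Blocks layout above. These lines would go inside the
# `with gr.Blocks() as demo:` block, before demo.launch().

def analyze(audio, num_speakers):
    # Once the commented-out whisper/pyannote pipeline is re-enabled, this
    # could call transcribe(...) and the summarization steps instead.
    transcript = "transcript placeholder"
    return "Untitled", transcript, "n/a", "n/a", transcript

btn_submit.click(
    fn=analyze,
    # assumes the gr.UploadButton's uploaded-file value can be passed as an input
    inputs=[audio_file, number_of_speakers],
    outputs=[title, short_summary, sentiment_analysis, quality, detailed_summary],
)
btn_clear.click(
    fn=lambda: ("", "", "", "", ""),  # reset all five output textboxes
    inputs=None,
    outputs=[title, short_summary, sentiment_analysis, quality, detailed_summary],
)

gr.Button.click accepts fn, inputs, and outputs, so either button can drive the same set of output components; swapping analyze for the real pipeline later would not change the UI code.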