salmanmapkar committed
Commit e3f4780
1 Parent(s): f30cc6a

Update app.py

Files changed (1): app.py (+48 -10)
app.py CHANGED
@@ -13,7 +13,7 @@ import traceback
 import json
 pipeline = Pipeline.from_pretrained("pyannote/speaker-diarization", use_auth_token="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")
 from pydub.effects import speedup
-import moviepy.editor as mp
+# import moviepy.editor as mp
 import datetime
 import torch
 import pyannote.audio
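
Note: this hunk disables the heavy `moviepy.editor` import by commenting it out. A minimal alternative sketch is a guarded import, assuming the rest of app.py checks `mp` before using it (the guard below is not part of this commit):

```python
# Sketch: guard the optional moviepy dependency instead of commenting it out.
# `mp` stays None when moviepy is absent.
try:
    import moviepy.editor as mp
except ImportError:
    mp = None  # callers must check `mp is not None` before doing video work
```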
@@ -25,12 +25,6 @@ import contextlib
 from sklearn.cluster import AgglomerativeClustering
 import numpy as np
 
-model = whisper.load_model("medium")
-embedding_model = PretrainedSpeakerEmbedding(
-    "speechbrain/spkrec-ecapa-voxceleb",
-    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
-)
-
 
 __FILES = set()
 
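
Note: this hunk removes the module-level Whisper and speaker-embedding loads, and the `Transcribe_V2` hunk below re-creates them inside the function, so nothing heavy loads at import time. As written, though, every call reloads both models. A minimal caching sketch, assuming the `whisper`, `torch`, and `PretrainedSpeakerEmbedding` imports app.py already has (`_get_models` is a hypothetical helper, not in the commit):

```python
from functools import lru_cache

@lru_cache(maxsize=1)
def _get_models():
    # Hypothetical helper: the first call loads the models, later calls
    # return the cached pair instead of reloading them.
    model = whisper.load_model("medium")
    embedding_model = PretrainedSpeakerEmbedding(
        "speechbrain/spkrec-ecapa-voxceleb",
        device=torch.device('cuda' if torch.cuda.is_available() else 'cpu'),
    )
    return model, embedding_model
```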
@@ -145,8 +139,13 @@ def Transcribe_V1(NumberOfSpeakers, SpeakerNames="", audio="temp_audio.wav"):
 
 
 def Transcribe_V2(num_speakers, speaker_names, audio="temp_audio.wav"):
+    model = whisper.load_model("medium")
+    embedding_model = PretrainedSpeakerEmbedding(
+        "speechbrain/spkrec-ecapa-voxceleb",
+        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+    )
     SPEAKER_DICT = {}
-    SPEAKERS = [speaker.strip() for speaker in speaker_names.split(',')]
+    SPEAKERS = [speaker.strip() for speaker in speaker_names.split(',') if len(speaker)]
     def GetSpeaker(sp):
         speaker = sp
         if sp not in list(SPEAKER_DICT.keys()):
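
Note: the new `if len(speaker)` guard matters for the empty-textbox case, where `''.split(',')` still yields one element. For example:

```python
"".split(',')                                      # ['']  -> one bogus empty name
[s.strip() for s in "".split(',') if len(s)]       # []    -> no names
[s.strip() for s in "A,,B".split(',') if len(s)]   # ['A', 'B']
```

Because the guard tests the element before `strip()`, a whitespace-only entry (e.g. the `' '` from `"A, ".split(',')`) still passes and strips to `''`; `if speaker.strip()` would be the stricter variant.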
@@ -224,6 +223,7 @@ def Transcribe_V2(num_speakers, speaker_names, audio="temp_audio.wav"):
     # return output
 
 def AudioTranscribe(NumberOfSpeakers=None, SpeakerNames="", audio="", retries=5):
+    print(f"{NumberOfSpeakers}, {SpeakerNames}, {retries}")
     if retries:
         # subprocess.call(['ffmpeg', '-i', audio,'temp_audio.wav'])
         try:
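
Note: the added `print` traces each (re)try of `AudioTranscribe`. A level-filterable sketch using the standard `logging` module instead (`log_call` is a hypothetical helper, not part of the commit):

```python
import logging

logger = logging.getLogger(__name__)

def log_call(number_of_speakers, speaker_names, retries):
    # Same trace as the added print, but silenced unless DEBUG is enabled.
    logger.debug("AudioTranscribe: speakers=%s names=%r retries=%d",
                 number_of_speakers, speaker_names, retries)
```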
@@ -296,5 +296,43 @@ at = gr.Interface(
     outputs=[gr.Textbox(label="Transcribed Text", lines=15), gr.JSON(label="Transcribed JSON")]
 )
 
-demo = gr.TabbedInterface([ut, vt, at], ["Youtube URL", "Video", "Audio"])
-demo.launch()
+# demo = gr.TabbedInterface([ut, vt, at], ["Youtube URL", "Video", "Audio"])
+# demo.launch()
+with gr.Blocks() as yav_ui:
+    with gr.Tab("Input"):
+        with gr.Tab("Youtube", id=1):
+            yinput_nos = gr.Number(label="Number of Speakers", placeholder="2")
+            yinput_sn = gr.Textbox(label="Name of the Speakers (ordered by the time they speak and separated by comma)", placeholder="If Speaker 1 is first to speak followed by Speaker 2 then -> Speaker 1, Speaker 2")
+            yinput = gr.Textbox(label="Youtube Link", placeholder="https://www.youtube.com/watch?v=GECcjrYHH8w")
+            ybutton_transcribe = gr.Button("Transcribe")
+        with gr.Tab("Video", id=2):
+            vinput_nos = gr.Number(label="Number of Speakers", placeholder="2")
+            vinput_sn = gr.Textbox(label="Name of the Speakers (ordered by the time they speak and separated by comma)", placeholder="If Speaker 1 is first to speak followed by Speaker 2 then -> Speaker 1, Speaker 2")
+            vinput = gr.Video(label="Video")
+            vbutton_transcribe = gr.Button("Transcribe")
+        with gr.Tab("Audio", id=3):
+            ainput_nos = gr.Number(label="Number of Speakers", placeholder="2")
+            ainput_sn = gr.Textbox(label="Name of the Speakers (ordered by the time they speak and separated by comma)", placeholder="If Speaker 1 is first to speak followed by Speaker 2 then -> Speaker 1, Speaker 2")
+            ainput = gr.Audio(label="Audio")
+            abutton_transcribe = gr.Button("Transcribe")
+    with gr.Tab("Output"):
+        with gr.Tab("Text"):
+            output_textbox = gr.Textbox(label="Transcribed Text", lines=15)
+        with gr.Tab("JSON"):
+            output_json = gr.JSON(label="Transcribed JSON")
+    ybutton_transcribe.click(
+        fn=YoutubeTranscribe,
+        inputs=[yinput_nos,yinput_sn,yinput],
+        outputs=[output_textbox,output_json]
+    )
+    abutton_transcribe.click(
+        fn=AudioTranscribe,
+        inputs=[ainput_nos,ainput_sn,ainput],
+        outputs=[output_textbox,output_json]
+    )
+    vbutton_transcribe.click(
+        fn=VideoTranscribe,
+        inputs=[vinput_nos,vinput_sn,vinput],
+        outputs=[output_textbox,output_json]
+    )
+yav_ui.launch(debug=True)
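
Note: `gr.TabbedInterface` wraps one self-contained interface per tab, each with its own output widgets; the `gr.Blocks` rewrite lets all three input tabs feed one shared Textbox/JSON pair. The pattern, distilled to a single input tab (a sketch with a hypothetical `transcribe` stub, not the commit's handlers):

```python
import gradio as gr

def transcribe(num_speakers, speaker_names):
    # Hypothetical stub standing in for YoutubeTranscribe/VideoTranscribe/AudioTranscribe.
    text = f"{int(num_speakers)} speakers: {speaker_names}"
    return text, {"speakers": speaker_names}

with gr.Blocks() as ui:
    with gr.Tab("Input"):
        nos = gr.Number(label="Number of Speakers")
        names = gr.Textbox(label="Speaker Names")
        btn = gr.Button("Transcribe")
    with gr.Tab("Output"):
        text_out = gr.Textbox(label="Transcribed Text", lines=15)
        json_out = gr.JSON(label="Transcribed JSON")
    # One shared output pair; more input tabs would just add more .click() wirings.
    btn.click(fn=transcribe, inputs=[nos, names], outputs=[text_out, json_out])

if __name__ == "__main__":
    ui.launch()
```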