salmanmapkar commited on
Commit
8d881b7
1 Parent(s): 2f47bf1

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +18 -27
app.py CHANGED
@@ -28,7 +28,7 @@ import json
28
  from datetime import timedelta
29
 
30
  __FILES = set()
31
-
32
 
33
  def CreateFile(filename):
34
  __FILES.add(filename)
@@ -139,14 +139,16 @@ def Transcribe_V1(NumberOfSpeakers, SpeakerNames="", audio="temp_audio.wav"):
139
  return (t_text, ({ "data": [{"speaker": speaker, "text": text} for speaker, text in conversation]}))
140
 
141
 
142
- def Transcribe_V2(num_speakers, speaker_names, audio="temp_audio.wav"):
143
- model = whisper.load_model("medium")
144
  # embedding_model = SpeechBrainPretrainedSpeakerEmbedding("speechbrain/spkrec-ecapa-voxceleb")
 
145
  embedding_model = SpeechBrainPretrainedSpeakerEmbedding(
146
  "speechbrain/spkrec-ecapa-voxceleb",
147
  device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
148
  )
149
  SPEAKER_DICT = {}
 
150
  SPEAKERS = [speaker.strip() for speaker in speaker_names.split(',') if len(speaker)]
151
  def GetSpeaker(sp):
152
  speaker = sp
@@ -155,6 +157,10 @@ def Transcribe_V2(num_speakers, speaker_names, audio="temp_audio.wav"):
155
  t = SPEAKERS.pop(0)
156
  SPEAKER_DICT[sp] = t
157
  speaker = SPEAKER_DICT[sp]
 
 
 
 
158
  else:
159
  speaker = SPEAKER_DICT[sp]
160
  return speaker
@@ -253,7 +259,7 @@ def Transcribe_V2(num_speakers, speaker_names, audio="temp_audio.wav"):
253
  return get_output(segments)
254
  # return output
255
 
256
- def AudioTranscribe(NumberOfSpeakers=None, SpeakerNames="", audio="", retries=5):
257
  print(f"{NumberOfSpeakers}, {SpeakerNames}, {retries}")
258
  if retries:
259
  # subprocess.call(['ffmpeg', '-i', audio,'temp_audio.wav'])
@@ -264,11 +270,11 @@ def AudioTranscribe(NumberOfSpeakers=None, SpeakerNames="", audio="", retries=5)
264
  return AudioTranscribe(NumberOfSpeakers, SpeakerNames, audio, retries-1)
265
  if not (os.path.isfile("temp_audio.wav")):
266
  return AudioTranscribe(NumberOfSpeakers, SpeakerNames, audio, retries-1)
267
- return Transcribe_V2(NumberOfSpeakers, SpeakerNames)
268
  else:
269
  raise gr.Error("There is some issue with Audio Transcriber. Please try again later!")
270
 
271
- def VideoTranscribe(NumberOfSpeakers=None, SpeakerNames="", video="", retries=5):
272
  if retries:
273
  try:
274
  clip = mp.VideoFileClip(video)
@@ -280,12 +286,11 @@ def VideoTranscribe(NumberOfSpeakers=None, SpeakerNames="", video="", retries=5)
280
  return VideoTranscribe(NumberOfSpeakers, SpeakerNames, video, retries-1)
281
  if not (os.path.isfile("temp_audio.wav")):
282
  return VideoTranscribe(NumberOfSpeakers, SpeakerNames, video, retries-1)
283
- return Transcribe_V2(NumberOfSpeakers, SpeakerNames)
284
  else:
285
  raise gr.Error("There is some issue with Video Transcriber. Please try again later!")
286
- return Transcribe_V2(NumberOfSpeakers, SpeakerNames)
287
 
288
- def YoutubeTranscribe(NumberOfSpeakers=None, SpeakerNames="", URL="", retries = 5):
289
  if retries:
290
  if "youtu" not in URL.lower():
291
  raise gr.Error(f"{URL} is not a valid youtube URL.")
@@ -307,42 +312,28 @@ def YoutubeTranscribe(NumberOfSpeakers=None, SpeakerNames="", URL="", retries =
307
  stream = ffmpeg.input('temp_audio.m4a')
308
  stream = ffmpeg.output(stream, 'temp_audio.wav')
309
  RemoveFile("temp_audio.m4a")
310
- return Transcribe_V2(NumberOfSpeakers, SpeakerNames)
311
  else:
312
  raise gr.Error(f"Unable to get video from {URL}")
313
 
314
- ut = gr.Interface(
315
- fn=YoutubeTranscribe,
316
- inputs=[gr.Number(label="Number of Speakers", placeholder="2"), gr.Textbox(label="Name of the Speakers (ordered by the time they speak and separated by comma)", placeholder="If Speaker 1 is first to speak followed by Speaker 2 then -> Speaker 1, Speaker 2"), gr.Textbox(label="Youtube Link", placeholder="https://www.youtube.com/watch?v=GECcjrYHH8w"),],
317
- outputs=[gr.Textbox(label="Transcribed Text", lines=15), gr.JSON(label="Transcribed JSON")]
318
- )
319
- vt = gr.Interface(
320
- fn=VideoTranscribe,
321
- inputs=[gr.Number(label="Number of Speakers", placeholder="2"), gr.Textbox(label="Name of the Speakers (ordered by the time they speak and separated by comma)", placeholder="If Speaker 1 is first to speak followed by Speaker 2 then -> Speaker 1, Speaker 2"), 'video'],
322
- outputs=[gr.Textbox(label="Transcribed Text", lines=15), gr.JSON(label="Transcribed JSON")]
323
- )
324
- at = gr.Interface(
325
- fn=AudioTranscribe,
326
- inputs=[gr.Number(label="Number of Speakers", placeholder="2"), gr.Textbox(label="Name of the Speakers (ordered by the time they speak and separated by comma)", placeholder="If Speaker 1 is first to speak followed by Speaker 2 then -> Speaker 1, Speaker 2"), 'audio'],
327
- outputs=[gr.Textbox(label="Transcribed Text", lines=15), gr.JSON(label="Transcribed JSON")]
328
- )
329
 
330
- # demo = gr.TabbedInterface([ut, vt, at], ["Youtube URL", "Video", "Audio"])
331
- # demo.launch()
332
  with gr.Blocks() as yav_ui:
333
  with gr.Row():
334
  with gr.Column():
335
  with gr.Tab("Youtube", id=1):
 
336
  yinput_nos = gr.Number(label="Number of Speakers", placeholder="2")
337
  yinput_sn = gr.Textbox(label="Name of the Speakers (ordered by the time they speak and separated by comma)", placeholder="If Speaker 1 is first to speak followed by Speaker 2 then -> Speaker 1, Speaker 2")
338
  yinput = gr.Textbox(label="Youtube Link", placeholder="https://www.youtube.com/watch?v=GECcjrYHH8w")
339
  ybutton_transcribe = gr.Button("Transcribe", show_progress=True, scroll_to_output=True)
340
  with gr.Tab("Video", id=2):
 
341
  vinput_nos = gr.Number(label="Number of Speakers", placeholder="2")
342
  vinput_sn = gr.Textbox(label="Name of the Speakers (ordered by the time they speak and separated by comma)", placeholder="If Speaker 1 is first to speak followed by Speaker 2 then -> Speaker 1, Speaker 2")
343
  vinput = gr.Video(label="Video")
344
  vbutton_transcribe = gr.Button("Transcribe", show_progress=True, scroll_to_output=True)
345
  with gr.Tab("Audio", id=3):
 
346
  ainput_nos = gr.Number(label="Number of Speakers", placeholder="2")
347
  ainput_sn = gr.Textbox(label="Name of the Speakers (ordered by the time they speak and separated by comma)", placeholder="If Speaker 1 is first to speak followed by Speaker 2 then -> Speaker 1, Speaker 2")
348
  ainput = gr.Audio(label="Audio", type="filepath")
 
28
  from datetime import timedelta
29
 
30
  __FILES = set()
31
+ wispher_models = list(whisper._MODELS.keys())
32
 
33
  def CreateFile(filename):
34
  __FILES.add(filename)
 
139
  return (t_text, ({ "data": [{"speaker": speaker, "text": text} for speaker, text in conversation]}))
140
 
141
 
142
+ def Transcribe_V2(model, num_speakers, speaker_names, audio="temp_audio.wav"):
143
+ #model = whisper.load_model("medium")
144
  # embedding_model = SpeechBrainPretrainedSpeakerEmbedding("speechbrain/spkrec-ecapa-voxceleb")
145
+
146
  embedding_model = SpeechBrainPretrainedSpeakerEmbedding(
147
  "speechbrain/spkrec-ecapa-voxceleb",
148
  device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
149
  )
150
  SPEAKER_DICT = {}
151
+ default_speaker_names = ['A','B','C','D','E','F','G','H','I','J','K','L','M','N','O','P','Q','R','S','T','U','V','W','X','Y','Z']
152
  SPEAKERS = [speaker.strip() for speaker in speaker_names.split(',') if len(speaker)]
153
  def GetSpeaker(sp):
154
  speaker = sp
 
157
  t = SPEAKERS.pop(0)
158
  SPEAKER_DICT[sp] = t
159
  speaker = SPEAKER_DICT[sp]
160
+ elif len(default_speaker_names):
161
+ t = default_speaker_names.pop(0)
162
+ SPEAKER_DICT[sp] = t
163
+ speaker = SPEAKER_DICT[sp]
164
  else:
165
  speaker = SPEAKER_DICT[sp]
166
  return speaker
 
259
  return get_output(segments)
260
  # return output
261
 
262
+ def AudioTranscribe(NumberOfSpeakers=None, SpeakerNames="", audio="", retries=5, model='base'):
263
  print(f"{NumberOfSpeakers}, {SpeakerNames}, {retries}")
264
  if retries:
265
  # subprocess.call(['ffmpeg', '-i', audio,'temp_audio.wav'])
 
270
  return AudioTranscribe(NumberOfSpeakers, SpeakerNames, audio, retries-1)
271
  if not (os.path.isfile("temp_audio.wav")):
272
  return AudioTranscribe(NumberOfSpeakers, SpeakerNames, audio, retries-1)
273
+ return Transcribe_V2(model, NumberOfSpeakers, SpeakerNames)
274
  else:
275
  raise gr.Error("There is some issue with Audio Transcriber. Please try again later!")
276
 
277
+ def VideoTranscribe(NumberOfSpeakers=None, SpeakerNames="", video="", retries=5, model='base'):
278
  if retries:
279
  try:
280
  clip = mp.VideoFileClip(video)
 
286
  return VideoTranscribe(NumberOfSpeakers, SpeakerNames, video, retries-1)
287
  if not (os.path.isfile("temp_audio.wav")):
288
  return VideoTranscribe(NumberOfSpeakers, SpeakerNames, video, retries-1)
289
+ return Transcribe_V2(model, NumberOfSpeakers, SpeakerNames)
290
  else:
291
  raise gr.Error("There is some issue with Video Transcriber. Please try again later!")
 
292
 
293
+ def YoutubeTranscribe(NumberOfSpeakers=None, SpeakerNames="", URL="", retries = 5, model='base'):
294
  if retries:
295
  if "youtu" not in URL.lower():
296
  raise gr.Error(f"{URL} is not a valid youtube URL.")
 
312
  stream = ffmpeg.input('temp_audio.m4a')
313
  stream = ffmpeg.output(stream, 'temp_audio.wav')
314
  RemoveFile("temp_audio.m4a")
315
+ return Transcribe_V2(model, NumberOfSpeakers, SpeakerNames)
316
  else:
317
  raise gr.Error(f"Unable to get video from {URL}")
318
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
319
 
 
 
320
  with gr.Blocks() as yav_ui:
321
  with gr.Row():
322
  with gr.Column():
323
  with gr.Tab("Youtube", id=1):
324
+ ysz = gr.Dropdown(label="Model Size", choices=wispher_models, value='base')
325
  yinput_nos = gr.Number(label="Number of Speakers", placeholder="2")
326
  yinput_sn = gr.Textbox(label="Name of the Speakers (ordered by the time they speak and separated by comma)", placeholder="If Speaker 1 is first to speak followed by Speaker 2 then -> Speaker 1, Speaker 2")
327
  yinput = gr.Textbox(label="Youtube Link", placeholder="https://www.youtube.com/watch?v=GECcjrYHH8w")
328
  ybutton_transcribe = gr.Button("Transcribe", show_progress=True, scroll_to_output=True)
329
  with gr.Tab("Video", id=2):
330
+ vsz = gr.Dropdown(label="Model Size", choices=wispher_models, value='base')
331
  vinput_nos = gr.Number(label="Number of Speakers", placeholder="2")
332
  vinput_sn = gr.Textbox(label="Name of the Speakers (ordered by the time they speak and separated by comma)", placeholder="If Speaker 1 is first to speak followed by Speaker 2 then -> Speaker 1, Speaker 2")
333
  vinput = gr.Video(label="Video")
334
  vbutton_transcribe = gr.Button("Transcribe", show_progress=True, scroll_to_output=True)
335
  with gr.Tab("Audio", id=3):
336
+ asz = gr.Dropdown(label="Model Size", choices=wispher_models, value='base')
337
  ainput_nos = gr.Number(label="Number of Speakers", placeholder="2")
338
  ainput_sn = gr.Textbox(label="Name of the Speakers (ordered by the time they speak and separated by comma)", placeholder="If Speaker 1 is first to speak followed by Speaker 2 then -> Speaker 1, Speaker 2")
339
  ainput = gr.Audio(label="Audio", type="filepath")