salmanmapkar committed
Commit eb43d71
1 Parent(s): 502d612

Update app.py

Files changed (1):
app.py +51 -12
app.py CHANGED
@@ -1,5 +1,6 @@
 from __future__ import unicode_literals
 import youtube_dl
+import yt_dlp
 from pydub import AudioSegment
 from pyannote.audio import Pipeline
 import re
@@ -24,12 +25,6 @@ import contextlib
 from sklearn.cluster import AgglomerativeClustering
 import numpy as np
 
-model = whisper.load_model("medium")
-embedding_model = PretrainedSpeakerEmbedding(
-    "speechbrain/spkrec-ecapa-voxceleb",
-    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
-)
-
 
 __FILES = set()
 
@@ -49,7 +44,7 @@ def RemoveAllFiles():
 
 def Transcribe_V1(NumberOfSpeakers, SpeakerNames="", audio="temp_audio.wav"):
     SPEAKER_DICT = {}
-    SPEAKERS = [speaker.strip() for speaker in SpeakerNames.split(',')]
+    SPEAKERS = [speaker.strip() for speaker in SpeakerNames.split(',') if len(speaker)]
 
     def GetSpeaker(sp):
         speaker = sp
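
Review note: the added "if len(speaker)" guard matters because str.split always returns at least one element, so an empty SpeakerNames string used to yield one bogus empty name. A quick illustration:

>>> [s.strip() for s in "".split(',')]
['']
>>> [s.strip() for s in "".split(',') if len(s)]
[]
>>> [s.strip() for s in "Alice, Bob".split(',') if len(s)]
['Alice', 'Bob']

One edge remains: the length check runs before strip, so a whitespace-only entry such as " " still becomes an empty name; "if speaker.strip()" would be the stricter guard.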
@@ -109,7 +104,7 @@ def Transcribe_V1(NumberOfSpeakers, SpeakerNames="", audio="temp_audio.wav"):
         return f"dz_{audio}.wav", dzList, segments
 
     def transcribe(dz_audio):
-        model = whisper.load_model("large")
+        model = whisper.load_model("medium")
         result = model.transcribe(dz_audio)
         # for _ in result['segments']:
         #     print(_['start'], _['end'], _['text'])
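
Review note: Transcribe_V1's inner transcribe step drops from the "large" to the "medium" Whisper checkpoint, matching Transcribe_V2. Per the openai/whisper README, "medium" needs roughly 5 GB of VRAM against roughly 10 GB for "large", traded for some accuracy. If fitting the Space's GPU is the goal, a sketch that picks the checkpoint from free memory at call time; the helper name and the 10 GB threshold are assumptions, not part of this commit:

import torch
import whisper

def load_whisper_by_memory():
    # Hypothetical helper: choose the largest checkpoint likely to fit.
    if torch.cuda.is_available():
        free_bytes, _total = torch.cuda.mem_get_info()
        name = "large" if free_bytes > 10 * 1024**3 else "medium"
    else:
        name = "medium"  # on CPU, favour the smaller checkpoint
    return whisper.load_model(name)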
@@ -144,8 +139,13 @@
 
 
 def Transcribe_V2(num_speakers, speaker_names, audio="temp_audio.wav"):
+    model = whisper.load_model("medium")
+    embedding_model = PretrainedSpeakerEmbedding(
+        "speechbrain/spkrec-ecapa-voxceleb",
+        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+    )
     SPEAKER_DICT = {}
-    SPEAKERS = [speaker.strip() for speaker in speaker_names.split(',')]
+    SPEAKERS = [speaker.strip() for speaker in speaker_names.split(',') if len(speaker)]
     def GetSpeaker(sp):
         speaker = sp
         if sp not in list(SPEAKER_DICT.keys()):
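
Review note: together with the deletion in the @@ -24,12 hunk above, this moves the Whisper and ECAPA speaker-embedding loads from import time into Transcribe_V2, so importing app.py no longer pins several gigabytes of memory. The trade-off is that every call now reloads both models; a module-level cache would keep the lazy start without the repeated cost. A minimal sketch, assuming pyannote.audio 2.x; the _MODELS cache and get_models helper are hypothetical, not in this commit:

import torch
import whisper
from pyannote.audio.pipelines.speaker_verification import PretrainedSpeakerEmbedding

_MODELS = {}  # hypothetical cache, not part of this commit

def get_models():
    # Load once, on first use, instead of at import time or on every call.
    if "whisper" not in _MODELS:
        _MODELS["whisper"] = whisper.load_model("medium")
        _MODELS["embedding"] = PretrainedSpeakerEmbedding(
            "speechbrain/spkrec-ecapa-voxceleb",
            device=torch.device("cuda" if torch.cuda.is_available() else "cpu"),
        )
    return _MODELS["whisper"], _MODELS["embedding"]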
@@ -223,6 +223,7 @@ def Transcribe_V2(num_speakers, speaker_names, audio="temp_audio.wav"):
     # return output
 
 def AudioTranscribe(NumberOfSpeakers=None, SpeakerNames="", audio="", retries=5):
+    print(f"{NumberOfSpeakers}, {SpeakerNames}, {retries}")
     if retries:
         # subprocess.call(['ffmpeg', '-i', audio,'temp_audio.wav'])
         try:
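
Review note: the added print is a debug trace of the incoming arguments. The standard logging module would give the same trace with a level and timestamp that can be silenced in production; a suggestion only, not part of the commit:

import logging

logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger(__name__)

def AudioTranscribe(NumberOfSpeakers=None, SpeakerNames="", audio="", retries=5):
    # Same trace as the added print, but filterable by level.
    logger.debug("NumberOfSpeakers=%s SpeakerNames=%r retries=%d",
                 NumberOfSpeakers, SpeakerNames, retries)
    ...  # rest of the function unchanged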
@@ -268,7 +269,7 @@ def YoutubeTranscribe(NumberOfSpeakers=None, SpeakerNames="", URL="", retries =
         }],
     }
     try:
-        with youtube_dl.YoutubeDL(ydl_opts) as ydl:
+        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
             ydl.download([URL])
     except:
         return YoutubeTranscribe(NumberOfSpeakers, SpeakerNames, URL, retries-1)
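
Review note: yt_dlp is the actively maintained fork of the largely dormant youtube_dl and keeps the same YoutubeDL entry point, which is why the surrounding ydl_opts dict and download call are untouched. Only the tail of ydl_opts is visible in this hunk, so the dict below is an assumption modelled on typical WAV extraction, with the URL taken from the UI placeholder:

import yt_dlp

ydl_opts = {
    # Assumed options; the commit's full dict is not shown in this hunk.
    "format": "bestaudio/best",
    "outtmpl": "temp_audio.%(ext)s",
    "postprocessors": [{
        "key": "FFmpegExtractAudio",
        "preferredcodec": "wav",
    }],
}
with yt_dlp.YoutubeDL(ydl_opts) as ydl:
    ydl.download(["https://www.youtube.com/watch?v=GECcjrYHH8w"])

Unrelated to the swap, the bare except: that drives the retry also swallows KeyboardInterrupt; except Exception: would be safer.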
@@ -295,5 +296,43 @@ at = gr.Interface(
     outputs=[gr.Textbox(label="Transcribed Text", lines=15), gr.JSON(label="Transcribed JSON")]
 )
 
-demo = gr.TabbedInterface([ut, vt, at], ["Youtube URL", "Video", "Audio"])
-demo.launch()
+# demo = gr.TabbedInterface([ut, vt, at], ["Youtube URL", "Video", "Audio"])
+# demo.launch()
+with gr.Blocks() as yav_ui:
+    with gr.Tab("Input"):
+        with gr.Tab("Youtube", id=1):
+            yinput_nos = gr.Number(label="Number of Speakers", placeholder="2")
+            yinput_sn = gr.Textbox(label="Name of the Speakers (ordered by the time they speak and separated by comma)", placeholder="If Speaker 1 is first to speak followed by Speaker 2 then -> Speaker 1, Speaker 2")
+            yinput = gr.Textbox(label="Youtube Link", placeholder="https://www.youtube.com/watch?v=GECcjrYHH8w")
+            ybutton_transcribe = gr.Button("Transcribe")
+        with gr.Tab("Video", id=2):
+            vinput_nos = gr.Number(label="Number of Speakers", placeholder="2")
+            vinput_sn = gr.Textbox(label="Name of the Speakers (ordered by the time they speak and separated by comma)", placeholder="If Speaker 1 is first to speak followed by Speaker 2 then -> Speaker 1, Speaker 2")
+            vinput = gr.Video(label="Video")
+            vbutton_transcribe = gr.Button("Transcribe")
+        with gr.Tab("Audio", id=3):
+            ainput_nos = gr.Number(label="Number of Speakers", placeholder="2")
+            ainput_sn = gr.Textbox(label="Name of the Speakers (ordered by the time they speak and separated by comma)", placeholder="If Speaker 1 is first to speak followed by Speaker 2 then -> Speaker 1, Speaker 2")
+            ainput = gr.Audio(label="Audio")
+            abutton_transcribe = gr.Button("Transcribe")
+    with gr.Tab("Output"):
+        with gr.Tab("Text"):
+            output_textbox = gr.Textbox(label="Transcribed Text", lines=15)
+        with gr.Tab("JSON"):
+            output_json = gr.JSON(label="Transcribed JSON")
+    ybutton_transcribe.click(
+        fn=YoutubeTranscribe,
+        inputs=[yinput_nos,yinput_sn,yinput],
+        outputs=[output_textbox,output_json]
+    )
+    abutton_transcribe.click(
+        fn=AudioTranscribe,
+        inputs=[ainput_nos,ainput_sn,ainput],
+        outputs=[output_textbox,output_json]
+    )
+    vbutton_transcribe.click(
+        fn=VideoTranscribe,
+        inputs=[vinput_nos,vinput_sn,vinput],
+        outputs=[output_textbox,output_json]
+    )
+yav_ui.launch(debug=True)
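
Review note: the commented-out gr.TabbedInterface gave each input type its own full interface, outputs included; the new gr.Blocks layout instead shares one Textbox/JSON output pair across all three input tabs by pointing every button's click event at the same components. A stripped-down sketch of that shared-output pattern; the stub handler is a placeholder, not the app's code:

import gradio as gr

def transcribe_stub(link):
    # Placeholder handler standing in for YoutubeTranscribe and friends.
    return f"transcript of {link}", {"segments": []}

with gr.Blocks() as demo:
    with gr.Tab("Input"):
        link = gr.Textbox(label="Youtube Link")
        go = gr.Button("Transcribe")
    with gr.Tab("Output"):
        text_out = gr.Textbox(label="Transcribed Text", lines=15)
        json_out = gr.JSON(label="Transcribed JSON")
    # Several buttons may target the same output components.
    go.click(fn=transcribe_stub, inputs=[link], outputs=[text_out, json_out])

demo.launch()

One caveat: vbutton_transcribe is wired to VideoTranscribe, which no hunk in this diff defines, so it must already exist elsewhere in app.py.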