salmanmapkar committed
Commit b927090
1 Parent(s): f848bd7

Update app.py

Files changed (1)
  1. app.py +55 -35
app.py CHANGED
@@ -22,14 +22,38 @@ from pyannote.audio import Audio
 from pyannote.core import Segment
 import wave
 import contextlib
-from sklearn.cluster import KMeans
-
+from sklearn.cluster import AgglomerativeClustering
 import numpy as np
 import json
 from datetime import timedelta

+from transformers import T5ForConditionalGeneration, T5Tokenizer
+
 __FILES = set()
-
+wispher_models = list(whisper._MODELS.keys())
+
+def correct_grammar(input_text,num_return_sequences=1):
+    torch_device = 'cuda' if torch.cuda.is_available() else 'cpu'
+    tokenizer = T5Tokenizer.from_pretrained('deep-learning-analytics/GrammarCorrector')
+    model = T5ForConditionalGeneration.from_pretrained('deep-learning-analytics/GrammarCorrector').to(torch_device)
+    batch = tokenizer([input_text],truncation=True,padding='max_length',max_length=len(input_text), return_tensors="pt").to(torch_device)
+    results = model.generate(**batch,max_length=len(input_text),num_beams=2, num_return_sequences=num_return_sequences, temperature=1.5)
+    generated_sequences = []
+    for generated_sequence_idx, generated_sequence in enumerate(results):
+        text = tokenizer.decode(generated_sequence, clean_up_tokenization_spaces=True, skip_special_tokens=True)
+        generated_sequences.append(text)
+    generated_text = "".join(generated_sequences)
+    _generated_text = ""
+    for idx, _sentence in enumerate(generated_text.split('.'), 0):
+        if not idx:
+            _generated_text+=_sentence+'.'
+        elif _sentence[:1]!=' ':
+            _generated_text+=' '+_sentence+'.'
+        elif _sentence[:1]=='':
+            pass
+        else:
+            _generated_text+=_sentence+'.'
+    return _generated_text

 def CreateFile(filename):
     __FILES.add(filename)
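Illustrative note (not part of the diff): the correct_grammar helper added above runs text through the deep-learning-analytics/GrammarCorrector T5 checkpoint. A minimal, self-contained sketch of that flow, assuming the checkpoint can be downloaded; the fix_segment name is only for illustration:

import torch
from transformers import T5ForConditionalGeneration, T5Tokenizer

device = 'cuda' if torch.cuda.is_available() else 'cpu'
tokenizer = T5Tokenizer.from_pretrained('deep-learning-analytics/GrammarCorrector')
gc_model = T5ForConditionalGeneration.from_pretrained('deep-learning-analytics/GrammarCorrector').to(device)

def fix_segment(text):
    # Encode one transcript segment and let the T5 model rewrite it.
    batch = tokenizer([text], truncation=True, padding='longest', return_tensors='pt').to(device)
    out = gc_model.generate(**batch, max_length=256, num_beams=2)
    return tokenizer.decode(out[0], skip_special_tokens=True)

print(fix_segment("he go to school every days"))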
@@ -140,14 +164,16 @@ def Transcribe_V1(NumberOfSpeakers, SpeakerNames="", audio="temp_audio.wav"):
     return (t_text, ({ "data": [{"speaker": speaker, "text": text} for speaker, text in conversation]}))


-def Transcribe_V2(num_speakers, speaker_names, audio="temp_audio.wav"):
-    model = whisper.load_model("medium")
+def Transcribe_V2(model, num_speakers, speaker_names, audio="temp_audio.wav"):
+    model = whisper.load_model(model)
     # embedding_model = SpeechBrainPretrainedSpeakerEmbedding("speechbrain/spkrec-ecapa-voxceleb")
+
     embedding_model = SpeechBrainPretrainedSpeakerEmbedding(
         "speechbrain/spkrec-ecapa-voxceleb",
         device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
     )
     SPEAKER_DICT = {}
+    default_speaker_names = ['A','B','C','D','E','F','G','H','I','J','K','L','M','N','O','P','Q','R','S','T','U','V','W','X','Y','Z']
     SPEAKERS = [speaker.strip() for speaker in speaker_names.split(',') if len(speaker)]
     def GetSpeaker(sp):
         speaker = sp
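Illustrative note (not part of the diff): Transcribe_V2 now receives the Whisper model name and loads it with whisper.load_model. The wispher_models list above is built from Whisper's internal _MODELS mapping; whisper.available_models() is the public way to get the same names. A small sketch, assuming temp_audio.wav exists:

import whisper

print(whisper.available_models())            # e.g. ['tiny', 'base', 'small', 'medium', ...]
model = whisper.load_model("base")           # downloads the checkpoint on first use
result = model.transcribe("temp_audio.wav")
print(result["text"])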
@@ -156,6 +182,10 @@ def Transcribe_V2(num_speakers, speaker_names, audio="temp_audio.wav"):
             t = SPEAKERS.pop(0)
             SPEAKER_DICT[sp] = t
             speaker = SPEAKER_DICT[sp]
+        elif len(default_speaker_names):
+            t = default_speaker_names.pop(0)
+            SPEAKER_DICT[sp] = t
+            speaker = SPEAKER_DICT[sp]
         else:
             speaker = SPEAKER_DICT[sp]
         return speaker
@@ -168,6 +198,9 @@ def Transcribe_V2(num_speakers, speaker_names, audio="temp_audio.wav"):
         return s
     as_audio = AudioSegment.from_wav(audio)
     DEMO_FILE = {'uri': 'blabal', 'audio': audio}
+    hparams = pipeline.parameters(instantiated=True)
+    hparams["segmentation"]["min_duration_off"] -= 0.25
+    pipeline.instantiate(hparams)
     if num_speakers:
         dz = pipeline(DEMO_FILE, num_speakers=num_speakers)
     else:
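Illustrative note (not part of the diff): the three hparams lines added above read the diarization pipeline's instantiated hyperparameters, lower segmentation.min_duration_off (the length in seconds below which non-speech gaps are smoothed over), and re-instantiate the pipeline. A hedged sketch of the same adjustment on a freshly loaded pipeline; parameter names can differ between pyannote versions, and the checkpoint may require an access token:

from pyannote.audio import Pipeline

dia_pipeline = Pipeline.from_pretrained("pyannote/speaker-diarization")

params = dia_pipeline.parameters(instantiated=True)   # current hyperparameter values
params["segmentation"]["min_duration_off"] -= 0.25    # keep more short pauses as real breaks
dia_pipeline.instantiate(params)                      # apply the tweaked values

diarization = dia_pipeline({"uri": "demo", "audio": "temp_audio.wav"}, num_speakers=2)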
@@ -201,6 +234,8 @@ def Transcribe_V2(num_speakers, speaker_names, audio="temp_audio.wav"):
     # conversation.append([GetSpeaker(segment["speaker"]), segment["text"][1:]]) # segment["speaker"] + ' ' + str(time(segment["start"])) + '\n\n'
     # conversation[-1][1] += segment["text"][1:]
     # return output
+    for idx in range(len(conversation)):
+        conversation[idx][3] = correct_grammar(conversation[idx][3])
     return ("".join([f"[{start}] - {speaker} \n{text}\n" for start, end, speaker, text in conversation])), ({ "data": [{"start": start, "end":end, "speaker": speaker, "text": text} for start, end, speaker, text in conversation]})

     def get_duration(path):
@@ -224,7 +259,7 @@ def Transcribe_V2(num_speakers, speaker_names, audio="temp_audio.wav"):
         return embedding_model(waveform[None])

     def add_speaker_labels(segments, embeddings, num_speakers):
-        clustering = KMeans(num_speakers).fit(embeddings)
+        clustering = AgglomerativeClustering(num_speakers).fit(embeddings)
         labels = clustering.labels_
         for i in range(len(segments)):
             segments[i]["speaker"] = 'SPEAKER ' + str(labels[i] + 1)
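Illustrative note (not part of the diff): add_speaker_labels now groups the per-segment speaker embeddings with hierarchical clustering instead of KMeans. A toy, self-contained sketch with random vectors standing in for real ECAPA embeddings:

import numpy as np
from sklearn.cluster import AgglomerativeClustering

rng = np.random.default_rng(0)
embeddings = rng.normal(size=(10, 192))   # 10 segments, dummy 192-dim embeddings
num_speakers = 2

clustering = AgglomerativeClustering(num_speakers).fit(embeddings)
for i, label in enumerate(clustering.labels_):
    print(f"segment {i}: SPEAKER {label + 1}")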
@@ -236,9 +271,9 @@ def Transcribe_V2(num_speakers, speaker_names, audio="temp_audio.wav"):
     if duration > 4 * 60 * 60:
         return "Audio duration too long"

-    print(json.dumps(diarization(audio)))
+    # print(json.dumps(diarization(audio)))
     result = model.transcribe(audio)
-    print(json.dumps(result))
+    # print(json.dumps(result))

     segments = result["segments"]

@@ -251,7 +286,7 @@ def Transcribe_V2(num_speakers, speaker_names, audio="temp_audio.wav"):
     return get_output(segments)
     # return output

-def AudioTranscribe(NumberOfSpeakers=None, SpeakerNames="", audio="", retries=5):
+def AudioTranscribe(NumberOfSpeakers=None, SpeakerNames="", audio="", retries=5, model='base'):
     print(f"{NumberOfSpeakers}, {SpeakerNames}, {retries}")
     if retries:
         # subprocess.call(['ffmpeg', '-i', audio,'temp_audio.wav'])
@@ -262,11 +297,11 @@ def AudioTranscribe(NumberOfSpeakers=None, SpeakerNames="", audio="", retries=5)
             return AudioTranscribe(NumberOfSpeakers, SpeakerNames, audio, retries-1)
         if not (os.path.isfile("temp_audio.wav")):
             return AudioTranscribe(NumberOfSpeakers, SpeakerNames, audio, retries-1)
-        return Transcribe_V2(NumberOfSpeakers, SpeakerNames)
+        return Transcribe_V2(model, NumberOfSpeakers, SpeakerNames)
     else:
         raise gr.Error("There is some issue ith Audio Transcriber. Please try again later!")

-def VideoTranscribe(NumberOfSpeakers=None, SpeakerNames="", video="", retries=5):
+def VideoTranscribe(NumberOfSpeakers=None, SpeakerNames="", video="", retries=5, model='base'):
     if retries:
         try:
             clip = mp.VideoFileClip(video)
@@ -278,12 +313,11 @@ def VideoTranscribe(NumberOfSpeakers=None, SpeakerNames="", video="", retries=5)
             return VideoTranscribe(NumberOfSpeakers, SpeakerNames, video, retries-1)
         if not (os.path.isfile("temp_audio.wav")):
             return VideoTranscribe(NumberOfSpeakers, SpeakerNames, video, retries-1)
-        return Transcribe_V2(NumberOfSpeakers, SpeakerNames)
+        return Transcribe_V2(model, NumberOfSpeakers, SpeakerNames)
     else:
         raise gr.Error("There is some issue ith Video Transcriber. Please try again later!")
-    return Transcribe_V2(NumberOfSpeakers, SpeakerNames)

-def YoutubeTranscribe(NumberOfSpeakers=None, SpeakerNames="", URL="", retries = 5):
+def YoutubeTranscribe(NumberOfSpeakers=None, SpeakerNames="", URL="", retries = 5, model='base'):
     if retries:
         if "youtu" not in URL.lower():
             raise gr.Error(f"{URL} is not a valid youtube URL.")
@@ -305,42 +339,28 @@ def YoutubeTranscribe(NumberOfSpeakers=None, SpeakerNames="", URL="", retries =
         stream = ffmpeg.input('temp_audio.m4a')
         stream = ffmpeg.output(stream, 'temp_audio.wav')
         RemoveFile("temp_audio.m4a")
-        return Transcribe_V2(NumberOfSpeakers, SpeakerNames)
+        return Transcribe_V2(model, NumberOfSpeakers, SpeakerNames)
     else:
         raise gr.Error(f"Unable to get video from {URL}")

-ut = gr.Interface(
-    fn=YoutubeTranscribe,
-    inputs=[gr.Number(label="Number of Speakers", placeholder="2"), gr.Textbox(label="Name of the Speakers (ordered by the time they speak and separated by comma)", placeholder="If Speaker 1 is first to speak followed by Speaker 2 then -> Speaker 1, Speaker 2"), gr.Textbox(label="Youtube Link", placeholder="https://www.youtube.com/watch?v=GECcjrYHH8w"),],
-    outputs=[gr.Textbox(label="Transcribed Text", lines=15), gr.JSON(label="Transcribed JSON")]
-)
-vt = gr.Interface(
-    fn=VideoTranscribe,
-    inputs=[gr.Number(label="Number of Speakers", placeholder="2"), gr.Textbox(label="Name of the Speakers (ordered by the time they speak and separated by comma)", placeholder="If Speaker 1 is first to speak followed by Speaker 2 then -> Speaker 1, Speaker 2"), 'video'],
-    outputs=[gr.Textbox(label="Transcribed Text", lines=15), gr.JSON(label="Transcribed JSON")]
-)
-at = gr.Interface(
-    fn=AudioTranscribe,
-    inputs=[gr.Number(label="Number of Speakers", placeholder="2"), gr.Textbox(label="Name of the Speakers (ordered by the time they speak and separated by comma)", placeholder="If Speaker 1 is first to speak followed by Speaker 2 then -> Speaker 1, Speaker 2"), 'audio'],
-    outputs=[gr.Textbox(label="Transcribed Text", lines=15), gr.JSON(label="Transcribed JSON")]
-)

-# demo = gr.TabbedInterface([ut, vt, at], ["Youtube URL", "Video", "Audio"])
-# demo.launch()
 with gr.Blocks() as yav_ui:
     with gr.Row():
         with gr.Column():
             with gr.Tab("Youtube", id=1):
+                ysz = gr.Dropdown(label="Model Size", choices=wispher_models , value='base')
                 yinput_nos = gr.Number(label="Number of Speakers", placeholder="2")
                 yinput_sn = gr.Textbox(label="Name of the Speakers (ordered by the time they speak and separated by comma)", placeholder="If Speaker 1 is first to speak followed by Speaker 2 then -> Speaker 1, Speaker 2")
                 yinput = gr.Textbox(label="Youtube Link", placeholder="https://www.youtube.com/watch?v=GECcjrYHH8w")
                 ybutton_transcribe = gr.Button("Transcribe", show_progress=True, scroll_to_output=True)
             with gr.Tab("Video", id=2):
+                vsz = gr.Dropdown(label="Model Size", choices=wispher_models, value='base')
                 vinput_nos = gr.Number(label="Number of Speakers", placeholder="2")
                 vinput_sn = gr.Textbox(label="Name of the Speakers (ordered by the time they speak and separated by comma)", placeholder="If Speaker 1 is first to speak followed by Speaker 2 then -> Speaker 1, Speaker 2")
                 vinput = gr.Video(label="Video")
                 vbutton_transcribe = gr.Button("Transcribe", show_progress=True, scroll_to_output=True)
             with gr.Tab("Audio", id=3):
+                asz = gr.Dropdown(label="Model Size", choices=wispher_models , value='base')
                 ainput_nos = gr.Number(label="Number of Speakers", placeholder="2")
                 ainput_sn = gr.Textbox(label="Name of the Speakers (ordered by the time they speak and separated by comma)", placeholder="If Speaker 1 is first to speak followed by Speaker 2 then -> Speaker 1, Speaker 2")
                 ainput = gr.Audio(label="Audio", type="filepath")
@@ -352,17 +372,17 @@ with gr.Blocks() as yav_ui:
             output_json = gr.JSON(label="Transcribed JSON")
     ybutton_transcribe.click(
         fn=YoutubeTranscribe,
-        inputs=[yinput_nos,yinput_sn,yinput],
+        inputs=[yinput_nos,yinput_sn,yinput, ysz],
         outputs=[output_textbox,output_json]
     )
     abutton_transcribe.click(
         fn=AudioTranscribe,
-        inputs=[ainput_nos,ainput_sn,ainput],
+        inputs=[ainput_nos,ainput_sn,ainput, asz],
         outputs=[output_textbox,output_json]
     )
     vbutton_transcribe.click(
         fn=VideoTranscribe,
-        inputs=[vinput_nos,vinput_sn,vinput],
+        inputs=[vinput_nos,vinput_sn,vinput, vsz],
         outputs=[output_textbox,output_json]
     )
 yav_ui.launch(debug=True)
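Illustrative note (not part of the diff): Gradio hands the components listed in inputs to fn positionally, in the order given. A minimal, self-contained sketch of that wiring pattern; all names below are hypothetical, not the app's own:

import gradio as gr

def transcribe_stub(num_speakers, speaker_names, url, model_size):
    # Placeholder body; a real app would run its transcription pipeline here.
    return f"speakers={num_speakers}, names={speaker_names}, model={model_size}, url={url}"

with gr.Blocks() as demo:
    nos = gr.Number(label="Number of Speakers")
    names = gr.Textbox(label="Speaker Names")
    link = gr.Textbox(label="Youtube Link")
    size = gr.Dropdown(label="Model Size", choices=["tiny", "base", "small"], value="base")
    out = gr.Textbox(label="Output")
    gr.Button("Transcribe").click(fn=transcribe_stub, inputs=[nos, names, link, size], outputs=[out])

# demo.launch()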
 