salmanmapkar committed
Commit 2f47bf1
1 Parent(s): 25a24aa

Update app.py

Files changed (1)
  1. app.py +65 -33
app.py CHANGED
@@ -17,14 +17,15 @@ import moviepy.editor as mp
 import datetime
 import torch
 import pyannote.audio
-from pyannote.audio.pipelines.speaker_verification import PretrainedSpeakerEmbedding
+from pyannote.audio.pipelines.speaker_verification import SpeechBrainPretrainedSpeakerEmbedding #PyannoteAudioPretrainedSpeakerEmbedding
 from pyannote.audio import Audio
 from pyannote.core import Segment
 import wave
 import contextlib
 from sklearn.cluster import AgglomerativeClustering
 import numpy as np
-
+import json
+from datetime import timedelta
 
 __FILES = set()
 
@@ -140,7 +141,8 @@ def Transcribe_V1(NumberOfSpeakers, SpeakerNames="", audio="temp_audio.wav"):
 
 def Transcribe_V2(num_speakers, speaker_names, audio="temp_audio.wav"):
     model = whisper.load_model("medium")
-    embedding_model = PretrainedSpeakerEmbedding(
+    # embedding_model = SpeechBrainPretrainedSpeakerEmbedding("speechbrain/spkrec-ecapa-voxceleb")
+    embedding_model = SpeechBrainPretrainedSpeakerEmbedding(
        "speechbrain/spkrec-ecapa-voxceleb",
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    )
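Note: `SpeechBrainPretrainedSpeakerEmbedding` is the pyannote.audio 2.x name for the SpeechBrain-backed embedding loader. For orientation, a minimal sketch of how such a model is typically applied to a crop of the input file, using the `Audio`/`Segment` imports above; the five-second window and variable names are illustrative, not part of this commit:

    from pyannote.audio import Audio
    from pyannote.core import Segment

    # Read a 16 kHz mono crop of the file and embed it; the ECAPA-TDNN model
    # returns one 192-dimensional speaker embedding per batch item.
    audio_reader = Audio(sample_rate=16000, mono=True)
    waveform, sample_rate = audio_reader.crop("temp_audio.wav", Segment(0.0, 5.0))
    embedding = embedding_model(waveform[None])  # numpy array of shape (1, 192)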
@@ -158,23 +160,50 @@ def Transcribe_V2(num_speakers, speaker_names, audio="temp_audio.wav"):
         return speaker
 
     # audio = Audio()
+    def diarization(audio):
+        def millisec(timeStr):
+            spl = timeStr.split(":")
+            s = (int)((int(spl[0]) * 60 * 60 + int(spl[1]) * 60 + float(spl[2])) * 1000)
+            return s
+        as_audio = AudioSegment.from_wav(audio)
+        DEMO_FILE = {'uri': 'blabal', 'audio': audio}
+        hparams = pipeline.parameters(instantiated=True)
+        hparams["segmentation"]["min_duration_off"] -= 0.25
+        pipeline.instantiate(hparams)
+        if num_speakers:
+            dz = pipeline(DEMO_FILE, num_speakers=num_speakers)
+        else:
+            dz = pipeline(DEMO_FILE)
+        with open(CreateFile(f"diarization_{audio}.txt"), "w") as text_file:
+            text_file.write(str(dz))
+        dz = open(CreateFile(f"diarization_{audio}.txt")).read().splitlines()
+        print(dz)
+        dzList = []
+        for l in dz:
+            start, end = tuple(re.findall('[0-9]+:[0-9]+:[0-9]+\.[0-9]+', string=l))
+            start = millisec(start)
+            end = millisec(end)
+            lex = GetSpeaker(re.findall('(SPEAKER_[0-9][0-9])', string=l)[0])
+            dzList.append([start, end, lex])
+        return dzList
+
     def get_output(segments):
         # print(segments)
         conversation=[]
         for (i, segment) in enumerate(segments):
             # print(f"{i}, {segment['speaker']}, {segments[i - 1]['speaker']}")
             if not len(conversation):
-                conversation.append([GetSpeaker(segment["speaker"]), segment["text"].lstrip()])
-            elif conversation[-1][0] == GetSpeaker(segment["speaker"]):
-                conversation[-1][1] += segment["text"].lstrip()
+                conversation.append([str(timedelta(seconds=float(segment['start']))), str(timedelta(seconds=float(segment['end']))), GetSpeaker(segment["speaker"]), segment["text"].lstrip()])
+            elif conversation[-1][2] == GetSpeaker(segment["speaker"]):
+                conversation[-1][3] += segment["text"].lstrip()
             else:
-                conversation.append([GetSpeaker(segment["speaker"]), segment["text"].lstrip()])
+                conversation.append([str(timedelta(seconds=float(segment['start']))), str(timedelta(seconds=float(segment['end']))), GetSpeaker(segment["speaker"]), segment["text"].lstrip()])
             # if i == 0 or segments[i - 1]["speaker"] != segment["speaker"]:
             #     if i != 0:
             #         conversation.append([GetSpeaker(segment["speaker"]), segment["text"][1:]])  # segment["speaker"] + ' ' + str(time(segment["start"])) + '\n\n'
             #     conversation[-1][1] += segment["text"][1:]
         # return output
-        return ("".join([f"{speaker} --> {text}\n" for speaker, text in conversation])), ({ "data": [{"speaker": speaker, "text": text} for speaker, text in conversation]})
+        return ("".join([f"[{start}] - {speaker} \n{text}\n" for start, end, speaker, text in conversation])), ({ "data": [{"start": start, "end": end, "speaker": speaker, "text": text} for start, end, speaker, text in conversation]})
 
     def get_duration(path):
         with contextlib.closing(wave.open(path,'r')) as f:
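For illustration, `str()` of a pyannote annotation yields one line per speaker turn, roughly of the form `[ 00:00:03.210 -->  00:00:07.840] A SPEAKER_01`, which the new `diarization` helper regex-parses into `[start_ms, end_ms, speaker]` triples. A small sketch of the `millisec` conversion it relies on (line format assumed, values made up):

    import re

    line = "[ 00:00:03.210 -->  00:00:07.840] A SPEAKER_01"  # hypothetical turn
    start, end = re.findall(r'[0-9]+:[0-9]+:[0-9]+\.[0-9]+', line)

    def millisec(time_str):
        # "00:00:03.210" -> 3210: fold hours and minutes into milliseconds
        h, m, s = time_str.split(":")
        return int((int(h) * 3600 + int(m) * 60 + float(s)) * 1000)

    print(millisec(start), millisec(end))  # 3210 7840

Downstream, the reworked `get_output` threads Whisper's `start`/`end` (as `str(timedelta(...))`, e.g. `0:00:14.500000`) into both the text rendering (`[0:00:14.500000] - Speaker 1`) and the JSON payload.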
@@ -209,7 +238,9 @@ def Transcribe_V2(num_speakers, speaker_names, audio="temp_audio.wav"):
     if duration > 4 * 60 * 60:
         return "Audio duration too long"
 
+    print(json.dumps(diarization(audio)))
     result = model.transcribe(audio)
+    print(json.dumps(result))
 
     segments = result["segments"]
 
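The new prints only expose the diarization alongside the Whisper result; this commit does not yet join the two. As a hedged sketch of one common way to combine them (not this repo's code): assign each Whisper segment the diarization speaker with the greatest time overlap, with `dz_list` being the `[start_ms, end_ms, speaker]` triples above:

    def assign_speakers(dz_list, segments):
        # Label each Whisper segment with the most-overlapping diarization turn.
        labeled = []
        for seg in segments:
            seg_start, seg_end = seg["start"] * 1000, seg["end"] * 1000
            best, best_overlap = "Unknown", 0.0
            for turn_start, turn_end, speaker in dz_list:
                overlap = min(seg_end, turn_end) - max(seg_start, turn_start)
                if overlap > best_overlap:
                    best, best_overlap = speaker, overlap
            labeled.append({**seg, "speaker": best})
        return labeled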
@@ -233,7 +264,7 @@ def AudioTranscribe(NumberOfSpeakers=None, SpeakerNames="", audio="", retries=5):
             return AudioTranscribe(NumberOfSpeakers, SpeakerNames, audio, retries-1)
         if not (os.path.isfile("temp_audio.wav")):
             return AudioTranscribe(NumberOfSpeakers, SpeakerNames, audio, retries-1)
-        return Transcribe_V1(NumberOfSpeakers, SpeakerNames)
+        return Transcribe_V2(NumberOfSpeakers, SpeakerNames)
     else:
         raise gr.Error("There is some issue with Audio Transcriber. Please try again later!")
 
@@ -249,10 +280,10 @@ def VideoTranscribe(NumberOfSpeakers=None, SpeakerNames="", video="", retries=5):
             return VideoTranscribe(NumberOfSpeakers, SpeakerNames, video, retries-1)
         if not (os.path.isfile("temp_audio.wav")):
             return VideoTranscribe(NumberOfSpeakers, SpeakerNames, video, retries-1)
-        return Transcribe_V1(NumberOfSpeakers, SpeakerNames)
+        return Transcribe_V2(NumberOfSpeakers, SpeakerNames)
     else:
         raise gr.Error("There is some issue with Video Transcriber. Please try again later!")
-    return Transcribe_V1(NumberOfSpeakers, SpeakerNames)
+    return Transcribe_V2(NumberOfSpeakers, SpeakerNames)
 
 def YoutubeTranscribe(NumberOfSpeakers=None, SpeakerNames="", URL="", retries = 5):
     if retries:
@@ -276,7 +307,7 @@ def YoutubeTranscribe(NumberOfSpeakers=None, SpeakerNames="", URL="", retries = 5):
         stream = ffmpeg.input('temp_audio.m4a')
         stream = ffmpeg.output(stream, 'temp_audio.wav')
         RemoveFile("temp_audio.m4a")
-        return Transcribe_V1(NumberOfSpeakers, SpeakerNames)
+        return Transcribe_V2(NumberOfSpeakers, SpeakerNames)
     else:
         raise gr.Error(f"Unable to get video from {URL}")
 
@@ -299,27 +330,28 @@ at = gr.Interface(
 # demo = gr.TabbedInterface([ut, vt, at], ["Youtube URL", "Video", "Audio"])
 # demo.launch()
 with gr.Blocks() as yav_ui:
-    with gr.Tab("Input"):
-        with gr.Tab("Youtube", id=1):
-            yinput_nos = gr.Number(label="Number of Speakers", placeholder="2")
-            yinput_sn = gr.Textbox(label="Name of the Speakers (ordered by the time they speak and separated by comma)", placeholder="If Speaker 1 is first to speak followed by Speaker 2 then -> Speaker 1, Speaker 2")
-            yinput = gr.Textbox(label="Youtube Link", placeholder="https://www.youtube.com/watch?v=GECcjrYHH8w")
-            ybutton_transcribe = gr.Button("Transcribe")
-        with gr.Tab("Video", id=2):
-            vinput_nos = gr.Number(label="Number of Speakers", placeholder="2")
-            vinput_sn = gr.Textbox(label="Name of the Speakers (ordered by the time they speak and separated by comma)", placeholder="If Speaker 1 is first to speak followed by Speaker 2 then -> Speaker 1, Speaker 2")
-            vinput = gr.Video(label="Video")
-            vbutton_transcribe = gr.Button("Transcribe")
-        with gr.Tab("Audio", id=3):
-            ainput_nos = gr.Number(label="Number of Speakers", placeholder="2")
-            ainput_sn = gr.Textbox(label="Name of the Speakers (ordered by the time they speak and separated by comma)", placeholder="If Speaker 1 is first to speak followed by Speaker 2 then -> Speaker 1, Speaker 2")
-            ainput = gr.Audio(label="Audio")
-            abutton_transcribe = gr.Button("Transcribe")
-    with gr.Tab("Output"):
-        with gr.Tab("Text"):
-            output_textbox = gr.Textbox(label="Transcribed Text", lines=15)
-        with gr.Tab("JSON"):
-            output_json = gr.JSON(label="Transcribed JSON")
+    with gr.Row():
+        with gr.Column():
+            with gr.Tab("Youtube", id=1):
+                yinput_nos = gr.Number(label="Number of Speakers", placeholder="2")
+                yinput_sn = gr.Textbox(label="Name of the Speakers (ordered by the time they speak and separated by comma)", placeholder="If Speaker 1 is first to speak followed by Speaker 2 then -> Speaker 1, Speaker 2")
+                yinput = gr.Textbox(label="Youtube Link", placeholder="https://www.youtube.com/watch?v=GECcjrYHH8w")
+                ybutton_transcribe = gr.Button("Transcribe", show_progress=True, scroll_to_output=True)
+            with gr.Tab("Video", id=2):
+                vinput_nos = gr.Number(label="Number of Speakers", placeholder="2")
+                vinput_sn = gr.Textbox(label="Name of the Speakers (ordered by the time they speak and separated by comma)", placeholder="If Speaker 1 is first to speak followed by Speaker 2 then -> Speaker 1, Speaker 2")
+                vinput = gr.Video(label="Video")
+                vbutton_transcribe = gr.Button("Transcribe", show_progress=True, scroll_to_output=True)
+            with gr.Tab("Audio", id=3):
+                ainput_nos = gr.Number(label="Number of Speakers", placeholder="2")
+                ainput_sn = gr.Textbox(label="Name of the Speakers (ordered by the time they speak and separated by comma)", placeholder="If Speaker 1 is first to speak followed by Speaker 2 then -> Speaker 1, Speaker 2")
+                ainput = gr.Audio(label="Audio", type="filepath")
+                abutton_transcribe = gr.Button("Transcribe", show_progress=True, scroll_to_output=True)
+        with gr.Column():
+            with gr.Tab("Text"):
+                output_textbox = gr.Textbox(label="Transcribed Text", lines=15)
+            with gr.Tab("JSON"):
+                output_json = gr.JSON(label="Transcribed JSON")
     ybutton_transcribe.click(
         fn=YoutubeTranscribe,
         inputs=[yinput_nos,yinput_sn,yinput],
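The layout change replaces the old Input/Output tab pair with a side-by-side `gr.Row` of two `gr.Column`s. A self-contained sketch of the pattern (the hypothetical `fake_transcribe` stands in for the real handlers, and the `outputs=` wiring is an assumption about how the truncated `.click(...)` calls continue):

    import gradio as gr

    def fake_transcribe(num_speakers, speaker_names, link):
        # Stand-in returning (text, json) in the same shape as get_output.
        text = "[0:00:00] - Speaker 1 \nHello there.\n"
        data = {"data": [{"start": "0:00:00", "end": "0:00:02",
                          "speaker": "Speaker 1", "text": "Hello there."}]}
        return text, data

    with gr.Blocks() as demo:
        with gr.Row():
            with gr.Column():  # inputs on the left
                nos = gr.Number(label="Number of Speakers")
                names = gr.Textbox(label="Name of the Speakers")
                link = gr.Textbox(label="Youtube Link")
                go = gr.Button("Transcribe")
            with gr.Column():  # outputs on the right
                out_text = gr.Textbox(label="Transcribed Text", lines=15)
                out_json = gr.JSON(label="Transcribed JSON")
        go.click(fn=fake_transcribe, inputs=[nos, names, link],
                 outputs=[out_text, out_json])

    demo.launch()

One caveat worth flagging: in Gradio's API, `scroll_to_output` and `show_progress` are parameters of event listeners such as `.click()`, not of the `gr.Button` constructor, so the Button calls in this hunk most likely do not have the intended effect.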
 