salmanmapkar committed on
Commit
4832cf7
1 Parent(s): 0278cb8

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +83 -23
app.py CHANGED
@@ -13,6 +13,22 @@ import json
13
  pipeline = Pipeline.from_pretrained("pyannote/speaker-diarization", use_auth_token="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")
14
  from pydub.effects import speedup
15
  import moviepy.editor as mp
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
16
 
17
 
18
  __FILES = set()
@@ -131,6 +147,67 @@ def Transcribe(NumberOfSpeakers, SpeakerNames="", audio="temp_audio.wav"):
131
  RemoveAllFiles()
132
  return (t_text, ({ "data": [{"speaker": speaker, "text": text} for speaker, text in conversation]}))
133
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
134
  def AudioTranscribe(NumberOfSpeakers=None, SpeakerNames="", audio="", retries=5):
135
  if retries:
136
  # subprocess.call(['ffmpeg', '-i', audio,'temp_audio.wav'])
@@ -141,7 +218,7 @@ def AudioTranscribe(NumberOfSpeakers=None, SpeakerNames="", audio="", retries=5)
141
  return AudioTranscribe(NumberOfSpeakers, SpeakerNames, audio, retries-1)
142
  if not (os.path.isfile("temp_audio.wav")):
143
  return AudioTranscribe(NumberOfSpeakers, SpeakerNames, audio, retries-1)
144
- return Transcribe(NumberOfSpeakers, SpeakerNames)
145
  else:
146
  raise gr.Error("There is some issue ith Audio Transcriber. Please try again later!")
147
 
@@ -157,10 +234,10 @@ def VideoTranscribe(NumberOfSpeakers=None, SpeakerNames="", video="", retries=5)
157
  return VideoTranscribe(NumberOfSpeakers, SpeakerNames, video, retries-1)
158
  if not (os.path.isfile("temp_audio.wav")):
159
  return VideoTranscribe(NumberOfSpeakers, SpeakerNames, video, retries-1)
160
- return Transcribe(NumberOfSpeakers, SpeakerNames)
161
  else:
162
  raise gr.Error("There is some issue ith Video Transcriber. Please try again later!")
163
- return Transcribe(NumberOfSpeakers, SpeakerNames)
164
 
165
  def YoutubeTranscribe(NumberOfSpeakers=None, SpeakerNames="", URL="", retries = 5):
166
  if retries:
@@ -184,27 +261,10 @@ def YoutubeTranscribe(NumberOfSpeakers=None, SpeakerNames="", URL="", retries =
184
  stream = ffmpeg.input('temp_audio.m4a')
185
  stream = ffmpeg.output(stream, 'temp_audio.wav')
186
  RemoveFile("temp_audio.m4a")
187
- return Transcribe(NumberOfSpeakers, SpeakerNames)
188
  else:
189
  raise gr.Error(f"Unable to get video from {URL}")
190
-
191
- with gr.Blocks() as _block_ut:
192
- ftxt, fjsonl, fcsv = True, False, False
193
- def output_selection(_ftxt, _fjsonl, _fcsv):
194
- global ftxt, fjsonl, fcsv
195
- ftxt = _ftxt
196
- fjsonl = _fjsonl
197
- fcsv = _fcsv
198
- with gr.Row():
199
- nos = gr.Number(label="Number of Speakers", value="0")
200
- sn = gr.Textbox(label="Name of the Speakers (ordered by the time they speak and separated by comma)", placeholder="If Speaker 1 is first to speak followed by Speaker 2 then -> Speaker 1, Speaker 2")
201
- url = gr.Textbox(label="Youtube Link", placeholder="https://www.youtube.com/watch?v=GECcjrYHH8w")
202
- ocb = gr.CheckboxGroup(["Text", "JSONL", "CSV"])
203
- bt = gr.Button(fn=output_selection)
204
- with gr.Column():
205
- output_txt = gr.Textbox(label="Transcribed Text", lines=15, visible = ftxt)
206
- output_jsonl = gr.JSON(label="Transcribed Text", visible = fjsonl)
207
-
208
  ut = gr.Interface(
209
  fn=YoutubeTranscribe,
210
  inputs=[gr.Number(label="Number of Speakers", placeholder="2"), gr.Textbox(label="Name of the Speakers (ordered by the time they speak and separated by comma)", placeholder="If Speaker 1 is first to speak followed by Speaker 2 then -> Speaker 1, Speaker 2"), gr.Textbox(label="Youtube Link", placeholder="https://www.youtube.com/watch?v=GECcjrYHH8w"),],
@@ -221,5 +281,5 @@ at = gr.Interface(
221
  outputs=[gr.Textbox(label="Transcribed Text", lines=15), gr.JSON(label="Transcribed JSON")]
222
  )
223
 
224
- demo = gr.TabbedInterface([_block_ut, vt, at], ["Youtube URL", "Video", "Audio"])
225
  demo.launch()
 
13
  pipeline = Pipeline.from_pretrained("pyannote/speaker-diarization", use_auth_token="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")
14
  from pydub.effects import speedup
15
  import moviepy.editor as mp
16
+ import datetime
17
+ import torch
18
+ import pyannote.audio
19
+ from pyannote.audio.pipelines.speaker_verification import PretrainedSpeakerEmbedding
20
+ from pyannote.audio import Audio
21
+ from pyannote.core import Segment
22
+ import wave
23
+ import contextlib
24
+ from sklearn.cluster import AgglomerativeClustering
25
+ import numpy as np
26
+
27
+ model = whisper.load_model("medium")
28
+ embedding_model = PretrainedSpeakerEmbedding(
29
+ "speechbrain/spkrec-ecapa-voxceleb",
30
+ device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
31
+ )
32
 
33
 
34
  __FILES = set()
 
147
  RemoveAllFiles()
148
  return (t_text, ({ "data": [{"speaker": speaker, "text": text} for speaker, text in conversation]}))
149
 
150
+
151
def Transcribe_V2(num_speakers, speaker_names, audio="temp_audio.wav"):
    """Transcribe a WAV file and attribute each turn to a speaker.

    The file is transcribed with the module-level ``model``; every resulting
    segment is embedded with the module-level ``embedding_model`` and the
    embeddings are clustered into ``num_speakers`` groups, which become the
    speaker labels.

    Args:
        num_speakers: Requested number of speakers. ``None`` is treated as 1;
            the value is rounded and clamped to ``[1, number of segments]``.
        speaker_names: Passed unchanged to ``GenerateSpeakerDict``.
        audio: Path of the WAV file to transcribe.

    Returns:
        A ``(text, payload)`` tuple in the same order ``Transcribe`` uses:
        ``text`` holds one ``"<speaker> --> <text>"`` line per turn and
        ``payload`` is ``{"data": [{"speaker": ..., "text": ...}, ...]}``.
        When the audio is longer than four hours, ``text`` is
        ``"Audio duration too long"`` and ``payload["data"]`` is empty.
    """
    # Keep the cropping helper under its own name. Rebinding ``audio`` here
    # would discard the file path before it is opened and transcribed.
    audio_loader = Audio()
    GenerateSpeakerDict(speaker_names)

    max_duration_secs = 4 * 60 * 60
    # Row width of the embedding matrix; assumed to match what
    # ``embedding_model`` returns - TODO confirm against the loaded model.
    embedding_size = 192

    def build_result(conversation):
        """Render ``[speaker, text]`` pairs as the (text, payload) tuple."""
        text = "".join(f"{speaker} --> {line}\n" for speaker, line in conversation)
        payload = {
            "data": [
                {"speaker": speaker, "text": line} for speaker, line in conversation
            ]
        }
        return text, payload

    def get_output(segments):
        """Merge consecutive segments of one speaker into a single turn."""
        conversation = []
        previous_label = None
        for index, segment in enumerate(segments):
            piece = segment["text"]
            if index == 0 or segment["speaker"] != previous_label:
                # A new turn: drop only leading whitespace, never a real
                # character (slicing with [1:] would eat one when the text
                # does not start with a space).
                conversation.append([GetSpeaker(segment["speaker"]), piece.lstrip()])
            else:
                # Same speaker keeps talking: keep the segment's own leading
                # whitespace so words are not glued together.
                conversation[-1][1] += piece
            previous_label = segment["speaker"]
        return build_result(conversation)

    def get_duration(path):
        """Return the length of the WAV file at ``path`` in seconds."""
        with wave.open(path, "rb") as wav_file:
            return wav_file.getnframes() / float(wav_file.getframerate())

    def segment_embedding(path, segment, duration):
        """Embed the audio span covered by one segment."""
        # Whisper overshoots the end timestamp in the last segment
        clip = Segment(segment["start"], min(duration, segment["end"]))
        waveform, _sample_rate = audio_loader.crop(path, clip)
        return embedding_model(waveform[None])

    def make_embeddings(path, segments, duration):
        """Stack one embedding per segment, replacing NaNs with zeros."""
        embeddings = np.zeros(shape=(len(segments), embedding_size))
        for index, segment in enumerate(segments):
            embeddings[index] = segment_embedding(path, segment, duration)
        return np.nan_to_num(embeddings)

    def add_speaker_labels(segments, embeddings, cluster_count):
        """Cluster the embeddings and write a 1-based label to each segment."""
        labels = AgglomerativeClustering(cluster_count).fit(embeddings).labels_
        for segment, label in zip(segments, labels):
            segment["speaker"] = 'SPEAKER ' + str(label + 1)

    duration = get_duration(audio)
    if duration > max_duration_secs:
        # Keep the two-value shape so callers can forward the result as-is.
        return "Audio duration too long", {"data": []}

    segments = model.transcribe(audio)["segments"]
    if not segments:
        # Nothing was recognised; there is nothing to embed or cluster.
        return build_result([])

    requested = 1 if num_speakers is None else round(num_speakers)
    num_speakers = min(max(requested, 1), len(segments))

    if len(segments) == 1:
        segments[0]['speaker'] = 'SPEAKER 1'
    else:
        embeddings = make_embeddings(audio, segments, duration)
        add_speaker_labels(segments, embeddings, num_speakers)
    return get_output(segments)
210
+
211
  def AudioTranscribe(NumberOfSpeakers=None, SpeakerNames="", audio="", retries=5):
212
  if retries:
213
  # subprocess.call(['ffmpeg', '-i', audio,'temp_audio.wav'])
 
218
  return AudioTranscribe(NumberOfSpeakers, SpeakerNames, audio, retries-1)
219
  if not (os.path.isfile("temp_audio.wav")):
220
  return AudioTranscribe(NumberOfSpeakers, SpeakerNames, audio, retries-1)
221
+ return Transcribe_V2(NumberOfSpeakers, SpeakerNames)
222
  else:
223
  raise gr.Error("There is some issue ith Audio Transcriber. Please try again later!")
224
 
 
234
  return VideoTranscribe(NumberOfSpeakers, SpeakerNames, video, retries-1)
235
  if not (os.path.isfile("temp_audio.wav")):
236
  return VideoTranscribe(NumberOfSpeakers, SpeakerNames, video, retries-1)
237
+ return Transcribe_V2(NumberOfSpeakers, SpeakerNames)
238
  else:
239
  raise gr.Error("There is some issue ith Video Transcriber. Please try again later!")
240
+ return Transcribe_V2(NumberOfSpeakers, SpeakerNames)
241
 
242
  def YoutubeTranscribe(NumberOfSpeakers=None, SpeakerNames="", URL="", retries = 5):
243
  if retries:
 
261
  stream = ffmpeg.input('temp_audio.m4a')
262
  stream = ffmpeg.output(stream, 'temp_audio.wav')
263
  RemoveFile("temp_audio.m4a")
264
+ return Transcribe_V2(NumberOfSpeakers, SpeakerNames)
265
  else:
266
  raise gr.Error(f"Unable to get video from {URL}")
267
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
268
  ut = gr.Interface(
269
  fn=YoutubeTranscribe,
270
  inputs=[gr.Number(label="Number of Speakers", placeholder="2"), gr.Textbox(label="Name of the Speakers (ordered by the time they speak and separated by comma)", placeholder="If Speaker 1 is first to speak followed by Speaker 2 then -> Speaker 1, Speaker 2"), gr.Textbox(label="Youtube Link", placeholder="https://www.youtube.com/watch?v=GECcjrYHH8w"),],
 
281
  outputs=[gr.Textbox(label="Transcribed Text", lines=15), gr.JSON(label="Transcribed JSON")]
282
  )
283
 
284
+ demo = gr.TabbedInterface([ut, vt, at], ["Youtube URL", "Video", "Audio"])
285
  demo.launch()