Files changed (2)
  1. app.py +35 -67
  2. requirements.txt +7 -9
app.py CHANGED
@@ -1,5 +1,4 @@
-# import whisper
-from faster_whisper import WhisperModel
+import whisper
 import datetime
 import subprocess
 import gradio as gr
@@ -13,7 +12,6 @@ from sklearn.cluster import AgglomerativeClustering
 from sklearn.metrics import silhouette_score
 
 from pytube import YouTube
-import yt_dlp
 import torch
 import pyannote.audio
 from pyannote.audio.pipelines.speaker_verification import PretrainedSpeakerEmbedding
@@ -27,7 +25,7 @@ import contextlib
 from transformers import pipeline
 import psutil
 
-whisper_models = ["tiny", "base", "small", "medium", "large-v1", "large-v2"]
+whisper_models = ["base", "small", "medium", "large"]
 source_languages = {
     "en": "English",
     "zh": "Chinese",
@@ -175,48 +173,26 @@ def _return_yt_html_embed(yt_url):
     return HTML_str
 
 def yt_transcribe(yt_url):
-    # yt = YouTube(yt_url)
-    # html_embed_str = _return_yt_html_embed(yt_url)
-    # stream = yt.streams.filter(only_audio=True)[0]
-    # stream.download(filename="audio.mp3")
-
-    ydl_opts = {
-        'format': 'bestvideo*+bestaudio/best',
-        'postprocessors': [{
-            'key': 'FFmpegExtractAudio',
-            'preferredcodec': 'mp3',
-            'preferredquality': '192',
-        }],
-        'outtmpl':'audio.%(ext)s',
-    }
-
-    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
-        ydl.download([yt_url])
-
+    yt = YouTube(yt_url)
+    html_embed_str = _return_yt_html_embed(yt_url)
+    stream = yt.streams.filter(only_audio=True)[0]
+    stream.download(filename="audio.mp3")
+
     text = pipe("audio.mp3")["text"]
+
     return html_embed_str, text
 
 def convert_time(secs):
     return datetime.timedelta(seconds=round(secs))
 
 def get_youtube(video_url):
-    # yt = YouTube(video_url)
-    # abs_video_path = yt.streams.filter(progressive=True, file_extension='mp4').order_by('resolution').desc().first().download()
-
-    ydl_opts = {
-        'format': 'bestvideo[ext=mp4]+bestaudio[ext=m4a]/best[ext=mp4]/best',
-    }
-
-    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
-        info = ydl.extract_info(video_url, download=False)
-        abs_video_path = ydl.prepare_filename(info)
-        ydl.process_info(info)
-
+    yt = YouTube(video_url)
+    abs_video_path = yt.streams.filter(progressive=True, file_extension='mp4').order_by('resolution').desc().first().download()
     print("Success download video")
     print(abs_video_path)
     return abs_video_path
 
-def speech_to_text(video_file_path, selected_source_lang, whisper_model, num_speakers):
+def speech_to_text(video_file_path, selected_source_lang, whisper_model, min_num_speakers, max_number_speakers):
     """
     # Transcribe youtube link using OpenAI Whisper
     1. Using Open AI's Whisper model to seperate audio into segments and generate transcripts.
@@ -227,9 +203,7 @@ def speech_to_text(video_file_path, selected_source_lang, whisper_model, num_speakers):
     Speaker diarization model and pipeline from by https://github.com/pyannote/pyannote-audio
     """
 
-    # model = whisper.load_model(whisper_model)
-    # model = WhisperModel(whisper_model, device="cuda", compute_type="int8_float16")
-    model = WhisperModel(whisper_model, compute_type="int8")
+    model = whisper.load_model(whisper_model)
     time_start = time.time()
     if(video_file_path == None):
         raise ValueError("Error no video input")
@@ -253,19 +227,9 @@ def speech_to_text(video_file_path, selected_source_lang, whisper_model, num_speakers):
         # Transcribe audio
         options = dict(language=selected_source_lang, beam_size=5, best_of=5)
         transcribe_options = dict(task="transcribe", **options)
-        segments_raw, info = model.transcribe(audio_file, **transcribe_options)
-
-        # Convert back to original openai format
-        segments = []
-        i = 0
-        for segment_chunk in segments_raw:
-            chunk = {}
-            chunk["start"] = segment_chunk.start
-            chunk["end"] = segment_chunk.end
-            chunk["text"] = segment_chunk.text
-            segments.append(chunk)
-            i += 1
-        print("transcribe audio done with fast whisper")
+        result = model.transcribe(audio_file, **transcribe_options)
+        segments = result["segments"]
+        print("starting whisper done with whisper")
    except Exception as e:
        raise RuntimeError("Error converting video to audio")
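
Note on the transcription hunk above: openai-whisper's `transcribe()` returns a dict directly, so the loop that converted faster-whisper's segment generator back into that format is dropped. A minimal sketch of the shape the downstream code now consumes (assuming the openai-whisper package and a local `audio.wav`; the file name and checkpoint are illustrative):

import whisper

# Load one of the checkpoints listed in whisper_models, e.g. "base".
model = whisper.load_model("base")

# transcribe() returns a dict; result["segments"] is a list of dicts whose
# "start"/"end" timestamps (seconds) and "text" fields are what the
# diarization step reads later.
result = model.transcribe("audio.wav", task="transcribe", language="en",
                          beam_size=5, best_of=5)
for seg in result["segments"]:
    print(f'{seg["start"]:7.2f} - {seg["end"]:7.2f}  {seg["text"]}')
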
@@ -286,19 +250,22 @@ def speech_to_text(video_file_path, selected_source_lang, whisper_model, num_speakers):
     embeddings = np.nan_to_num(embeddings)
     print(f'Embedding shape: {embeddings.shape}')
 
-    if num_speakers == 0:
     # Find the best number of speakers
-        score_num_speakers = {}
-
-        for num_speakers in range(2, 10+1):
-            clustering = AgglomerativeClustering(num_speakers).fit(embeddings)
-            score = silhouette_score(embeddings, clustering.labels_, metric='euclidean')
-            score_num_speakers[num_speakers] = score
-        best_num_speaker = max(score_num_speakers, key=lambda x:score_num_speakers[x])
-        print(f"The best number of speakers: {best_num_speaker} with {score_num_speakers[best_num_speaker]} score")
+    if min_num_speakers > max_number_speakers:
+        min_speakers = max_number_speakers
+        max_speakers = min_num_speakers
     else:
-        best_num_speaker = num_speakers
-
+        min_speakers = min_num_speakers
+        max_speakers = max_number_speakers
+    score_num_speakers = {}
+
+    for num_speakers in range(min_speakers, max_speakers+1):
+        clustering = AgglomerativeClustering(num_speakers).fit(embeddings)
+        score = silhouette_score(embeddings, clustering.labels_, metric='euclidean')
+        score_num_speakers[num_speakers] = score
+    best_num_speaker = max(score_num_speakers, key=lambda x:score_num_speakers[x])
+    print(f"The best number of speakers: {best_num_speaker} with {score_num_speakers[best_num_speaker]} score")
+
     # Assign speaker label
     clustering = AgglomerativeClustering(best_num_speaker).fit(embeddings)
     labels = clustering.labels_
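
The hunk above replaces the fixed 2-10 speaker search with a user-supplied minimum and maximum. A standalone sketch of the same silhouette-based selection on synthetic embeddings (the helper name and toy data are illustrative, not part of app.py):

import numpy as np
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import silhouette_score

# Synthetic stand-in for the per-segment speaker embeddings (3 well-separated speakers).
rng = np.random.default_rng(0)
embeddings = np.vstack([rng.normal(loc=c, scale=0.1, size=(20, 192)) for c in (0.0, 1.0, 2.0)])

def pick_num_speakers(embeddings, min_speakers=2, max_speakers=6):
    # Swap the bounds if they arrive in the wrong order, as the new app code does.
    if min_speakers > max_speakers:
        min_speakers, max_speakers = max_speakers, min_speakers
    scores = {}
    for n in range(min_speakers, max_speakers + 1):
        labels = AgglomerativeClustering(n).fit(embeddings).labels_
        # silhouette_score needs at least 2 clusters and fewer clusters than samples.
        scores[n] = silhouette_score(embeddings, labels, metric='euclidean')
    return max(scores, key=scores.get)

print(pick_num_speakers(embeddings))  # expected to pick 3 on this toy data
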
@@ -353,7 +320,8 @@ df_init = pd.DataFrame(columns=['Start', 'End', 'Speaker', 'Text'])
 memory = psutil.virtual_memory()
 selected_source_lang = gr.Dropdown(choices=source_language_list, type="value", value="en", label="Spoken language in video", interactive=True)
 selected_whisper_model = gr.Dropdown(choices=whisper_models, type="value", value="base", label="Selected Whisper model", interactive=True)
-number_speakers = gr.Number(precision=0, value=0, label="Input number of speakers for better results. If value=0, model will automatic find the best number of speakers", interactive=True)
+input_min_number_speakers = gr.Number(precision=0, value=2, label="Select minimum number of speakers", interactive=True)
+input_max_number_speakers = gr.Number(precision=0, value=2, label="Select maximum number of speakers", interactive=True)
 system_info = gr.Markdown(f"*Memory: {memory.total / (1024 * 1024 * 1024):.2f}GB, used: {memory.percent}%, available: {memory.available / (1024 * 1024 * 1024):.2f}GB*")
 download_transcript = gr.File(label="Download transcript")
 transcription_df = gr.DataFrame(value=df_init,label="Transcription dataframe", row_count=(0, "dynamic"), max_rows = 10, wrap=True, overflow_row_behaviour='paginate')
@@ -367,8 +335,7 @@ with demo:
     gr.Markdown('''
         <div>
         <h1 style='text-align: center'>Whisper speaker diarization</h1>
-        This space uses Whisper models from <a href='https://github.com/openai/whisper' target='_blank'><b>OpenAI</b></a> with <a href='https://github.com/guillaumekln/faster-whisper' target='_blank'><b>CTranslate2</b></a> which is a fast inference engine for Transformer models to recognize the speech (4 times faster than original openai model with same accuracy)
-        and ECAPA-TDNN model from <a href='https://github.com/speechbrain/speechbrain' target='_blank'><b>SpeechBrain</b></a> to encode and clasify speakers
+        This space uses Whisper models from <a href='https://github.com/openai/whisper' target='_blank'><b>OpenAI</b></a> to recoginze the speech and ECAPA-TDNN model from <a href='https://github.com/speechbrain/speechbrain' target='_blank'><b>SpeechBrain</b></a> to encode and clasify speakers</h2>
         </div>
     ''')
 
@@ -411,10 +378,11 @@ with demo:
             ''')
             selected_source_lang.render()
            selected_whisper_model.render()
-            number_speakers.render()
+            input_min_number_speakers.render()
+            input_max_number_speakers.render()
             transcribe_btn = gr.Button("Transcribe audio and diarization")
             transcribe_btn.click(speech_to_text,
-                [video_in, selected_source_lang, selected_whisper_model, number_speakers],
+                [video_in, selected_source_lang, selected_whisper_model, input_min_number_speakers, input_max_number_speakers],
                 [transcription_df, system_info, download_transcript]
             )
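
Taken together, the app.py changes swap the single `num_speakers` input for a min/max pair that is passed straight into `speech_to_text`. A hypothetical direct call of the updated entry point (assuming the Space's app.py is importable as `app` and a local video file exists; per the click wiring above, the three return values feed the dataframe, system-info and download components):

from app import speech_to_text  # hypothetical: import this Space's module

# video path, language code, Whisper checkpoint, then the new speaker bounds
transcription_df, system_info, transcript_file = speech_to_text(
    "meeting.mp4",
    "en",
    "base",
    min_num_speakers=2,
    max_number_speakers=6,
)
print(transcript_file)
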
 
requirements.txt CHANGED
@@ -1,22 +1,20 @@
 git+https://github.com/huggingface/transformers
 git+https://github.com/pyannote/pyannote-audio
 git+https://github.com/openai/whisper.git
-gradio
+gradio==3.12
 ffmpeg-python
-pandas
-pytube
+pandas==1.5.0
+pytube==12.1.0
 sacremoses
 sentencepiece
 tokenizers
 torch
 torchaudio
-tqdm
-EasyNMT
+tqdm==4.64.1
+EasyNMT==2.0.2
 nltk
 transformers
 pysrt
-psutil
+psutil==5.9.2
 requests
-gpuinfo
-faster-whisper
-yt-dlp
+gpuinfo