vumichien committed
Commit 1ce609e
1 Parent(s): 494edc1

Update app.py

Files changed (1)
  1. app.py +21 -8
app.py CHANGED
@@ -1,4 +1,5 @@
-import whisper
+# import whisper
+from faster_whisper import WhisperModel
 import datetime
 import subprocess
 import gradio as gr
@@ -25,7 +26,7 @@ import contextlib
 from transformers import pipeline
 import psutil
 
-whisper_models = ["base", "small", "medium", "large"]
+whisper_models = ["tiny", "base", "small", "medium", "large-v1", "large-v2"]
 source_languages = {
     "en": "English",
     "zh": "Chinese",
@@ -203,7 +204,8 @@ def speech_to_text(video_file_path, selected_source_lang, whisper_model, num_spe
     Speaker diarization model and pipeline from https://github.com/pyannote/pyannote-audio
     """
 
-    model = whisper.load_model(whisper_model)
+    # model = whisper.load_model(whisper_model)
+    model = WhisperModel(whisper_model, device="cuda", compute_type="int8_float16")
     time_start = time.time()
     if(video_file_path == None):
         raise ValueError("Error no video input")
@@ -227,9 +229,19 @@ def speech_to_text(video_file_path, selected_source_lang, whisper_model, num_spe
         # Transcribe audio
         options = dict(language=selected_source_lang, beam_size=5, best_of=5)
         transcribe_options = dict(task="transcribe", **options)
-        result = model.transcribe(audio_file, **transcribe_options)
-        segments = result["segments"]
-        print("starting whisper done with whisper")
+        segments_raw, info = model.transcribe(audio_file, **transcribe_options)
+
+        # Convert the faster-whisper generator output back to the openai-whisper segment format
+        segments = []
+        i = 0
+        for segment_chunk in segments_raw:
+            chunk = {}
+            chunk["start"] = segment_chunk.start
+            chunk["end"] = segment_chunk.end
+            chunk["text"] = segment_chunk.text
+            segments.append(chunk)
+            i += 1
+        print("transcribe audio done with faster-whisper")
     except Exception as e:
         raise RuntimeError("Error converting video to audio")
 
@@ -330,8 +342,9 @@ with demo:
     with gr.Tab("Whisper speaker diarization"):
        gr.Markdown('''
            <div>
-               <h1 style='text-align: center'>Whisper speaker diarization</h1>
-               This space uses Whisper models from <a href='https://github.com/openai/whisper' target='_blank'><b>OpenAI</b></a> to recoginze the speech and ECAPA-TDNN model from <a href='https://github.com/speechbrain/speechbrain' target='_blank'><b>SpeechBrain</b></a> to encode and clasify speakers</h2>
+               <h1 style='text-align: center'>Whisper speaker diarization</h1>
+               <h2>This space uses Whisper models from <a href='https://github.com/openai/whisper' target='_blank'><b>OpenAI</b></a> with <a href='https://github.com/guillaumekln/faster-whisper' target='_blank'><b>CTranslate2</b></a>, a fast inference engine for Transformer models, to recognize speech (about 4 times faster than the original OpenAI implementation at the same accuracy),
+               and the ECAPA-TDNN model from <a href='https://github.com/speechbrain/speechbrain' target='_blank'><b>SpeechBrain</b></a> to encode and classify speakers</h2>
            </div>
            ''')
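
For context, below is a minimal, self-contained sketch of the faster-whisper call pattern this commit adopts. It is an illustration, not part of the commit: `audio.wav` is a placeholder input, and `device="cpu"` with `compute_type="int8"` are stand-ins so the sketch runs without CUDA (the Space itself uses `device="cuda"` with `compute_type="int8_float16"`).

```python
# Minimal sketch of the faster-whisper usage adopted in this commit.
# "audio.wav", the model size, and the language are placeholder values.
from faster_whisper import WhisperModel

model = WhisperModel("base", device="cpu", compute_type="int8")

# transcribe() returns a lazy generator of Segment objects plus a
# TranscriptionInfo (detected language, audio duration, ...).
segments_raw, info = model.transcribe(
    "audio.wav", task="transcribe", language="en", beam_size=5, best_of=5
)

# Materialize into the dict shape openai-whisper puts in result["segments"],
# which the rest of app.py still expects.
segments = [{"start": s.start, "end": s.end, "text": s.text} for s in segments_raw]

print(f"language={info.language}, duration={info.duration:.1f}s, segments={len(segments)}")
```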
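A note on the new call pattern: faster-whisper's `transcribe()` returns a lazy generator, so the transcription work actually happens while the conversion loop consumes it; materializing the plain-dict `segments` list up front keeps the downstream diarization code unchanged. The `compute_type="int8_float16"` setting asks CTranslate2 to store weights as 8-bit integers while computing in float16, which reduces GPU memory use at, per the project's benchmarks, comparable accuracy.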