Spaces:
Runtime error
Runtime error
Update app.py
Browse files
app.py
CHANGED
@@ -1,4 +1,5 @@
|
|
1 |
-
import whisper
|
|
|
2 |
import datetime
|
3 |
import subprocess
|
4 |
import gradio as gr
|
@@ -25,7 +26,7 @@ import contextlib
|
|
25 |
from transformers import pipeline
|
26 |
import psutil
|
27 |
|
28 |
-
whisper_models = ["base", "small", "medium", "large"]
|
29 |
source_languages = {
|
30 |
"en": "English",
|
31 |
"zh": "Chinese",
|
@@ -203,7 +204,8 @@ def speech_to_text(video_file_path, selected_source_lang, whisper_model, num_spe
|
|
203 |
Speaker diarization model and pipeline from https://github.com/pyannote/pyannote-audio
|
204 |
"""
|
205 |
|
206 |
-
model = whisper.load_model(whisper_model)
|
|
|
207 |
time_start = time.time()
|
208 |
if(video_file_path == None):
|
209 |
raise ValueError("Error no video input")
|
@@ -227,9 +229,19 @@ def speech_to_text(video_file_path, selected_source_lang, whisper_model, num_spe
|
|
227 |
# Transcribe audio
|
228 |
options = dict(language=selected_source_lang, beam_size=5, best_of=5)
|
229 |
transcribe_options = dict(task="transcribe", **options)
|
230 |
-
|
231 |
-
|
232 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
233 |
except Exception as e:
|
234 |
raise RuntimeError("Error converting video to audio")
|
235 |
|
@@ -330,8 +342,9 @@ with demo:
|
|
330 |
with gr.Tab("Whisper speaker diarization"):
|
331 |
gr.Markdown('''
|
332 |
<div>
|
333 |
-
<h1 style='text-align: center'>Whisper speaker diarization</h1>
|
334 |
-
This space uses Whisper models from <a href='https://github.com/openai/whisper' target='_blank'><b>OpenAI</b></a>
|
|
|
335 |
</div>
|
336 |
''')
|
337 |
|
|
|
1 |
+
# import whisper
|
2 |
+
from faster_whisper import WhisperModel
|
3 |
import datetime
|
4 |
import subprocess
|
5 |
import gradio as gr
|
|
|
26 |
from transformers import pipeline
|
27 |
import psutil
|
28 |
|
29 |
+
whisper_models = ["tiny", "base", "small", "medium", "large-v1", "large-v2"]
|
30 |
source_languages = {
|
31 |
"en": "English",
|
32 |
"zh": "Chinese",
|
|
|
204 |
Speaker diarization model and pipeline from https://github.com/pyannote/pyannote-audio
|
205 |
"""
|
206 |
|
207 |
+
# model = whisper.load_model(whisper_model)
|
208 |
+
model = WhisperModel(whisper_model, device="cuda", compute_type="int8_float16")
|
209 |
time_start = time.time()
|
210 |
if(video_file_path == None):
|
211 |
raise ValueError("Error no video input")
|
|
|
229 |
# Transcribe audio
|
230 |
options = dict(language=selected_source_lang, beam_size=5, best_of=5)
|
231 |
transcribe_options = dict(task="transcribe", **options)
|
232 |
+
segments_raw, info = model.transcribe(audio_file, **transcribe_options)
|
233 |
+
|
234 |
+
# Convert back to original openai format
|
235 |
+
segments = []
|
236 |
+
i = 0
|
237 |
+
for segment_chunk in segments_raw:
|
238 |
+
chunk = {}
|
239 |
+
chunk["start"] = segment_chunk.start
|
240 |
+
chunk["end"] = segment_chunk.end
|
241 |
+
chunk["text"] = segment_chunk.text
|
242 |
+
segments.append(chunk)
|
243 |
+
i += 1
|
244 |
+
print("transcribe audio done with fast whisper")
|
245 |
except Exception as e:
|
246 |
raise RuntimeError("Error converting video to audio")
|
247 |
|
|
|
342 |
with gr.Tab("Whisper speaker diarization"):
|
343 |
gr.Markdown('''
|
344 |
<div>
|
345 |
+
<h1 style='text-align: center'>Whisper speaker diarization</h1>
|
346 |
+
<h2> This space uses Whisper models from <a href='https://github.com/openai/whisper' target='_blank'><b>OpenAI</b></a> with <a href='https://github.com/guillaumekln/faster-whisper' target='_blank'><b>CTranslate2</b></a> which is a fast inference engine for Transformer models to recognize the speech (4 times faster than original openai model with same accuracy)
|
347 |
+
and ECAPA-TDNN model from <a href='https://github.com/speechbrain/speechbrain' target='_blank'><b>SpeechBrain</b></a> to encode and classify speakers</h2>
|
348 |
</div>
|
349 |
''')
|
350 |
|