vumichien committed
Commit 1ce609e
1 Parent(s): 494edc1

Update app.py

Files changed (1)
  1. app.py +21 -8
app.py CHANGED
@@ -1,4 +1,5 @@
-import whisper
+# import whisper
+from faster_whisper import WhisperModel
 import datetime
 import subprocess
 import gradio as gr
@@ -25,7 +26,7 @@ import contextlib
 from transformers import pipeline
 import psutil
 
-whisper_models = ["base", "small", "medium", "large"]
+whisper_models = ["tiny", "base", "small", "medium", "large-v1", "large-v2"]
 source_languages = {
     "en": "English",
     "zh": "Chinese",
@@ -203,7 +204,8 @@ def speech_to_text(video_file_path, selected_source_lang, whisper_model, num_spe
     Speaker diarization model and pipeline from https://github.com/pyannote/pyannote-audio
     """
 
-    model = whisper.load_model(whisper_model)
+    # model = whisper.load_model(whisper_model)
+    model = WhisperModel(whisper_model, device="cuda", compute_type="int8_float16")
     time_start = time.time()
     if(video_file_path == None):
         raise ValueError("Error no video input")
@@ -227,9 +229,19 @@ def speech_to_text(video_file_path, selected_source_lang, whisper_model, num_spe
         # Transcribe audio
         options = dict(language=selected_source_lang, beam_size=5, best_of=5)
         transcribe_options = dict(task="transcribe", **options)
-        result = model.transcribe(audio_file, **transcribe_options)
-        segments = result["segments"]
-        print("starting whisper done with whisper")
+        segments_raw, info = model.transcribe(audio_file, **transcribe_options)
+
+        # Convert the faster-whisper generator output back to the openai-whisper segment format
+        segments = []
+        i = 0
+        for segment_chunk in segments_raw:
+            chunk = {}
+            chunk["start"] = segment_chunk.start
+            chunk["end"] = segment_chunk.end
+            chunk["text"] = segment_chunk.text
+            segments.append(chunk)
+            i += 1
+        print("transcribe audio done with faster-whisper")
     except Exception as e:
         raise RuntimeError("Error converting video to audio")
 
@@ -330,8 +342,9 @@ with demo:
     with gr.Tab("Whisper speaker diarization"):
        gr.Markdown('''
            <div>
-               <h1 style='text-align: center'>Whisper speaker diarization</h1>
-               This space uses Whisper models from <a href='https://github.com/openai/whisper' target='_blank'><b>OpenAI</b></a> to recoginze the speech and ECAPA-TDNN model from <a href='https://github.com/speechbrain/speechbrain' target='_blank'><b>SpeechBrain</b></a> to encode and clasify speakers</h2>
+               <h1 style='text-align: center'>Whisper speaker diarization</h1>
+               <h2>This space uses Whisper models from <a href='https://github.com/openai/whisper' target='_blank'><b>OpenAI</b></a> with <a href='https://github.com/guillaumekln/faster-whisper' target='_blank'><b>CTranslate2</b></a>, a fast inference engine for Transformer models, to recognize speech (about 4 times faster than the original OpenAI implementation at the same accuracy),
+               and the ECAPA-TDNN model from <a href='https://github.com/speechbrain/speechbrain' target='_blank'><b>SpeechBrain</b></a> to encode and classify speakers</h2>
            </div>
            ''')
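
For context, below is a minimal, self-contained sketch of the faster-whisper call pattern this commit adopts. It is an illustration, not part of the commit: `audio.wav` is a placeholder input, and `device="cpu"` with `compute_type="int8"` are stand-ins so the sketch runs without CUDA (the Space itself uses `device="cuda"` with `compute_type="int8_float16"`).

```python
# Minimal sketch of the faster-whisper usage adopted in this commit.
# "audio.wav", the model size, and the language are placeholder values.
from faster_whisper import WhisperModel

model = WhisperModel("base", device="cpu", compute_type="int8")

# transcribe() returns a lazy generator of Segment objects plus a
# TranscriptionInfo (detected language, audio duration, ...).
segments_raw, info = model.transcribe(
    "audio.wav", task="transcribe", language="en", beam_size=5, best_of=5
)

# Materialize into the dict shape openai-whisper puts in result["segments"],
# which the rest of app.py still expects.
segments = [{"start": s.start, "end": s.end, "text": s.text} for s in segments_raw]

print(f"language={info.language}, duration={info.duration:.1f}s, segments={len(segments)}")
```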
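A note on the new call pattern: faster-whisper's `transcribe()` returns a lazy generator, so the transcription work actually happens while the conversion loop consumes it; materializing the plain-dict `segments` list up front keeps the downstream diarization code unchanged. The `compute_type="int8_float16"` setting asks CTranslate2 to store weights as 8-bit integers while computing in float16, which reduces GPU memory use at, per the project's benchmarks, comparable accuracy.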