Whisper_speaker_diarization

Running on T4

App Files Community

vumichien committed on Jan 2, 2023

Commit

c5a0faa

•

1 Parent(s): 93d1452

Update app.py

Browse files

Files changed (1) hide show

app.py +8 -8

app.py CHANGED Viewed

@@ -172,11 +172,11 @@ def speech_to_text(video_file_path, selected_source_lang, whisper_model, num_spe
         # Read and convert youtube video
         _,file_ending = os.path.splitext(f'{video_file_path}')
         print(f'file enging is {file_ending}')
         print("starting conversion to wav")
-        os.system(f'ffmpeg -i "{video_file_path}" -ar 16000 -ac 1 -c:a pcm_s16le "{video_file_path.replace(file_ending, ".wav")}"')
         # Get duration
-        audio_file = video_file_path.replace(file_ending, ".wav")
         with contextlib.closing(wave.open(audio_file,'r')) as f:
             frames = f.getnframes()
             rate = f.getframerate()
@@ -184,10 +184,9 @@ def speech_to_text(video_file_path, selected_source_lang, whisper_model, num_spe
         print(f"conversion to wav ready, duration of audio file: {duration}")
         # Transcribe audio
-        # options = dict(language=selected_source_lang, beam_size=5, best_of=5)
-        # transcribe_options = dict(task="transcribe", **options)
-        # result = model.transcribe(audio_file, **transcribe_options)
-        result = model.transcribe(audio_file, task="transcribe", language=selected_source_lang)
         segments = result["segments"]
         print("starting whisper done with whisper")
     except Exception as e:
@@ -243,6 +242,7 @@ def speech_to_text(video_file_path, selected_source_lang, whisper_model, num_spe
 # ---- Gradio Layout -----
 video_in = gr.Video(label="Video file", mirror_webcam=False)
 youtube_url_in = gr.Textbox(label="Youtube url", lines=1, interactive=True)
 video_out = gr.Video(label="Video Out", mirror_webcam=False)
@@ -305,8 +305,8 @@ with demo:
             with gr.Column():
                 gr.Markdown('''
                 ##### Here you can start the transcription process.
-                ##### Please select source language for transcription.
-                ##### Please select number of speakers for getting better results.
                 ''')
             selected_source_lang.render()
             selected_whisper_model.render()

         # Read and convert youtube video
         _,file_ending = os.path.splitext(f'{video_file_path}')
         print(f'file enging is {file_ending}')
+        audio_file = video_file_path.replace(file_ending, ".wav")
         print("starting conversion to wav")
+        os.system(f'ffmpeg -i "{video_file_path}" -ar 16000 -ac 1 -c:a pcm_s16le "{audio_file}"')
         # Get duration
         with contextlib.closing(wave.open(audio_file,'r')) as f:
             frames = f.getnframes()
             rate = f.getframerate()
         print(f"conversion to wav ready, duration of audio file: {duration}")
         # Transcribe audio
+        options = dict(language=selected_source_lang, beam_size=5, best_of=5)
+        transcribe_options = dict(task="transcribe", **options)
+        result = model.transcribe(audio_file, **transcribe_options)
         segments = result["segments"]
         print("starting whisper done with whisper")
     except Exception as e:
 # ---- Gradio Layout -----
+# Inspiration from https://huggingface.co/spaces/RASMUS/Whisper-youtube-crosslingual-subtitles
 video_in = gr.Video(label="Video file", mirror_webcam=False)
 youtube_url_in = gr.Textbox(label="Youtube url", lines=1, interactive=True)
 video_out = gr.Video(label="Video Out", mirror_webcam=False)
             with gr.Column():
                 gr.Markdown('''
                 ##### Here you can start the transcription process.
+                ##### Please select the source language for transcription.
+                ##### You should select a number of speakers for getting better results.
                 ''')
             selected_source_lang.render()
             selected_whisper_model.render()