Files changed (2)
  1. app.py +35 -67
  2. requirements.txt +7 -9
app.py CHANGED
@@ -1,5 +1,4 @@
-# import whisper
-from faster_whisper import WhisperModel
+import whisper
 import datetime
 import subprocess
 import gradio as gr
@@ -13,7 +12,6 @@ from sklearn.cluster import AgglomerativeClustering
 from sklearn.metrics import silhouette_score
 
 from pytube import YouTube
-import yt_dlp
 import torch
 import pyannote.audio
 from pyannote.audio.pipelines.speaker_verification import PretrainedSpeakerEmbedding
@@ -27,7 +25,7 @@ import contextlib
 from transformers import pipeline
 import psutil
 
-whisper_models = ["tiny", "base", "small", "medium", "large-v1", "large-v2"]
+whisper_models = ["base", "small", "medium", "large"]
 source_languages = {
     "en": "English",
     "zh": "Chinese",
@@ -175,48 +173,26 @@ def _return_yt_html_embed(yt_url):
     return HTML_str
 
 def yt_transcribe(yt_url):
-    # yt = YouTube(yt_url)
-    # html_embed_str = _return_yt_html_embed(yt_url)
-    # stream = yt.streams.filter(only_audio=True)[0]
-    # stream.download(filename="audio.mp3")
-
-    ydl_opts = {
-        'format': 'bestvideo*+bestaudio/best',
-        'postprocessors': [{
-            'key': 'FFmpegExtractAudio',
-            'preferredcodec': 'mp3',
-            'preferredquality': '192',
-        }],
-        'outtmpl':'audio.%(ext)s',
-    }
-
-    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
-        ydl.download([yt_url])
-
+    yt = YouTube(yt_url)
+    html_embed_str = _return_yt_html_embed(yt_url)
+    stream = yt.streams.filter(only_audio=True)[0]
+    stream.download(filename="audio.mp3")
+
     text = pipe("audio.mp3")["text"]
+
     return html_embed_str, text
 
 def convert_time(secs):
     return datetime.timedelta(seconds=round(secs))
 
 def get_youtube(video_url):
-    # yt = YouTube(video_url)
-    # abs_video_path = yt.streams.filter(progressive=True, file_extension='mp4').order_by('resolution').desc().first().download()
-
-    ydl_opts = {
-        'format': 'bestvideo[ext=mp4]+bestaudio[ext=m4a]/best[ext=mp4]/best',
-    }
-
-    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
-        info = ydl.extract_info(video_url, download=False)
-        abs_video_path = ydl.prepare_filename(info)
-        ydl.process_info(info)
-
+    yt = YouTube(video_url)
+    abs_video_path = yt.streams.filter(progressive=True, file_extension='mp4').order_by('resolution').desc().first().download()
     print("Success download video")
     print(abs_video_path)
     return abs_video_path
 
-def speech_to_text(video_file_path, selected_source_lang, whisper_model, num_speakers):
+def speech_to_text(video_file_path, selected_source_lang, whisper_model, min_num_speakers, max_number_speakers):
     """
     # Transcribe youtube link using OpenAI Whisper
     1. Using Open AI's Whisper model to seperate audio into segments and generate transcripts.
@@ -227,9 +203,7 @@ def speech_to_text(video_file_path, selected_source_lang, whisper_model, num_speakers):
     Speaker diarization model and pipeline from by https://github.com/pyannote/pyannote-audio
     """
 
-    # model = whisper.load_model(whisper_model)
-    # model = WhisperModel(whisper_model, device="cuda", compute_type="int8_float16")
-    model = WhisperModel(whisper_model, compute_type="int8")
+    model = whisper.load_model(whisper_model)
     time_start = time.time()
     if(video_file_path == None):
         raise ValueError("Error no video input")
@@ -253,19 +227,9 @@ def speech_to_text(video_file_path, selected_source_lang, whisper_model, num_speakers):
         # Transcribe audio
         options = dict(language=selected_source_lang, beam_size=5, best_of=5)
         transcribe_options = dict(task="transcribe", **options)
-        segments_raw, info = model.transcribe(audio_file, **transcribe_options)
-
-        # Convert back to original openai format
-        segments = []
-        i = 0
-        for segment_chunk in segments_raw:
-            chunk = {}
-            chunk["start"] = segment_chunk.start
-            chunk["end"] = segment_chunk.end
-            chunk["text"] = segment_chunk.text
-            segments.append(chunk)
-            i += 1
-        print("transcribe audio done with fast whisper")
+        result = model.transcribe(audio_file, **transcribe_options)
+        segments = result["segments"]
+        print("starting whisper done with whisper")
    except Exception as e:
        raise RuntimeError("Error converting video to audio")
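
Note on the transcription hunk above: openai-whisper's `transcribe()` returns a dict directly, so the loop that converted faster-whisper's segment generator back into that format is dropped. A minimal sketch of the shape the downstream code now consumes (assuming the openai-whisper package and a local `audio.wav`; the file name and checkpoint are illustrative):

import whisper

# Load one of the checkpoints listed in whisper_models, e.g. "base".
model = whisper.load_model("base")

# transcribe() returns a dict; result["segments"] is a list of dicts whose
# "start"/"end" timestamps (seconds) and "text" fields are what the
# diarization step reads later.
result = model.transcribe("audio.wav", task="transcribe", language="en",
                          beam_size=5, best_of=5)
for seg in result["segments"]:
    print(f'{seg["start"]:7.2f} - {seg["end"]:7.2f}  {seg["text"]}')
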
@@ -286,19 +250,22 @@ def speech_to_text(video_file_path, selected_source_lang, whisper_model, num_speakers):
     embeddings = np.nan_to_num(embeddings)
     print(f'Embedding shape: {embeddings.shape}')
 
-    if num_speakers == 0:
     # Find the best number of speakers
-        score_num_speakers = {}
-
-        for num_speakers in range(2, 10+1):
-            clustering = AgglomerativeClustering(num_speakers).fit(embeddings)
-            score = silhouette_score(embeddings, clustering.labels_, metric='euclidean')
-            score_num_speakers[num_speakers] = score
-        best_num_speaker = max(score_num_speakers, key=lambda x:score_num_speakers[x])
-        print(f"The best number of speakers: {best_num_speaker} with {score_num_speakers[best_num_speaker]} score")
+    if min_num_speakers > max_number_speakers:
+        min_speakers = max_number_speakers
+        max_speakers = min_num_speakers
     else:
-        best_num_speaker = num_speakers
-
+        min_speakers = min_num_speakers
+        max_speakers = max_number_speakers
+    score_num_speakers = {}
+
+    for num_speakers in range(min_speakers, max_speakers+1):
+        clustering = AgglomerativeClustering(num_speakers).fit(embeddings)
+        score = silhouette_score(embeddings, clustering.labels_, metric='euclidean')
+        score_num_speakers[num_speakers] = score
+    best_num_speaker = max(score_num_speakers, key=lambda x:score_num_speakers[x])
+    print(f"The best number of speakers: {best_num_speaker} with {score_num_speakers[best_num_speaker]} score")
+
     # Assign speaker label
     clustering = AgglomerativeClustering(best_num_speaker).fit(embeddings)
     labels = clustering.labels_
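
The hunk above replaces the fixed 2-10 speaker search with a user-supplied minimum and maximum. A standalone sketch of the same silhouette-based selection on synthetic embeddings (the helper name and toy data are illustrative, not part of app.py):

import numpy as np
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import silhouette_score

# Synthetic stand-in for the per-segment speaker embeddings (3 well-separated speakers).
rng = np.random.default_rng(0)
embeddings = np.vstack([rng.normal(loc=c, scale=0.1, size=(20, 192)) for c in (0.0, 1.0, 2.0)])

def pick_num_speakers(embeddings, min_speakers=2, max_speakers=6):
    # Swap the bounds if they arrive in the wrong order, as the new app code does.
    if min_speakers > max_speakers:
        min_speakers, max_speakers = max_speakers, min_speakers
    scores = {}
    for n in range(min_speakers, max_speakers + 1):
        labels = AgglomerativeClustering(n).fit(embeddings).labels_
        # silhouette_score needs at least 2 clusters and fewer clusters than samples.
        scores[n] = silhouette_score(embeddings, labels, metric='euclidean')
    return max(scores, key=scores.get)

print(pick_num_speakers(embeddings))  # expected to pick 3 on this toy data
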
@@ -353,7 +320,8 @@ df_init = pd.DataFrame(columns=['Start', 'End', 'Speaker', 'Text'])
 memory = psutil.virtual_memory()
 selected_source_lang = gr.Dropdown(choices=source_language_list, type="value", value="en", label="Spoken language in video", interactive=True)
 selected_whisper_model = gr.Dropdown(choices=whisper_models, type="value", value="base", label="Selected Whisper model", interactive=True)
-number_speakers = gr.Number(precision=0, value=0, label="Input number of speakers for better results. If value=0, model will automatic find the best number of speakers", interactive=True)
+input_min_number_speakers = gr.Number(precision=0, value=2, label="Select minimum number of speakers", interactive=True)
+input_max_number_speakers = gr.Number(precision=0, value=2, label="Select maximum number of speakers", interactive=True)
 system_info = gr.Markdown(f"*Memory: {memory.total / (1024 * 1024 * 1024):.2f}GB, used: {memory.percent}%, available: {memory.available / (1024 * 1024 * 1024):.2f}GB*")
 download_transcript = gr.File(label="Download transcript")
 transcription_df = gr.DataFrame(value=df_init,label="Transcription dataframe", row_count=(0, "dynamic"), max_rows = 10, wrap=True, overflow_row_behaviour='paginate')
@@ -367,8 +335,7 @@ with demo:
     gr.Markdown('''
         <div>
         <h1 style='text-align: center'>Whisper speaker diarization</h1>
-        This space uses Whisper models from <a href='https://github.com/openai/whisper' target='_blank'><b>OpenAI</b></a> with <a href='https://github.com/guillaumekln/faster-whisper' target='_blank'><b>CTranslate2</b></a> which is a fast inference engine for Transformer models to recognize the speech (4 times faster than original openai model with same accuracy)
-        and ECAPA-TDNN model from <a href='https://github.com/speechbrain/speechbrain' target='_blank'><b>SpeechBrain</b></a> to encode and clasify speakers
+        This space uses Whisper models from <a href='https://github.com/openai/whisper' target='_blank'><b>OpenAI</b></a> to recoginze the speech and ECAPA-TDNN model from <a href='https://github.com/speechbrain/speechbrain' target='_blank'><b>SpeechBrain</b></a> to encode and clasify speakers</h2>
         </div>
     ''')
 
@@ -411,10 +378,11 @@ with demo:
             ''')
             selected_source_lang.render()
            selected_whisper_model.render()
-            number_speakers.render()
+            input_min_number_speakers.render()
+            input_max_number_speakers.render()
             transcribe_btn = gr.Button("Transcribe audio and diarization")
             transcribe_btn.click(speech_to_text,
-                [video_in, selected_source_lang, selected_whisper_model, number_speakers],
+                [video_in, selected_source_lang, selected_whisper_model, input_min_number_speakers, input_max_number_speakers],
                 [transcription_df, system_info, download_transcript]
             )
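
Taken together, the app.py changes swap the single `num_speakers` input for a min/max pair that is passed straight into `speech_to_text`. A hypothetical direct call of the updated entry point (assuming the Space's app.py is importable as `app` and a local video file exists; per the click wiring above, the three return values feed the dataframe, system-info and download components):

from app import speech_to_text  # hypothetical: import this Space's module

# video path, language code, Whisper checkpoint, then the new speaker bounds
transcription_df, system_info, transcript_file = speech_to_text(
    "meeting.mp4",
    "en",
    "base",
    min_num_speakers=2,
    max_number_speakers=6,
)
print(transcript_file)
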
 
requirements.txt CHANGED
@@ -1,22 +1,20 @@
 git+https://github.com/huggingface/transformers
 git+https://github.com/pyannote/pyannote-audio
 git+https://github.com/openai/whisper.git
-gradio
+gradio==3.12
 ffmpeg-python
-pandas
-pytube
+pandas==1.5.0
+pytube==12.1.0
 sacremoses
 sentencepiece
 tokenizers
 torch
 torchaudio
-tqdm
-EasyNMT
+tqdm==4.64.1
+EasyNMT==2.0.2
 nltk
 transformers
 pysrt
-psutil
+psutil==5.9.2
 requests
-gpuinfo
-faster-whisper
-yt-dlp
+gpuinfo