Spaces:

jhj0517
/

Whisper-WebUI

Running

App Files Files Community

jhj0517 committed on Jun 26, 2024

Commit

595b5f3

1 Parent(s): 6148cfe

add diarization

Browse files

Files changed (3) hide show

modules/diarize_pipeline.py +91 -0
modules/diarizer.py +122 -0
modules/whisper_base.py +34 -70

modules/diarize_pipeline.py ADDED Viewed

	@@ -0,0 +1,91 @@

+import numpy as np
+import pandas as pd
+from pyannote.audio import Pipeline
+from typing import Optional, Union
+import torch
+import whisperx
+import os
+class DiarizationPipeline:
+    """
+    Wrapper around a pretrained pyannote speaker-diarization pipeline.
+    The pipeline is loaded once in the constructor; calling the instance runs it on an audio
+    input and returns the result as a pandas DataFrame.
+    """
+    def __init__(
+        self,
+        model_name="pyannote/speaker-diarization-3.1",
+        cache_dir: str = os.path.join("models", "Whisper", "whisperx"),
+        use_auth_token=None,
+        device: Optional[Union[str, torch.device]] = "cpu",
+    ):
+        """
+        Load the pretrained pipeline and move it to the requested device.
+        Parameters
+        ----------
+        model_name: str
+            Identifier forwarded to `Pipeline.from_pretrained`.
+        cache_dir: str
+            Directory forwarded to `Pipeline.from_pretrained` as `cache_dir`.
+        use_auth_token: Optional[str]
+            Token forwarded to `Pipeline.from_pretrained`.
+        device: Optional[Union[str, torch.device]]
+            Target device. A string is converted with `torch.device`; None falls back to "cpu".
+        Raises
+        ----------
+        RuntimeError
+            If `Pipeline.from_pretrained` hands back None instead of a pipeline.
+        """
+        if device is None:
+            device = "cpu"
+        if isinstance(device, str):
+            device = torch.device(device)
+        pipeline = Pipeline.from_pretrained(
+            model_name,
+            use_auth_token=use_auth_token,
+            cache_dir=cache_dir
+        )
+        # NOTE(review): pyannote appears to return None (rather than raise) when the gated model
+        # cannot be downloaded - confirm. Fail with a readable message instead of an
+        # AttributeError on `.to`.
+        if pipeline is None:
+            raise RuntimeError(
+                f"Could not load the diarization pipeline \"{model_name}\". "
+                "Check the huggingface token and that the model's terms of use have been accepted."
+            )
+        self.model = pipeline.to(device)
+    def __call__(self, audio: Union[str, np.ndarray], min_speakers=None, max_speakers=None):
+        """
+        Run diarization on an audio input.
+        Parameters
+        ----------
+        audio: Union[str, np.ndarray]
+            A string is loaded with `whisperx.load_audio`; an array is used as given.
+        min_speakers: Optional[int]
+            Forwarded to the pipeline unchanged.
+        max_speakers: Optional[int]
+            Forwarded to the pipeline unchanged.
+        Returns
+        ----------
+        diarize_df: pd.DataFrame
+            One row per track yielded by the pipeline, with the columns
+            'segment', 'label', 'speaker', 'start' and 'end'.
+        """
+        if isinstance(audio, str):
+            audio = whisperx.load_audio(audio)
+        audio_data = {
+            # audio[None, :] adds a leading axis in front of the samples
+            'waveform': torch.from_numpy(audio[None, :]),
+            'sample_rate': whisperx.audio.SAMPLE_RATE
+        }
+        segments = self.model(audio_data, min_speakers=min_speakers, max_speakers=max_speakers)
+        diarize_df = pd.DataFrame(segments.itertracks(yield_label=True), columns=['segment', 'label', 'speaker'])
+        # expose the boundaries of each 'segment' object as plain columns
+        diarize_df['start'] = diarize_df['segment'].apply(lambda x: x.start)
+        diarize_df['end'] = diarize_df['segment'].apply(lambda x: x.end)
+        return diarize_df
+def _best_speaker(diarize_df, start, end, fill_nearest):
+    """
+    Pick the speaker whose diarization rows best match the span [start, end].
+    Returns the label with the largest summed positive overlap. When nothing overlaps, returns
+    the label of the row with the largest signed overlap if fill_nearest is true, otherwise None.
+    """
+    # signed overlap per diarization row: positive = shared duration, negative = size of the gap
+    overlap = np.minimum(diarize_df["end"].to_numpy(), end) - np.maximum(diarize_df["start"].to_numpy(), start)
+    scored = pd.DataFrame({"speaker": diarize_df["speaker"].to_numpy(), "overlap": overlap})
+    hits = scored[scored["overlap"] > 0]
+    if not hits.empty:
+        # Choosing most strong intersection
+        return hits.groupby("speaker")["overlap"].sum().idxmax()
+    if fill_nearest and not scored.empty:
+        # Otherwise choosing closest
+        return scored["speaker"].iloc[int(np.argmax(overlap))]
+    return None
+def assign_word_speakers(diarize_df, transcript_result, fill_nearest=False):
+    """
+    Attach speaker labels to transcript segments and to their timestamped words.
+    Parameters
+    ----------
+    diarize_df: pd.DataFrame
+        Diarization rows with 'start', 'end' and 'speaker' columns. It is only read.
+    transcript_result: dict
+        Dict with a "segments" list. Each segment has 'start' and 'end' and may have a 'words'
+        list; segments and words are updated in place with a "speaker" key when one is found.
+    fill_nearest: bool
+        If true, a span that overlaps no diarization row gets the speaker of the closest row.
+    Returns
+    ----------
+    transcript_result: dict
+        The same object that was passed in.
+    """
+    if diarize_df.empty:
+        # nothing to match against, leave the transcript untouched
+        return transcript_result
+    for seg in transcript_result["segments"]:
+        # assign speaker to segment (if any)
+        speaker = _best_speaker(diarize_df, seg["start"], seg["end"], fill_nearest)
+        if speaker is not None:
+            seg["speaker"] = speaker
+        # assign speaker to words
+        for word in seg.get("words") or ():
+            # a word needs both timestamps to be matched against the diarization
+            if "start" not in word or "end" not in word:
+                continue
+            word_speaker = _best_speaker(diarize_df, word["start"], word["end"], fill_nearest)
+            if word_speaker is not None:
+                word["speaker"] = word_speaker
+    return transcript_result
+class Segment:
+    """Time span with an optional speaker label."""
+    def __init__(self, start, end, speaker=None):
+        # plain value holder: the arguments are stored exactly as given
+        self.start, self.end, self.speaker = start, end, speaker

modules/diarizer.py ADDED Viewed

	@@ -0,0 +1,122 @@

+import os
+import whisperx
+import torch
+from typing import List
+import time
+from modules.diarize_pipeline import DiarizationPipeline
+class Diarizer:
+    """
+    Speaker-diarization post-processor for transcribed segments.
+    Builds a `DiarizationPipeline` on demand and prefixes each segment's text with its speaker.
+    """
+    def __init__(self,
+                 model_dir: str = os.path.join("models", "Whisper", "whisperx")
+                 ):
+        """
+        Parameters
+        ----------
+        model_dir: str
+            Directory used as the pipeline cache. It is created if it does not exist.
+        """
+        # device the pipeline is (or will be) loaded on; starts as the auto-detected one
+        self.device = self.get_device()
+        self.available_device = self.get_available_device()
+        self.compute_type = "float16"
+        self.model_dir = model_dir
+        os.makedirs(self.model_dir, exist_ok=True)
+        # built lazily by update_pipe()
+        self.pipe = None
+    def run(self,
+            audio: str,
+            transcribed_result: List[dict],
+            use_auth_token: str,
+            device: str
+            ):
+        """
+        Diarize transcribed result as a post-processing
+        Parameters
+        ----------
+        audio: str
+            Audio input. A file path is loaded with `whisperx.load_audio`; any other value is
+            handed to the pipeline as is.
+        transcribed_result: List[dict]
+            transcribed result through whisper.
+        use_auth_token: str
+            Huggingface token with READ permission. This is only needed the first time you download the model.
+            You must manually go to the website https://huggingface.co/pyannote/speaker-diarization-3.1 and agree to their TOS to download the model.
+        device: str
+            Device for diarization.
+        Returns
+        ----------
+        segments_result: List[dict]
+            list of dicts that includes start, end timestamps and transcribed text.
+            If the pipeline could not be set up, this is `transcribed_result` unchanged.
+        elapsed_time: float
+            elapsed time for running
+        """
+        start_time = time.time()
+        if (device != self.device
+                or self.pipe is None):
+            self.update_pipe(
+                device=device,
+                use_auth_token=use_auth_token
+            )
+        if self.pipe is None:
+            # update_pipe() has already reported the reason; return the transcription untouched
+            # instead of failing on a call to None
+            return transcribed_result, time.time() - start_time
+        if isinstance(audio, str):
+            audio = whisperx.load_audio(audio)
+        diarization_segments = self.pipe(audio)
+        diarized_result = whisperx.assign_word_speakers(
+            diarization_segments,
+            {"segments": transcribed_result}
+        )
+        for segment in diarized_result["segments"]:
+            speaker = segment.get("speaker", "None")
+            text = segment["text"]
+            # drop one leading space if there is one, never a real character
+            if text.startswith(" "):
+                text = text[1:]
+            segment["text"] = speaker + "|" + text
+        elapsed_time = time.time() - start_time
+        return diarized_result["segments"], elapsed_time
+    def update_pipe(self,
+                    use_auth_token: str,
+                    device: str
+                    ):
+        """
+        Set pipeline for diarization
+        Parameters
+        ----------
+        use_auth_token: str
+            Huggingface token with READ permission. This is only needed the first time you download the model.
+            You must manually go to the website https://huggingface.co/pyannote/speaker-diarization-3.1 and agree to their TOS to download the model.
+        device: str
+            Device for diarization.
+        """
+        os.makedirs(self.model_dir, exist_ok=True)
+        # an empty cache directory and no token means there is no way to obtain the model
+        if (not os.listdir(self.model_dir) and
+                not use_auth_token):
+            print(
+                "\nFailed to diarize. You need huggingface token and agree to their requirements to download the diarization model.\n"
+                "Go to \"https://huggingface.co/pyannote/speaker-diarization-3.1\" and follow their instructions to download the model.\n"
+            )
+            return
+        self.pipe = DiarizationPipeline(
+            use_auth_token=use_auth_token,
+            device=device,
+            cache_dir=self.model_dir
+        )
+        # remember where the pipeline lives so run() does not rebuild it on every call
+        self.device = device
+    @staticmethod
+    def get_device():
+        """Return "cuda" if available, else "mps" if available, else "cpu"."""
+        if torch.cuda.is_available():
+            return "cuda"
+        elif torch.backends.mps.is_available():
+            return "mps"
+        else:
+            return "cpu"
+    @staticmethod
+    def get_available_device():
+        """Return the list of usable device names: "cpu" plus "cuda" or "mps" when available."""
+        devices = ["cpu"]
+        if torch.cuda.is_available():
+            devices.append("cuda")
+        elif torch.backends.mps.is_available():
+            devices.append("mps")
+        return devices

modules/whisper_base.py CHANGED Viewed

@@ -1,19 +1,18 @@
 import os
 import torch
 from typing import List
-import whisperx
 import whisper
 import gradio as gr
 from abc import ABC, abstractmethod
 from typing import BinaryIO, Union, Tuple, List
 import numpy as np
 from datetime import datetime
-from dataclasses import astuple
 import time
 from modules.subtitle_manager import get_srt, get_vtt, get_txt, write_file, safe_filename
 from modules.youtube_manager import get_ytdata, get_ytaudio
 from modules.whisper_parameter import *
 class WhisperBase(ABC):
@@ -24,20 +23,16 @@ class WhisperBase(ABC):
         self.model = None
         self.current_model_size = None
         self.model_dir = model_dir
-        self.diarization_model_dir = os.path.join(self.model_dir, "..", "whisperx")
         self.output_dir = output_dir
         os.makedirs(self.output_dir, exist_ok=True)
         os.makedirs(self.model_dir, exist_ok=True)
-        os.makedirs(self.diarization_model_dir, exist_ok=True)
         self.available_models = whisper.available_models()
         self.available_langs = sorted(list(whisper.tokenizer.LANGUAGES.values()))
         self.translatable_models = ["large", "large-v1", "large-v2", "large-v3"]
         self.device = self.get_device()
         self.available_compute_types = ["float16", "float32"]
         self.current_compute_type = "float16" if self.device == "cuda" else "float32"
-        self.diarization_model = None
-        self.diarization_model_metadata = None
-        self.diarization_pipe = None
     @abstractmethod
     def transcribe(self,
@@ -59,8 +54,28 @@ class WhisperBase(ABC):
             audio: Union[str, BinaryIO, np.ndarray],
             progress: gr.Progress,
             *whisper_params,
-            ):
-        params = WhisperParameters.post_process(*whisper_params)
         if params.lang == "Automatic Detection":
             params.lang = None
@@ -75,65 +90,14 @@ class WhisperBase(ABC):
         )
         if params.is_diarize:
-            if params.lang is None:
-                print("Diarization Failed!! You have to specify the language explicitly to use diarization")
-            else:
-                result, elapsed_time_diarization = self.diarize(
-                    audio=audio,
-                    language_code=params.lang,
-                    use_auth_token=params.hf_token,
-                    transcribed_result=result
-                )
-                elapsed_time += elapsed_time_diarization
-        return result, elapsed_time
-    def diarize(self,
-                audio: str,
-                language_code: str,
-                use_auth_token: str,
-                transcribed_result: List[dict]
-                ):
-        start_time = time.time()
-        if (self.diarization_model is None or
-                self.diarization_model_metadata is None or
-                self.diarization_pipe is None):
-            self._update_diarization_model(
-                language_code=language_code,
-                use_auth_token=use_auth_token
             )
-        audio = whisperx.load_audio(audio)
-        diarization_segments = self.diarization_pipe(audio)
-        diarized_result = whisperx.assign_word_speakers(
-            diarization_segments,
-            {"segments": transcribed_result}
-        )
-        for segment in diarized_result["segments"]:
-            speaker = "None"
-            if "speaker" in segment:
-                speaker = segment["speaker"]
-            segment["text"] = speaker + "|" + segment["text"][1:]
-        elapsed_time = time.time() - start_time
-        return diarized_result["segments"], elapsed_time
-    def _update_diarization_model(self,
-                                  use_auth_token: str,
-                                  language_code: str
-                                  ):
-        print("loading diarization model...")
-        self.diarization_model, self.diarization_model_metadata = whisperx.load_align_model(
-            language_code=language_code,
-            device=self.device,
-            model_dir=self.diarization_model_dir,
-        )
-        self.diarization_pipe = whisperx.DiarizationPipeline(
-            use_auth_token=use_auth_token,
-            device=self.device
-        )
     def transcribe_file(self,
                         files: list,
@@ -156,7 +120,7 @@ class WhisperBase(ABC):
         progress: gr.Progress
             Indicator to show progress directly in gradio.
         *whisper_params: tuple
-            Gradio components related to Whisper. see whisper_data_class.py for details.
         Returns
         ----------
@@ -223,7 +187,7 @@ class WhisperBase(ABC):
         progress: gr.Progress
             Indicator to show progress directly in gradio.
         *whisper_params: tuple
-            Gradio components related to Whisper. see whisper_data_class.py for details.
         Returns
         ----------
@@ -278,7 +242,7 @@ class WhisperBase(ABC):
         progress: gr.Progress
             Indicator to show progress directly in gradio.
         *whisper_params: tuple
-            Gradio components related to Whisper. see whisper_data_class.py for details.
         Returns
         ----------

 import os
 import torch
 from typing import List
 import whisper
 import gradio as gr
 from abc import ABC, abstractmethod
 from typing import BinaryIO, Union, Tuple, List
 import numpy as np
 from datetime import datetime
 import time
 from modules.subtitle_manager import get_srt, get_vtt, get_txt, write_file, safe_filename
 from modules.youtube_manager import get_ytdata, get_ytaudio
 from modules.whisper_parameter import *
+from modules.diarizer import Diarizer
 class WhisperBase(ABC):
         self.model = None
         self.current_model_size = None
         self.model_dir = model_dir
         self.output_dir = output_dir
         os.makedirs(self.output_dir, exist_ok=True)
         os.makedirs(self.model_dir, exist_ok=True)
         self.available_models = whisper.available_models()
         self.available_langs = sorted(list(whisper.tokenizer.LANGUAGES.values()))
         self.translatable_models = ["large", "large-v1", "large-v2", "large-v3"]
         self.device = self.get_device()
         self.available_compute_types = ["float16", "float32"]
         self.current_compute_type = "float16" if self.device == "cuda" else "float32"
+        self.diarizer = Diarizer()
     @abstractmethod
     def transcribe(self,
             audio: Union[str, BinaryIO, np.ndarray],
             progress: gr.Progress,
             *whisper_params,
+            ) -> Tuple[List[dict], float]:
+        """
+        Run transcription with conditional post-processing.
+        The diarization will be performed in post-processing if enabled.
+        Parameters
+        ----------
+        audio: Union[str, BinaryIO, np.ndarray]
+            Audio input. This can be file path or binary type.
+        progress: gr.Progress
+            Indicator to show progress directly in gradio.
+        *whisper_params: tuple
+            Parameters related with whisper. This will be dealt with "WhisperParameters" data class
+        Returns
+        ----------
+        segments_result: List[dict]
+            list of dicts that includes start, end timestamps and transcribed text
+        elapsed_time: float
+            elapsed time for running
+        """
+        params = WhisperParameters.as_value(*whisper_params)
         if params.lang == "Automatic Detection":
             params.lang = None
         )
         if params.is_diarize:
+            result, elapsed_time_diarization = self.diarizer.run(
+                audio=audio,
+                use_auth_token=params.hf_token,
+                transcribed_result=result,
+                device=self.device
             )
+            elapsed_time += elapsed_time_diarization
+        return result, elapsed_time
     def transcribe_file(self,
                         files: list,
         progress: gr.Progress
             Indicator to show progress directly in gradio.
         *whisper_params: tuple
+            Parameters related with whisper. This will be dealt with "WhisperParameters" data class
         Returns
         ----------
         progress: gr.Progress
             Indicator to show progress directly in gradio.
         *whisper_params: tuple
+            Parameters related with whisper. This will be dealt with "WhisperParameters" data class
         Returns
         ----------
         progress: gr.Progress
             Indicator to show progress directly in gradio.
         *whisper_params: tuple
+            Parameters related with whisper. This will be dealt with "WhisperParameters" data class
         Returns
         ----------