Commit · ae3884d
Parent(s): 9271e46
Add application file

- models/models will be saved here.txt +0 -0
- modules/__init__.py +0 -0
- modules/__pycache__/__init__.cpython-310.pyc +0 -0
- modules/diarize/__init__.py +0 -0
- modules/diarize/__pycache__/__init__.cpython-310.pyc +0 -0
- modules/diarize/__pycache__/diarize_pipeline.cpython-310.pyc +0 -0
- modules/diarize/__pycache__/diarizer.cpython-310.pyc +0 -0
- modules/diarize/audio_loader.py +179 -0
- modules/diarize/diarize_pipeline.py +94 -0
- modules/diarize/diarizer.py +132 -0
- modules/translation/__init__.py +0 -0
- modules/translation/deepl_api.py +201 -0
- modules/translation/nllb_inference.py +276 -0
- modules/translation/translation_base.py +151 -0
- modules/utils/__init__.py +0 -0
- modules/utils/__pycache__/__init__.cpython-310.pyc +0 -0
- modules/utils/__pycache__/files_manager.cpython-310.pyc +0 -0
- modules/utils/__pycache__/subtitle_manager.cpython-310.pyc +0 -0
- modules/utils/__pycache__/youtube_manager.cpython-310.pyc +0 -0
- modules/utils/files_manager.py +39 -0
- modules/utils/subtitle_manager.py +135 -0
- modules/utils/youtube_manager.py +15 -0
- modules/vad/__init__.py +0 -0
- modules/vad/silero_vad.py +264 -0
- modules/whisper/__init__.py +0 -0
- modules/whisper/__pycache__/__init__.cpython-310.pyc +0 -0
- modules/whisper/__pycache__/faster_whisper_inference.cpython-310.pyc +0 -0
- modules/whisper/__pycache__/whisper_base.cpython-310.pyc +0 -0
- modules/whisper/__pycache__/whisper_factory.cpython-310.pyc +0 -0
- modules/whisper/__pycache__/whisper_parameter.cpython-310.pyc +0 -0
- modules/whisper/faster_whisper_inference.py +191 -0
- modules/whisper/insanely_fast_whisper_inference.py +185 -0
- modules/whisper/whisper_Inference.py +101 -0
- modules/whisper/whisper_base.py +436 -0
- modules/whisper/whisper_factory.py +81 -0
- modules/whisper/whisper_parameter.py +277 -0
- outputs/outputs are saved here.txt +0 -0
- outputs/translations/outputs for translation are saved here.txt +0 -0
models/models will be saved here.txt
ADDED
File without changes

modules/__init__.py
ADDED
File without changes

modules/__pycache__/__init__.cpython-310.pyc
ADDED
Binary file (167 Bytes)

modules/diarize/__init__.py
ADDED
File without changes

modules/diarize/__pycache__/__init__.cpython-310.pyc
ADDED
Binary file (175 Bytes)

modules/diarize/__pycache__/diarize_pipeline.cpython-310.pyc
ADDED
Binary file (3.06 kB)

modules/diarize/__pycache__/diarizer.cpython-310.pyc
ADDED
Binary file (4.14 kB)
modules/diarize/audio_loader.py
ADDED
@@ -0,0 +1,179 @@
# Adapted from https://github.com/m-bain/whisperX/blob/main/whisperx/audio.py

import os
import subprocess
from functools import lru_cache
from typing import Optional, Union
from scipy.io.wavfile import write
import tempfile

import numpy as np
import torch
import torch.nn.functional as F

def exact_div(x, y):
    assert x % y == 0
    return x // y

# hard-coded audio hyperparameters
SAMPLE_RATE = 16000
N_FFT = 400
HOP_LENGTH = 160
CHUNK_LENGTH = 30
N_SAMPLES = CHUNK_LENGTH * SAMPLE_RATE  # 480000 samples in a 30-second chunk
N_FRAMES = exact_div(N_SAMPLES, HOP_LENGTH)  # 3000 frames in a mel spectrogram input

N_SAMPLES_PER_TOKEN = HOP_LENGTH * 2  # the initial convolutions has stride 2
FRAMES_PER_SECOND = exact_div(SAMPLE_RATE, HOP_LENGTH)  # 10ms per audio frame
TOKENS_PER_SECOND = exact_div(SAMPLE_RATE, N_SAMPLES_PER_TOKEN)  # 20ms per audio token


def load_audio(file: Union[str, np.ndarray], sr: int = SAMPLE_RATE) -> np.ndarray:
    """
    Open an audio file or process a numpy array containing audio data as mono waveform, resampling as necessary.

    Parameters
    ----------
    file: Union[str, np.ndarray]
        The audio file to open or a numpy array containing the audio data.

    sr: int
        The sample rate to resample the audio if necessary.

    Returns
    -------
    A NumPy array containing the audio waveform, in float32 dtype.
    """
    if isinstance(file, np.ndarray):
        if file.dtype != np.float32:
            file = file.astype(np.float32)
        if file.ndim > 1:
            file = np.mean(file, axis=1)

        temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
        write(temp_file.name, SAMPLE_RATE, (file * 32768).astype(np.int16))
        temp_file_path = temp_file.name
        temp_file.close()
    else:
        temp_file_path = file

    try:
        cmd = [
            "ffmpeg",
            "-nostdin",
            "-threads",
            "0",
            "-i",
            temp_file_path,
            "-f",
            "s16le",
            "-ac",
            "1",
            "-acodec",
            "pcm_s16le",
            "-ar",
            str(sr),
            "-",
        ]
        out = subprocess.run(cmd, capture_output=True, check=True).stdout
    except subprocess.CalledProcessError as e:
        raise RuntimeError(f"Failed to load audio: {e.stderr.decode()}") from e
    finally:
        if isinstance(file, np.ndarray):
            os.remove(temp_file_path)

    return np.frombuffer(out, np.int16).flatten().astype(np.float32) / 32768.0


def pad_or_trim(array, length: int = N_SAMPLES, *, axis: int = -1):
    """
    Pad or trim the audio array to N_SAMPLES, as expected by the encoder.
    """
    if torch.is_tensor(array):
        if array.shape[axis] > length:
            array = array.index_select(
                dim=axis, index=torch.arange(length, device=array.device)
            )

        if array.shape[axis] < length:
            pad_widths = [(0, 0)] * array.ndim
            pad_widths[axis] = (0, length - array.shape[axis])
            array = F.pad(array, [pad for sizes in pad_widths[::-1] for pad in sizes])
    else:
        if array.shape[axis] > length:
            array = array.take(indices=range(length), axis=axis)

        if array.shape[axis] < length:
            pad_widths = [(0, 0)] * array.ndim
            pad_widths[axis] = (0, length - array.shape[axis])
            array = np.pad(array, pad_widths)

    return array


@lru_cache(maxsize=None)
def mel_filters(device, n_mels: int) -> torch.Tensor:
    """
    load the mel filterbank matrix for projecting STFT into a Mel spectrogram.
    Allows decoupling librosa dependency; saved using:

        np.savez_compressed(
            "mel_filters.npz",
            mel_80=librosa.filters.mel(sr=16000, n_fft=400, n_mels=80),
        )
    """
    assert n_mels in [80, 128], f"Unsupported n_mels: {n_mels}"
    with np.load(
        os.path.join(os.path.dirname(__file__), "assets", "mel_filters.npz")
    ) as f:
        return torch.from_numpy(f[f"mel_{n_mels}"]).to(device)


def log_mel_spectrogram(
    audio: Union[str, np.ndarray, torch.Tensor],
    n_mels: int,
    padding: int = 0,
    device: Optional[Union[str, torch.device]] = None,
):
    """
    Compute the log-Mel spectrogram of

    Parameters
    ----------
    audio: Union[str, np.ndarray, torch.Tensor], shape = (*)
        The path to audio or either a NumPy array or Tensor containing the audio waveform in 16 kHz

    n_mels: int
        The number of Mel-frequency filters, only 80 is supported

    padding: int
        Number of zero samples to pad to the right

    device: Optional[Union[str, torch.device]]
        If given, the audio tensor is moved to this device before STFT

    Returns
    -------
    torch.Tensor, shape = (80, n_frames)
        A Tensor that contains the Mel spectrogram
    """
    if not torch.is_tensor(audio):
        if isinstance(audio, str):
            audio = load_audio(audio)
        audio = torch.from_numpy(audio)

    if device is not None:
        audio = audio.to(device)
    if padding > 0:
        audio = F.pad(audio, (0, padding))
    window = torch.hann_window(N_FFT).to(audio.device)
    stft = torch.stft(audio, N_FFT, HOP_LENGTH, window=window, return_complex=True)
    magnitudes = stft[..., :-1].abs() ** 2

    filters = mel_filters(audio.device, n_mels)
    mel_spec = filters @ magnitudes

    log_spec = torch.clamp(mel_spec, min=1e-10).log10()
    log_spec = torch.maximum(log_spec, log_spec.max() - 8.0)
    log_spec = (log_spec + 4.0) / 4.0
    return log_spec
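For orientation, a minimal usage sketch of the helpers above. The file name "audio.wav" is a placeholder, and it assumes the "assets/mel_filters.npz" file ships next to this module as mel_filters() expects:

# Hypothetical usage sketch; file name and n_mels choice are assumptions.
from modules.diarize.audio_loader import load_audio, pad_or_trim, log_mel_spectrogram, N_SAMPLES

waveform = load_audio("audio.wav")            # float32 mono waveform at 16 kHz, decoded via ffmpeg
chunk = pad_or_trim(waveform, N_SAMPLES)      # pad/trim to one 30-second window (480000 samples)
mel = log_mel_spectrogram(chunk, n_mels=80)   # (80, 3000) log-Mel tensor, as expected by the encoder
print(mel.shape)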
modules/diarize/diarize_pipeline.py
ADDED
@@ -0,0 +1,94 @@
# Adapted from https://github.com/m-bain/whisperX/blob/main/whisperx/diarize.py

import numpy as np
import pandas as pd
import os
from pyannote.audio import Pipeline
from typing import Optional, Union
import torch

from modules.diarize.audio_loader import load_audio, SAMPLE_RATE


class DiarizationPipeline:
    def __init__(
        self,
        model_name="pyannote/speaker-diarization-3.1",
        cache_dir: str = os.path.join("models", "Diarization"),
        use_auth_token=None,
        device: Optional[Union[str, torch.device]] = "cpu",
    ):
        if isinstance(device, str):
            device = torch.device(device)
        self.model = Pipeline.from_pretrained(
            model_name,
            use_auth_token=use_auth_token,
            cache_dir=cache_dir
        ).to(device)

    def __call__(self, audio: Union[str, np.ndarray], min_speakers=None, max_speakers=None):
        if isinstance(audio, str):
            audio = load_audio(audio)
        audio_data = {
            'waveform': torch.from_numpy(audio[None, :]),
            'sample_rate': SAMPLE_RATE
        }
        segments = self.model(audio_data, min_speakers=min_speakers, max_speakers=max_speakers)
        diarize_df = pd.DataFrame(segments.itertracks(yield_label=True), columns=['segment', 'label', 'speaker'])
        diarize_df['start'] = diarize_df['segment'].apply(lambda x: x.start)
        diarize_df['end'] = diarize_df['segment'].apply(lambda x: x.end)
        return diarize_df


def assign_word_speakers(diarize_df, transcript_result, fill_nearest=False):
    transcript_segments = transcript_result["segments"]
    for seg in transcript_segments:
        # assign speaker to segment (if any)
        diarize_df['intersection'] = np.minimum(diarize_df['end'], seg['end']) - np.maximum(diarize_df['start'],
                                                                                            seg['start'])
        diarize_df['union'] = np.maximum(diarize_df['end'], seg['end']) - np.minimum(diarize_df['start'], seg['start'])

        intersected = diarize_df[diarize_df["intersection"] > 0]

        speaker = None
        if len(intersected) > 0:
            # Choosing most strong intersection
            speaker = intersected.groupby("speaker")["intersection"].sum().sort_values(ascending=False).index[0]
        elif fill_nearest:
            # Otherwise choosing closest
            speaker = diarize_df.sort_values(by=["intersection"], ascending=False)["speaker"].values[0]

        if speaker is not None:
            seg["speaker"] = speaker

        # assign speaker to words
        if 'words' in seg:
            for word in seg['words']:
                if 'start' in word:
                    diarize_df['intersection'] = np.minimum(diarize_df['end'], word['end']) - np.maximum(
                        diarize_df['start'], word['start'])
                    diarize_df['union'] = np.maximum(diarize_df['end'], word['end']) - np.minimum(diarize_df['start'],
                                                                                                  word['start'])

                    intersected = diarize_df[diarize_df["intersection"] > 0]

                    word_speaker = None
                    if len(intersected) > 0:
                        # Choosing most strong intersection
                        word_speaker = \
                            intersected.groupby("speaker")["intersection"].sum().sort_values(ascending=False).index[0]
                    elif fill_nearest:
                        # Otherwise choosing closest
                        word_speaker = diarize_df.sort_values(by=["intersection"], ascending=False)["speaker"].values[0]

                    if word_speaker is not None:
                        word["speaker"] = word_speaker

    return transcript_result


class Segment:
    def __init__(self, start, end, speaker=None):
        self.start = start
        self.end = end
        self.speaker = speaker
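A short usage sketch of the pipeline above. The Hugging Face token and audio file name are placeholders, and downloading pyannote/speaker-diarization-3.1 requires accepting its terms first:

# Hypothetical usage; "hf_xxx" and "meeting.wav" are placeholders.
pipeline = DiarizationPipeline(use_auth_token="hf_xxx", device="cpu")
diarize_df = pipeline("meeting.wav", min_speakers=2, max_speakers=4)   # DataFrame with start/end/speaker columns

transcript = {"segments": [{"start": 0.0, "end": 2.5, "text": "hello there"}]}
result = assign_word_speakers(diarize_df, transcript)
print(result["segments"][0].get("speaker"))   # e.g. "SPEAKER_00" when an overlap is found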
modules/diarize/diarizer.py
ADDED
@@ -0,0 +1,132 @@
import os
import torch
from typing import List, Union, BinaryIO, Optional
import numpy as np
import time
import logging

from modules.diarize.diarize_pipeline import DiarizationPipeline, assign_word_speakers
from modules.diarize.audio_loader import load_audio


class Diarizer:
    def __init__(self,
                 model_dir: str = os.path.join("models", "Diarization")
                 ):
        self.device = self.get_device()
        self.available_device = self.get_available_device()
        self.compute_type = "float16"
        self.model_dir = model_dir
        os.makedirs(self.model_dir, exist_ok=True)
        self.pipe = None

    def run(self,
            audio: Union[str, BinaryIO, np.ndarray],
            transcribed_result: List[dict],
            use_auth_token: str,
            device: Optional[str] = None
            ):
        """
        Diarize transcribed result as a post-processing

        Parameters
        ----------
        audio: Union[str, BinaryIO, np.ndarray]
            Audio input. This can be file path or binary type.
        transcribed_result: List[dict]
            transcribed result through whisper.
        use_auth_token: str
            Huggingface token with READ permission. This is only needed the first time you download the model.
            You must manually go to the website https://huggingface.co/pyannote/speaker-diarization-3.1 and agree to their TOS to download the model.
        device: Optional[str]
            Device for diarization.

        Returns
        ----------
        segments_result: List[dict]
            list of dicts that includes start, end timestamps and transcribed text
        elapsed_time: float
            elapsed time for running
        """
        start_time = time.time()

        if device is None:
            device = self.device

        if device != self.device or self.pipe is None:
            self.update_pipe(
                device=device,
                use_auth_token=use_auth_token
            )

        audio = load_audio(audio)

        diarization_segments = self.pipe(audio)
        diarized_result = assign_word_speakers(
            diarization_segments,
            {"segments": transcribed_result}
        )

        for segment in diarized_result["segments"]:
            speaker = "None"
            if "speaker" in segment:
                speaker = segment["speaker"]
            segment["text"] = speaker + "|" + segment["text"].strip()

        elapsed_time = time.time() - start_time
        return diarized_result["segments"], elapsed_time

    def update_pipe(self,
                    use_auth_token: str,
                    device: str
                    ):
        """
        Set pipeline for diarization

        Parameters
        ----------
        use_auth_token: str
            Huggingface token with READ permission. This is only needed the first time you download the model.
            You must manually go to the website https://huggingface.co/pyannote/speaker-diarization-3.1 and agree to their TOS to download the model.
        device: str
            Device for diarization.
        """
        self.device = device

        os.makedirs(self.model_dir, exist_ok=True)

        if (not os.listdir(self.model_dir) and
                not use_auth_token):
            print(
                "\nFailed to diarize. You need huggingface token and agree to their requirements to download the diarization model.\n"
                "Go to \"https://huggingface.co/pyannote/speaker-diarization-3.1\" and follow their instructions to download the model.\n"
            )
            return

        logger = logging.getLogger("speechbrain.utils.train_logger")
        # Disable redundant torchvision warning message
        logger.disabled = True
        self.pipe = DiarizationPipeline(
            use_auth_token=use_auth_token,
            device=device,
            cache_dir=self.model_dir
        )
        logger.disabled = False

    @staticmethod
    def get_device():
        if torch.cuda.is_available():
            return "cuda"
        elif torch.backends.mps.is_available():
            return "mps"
        else:
            return "cpu"

    @staticmethod
    def get_available_device():
        devices = ["cpu"]
        if torch.cuda.is_available():
            devices.append("cuda")
        elif torch.backends.mps.is_available():
            devices.append("mps")
        return devices
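A minimal sketch of how Diarizer.run() post-processes a Whisper-style segment list. The token, file name, and segment contents are placeholders:

# Hypothetical usage; the token and segment values are made up.
diarizer = Diarizer()
segments = [{"start": 0.0, "end": 3.2, "text": " Hello, everyone."}]
diarized, elapsed = diarizer.run(
    audio="meeting.wav",
    transcribed_result=segments,
    use_auth_token="hf_xxx",
)
print(diarized[0]["text"])    # e.g. "SPEAKER_00|Hello, everyone."
print(f"took {elapsed:.1f}s")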
modules/translation/__init__.py
ADDED
File without changes
modules/translation/deepl_api.py
ADDED
@@ -0,0 +1,201 @@
import requests
import time
import os
from datetime import datetime
import gradio as gr

from modules.utils.subtitle_manager import *

"""
This is written with reference to the DeepL API documentation.
If you want to know the information of the DeepL API, see here: https://www.deepl.com/docs-api/documents
"""

DEEPL_AVAILABLE_TARGET_LANGS = {
    'Bulgarian': 'BG',
    'Czech': 'CS',
    'Danish': 'DA',
    'German': 'DE',
    'Greek': 'EL',
    'English': 'EN',
    'English (British)': 'EN-GB',
    'English (American)': 'EN-US',
    'Spanish': 'ES',
    'Estonian': 'ET',
    'Finnish': 'FI',
    'French': 'FR',
    'Hungarian': 'HU',
    'Indonesian': 'ID',
    'Italian': 'IT',
    'Japanese': 'JA',
    'Korean': 'KO',
    'Lithuanian': 'LT',
    'Latvian': 'LV',
    'Norwegian (Bokmål)': 'NB',
    'Dutch': 'NL',
    'Polish': 'PL',
    'Portuguese': 'PT',
    'Portuguese (Brazilian)': 'PT-BR',
    'Portuguese (all Portuguese varieties excluding Brazilian Portuguese)': 'PT-PT',
    'Romanian': 'RO',
    'Russian': 'RU',
    'Slovak': 'SK',
    'Slovenian': 'SL',
    'Swedish': 'SV',
    'Turkish': 'TR',
    'Ukrainian': 'UK',
    'Chinese (simplified)': 'ZH'
}

DEEPL_AVAILABLE_SOURCE_LANGS = {
    'Automatic Detection': None,
    'Bulgarian': 'BG',
    'Czech': 'CS',
    'Danish': 'DA',
    'German': 'DE',
    'Greek': 'EL',
    'English': 'EN',
    'Spanish': 'ES',
    'Estonian': 'ET',
    'Finnish': 'FI',
    'French': 'FR',
    'Hungarian': 'HU',
    'Indonesian': 'ID',
    'Italian': 'IT',
    'Japanese': 'JA',
    'Korean': 'KO',
    'Lithuanian': 'LT',
    'Latvian': 'LV',
    'Norwegian (Bokmål)': 'NB',
    'Dutch': 'NL',
    'Polish': 'PL',
    'Portuguese (all Portuguese varieties mixed)': 'PT',
    'Romanian': 'RO',
    'Russian': 'RU',
    'Slovak': 'SK',
    'Slovenian': 'SL',
    'Swedish': 'SV',
    'Turkish': 'TR',
    'Ukrainian': 'UK',
    'Chinese': 'ZH'
}


class DeepLAPI:
    def __init__(self,
                 output_dir: str = os.path.join("outputs", "translations")
                 ):
        self.api_interval = 1
        self.max_text_batch_size = 50
        self.available_target_langs = DEEPL_AVAILABLE_TARGET_LANGS
        self.available_source_langs = DEEPL_AVAILABLE_SOURCE_LANGS
        self.output_dir = output_dir

    def translate_deepl(self,
                        auth_key: str,
                        fileobjs: list,
                        source_lang: str,
                        target_lang: str,
                        is_pro: bool,
                        add_timestamp: bool,
                        progress=gr.Progress()) -> list:
        """
        Translate subtitle files using DeepL API
        Parameters
        ----------
        auth_key: str
            API Key for DeepL from gr.Textbox()
        fileobjs: list
            List of files to transcribe from gr.Files()
        source_lang: str
            Source language of the file to transcribe from gr.Dropdown()
        target_lang: str
            Target language of the file to transcribe from gr.Dropdown()
        is_pro: str
            Boolean value that is about pro user or not from gr.Checkbox().
        add_timestamp: bool
            Boolean value from gr.Checkbox() that determines whether to add a timestamp at the end of the filename.
        progress: gr.Progress
            Indicator to show progress directly in gradio.

        Returns
        ----------
        A List of
        String to return to gr.Textbox()
        Files to return to gr.Files()
        """

        files_info = {}
        for fileobj in fileobjs:
            file_path = fileobj.name
            file_name, file_ext = os.path.splitext(os.path.basename(fileobj.name))

            if file_ext == ".srt":
                parsed_dicts = parse_srt(file_path=file_path)

                batch_size = self.max_text_batch_size
                for batch_start in range(0, len(parsed_dicts), batch_size):
                    batch_end = min(batch_start + batch_size, len(parsed_dicts))
                    sentences_to_translate = [dic["sentence"] for dic in parsed_dicts[batch_start:batch_end]]
                    translated_texts = self.request_deepl_translate(auth_key, sentences_to_translate, source_lang,
                                                                    target_lang, is_pro)
                    for i, translated_text in enumerate(translated_texts):
                        parsed_dicts[batch_start + i]["sentence"] = translated_text["text"]
                    progress(batch_end / len(parsed_dicts), desc="Translating..")

                subtitle = get_serialized_srt(parsed_dicts)

            elif file_ext == ".vtt":
                parsed_dicts = parse_vtt(file_path=file_path)

                batch_size = self.max_text_batch_size
                for batch_start in range(0, len(parsed_dicts), batch_size):
                    batch_end = min(batch_start + batch_size, len(parsed_dicts))
                    sentences_to_translate = [dic["sentence"] for dic in parsed_dicts[batch_start:batch_end]]
                    translated_texts = self.request_deepl_translate(auth_key, sentences_to_translate, source_lang,
                                                                    target_lang, is_pro)
                    for i, translated_text in enumerate(translated_texts):
                        parsed_dicts[batch_start + i]["sentence"] = translated_text["text"]
                    progress(batch_end / len(parsed_dicts), desc="Translating..")

                subtitle = get_serialized_vtt(parsed_dicts)

            if add_timestamp:
                timestamp = datetime.now().strftime("%m%d%H%M%S")
                file_name += f"-{timestamp}"

            output_path = os.path.join(self.output_dir, f"{file_name}{file_ext}")
            write_file(subtitle, output_path)

            files_info[file_name] = {"subtitle": subtitle, "path": output_path}

        total_result = ''
        for file_name, info in files_info.items():
            total_result += '------------------------------------\n'
            total_result += f'{file_name}\n\n'
            total_result += f'{info["subtitle"]}'
        gr_str = f"Done! Subtitle is in the outputs/translation folder.\n\n{total_result}"

        output_file_paths = [item["path"] for key, item in files_info.items()]
        return [gr_str, output_file_paths]

    def request_deepl_translate(self,
                                auth_key: str,
                                text: list,
                                source_lang: str,
                                target_lang: str,
                                is_pro: bool):
        """Request API response to DeepL server"""

        url = 'https://api.deepl.com/v2/translate' if is_pro else 'https://api-free.deepl.com/v2/translate'
        headers = {
            'Authorization': f'DeepL-Auth-Key {auth_key}'
        }
        data = {
            'text': text,
            'source_lang': DEEPL_AVAILABLE_SOURCE_LANGS[source_lang],
            'target_lang': DEEPL_AVAILABLE_TARGET_LANGS[target_lang]
        }
        response = requests.post(url, headers=headers, data=data).json()
        time.sleep(self.api_interval)
        return response["translations"]
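A minimal sketch of the low-level request helper above, outside the Gradio UI. The API key is a placeholder and a free-tier key is assumed (is_pro=False selects the api-free endpoint):

# Hypothetical usage; "your-deepl-key" is a placeholder.
deepl = DeepLAPI()
translations = deepl.request_deepl_translate(
    auth_key="your-deepl-key",
    text=["Hello world", "How are you?"],
    source_lang="English",
    target_lang="German",
    is_pro=False,
)
print(translations[0]["text"])   # each entry is a dict with a "text" field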
modules/translation/nllb_inference.py
ADDED
@@ -0,0 +1,276 @@
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
import gradio as gr
import os

from modules.translation.translation_base import TranslationBase


class NLLBInference(TranslationBase):
    def __init__(self,
                 model_dir: str = os.path.join("models", "NLLB"),
                 output_dir: str = os.path.join("outputs", "translations")
                 ):
        super().__init__(
            model_dir=model_dir,
            output_dir=output_dir
        )
        self.tokenizer = None
        self.available_models = ["facebook/nllb-200-3.3B", "facebook/nllb-200-1.3B", "facebook/nllb-200-distilled-600M"]
        self.available_source_langs = list(NLLB_AVAILABLE_LANGS.keys())
        self.available_target_langs = list(NLLB_AVAILABLE_LANGS.keys())
        self.pipeline = None

    def translate(self,
                  text: str,
                  max_length: int
                  ):
        result = self.pipeline(
            text,
            max_length=max_length
        )
        return result[0]['translation_text']

    def update_model(self,
                     model_size: str,
                     src_lang: str,
                     tgt_lang: str,
                     progress: gr.Progress
                     ):
        if model_size != self.current_model_size or self.model is None:
            print("\nInitializing NLLB Model..\n")
            progress(0, desc="Initializing NLLB Model..")
            self.current_model_size = model_size
            local_files_only = self.is_model_exists(self.current_model_size)
            self.model = AutoModelForSeq2SeqLM.from_pretrained(pretrained_model_name_or_path=model_size,
                                                               cache_dir=self.model_dir,
                                                               local_files_only=local_files_only)
            self.tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=model_size,
                                                           cache_dir=os.path.join(self.model_dir, "tokenizers"),
                                                           local_files_only=local_files_only)
        src_lang = NLLB_AVAILABLE_LANGS[src_lang]
        tgt_lang = NLLB_AVAILABLE_LANGS[tgt_lang]
        self.pipeline = pipeline("translation",
                                 model=self.model,
                                 tokenizer=self.tokenizer,
                                 src_lang=src_lang,
                                 tgt_lang=tgt_lang,
                                 device=self.device)

    def is_model_exists(self,
                        model_size: str):
        """Check if model exists or not (Only facebook model)"""
        prefix = "models--facebook--"
        _id, model_size_name = model_size.split("/")
        model_dir_name = prefix + model_size_name
        model_dir_path = os.path.join(self.model_dir, model_dir_name)
        if os.path.exists(model_dir_path) and os.listdir(model_dir_path):
            return True
        return False


NLLB_AVAILABLE_LANGS = {
    "Acehnese (Arabic script)": "ace_Arab",
    "Acehnese (Latin script)": "ace_Latn",
    "Mesopotamian Arabic": "acm_Arab",
    "Ta’izzi-Adeni Arabic": "acq_Arab",
    "Tunisian Arabic": "aeb_Arab",
    "Afrikaans": "afr_Latn",
    "South Levantine Arabic": "ajp_Arab",
    "Akan": "aka_Latn",
    "Amharic": "amh_Ethi",
    "North Levantine Arabic": "apc_Arab",
    "Modern Standard Arabic": "arb_Arab",
    "Modern Standard Arabic (Romanized)": "arb_Latn",
    "Najdi Arabic": "ars_Arab",
    "Moroccan Arabic": "ary_Arab",
    "Egyptian Arabic": "arz_Arab",
    "Assamese": "asm_Beng",
    "Asturian": "ast_Latn",
    "Awadhi": "awa_Deva",
    "Central Aymara": "ayr_Latn",
    "South Azerbaijani": "azb_Arab",
    "North Azerbaijani": "azj_Latn",
    "Bashkir": "bak_Cyrl",
    "Bambara": "bam_Latn",
    "Balinese": "ban_Latn",
    "Belarusian": "bel_Cyrl",
    "Bemba": "bem_Latn",
    "Bengali": "ben_Beng",
    "Bhojpuri": "bho_Deva",
    "Banjar (Arabic script)": "bjn_Arab",
    "Banjar (Latin script)": "bjn_Latn",
    "Standard Tibetan": "bod_Tibt",
    "Bosnian": "bos_Latn",
    "Buginese": "bug_Latn",
    "Bulgarian": "bul_Cyrl",
    "Catalan": "cat_Latn",
    "Cebuano": "ceb_Latn",
    "Czech": "ces_Latn",
    "Chokwe": "cjk_Latn",
    "Central Kurdish": "ckb_Arab",
    "Crimean Tatar": "crh_Latn",
    "Welsh": "cym_Latn",
    "Danish": "dan_Latn",
    "German": "deu_Latn",
    "Southwestern Dinka": "dik_Latn",
    "Dyula": "dyu_Latn",
    "Dzongkha": "dzo_Tibt",
    "Greek": "ell_Grek",
    "English": "eng_Latn",
    "Esperanto": "epo_Latn",
    "Estonian": "est_Latn",
    "Basque": "eus_Latn",
    "Ewe": "ewe_Latn",
    "Faroese": "fao_Latn",
    "Fijian": "fij_Latn",
    "Finnish": "fin_Latn",
    "Fon": "fon_Latn",
    "French": "fra_Latn",
    "Friulian": "fur_Latn",
    "Nigerian Fulfulde": "fuv_Latn",
    "Scottish Gaelic": "gla_Latn",
    "Irish": "gle_Latn",
    "Galician": "glg_Latn",
    "Guarani": "grn_Latn",
    "Gujarati": "guj_Gujr",
    "Haitian Creole": "hat_Latn",
    "Hausa": "hau_Latn",
    "Hebrew": "heb_Hebr",
    "Hindi": "hin_Deva",
    "Chhattisgarhi": "hne_Deva",
    "Croatian": "hrv_Latn",
    "Hungarian": "hun_Latn",
    "Armenian": "hye_Armn",
    "Igbo": "ibo_Latn",
    "Ilocano": "ilo_Latn",
    "Indonesian": "ind_Latn",
    "Icelandic": "isl_Latn",
    "Italian": "ita_Latn",
    "Javanese": "jav_Latn",
    "Japanese": "jpn_Jpan",
    "Kabyle": "kab_Latn",
    "Jingpho": "kac_Latn",
    "Kamba": "kam_Latn",
    "Kannada": "kan_Knda",
    "Kashmiri (Arabic script)": "kas_Arab",
    "Kashmiri (Devanagari script)": "kas_Deva",
    "Georgian": "kat_Geor",
    "Central Kanuri (Arabic script)": "knc_Arab",
    "Central Kanuri (Latin script)": "knc_Latn",
    "Kazakh": "kaz_Cyrl",
    "Kabiyè": "kbp_Latn",
    "Kabuverdianu": "kea_Latn",
    "Khmer": "khm_Khmr",
    "Kikuyu": "kik_Latn",
    "Kinyarwanda": "kin_Latn",
    "Kyrgyz": "kir_Cyrl",
    "Kimbundu": "kmb_Latn",
    "Northern Kurdish": "kmr_Latn",
    "Kikongo": "kon_Latn",
    "Korean": "kor_Hang",
    "Lao": "lao_Laoo",
    "Ligurian": "lij_Latn",
    "Limburgish": "lim_Latn",
    "Lingala": "lin_Latn",
    "Lithuanian": "lit_Latn",
    "Lombard": "lmo_Latn",
    "Latgalian": "ltg_Latn",
    "Luxembourgish": "ltz_Latn",
    "Luba-Kasai": "lua_Latn",
    "Ganda": "lug_Latn",
    "Luo": "luo_Latn",
    "Mizo": "lus_Latn",
    "Standard Latvian": "lvs_Latn",
    "Magahi": "mag_Deva",
    "Maithili": "mai_Deva",
    "Malayalam": "mal_Mlym",
    "Marathi": "mar_Deva",
    "Minangkabau (Arabic script)": "min_Arab",
    "Minangkabau (Latin script)": "min_Latn",
    "Macedonian": "mkd_Cyrl",
    "Plateau Malagasy": "plt_Latn",
    "Maltese": "mlt_Latn",
    "Meitei (Bengali script)": "mni_Beng",
    "Halh Mongolian": "khk_Cyrl",
    "Mossi": "mos_Latn",
    "Maori": "mri_Latn",
    "Burmese": "mya_Mymr",
    "Dutch": "nld_Latn",
    "Norwegian Nynorsk": "nno_Latn",
    "Norwegian Bokmål": "nob_Latn",
    "Nepali": "npi_Deva",
    "Northern Sotho": "nso_Latn",
    "Nuer": "nus_Latn",
    "Nyanja": "nya_Latn",
    "Occitan": "oci_Latn",
    "West Central Oromo": "gaz_Latn",
    "Odia": "ory_Orya",
    "Pangasinan": "pag_Latn",
    "Eastern Panjabi": "pan_Guru",
    "Papiamento": "pap_Latn",
    "Western Persian": "pes_Arab",
    "Polish": "pol_Latn",
    "Portuguese": "por_Latn",
    "Dari": "prs_Arab",
    "Southern Pashto": "pbt_Arab",
    "Ayacucho Quechua": "quy_Latn",
    "Romanian": "ron_Latn",
    "Rundi": "run_Latn",
    "Russian": "rus_Cyrl",
    "Sango": "sag_Latn",
    "Sanskrit": "san_Deva",
    "Santali": "sat_Olck",
    "Sicilian": "scn_Latn",
    "Shan": "shn_Mymr",
    "Sinhala": "sin_Sinh",
    "Slovak": "slk_Latn",
    "Slovenian": "slv_Latn",
    "Samoan": "smo_Latn",
    "Shona": "sna_Latn",
    "Sindhi": "snd_Arab",
    "Somali": "som_Latn",
    "Southern Sotho": "sot_Latn",
    "Spanish": "spa_Latn",
    "Tosk Albanian": "als_Latn",
    "Sardinian": "srd_Latn",
    "Serbian": "srp_Cyrl",
    "Swati": "ssw_Latn",
    "Sundanese": "sun_Latn",
    "Swedish": "swe_Latn",
    "Swahili": "swh_Latn",
    "Silesian": "szl_Latn",
    "Tamil": "tam_Taml",
    "Tatar": "tat_Cyrl",
    "Telugu": "tel_Telu",
    "Tajik": "tgk_Cyrl",
    "Tagalog": "tgl_Latn",
    "Thai": "tha_Thai",
    "Tigrinya": "tir_Ethi",
    "Tamasheq (Latin script)": "taq_Latn",
    "Tamasheq (Tifinagh script)": "taq_Tfng",
    "Tok Pisin": "tpi_Latn",
    "Tswana": "tsn_Latn",
    "Tsonga": "tso_Latn",
    "Turkmen": "tuk_Latn",
    "Tumbuka": "tum_Latn",
    "Turkish": "tur_Latn",
    "Twi": "twi_Latn",
    "Central Atlas Tamazight": "tzm_Tfng",
    "Uyghur": "uig_Arab",
    "Ukrainian": "ukr_Cyrl",
    "Umbundu": "umb_Latn",
    "Urdu": "urd_Arab",
    "Northern Uzbek": "uzn_Latn",
    "Venetian": "vec_Latn",
    "Vietnamese": "vie_Latn",
    "Waray": "war_Latn",
    "Wolof": "wol_Latn",
    "Xhosa": "xho_Latn",
    "Eastern Yiddish": "ydd_Hebr",
    "Yoruba": "yor_Latn",
    "Yue Chinese": "yue_Hant",
    "Chinese (Simplified)": "zho_Hans",
    "Chinese (Traditional)": "zho_Hant",
    "Standard Malay": "zsm_Latn",
    "Zulu": "zul_Latn",
}
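A rough usage sketch of NLLBInference outside the Gradio app. The model choice and max_length are assumptions, and passing gr.Progress() outside a Gradio event is an assumption that may only print rather than render a progress bar:

# Hypothetical usage sketch; parameters are assumptions, not the app's defaults.
import gradio as gr

nllb = NLLBInference()
nllb.update_model(
    model_size="facebook/nllb-200-distilled-600M",
    src_lang="English",
    tgt_lang="Korean",
    progress=gr.Progress(),
)
print(nllb.translate("Subtitles make videos accessible.", max_length=200))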
modules/translation/translation_base.py
ADDED
@@ -0,0 +1,151 @@
import os
import torch
import gradio as gr
from abc import ABC, abstractmethod
from typing import List
from datetime import datetime

from modules.whisper.whisper_parameter import *
from modules.utils.subtitle_manager import *


class TranslationBase(ABC):
    def __init__(self,
                 model_dir: str = os.path.join("models", "NLLB"),
                 output_dir: str = os.path.join("outputs", "translations")
                 ):
        super().__init__()
        self.model = None
        self.model_dir = model_dir
        self.output_dir = output_dir
        os.makedirs(self.model_dir, exist_ok=True)
        os.makedirs(self.output_dir, exist_ok=True)
        self.current_model_size = None
        self.device = self.get_device()

    @abstractmethod
    def translate(self,
                  text: str,
                  max_length: int
                  ):
        pass

    @abstractmethod
    def update_model(self,
                     model_size: str,
                     src_lang: str,
                     tgt_lang: str,
                     progress: gr.Progress
                     ):
        pass

    def translate_file(self,
                       fileobjs: list,
                       model_size: str,
                       src_lang: str,
                       tgt_lang: str,
                       max_length: int,
                       add_timestamp: bool,
                       progress=gr.Progress()) -> list:
        """
        Translate subtitle file from source language to target language

        Parameters
        ----------
        fileobjs: list
            List of files to transcribe from gr.Files()
        model_size: str
            Whisper model size from gr.Dropdown()
        src_lang: str
            Source language of the file to translate from gr.Dropdown()
        tgt_lang: str
            Target language of the file to translate from gr.Dropdown()
        max_length: int
            Max length per line to translate
        add_timestamp: bool
            Boolean value from gr.Checkbox() that determines whether to add a timestamp at the end of the filename.
        progress: gr.Progress
            Indicator to show progress directly in gradio.
            I use a forked version of whisper for this. To see more info : https://github.com/jhj0517/jhj0517-whisper/tree/add-progress-callback

        Returns
        ----------
        A List of
        String to return to gr.Textbox()
        Files to return to gr.Files()
        """
        try:
            self.update_model(model_size=model_size,
                              src_lang=src_lang,
                              tgt_lang=tgt_lang,
                              progress=progress)

            files_info = {}
            for fileobj in fileobjs:
                file_path = fileobj.name
                file_name, file_ext = os.path.splitext(os.path.basename(fileobj.name))
                if file_ext == ".srt":
                    parsed_dicts = parse_srt(file_path=file_path)
                    total_progress = len(parsed_dicts)
                    for index, dic in enumerate(parsed_dicts):
                        progress(index / total_progress, desc="Translating..")
                        translated_text = self.translate(dic["sentence"], max_length=max_length)
                        dic["sentence"] = translated_text
                    subtitle = get_serialized_srt(parsed_dicts)

                elif file_ext == ".vtt":
                    parsed_dicts = parse_vtt(file_path=file_path)
                    total_progress = len(parsed_dicts)
                    for index, dic in enumerate(parsed_dicts):
                        progress(index / total_progress, desc="Translating..")
                        translated_text = self.translate(dic["sentence"], max_length=max_length)
                        dic["sentence"] = translated_text
                    subtitle = get_serialized_vtt(parsed_dicts)

                if add_timestamp:
                    timestamp = datetime.now().strftime("%m%d%H%M%S")
                    file_name += f"-{timestamp}"

                output_path = os.path.join(self.output_dir, f"{file_name}{file_ext}")
                write_file(subtitle, output_path)

                files_info[file_name] = {"subtitle": subtitle, "path": output_path}

            total_result = ''
            for file_name, info in files_info.items():
                total_result += '------------------------------------\n'
                total_result += f'{file_name}\n\n'
                total_result += f'{info["subtitle"]}'
            gr_str = f"Done! Subtitle is in the outputs/translation folder.\n\n{total_result}"

            output_file_paths = [item["path"] for key, item in files_info.items()]
            return [gr_str, output_file_paths]

        except Exception as e:
            print(f"Error: {str(e)}")
        finally:
            self.release_cuda_memory()

    @staticmethod
    def get_device():
        if torch.cuda.is_available():
            return "cuda"
        elif torch.backends.mps.is_available():
            return "mps"
        else:
            return "cpu"

    @staticmethod
    def release_cuda_memory():
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
            torch.cuda.reset_max_memory_allocated()

    @staticmethod
    def remove_input_files(file_paths: List[str]):
        if not file_paths:
            return

        for file_path in file_paths:
            if file_path and os.path.exists(file_path):
                os.remove(file_path)
modules/utils/__init__.py
ADDED
File without changes

modules/utils/__pycache__/__init__.cpython-310.pyc
ADDED
Binary file (173 Bytes)

modules/utils/__pycache__/files_manager.cpython-310.pyc
ADDED
Binary file (1.43 kB)

modules/utils/__pycache__/subtitle_manager.cpython-310.pyc
ADDED
Binary file (3.38 kB)

modules/utils/__pycache__/youtube_manager.cpython-310.pyc
ADDED
Binary file (748 Bytes)
modules/utils/files_manager.py
ADDED
@@ -0,0 +1,39 @@
import os
import fnmatch

from gradio.utils import NamedString


def get_media_files(folder_path, include_sub_directory=False):
    video_extensions = ['*.mp4', '*.mkv', '*.flv', '*.avi', '*.mov', '*.wmv']
    audio_extensions = ['*.mp3', '*.wav', '*.aac', '*.flac', '*.ogg', '*.m4a']
    media_extensions = video_extensions + audio_extensions

    media_files = []

    if include_sub_directory:
        for root, _, files in os.walk(folder_path):
            for extension in media_extensions:
                media_files.extend(
                    os.path.join(root, file) for file in fnmatch.filter(files, extension)
                    if os.path.exists(os.path.join(root, file))
                )
    else:
        for extension in media_extensions:
            media_files.extend(
                os.path.join(folder_path, file) for file in fnmatch.filter(os.listdir(folder_path), extension)
                if os.path.isfile(os.path.join(folder_path, file)) and os.path.exists(os.path.join(folder_path, file))
            )

    return media_files


def format_gradio_files(files: list):
    if not files:
        return files

    gradio_files = []
    for file in files:
        gradio_files.append(NamedString(file))
    return gradio_files
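A quick sketch of how these two helpers chain together; the "inputs" folder name is a placeholder:

# Hypothetical usage; "inputs" is a placeholder folder.
media = get_media_files("inputs", include_sub_directory=True)
gradio_files = format_gradio_files(media)   # wrap plain paths so Gradio file components accept them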
modules/utils/subtitle_manager.py
ADDED
@@ -0,0 +1,135 @@
import re


def timeformat_srt(time):
    hours = time // 3600
    minutes = (time - hours * 3600) // 60
    seconds = time - hours * 3600 - minutes * 60
    milliseconds = (time - int(time)) * 1000
    return f"{int(hours):02d}:{int(minutes):02d}:{int(seconds):02d},{int(milliseconds):03d}"


def timeformat_vtt(time):
    hours = time // 3600
    minutes = (time - hours * 3600) // 60
    seconds = time - hours * 3600 - minutes * 60
    milliseconds = (time - int(time)) * 1000
    return f"{int(hours):02d}:{int(minutes):02d}:{int(seconds):02d}.{int(milliseconds):03d}"


def write_file(subtitle, output_file):
    with open(output_file, 'w', encoding='utf-8') as f:
        f.write(subtitle)


def get_srt(segments):
    output = ""
    for i, segment in enumerate(segments):
        output += f"{i + 1}\n"
        output += f"{timeformat_srt(segment['start'])} --> {timeformat_srt(segment['end'])}\n"
        if segment['text'].startswith(' '):
            segment['text'] = segment['text'][1:]
        output += f"{segment['text']}\n\n"
    return output


def get_vtt(segments):
    output = "WebVTT\n\n"
    for i, segment in enumerate(segments):
        output += f"{i + 1}\n"
        output += f"{timeformat_vtt(segment['start'])} --> {timeformat_vtt(segment['end'])}\n"
        if segment['text'].startswith(' '):
            segment['text'] = segment['text'][1:]
        output += f"{segment['text']}\n\n"
    return output


def get_txt(segments):
    output = ""
    for i, segment in enumerate(segments):
        if segment['text'].startswith(' '):
            segment['text'] = segment['text'][1:]
        output += f"{segment['text']}\n"
    return output


def parse_srt(file_path):
    """Reads SRT file and returns as dict"""
    with open(file_path, 'r', encoding='utf-8') as file:
        srt_data = file.read()

    data = []
    blocks = srt_data.split('\n\n')

    for block in blocks:
        if block.strip() != '':
            lines = block.strip().split('\n')
            index = lines[0]
            timestamp = lines[1]
            sentence = ' '.join(lines[2:])

            data.append({
                "index": index,
                "timestamp": timestamp,
                "sentence": sentence
            })
    return data


def parse_vtt(file_path):
    """Reads WebVTT file and returns as dict"""
    with open(file_path, 'r', encoding='utf-8') as file:
        webvtt_data = file.read()

    data = []
    blocks = webvtt_data.split('\n\n')

    for block in blocks:
        if block.strip() != '' and not block.strip().startswith("WebVTT"):
            lines = block.strip().split('\n')
            index = lines[0]
            timestamp = lines[1]
            sentence = ' '.join(lines[2:])

            data.append({
                "index": index,
                "timestamp": timestamp,
                "sentence": sentence
            })

    return data


def get_serialized_srt(dicts):
    output = ""
    for dic in dicts:
        output += f'{dic["index"]}\n'
        output += f'{dic["timestamp"]}\n'
        output += f'{dic["sentence"]}\n\n'
    return output


def get_serialized_vtt(dicts):
    output = "WebVTT\n\n"
    for dic in dicts:
        output += f'{dic["index"]}\n'
        output += f'{dic["timestamp"]}\n'
        output += f'{dic["sentence"]}\n\n'
    return output


def safe_filename(name):
    from app import _args
    INVALID_FILENAME_CHARS = r'[<>:"/\\|?*\x00-\x1f]'
    safe_name = re.sub(INVALID_FILENAME_CHARS, '_', name)
    if not _args.colab:
        return safe_name
    # Truncate the filename if it exceeds the max_length (20)
    if len(safe_name) > 20:
        file_extension = safe_name.split('.')[-1]
        if len(file_extension) + 1 < 20:
            truncated_name = safe_name[:20 - len(file_extension) - 1]
            safe_name = truncated_name + '.' + file_extension
        else:
            safe_name = safe_name[:20]
    return safe_name
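A minimal sketch of the segment-to-SRT round trip these helpers implement; the segment values are made up for illustration:

# Sketch only: write segments as SRT, then parse and re-serialize them.
segments = [
    {"start": 0.0, "end": 2.5, "text": " Hello there."},
    {"start": 2.5, "end": 4.0, "text": " Welcome back."},
]
srt_text = get_srt(segments)                  # numbered cues with "HH:MM:SS,mmm --> ..." lines
write_file(srt_text, "example.srt")
parsed = parse_srt("example.srt")             # list of {"index", "timestamp", "sentence"} dicts
assert get_serialized_srt(parsed).strip() == srt_text.strip()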
modules/utils/youtube_manager.py
ADDED
@@ -0,0 +1,15 @@
from pytubefix import YouTube
import os


def get_ytdata(link):
    return YouTube(link)


def get_ytmetas(link):
    yt = YouTube(link)
    return yt.thumbnail_url, yt.title, yt.description


def get_ytaudio(ytdata: YouTube):
    return ytdata.streams.get_audio_only().download(filename=os.path.join("modules", "yt_tmp.wav"))
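A quick sketch of chaining these helpers (not part of the commit; the video URL is hypothetical):

from modules.utils.youtube_manager import get_ytdata, get_ytmetas, get_ytaudio

link = "https://www.youtube.com/watch?v=XXXXXXXXXXX"   # hypothetical URL
thumbnail_url, title, description = get_ytmetas(link)

yt = get_ytdata(link)
audio_path = get_ytaudio(yt)   # downloads the audio-only stream to modules/yt_tmp.wav
print(title, audio_path)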
modules/vad/__init__.py
ADDED
File without changes
modules/vad/silero_vad.py
ADDED
@@ -0,0 +1,264 @@
# Adapted from https://github.com/SYSTRAN/faster-whisper/blob/master/faster_whisper/vad.py

from faster_whisper.vad import VadOptions, get_vad_model
import numpy as np
from typing import BinaryIO, Union, List, Optional, Tuple
import warnings
import faster_whisper
from faster_whisper.transcribe import SpeechTimestampsMap, Segment
import gradio as gr


class SileroVAD:
    def __init__(self):
        self.sampling_rate = 16000
        self.window_size_samples = 512
        self.model = None

    def run(self,
            audio: Union[str, BinaryIO, np.ndarray],
            vad_parameters: VadOptions,
            progress: gr.Progress = gr.Progress()
            ) -> Tuple[np.ndarray, List[dict]]:
        """
        Run VAD

        Parameters
        ----------
        audio: Union[str, BinaryIO, np.ndarray]
            Audio path or file binary or Audio numpy array
        vad_parameters:
            Options for VAD processing.
        progress: gr.Progress
            Indicator to show progress directly in gradio.

        Returns
        ----------
        np.ndarray
            Pre-processed audio with VAD
        List[dict]
            Chunks of speeches to be used to restore the timestamps later
        """

        sampling_rate = self.sampling_rate

        if not isinstance(audio, np.ndarray):
            audio = faster_whisper.decode_audio(audio, sampling_rate=sampling_rate)

        duration = audio.shape[0] / sampling_rate
        duration_after_vad = duration

        if vad_parameters is None:
            vad_parameters = VadOptions()
        elif isinstance(vad_parameters, dict):
            vad_parameters = VadOptions(**vad_parameters)
        speech_chunks = self.get_speech_timestamps(
            audio=audio,
            vad_options=vad_parameters,
            progress=progress
        )
        audio = self.collect_chunks(audio, speech_chunks)
        duration_after_vad = audio.shape[0] / sampling_rate

        return audio, speech_chunks

    def get_speech_timestamps(
            self,
            audio: np.ndarray,
            vad_options: Optional[VadOptions] = None,
            progress: gr.Progress = gr.Progress(),
            **kwargs,
    ) -> List[dict]:
        """This method is used for splitting long audios into speech chunks using silero VAD.

        Args:
            audio: One dimensional float array.
            vad_options: Options for VAD processing.
            kwargs: VAD options passed as keyword arguments for backward compatibility.
            progress: Gradio progress to indicate progress.

        Returns:
            List of dicts containing begin and end samples of each speech chunk.
        """

        if self.model is None:
            self.update_model()

        if vad_options is None:
            vad_options = VadOptions(**kwargs)

        threshold = vad_options.threshold
        min_speech_duration_ms = vad_options.min_speech_duration_ms
        max_speech_duration_s = vad_options.max_speech_duration_s
        min_silence_duration_ms = vad_options.min_silence_duration_ms
        window_size_samples = self.window_size_samples
        speech_pad_ms = vad_options.speech_pad_ms
        sampling_rate = 16000
        min_speech_samples = sampling_rate * min_speech_duration_ms / 1000
        speech_pad_samples = sampling_rate * speech_pad_ms / 1000
        max_speech_samples = (
            sampling_rate * max_speech_duration_s
            - window_size_samples
            - 2 * speech_pad_samples
        )
        min_silence_samples = sampling_rate * min_silence_duration_ms / 1000
        min_silence_samples_at_max_speech = sampling_rate * 98 / 1000

        audio_length_samples = len(audio)

        state, context = self.model.get_initial_states(batch_size=1)

        speech_probs = []
        for current_start_sample in range(0, audio_length_samples, window_size_samples):
            progress(current_start_sample / audio_length_samples, desc="Detecting speeches only using VAD...")

            chunk = audio[current_start_sample: current_start_sample + window_size_samples]
            if len(chunk) < window_size_samples:
                chunk = np.pad(chunk, (0, int(window_size_samples - len(chunk))))
            speech_prob, state, context = self.model(chunk, state, context, sampling_rate)
            speech_probs.append(speech_prob)

        triggered = False
        speeches = []
        current_speech = {}
        neg_threshold = threshold - 0.15

        # to save potential segment end (and tolerate some silence)
        temp_end = 0
        # to save potential segment limits in case of maximum segment size reached
        prev_end = next_start = 0

        for i, speech_prob in enumerate(speech_probs):
            if (speech_prob >= threshold) and temp_end:
                temp_end = 0
                if next_start < prev_end:
                    next_start = window_size_samples * i

            if (speech_prob >= threshold) and not triggered:
                triggered = True
                current_speech["start"] = window_size_samples * i
                continue

            if (
                triggered
                and (window_size_samples * i) - current_speech["start"] > max_speech_samples
            ):
                if prev_end:
                    current_speech["end"] = prev_end
                    speeches.append(current_speech)
                    current_speech = {}
                    # previously reached silence (< neg_thres) and is still not speech (< thres)
                    if next_start < prev_end:
                        triggered = False
                    else:
                        current_speech["start"] = next_start
                    prev_end = next_start = temp_end = 0
                else:
                    current_speech["end"] = window_size_samples * i
                    speeches.append(current_speech)
                    current_speech = {}
                    prev_end = next_start = temp_end = 0
                    triggered = False
                    continue

            if (speech_prob < neg_threshold) and triggered:
                if not temp_end:
                    temp_end = window_size_samples * i
                # condition to avoid cutting in very short silence
                if (window_size_samples * i) - temp_end > min_silence_samples_at_max_speech:
                    prev_end = temp_end
                if (window_size_samples * i) - temp_end < min_silence_samples:
                    continue
                else:
                    current_speech["end"] = temp_end
                    if (
                        current_speech["end"] - current_speech["start"]
                    ) > min_speech_samples:
                        speeches.append(current_speech)
                    current_speech = {}
                    prev_end = next_start = temp_end = 0
                    triggered = False
                    continue

        if (
            current_speech
            and (audio_length_samples - current_speech["start"]) > min_speech_samples
        ):
            current_speech["end"] = audio_length_samples
            speeches.append(current_speech)

        for i, speech in enumerate(speeches):
            if i == 0:
                speech["start"] = int(max(0, speech["start"] - speech_pad_samples))
            if i != len(speeches) - 1:
                silence_duration = speeches[i + 1]["start"] - speech["end"]
                if silence_duration < 2 * speech_pad_samples:
                    speech["end"] += int(silence_duration // 2)
                    speeches[i + 1]["start"] = int(
                        max(0, speeches[i + 1]["start"] - silence_duration // 2)
                    )
                else:
                    speech["end"] = int(
                        min(audio_length_samples, speech["end"] + speech_pad_samples)
                    )
                    speeches[i + 1]["start"] = int(
                        max(0, speeches[i + 1]["start"] - speech_pad_samples)
                    )
            else:
                speech["end"] = int(
                    min(audio_length_samples, speech["end"] + speech_pad_samples)
                )

        return speeches

    def update_model(self):
        self.model = get_vad_model()

    @staticmethod
    def collect_chunks(audio: np.ndarray, chunks: List[dict]) -> np.ndarray:
        """Collects and concatenates audio chunks."""
        if not chunks:
            return np.array([], dtype=np.float32)

        return np.concatenate([audio[chunk["start"]: chunk["end"]] for chunk in chunks])

    @staticmethod
    def format_timestamp(
        seconds: float,
        always_include_hours: bool = False,
        decimal_marker: str = ".",
    ) -> str:
        assert seconds >= 0, "non-negative timestamp expected"
        milliseconds = round(seconds * 1000.0)

        hours = milliseconds // 3_600_000
        milliseconds -= hours * 3_600_000

        minutes = milliseconds // 60_000
        milliseconds -= minutes * 60_000

        seconds = milliseconds // 1_000
        milliseconds -= seconds * 1_000

        hours_marker = f"{hours:02d}:" if always_include_hours or hours > 0 else ""
        return (
            f"{hours_marker}{minutes:02d}:{seconds:02d}{decimal_marker}{milliseconds:03d}"
        )

    def restore_speech_timestamps(
            self,
            segments: List[dict],
            speech_chunks: List[dict],
            sampling_rate: Optional[int] = None,
    ) -> List[dict]:
        if sampling_rate is None:
            sampling_rate = self.sampling_rate

        ts_map = SpeechTimestampsMap(speech_chunks, sampling_rate)

        for segment in segments:
            segment["start"] = ts_map.get_original_time(segment["start"])
            segment["end"] = ts_map.get_original_time(segment["end"])

        return segments
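A minimal sketch of running the VAD on its own (not part of the commit; the audio path is hypothetical and the VadOptions values are only illustrative):

from faster_whisper.vad import VadOptions
from modules.vad.silero_vad import SileroVAD

vad = SileroVAD()
options = VadOptions(threshold=0.5, min_speech_duration_ms=250, speech_pad_ms=400)

# Returns speech-only audio plus the chunk boundaries that
# restore_speech_timestamps() uses later to map times back to the original audio.
speech_audio, speech_chunks = vad.run("sample.wav", vad_parameters=options)   # hypothetical file
# Note: the default gr.Progress() callback is only meaningful inside a Gradio event.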
modules/whisper/__init__.py
ADDED
File without changes
modules/whisper/__pycache__/__init__.cpython-310.pyc
ADDED
Binary file (175 Bytes). View file
modules/whisper/__pycache__/faster_whisper_inference.cpython-310.pyc
ADDED
Binary file (6.51 kB). View file
modules/whisper/__pycache__/whisper_base.cpython-310.pyc
ADDED
Binary file (12.9 kB). View file
modules/whisper/__pycache__/whisper_factory.cpython-310.pyc
ADDED
Binary file (2.87 kB). View file
modules/whisper/__pycache__/whisper_parameter.cpython-310.pyc
ADDED
Binary file (3.68 kB). View file
modules/whisper/faster_whisper_inference.py
ADDED
@@ -0,0 +1,191 @@
import os
import time
import numpy as np
import torch
from typing import BinaryIO, Union, Tuple, List
import faster_whisper
from faster_whisper.vad import VadOptions
import ast
import ctranslate2
import whisper
import gradio as gr
from argparse import Namespace

from modules.whisper.whisper_parameter import *
from modules.whisper.whisper_base import WhisperBase


class FasterWhisperInference(WhisperBase):
    def __init__(self,
                 model_dir: str = os.path.join("models", "Whisper", "faster-whisper"),
                 diarization_model_dir: str = os.path.join("models", "Diarization"),
                 output_dir: str = os.path.join("outputs"),
                 ):
        super().__init__(
            model_dir=model_dir,
            diarization_model_dir=diarization_model_dir,
            output_dir=output_dir
        )
        self.model_dir = model_dir
        os.makedirs(self.model_dir, exist_ok=True)

        self.model_paths = self.get_model_paths()
        self.device = self.get_device()
        self.available_models = self.model_paths.keys()
        self.available_compute_types = ctranslate2.get_supported_compute_types(
            "cuda") if self.device == "cuda" else ctranslate2.get_supported_compute_types("cpu")

    def transcribe(self,
                   audio: Union[str, BinaryIO, np.ndarray],
                   progress: gr.Progress,
                   *whisper_params,
                   ) -> Tuple[List[dict], float]:
        """
        transcribe method for faster-whisper.

        Parameters
        ----------
        audio: Union[str, BinaryIO, np.ndarray]
            Audio path or file binary or Audio numpy array
        progress: gr.Progress
            Indicator to show progress directly in gradio.
        *whisper_params: tuple
            Parameters related with whisper. This will be dealt with "WhisperParameters" data class

        Returns
        ----------
        segments_result: List[dict]
            list of dicts that includes start, end timestamps and transcribed text
        elapsed_time: float
            elapsed time for transcription
        """
        start_time = time.time()

        params = WhisperParameters.as_value(*whisper_params)

        if params.model_size != self.current_model_size or self.model is None or self.current_compute_type != params.compute_type:
            self.update_model(params.model_size, params.compute_type, progress)

        # None parameters with Textboxes: https://github.com/gradio-app/gradio/issues/8723
        if not params.initial_prompt:
            params.initial_prompt = None
        if not params.prefix:
            params.prefix = None
        if not params.hotwords:
            params.hotwords = None

        params.suppress_tokens = self.format_suppress_tokens_str(params.suppress_tokens)

        segments, info = self.model.transcribe(
            audio=audio,
            language=params.lang,
            task="translate" if params.is_translate and self.current_model_size in self.translatable_models else "transcribe",
            beam_size=params.beam_size,
            log_prob_threshold=params.log_prob_threshold,
            no_speech_threshold=params.no_speech_threshold,
            best_of=params.best_of,
            patience=params.patience,
            temperature=params.temperature,
            initial_prompt=params.initial_prompt,
            compression_ratio_threshold=params.compression_ratio_threshold,
            length_penalty=params.length_penalty,
            repetition_penalty=params.repetition_penalty,
            no_repeat_ngram_size=params.no_repeat_ngram_size,
            prefix=params.prefix,
            suppress_blank=params.suppress_blank,
            suppress_tokens=params.suppress_tokens,
            max_initial_timestamp=params.max_initial_timestamp,
            word_timestamps=params.word_timestamps,
            prepend_punctuations=params.prepend_punctuations,
            append_punctuations=params.append_punctuations,
            max_new_tokens=params.max_new_tokens,
            chunk_length=params.chunk_length,
            hallucination_silence_threshold=params.hallucination_silence_threshold,
            hotwords=params.hotwords,
            language_detection_threshold=params.language_detection_threshold,
            language_detection_segments=params.language_detection_segments,
            prompt_reset_on_temperature=params.prompt_reset_on_temperature,
        )
        progress(0, desc="Loading audio..")

        segments_result = []
        for segment in segments:
            progress(segment.start / info.duration, desc="Transcribing..")
            segments_result.append({
                "start": segment.start,
                "end": segment.end,
                "text": segment.text
            })

        elapsed_time = time.time() - start_time
        return segments_result, elapsed_time

    def update_model(self,
                     model_size: str,
                     compute_type: str,
                     progress: gr.Progress
                     ):
        """
        Update current model setting

        Parameters
        ----------
        model_size: str
            Size of whisper model
        compute_type: str
            Compute type for transcription.
            see more info : https://opennmt.net/CTranslate2/quantization.html
        progress: gr.Progress
            Indicator to show progress directly in gradio.
        """
        progress(0, desc="Initializing Model..")
        self.current_model_size = self.model_paths[model_size]
        self.current_compute_type = compute_type
        self.model = faster_whisper.WhisperModel(
            device=self.device,
            model_size_or_path=self.current_model_size,
            download_root=self.model_dir,
            compute_type=self.current_compute_type
        )

    def get_model_paths(self):
        """
        Get available models from models path including fine-tuned model.

        Returns
        ----------
        Name list of models
        """
        model_paths = {model: model for model in whisper.available_models()}
        faster_whisper_prefix = "models--Systran--faster-whisper-"

        existing_models = os.listdir(self.model_dir)
        wrong_dirs = [".locks"]
        existing_models = list(set(existing_models) - set(wrong_dirs))

        webui_dir = os.getcwd()

        for model_name in existing_models:
            if faster_whisper_prefix in model_name:
                model_name = model_name[len(faster_whisper_prefix):]

            if model_name not in whisper.available_models():
                model_paths[model_name] = os.path.join(webui_dir, self.model_dir, model_name)
        return model_paths

    @staticmethod
    def get_device():
        if torch.cuda.is_available():
            return "cuda"
        else:
            return "auto"

    @staticmethod
    def format_suppress_tokens_str(suppress_tokens_str: str) -> List[int]:
        try:
            suppress_tokens = ast.literal_eval(suppress_tokens_str)
            if not isinstance(suppress_tokens, list) or not all(isinstance(item, int) for item in suppress_tokens):
                raise ValueError("Invalid Suppress Tokens. The value must be type of List[int]")
            return suppress_tokens
        except Exception as e:
            raise ValueError("Invalid Suppress Tokens. The value must be type of List[int]")
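A small sketch of the suppress-tokens parsing above (not part of the commit):

from modules.whisper.faster_whisper_inference import FasterWhisperInference

FasterWhisperInference.format_suppress_tokens_str("[-1]")        # -> [-1]
FasterWhisperInference.format_suppress_tokens_str("not a list")  # raises ValueError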
modules/whisper/insanely_fast_whisper_inference.py
ADDED
@@ -0,0 +1,185 @@
import os
import time
import numpy as np
from typing import BinaryIO, Union, Tuple, List
import torch
from transformers import pipeline
from transformers.utils import is_flash_attn_2_available
import gradio as gr
from huggingface_hub import hf_hub_download
import whisper
from rich.progress import Progress, TimeElapsedColumn, BarColumn, TextColumn
from argparse import Namespace

from modules.whisper.whisper_parameter import *
from modules.whisper.whisper_base import WhisperBase


class InsanelyFastWhisperInference(WhisperBase):
    def __init__(self,
                 model_dir: str = os.path.join("models", "Whisper", "insanely-fast-whisper"),
                 diarization_model_dir: str = os.path.join("models", "Diarization"),
                 output_dir: str = os.path.join("outputs"),
                 ):
        super().__init__(
            model_dir=model_dir,
            output_dir=output_dir,
            diarization_model_dir=diarization_model_dir
        )
        self.model_dir = model_dir
        os.makedirs(self.model_dir, exist_ok=True)

        openai_models = whisper.available_models()
        distil_models = ["distil-large-v2", "distil-large-v3", "distil-medium.en", "distil-small.en"]
        self.available_models = openai_models + distil_models
        self.available_compute_types = ["float16"]

    def transcribe(self,
                   audio: Union[str, np.ndarray, torch.Tensor],
                   progress: gr.Progress,
                   *whisper_params,
                   ) -> Tuple[List[dict], float]:
        """
        transcribe method for insanely-fast-whisper.

        Parameters
        ----------
        audio: Union[str, BinaryIO, np.ndarray]
            Audio path or file binary or Audio numpy array
        progress: gr.Progress
            Indicator to show progress directly in gradio.
        *whisper_params: tuple
            Parameters related with whisper. This will be dealt with "WhisperParameters" data class

        Returns
        ----------
        segments_result: List[dict]
            list of dicts that includes start, end timestamps and transcribed text
        elapsed_time: float
            elapsed time for transcription
        """
        start_time = time.time()
        params = WhisperParameters.as_value(*whisper_params)

        if params.model_size != self.current_model_size or self.model is None or self.current_compute_type != params.compute_type:
            self.update_model(params.model_size, params.compute_type, progress)

        progress(0, desc="Transcribing...Progress is not shown in insanely-fast-whisper.")
        with Progress(
            TextColumn("[progress.description]{task.description}"),
            BarColumn(style="yellow1", pulse_style="white"),
            TimeElapsedColumn(),
        ) as progress:
            progress.add_task("[yellow]Transcribing...", total=None)

            segments = self.model(
                inputs=audio,
                return_timestamps=True,
                chunk_length_s=params.chunk_length_s,
                batch_size=params.batch_size,
                generate_kwargs={
                    "language": params.lang,
                    "task": "translate" if params.is_translate and self.current_model_size in self.translatable_models else "transcribe",
                    "no_speech_threshold": params.no_speech_threshold,
                    "temperature": params.temperature,
                    "compression_ratio_threshold": params.compression_ratio_threshold
                }
            )

        segments_result = self.format_result(
            transcribed_result=segments,
        )
        elapsed_time = time.time() - start_time
        return segments_result, elapsed_time

    def update_model(self,
                     model_size: str,
                     compute_type: str,
                     progress: gr.Progress,
                     ):
        """
        Update current model setting

        Parameters
        ----------
        model_size: str
            Size of whisper model
        compute_type: str
            Compute type for transcription.
            see more info : https://opennmt.net/CTranslate2/quantization.html
        progress: gr.Progress
            Indicator to show progress directly in gradio.
        """
        progress(0, desc="Initializing Model..")
        model_path = os.path.join(self.model_dir, model_size)
        if not os.path.isdir(model_path) or not os.listdir(model_path):
            self.download_model(
                model_size=model_size,
                download_root=model_path,
                progress=progress
            )

        self.current_compute_type = compute_type
        self.current_model_size = model_size
        self.model = pipeline(
            "automatic-speech-recognition",
            model=os.path.join(self.model_dir, model_size),
            torch_dtype=self.current_compute_type,
            device=self.device,
            model_kwargs={"attn_implementation": "flash_attention_2"} if is_flash_attn_2_available() else {"attn_implementation": "sdpa"},
        )

    @staticmethod
    def format_result(
            transcribed_result: dict
    ) -> List[dict]:
        """
        Format the transcription result of insanely_fast_whisper to match the other implementations.

        Parameters
        ----------
        transcribed_result: dict
            Transcription result of the insanely_fast_whisper

        Returns
        ----------
        result: List[dict]
            Formatted result matching the other implementations
        """
        result = transcribed_result["chunks"]
        for item in result:
            start, end = item["timestamp"][0], item["timestamp"][1]
            if end is None:
                end = start
            item["start"] = start
            item["end"] = end
        return result

    @staticmethod
    def download_model(
            model_size: str,
            download_root: str,
            progress: gr.Progress
    ):
        progress(0, 'Initializing model..')
        print(f'Downloading {model_size} to "{download_root}"....')

        os.makedirs(download_root, exist_ok=True)
        download_list = [
            "model.safetensors",
            "config.json",
            "generation_config.json",
            "preprocessor_config.json",
            "tokenizer.json",
            "tokenizer_config.json",
            "added_tokens.json",
            "special_tokens_map.json",
            "vocab.json",
        ]

        if model_size.startswith("distil"):
            repo_id = f"distil-whisper/{model_size}"
        else:
            repo_id = f"openai/whisper-{model_size}"
        for item in download_list:
            hf_hub_download(repo_id=repo_id, filename=item, local_dir=download_root)
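A sketch of what format_result does to the Hugging Face pipeline output (not part of the commit; the chunk values are made up):

from modules.whisper.insanely_fast_whisper_inference import InsanelyFastWhisperInference

pipeline_output = {
    "chunks": [
        {"timestamp": (0.0, 3.2), "text": " Hello."},
        {"timestamp": (3.2, None), "text": " Trailing chunk without an end time."},
    ]
}

segments = InsanelyFastWhisperInference.format_result(pipeline_output)
# Each chunk now also has "start" and "end" keys; a missing end falls back to the start.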
modules/whisper/whisper_Inference.py
ADDED
@@ -0,0 +1,101 @@
import whisper
import gradio as gr
import time
from typing import BinaryIO, Union, Tuple, List
import numpy as np
import torch
import os
from argparse import Namespace

from modules.whisper.whisper_base import WhisperBase
from modules.whisper.whisper_parameter import *


class WhisperInference(WhisperBase):
    def __init__(self,
                 model_dir: str = os.path.join("models", "Whisper"),
                 diarization_model_dir: str = os.path.join("models", "Diarization"),
                 output_dir: str = os.path.join("outputs"),
                 ):
        super().__init__(
            model_dir=model_dir,
            output_dir=output_dir,
            diarization_model_dir=diarization_model_dir
        )

    def transcribe(self,
                   audio: Union[str, np.ndarray, torch.Tensor],
                   progress: gr.Progress,
                   *whisper_params,
                   ) -> Tuple[List[dict], float]:
        """
        transcribe method for openai-whisper.

        Parameters
        ----------
        audio: Union[str, BinaryIO, np.ndarray]
            Audio path or file binary or Audio numpy array
        progress: gr.Progress
            Indicator to show progress directly in gradio.
        *whisper_params: tuple
            Parameters related with whisper. This will be dealt with "WhisperParameters" data class

        Returns
        ----------
        segments_result: List[dict]
            list of dicts that includes start, end timestamps and transcribed text
        elapsed_time: float
            elapsed time for transcription
        """
        start_time = time.time()
        params = WhisperParameters.as_value(*whisper_params)

        if params.model_size != self.current_model_size or self.model is None or self.current_compute_type != params.compute_type:
            self.update_model(params.model_size, params.compute_type, progress)

        def progress_callback(progress_value):
            progress(progress_value, desc="Transcribing..")

        segments_result = self.model.transcribe(audio=audio,
                                                language=params.lang,
                                                verbose=False,
                                                beam_size=params.beam_size,
                                                logprob_threshold=params.log_prob_threshold,
                                                no_speech_threshold=params.no_speech_threshold,
                                                task="translate" if params.is_translate and self.current_model_size in self.translatable_models else "transcribe",
                                                fp16=True if params.compute_type == "float16" else False,
                                                best_of=params.best_of,
                                                patience=params.patience,
                                                temperature=params.temperature,
                                                compression_ratio_threshold=params.compression_ratio_threshold,
                                                progress_callback=progress_callback,)["segments"]
        elapsed_time = time.time() - start_time

        return segments_result, elapsed_time

    def update_model(self,
                     model_size: str,
                     compute_type: str,
                     progress: gr.Progress,
                     ):
        """
        Update current model setting

        Parameters
        ----------
        model_size: str
            Size of whisper model
        compute_type: str
            Compute type for transcription.
            see more info : https://opennmt.net/CTranslate2/quantization.html
        progress: gr.Progress
            Indicator to show progress directly in gradio.
        """
        progress(0, desc="Initializing Model..")
        self.current_compute_type = compute_type
        self.current_model_size = model_size
        self.model = whisper.load_model(
            name=model_size,
            device=self.device,
            download_root=self.model_dir
        )
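All three implementations share the same segment schema and the same task selection. A condensed, hypothetical helper showing that selection logic (not part of the commit; the translatable sizes come from WhisperBase below):

translatable_models = ["large", "large-v1", "large-v2", "large-v3"]

def select_task(is_translate: bool, model_size: str) -> str:
    # "translate" is only used when translation was requested AND the model supports it.
    return "translate" if is_translate and model_size in translatable_models else "transcribe"

select_task(True, "large-v2")   # "translate"
select_task(True, "base")       # "transcribe"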
modules/whisper/whisper_base.py
ADDED
@@ -0,0 +1,436 @@
import os
import torch
import whisper
import gradio as gr
from abc import ABC, abstractmethod
from typing import BinaryIO, Union, Tuple, List
import numpy as np
from datetime import datetime
from faster_whisper.vad import VadOptions
from dataclasses import astuple

from modules.utils.subtitle_manager import get_srt, get_vtt, get_txt, write_file, safe_filename
from modules.utils.youtube_manager import get_ytdata, get_ytaudio
from modules.utils.files_manager import get_media_files, format_gradio_files
from modules.whisper.whisper_parameter import *
from modules.diarize.diarizer import Diarizer
from modules.vad.silero_vad import SileroVAD


class WhisperBase(ABC):
    def __init__(self,
                 model_dir: str = os.path.join("models", "Whisper"),
                 diarization_model_dir: str = os.path.join("models", "Diarization"),
                 output_dir: str = os.path.join("outputs"),
                 ):
        self.model_dir = model_dir
        self.output_dir = output_dir
        os.makedirs(self.output_dir, exist_ok=True)
        os.makedirs(self.model_dir, exist_ok=True)
        self.diarizer = Diarizer(
            model_dir=diarization_model_dir
        )
        self.vad = SileroVAD()

        self.model = None
        self.current_model_size = None
        self.available_models = whisper.available_models()
        self.available_langs = sorted(list(whisper.tokenizer.LANGUAGES.values()))
        self.translatable_models = ["large", "large-v1", "large-v2", "large-v3"]
        self.device = self.get_device()
        self.available_compute_types = ["float16", "float32"]
        self.current_compute_type = "float16" if self.device == "cuda" else "float32"

    @abstractmethod
    def transcribe(self,
                   audio: Union[str, BinaryIO, np.ndarray],
                   progress: gr.Progress,
                   *whisper_params,
                   ):
        """Inference whisper model to transcribe"""
        pass

    @abstractmethod
    def update_model(self,
                     model_size: str,
                     compute_type: str,
                     progress: gr.Progress
                     ):
        """Initialize whisper model"""
        pass

    def run(self,
            audio: Union[str, BinaryIO, np.ndarray],
            progress: gr.Progress,
            *whisper_params,
            ) -> Tuple[List[dict], float]:
        """
        Run transcription with conditional pre-processing and post-processing.
        The VAD will be performed to remove noise from the audio input in pre-processing, if enabled.
        The diarization will be performed in post-processing, if enabled.

        Parameters
        ----------
        audio: Union[str, BinaryIO, np.ndarray]
            Audio input. This can be file path or binary type.
        progress: gr.Progress
            Indicator to show progress directly in gradio.
        *whisper_params: tuple
            Parameters related with whisper. This will be dealt with "WhisperParameters" data class

        Returns
        ----------
        segments_result: List[dict]
            list of dicts that includes start, end timestamps and transcribed text
        elapsed_time: float
            elapsed time for running
        """
        params = WhisperParameters.as_value(*whisper_params)

        if params.lang == "Automatic Detection":
            params.lang = None
        else:
            language_code_dict = {value: key for key, value in whisper.tokenizer.LANGUAGES.items()}
            params.lang = language_code_dict[params.lang]

        speech_chunks = None
        if params.vad_filter:
            # Explicit value set for float('inf') from gr.Number()
            if params.max_speech_duration_s >= 9999:
                params.max_speech_duration_s = float('inf')

            vad_options = VadOptions(
                threshold=params.threshold,
                min_speech_duration_ms=params.min_speech_duration_ms,
                max_speech_duration_s=params.max_speech_duration_s,
                min_silence_duration_ms=params.min_silence_duration_ms,
                speech_pad_ms=params.speech_pad_ms
            )

            audio, speech_chunks = self.vad.run(
                audio=audio,
                vad_parameters=vad_options,
                progress=progress
            )

        result, elapsed_time = self.transcribe(
            audio,
            progress,
            *astuple(params)
        )

        if params.vad_filter:
            result = self.vad.restore_speech_timestamps(
                segments=result,
                speech_chunks=speech_chunks,
            )

        if params.is_diarize:
            result, elapsed_time_diarization = self.diarizer.run(
                audio=audio,
                use_auth_token=params.hf_token,
                transcribed_result=result,
            )
            elapsed_time += elapsed_time_diarization
        return result, elapsed_time

    def transcribe_file(self,
                        files: list,
                        input_folder_path: str,
                        file_format: str,
                        add_timestamp: bool,
                        progress=gr.Progress(),
                        *whisper_params,
                        ) -> list:
        """
        Write subtitle file from Files

        Parameters
        ----------
        files: list
            List of files to transcribe from gr.Files()
        input_folder_path: str
            Input folder path to transcribe from gr.Textbox(). If this is provided, `files` will be ignored and
            this will be used instead.
        file_format: str
            Subtitle File format to write from gr.Dropdown(). Supported format: [SRT, WebVTT, txt]
        add_timestamp: bool
            Boolean value from gr.Checkbox() that determines whether to add a timestamp at the end of the subtitle filename.
        progress: gr.Progress
            Indicator to show progress directly in gradio.
        *whisper_params: tuple
            Parameters related with whisper. This will be dealt with "WhisperParameters" data class

        Returns
        ----------
        result_str:
            Result of transcription to return to gr.Textbox()
        result_file_path:
            Output file path to return to gr.Files()
        """
        try:
            if input_folder_path:
                files = get_media_files(input_folder_path)
                files = format_gradio_files(files)

            files_info = {}
            for file in files:
                transcribed_segments, time_for_task = self.run(
                    file.name,
                    progress,
                    *whisper_params,
                )

                file_name, file_ext = os.path.splitext(os.path.basename(file.name))
                subtitle, file_path = self.generate_and_write_file(
                    file_name=file_name,
                    transcribed_segments=transcribed_segments,
                    add_timestamp=add_timestamp,
                    file_format=file_format,
                    output_dir=self.output_dir
                )
                files_info[file_name] = {"subtitle": subtitle, "time_for_task": time_for_task, "path": file_path}

            total_result = ''
            total_time = 0
            for file_name, info in files_info.items():
                total_result += '------------------------------------\n'
                total_result += f'{file_name}\n\n'
                total_result += f'{info["subtitle"]}'
                total_time += info["time_for_task"]

            result_str = f"Done in {self.format_time(total_time)}! Subtitle is in the outputs folder.\n\n{total_result}"
            result_file_path = [info['path'] for info in files_info.values()]

            return [result_str, result_file_path]

        except Exception as e:
            print(f"Error transcribing file: {e}")
        finally:
            self.release_cuda_memory()
            if not files:
                self.remove_input_files([file.name for file in files])

    def transcribe_mic(self,
                       mic_audio: str,
                       file_format: str,
                       progress=gr.Progress(),
                       *whisper_params,
                       ) -> list:
        """
        Write subtitle file from microphone

        Parameters
        ----------
        mic_audio: str
            Audio file path from gr.Microphone()
        file_format: str
            Subtitle File format to write from gr.Dropdown(). Supported format: [SRT, WebVTT, txt]
        progress: gr.Progress
            Indicator to show progress directly in gradio.
        *whisper_params: tuple
            Parameters related with whisper. This will be dealt with "WhisperParameters" data class

        Returns
        ----------
        result_str:
            Result of transcription to return to gr.Textbox()
        result_file_path:
            Output file path to return to gr.Files()
        """
        try:
            progress(0, desc="Loading Audio..")
            transcribed_segments, time_for_task = self.run(
                mic_audio,
                progress,
                *whisper_params,
            )
            progress(1, desc="Completed!")

            subtitle, result_file_path = self.generate_and_write_file(
                file_name="Mic",
                transcribed_segments=transcribed_segments,
                add_timestamp=True,
                file_format=file_format,
                output_dir=self.output_dir
            )

            result_str = f"Done in {self.format_time(time_for_task)}! Subtitle file is in the outputs folder.\n\n{subtitle}"
            return [result_str, result_file_path]
        except Exception as e:
            print(f"Error transcribing file: {e}")
        finally:
            self.release_cuda_memory()
            self.remove_input_files([mic_audio])

    def transcribe_youtube(self,
                           youtube_link: str,
                           file_format: str,
                           add_timestamp: bool,
                           progress=gr.Progress(),
                           *whisper_params,
                           ) -> list:
        """
        Write subtitle file from Youtube

        Parameters
        ----------
        youtube_link: str
            URL of the Youtube video to transcribe from gr.Textbox()
        file_format: str
            Subtitle File format to write from gr.Dropdown(). Supported format: [SRT, WebVTT, txt]
        add_timestamp: bool
            Boolean value from gr.Checkbox() that determines whether to add a timestamp at the end of the filename.
        progress: gr.Progress
            Indicator to show progress directly in gradio.
        *whisper_params: tuple
            Parameters related with whisper. This will be dealt with "WhisperParameters" data class

        Returns
        ----------
        result_str:
            Result of transcription to return to gr.Textbox()
        result_file_path:
            Output file path to return to gr.Files()
        """
        try:
            progress(0, desc="Loading Audio from Youtube..")
            yt = get_ytdata(youtube_link)
            audio = get_ytaudio(yt)

            transcribed_segments, time_for_task = self.run(
                audio,
                progress,
                *whisper_params,
            )

            progress(1, desc="Completed!")

            file_name = safe_filename(yt.title)
            subtitle, result_file_path = self.generate_and_write_file(
                file_name=file_name,
                transcribed_segments=transcribed_segments,
                add_timestamp=add_timestamp,
                file_format=file_format,
                output_dir=self.output_dir
            )
            result_str = f"Done in {self.format_time(time_for_task)}! Subtitle file is in the outputs folder.\n\n{subtitle}"

            return [result_str, result_file_path]

        except Exception as e:
            print(f"Error transcribing file: {e}")
        finally:
            try:
                if 'yt' not in locals():
                    yt = get_ytdata(youtube_link)
                    file_path = get_ytaudio(yt)
                else:
                    file_path = get_ytaudio(yt)

                self.release_cuda_memory()
                self.remove_input_files([file_path])
            except Exception as cleanup_error:
                pass

    @staticmethod
    def generate_and_write_file(file_name: str,
                                transcribed_segments: list,
                                add_timestamp: bool,
                                file_format: str,
                                output_dir: str
                                ) -> Tuple[str, str]:
        """
        Writes subtitle file

        Parameters
        ----------
        file_name: str
            Output file name
        transcribed_segments: list
            Text segments transcribed from audio
        add_timestamp: bool
            Determines whether to add a timestamp to the end of the filename.
        file_format: str
            File format to write. Supported formats: [SRT, WebVTT, txt]
        output_dir: str
            Directory path of the output

        Returns
        ----------
        content: str
            Result of the transcription
        output_path: str
            output file path
        """
        if add_timestamp:
            timestamp = datetime.now().strftime("%m%d%H%M%S")
            output_path = os.path.join(output_dir, f"{file_name}-{timestamp}")
        else:
            output_path = os.path.join(output_dir, f"{file_name}")

        if file_format == "SRT":
            content = get_srt(transcribed_segments)
            output_path += '.srt'

        elif file_format == "WebVTT":
            content = get_vtt(transcribed_segments)
            output_path += '.vtt'

        elif file_format == "txt":
            content = get_txt(transcribed_segments)
            output_path += '.txt'

        write_file(content, output_path)
        return content, output_path

    @staticmethod
    def format_time(elapsed_time: float) -> str:
        """
        Get {hours} {minutes} {seconds} time format string

        Parameters
        ----------
        elapsed_time: float
            Elapsed time for transcription

        Returns
        ----------
        Time format string
        """
        hours, rem = divmod(elapsed_time, 3600)
        minutes, seconds = divmod(rem, 60)

        time_str = ""
        if hours:
            time_str += f"{hours} hours "
        if minutes:
            time_str += f"{minutes} minutes "
        seconds = round(seconds)
        time_str += f"{seconds} seconds"

        return time_str.strip()

    @staticmethod
    def get_device():
        if torch.cuda.is_available():
            return "cuda"
        elif torch.backends.mps.is_available():
            return "mps"
        else:
            return "cpu"

    @staticmethod
    def release_cuda_memory():
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
            torch.cuda.reset_max_memory_allocated()

    @staticmethod
    def remove_input_files(file_paths: List[str]):
        if not file_paths:
            return

        for file_path in file_paths:
            if file_path and os.path.exists(file_path):
                os.remove(file_path)
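A small sketch of the format_time helper above (not part of the commit). Non-zero hours and minutes come out of divmod as floats, so they print with a decimal point:

from modules.whisper.whisper_base import WhisperBase

WhisperBase.format_time(125.0)   # "2.0 minutes 5 seconds"
WhisperBase.format_time(42.3)    # "42 seconds"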
modules/whisper/whisper_factory.py
ADDED
|
@@ -0,0 +1,81 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
from typing import Optional
import os

from modules.whisper.faster_whisper_inference import FasterWhisperInference
from modules.whisper.whisper_Inference import WhisperInference
from modules.whisper.insanely_fast_whisper_inference import InsanelyFastWhisperInference
from modules.whisper.whisper_base import WhisperBase


class WhisperFactory:
    @staticmethod
    def create_whisper_inference(
        whisper_type: str,
        whisper_model_dir: str = os.path.join("models", "Whisper"),
        faster_whisper_model_dir: str = os.path.join("models", "Whisper", "faster-whisper"),
        insanely_fast_whisper_model_dir: str = os.path.join("models", "Whisper", "insanely-fast-whisper"),
        diarization_model_dir: str = os.path.join("models", "Diarization"),
        output_dir: str = os.path.join("outputs"),
    ) -> "WhisperBase":
        """
        Create a whisper inference class based on the provided whisper_type.

        Parameters
        ----------
        whisper_type : str
            The type of Whisper implementation to use. Supported values (case-insensitive):
            - "faster-whisper": https://github.com/SYSTRAN/faster-whisper
            - "whisper": https://github.com/openai/whisper
            - "insanely-fast-whisper": https://github.com/Vaibhavs10/insanely-fast-whisper
        whisper_model_dir : str
            Directory path for the Whisper model.
        faster_whisper_model_dir : str
            Directory path for the Faster Whisper model.
        insanely_fast_whisper_model_dir : str
            Directory path for the Insanely Fast Whisper model.
        diarization_model_dir : str
            Directory path for the diarization model.
        output_dir : str
            Directory path where output files will be saved.

        Returns
        -------
        WhisperBase
            An instance of the appropriate whisper inference class based on the whisper_type.
        """
        # Temporary fix for the bug: https://github.com/jhj0517/Whisper-WebUI/issues/144
        os.environ['KMP_DUPLICATE_LIB_OK'] = 'True'

        whisper_type = whisper_type.lower().strip()

        faster_whisper_typos = ["faster_whisper", "faster-whisper", "fasterwhisper"]
        whisper_typos = ["whisper"]
        insanely_fast_whisper_typos = [
            "insanely_fast_whisper", "insanely-fast-whisper", "insanelyfastwhisper",
            "insanely_faster_whisper", "insanely-faster-whisper", "insanelyfasterwhisper"
        ]

        if whisper_type in faster_whisper_typos:
            return FasterWhisperInference(
                model_dir=faster_whisper_model_dir,
                output_dir=output_dir,
                diarization_model_dir=diarization_model_dir
            )
        elif whisper_type in whisper_typos:
            return WhisperInference(
                model_dir=whisper_model_dir,
                output_dir=output_dir,
                diarization_model_dir=diarization_model_dir
            )
        elif whisper_type in insanely_fast_whisper_typos:
            return InsanelyFastWhisperInference(
                model_dir=insanely_fast_whisper_model_dir,
                output_dir=output_dir,
                diarization_model_dir=diarization_model_dir
            )
        else:
            return FasterWhisperInference(
                model_dir=faster_whisper_model_dir,
                output_dir=output_dir,
                diarization_model_dir=diarization_model_dir
            )
|
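A minimal sketch of how the factory above might be called; the backend string and output directory are illustrative. The string is normalized with .lower().strip() and matched against the typo lists, and any unrecognized value falls back to FasterWhisperInference.

# Sketch only, based on the factory code above.
from modules.whisper.whisper_factory import WhisperFactory

whisper_inferencer = WhisperFactory.create_whisper_inference(
    whisper_type="Faster-Whisper",   # normalized to "faster-whisper"
    output_dir="outputs",
)
print(type(whisper_inferencer).__name__)   # FasterWhisperInference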
modules/whisper/whisper_parameter.py
ADDED
|
@@ -0,0 +1,277 @@
|
from dataclasses import dataclass, fields
import gradio as gr
from typing import Optional


@dataclass
class WhisperParameters:
    model_size: gr.Dropdown
    lang: gr.Dropdown
    is_translate: gr.Checkbox
    beam_size: gr.Number
    log_prob_threshold: gr.Number
    no_speech_threshold: gr.Number
    compute_type: gr.Dropdown
    best_of: gr.Number
    patience: gr.Number
    condition_on_previous_text: gr.Checkbox
    prompt_reset_on_temperature: gr.Slider
    initial_prompt: gr.Textbox
    temperature: gr.Slider
    compression_ratio_threshold: gr.Number
    vad_filter: gr.Checkbox
    threshold: gr.Slider
    min_speech_duration_ms: gr.Number
    max_speech_duration_s: gr.Number
    min_silence_duration_ms: gr.Number
    speech_pad_ms: gr.Number
    chunk_length_s: gr.Number
    batch_size: gr.Number
    is_diarize: gr.Checkbox
    hf_token: gr.Textbox
    diarization_device: gr.Dropdown
    length_penalty: gr.Number
    repetition_penalty: gr.Number
    no_repeat_ngram_size: gr.Number
    prefix: gr.Textbox
    suppress_blank: gr.Checkbox
    suppress_tokens: gr.Textbox
    max_initial_timestamp: gr.Number
    word_timestamps: gr.Checkbox
    prepend_punctuations: gr.Textbox
    append_punctuations: gr.Textbox
    max_new_tokens: gr.Number
    chunk_length: gr.Number
    hallucination_silence_threshold: gr.Number
    hotwords: gr.Textbox
    language_detection_threshold: gr.Number
    language_detection_segments: gr.Number
    """
    A data class for Gradio components of the Whisper parameters. Use it "before" Gradio pre-processing.
    This data class is used to mitigate the key-value problem between Gradio components and function parameters.
    Related Gradio issue: https://github.com/gradio-app/gradio/issues/2471
    See more about Gradio pre-processing: https://www.gradio.app/docs/components

    Attributes
    ----------
    model_size: gr.Dropdown
        Whisper model size.

    lang: gr.Dropdown
        Source language of the file to transcribe.

    is_translate: gr.Checkbox
        Boolean value that determines whether to translate to English.
        It's Whisper's feature to translate speech from another language directly into English end-to-end.

    beam_size: gr.Number
        Int value that is used as a decoding option.

    log_prob_threshold: gr.Number
        If the average log probability over sampled tokens is below this value, treat as failed.

    no_speech_threshold: gr.Number
        If the no_speech probability is higher than this value AND
        the average log probability over sampled tokens is below `log_prob_threshold`,
        consider the segment as silent.

    compute_type: gr.Dropdown
        Compute type for transcription.
        See more info: https://opennmt.net/CTranslate2/quantization.html

    best_of: gr.Number
        Number of candidates when sampling with non-zero temperature.

    patience: gr.Number
        Beam search patience factor.

    condition_on_previous_text: gr.Checkbox
        If True, the previous output of the model is provided as a prompt for the next window;
        disabling may make the text inconsistent across windows, but the model becomes less prone to
        getting stuck in a failure loop, such as repetition looping or timestamps going out of sync.

    initial_prompt: gr.Textbox
        Optional text to provide as a prompt for the first window. This can be used to provide, or
        "prompt-engineer" a context for transcription, e.g. custom vocabularies or proper nouns
        to make it more likely to predict those words correctly.

    temperature: gr.Slider
        Temperature for sampling. It can be a tuple of temperatures,
        which will be successively used upon failures according to either
        `compression_ratio_threshold` or `log_prob_threshold`.

    compression_ratio_threshold: gr.Number
        If the gzip compression ratio is above this value, treat as failed.

    vad_filter: gr.Checkbox
        Enable the voice activity detection (VAD) to filter out parts of the audio
        without speech. This step uses the Silero VAD model:
        https://github.com/snakers4/silero-vad.

    threshold: gr.Slider
        This parameter is related to Silero VAD. Speech threshold.
        Silero VAD outputs speech probabilities for each audio chunk;
        probabilities ABOVE this value are considered as SPEECH. It is better to tune this
        parameter for each dataset separately, but "lazy" 0.5 is pretty good for most datasets.

    min_speech_duration_ms: gr.Number
        This parameter is related to Silero VAD. Final speech chunks shorter than min_speech_duration_ms are thrown out.

    max_speech_duration_s: gr.Number
        This parameter is related to Silero VAD. Maximum duration of speech chunks in seconds. Chunks longer
        than max_speech_duration_s will be split at the timestamp of the last silence that
        lasts more than 100ms (if any), to prevent aggressive cutting. Otherwise, they will be
        split aggressively just before max_speech_duration_s.

    min_silence_duration_ms: gr.Number
        This parameter is related to Silero VAD. At the end of each speech chunk, wait for min_silence_duration_ms
        before separating it.

    speech_pad_ms: gr.Number
        This parameter is related to Silero VAD. Final speech chunks are padded by speech_pad_ms on each side.

    chunk_length_s: gr.Number
        This parameter is related to the insanely-fast-whisper pipe.
        Maximum length of each chunk.

    batch_size: gr.Number
        This parameter is related to the insanely-fast-whisper pipe. Batch size to pass to the pipe.

    is_diarize: gr.Checkbox
        This parameter is related to whisperx. Boolean value that determines whether to diarize or not.

    hf_token: gr.Textbox
        This parameter is related to whisperx. A Hugging Face token is needed to download diarization models.
        Read more at: https://huggingface.co/pyannote/speaker-diarization-3.1#requirements

    diarization_device: gr.Dropdown
        This parameter is related to whisperx. Device to run the diarization model on.

    length_penalty:
        This parameter is related to faster-whisper. Exponential length penalty constant.

    repetition_penalty:
        This parameter is related to faster-whisper. Penalty applied to the score of previously generated tokens
        (set > 1 to penalize).

    no_repeat_ngram_size:
        This parameter is related to faster-whisper. Prevent repetitions of n-grams with this size (set 0 to disable).

    prefix:
        This parameter is related to faster-whisper. Optional text to provide as a prefix for the first window.

    suppress_blank:
        This parameter is related to faster-whisper. Suppress blank outputs at the beginning of the sampling.

    suppress_tokens:
        This parameter is related to faster-whisper. List of token IDs to suppress. -1 will suppress a default set
        of symbols as defined in the model config.json file.

    max_initial_timestamp:
        This parameter is related to faster-whisper. The initial timestamp cannot be later than this.

    word_timestamps:
        This parameter is related to faster-whisper. Extract word-level timestamps using the cross-attention pattern
        and dynamic time warping, and include the timestamps for each word in each segment.

    prepend_punctuations:
        This parameter is related to faster-whisper. If word_timestamps is True, merge these punctuation symbols
        with the next word.

    append_punctuations:
        This parameter is related to faster-whisper. If word_timestamps is True, merge these punctuation symbols
        with the previous word.

    max_new_tokens:
        This parameter is related to faster-whisper. Maximum number of new tokens to generate per-chunk. If not set,
        the maximum will be set by the default max_length.

    chunk_length:
        This parameter is related to faster-whisper. The length of audio segments. If it is not None, it will overwrite the
        default chunk_length of the FeatureExtractor.

    hallucination_silence_threshold:
        This parameter is related to faster-whisper. When word_timestamps is True, skip silent periods longer than this threshold
        (in seconds) when a possible hallucination is detected.

    hotwords:
        This parameter is related to faster-whisper. Hotwords/hint phrases to provide the model with. Has no effect if prefix is not None.

    language_detection_threshold:
        This parameter is related to faster-whisper. If the maximum probability of the language tokens is higher than this value, the language is detected.

    language_detection_segments:
        This parameter is related to faster-whisper. Number of segments to consider for the language detection.
    """

    def as_list(self) -> list:
        """
        Converts the data class attributes into a list. Use in the Gradio UI before Gradio pre-processing.
        See more about Gradio pre-processing: https://www.gradio.app/docs/components

        Returns
        ----------
        A list of Gradio components
        """
        return [getattr(self, f.name) for f in fields(self)]

    @staticmethod
    def as_value(*args) -> 'WhisperValues':
        """
        To use Whisper parameters in a function after Gradio post-processing.
        See more about Gradio post-processing: https://www.gradio.app/docs/components

        Returns
        ----------
        WhisperValues
            Data class that has the values of the parameters
        """
        return WhisperValues(*args)


@dataclass
class WhisperValues:
    model_size: str
    lang: str
    is_translate: bool
    beam_size: int
    log_prob_threshold: float
    no_speech_threshold: float
    compute_type: str
    best_of: int
    patience: float
    condition_on_previous_text: bool
    prompt_reset_on_temperature: float
    initial_prompt: Optional[str]
    temperature: float
    compression_ratio_threshold: float
    vad_filter: bool
    threshold: float
    min_speech_duration_ms: int
    max_speech_duration_s: float
    min_silence_duration_ms: int
    speech_pad_ms: int
    chunk_length_s: int
    batch_size: int
    is_diarize: bool
    hf_token: str
    diarization_device: str
    length_penalty: float
    repetition_penalty: float
    no_repeat_ngram_size: int
    prefix: Optional[str]
    suppress_blank: bool
    suppress_tokens: Optional[str]
    max_initial_timestamp: float
    word_timestamps: bool
    prepend_punctuations: Optional[str]
    append_punctuations: Optional[str]
    max_new_tokens: Optional[int]
    chunk_length: Optional[int]
    hallucination_silence_threshold: Optional[float]
    hotwords: Optional[str]
    language_detection_threshold: Optional[float]
    language_detection_segments: int
    """
    A data class to use Whisper parameters.
    """
|
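A minimal sketch of the positional contract the docstring describes: as_list() emits the Gradio components in field order, Gradio passes their values back in that same order, and as_value(*args) packs them into WhisperValues. The placeholder values below are assumptions for illustration, not realistic settings.

# Sketch only: exercises the field-order contract without a real Gradio app.
from dataclasses import fields
from modules.whisper.whisper_parameter import WhisperParameters, WhisperValues

names_in_order = [f.name for f in fields(WhisperValues)]
print(names_in_order[:3])                   # ['model_size', 'lang', 'is_translate']

# One placeholder value per field, in the same order Gradio would return them.
placeholder_args = [None] * len(names_in_order)
values = WhisperParameters.as_value(*placeholder_args)
print(values.model_size, values.beam_size)  # None None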
outputs/outputs are saved here.txt
ADDED
|
File without changes
|
outputs/translations/outputs for translation are saved here.txt
ADDED
|
File without changes
|