Upload 10 files
- __init__.py +0 -0
- download.py +79 -0
- modelCache.py +17 -0
- segments.py +55 -0
- source.py +70 -0
- utils-original.py +115 -0
- utils.py +129 -0
- vad.py +537 -0
- vadParallel.py +255 -0
- whisperContainer.py +127 -0
__init__.py
ADDED
File without changes
download.py
ADDED
@@ -0,0 +1,79 @@
import os
from tempfile import mkdtemp
from typing import List
from yt_dlp import YoutubeDL

import yt_dlp
from yt_dlp.postprocessor import PostProcessor

# Post-processor that records the path of every downloaded file
class FilenameCollectorPP(PostProcessor):
    def __init__(self):
        super(FilenameCollectorPP, self).__init__(None)
        self.filenames = []

    def run(self, information):
        self.filenames.append(information["filepath"])
        return [], information

def download_url(url: str, maxDuration: int = None, destinationDirectory: str = None, playlistItems: str = "1") -> List[str]:
    try:
        return _perform_download(url, maxDuration=maxDuration, outputTemplate=None, destinationDirectory=destinationDirectory, playlistItems=playlistItems)
    except yt_dlp.utils.DownloadError as e:
        # In case of an OS error, try again with a shorter output template
        if e.msg and e.msg.find("[Errno 36] File name too long") >= 0:
            return _perform_download(url, maxDuration=maxDuration, outputTemplate="%(title).10s %(id)s.%(ext)s",
                                     destinationDirectory=destinationDirectory, playlistItems=playlistItems)
        raise

def _perform_download(url: str, maxDuration: int = None, outputTemplate: str = None, destinationDirectory: str = None, playlistItems: str = "1"):
    # Create a temporary directory to store the downloaded files
    if destinationDirectory is None:
        destinationDirectory = mkdtemp()

    ydl_opts = {
        "format": "bestaudio/best",
        'outtmpl': os.path.join(destinationDirectory, '1.wav'),
        'paths': {
            'home': destinationDirectory
        }
    }
    if (playlistItems):
        ydl_opts['playlist_items'] = playlistItems

    # Add output template if specified
    if outputTemplate:
        ydl_opts['outtmpl'] = outputTemplate

    filename_collector = FilenameCollectorPP()

    with YoutubeDL(ydl_opts) as ydl:
        if maxDuration and maxDuration > 0:
            info = ydl.extract_info(url, download=False)
            entries = "entries" in info and info["entries"] or [info]

            total_duration = 0

            # Compute total duration
            for entry in entries:
                total_duration += float(entry["duration"])

            if total_duration >= maxDuration:
                raise ExceededMaximumDuration(videoDuration=total_duration, maxDuration=maxDuration, message="Video is too long")

        ydl.add_post_processor(filename_collector)
        ydl.download([url])

    if len(filename_collector.filenames) <= 0:
        raise Exception("Cannot download " + url)

    result = []

    for filename in filename_collector.filenames:
        result.append(filename)
        print("Downloaded " + filename)

    return result

class ExceededMaximumDuration(Exception):
    def __init__(self, videoDuration, maxDuration, message):
        self.videoDuration = videoDuration
        self.maxDuration = maxDuration
        super().__init__(message)
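
A minimal usage sketch of download_url (not part of the commit; the URL and duration limit are placeholders):

from src.download import download_url

# Download the best audio stream of a single video, failing if it exceeds one hour
audio_files = download_url("https://www.youtube.com/watch?v=XXXXXXXXXXX", maxDuration=3600)
for path in audio_files:
    print(path)  # e.g. a file inside a mkdtemp() directory
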
modelCache.py
ADDED
@@ -0,0 +1,17 @@
class ModelCache:
    def __init__(self):
        self._cache = dict()

    def get(self, model_key: str, model_factory):
        result = self._cache.get(model_key)

        if result is None:
            result = model_factory()
            self._cache[model_key] = result
        return result

    def clear(self):
        self._cache.clear()

# A global cache of models. This is mainly used by the daemon processes to avoid loading the same model multiple times.
GLOBAL_MODEL_CACHE = ModelCache()
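
For illustration, ModelCache.get takes a key and a zero-argument factory, so the expensive construction only runs on the first request (a sketch, not part of the commit):

def load_model():
    print("expensive load")
    return object()  # stands in for a real model

cache = ModelCache()
a = cache.get("my-model", load_model)  # prints "expensive load"
b = cache.get("my-model", load_model)  # served from the cache, no load
assert a is b
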
segments.py
ADDED
@@ -0,0 +1,55 @@
from typing import Any, Dict, List

import copy

def merge_timestamps(timestamps: List[Dict[str, Any]], merge_window: float = 5, max_merge_size: float = 30, padding_left: float = 1, padding_right: float = 1):
    result = []

    if len(timestamps) == 0:
        return result
    if max_merge_size is None:
        return timestamps

    if padding_left is None:
        padding_left = 0
    if padding_right is None:
        padding_right = 0

    processed_time = 0
    current_segment = None

    for i in range(len(timestamps)):
        next_segment = timestamps[i]

        delta = next_segment['start'] - processed_time

        # Note that segments can still be longer than the max merge size, they just won't be merged in that case
        if current_segment is None or (merge_window is not None and delta > merge_window) \
            or next_segment['end'] - current_segment['start'] > max_merge_size:
            # Finish the current segment
            if current_segment is not None:
                # Add right padding
                finish_padding = min(padding_right, delta / 2) if delta < padding_left + padding_right else padding_right
                current_segment['end'] += finish_padding
                delta -= finish_padding

                result.append(current_segment)

            # Start a new segment
            current_segment = copy.deepcopy(next_segment)

            # Pad the segment
            current_segment['start'] = current_segment['start'] - min(padding_left, delta)
            processed_time = current_segment['end']

        else:
            # Merge the segment
            current_segment['end'] = next_segment['end']
            processed_time = current_segment['end']

    # Add the last segment
    if current_segment is not None:
        current_segment['end'] += padding_right
        result.append(current_segment)

    return result
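
A worked example (not part of the commit) with the default 5-second merge window and 1-second padding: the first two segments are close enough to merge, while the third starts a new padded segment.

from src.segments import merge_timestamps

timestamps = [
    {'start': 1.0, 'end': 4.0},
    {'start': 6.0, 'end': 9.0},    # 2 s after the previous segment -> merged
    {'start': 40.0, 'end': 45.0},  # 31 s gap -> new segment
]
print(merge_timestamps(timestamps))
# [{'start': 0.0, 'end': 10.0}, {'start': 39.0, 'end': 46.0}]
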
source.py
ADDED
@@ -0,0 +1,70 @@
# Gradio seems to truncate files without keeping the extension, so we need to truncate the file prefix ourselves
import os
import pathlib
from typing import List
import zipfile

import ffmpeg
from more_itertools import unzip

from src.download import ExceededMaximumDuration, download_url

MAX_FILE_PREFIX_LENGTH = 17

class AudioSource:
    def __init__(self, source_path, source_name = None):
        self.source_path = source_path
        self.source_name = source_name

        # Load source name if not provided
        if (self.source_name is None):
            file_path = pathlib.Path(self.source_path)
            self.source_name = file_path.name

    def get_full_name(self):
        return self.source_name

    def get_short_name(self, max_length: int = MAX_FILE_PREFIX_LENGTH):
        file_path = pathlib.Path(self.source_name)
        short_name = file_path.stem[:max_length] + file_path.suffix

        return short_name

    def __str__(self) -> str:
        return self.source_path

class AudioSourceCollection:
    def __init__(self, sources: List[AudioSource]):
        self.sources = sources

    def __iter__(self):
        return iter(self.sources)

def get_audio_source_collection(urlData: str, multipleFiles: List, microphoneData: str, input_audio_max_duration: float = -1) -> List[AudioSource]:
    output: List[AudioSource] = []

    if urlData:
        # Download from YouTube. This could also be a playlist or a channel.
        output.extend([ AudioSource(x) for x in download_url(urlData, input_audio_max_duration, playlistItems=None) ])
    else:
        # Add input files
        if (multipleFiles is not None):
            output.extend([ AudioSource(x.name) for x in multipleFiles ])
        if (microphoneData is not None):
            output.append(AudioSource(microphoneData))

    total_duration = 0

    # Calculate total audio length. We do this even if input_audio_max_duration
    # is disabled to ensure that all the audio files are valid.
    for source in output:
        audioDuration = ffmpeg.probe(source.source_path)["format"]["duration"]
        total_duration += float(audioDuration)

    # Ensure the total duration of the audio is not too long
    if input_audio_max_duration > 0:
        if float(total_duration) > input_audio_max_duration:
            raise ExceededMaximumDuration(videoDuration=total_duration, maxDuration=input_audio_max_duration, message="Video(s) is too long")

    # Return a list of audio sources
    return output
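
For example, get_short_name keeps the extension while truncating the stem to MAX_FILE_PREFIX_LENGTH characters (a sketch, not part of the commit; the path is a placeholder):

from src.source import AudioSource

source = AudioSource("/data/a-very-long-recording-name.wav")
print(source.get_full_name())   # a-very-long-recording-name.wav
print(source.get_short_name())  # a-very-long-recor.wav
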
utils-original.py
ADDED
@@ -0,0 +1,115 @@
import textwrap
import unicodedata
import re

import zlib
from typing import Iterator, TextIO


def exact_div(x, y):
    assert x % y == 0
    return x // y


def str2bool(string):
    str2val = {"True": True, "False": False}
    if string in str2val:
        return str2val[string]
    else:
        raise ValueError(f"Expected one of {set(str2val.keys())}, got {string}")


def optional_int(string):
    return None if string == "None" else int(string)


def optional_float(string):
    return None if string == "None" else float(string)


def compression_ratio(text) -> float:
    return len(text) / len(zlib.compress(text.encode("utf-8")))


def format_timestamp(seconds: float, always_include_hours: bool = False, fractionalSeperator: str = '.'):
    assert seconds >= 0, "non-negative timestamp expected"
    milliseconds = round(seconds * 1000.0)

    hours = milliseconds // 3_600_000
    milliseconds -= hours * 3_600_000

    minutes = milliseconds // 60_000
    milliseconds -= minutes * 60_000

    seconds = milliseconds // 1_000
    milliseconds -= seconds * 1_000

    hours_marker = f"{hours:02d}:" if always_include_hours or hours > 0 else ""
    return f"{hours_marker}{minutes:02d}:{seconds:02d}{fractionalSeperator}{milliseconds:03d}"


def write_txt(transcript: Iterator[dict], file: TextIO):
    for segment in transcript:
        print(segment['text'].strip(), file=file, flush=True)


def write_vtt(transcript: Iterator[dict], file: TextIO, maxLineWidth=None):
    print("WEBVTT\n", file=file)
    for segment in transcript:
        text = process_text(segment['text'], maxLineWidth).replace('-->', '->')

        print(
            f"{format_timestamp(segment['start'])} --> {format_timestamp(segment['end'])}\n"
            f"{text}\n",
            file=file,
            flush=True,
        )


def write_srt(transcript: Iterator[dict], file: TextIO, maxLineWidth=None):
    """
    Write a transcript to a file in SRT format.
    Example usage:
        from pathlib import Path
        from whisper.utils import write_srt
        result = transcribe(model, audio_path, temperature=temperature, **args)
        # save SRT
        audio_basename = Path(audio_path).stem
        with open(Path(output_dir) / (audio_basename + ".srt"), "w", encoding="utf-8") as srt:
            write_srt(result["segments"], file=srt)
    """
    for i, segment in enumerate(transcript, start=1):
        text = process_text(segment['text'].strip(), maxLineWidth).replace('-->', '->')

        # write srt lines
        print(
            f"{i}\n"
            f"{format_timestamp(segment['start'], always_include_hours=True, fractionalSeperator=',')} --> "
            f"{format_timestamp(segment['end'], always_include_hours=True, fractionalSeperator=',')}\n"
            f"{text}\n",
            file=file,
            flush=True,
        )

def process_text(text: str, maxLineWidth=None):
    if (maxLineWidth is None or maxLineWidth < 0):
        return text

    lines = textwrap.wrap(text, width=maxLineWidth, tabsize=4)
    return '\n'.join(lines)

def slugify(value, allow_unicode=False):
    """
    Taken from https://github.com/django/django/blob/master/django/utils/text.py
    Convert to ASCII if 'allow_unicode' is False. Convert spaces or repeated
    dashes to single dashes. Remove characters that aren't alphanumerics,
    underscores, or hyphens. Convert to lowercase. Also strip leading and
    trailing whitespace, dashes, and underscores.
    """
    value = str(value)
    if allow_unicode:
        value = unicodedata.normalize('NFKC', value)
    else:
        value = unicodedata.normalize('NFKD', value).encode('ascii', 'ignore').decode('ascii')
    value = re.sub(r'[^\w\s-]', '', value.lower())
    return re.sub(r'[-\s]+', '-', value).strip('-_')
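
For reference, format_timestamp drops the hour field unless requested, and the fractional separator is configurable so the same helper serves both VTT ('.') and SRT (',') output (example values, not part of the commit):

format_timestamp(61.5)      # '01:01.500'
format_timestamp(3661.5)    # '01:01:01.500'
format_timestamp(61.5, always_include_hours=True, fractionalSeperator=',')  # '00:01:01,500'
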
utils.py
ADDED
@@ -0,0 +1,129 @@
import textwrap
import unicodedata
import re

import zlib
from typing import Iterator, TextIO
import audioread


def exact_div(x, y):
    assert x % y == 0
    return x // y

def duration_detector(path):
    length = 0
    with audioread.audio_open(path) as f:
        length = int(f.duration)

    hours = length // 3600  # calculate in hours
    length %= 3600
    mins = length // 60  # calculate in minutes
    length %= 60
    seconds = length  # calculate in seconds
    print('Total Duration: {}:{}:{}:{}'.format(path, hours, mins, seconds))
    #return "{}:{}:{}".format(hours, mins, seconds)
    return hours, mins, seconds

def str2bool(string):
    str2val = {"True": True, "False": False}
    if string in str2val:
        return str2val[string]
    else:
        raise ValueError(f"Expected one of {set(str2val.keys())}, got {string}")


def optional_int(string):
    return None if string == "None" else int(string)


def optional_float(string):
    return None if string == "None" else float(string)


def compression_ratio(text) -> float:
    return len(text) / len(zlib.compress(text.encode("utf-8")))


def format_timestamp(seconds: float, always_include_hours: bool = False, fractionalSeperator: str = '.'):
    assert seconds >= 0, "non-negative timestamp expected"
    milliseconds = round(seconds * 1000.0)

    hours = milliseconds // 3_600_000
    milliseconds -= hours * 3_600_000

    minutes = milliseconds // 60_000
    milliseconds -= minutes * 60_000

    seconds = milliseconds // 1_000
    milliseconds -= seconds * 1_000

    hours_marker = f"{hours:02d}:" if always_include_hours or hours > 0 else ""
    return f"{hours_marker}{minutes:02d}:{seconds:02d}{fractionalSeperator}{milliseconds:03d}"


def write_txt(transcript: Iterator[dict], file: TextIO):
    for segment in transcript:
        print(segment['text'].strip(), file=file, flush=True)


def write_vtt(transcript: Iterator[dict], file: TextIO, maxLineWidth=None):
    print("WEBVTT\n", file=file)
    for segment in transcript:
        text = process_text(segment['text'], maxLineWidth).replace('-->', '->')

        print(
            f"{format_timestamp(segment['start'])} --> {format_timestamp(segment['end'])}\n"
            f"{text}\n",
            file=file,
            flush=True,
        )


def write_srt(transcript: Iterator[dict], file: TextIO, maxLineWidth=None):
    """
    Write a transcript to a file in SRT format.
    Example usage:
        from pathlib import Path
        from whisper.utils import write_srt
        result = transcribe(model, audio_path, temperature=temperature, **args)
        # save SRT
        audio_basename = Path(audio_path).stem
        with open(Path(output_dir) / (audio_basename + ".srt"), "w", encoding="utf-8") as srt:
            write_srt(result["segments"], file=srt)
    """
    for i, segment in enumerate(transcript, start=1):
        text = process_text(segment['text'].strip(), maxLineWidth).replace('-->', '->')

        # write srt lines
        print(
            f"{i}\n"
            f"{format_timestamp(segment['start'], always_include_hours=True, fractionalSeperator=',')} --> "
            f"{format_timestamp(segment['end'], always_include_hours=True, fractionalSeperator=',')}\n"
            f"{text}\n",
            file=file,
            flush=True,
        )

def process_text(text: str, maxLineWidth=None):
    if (maxLineWidth is None or maxLineWidth < 0):
        return text

    lines = textwrap.wrap(text, width=maxLineWidth, tabsize=4)
    return '\n'.join(lines)

def slugify(value, allow_unicode=False):
    """
    Taken from https://github.com/django/django/blob/master/django/utils/text.py
    Convert to ASCII if 'allow_unicode' is False. Convert spaces or repeated
    dashes to single dashes. Remove characters that aren't alphanumerics,
    underscores, or hyphens. Convert to lowercase. Also strip leading and
    trailing whitespace, dashes, and underscores.
    """
    value = str(value)
    if allow_unicode:
        value = unicodedata.normalize('NFKC', value)
    else:
        value = unicodedata.normalize('NFKD', value).encode('ascii', 'ignore').decode('ascii')
    value = re.sub(r'[^\w\s-]', '', value.lower())
    return re.sub(r'[-\s]+', '-', value).strip('-_')
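
The only differences from utils-original.py are the audioread import and duration_detector, which both prints and returns the duration (a sketch, not part of the commit; the path is a placeholder):

from src.utils import duration_detector

hours, mins, seconds = duration_detector("audio.wav")
# prints e.g. "Total Duration: audio.wav:0:3:42"
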
vad.py
ADDED
@@ -0,0 +1,537 @@
from abc import ABC, abstractmethod
from collections import Counter, deque
import time

from typing import Any, Deque, Iterator, List, Dict

from pprint import pprint
from src.modelCache import GLOBAL_MODEL_CACHE, ModelCache

from src.segments import merge_timestamps
from src.whisperContainer import WhisperCallback

# Workaround for https://github.com/tensorflow/tensorflow/issues/48797
try:
    import tensorflow as tf
except ModuleNotFoundError:
    # TensorFlow is optional - ignore if it is not installed
    pass

import torch

import ffmpeg
import numpy as np

from src.utils import format_timestamp
from enum import Enum

class NonSpeechStrategy(Enum):
    """
    Ignore non-speech segments.
    """
    SKIP = 1
    """
    Just treat non-speech segments as speech.
    """
    CREATE_SEGMENT = 2
    """
    Expand speech segments into subsequent non-speech segments.
    """
    EXPAND_SEGMENT = 3

# Defaults for Silero
SPEECH_TRESHOLD = 0.3

# Minimum size of segments to process
MIN_SEGMENT_DURATION = 1

# The maximum time for texts from old segments to be used in the next segment
MAX_PROMPT_WINDOW = 0  # seconds (0 = disabled)
PROMPT_NO_SPEECH_PROB = 0.1  # Do not pass the text from segments with a no speech probability higher than this

VAD_MAX_PROCESSING_CHUNK = 60 * 60  # 60 minutes of audio

class TranscriptionConfig(ABC):
    def __init__(self, non_speech_strategy: NonSpeechStrategy = NonSpeechStrategy.SKIP,
                 segment_padding_left: float = None, segment_padding_right = None, max_silent_period: float = None,
                 max_merge_size: float = None, max_prompt_window: float = None, initial_segment_index = -1):
        self.non_speech_strategy = non_speech_strategy
        self.segment_padding_left = segment_padding_left
        self.segment_padding_right = segment_padding_right
        self.max_silent_period = max_silent_period
        self.max_merge_size = max_merge_size
        self.max_prompt_window = max_prompt_window
        self.initial_segment_index = initial_segment_index

class PeriodicTranscriptionConfig(TranscriptionConfig):
    def __init__(self, periodic_duration: float, non_speech_strategy: NonSpeechStrategy = NonSpeechStrategy.SKIP,
                 segment_padding_left: float = None, segment_padding_right = None, max_silent_period: float = None,
                 max_merge_size: float = None, max_prompt_window: float = None, initial_segment_index = -1):
        super().__init__(non_speech_strategy, segment_padding_left, segment_padding_right, max_silent_period, max_merge_size, max_prompt_window, initial_segment_index)
        self.periodic_duration = periodic_duration

class AbstractTranscription(ABC):
    def __init__(self, sampling_rate: int = 16000):
        self.sampling_rate = sampling_rate

    def get_audio_segment(self, audio: str, start_time: str = None, duration: str = None):
        return load_audio(audio, self.sampling_rate, start_time, duration)

    def is_transcribe_timestamps_fast(self):
        """
        Determine if get_transcribe_timestamps is fast enough to not need parallelization.
        """
        return False

    @abstractmethod
    def get_transcribe_timestamps(self, audio: str, config: TranscriptionConfig, start_time: float, end_time: float):
        """
        Get the start and end timestamps of the sections that should be transcribed by this VAD method.

        Parameters
        ----------
        audio: str
            The audio file.
        config: TranscriptionConfig
            The transcription configuration.

        Returns
        -------
        A list of start and end timestamps, in fractional seconds.
        """
        return

    def get_merged_timestamps(self, timestamps: List[Dict[str, Any]], config: TranscriptionConfig, total_duration: float):
        """
        Get the start and end timestamps of the sections that should be transcribed by this VAD method,
        after merging the given segments using the specified configuration.

        Parameters
        ----------
        timestamps: List[Dict[str, Any]]
            The detected speech segments.
        config: TranscriptionConfig
            The transcription configuration.

        Returns
        -------
        A list of start and end timestamps, in fractional seconds.
        """
        merged = merge_timestamps(timestamps, config.max_silent_period, config.max_merge_size,
                                  config.segment_padding_left, config.segment_padding_right)

        if config.non_speech_strategy != NonSpeechStrategy.SKIP:
            # Expand segments to include the gaps between them
            if (config.non_speech_strategy == NonSpeechStrategy.CREATE_SEGMENT):
                # When we have a prompt window, we create speech segments between each segment if we exceed the merge size
                merged = self.fill_gaps(merged, total_duration=total_duration, max_expand_size=config.max_merge_size)
            elif config.non_speech_strategy == NonSpeechStrategy.EXPAND_SEGMENT:
                # With no prompt window, it is better to just expand the segments (this effectively passes the prompt to the next segment)
                merged = self.expand_gaps(merged, total_duration=total_duration)
            else:
                raise Exception("Unknown non-speech strategy: " + str(config.non_speech_strategy))

            print("Transcribing non-speech:")
            pprint(merged)
        return merged

    def transcribe(self, audio: str, whisperCallable: WhisperCallback, config: TranscriptionConfig):
        """
        Transcribe the given audio file.

        Parameters
        ----------
        audio: str
            The audio file.
        whisperCallable: WhisperCallback
            A callback object to call to transcribe each segment.

        Returns
        -------
        The merged transcription result: a dict with 'text', 'segments' and 'language'.
        """

        max_audio_duration = get_audio_duration(audio)
        timestamp_segments = self.get_transcribe_timestamps(audio, config, 0, max_audio_duration)

        # Get speech timestamps from full audio file
        merged = self.get_merged_timestamps(timestamp_segments, config, max_audio_duration)

        # A deque of transcribed segments that is passed to the next segment as a prompt
        prompt_window = deque()

        print("Processing timestamps:")
        pprint(merged)

        result = {
            'text': "",
            'segments': [],
            'language': ""
        }
        languageCounter = Counter()
        detected_language = None

        segment_index = config.initial_segment_index

        # For each time segment, run whisper
        for segment in merged:
            segment_index += 1
            segment_start = segment['start']
            segment_end = segment['end']
            segment_expand_amount = segment.get('expand_amount', 0)
            segment_gap = segment.get('gap', False)

            segment_duration = segment_end - segment_start

            if segment_duration < MIN_SEGMENT_DURATION:
                continue

            # Audio to run on Whisper
            segment_audio = self.get_audio_segment(audio, start_time = str(segment_start), duration = str(segment_duration))
            # Previous segments to use as a prompt
            segment_prompt = ' '.join([segment['text'] for segment in prompt_window]) if len(prompt_window) > 0 else None

            # Detected language
            detected_language = languageCounter.most_common(1)[0][0] if len(languageCounter) > 0 else None

            print("Running whisper from ", format_timestamp(segment_start), " to ", format_timestamp(segment_end), ", duration: ",
                  segment_duration, "expanded: ", segment_expand_amount, "prompt: ", segment_prompt, "language: ", detected_language)
            segment_result = whisperCallable.invoke(segment_audio, segment_index, segment_prompt, detected_language)

            adjusted_segments = self.adjust_timestamp(segment_result["segments"], adjust_seconds=segment_start, max_source_time=segment_duration)

            # Propagate expand amount to the segments
            if (segment_expand_amount > 0):
                segment_without_expansion = segment_duration - segment_expand_amount

                for adjusted_segment in adjusted_segments:
                    adjusted_segment_end = adjusted_segment['end']

                    # Add expand amount if the segment got expanded
                    if (adjusted_segment_end > segment_without_expansion):
                        adjusted_segment["expand_amount"] = adjusted_segment_end - segment_without_expansion

            # Append to output
            result['text'] += segment_result['text']
            result['segments'].extend(adjusted_segments)

            # Increment detected language
            if not segment_gap:
                languageCounter[segment_result['language']] += 1

            # Update prompt window
            self.__update_prompt_window(prompt_window, adjusted_segments, segment_end, segment_gap, config)

        if detected_language is not None:
            result['language'] = detected_language

        return result

    def __update_prompt_window(self, prompt_window: Deque, adjusted_segments: List, segment_end: float, segment_gap: bool, config: TranscriptionConfig):
        if (config.max_prompt_window is not None and config.max_prompt_window > 0):
            # Add segments to the current prompt window (unless it is a speech gap)
            if not segment_gap:
                for segment in adjusted_segments:
                    if segment.get('no_speech_prob', 0) <= PROMPT_NO_SPEECH_PROB:
                        prompt_window.append(segment)

            while (len(prompt_window) > 0):
                first_end_time = prompt_window[0].get('end', 0)
                # Time expanded in the segments should be discounted from the prompt window
                first_expand_time = prompt_window[0].get('expand_amount', 0)

                if (first_end_time - first_expand_time < segment_end - config.max_prompt_window):
                    prompt_window.popleft()
                else:
                    break

    def include_gaps(self, segments: Iterator[dict], min_gap_length: float, total_duration: float):
        result = []
        last_end_time = 0

        for segment in segments:
            segment_start = float(segment['start'])
            segment_end = float(segment['end'])

            if (last_end_time != segment_start):
                delta = segment_start - last_end_time

                if (min_gap_length is None or delta >= min_gap_length):
                    result.append( { 'start': last_end_time, 'end': segment_start, 'gap': True } )

            last_end_time = segment_end
            result.append(segment)

        # Also include total duration if specified
        if (total_duration is not None and last_end_time < total_duration):
            delta = total_duration - last_end_time

            if (min_gap_length is None or delta >= min_gap_length):
                result.append( { 'start': last_end_time, 'end': total_duration, 'gap': True } )

        return result

    # Expand the end time of each segment to the start of the next segment
    def expand_gaps(self, segments: List[Dict[str, Any]], total_duration: float):
        result = []

        if len(segments) == 0:
            return result

        # Add gap at the beginning if needed
        if (segments[0]['start'] > 0):
            result.append({ 'start': 0, 'end': segments[0]['start'], 'gap': True } )

        for i in range(len(segments) - 1):
            current_segment = segments[i]
            next_segment = segments[i + 1]

            delta = next_segment['start'] - current_segment['end']

            # Expand if the gap actually exists
            if (delta >= 0):
                current_segment = current_segment.copy()
                current_segment['expand_amount'] = delta
                current_segment['end'] = next_segment['start']

            result.append(current_segment)

        # Add last segment
        last_segment = segments[-1]
        result.append(last_segment)

        # Also include total duration if specified
        if (total_duration is not None):
            last_segment = result[-1]

            if (last_segment['end'] < total_duration):
                last_segment = last_segment.copy()
                last_segment['end'] = total_duration
                result[-1] = last_segment

        return result

    def fill_gaps(self, segments: List[Dict[str, Any]], total_duration: float, max_expand_size: float = None):
        result = []

        if len(segments) == 0:
            return result

        # Add gap at the beginning if needed
        if (segments[0]['start'] > 0):
            result.append({ 'start': 0, 'end': segments[0]['start'], 'gap': True } )

        for i in range(len(segments) - 1):
            expanded = False
            current_segment = segments[i]
            next_segment = segments[i + 1]

            delta = next_segment['start'] - current_segment['end']

            if (max_expand_size is not None and delta <= max_expand_size):
                # Just expand the current segment
                current_segment = current_segment.copy()
                current_segment['expand_amount'] = delta
                current_segment['end'] = next_segment['start']
                expanded = True

            result.append(current_segment)

            # Add a gap to the next segment if needed
            if (delta >= 0 and not expanded):
                result.append({ 'start': current_segment['end'], 'end': next_segment['start'], 'gap': True } )

        # Add last segment
        last_segment = segments[-1]
        result.append(last_segment)

        # Also include total duration if specified
        if (total_duration is not None):
            last_segment = result[-1]

            delta = total_duration - last_segment['end']

            if (delta > 0):
                if (max_expand_size is not None and delta <= max_expand_size):
                    # Expand the last segment
                    last_segment = last_segment.copy()
                    last_segment['expand_amount'] = delta
                    last_segment['end'] = total_duration
                    result[-1] = last_segment
                else:
                    result.append({ 'start': last_segment['end'], 'end': total_duration, 'gap': True } )

        return result

    def adjust_timestamp(self, segments: Iterator[dict], adjust_seconds: float, max_source_time: float = None):
        result = []

        for segment in segments:
            segment_start = float(segment['start'])
            segment_end = float(segment['end'])

            # Filter out segments that start after the maximum source time
            if (max_source_time is not None):
                if (segment_start > max_source_time):
                    continue
                segment_end = min(max_source_time, segment_end)

            new_segment = segment.copy()

            # Add to start and end
            new_segment['start'] = segment_start + adjust_seconds
            new_segment['end'] = segment_end + adjust_seconds
            result.append(new_segment)
        return result

    def multiply_timestamps(self, timestamps: List[Dict[str, Any]], factor: float):
        result = []

        for entry in timestamps:
            start = entry['start']
            end = entry['end']

            result.append({
                'start': start * factor,
                'end': end * factor
            })
        return result


class VadSileroTranscription(AbstractTranscription):
    def __init__(self, sampling_rate: int = 16000, cache: ModelCache = None):
        super().__init__(sampling_rate=sampling_rate)
        self.model = None
        self.cache = cache
        self._initialize_model()

    def _initialize_model(self):
        if (self.cache is not None):
            model_key = "VadSileroTranscription"
            self.model, self.get_speech_timestamps = self.cache.get(model_key, self._create_model)
            print("Loaded Silero model from cache.")
        else:
            self.model, self.get_speech_timestamps = self._create_model()
            print("Created Silero model")

    def _create_model(self):
        model, utils = torch.hub.load(repo_or_dir='snakers4/silero-vad', model='silero_vad')

        # Silero does not benefit from multi-threading
        torch.set_num_threads(1)  # JIT
        (get_speech_timestamps, _, _, _, _) = utils

        return model, get_speech_timestamps

    def get_transcribe_timestamps(self, audio: str, config: TranscriptionConfig, start_time: float, end_time: float):
        result = []

        print("Getting timestamps from audio file: {}, start: {}, duration: {}".format(audio, start_time, end_time))
        perf_start_time = time.perf_counter()

        # Divide processing of the audio into chunks
        chunk_start = start_time

        while (chunk_start < end_time):
            chunk_duration = min(end_time - chunk_start, VAD_MAX_PROCESSING_CHUNK)

            print("Processing VAD in chunk from {} to {}".format(format_timestamp(chunk_start), format_timestamp(chunk_start + chunk_duration)))
            wav = self.get_audio_segment(audio, str(chunk_start), str(chunk_duration))

            sample_timestamps = self.get_speech_timestamps(wav, self.model, sampling_rate=self.sampling_rate, threshold=SPEECH_TRESHOLD)
            seconds_timestamps = self.multiply_timestamps(sample_timestamps, factor=1 / self.sampling_rate)
            adjusted = self.adjust_timestamp(seconds_timestamps, adjust_seconds=chunk_start, max_source_time=chunk_start + chunk_duration)

            #pprint(adjusted)

            result.extend(adjusted)
            chunk_start += chunk_duration

        perf_end_time = time.perf_counter()
        print("VAD processing took {} seconds".format(perf_end_time - perf_start_time))

        return result

    def __getstate__(self):
        # We only need the sampling rate
        return { 'sampling_rate': self.sampling_rate }

    def __setstate__(self, state):
        self.sampling_rate = state['sampling_rate']
        self.model = None
        # Use the global cache
        self.cache = GLOBAL_MODEL_CACHE
        self._initialize_model()

# A very simple VAD that just marks every N seconds as speech
class VadPeriodicTranscription(AbstractTranscription):
    def __init__(self, sampling_rate: int = 16000):
        super().__init__(sampling_rate=sampling_rate)

    def is_transcribe_timestamps_fast(self):
        # This is a very fast VAD - no need to parallelize it
        return True

    def get_transcribe_timestamps(self, audio: str, config: PeriodicTranscriptionConfig, start_time: float, end_time: float):
        result = []

        # Generate a timestamp every N seconds
        start_timestamp = start_time

        while (start_timestamp < end_time):
            end_timestamp = min(start_timestamp + config.periodic_duration, end_time)
            segment_duration = end_timestamp - start_timestamp

            # Minimum duration is 1 second
            if (segment_duration >= 1):
                result.append( { 'start': start_timestamp, 'end': end_timestamp } )

            start_timestamp = end_timestamp

        return result

def get_audio_duration(file: str):
    return float(ffmpeg.probe(file)["format"]["duration"])

def load_audio(file: str, sample_rate: int = 16000,
               start_time: str = None, duration: str = None):
    """
    Open an audio file and read as mono waveform, resampling as necessary

    Parameters
    ----------
    file: str
        The audio file to open

    sample_rate: int
        The sample rate to resample the audio if necessary

    start_time: str
        The start time, using the standard FFMPEG time duration syntax, or None to disable.

    duration: str
        The duration, using the standard FFMPEG time duration syntax, or None to disable.

    Returns
    -------
    A NumPy array containing the audio waveform, in float32 dtype.
    """
    try:
        inputArgs = {'threads': 0}

        if (start_time is not None):
            inputArgs['ss'] = start_time
        if (duration is not None):
            inputArgs['t'] = duration

        # This launches a subprocess to decode audio while down-mixing and resampling as necessary.
        # Requires the ffmpeg CLI and `ffmpeg-python` package to be installed.
        out, _ = (
            ffmpeg.input(file, **inputArgs)
            .output("-", format="s16le", acodec="pcm_s16le", ac=1, ar=sample_rate)
            .run(cmd="ffmpeg", capture_stdout=True, capture_stderr=True)
        )
    except ffmpeg.Error as e:
        raise RuntimeError(f"Failed to load audio: {e.stderr.decode()}")

    return np.frombuffer(out, np.int16).flatten().astype(np.float32) / 32768.0
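
A usage sketch (not part of the commit): detect speech with Silero, then merge into Whisper-sized chunks. It assumes the snakers4/silero-vad model can be fetched via torch.hub and that audio.wav exists; TranscriptionConfig has no abstract methods, so it can be instantiated directly here.

from src.vad import VadSileroTranscription, TranscriptionConfig, get_audio_duration

vad = VadSileroTranscription()
config = TranscriptionConfig(max_silent_period=30, max_merge_size=30,
                             segment_padding_left=1, segment_padding_right=1)
duration = get_audio_duration("audio.wav")
speech = vad.get_transcribe_timestamps("audio.wav", config, 0, duration)
chunks = vad.get_merged_timestamps(speech, config, duration)
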
vadParallel.py
ADDED
@@ -0,0 +1,255 @@
import multiprocessing
import threading
import time
from src.vad import AbstractTranscription, TranscriptionConfig, get_audio_duration
from src.whisperContainer import WhisperCallback

from multiprocessing import Pool

from typing import Any, Dict, List
import os


class ParallelContext:
    def __init__(self, num_processes: int = None, auto_cleanup_timeout_seconds: float = None):
        self.num_processes = num_processes
        self.auto_cleanup_timeout_seconds = auto_cleanup_timeout_seconds
        self.lock = threading.Lock()

        self.ref_count = 0
        self.pool = None
        self.cleanup_timer = None

    def get_pool(self):
        # Initialize pool lazily
        if (self.pool is None):
            context = multiprocessing.get_context('spawn')
            self.pool = context.Pool(self.num_processes)

        self.ref_count = self.ref_count + 1

        if (self.auto_cleanup_timeout_seconds is not None):
            self._stop_auto_cleanup()

        return self.pool

    def return_pool(self, pool):
        if (self.pool == pool and self.ref_count > 0):
            self.ref_count = self.ref_count - 1

            if (self.ref_count == 0):
                if (self.auto_cleanup_timeout_seconds is not None):
                    self._start_auto_cleanup()

    def _start_auto_cleanup(self):
        if (self.cleanup_timer is not None):
            self.cleanup_timer.cancel()
        self.cleanup_timer = threading.Timer(self.auto_cleanup_timeout_seconds, self._execute_cleanup)
        self.cleanup_timer.start()

        print("Started auto cleanup of pool in " + str(self.auto_cleanup_timeout_seconds) + " seconds")

    def _stop_auto_cleanup(self):
        if (self.cleanup_timer is not None):
            self.cleanup_timer.cancel()
            self.cleanup_timer = None

            print("Stopped auto cleanup of pool")

    def _execute_cleanup(self):
        print("Executing cleanup of pool")

        if (self.ref_count == 0):
            self.close()

    def close(self):
        self._stop_auto_cleanup()

        if (self.pool is not None):
            print("Closing pool of " + str(self.num_processes) + " processes")
            self.pool.close()
            self.pool.join()
        self.pool = None

class ParallelTranscriptionConfig(TranscriptionConfig):
    def __init__(self, device_id: str, override_timestamps, initial_segment_index, copy: TranscriptionConfig = None):
        super().__init__(copy.non_speech_strategy, copy.segment_padding_left, copy.segment_padding_right, copy.max_silent_period, copy.max_merge_size, copy.max_prompt_window, initial_segment_index)
        self.device_id = device_id
        self.override_timestamps = override_timestamps

class ParallelTranscription(AbstractTranscription):
    # Silero VAD typically takes about 3 seconds per minute, so there's no need to split the chunks
    # into segments smaller than 2 minutes (min 6 seconds per CPU core)
    MIN_CPU_CHUNK_SIZE_SECONDS = 2 * 60

    def __init__(self, sampling_rate: int = 16000):
        super().__init__(sampling_rate=sampling_rate)

    def transcribe_parallel(self, transcription: AbstractTranscription, audio: str, whisperCallable: WhisperCallback, config: TranscriptionConfig,
                            cpu_device_count: int, gpu_devices: List[str], cpu_parallel_context: ParallelContext = None, gpu_parallel_context: ParallelContext = None):
        total_duration = get_audio_duration(audio)

        # First, get the timestamps for the original audio
        if (cpu_device_count > 1 and not transcription.is_transcribe_timestamps_fast()):
            merged = self._get_merged_timestamps_parallel(transcription, audio, config, total_duration, cpu_device_count, cpu_parallel_context)
        else:
            timestamp_segments = transcription.get_transcribe_timestamps(audio, config, 0, total_duration)
            merged = transcription.get_merged_timestamps(timestamp_segments, config, total_duration)

        # We must make sure the whisper model is downloaded
        if (len(gpu_devices) > 1):
            whisperCallable.model_container.ensure_downloaded()

        # Split into a list for each device
        # TODO: Split by time instead of by number of chunks
        merged_split = list(self._split(merged, len(gpu_devices)))

        # Parameters that will be passed to the transcribe function
        parameters = []
        segment_index = config.initial_segment_index

        for i in range(len(gpu_devices)):
            # Note that device_segment_list can be empty. But we will still create a process for it,
            # as otherwise we run the risk of assigning the same device to multiple processes.
            device_segment_list = list(merged_split[i]) if i < len(merged_split) else []
            device_id = gpu_devices[i]

            print("Device " + str(device_id) + " (index " + str(i) + ") has " + str(len(device_segment_list)) + " segments")

            # Create a new config with the given device ID
            device_config = ParallelTranscriptionConfig(device_id, device_segment_list, segment_index, config)
            segment_index += len(device_segment_list)

            parameters.append([audio, whisperCallable, device_config])

        merged = {
            'text': '',
            'segments': [],
            'language': None
        }

        created_context = False

        perf_start_gpu = time.perf_counter()

        # Spawn a separate process for each device
        try:
            if (gpu_parallel_context is None):
                gpu_parallel_context = ParallelContext(len(gpu_devices))
                created_context = True

            # Get a pool of processes
            pool = gpu_parallel_context.get_pool()

            # Run the transcription in parallel
            results = pool.starmap(self.transcribe, parameters)

            for result in results:
                # Merge the results
                if (result['text'] is not None):
                    merged['text'] += result['text']
                if (result['segments'] is not None):
                    merged['segments'].extend(result['segments'])
                if (result['language'] is not None):
                    merged['language'] = result['language']

        finally:
            # Return the pool to the context
            if (gpu_parallel_context is not None):
                gpu_parallel_context.return_pool(pool)
            # Always close the context if we created it
            if (created_context):
                gpu_parallel_context.close()

        perf_end_gpu = time.perf_counter()
        print("Parallel transcription took " + str(perf_end_gpu - perf_start_gpu) + " seconds")

        return merged

    def _get_merged_timestamps_parallel(self, transcription: AbstractTranscription, audio: str, config: TranscriptionConfig, total_duration: float,
                                        cpu_device_count: int, cpu_parallel_context: ParallelContext = None):
        parameters = []

        chunk_size = max(total_duration / cpu_device_count, self.MIN_CPU_CHUNK_SIZE_SECONDS)
        chunk_start = 0
        cpu_device_id = 0

        perf_start_time = time.perf_counter()

        # Create chunks that will be processed on the CPU
        while (chunk_start < total_duration):
            chunk_end = min(chunk_start + chunk_size, total_duration)

            if (chunk_end - chunk_start < 1):
                # No need to process chunks that are less than 1 second
                break

            print("Parallel VAD: Executing chunk from " + str(chunk_start) + " to " +
                  str(chunk_end) + " on CPU device " + str(cpu_device_id))
            parameters.append([audio, config, chunk_start, chunk_end])

            cpu_device_id += 1
            chunk_start = chunk_end

        created_context = False

        # Spawn a separate process for each device
        try:
            if (cpu_parallel_context is None):
                cpu_parallel_context = ParallelContext(cpu_device_count)
                created_context = True

            # Get a pool of processes
            pool = cpu_parallel_context.get_pool()

            # Run the transcription in parallel. Note that transcription must be picklable.
            results = pool.starmap(transcription.get_transcribe_timestamps, parameters)

            timestamps = []

            # Flatten the results
            for result in results:
                timestamps.extend(result)

            merged = transcription.get_merged_timestamps(timestamps, config, total_duration)

            perf_end_time = time.perf_counter()
            print("Parallel VAD processing took {} seconds".format(perf_end_time - perf_start_time))
            return merged

        finally:
            # Return the pool to the context
            if (cpu_parallel_context is not None):
                cpu_parallel_context.return_pool(pool)
            # Always close the context if we created it
            if (created_context):
                cpu_parallel_context.close()

    def get_transcribe_timestamps(self, audio: str, config: ParallelTranscriptionConfig, start_time: float, duration: float):
        return []

    def get_merged_timestamps(self, timestamps: List[Dict[str, Any]], config: ParallelTranscriptionConfig, total_duration: float):
        # Override timestamps that will be processed
        if (config.override_timestamps is not None):
            print("Using override timestamps of size " + str(len(config.override_timestamps)))
            return config.override_timestamps
        return super().get_merged_timestamps(timestamps, config, total_duration)

    def transcribe(self, audio: str, whisperCallable: WhisperCallback, config: ParallelTranscriptionConfig):
        # Override device ID the first time
        if (os.environ.get("INITIALIZED", None) is None):
            os.environ["INITIALIZED"] = "1"

            # Note that this may be None if the user didn't specify a device. In that case, Whisper will
            # just use the default GPU device.
            if (config.device_id is not None):
                print("Using device " + config.device_id)
                os.environ["CUDA_VISIBLE_DEVICES"] = config.device_id

        return super().transcribe(audio, whisperCallable, config)

    def _split(self, a, n):
        """Split a list into n approximately equal parts."""
        k, m = divmod(len(a), n)
        return (a[i*k+min(i, m):(i+1)*k+min(i+1, m)] for i in range(n))
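
A sketch of how the pieces fit together (not part of the commit; it assumes two CUDA devices, and the model name and audio path are placeholders):

from src.vad import VadSileroTranscription, TranscriptionConfig
from src.vadParallel import ParallelTranscription
from src.whisperContainer import WhisperContainer

container = WhisperContainer("medium")
parallel = ParallelTranscription()
result = parallel.transcribe_parallel(
    transcription=VadSileroTranscription(),
    audio="audio.wav",
    whisperCallable=container.create_callback(task="transcribe"),
    config=TranscriptionConfig(max_silent_period=30, max_merge_size=30),
    cpu_device_count=4,
    gpu_devices=["0", "1"])
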
whisperContainer.py
ADDED
@@ -0,0 +1,127 @@
# External programs
import os
import whisper

from src.modelCache import GLOBAL_MODEL_CACHE, ModelCache

class WhisperContainer:
    def __init__(self, model_name: str, device: str = None, download_root: str = None, cache: ModelCache = None):
        self.model_name = model_name
        self.device = device
        self.download_root = download_root
        self.cache = cache

        # Will be created on demand
        self.model = None

    def get_model(self):
        if self.model is None:

            if (self.cache is None):
                self.model = self._create_model()
            else:
                model_key = "WhisperContainer." + self.model_name + ":" + (self.device if self.device else '')
                self.model = self.cache.get(model_key, self._create_model)
        return self.model

    def ensure_downloaded(self):
        """
        Ensure that the model is downloaded. This is useful if you want to ensure that the model is downloaded before
        passing the container to a subprocess.
        """
        # Warning: Using private API here
        try:
            root_dir = self.download_root

            if root_dir is None:
                root_dir = os.path.join(os.path.expanduser("~"), ".cache", "whisper")

            if self.model_name in whisper._MODELS:
                whisper._download(whisper._MODELS[self.model_name], root_dir, False)
            return True
        except Exception as e:
            # Given that the API is private, it could change at any time. We don't want to crash the program
            print("Error pre-downloading model: " + str(e))
            return False

    def _create_model(self):
        print("Loading whisper model " + self.model_name)
        return whisper.load_model(self.model_name, device=self.device, download_root=self.download_root)

    def create_callback(self, language: str = None, task: str = None, initial_prompt: str = None, **decodeOptions: dict):
        """
        Create a WhisperCallback object that can be used to transcribe audio files.

        Parameters
        ----------
        language: str
            The target language of the transcription. If not specified, the language will be inferred from the audio content.
        task: str
            The task - either translate or transcribe.
        initial_prompt: str
            The initial prompt to use for the transcription.
        decodeOptions: dict
            Additional options to pass to the decoder. Must be pickleable.

        Returns
        -------
        A WhisperCallback object.
        """
        return WhisperCallback(self, language=language, task=task, initial_prompt=initial_prompt, **decodeOptions)

    # This is required for multiprocessing
    def __getstate__(self):
        return { "model_name": self.model_name, "device": self.device, "download_root": self.download_root }

    def __setstate__(self, state):
        self.model_name = state["model_name"]
        self.device = state["device"]
        self.download_root = state["download_root"]
        self.model = None
        # Depickled objects must use the global cache
        self.cache = GLOBAL_MODEL_CACHE


class WhisperCallback:
    def __init__(self, model_container: WhisperContainer, language: str = None, task: str = None, initial_prompt: str = None, **decodeOptions: dict):
        self.model_container = model_container
        self.language = language
        self.task = task
        self.initial_prompt = initial_prompt
        self.decodeOptions = decodeOptions

    def invoke(self, audio, segment_index: int, prompt: str, detected_language: str):
        """
        Perform the transcription of the given audio file or data.

        Parameters
        ----------
        audio: Union[str, np.ndarray, torch.Tensor]
            The audio file to transcribe, or the audio data as a numpy array or torch tensor.
        segment_index: int
            The zero-based index of the segment; the initial prompt is only included for the first segment.
        prompt: str
            The prompt to use for the transcription.
        detected_language: str
            The detected language of the audio file.

        Returns
        -------
        The result of the Whisper call.
        """
        model = self.model_container.get_model()

        return model.transcribe(audio,
                                language=self.language if self.language else detected_language, task=self.task,
                                initial_prompt=self._concat_prompt(self.initial_prompt, prompt) if segment_index == 0 else prompt,
                                **self.decodeOptions)

    def _concat_prompt(self, prompt1, prompt2):
        if (prompt1 is None):
            return prompt2
        elif (prompt2 is None):
            return prompt1
        else:
            return prompt1 + " " + prompt2
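
A minimal end-to-end sketch (not part of the commit; the model name and audio path are placeholders): the container loads the model once via the global cache, and the callback drives openai-whisper's transcribe.

from src.modelCache import GLOBAL_MODEL_CACHE
from src.whisperContainer import WhisperContainer

container = WhisperContainer("medium", cache=GLOBAL_MODEL_CACHE)
callback = container.create_callback(language="en", task="transcribe")
result = callback.invoke("audio.wav", segment_index=0, prompt=None, detected_language=None)
print(result["text"])
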