aadnk commited on
Commit
5bbbb16
1 Parent(s): f5884f3

Cleanup code

Browse files
Files changed (3) hide show
  1. app.py +13 -13
  2. src/segments.py +9 -1
  3. src/vad.py +42 -49
app.py CHANGED
@@ -14,7 +14,7 @@ import gradio as gr
14
 
15
  from src.download import ExceededMaximumDuration, download_url
16
  from src.utils import slugify, write_srt, write_vtt
17
- from src.vad import NonSpeechStrategy, VadPeriodicTranscription, VadSileroTranscription
18
 
19
  # Limitations (set to -1 to disable)
20
  DEFAULT_INPUT_AUDIO_MAX_DURATION = 600 # seconds
@@ -96,38 +96,38 @@ class WhisperTranscriber:
96
  # The results
97
  if (vad == 'silero-vad'):
98
  # Silero VAD where non-speech gaps are transcribed
99
- process_gaps = self._create_silero_vad(NonSpeechStrategy.CREATE_SEGMENT, vadMergeWindow, vadMaxMergeSize, vadPadding, vadPromptWindow)
100
- result = process_gaps.transcribe(audio_path, whisperCallable)
101
  elif (vad == 'silero-vad-skip-gaps'):
102
  # Silero VAD where non-speech gaps are simply ignored
103
- skip_gaps = self._create_silero_vad(NonSpeechStrategy.SKIP, vadMergeWindow, vadMaxMergeSize, vadPadding, vadPromptWindow)
104
- result = skip_gaps.transcribe(audio_path, whisperCallable)
105
  elif (vad == 'silero-vad-expand-into-gaps'):
106
  # Use Silero VAD where speech-segments are expanded into non-speech gaps
107
- expand_gaps = self._create_silero_vad(NonSpeechStrategy.EXPAND_SEGMENT, vadMergeWindow, vadMaxMergeSize, vadPadding, vadPromptWindow)
108
- result = expand_gaps.transcribe(audio_path, whisperCallable)
109
  elif (vad == 'periodic-vad'):
110
  # Very simple VAD - mark every 5 minutes as speech. This makes it less likely that Whisper enters an infinite loop, but
111
  # it may create a break in the middle of a sentence, causing some artifacts.
112
- periodic_vad = VadPeriodicTranscription(periodic_duration=vadMaxMergeSize)
113
- result = periodic_vad.transcribe(audio_path, whisperCallable)
114
  else:
115
  # Default VAD
116
  result = whisperCallable(audio_path, None, None)
117
 
118
  return result
119
 
120
- def _create_silero_vad(self, non_speech_strategy: NonSpeechStrategy, vadMergeWindow: float = 5, vadMaxMergeSize: float = 150, vadPadding: float = 1, vadPromptWindow: float = 1):
121
  # Use Silero VAD
122
  if (self.vad_model is None):
123
  self.vad_model = VadSileroTranscription()
124
 
125
- result = VadSileroTranscription(non_speech_strategy = non_speech_strategy,
126
  max_silent_period=vadMergeWindow, max_merge_size=vadMaxMergeSize,
127
  segment_padding_left=vadPadding, segment_padding_right=vadPadding,
128
- max_prompt_window=vadPromptWindow, copy=self.vad_model)
129
 
130
- return result
131
 
132
  def write_result(self, result: dict, source_name: str, output_dir: str):
133
  if not os.path.exists(output_dir):
14
 
15
  from src.download import ExceededMaximumDuration, download_url
16
  from src.utils import slugify, write_srt, write_vtt
17
+ from src.vad import NonSpeechStrategy, PeriodicTranscriptionConfig, TranscriptionConfig, VadPeriodicTranscription, VadSileroTranscription
18
 
19
  # Limitations (set to -1 to disable)
20
  DEFAULT_INPUT_AUDIO_MAX_DURATION = 600 # seconds
96
  # The results
97
  if (vad == 'silero-vad'):
98
  # Silero VAD where non-speech gaps are transcribed
99
+ process_gaps = self._create_silero_config(NonSpeechStrategy.CREATE_SEGMENT, vadMergeWindow, vadMaxMergeSize, vadPadding, vadPromptWindow)
100
+ result = self.vad_model.transcribe(audio_path, whisperCallable, process_gaps)
101
  elif (vad == 'silero-vad-skip-gaps'):
102
  # Silero VAD where non-speech gaps are simply ignored
103
+ skip_gaps = self._create_silero_config(NonSpeechStrategy.SKIP, vadMergeWindow, vadMaxMergeSize, vadPadding, vadPromptWindow)
104
 + result = self.vad_model.transcribe(audio_path, whisperCallable, skip_gaps)
105
  elif (vad == 'silero-vad-expand-into-gaps'):
106
  # Use Silero VAD where speech-segments are expanded into non-speech gaps
107
+ expand_gaps = self._create_silero_config(NonSpeechStrategy.EXPAND_SEGMENT, vadMergeWindow, vadMaxMergeSize, vadPadding, vadPromptWindow)
108
 + result = self.vad_model.transcribe(audio_path, whisperCallable, expand_gaps)
109
  elif (vad == 'periodic-vad'):
110
  # Very simple VAD - mark every 5 minutes as speech. This makes it less likely that Whisper enters an infinite loop, but
111
  # it may create a break in the middle of a sentence, causing some artifacts.
112
+ periodic_vad = VadPeriodicTranscription()
113
+ result = periodic_vad.transcribe(audio_path, whisperCallable, PeriodicTranscriptionConfig(periodic_duration=vadMaxMergeSize, max_prompt_window=vadPromptWindow))
114
  else:
115
  # Default VAD
116
  result = whisperCallable(audio_path, None, None)
117
 
118
  return result
119
 
120
+ def _create_silero_config(self, non_speech_strategy: NonSpeechStrategy, vadMergeWindow: float = 5, vadMaxMergeSize: float = 150, vadPadding: float = 1, vadPromptWindow: float = 1):
121
  # Use Silero VAD
122
  if (self.vad_model is None):
123
  self.vad_model = VadSileroTranscription()
124
 
125
+ config = TranscriptionConfig(non_speech_strategy = non_speech_strategy,
126
  max_silent_period=vadMergeWindow, max_merge_size=vadMaxMergeSize,
127
  segment_padding_left=vadPadding, segment_padding_right=vadPadding,
128
+ max_prompt_window=vadPromptWindow)
129
 
130
+ return config
131
 
132
  def write_result(self, result: dict, source_name: str, output_dir: str):
133
  if not os.path.exists(output_dir):
src/segments.py CHANGED
@@ -7,6 +7,13 @@ def merge_timestamps(timestamps: List[Dict[str, Any]], merge_window: float = 5,
7
 
8
  if len(timestamps) == 0:
9
  return result
 
 
 
 
 
 
 
10
 
11
  processed_time = 0
12
  current_segment = None
@@ -17,7 +24,8 @@ def merge_timestamps(timestamps: List[Dict[str, Any]], merge_window: float = 5,
17
  delta = next_segment['start'] - processed_time
18
 
19
  # Note that segments can still be longer than the max merge size, they just won't be merged in that case
20
- if current_segment is None or delta > merge_window or next_segment['end'] - current_segment['start'] > max_merge_size:
 
21
  # Finish the current segment
22
  if current_segment is not None:
23
  # Add right padding
7
 
8
  if len(timestamps) == 0:
9
  return result
10
+ if max_merge_size is None:
11
+ return timestamps
12
+
13
+ if padding_left is None:
14
+ padding_left = 0
15
+ if padding_right is None:
16
+ padding_right = 0
17
 
18
  processed_time = 0
19
  current_segment = None
24
  delta = next_segment['start'] - processed_time
25
 
26
  # Note that segments can still be longer than the max merge size, they just won't be merged in that case
27
+ if current_segment is None or (merge_window is not None and delta > merge_window) \
28
+ or next_segment['end'] - current_segment['start'] > max_merge_size:
29
  # Finish the current segment
30
  if current_segment is not None:
31
  # Add right padding
src/vad.py CHANGED
@@ -38,45 +38,43 @@ class NonSpeechStrategy(Enum):
38
 
39
  # Defaults for Silero
40
  SPEECH_TRESHOLD = 0.3
41
- MAX_SILENT_PERIOD = 10 # seconds
42
- MAX_MERGE_SIZE = 150 # Do not create segments larger than 2.5 minutes
43
-
44
- # Default segment padding
45
- SEGMENT_PADDING_LEFT = 1 # Start detected text segment early
46
- SEGMENT_PADDING_RIGHT = 1 # End detected segments late
47
 
48
  # Minimum size of segments to process
49
  MIN_SEGMENT_DURATION = 1
50
 
51
- # Always merge segments that are less than this duration apart
52
- MIN_FORCE_MERGE_GAP = 0.5
53
- FORCE_MERGE_SEGMENT_MULTIPLIER = 1.5
54
-
55
  # The maximum time for texts from old segments to be used in the next segment
56
  MAX_PROMPT_WINDOW = 0 # seconds (0 = disabled)
57
  PROMPT_NO_SPEECH_PROB = 0.1 # Do not pass the text from segments with a no speech probability higher than this
58
 
59
  VAD_MAX_PROCESSING_CHUNK = 60 * 60 # 60 minutes of audio
60
 
61
- class AbstractTranscription(ABC):
62
- def __init__(self, segment_padding_left: float = None, segment_padding_right = None, max_silent_period: float = None,
63
- max_merge_size: float = None, non_speech_strategy: NonSpeechStrategy = NonSpeechStrategy.SKIP, max_prompt_window: float = None):
64
- self.sampling_rate = 16000
 
65
  self.segment_padding_left = segment_padding_left
66
  self.segment_padding_right = segment_padding_right
67
  self.max_silent_period = max_silent_period
68
  self.max_merge_size = max_merge_size
69
- self.non_speech_strategy = non_speech_strategy
70
  self.max_prompt_window = max_prompt_window
71
 
72
- self.min_force_merge_gap = MIN_FORCE_MERGE_GAP
73
- self.max_force_merge_size = max_merge_size * FORCE_MERGE_SEGMENT_MULTIPLIER if max_merge_size is not None else None
 
 
 
 
 
 
 
 
74
 
75
  def get_audio_segment(self, str, start_time: str = None, duration: str = None):
76
  return load_audio(str, self.sampling_rate, start_time, duration)
77
 
78
  @abstractmethod
79
- def get_transcribe_timestamps(self, audio: str):
80
  """
81
  Get the start and end timestamps of the sections that should be transcribed by this VAD method.
82
 
@@ -84,6 +82,8 @@ class AbstractTranscription(ABC):
84
  ----------
85
  audio: str
86
  The audio file.
 
 
87
 
88
  Returns
89
  -------
@@ -91,7 +91,7 @@ class AbstractTranscription(ABC):
91
  """
92
  return
93
 
94
- def transcribe(self, audio: str, whisperCallable):
95
  """
96
  Transcribe the given audio file.
97
 
@@ -110,12 +110,12 @@ class AbstractTranscription(ABC):
110
  """
111
 
112
  # get speech timestamps from full audio file
113
- seconds_timestamps = self.get_transcribe_timestamps(audio)
114
 
115
  #for seconds_timestamp in seconds_timestamps:
116
  # print("VAD timestamp ", format_timestamp(seconds_timestamp['start']), " to ", format_timestamp(seconds_timestamp['end']))
117
 
118
- merged = merge_timestamps(seconds_timestamps, self.max_silent_period, self.max_merge_size, self.segment_padding_left, self.segment_padding_right)
119
 
120
  # A deque of transcribed segments that is passed to the next segment as a prompt
121
  prompt_window = deque()
@@ -123,18 +123,18 @@ class AbstractTranscription(ABC):
123
  print("Timestamps:")
124
  pprint(merged)
125
 
126
- if self.non_speech_strategy != NonSpeechStrategy.SKIP:
127
  max_audio_duration = get_audio_duration(audio)
128
 
129
  # Expand segments to include the gaps between them
130
- if (self.non_speech_strategy == NonSpeechStrategy.CREATE_SEGMENT):
131
  # When we have a prompt window, we create speech segments between each segment if we exceed the merge size
132
- merged = self.fill_gaps(merged, total_duration=max_audio_duration, max_expand_size=self.max_merge_size)
133
- elif self.non_speech_strategy == NonSpeechStrategy.EXPAND_SEGMENT:
134
  # With no prompt window, it is better to just expand the segments (this effectively passes the prompt to the next segment)
135
  merged = self.expand_gaps(merged, total_duration=max_audio_duration)
136
  else:
137
- raise Exception("Unknown non-speech strategy: " + str(self.non_speech_strategy))
138
 
139
  print("Transcribing non-speech:")
140
  pprint(merged)
@@ -193,15 +193,15 @@ class AbstractTranscription(ABC):
193
  languageCounter[segment_result['language']] += 1
194
 
195
  # Update prompt window
196
- self.__update_prompt_window(prompt_window, adjusted_segments, segment_end, segment_gap)
197
 
198
  if detected_language is not None:
199
  result['language'] = detected_language
200
 
201
  return result
202
 
203
- def __update_prompt_window(self, prompt_window: Deque, adjusted_segments: List, segment_end: float, segment_gap: bool = False):
204
- if (self.max_prompt_window is not None and self.max_prompt_window > 0):
205
  # Add segments to the current prompt window (unless it is a speech gap)
206
  if not segment_gap:
207
  for segment in adjusted_segments:
@@ -213,7 +213,7 @@ class AbstractTranscription(ABC):
213
  # Time expanded in the segments should be discounted from the prompt window
214
  first_expand_time = prompt_window[0].get('expand_amount', 0)
215
 
216
- if (first_end_time - first_expand_time < segment_end - self.max_prompt_window):
217
  prompt_window.popleft()
218
  else:
219
  break
@@ -371,20 +371,14 @@ class AbstractTranscription(ABC):
371
  return result
372
 
373
  class VadSileroTranscription(AbstractTranscription):
374
- def __init__(self, segment_padding_left=SEGMENT_PADDING_LEFT, segment_padding_right=SEGMENT_PADDING_RIGHT,
375
- max_silent_period=MAX_SILENT_PERIOD, max_merge_size=MAX_MERGE_SIZE, non_speech_strategy: NonSpeechStrategy = NonSpeechStrategy.SKIP,
376
- max_prompt_window=MAX_PROMPT_WINDOW, copy = None):
377
- super().__init__(segment_padding_left=segment_padding_left, segment_padding_right=segment_padding_right,
378
- max_silent_period=max_silent_period, max_merge_size=max_merge_size, non_speech_strategy=non_speech_strategy, max_prompt_window=max_prompt_window)
379
-
380
- if copy:
381
- self.model = copy.model
382
- self.get_speech_timestamps = copy.get_speech_timestamps
383
- else:
384
- self.model, utils = torch.hub.load(repo_or_dir='snakers4/silero-vad', model='silero_vad')
385
- (self.get_speech_timestamps, _, _, _, _) = utils
386
-
387
- def get_transcribe_timestamps(self, audio: str):
388
  audio_duration = get_audio_duration(audio)
389
  result = []
390
 
@@ -410,11 +404,10 @@ class VadSileroTranscription(AbstractTranscription):
410
 
411
  # A very simple VAD that just marks every N seconds as speech
412
  class VadPeriodicTranscription(AbstractTranscription):
413
- def __init__(self, periodic_duration: float):
414
- super().__init__()
415
- self.periodic_duration = periodic_duration
416
 
417
- def get_transcribe_timestamps(self, audio: str):
418
  # Get duration in seconds
419
  audio_duration = get_audio_duration(audio)
420
  result = []
@@ -423,7 +416,7 @@ class VadPeriodicTranscription(AbstractTranscription):
423
  start_timestamp = 0
424
 
425
  while (start_timestamp < audio_duration):
426
- end_timestamp = min(start_timestamp + self.periodic_duration, audio_duration)
427
  segment_duration = end_timestamp - start_timestamp
428
 
429
  # Minimum duration is 1 second
38
 
39
  # Defaults for Silero
40
  SPEECH_TRESHOLD = 0.3
 
 
 
 
 
 
41
 
42
  # Minimum size of segments to process
43
  MIN_SEGMENT_DURATION = 1
44
 
 
 
 
 
45
  # The maximum time for texts from old segments to be used in the next segment
46
  MAX_PROMPT_WINDOW = 0 # seconds (0 = disabled)
47
  PROMPT_NO_SPEECH_PROB = 0.1 # Do not pass the text from segments with a no speech probability higher than this
48
 
49
  VAD_MAX_PROCESSING_CHUNK = 60 * 60 # 60 minutes of audio
50
 
51
+ class TranscriptionConfig(ABC):
52
+ def __init__(self, non_speech_strategy: NonSpeechStrategy = NonSpeechStrategy.SKIP,
53
+ segment_padding_left: float = None, segment_padding_right = None, max_silent_period: float = None,
54
+ max_merge_size: float = None, max_prompt_window: float = None):
55
+ self.non_speech_strategy = non_speech_strategy
56
  self.segment_padding_left = segment_padding_left
57
  self.segment_padding_right = segment_padding_right
58
  self.max_silent_period = max_silent_period
59
  self.max_merge_size = max_merge_size
 
60
  self.max_prompt_window = max_prompt_window
61
 
62
+ class PeriodicTranscriptionConfig(TranscriptionConfig):
63
+ def __init__(self, periodic_duration: float, non_speech_strategy: NonSpeechStrategy = NonSpeechStrategy.SKIP,
64
+ segment_padding_left: float = None, segment_padding_right = None, max_silent_period: float = None,
65
+ max_merge_size: float = None, max_prompt_window: float = None):
66
+ super().__init__(non_speech_strategy, segment_padding_left, segment_padding_right, max_silent_period, max_merge_size, max_prompt_window)
67
+ self.periodic_duration = periodic_duration
68
+
69
+ class AbstractTranscription(ABC):
70
+ def __init__(self, sampling_rate: int = 16000):
71
+ self.sampling_rate = sampling_rate
72
 
73
  def get_audio_segment(self, str, start_time: str = None, duration: str = None):
74
  return load_audio(str, self.sampling_rate, start_time, duration)
75
 
76
  @abstractmethod
77
+ def get_transcribe_timestamps(self, audio: str, config: TranscriptionConfig):
78
  """
79
  Get the start and end timestamps of the sections that should be transcribed by this VAD method.
80
 
82
  ----------
83
  audio: str
84
  The audio file.
85
+ config: TranscriptionConfig
86
+ The transcription configuration.
87
 
88
  Returns
89
  -------
91
  """
92
  return
93
 
94
+ def transcribe(self, audio: str, whisperCallable, config: TranscriptionConfig):
95
  """
96
  Transcribe the given audio file.
97
 
110
  """
111
 
112
  # get speech timestamps from full audio file
113
+ seconds_timestamps = self.get_transcribe_timestamps(audio, config)
114
 
115
  #for seconds_timestamp in seconds_timestamps:
116
  # print("VAD timestamp ", format_timestamp(seconds_timestamp['start']), " to ", format_timestamp(seconds_timestamp['end']))
117
 
118
+ merged = merge_timestamps(seconds_timestamps, config.max_silent_period, config.max_merge_size, config.segment_padding_left, config.segment_padding_right)
119
 
120
  # A deque of transcribed segments that is passed to the next segment as a prompt
121
  prompt_window = deque()
123
  print("Timestamps:")
124
  pprint(merged)
125
 
126
+ if config.non_speech_strategy != NonSpeechStrategy.SKIP:
127
  max_audio_duration = get_audio_duration(audio)
128
 
129
  # Expand segments to include the gaps between them
130
+ if (config.non_speech_strategy == NonSpeechStrategy.CREATE_SEGMENT):
131
  # When we have a prompt window, we create speech segments between each segment if we exceed the merge size
132
+ merged = self.fill_gaps(merged, total_duration=max_audio_duration, max_expand_size=config.max_merge_size)
133
+ elif config.non_speech_strategy == NonSpeechStrategy.EXPAND_SEGMENT:
134
  # With no prompt window, it is better to just expand the segments (this effectively passes the prompt to the next segment)
135
  merged = self.expand_gaps(merged, total_duration=max_audio_duration)
136
  else:
137
+ raise Exception("Unknown non-speech strategy: " + str(config.non_speech_strategy))
138
 
139
  print("Transcribing non-speech:")
140
  pprint(merged)
193
  languageCounter[segment_result['language']] += 1
194
 
195
  # Update prompt window
196
+ self.__update_prompt_window(prompt_window, adjusted_segments, segment_end, segment_gap, config)
197
 
198
  if detected_language is not None:
199
  result['language'] = detected_language
200
 
201
  return result
202
 
203
+ def __update_prompt_window(self, prompt_window: Deque, adjusted_segments: List, segment_end: float, segment_gap: bool, config: TranscriptionConfig):
204
+ if (config.max_prompt_window is not None and config.max_prompt_window > 0):
205
  # Add segments to the current prompt window (unless it is a speech gap)
206
  if not segment_gap:
207
  for segment in adjusted_segments:
213
  # Time expanded in the segments should be discounted from the prompt window
214
  first_expand_time = prompt_window[0].get('expand_amount', 0)
215
 
216
+ if (first_end_time - first_expand_time < segment_end - config.max_prompt_window):
217
  prompt_window.popleft()
218
  else:
219
  break
371
  return result
372
 
373
  class VadSileroTranscription(AbstractTranscription):
374
+ def __init__(self, sampling_rate: int = 16000):
375
+ super().__init__(sampling_rate=sampling_rate)
376
+
377
+ self.model, utils = torch.hub.load(repo_or_dir='snakers4/silero-vad', model='silero_vad')
378
+ (self.get_speech_timestamps, _, _, _, _) = utils
379
+
380
+
381
+ def get_transcribe_timestamps(self, audio: str, config: TranscriptionConfig):
 
 
 
 
 
 
382
  audio_duration = get_audio_duration(audio)
383
  result = []
384
 
404
 
405
  # A very simple VAD that just marks every N seconds as speech
406
  class VadPeriodicTranscription(AbstractTranscription):
407
+ def __init__(self, sampling_rate: int = 16000):
408
+ super().__init__(sampling_rate=sampling_rate)
 
409
 
410
+ def get_transcribe_timestamps(self, audio: str, config: PeriodicTranscriptionConfig):
411
  # Get duration in seconds
412
  audio_duration = get_audio_duration(audio)
413
  result = []
416
  start_timestamp = 0
417
 
418
  while (start_timestamp < audio_duration):
419
+ end_timestamp = min(start_timestamp + config.periodic_duration, audio_duration)
420
  segment_duration = end_timestamp - start_timestamp
421
 
422
  # Minimum duration is 1 second