aadnk committed on
Commit a8eb534
2 Parent(s): 0819c3a 43189ac

Merge branch 'main' of https://huggingface.co/spaces/aadnk/whisper-webui into main

Files changed (7)
  1. app.py +72 -31
  2. cli.py +17 -2
  3. config.json5 +10 -1
  4. src/config.py +11 -1
  5. src/utils.py +118 -8
  6. src/vad.py +8 -0
  7. src/whisper/whisperContainer.py +3 -2
app.py CHANGED
@@ -1,4 +1,5 @@
 from datetime import datetime
+import json
 import math
 from typing import Iterator, Union
 import argparse
@@ -28,7 +29,7 @@ import ffmpeg
 import gradio as gr
 
 from src.download import ExceededMaximumDuration, download_url
-from src.utils import slugify, write_srt, write_vtt
+from src.utils import optional_int, slugify, write_srt, write_vtt
 from src.vad import AbstractTranscription, NonSpeechStrategy, PeriodicTranscriptionConfig, TranscriptionConfig, VadPeriodicTranscription, VadSileroTranscription
 from src.whisper.abstractWhisperContainer import AbstractWhisperContainer
 from src.whisper.whisperFactory import create_whisper_container
@@ -84,37 +85,49 @@ class WhisperTranscriber:
             print("[Auto parallel] Using GPU devices " + str(self.parallel_device_list) + " and " + str(self.vad_cpu_cores) + " CPU cores for VAD/transcription.")
 
     # Entry function for the simple tab
-    def transcribe_webui_simple(self, modelName, languageName, urlData, multipleFiles, microphoneData, task, vad, vadMergeWindow, vadMaxMergeSize, vadPadding, vadPromptWindow):
-        return self.transcribe_webui_simple_progress(modelName, languageName, urlData, multipleFiles, microphoneData, task, vad, vadMergeWindow, vadMaxMergeSize, vadPadding, vadPromptWindow)
+    def transcribe_webui_simple(self, modelName, languageName, urlData, multipleFiles, microphoneData, task,
+                                vad, vadMergeWindow, vadMaxMergeSize,
+                                word_timestamps: bool = False, highlight_words: bool = False):
+        return self.transcribe_webui_simple_progress(modelName, languageName, urlData, multipleFiles, microphoneData, task,
+                                                     vad, vadMergeWindow, vadMaxMergeSize,
+                                                     word_timestamps, highlight_words)
 
     # Entry function for the simple tab progress
-    def transcribe_webui_simple_progress(self, modelName, languageName, urlData, multipleFiles, microphoneData, task, vad, vadMergeWindow, vadMaxMergeSize, vadPadding, vadPromptWindow,
-                                         progress=gr.Progress()):
+    def transcribe_webui_simple_progress(self, modelName, languageName, urlData, multipleFiles, microphoneData, task,
+                                         vad, vadMergeWindow, vadMaxMergeSize,
+                                         word_timestamps: bool = False, highlight_words: bool = False,
+                                         progress=gr.Progress()):
 
-        vadOptions = VadOptions(vad, vadMergeWindow, vadMaxMergeSize, vadPadding, vadPromptWindow, self.app_config.vad_initial_prompt_mode)
+        vadOptions = VadOptions(vad, vadMergeWindow, vadMaxMergeSize, self.app_config.vad_padding, self.app_config.vad_prompt_window, self.app_config.vad_initial_prompt_mode)
 
-        return self.transcribe_webui(modelName, languageName, urlData, multipleFiles, microphoneData, task, vadOptions, progress=progress)
+        return self.transcribe_webui(modelName, languageName, urlData, multipleFiles, microphoneData, task, vadOptions,
+                                     word_timestamps=word_timestamps, highlight_words=highlight_words, progress=progress)
 
     # Entry function for the full tab
     def transcribe_webui_full(self, modelName, languageName, urlData, multipleFiles, microphoneData, task,
                               vad, vadMergeWindow, vadMaxMergeSize, vadPadding, vadPromptWindow, vadInitialPromptMode,
+                              # Word timestamps
+                              word_timestamps: bool, highlight_words: bool, prepend_punctuations: str, append_punctuations: str,
                               initial_prompt: str, temperature: float, best_of: int, beam_size: int, patience: float, length_penalty: float, suppress_tokens: str,
                               condition_on_previous_text: bool, fp16: bool, temperature_increment_on_fallback: float,
                               compression_ratio_threshold: float, logprob_threshold: float, no_speech_threshold: float):
 
         return self.transcribe_webui_full_progress(modelName, languageName, urlData, multipleFiles, microphoneData, task,
                                                    vad, vadMergeWindow, vadMaxMergeSize, vadPadding, vadPromptWindow, vadInitialPromptMode,
+                                                   word_timestamps, highlight_words, prepend_punctuations, append_punctuations,
                                                    initial_prompt, temperature, best_of, beam_size, patience, length_penalty, suppress_tokens,
                                                    condition_on_previous_text, fp16, temperature_increment_on_fallback,
                                                    compression_ratio_threshold, logprob_threshold, no_speech_threshold)
 
     # Entry function for the full tab with progress
     def transcribe_webui_full_progress(self, modelName, languageName, urlData, multipleFiles, microphoneData, task,
                                        vad, vadMergeWindow, vadMaxMergeSize, vadPadding, vadPromptWindow, vadInitialPromptMode,
+                                       # Word timestamps
+                                       word_timestamps: bool, highlight_words: bool, prepend_punctuations: str, append_punctuations: str,
                                        initial_prompt: str, temperature: float, best_of: int, beam_size: int, patience: float, length_penalty: float, suppress_tokens: str,
                                        condition_on_previous_text: bool, fp16: bool, temperature_increment_on_fallback: float,
                                        compression_ratio_threshold: float, logprob_threshold: float, no_speech_threshold: float,
                                        progress=gr.Progress()):
 
         # Handle temperature_increment_on_fallback
         if temperature_increment_on_fallback is not None:
@@ -128,13 +141,15 @@ class WhisperTranscriber:
             initial_prompt=initial_prompt, temperature=temperature, best_of=best_of, beam_size=beam_size, patience=patience, length_penalty=length_penalty, suppress_tokens=suppress_tokens,
             condition_on_previous_text=condition_on_previous_text, fp16=fp16,
             compression_ratio_threshold=compression_ratio_threshold, logprob_threshold=logprob_threshold, no_speech_threshold=no_speech_threshold,
+            word_timestamps=word_timestamps, prepend_punctuations=prepend_punctuations, append_punctuations=append_punctuations, highlight_words=highlight_words,
             progress=progress)
 
     def transcribe_webui(self, modelName, languageName, urlData, multipleFiles, microphoneData, task,
-                         vadOptions: VadOptions, progress: gr.Progress = None, **decodeOptions: dict):
+                         vadOptions: VadOptions, progress: gr.Progress = None, highlight_words: bool = False,
+                         **decodeOptions: dict):
         try:
             sources = self.__get_source(urlData, multipleFiles, microphoneData)
-
+
             try:
                 selectedLanguage = languageName.lower() if len(languageName) > 0 else None
                 selectedModel = modelName if modelName is not None else "base"
@@ -185,7 +200,7 @@ class WhisperTranscriber:
                 # Update progress
                 current_progress += source_audio_duration
 
-                source_download, source_text, source_vtt = self.write_result(result, filePrefix, outputDirectory)
+                source_download, source_text, source_vtt = self.write_result(result, filePrefix, outputDirectory, highlight_words)
 
                 if len(sources) > 1:
                     # Add new line separators
@@ -359,7 +374,7 @@ class WhisperTranscriber:
 
         return config
 
-    def write_result(self, result: dict, source_name: str, output_dir: str):
+    def write_result(self, result: dict, source_name: str, output_dir: str, highlight_words: bool = False):
         if not os.path.exists(output_dir):
             os.makedirs(output_dir)
 
@@ -368,13 +383,15 @@ class WhisperTranscriber:
         languageMaxLineWidth = self.__get_max_line_width(language)
 
         print("Max line width " + str(languageMaxLineWidth))
-        vtt = self.__get_subs(result["segments"], "vtt", languageMaxLineWidth)
-        srt = self.__get_subs(result["segments"], "srt", languageMaxLineWidth)
+        vtt = self.__get_subs(result["segments"], "vtt", languageMaxLineWidth, highlight_words=highlight_words)
+        srt = self.__get_subs(result["segments"], "srt", languageMaxLineWidth, highlight_words=highlight_words)
+        json_result = json.dumps(result, indent=4, ensure_ascii=False)
 
         output_files = []
         output_files.append(self.__create_file(srt, output_dir, source_name + "-subs.srt"));
         output_files.append(self.__create_file(vtt, output_dir, source_name + "-subs.vtt"));
         output_files.append(self.__create_file(text, output_dir, source_name + "-transcript.txt"));
+        output_files.append(self.__create_file(json_result, output_dir, source_name + "-result.json"));
 
         return output_files, text, vtt
 
@@ -394,13 +411,13 @@ class WhisperTranscriber:
         # 80 latin characters should fit on a 1080p/720p screen
         return 80
 
-    def __get_subs(self, segments: Iterator[dict], format: str, maxLineWidth: int) -> str:
+    def __get_subs(self, segments: Iterator[dict], format: str, maxLineWidth: int, highlight_words: bool = False) -> str:
        segmentStream = StringIO()
 
        if format == 'vtt':
-            write_vtt(segments, file=segmentStream, maxLineWidth=maxLineWidth)
+            write_vtt(segments, file=segmentStream, maxLineWidth=maxLineWidth, highlight_words=highlight_words)
        elif format == 'srt':
-            write_srt(segments, file=segmentStream, maxLineWidth=maxLineWidth)
+            write_srt(segments, file=segmentStream, maxLineWidth=maxLineWidth, highlight_words=highlight_words)
        else:
            raise Exception("Unknown format " + format)
 
@@ -460,24 +477,34 @@ def create_ui(app_config: ApplicationConfig):
 
    whisper_models = app_config.get_model_names()
 
-    simple_inputs = lambda : [
+    common_inputs = lambda : [
        gr.Dropdown(choices=whisper_models, value=app_config.default_model_name, label="Model"),
        gr.Dropdown(choices=sorted(get_language_names()), label="Language", value=app_config.language),
        gr.Text(label="URL (YouTube, etc.)"),
        gr.File(label="Upload Files", file_count="multiple"),
        gr.Audio(source="microphone", type="filepath", label="Microphone Input"),
        gr.Dropdown(choices=["transcribe", "translate"], label="Task", value=app_config.task),
+    ]
+
+    common_vad_inputs = lambda : [
        gr.Dropdown(choices=["none", "silero-vad", "silero-vad-skip-gaps", "silero-vad-expand-into-gaps", "periodic-vad"], value=app_config.default_vad, label="VAD"),
        gr.Number(label="VAD - Merge Window (s)", precision=0, value=app_config.vad_merge_window),
        gr.Number(label="VAD - Max Merge Size (s)", precision=0, value=app_config.vad_max_merge_size),
-        gr.Number(label="VAD - Padding (s)", precision=None, value=app_config.vad_padding),
-        gr.Number(label="VAD - Prompt Window (s)", precision=None, value=app_config.vad_prompt_window),
+    ]
+
+    common_word_timestamps_inputs = lambda : [
+        gr.Checkbox(label="Word Timestamps", value=app_config.word_timestamps),
+        gr.Checkbox(label="Word Timestamps - Highlight Words", value=app_config.highlight_words),
    ]
 
    is_queue_mode = app_config.queue_concurrency_count is not None and app_config.queue_concurrency_count > 0
 
    simple_transcribe = gr.Interface(fn=ui.transcribe_webui_simple_progress if is_queue_mode else ui.transcribe_webui_simple,
-        description=ui_description, article=ui_article, inputs=simple_inputs(), outputs=[
+        description=ui_description, article=ui_article, inputs=[
+            *common_inputs(),
+            *common_vad_inputs(),
+            *common_word_timestamps_inputs(),
+        ], outputs=[
            gr.File(label="Download"),
            gr.Text(label="Transcription"),
            gr.Text(label="Segments")
@@ -487,8 +514,17 @@ def create_ui(app_config: ApplicationConfig):
 
    full_transcribe = gr.Interface(fn=ui.transcribe_webui_full_progress if is_queue_mode else ui.transcribe_webui_full,
        description=full_description, article=ui_article, inputs=[
-            *simple_inputs(),
+            *common_inputs(),
+
+            *common_vad_inputs(),
+            gr.Number(label="VAD - Padding (s)", precision=None, value=app_config.vad_padding),
+            gr.Number(label="VAD - Prompt Window (s)", precision=None, value=app_config.vad_prompt_window),
            gr.Dropdown(choices=["prepend_first_segment", "prepend_all_segments"], value=app_config.vad_initial_prompt_mode, label="VAD - Initial Prompt Mode"),
+
+            *common_word_timestamps_inputs(),
+            gr.Text(label="Word Timestamps - Prepend Punctuations", value=app_config.prepend_punctuations),
+            gr.Text(label="Word Timestamps - Append Punctuations", value=app_config.append_punctuations),
+
            gr.TextArea(label="Initial Prompt"),
            gr.Number(label="Temperature", value=app_config.temperature),
            gr.Number(label="Best Of - Non-zero temperature", value=app_config.best_of, precision=0),
@@ -501,7 +537,7 @@ def create_ui(app_config: ApplicationConfig):
            gr.Number(label="Temperature increment on fallback", value=app_config.temperature_increment_on_fallback),
            gr.Number(label="Compression ratio threshold", value=app_config.compression_ratio_threshold),
            gr.Number(label="Logprob threshold", value=app_config.logprob_threshold),
-            gr.Number(label="No speech threshold", value=app_config.no_speech_threshold)
+            gr.Number(label="No speech threshold", value=app_config.no_speech_threshold),
        ], outputs=[
            gr.File(label="Download"),
            gr.Text(label="Transcription"),
@@ -560,9 +596,14 @@ if __name__ == '__main__':
                        help="the Whisper implementation to use")
    parser.add_argument("--compute_type", type=str, default=default_app_config.compute_type, choices=["default", "auto", "int8", "int8_float16", "int16", "float16", "float32"], \
                        help="the compute type to use for inference")
+    parser.add_argument("--threads", type=optional_int, default=0,
+                        help="number of threads used by torch for CPU inference; supercedes MKL_NUM_THREADS/OMP_NUM_THREADS")
 
    args = parser.parse_args().__dict__
 
    updated_config = default_app_config.update(**args)
 
+    if (threads := args.pop("threads")) > 0:
+        torch.set_num_threads(threads)
+
    create_ui(app_config=updated_config)
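
With `write_result` now also serializing the raw result dict, every transcription gains a `<source>-result.json` download alongside the .srt/.vtt/.txt files. A minimal sketch of reading the word-level timestamps back out of that file (the file name here is illustrative; the `words` field layout follows Whisper's word_timestamps output):

```python
import json

# "audio-result.json" is a placeholder; write_result names the file
# "<source_name>-result.json" inside the chosen output directory.
with open("audio-result.json", "r", encoding="utf-8") as f:
    result = json.load(f)

for segment in result["segments"]:
    # 'words' is only present when word timestamps were enabled
    for word in segment.get("words", []):
        print(f"{word['start']:7.2f} {word['end']:7.2f}  {word['word'].strip()}")
```
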
cli.py CHANGED
@@ -95,6 +95,17 @@ def cli():
     parser.add_argument("--no_speech_threshold", type=optional_float, default=app_config.no_speech_threshold, \
                         help="if the probability of the <|nospeech|> token is higher than this value AND the decoding has failed due to `logprob_threshold`, consider the segment as silence")
 
+    parser.add_argument("--word_timestamps", type=str2bool, default=app_config.word_timestamps,
+                        help="(experimental) extract word-level timestamps and refine the results based on them")
+    parser.add_argument("--prepend_punctuations", type=str, default=app_config.prepend_punctuations,
+                        help="if word_timestamps is True, merge these punctuation symbols with the next word")
+    parser.add_argument("--append_punctuations", type=str, default=app_config.append_punctuations,
+                        help="if word_timestamps is True, merge these punctuation symbols with the previous word")
+    parser.add_argument("--highlight_words", type=str2bool, default=app_config.highlight_words,
+                        help="(requires --word_timestamps True) underline each word as it is spoken in srt and vtt")
+    parser.add_argument("--threads", type=optional_int, default=0,
+                        help="number of threads used by torch for CPU inference; supercedes MKL_NUM_THREADS/OMP_NUM_THREADS")
+
     args = parser.parse_args().__dict__
     model_name: str = args.pop("model")
     model_dir: str = args.pop("model_dir")
@@ -102,6 +113,9 @@ def cli():
     device: str = args.pop("device")
     os.makedirs(output_dir, exist_ok=True)
 
+    if (threads := args.pop("threads")) > 0:
+        torch.set_num_threads(threads)
+
     whisper_implementation = args.pop("whisper_implementation")
     print(f"Using {whisper_implementation} for Whisper")
 
@@ -126,6 +140,7 @@ def cli():
     auto_parallel = args.pop("auto_parallel")
 
     compute_type = args.pop("compute_type")
+    highlight_words = args.pop("highlight_words")
 
     transcriber = WhisperTranscriber(delete_uploaded_files=False, vad_cpu_cores=vad_cpu_cores, app_config=app_config)
     transcriber.set_parallel_devices(args.pop("vad_parallel_devices"))
@@ -133,7 +148,7 @@ def cli():
 
     model = create_whisper_container(whisper_implementation=whisper_implementation, model_name=model_name,
                                      device=device, compute_type=compute_type, download_root=model_dir, models=app_config.models)
-
+
     if (transcriber._has_parallel_devices()):
         print("Using parallel devices:", transcriber.parallel_device_list)
 
@@ -158,7 +173,7 @@ def cli():
 
         result = transcriber.transcribe_file(model, source_path, temperature=temperature, vadOptions=vadOptions, **args)
 
-        transcriber.write_result(result, source_name, output_dir)
+        transcriber.write_result(result, source_name, output_dir, highlight_words)
 
     transcriber.close()
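
The new flags lean on the `str2bool` and `optional_int` argparse converters from `src.utils` (note that app.py's import list gains `optional_int` above). For reference, a sketch of what these helpers are assumed to look like, mirroring the equivalents in OpenAI's Whisper CLI:

```python
def optional_int(string: str):
    # Accept an integer, or the literal "None" to mean unset
    return None if string == "None" else int(string)

def str2bool(string: str):
    # argparse's built-in bool() treats any non-empty string (even "False") as True,
    # so map the two accepted spellings explicitly
    str2val = {"True": True, "False": False}
    if string in str2val:
        return str2val[string]
    raise ValueError(f"Expected one of {set(str2val.keys())}, got {string}")
```

With these in place, a run such as `python cli.py --word_timestamps True --highlight_words True audio.mp3` (positional audio argument assumed from the existing parser) would produce underlined words in the generated .srt and .vtt files.
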
config.json5 CHANGED
@@ -128,5 +128,14 @@
     // If the average log probability is lower than this value, treat the decoding as failed
     "logprob_threshold": -1.0,
     // If the probability of the <no-speech> token is higher than this value AND the decoding has failed due to `logprob_threshold`, consider the segment as silence
-    "no_speech_threshold": 0.6
+    "no_speech_threshold": 0.6,
+
+    // (experimental) extract word-level timestamps and refine the results based on them
+    "word_timestamps": false,
+    // if word_timestamps is True, merge these punctuation symbols with the next word
+    "prepend_punctuations": "\"\'“¿([{-",
+    // if word_timestamps is True, merge these punctuation symbols with the previous word
+    "append_punctuations": "\"\'.。,,!!??::”)]}、",
+    // (requires --word_timestamps True) underline each word as it is spoken in srt and vtt
+    "highlight_words": false,
 }
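
Since config.json5 contains `//` comments, it cannot be parsed with the standard json module. A quick sanity check that the new keys load, assuming the json5 package is what the app uses (the .json5 extension suggests as much):

```python
import json5  # pip install json5; plain json rejects the // comments

with open("config.json5", "r", encoding="utf-8") as f:
    config = json5.load(f)

# The new word-timestamp keys and their defaults
print(config["word_timestamps"], config["highlight_words"])  # False False
assert config["prepend_punctuations"] and config["append_punctuations"]
```
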
src/config.py CHANGED
@@ -58,7 +58,11 @@ class ApplicationConfig:
                  condition_on_previous_text: bool = True, fp16: bool = True,
                  compute_type: str = "float16",
                  temperature_increment_on_fallback: float = 0.2, compression_ratio_threshold: float = 2.4,
-                 logprob_threshold: float = -1.0, no_speech_threshold: float = 0.6):
+                 logprob_threshold: float = -1.0, no_speech_threshold: float = 0.6,
+                 # Word timestamp settings
+                 word_timestamps: bool = False, prepend_punctuations: str = "\"\'“¿([{-",
+                 append_punctuations: str = "\"\'.。,,!!??::”)]}、",
+                 highlight_words: bool = False):
 
         self.models = models
 
@@ -104,6 +108,12 @@ class ApplicationConfig:
         self.logprob_threshold = logprob_threshold
         self.no_speech_threshold = no_speech_threshold
 
+        # Word timestamp settings
+        self.word_timestamps = word_timestamps
+        self.prepend_punctuations = prepend_punctuations
+        self.append_punctuations = append_punctuations
+        self.highlight_words = highlight_words
+
     def get_model_names(self):
         return [ x.name for x in self.models ]
src/utils.py CHANGED
@@ -3,7 +3,7 @@ import unicodedata
 import re
 
 import zlib
-from typing import Iterator, TextIO
+from typing import Iterator, TextIO, Union
 import tqdm
 
 import urllib3
@@ -56,10 +56,14 @@ def write_txt(transcript: Iterator[dict], file: TextIO):
         print(segment['text'].strip(), file=file, flush=True)
 
 
-def write_vtt(transcript: Iterator[dict], file: TextIO, maxLineWidth=None):
+def write_vtt(transcript: Iterator[dict], file: TextIO,
+              maxLineWidth=None, highlight_words: bool = False):
+    iterator = __subtitle_preprocessor_iterator(transcript, maxLineWidth, highlight_words)
+
     print("WEBVTT\n", file=file)
-    for segment in transcript:
-        text = process_text(segment['text'], maxLineWidth).replace('-->', '->')
+
+    for segment in iterator:
+        text = segment['text'].replace('-->', '->')
 
         print(
             f"{format_timestamp(segment['start'])} --> {format_timestamp(segment['end'])}\n"
@@ -68,8 +72,8 @@ def write_vtt(transcript: Iterator[dict], file: TextIO, maxLineWidth=None):
             flush=True,
         )
 
-
-def write_srt(transcript: Iterator[dict], file: TextIO, maxLineWidth=None):
+def write_srt(transcript: Iterator[dict], file: TextIO,
+              maxLineWidth=None, highlight_words: bool = False):
     """
     Write a transcript to a file in SRT format.
     Example usage:
@@ -81,8 +85,10 @@ def write_srt(transcript: Iterator[dict], file: TextIO, maxLineWidth=None):
         with open(Path(output_dir) / (audio_basename + ".srt"), "w", encoding="utf-8") as srt:
             write_srt(result["segments"], file=srt)
     """
-    for i, segment in enumerate(transcript, start=1):
-        text = process_text(segment['text'].strip(), maxLineWidth).replace('-->', '->')
+    iterator = __subtitle_preprocessor_iterator(transcript, maxLineWidth, highlight_words)
+
+    for i, segment in enumerate(iterator, start=1):
+        text = segment['text'].replace('-->', '->')
 
         # write srt lines
         print(
@@ -94,6 +100,110 @@ def write_srt(transcript: Iterator[dict], file: TextIO, maxLineWidth=None):
             flush=True,
         )
 
+def __subtitle_preprocessor_iterator(transcript: Iterator[dict], maxLineWidth: int = None, highlight_words: bool = False):
+    for segment in transcript:
+        words = segment.get('words', [])
+
+        if len(words) == 0:
+            # Yield the segment as-is or processed
+            if maxLineWidth is None or maxLineWidth < 0:
+                yield segment
+            else:
+                yield {
+                    'start': segment['start'],
+                    'end': segment['end'],
+                    'text': process_text(segment['text'].strip(), maxLineWidth)
+                }
+            # We are done
+            continue
+
+        subtitle_start = segment['start']
+        subtitle_end = segment['end']
+
+        text_words = [ this_word["word"] for this_word in words ]
+        subtitle_text = __join_words(text_words, maxLineWidth)
+
+        # Iterate over the words in the segment
+        if highlight_words:
+            last = subtitle_start
+
+            for i, this_word in enumerate(words):
+                start = this_word['start']
+                end = this_word['end']
+
+                if last != start:
+                    # Display the text up to this point
+                    yield {
+                        'start': last,
+                        'end': start,
+                        'text': subtitle_text
+                    }
+
+                # Display the text with the current word highlighted
+                yield {
+                    'start': start,
+                    'end': end,
+                    'text': __join_words(
+                        [
+                            {
+                                "word": re.sub(r"^(\s*)(.*)$", r"\1<u>\2</u>", word)
+                                        if j == i
+                                        else word,
+                                # The HTML tags <u> and </u> are not displayed,
+                                # so they should not be counted in the word length
+                                "length": len(word)
+                            } for j, word in enumerate(text_words)
+                        ], maxLineWidth)
+                }
+                last = end
+
+            if last != subtitle_end:
+                # Display the last part of the text
+                yield {
+                    'start': last,
+                    'end': subtitle_end,
+                    'text': subtitle_text
+                }
+
+        # Just return the subtitle text
+        else:
+            yield {
+                'start': subtitle_start,
+                'end': subtitle_end,
+                'text': subtitle_text
+            }
+
+def __join_words(words: Iterator[Union[str, dict]], maxLineWidth: int = None):
+    if maxLineWidth is None or maxLineWidth < 0:
+        return " ".join(words)
+
+    lines = []
+    current_line = ""
+    current_length = 0
+
+    for entry in words:
+        # Either accept a string or a dict with a 'word' and 'length' field
+        if isinstance(entry, dict):
+            word = entry['word']
+            word_length = entry['length']
+        else:
+            word = entry
+            word_length = len(word)
+
+        if current_length > 0 and current_length + word_length > maxLineWidth:
+            lines.append(current_line)
+            current_line = ""
+            current_length = 0
+
+        current_length += word_length
+        # The word will be prefixed with a space by Whisper, so we don't need to add one here
+        current_line += word
+
+    if len(current_line) > 0:
+        lines.append(current_line)
+
+    return "\n".join(lines)
+
 def process_text(text: str, maxLineWidth=None):
     if (maxLineWidth is None or maxLineWidth < 0):
         return text
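
To see what the new highlight path emits, here is a minimal sketch that feeds `write_srt` a hand-made two-word segment shaped like Whisper's word_timestamps output (the text and timings are invented; `maxLineWidth` is set because the no-wrap fallback in `__join_words` only handles plain strings):

```python
from io import StringIO
from src.utils import write_srt

# One segment with per-word timings, as produced when word_timestamps is enabled
segments = [{
    'start': 0.0, 'end': 1.2, 'text': ' Hello world',
    'words': [
        {'word': ' Hello', 'start': 0.0, 'end': 0.5},
        {'word': ' world', 'start': 0.5, 'end': 1.2},
    ],
}]

buffer = StringIO()
write_srt(segments, file=buffer, maxLineWidth=80, highlight_words=True)
print(buffer.getvalue())
# Expect one cue per word, with the spoken word underlined:
#   0.0-0.5  <u>Hello</u> world
#   0.5-1.2  Hello <u>world</u>
```
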
src/vad.py CHANGED
@@ -404,6 +404,14 @@ class AbstractTranscription(ABC):
             # Add to start and end
             new_segment['start'] = segment_start + adjust_seconds
             new_segment['end'] = segment_end + adjust_seconds
+
+            # Handle words
+            if ('words' in new_segment):
+                for word in new_segment['words']:
+                    # Adjust start and end
+                    word['start'] = word['start'] + adjust_seconds
+                    word['end'] = word['end'] + adjust_seconds
+
             result.append(new_segment)
         return result
src/whisper/whisperContainer.py CHANGED
@@ -203,8 +203,9 @@ class WhisperCallback(AbstractWhisperCallback):
 
         initial_prompt = self._get_initial_prompt(self.initial_prompt, self.initial_prompt_mode, prompt, segment_index)
 
-        return model.transcribe(audio, \
+        result = model.transcribe(audio, \
             language=self.language if self.language else detected_language, task=self.task, \
             initial_prompt=initial_prompt, \
             **decodeOptions
-        )
+        )
+        return result