aadnk committed on
Commit
9cae71a
1 Parent(s): f55c594

Add word timestamps to Simple and reorder

Browse files
Files changed (1) hide show
  1. app.py +52 -35
app.py CHANGED
@@ -84,44 +84,49 @@ class WhisperTranscriber:
84
  print("[Auto parallel] Using GPU devices " + str(self.parallel_device_list) + " and " + str(self.vad_cpu_cores) + " CPU cores for VAD/transcription.")
85
 
86
  # Entry function for the simple tab
87
- def transcribe_webui_simple(self, modelName, languageName, urlData, multipleFiles, microphoneData, task, vad, vadMergeWindow, vadMaxMergeSize, vadPadding, vadPromptWindow):
88
- return self.transcribe_webui_simple_progress(modelName, languageName, urlData, multipleFiles, microphoneData, task, vad, vadMergeWindow, vadMaxMergeSize, vadPadding, vadPromptWindow)
 
 
 
 
89
 
90
  # Entry function for the simple tab progress
91
- def transcribe_webui_simple_progress(self, modelName, languageName, urlData, multipleFiles, microphoneData, task, vad, vadMergeWindow, vadMaxMergeSize, vadPadding, vadPromptWindow,
92
- progress=gr.Progress()):
 
 
93
 
94
- vadOptions = VadOptions(vad, vadMergeWindow, vadMaxMergeSize, vadPadding, vadPromptWindow, self.app_config.vad_initial_prompt_mode)
95
 
96
- return self.transcribe_webui(modelName, languageName, urlData, multipleFiles, microphoneData, task, vadOptions, progress=progress)
 
97
 
98
  # Entry function for the full tab
99
  def transcribe_webui_full(self, modelName, languageName, urlData, multipleFiles, microphoneData, task,
100
- vad, vadMergeWindow, vadMaxMergeSize, vadPadding, vadPromptWindow, vadInitialPromptMode,
101
- initial_prompt: str, temperature: float, best_of: int, beam_size: int, patience: float, length_penalty: float, suppress_tokens: str,
102
- condition_on_previous_text: bool, fp16: bool, temperature_increment_on_fallback: float,
103
- compression_ratio_threshold: float, logprob_threshold: float, no_speech_threshold: float,
104
- # Word timestamps
105
- word_timestamps: bool, prepend_punctuations: str,
106
- append_punctuations: str, highlight_words: bool = False):
107
 
108
  return self.transcribe_webui_full_progress(modelName, languageName, urlData, multipleFiles, microphoneData, task,
109
  vad, vadMergeWindow, vadMaxMergeSize, vadPadding, vadPromptWindow, vadInitialPromptMode,
 
110
  initial_prompt, temperature, best_of, beam_size, patience, length_penalty, suppress_tokens,
111
  condition_on_previous_text, fp16, temperature_increment_on_fallback,
112
- compression_ratio_threshold, logprob_threshold, no_speech_threshold,
113
- word_timestamps, prepend_punctuations, append_punctuations, highlight_words)
114
 
115
  # Entry function for the full tab with progress
116
  def transcribe_webui_full_progress(self, modelName, languageName, urlData, multipleFiles, microphoneData, task,
117
- vad, vadMergeWindow, vadMaxMergeSize, vadPadding, vadPromptWindow, vadInitialPromptMode,
118
- initial_prompt: str, temperature: float, best_of: int, beam_size: int, patience: float, length_penalty: float, suppress_tokens: str,
119
- condition_on_previous_text: bool, fp16: bool, temperature_increment_on_fallback: float,
120
- compression_ratio_threshold: float, logprob_threshold: float, no_speech_threshold: float,
121
- # Word timestamps
122
- word_timestamps: bool, prepend_punctuations: str,
123
- append_punctuations: str, highlight_words: bool = False,
124
- progress=gr.Progress()):
125
 
126
  # Handle temperature_increment_on_fallback
127
  if temperature_increment_on_fallback is not None:
@@ -469,24 +474,34 @@ def create_ui(app_config: ApplicationConfig):
469
 
470
  whisper_models = app_config.get_model_names()
471
 
472
- simple_inputs = lambda : [
473
  gr.Dropdown(choices=whisper_models, value=app_config.default_model_name, label="Model"),
474
  gr.Dropdown(choices=sorted(get_language_names()), label="Language", value=app_config.language),
475
  gr.Text(label="URL (YouTube, etc.)"),
476
  gr.File(label="Upload Files", file_count="multiple"),
477
  gr.Audio(source="microphone", type="filepath", label="Microphone Input"),
478
  gr.Dropdown(choices=["transcribe", "translate"], label="Task", value=app_config.task),
 
 
 
479
  gr.Dropdown(choices=["none", "silero-vad", "silero-vad-skip-gaps", "silero-vad-expand-into-gaps", "periodic-vad"], value=app_config.default_vad, label="VAD"),
480
  gr.Number(label="VAD - Merge Window (s)", precision=0, value=app_config.vad_merge_window),
481
  gr.Number(label="VAD - Max Merge Size (s)", precision=0, value=app_config.vad_max_merge_size),
482
- gr.Number(label="VAD - Padding (s)", precision=None, value=app_config.vad_padding),
483
- gr.Number(label="VAD - Prompt Window (s)", precision=None, value=app_config.vad_prompt_window),
 
 
 
484
  ]
485
 
486
  is_queue_mode = app_config.queue_concurrency_count is not None and app_config.queue_concurrency_count > 0
487
 
488
  simple_transcribe = gr.Interface(fn=ui.transcribe_webui_simple_progress if is_queue_mode else ui.transcribe_webui_simple,
489
- description=ui_description, article=ui_article, inputs=simple_inputs(), outputs=[
 
 
 
 
490
  gr.File(label="Download"),
491
  gr.Text(label="Transcription"),
492
  gr.Text(label="Segments")
@@ -496,8 +511,17 @@ def create_ui(app_config: ApplicationConfig):
496
 
497
  full_transcribe = gr.Interface(fn=ui.transcribe_webui_full_progress if is_queue_mode else ui.transcribe_webui_full,
498
  description=full_description, article=ui_article, inputs=[
499
- *simple_inputs(),
 
 
 
 
500
  gr.Dropdown(choices=["prepend_first_segment", "prepend_all_segments"], value=app_config.vad_initial_prompt_mode, label="VAD - Initial Prompt Mode"),
 
 
 
 
 
501
  gr.TextArea(label="Initial Prompt"),
502
  gr.Number(label="Temperature", value=app_config.temperature),
503
  gr.Number(label="Best Of - Non-zero temperature", value=app_config.best_of, precision=0),
@@ -511,13 +535,6 @@ def create_ui(app_config: ApplicationConfig):
511
  gr.Number(label="Compression ratio threshold", value=app_config.compression_ratio_threshold),
512
  gr.Number(label="Logprob threshold", value=app_config.logprob_threshold),
513
  gr.Number(label="No speech threshold", value=app_config.no_speech_threshold),
514
-
515
- # Word timestamps
516
- gr.Checkbox(label="Word Timestamps", value=app_config.word_timestamps),
517
- gr.Text(label="Word Timestamps - Prepend Punctuations", value=app_config.prepend_punctuations),
518
- gr.Text(label="Word Timestamps - Append Punctuations", value=app_config.append_punctuations),
519
- gr.Checkbox(label="Word Timestamps - Highlight Words", value=app_config.highlight_words),
520
-
521
  ], outputs=[
522
  gr.File(label="Download"),
523
  gr.Text(label="Transcription"),
 
84
  print("[Auto parallel] Using GPU devices " + str(self.parallel_device_list) + " and " + str(self.vad_cpu_cores) + " CPU cores for VAD/transcription.")
85
 
86
  # Entry function for the simple tab
87
+ def transcribe_webui_simple(self, modelName, languageName, urlData, multipleFiles, microphoneData, task,
88
+ vad, vadMergeWindow, vadMaxMergeSize,
89
+ word_timestamps: bool = False, highlight_words: bool = False):
90
+ return self.transcribe_webui_simple_progress(modelName, languageName, urlData, multipleFiles, microphoneData, task,
91
+ vad, vadMergeWindow, vadMaxMergeSize,
92
+ word_timestamps, highlight_words)
93
 
94
  # Entry function for the simple tab progress
95
+ def transcribe_webui_simple_progress(self, modelName, languageName, urlData, multipleFiles, microphoneData, task,
96
+ vad, vadMergeWindow, vadMaxMergeSize,
97
+ word_timestamps: bool = False, highlight_words: bool = False,
98
+ progress=gr.Progress()):
99
 
100
+ vadOptions = VadOptions(vad, vadMergeWindow, vadMaxMergeSize, self.app_config.vad_padding, self.app_config.vad_prompt_window, self.app_config.vad_initial_prompt_mode)
101
 
102
+ return self.transcribe_webui(modelName, languageName, urlData, multipleFiles, microphoneData, task, vadOptions,
103
+ word_timestamps=word_timestamps, highlight_words=highlight_words, progress=progress)
104
 
105
  # Entry function for the full tab
106
  def transcribe_webui_full(self, modelName, languageName, urlData, multipleFiles, microphoneData, task,
107
+ vad, vadMergeWindow, vadMaxMergeSize, vadPadding, vadPromptWindow, vadInitialPromptMode,
108
+ # Word timestamps
109
+ word_timestamps: bool, highlight_words: bool, prepend_punctuations: str, append_punctuations: str,
110
+ initial_prompt: str, temperature: float, best_of: int, beam_size: int, patience: float, length_penalty: float, suppress_tokens: str,
111
+ condition_on_previous_text: bool, fp16: bool, temperature_increment_on_fallback: float,
112
+ compression_ratio_threshold: float, logprob_threshold: float, no_speech_threshold: float):
 
113
 
114
  return self.transcribe_webui_full_progress(modelName, languageName, urlData, multipleFiles, microphoneData, task,
115
  vad, vadMergeWindow, vadMaxMergeSize, vadPadding, vadPromptWindow, vadInitialPromptMode,
116
+ word_timestamps, highlight_words, prepend_punctuations, append_punctuations,
117
  initial_prompt, temperature, best_of, beam_size, patience, length_penalty, suppress_tokens,
118
  condition_on_previous_text, fp16, temperature_increment_on_fallback,
119
+ compression_ratio_threshold, logprob_threshold, no_speech_threshold)
 
120
 
121
  # Entry function for the full tab with progress
122
  def transcribe_webui_full_progress(self, modelName, languageName, urlData, multipleFiles, microphoneData, task,
123
+ vad, vadMergeWindow, vadMaxMergeSize, vadPadding, vadPromptWindow, vadInitialPromptMode,
124
+ # Word timestamps
125
+ word_timestamps: bool, highlight_words: bool, prepend_punctuations: str, append_punctuations: str,
126
+ initial_prompt: str, temperature: float, best_of: int, beam_size: int, patience: float, length_penalty: float, suppress_tokens: str,
127
+ condition_on_previous_text: bool, fp16: bool, temperature_increment_on_fallback: float,
128
+ compression_ratio_threshold: float, logprob_threshold: float, no_speech_threshold: float,
129
+ progress=gr.Progress()):
 
130
 
131
  # Handle temperature_increment_on_fallback
132
  if temperature_increment_on_fallback is not None:
 
474
 
475
  whisper_models = app_config.get_model_names()
476
 
477
+ common_inputs = lambda : [
478
  gr.Dropdown(choices=whisper_models, value=app_config.default_model_name, label="Model"),
479
  gr.Dropdown(choices=sorted(get_language_names()), label="Language", value=app_config.language),
480
  gr.Text(label="URL (YouTube, etc.)"),
481
  gr.File(label="Upload Files", file_count="multiple"),
482
  gr.Audio(source="microphone", type="filepath", label="Microphone Input"),
483
  gr.Dropdown(choices=["transcribe", "translate"], label="Task", value=app_config.task),
484
+ ]
485
+
486
+ common_vad_inputs = lambda : [
487
  gr.Dropdown(choices=["none", "silero-vad", "silero-vad-skip-gaps", "silero-vad-expand-into-gaps", "periodic-vad"], value=app_config.default_vad, label="VAD"),
488
  gr.Number(label="VAD - Merge Window (s)", precision=0, value=app_config.vad_merge_window),
489
  gr.Number(label="VAD - Max Merge Size (s)", precision=0, value=app_config.vad_max_merge_size),
490
+ ]
491
+
492
+ common_word_timestamps_inputs = lambda : [
493
+ gr.Checkbox(label="Word Timestamps", value=app_config.word_timestamps),
494
+ gr.Checkbox(label="Word Timestamps - Highlight Words", value=app_config.highlight_words),
495
  ]
496
 
497
  is_queue_mode = app_config.queue_concurrency_count is not None and app_config.queue_concurrency_count > 0
498
 
499
  simple_transcribe = gr.Interface(fn=ui.transcribe_webui_simple_progress if is_queue_mode else ui.transcribe_webui_simple,
500
+ description=ui_description, article=ui_article, inputs=[
501
+ *common_inputs(),
502
+ *common_vad_inputs(),
503
+ *common_word_timestamps_inputs(),
504
+ ], outputs=[
505
  gr.File(label="Download"),
506
  gr.Text(label="Transcription"),
507
  gr.Text(label="Segments")
 
511
 
512
  full_transcribe = gr.Interface(fn=ui.transcribe_webui_full_progress if is_queue_mode else ui.transcribe_webui_full,
513
  description=full_description, article=ui_article, inputs=[
514
+ *common_inputs(),
515
+
516
+ *common_vad_inputs(),
517
+ gr.Number(label="VAD - Padding (s)", precision=None, value=app_config.vad_padding),
518
+ gr.Number(label="VAD - Prompt Window (s)", precision=None, value=app_config.vad_prompt_window),
519
  gr.Dropdown(choices=["prepend_first_segment", "prepend_all_segments"], value=app_config.vad_initial_prompt_mode, label="VAD - Initial Prompt Mode"),
520
+
521
+ *common_word_timestamps_inputs(),
522
+ gr.Text(label="Word Timestamps - Prepend Punctuations", value=app_config.prepend_punctuations),
523
+ gr.Text(label="Word Timestamps - Append Punctuations", value=app_config.append_punctuations),
524
+
525
  gr.TextArea(label="Initial Prompt"),
526
  gr.Number(label="Temperature", value=app_config.temperature),
527
  gr.Number(label="Best Of - Non-zero temperature", value=app_config.best_of, precision=0),
 
535
  gr.Number(label="Compression ratio threshold", value=app_config.compression_ratio_threshold),
536
  gr.Number(label="Logprob threshold", value=app_config.logprob_threshold),
537
  gr.Number(label="No speech threshold", value=app_config.no_speech_threshold),
 
 
 
 
 
 
 
538
  ], outputs=[
539
  gr.File(label="Download"),
540
  gr.Text(label="Transcription"),