avans06 commited on
Commit
50167d4
1 Parent(s): 2601bfd

Fixed the default option value for Length Penalty in Whisper Advanced and made some adjustments in the code.

Browse files

1. The Length Penalty option in Whisper Advanced shows 0 on the web UI, but the actual default value for this parameter in the config is None; the displayed 0 is a bug in Gradio. This parameter is now shown as blank on the web UI so that it does not affect the execution results of the Whisper model.

2. When a parameter option is left empty in Whisper Advanced, that parameter is now removed from decodeOptions, meaning the model will not use it.

3. Additional results, including temperature, avg_logprob, compression_ratio, and no_speech_prob, are now included in the segments returned by FasterWhisper.

4. The Whisper large-v1 model is now included in the config.

5. The translation model now works together with Word Timestamps; the exception is Highlight Words mode, which still cannot be used with the translation model.

Files changed (5) hide show
  1. app.py +29 -27
  2. config.json5 +4 -0
  3. docs/options.md +30 -1
  4. src/vad.py +6 -2
  5. src/whisper/fasterWhisperContainer.py +5 -1
app.py CHANGED
@@ -265,19 +265,22 @@ class WhisperTranscriber:
265
  if whisperNoRepeatNgramSize is not None and whisperNoRepeatNgramSize <= 1:
266
  decodeOptions.pop("no_repeat_ngram_size")
267
 
268
- # word_timestamps = options.get("word_timestamps", False)
269
- # condition_on_previous_text = options.get("condition_on_previous_text", False)
270
-
271
- # prepend_punctuations = options.get("prepend_punctuations", None)
272
- # append_punctuations = options.get("append_punctuations", None)
273
- # initial_prompt = options.get("initial_prompt", None)
274
- # best_of = options.get("best_of", None)
275
- # beam_size = options.get("beam_size", None)
276
- # patience = options.get("patience", None)
277
- # length_penalty = options.get("length_penalty", None)
278
- # suppress_tokens = options.get("suppress_tokens", None)
279
- # compression_ratio_threshold = options.get("compression_ratio_threshold", None)
280
- # logprob_threshold = options.get("logprob_threshold", None)
 
 
 
281
 
282
  vadOptions = VadOptions(vad, vadMergeWindow, vadMaxMergeSize, vadPadding, vadPromptWindow, vadInitialPromptMode)
283
 
@@ -378,7 +381,7 @@ class WhisperTranscriber:
378
 
379
  # Transcribe
380
  result = self.transcribe_file(model, source.source_path, whisperLangCode, task, vadOptions, scaled_progress_listener, **decodeOptions)
381
- if whisperLang is None and result["language"] is not None and len(result["language"]) > 0:
382
  whisperLang = get_lang_from_whisper_code(result["language"])
383
  translationModel.whisperLang = whisperLang
384
 
@@ -407,7 +410,7 @@ class WhisperTranscriber:
407
  out = ffmpeg.output(input_file, input_srt, output_with_srt, vcodec='copy', acodec='copy', scodec='mov_text')
408
  outRsult = out.run(overwrite_output=True)
409
  except Exception as e:
410
- # Ignore error - it's just a cleanup
411
  print("Error merge subtitle with source file: \n" + source.source_path + ", \n" + str(e), outRsult)
412
  elif self.app_config.save_downloaded_files and self.app_config.output_dir is not None and urlData:
413
  print("Saving downloaded file [" + source.source_name + "]")
@@ -415,7 +418,7 @@ class WhisperTranscriber:
415
  save_path = os.path.join(self.app_config.output_dir, filePrefix)
416
  shutil.copy(source.source_path, save_path + suffix)
417
  except Exception as e:
418
- # Ignore error - it's just a cleanup
419
  print("Error saving downloaded file: \n" + source.source_path + ", \n" + str(e))
420
 
421
  if len(sources) > 1:
@@ -467,7 +470,7 @@ class WhisperTranscriber:
467
  try:
468
  os.remove(source.source_path)
469
  except Exception as e:
470
- # Ignore error - it's just a cleanup
471
  print("Error deleting temporary source file: \n" + source.source_path + ", \n" + str(e))
472
 
473
  except ExceededMaximumDuration as e:
@@ -613,7 +616,7 @@ class WhisperTranscriber:
613
  def _create_silero_config(self, non_speech_strategy: NonSpeechStrategy, vadOptions: VadOptions):
614
  # Use Silero VAD
615
  if (self.vad_model is None):
616
- self.vad_model = VadSileroTranscription()
617
 
618
  config = TranscriptionConfig(non_speech_strategy = non_speech_strategy,
619
  max_silent_period=vadOptions.vadMergeWindow, max_merge_size=vadOptions.vadMaxMergeSize,
@@ -655,7 +658,6 @@ class WhisperTranscriber:
655
 
656
  print("\n\nprocess segments took {} seconds.\n\n".format(perf_end_time - perf_start_time))
657
  except Exception as e:
658
- # Ignore error - it's just a cleanup
659
  print(traceback.format_exc())
660
  print("Error process segments: " + str(e))
661
 
@@ -812,13 +814,13 @@ def create_ui(app_config: ApplicationConfig):
812
 
813
  uiArticle = "Read the [documentation here](https://gitlab.com/aadnk/whisper-webui/-/blob/main/docs/options.md)."
814
  uiArticle += "\n\nWhisper's Task 'translate' only implements the functionality of translating other languages into English. "
815
- uiArticle += "OpenAI does not guarantee translations between arbitrary languages. In such cases, you can choose to use the NLLB Model to implement the translation task. "
816
- uiArticle += "However, it's important to note that the NLLB Model runs slowly, and the completion time may be twice as long as usual. "
817
- uiArticle += "\n\nThe larger the parameters of the NLLB model, the better its performance is expected to be. "
818
  uiArticle += "However, it also requires higher computational resources, making it slower to operate. "
819
- uiArticle += "On the other hand, the version converted from ct2 (CTranslate2) requires lower resources and operates at a faster speed."
820
- uiArticle += "\n\nCurrently, enabling word-level timestamps cannot be used in conjunction with NLLB Model translation "
821
- uiArticle += "because Word Timestamps will split the source text, and after translation, it becomes a non-word-level string. "
822
  uiArticle += "\n\nThe 'mt5-zh-ja-en-trimmed' model is finetuned from Google's 'mt5-base' model. "
823
  uiArticle += "This model has a relatively good translation speed, but it only supports three languages: Chinese, Japanese, and English. "
824
 
@@ -942,7 +944,7 @@ def create_ui(app_config: ApplicationConfig):
942
 
943
  fullInputDict = {}
944
  fullDescription = uiDescription + "\n\n\n\n" + "Be careful when changing some of the options in the full interface - this can cause the model to crash."
945
-
946
  with gr.Blocks() as fullTranscribe:
947
  fullTranslateInput = gr.State(value="m2m100", elem_id = "translateInput")
948
  fullSourceInput = gr.State(value="urlData", elem_id = "sourceInput")
@@ -994,7 +996,7 @@ def create_ui(app_config: ApplicationConfig):
994
  gr.Number(label="Best Of - Non-zero temperature", value=app_config.best_of, precision=0, elem_id = "best_of"),
995
  gr.Number(label="Beam Size - Zero temperature", value=app_config.beam_size, precision=0, elem_id = "beam_size"),
996
  gr.Number(label="Patience - Zero temperature", value=app_config.patience, elem_id = "patience"),
997
- gr.Number(label="Length Penalty - Any temperature", value=app_config.length_penalty, elem_id = "length_penalty"),
998
  gr.Text(label="Suppress Tokens - Comma-separated list of token IDs", value=app_config.suppress_tokens, elem_id = "suppress_tokens"),
999
  gr.Checkbox(label="Condition on previous text", value=app_config.condition_on_previous_text, elem_id = "condition_on_previous_text"),
1000
  gr.Checkbox(label="FP16", value=app_config.fp16, elem_id = "fp16"),
 
265
  if whisperNoRepeatNgramSize is not None and whisperNoRepeatNgramSize <= 1:
266
  decodeOptions.pop("no_repeat_ngram_size")
267
 
268
+ for key, value in list(decodeOptions.items()):
269
+ if value == "":
270
+ del decodeOptions[key]
271
+
272
+ # word_timestamps = decodeOptions.get("word_timestamps", False)
273
+ # condition_on_previous_text = decodeOptions.get("condition_on_previous_text", False)
274
+ # prepend_punctuations = decodeOptions.get("prepend_punctuations", None)
275
+ # append_punctuations = decodeOptions.get("append_punctuations", None)
276
+ # initial_prompt = decodeOptions.get("initial_prompt", None)
277
+ # best_of = decodeOptions.get("best_of", None)
278
+ # beam_size = decodeOptions.get("beam_size", None)
279
+ # patience = decodeOptions.get("patience", None)
280
+ # length_penalty = decodeOptions.get("length_penalty", None)
281
+ # suppress_tokens = decodeOptions.get("suppress_tokens", None)
282
+ # compression_ratio_threshold = decodeOptions.get("compression_ratio_threshold", None)
283
+ # logprob_threshold = decodeOptions.get("logprob_threshold", None)
284
 
285
  vadOptions = VadOptions(vad, vadMergeWindow, vadMaxMergeSize, vadPadding, vadPromptWindow, vadInitialPromptMode)
286
 
 
381
 
382
  # Transcribe
383
  result = self.transcribe_file(model, source.source_path, whisperLangCode, task, vadOptions, scaled_progress_listener, **decodeOptions)
384
+ if translationModel is not None and whisperLang is None and result["language"] is not None and len(result["language"]) > 0:
385
  whisperLang = get_lang_from_whisper_code(result["language"])
386
  translationModel.whisperLang = whisperLang
387
 
 
410
  out = ffmpeg.output(input_file, input_srt, output_with_srt, vcodec='copy', acodec='copy', scodec='mov_text')
411
  outRsult = out.run(overwrite_output=True)
412
  except Exception as e:
413
+ print(traceback.format_exc())
414
  print("Error merge subtitle with source file: \n" + source.source_path + ", \n" + str(e), outRsult)
415
  elif self.app_config.save_downloaded_files and self.app_config.output_dir is not None and urlData:
416
  print("Saving downloaded file [" + source.source_name + "]")
 
418
  save_path = os.path.join(self.app_config.output_dir, filePrefix)
419
  shutil.copy(source.source_path, save_path + suffix)
420
  except Exception as e:
421
+ print(traceback.format_exc())
422
  print("Error saving downloaded file: \n" + source.source_path + ", \n" + str(e))
423
 
424
  if len(sources) > 1:
 
470
  try:
471
  os.remove(source.source_path)
472
  except Exception as e:
473
+ print(traceback.format_exc())
474
  print("Error deleting temporary source file: \n" + source.source_path + ", \n" + str(e))
475
 
476
  except ExceededMaximumDuration as e:
 
616
  def _create_silero_config(self, non_speech_strategy: NonSpeechStrategy, vadOptions: VadOptions):
617
  # Use Silero VAD
618
  if (self.vad_model is None):
619
+ self.vad_model = VadSileroTranscription() #vad_model is snakers4/silero-vad
620
 
621
  config = TranscriptionConfig(non_speech_strategy = non_speech_strategy,
622
  max_silent_period=vadOptions.vadMergeWindow, max_merge_size=vadOptions.vadMaxMergeSize,
 
658
 
659
  print("\n\nprocess segments took {} seconds.\n\n".format(perf_end_time - perf_start_time))
660
  except Exception as e:
 
661
  print(traceback.format_exc())
662
  print("Error process segments: " + str(e))
663
 
 
814
 
815
  uiArticle = "Read the [documentation here](https://gitlab.com/aadnk/whisper-webui/-/blob/main/docs/options.md)."
816
  uiArticle += "\n\nWhisper's Task 'translate' only implements the functionality of translating other languages into English. "
817
+ uiArticle += "OpenAI does not guarantee translations between arbitrary languages. In such cases, you can choose to use the Translation Model to implement the translation task. "
818
+ uiArticle += "However, it's important to note that the Translation Model runs slowly(in CPU), and the completion time may be twice as long as usual. "
819
+ uiArticle += "\n\nThe larger the parameters of the Translation model, the better its performance is expected to be. "
820
  uiArticle += "However, it also requires higher computational resources, making it slower to operate. "
821
+ uiArticle += "On the other hand, the version converted from ct2 ([CTranslate2](https://opennmt.net/CTranslate2/guides/transformers.html)) requires lower resources and operates at a faster speed."
822
+ uiArticle += "\n\nCurrently, enabling `Highlight Words` timestamps cannot be used in conjunction with Translation Model translation "
823
+ uiArticle += "because Highlight Words will split the source text, and after translation, it becomes a non-word-level string. "
824
  uiArticle += "\n\nThe 'mt5-zh-ja-en-trimmed' model is finetuned from Google's 'mt5-base' model. "
825
  uiArticle += "This model has a relatively good translation speed, but it only supports three languages: Chinese, Japanese, and English. "
826
 
 
944
 
945
  fullInputDict = {}
946
  fullDescription = uiDescription + "\n\n\n\n" + "Be careful when changing some of the options in the full interface - this can cause the model to crash."
947
+
948
  with gr.Blocks() as fullTranscribe:
949
  fullTranslateInput = gr.State(value="m2m100", elem_id = "translateInput")
950
  fullSourceInput = gr.State(value="urlData", elem_id = "sourceInput")
 
996
  gr.Number(label="Best Of - Non-zero temperature", value=app_config.best_of, precision=0, elem_id = "best_of"),
997
  gr.Number(label="Beam Size - Zero temperature", value=app_config.beam_size, precision=0, elem_id = "beam_size"),
998
  gr.Number(label="Patience - Zero temperature", value=app_config.patience, elem_id = "patience"),
999
+ gr.Number(label="Length Penalty - Any temperature", value=lambda : None if app_config.length_penalty is None else app_config.length_penalty, elem_id = "length_penalty"),
1000
  gr.Text(label="Suppress Tokens - Comma-separated list of token IDs", value=app_config.suppress_tokens, elem_id = "suppress_tokens"),
1001
  gr.Checkbox(label="Condition on previous text", value=app_config.condition_on_previous_text, elem_id = "condition_on_previous_text"),
1002
  gr.Checkbox(label="FP16", value=app_config.fp16, elem_id = "fp16"),
config.json5 CHANGED
@@ -23,6 +23,10 @@
23
  "name": "large",
24
  "url": "large"
25
  },
 
 
 
 
26
  {
27
  "name": "large-v2",
28
  "url": "large-v2"
 
23
  "name": "large",
24
  "url": "large"
25
  },
26
+ {
27
+ "name": "large-v1",
28
+ "url": "large-v1"
29
+ },
30
  {
31
  "name": "large-v2",
32
  "url": "large-v2"
docs/options.md CHANGED
@@ -16,6 +16,7 @@ Select the model that Whisper will use to transcribe the audio:
16
  | medium | 769 M | medium.en | medium | ~5 GB | ~2x |
17
  | large | 1550 M | N/A | large | ~10 GB | 1x |
18
  | large-v2 | 1550 M | N/A | large | ~10 GB | 1x |
 
19
 
20
  ## Language
21
 
@@ -150,4 +151,32 @@ The minimum number of speakers for Pyannote to detect.
150
 
151
  ## Diarization - Max Speakers
152
 
153
- The maximum number of speakers for Pyannote to detect.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
16
  | medium | 769 M | medium.en | medium | ~5 GB | ~2x |
17
  | large | 1550 M | N/A | large | ~10 GB | 1x |
18
  | large-v2 | 1550 M | N/A | large | ~10 GB | 1x |
19
+ | large-v3 | 1550 M | N/A | large | ~10 GB | 1x |
20
 
21
  ## Language
22
 
 
151
 
152
  ## Diarization - Max Speakers
153
 
154
+ The maximum number of speakers for Pyannote to detect.
155
+
156
+ ## Repetition Penalty
157
+ - ctranslate2: repetition_penalty
158
+ This parameter only takes effect in [faster-whisper (ctranslate2)](https://github.com/SYSTRAN/faster-whisper/issues/478).
159
+ Penalty applied to the score of previously generated tokens (set > 1 to penalize).
160
+
161
+ ## No Repeat Ngram Size
162
+ - ctranslate2: no_repeat_ngram_size
163
+ This parameter only takes effect in [faster-whisper (ctranslate2)](https://github.com/SYSTRAN/faster-whisper/issues/478).
164
+ Prevent repetitions of ngrams with this size (set 0 to disable).
165
+
166
+ ## Translation - Batch Size
167
+ - transformers: batch_size
168
+ When the pipeline will use DataLoader (when passing a dataset, on GPU for a Pytorch model), the size of the batch to use, for inference this is not always beneficial.
169
+ - ctranslate2: max_batch_size
170
+ The maximum batch size.
171
+
172
+ ## Translation - No Repeat Ngram Size
173
+ - transformers: no_repeat_ngram_size
174
+ Value that will be used by default in the generate method of the model for no_repeat_ngram_size. If set to int > 0, all ngrams of that size can only occur once.
175
+ - ctranslate2: no_repeat_ngram_size
176
+ Prevent repetitions of ngrams with this size (set 0 to disable).
177
+
178
+ ## Translation - Num Beams
179
+ - transformers: num_beams
180
+ Number of beams for beam search that will be used by default in the generate method of the model. 1 means no beam search.
181
+ - ctranslate2: beam_size
182
+ Beam size (1 for greedy search).
src/vad.py CHANGED
@@ -231,7 +231,7 @@ class AbstractTranscription(ABC):
231
  # Add expand amount if the segment got expanded
232
  if (adjusted_segment_end > segment_without_expansion):
233
  adjusted_segment["expand_amount"] = adjusted_segment_end - segment_without_expansion
234
-
235
  # Append to output
236
  result['text'] += segment_result['text']
237
  result['segments'].extend(adjusted_segments)
@@ -249,7 +249,7 @@ class AbstractTranscription(ABC):
249
  if progressListener is not None:
250
  progressListener.on_finished()
251
  return result
252
-
253
  def get_audio_duration(self, audio: str, config: TranscriptionConfig):
254
  return get_audio_duration(audio)
255
 
@@ -449,6 +449,10 @@ class VadSileroTranscription(AbstractTranscription):
449
  print("Created Silerio model")
450
 
451
  def _create_model(self):
 
 
 
 
452
  repo_owner = "snakers4"
453
  repo_name = "silero-vad"
454
  ref = "master"
 
231
  # Add expand amount if the segment got expanded
232
  if (adjusted_segment_end > segment_without_expansion):
233
  adjusted_segment["expand_amount"] = adjusted_segment_end - segment_without_expansion
234
+
235
  # Append to output
236
  result['text'] += segment_result['text']
237
  result['segments'].extend(adjusted_segments)
 
249
  if progressListener is not None:
250
  progressListener.on_finished()
251
  return result
252
+
253
  def get_audio_duration(self, audio: str, config: TranscriptionConfig):
254
  return get_audio_duration(audio)
255
 
 
449
  print("Created Silerio model")
450
 
451
  def _create_model(self):
452
+ """
453
+ (get_speech_timestamps, save_audio, read_audio, VADIterator, collect_chunks) = utils
454
+ https://github.com/snakers4/silero-vad
455
+ """
456
  repo_owner = "snakers4"
457
  repo_name = "silero-vad"
458
  ref = "master"
src/whisper/fasterWhisperContainer.py CHANGED
@@ -160,13 +160,17 @@ class FasterWhisperCallback(AbstractWhisperCallback):
160
  "text": segment.text,
161
  "start": segment.start,
162
  "end": segment.end,
 
 
 
 
163
 
164
  # Extra fields added by faster-whisper
165
  "words": [{
166
  "start": word.start,
167
  "end": word.end,
168
  "word": word.word,
169
- "probability": word.probability
170
  } for word in (segment.words if segment.words is not None else []) ]
171
  } for segment in segments]
172
 
 
160
  "text": segment.text,
161
  "start": segment.start,
162
  "end": segment.end,
163
+ "temperature": segment.temperature,
164
+ "avg_logprob": segment.avg_logprob,
165
+ "compression_ratio": segment.compression_ratio,
166
+ "no_speech_prob": segment.no_speech_prob,
167
 
168
  # Extra fields added by faster-whisper
169
  "words": [{
170
  "start": word.start,
171
  "end": word.end,
172
  "word": word.word,
173
+ "probability": word.probability,
174
  } for word in (segment.words if segment.words is not None else []) ]
175
  } for segment in segments]
176