Fixed the default option value for Length Penalty in Whisper Advanced and made some adjustments in the code.
1. The Length Penalty option in Whisper Advanced displays 0 on the web UI, but the default value for this parameter in the config is actually None; showing 0 is a bug in the Gradio program. The parameter is now shown as blank on the web UI so that it does not affect the execution results of the Whisper model.
2. When a parameter option is left empty in Whisper Advanced, the parameter is removed from decodeOptions, meaning the model will not use it (a standalone sketch of this cleanup follows the file list).
3. The segments returned by FasterWhisper now include additional results: temperature, avg_logprob, compression_ratio, and no_speech_prob.
4. The Whisper large-v1 model is now included in the config.
5. The translation model now works with Word Timestamps, except for the Highlight Words mode, which still cannot be used together with the translation model.
- app.py +29 -27
- config.json5 +4 -0
- docs/options.md +30 -1
- src/vad.py +6 -2
- src/whisper/fasterWhisperContainer.py +5 -1
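Item 2 above boils down to a small filtering pass over the options dictionary before it is forwarded to the model. A minimal self-contained sketch of that behavior (the dictionary contents here are hypothetical placeholders, not values from the commit):

```python
# Sketch of the empty-option cleanup from item 2: options left blank in
# the UI arrive as empty strings and are dropped entirely, so the model
# falls back to its own defaults instead of receiving "".
decodeOptions = {
    "length_penalty": "",   # left blank in Whisper Advanced
    "beam_size": 5,
    "patience": 1.0,
    "suppress_tokens": "-1",
}

# Iterate over a copy of the items so deletion during iteration is safe.
for key, value in list(decodeOptions.items()):
    if value == "":
        del decodeOptions[key]

print(decodeOptions)
# {'beam_size': 5, 'patience': 1.0, 'suppress_tokens': '-1'}
```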
app.py

```diff
@@ -265,19 +265,22 @@ class WhisperTranscriber:
         if whisperNoRepeatNgramSize is not None and whisperNoRepeatNgramSize <= 1:
             decodeOptions.pop("no_repeat_ngram_size")
 
-
-
-
-
-        #
-        #
-        #
-        #
-        #
-        #
-        #
-        #
-        #
+        for key, value in list(decodeOptions.items()):
+            if value == "":
+                del decodeOptions[key]
+
+        # word_timestamps = decodeOptions.get("word_timestamps", False)
+        # condition_on_previous_text = decodeOptions.get("condition_on_previous_text", False)
+        # prepend_punctuations = decodeOptions.get("prepend_punctuations", None)
+        # append_punctuations = decodeOptions.get("append_punctuations", None)
+        # initial_prompt = decodeOptions.get("initial_prompt", None)
+        # best_of = decodeOptions.get("best_of", None)
+        # beam_size = decodeOptions.get("beam_size", None)
+        # patience = decodeOptions.get("patience", None)
+        # length_penalty = decodeOptions.get("length_penalty", None)
+        # suppress_tokens = decodeOptions.get("suppress_tokens", None)
+        # compression_ratio_threshold = decodeOptions.get("compression_ratio_threshold", None)
+        # logprob_threshold = decodeOptions.get("logprob_threshold", None)
 
         vadOptions = VadOptions(vad, vadMergeWindow, vadMaxMergeSize, vadPadding, vadPromptWindow, vadInitialPromptMode)
 
@@ -378,7 +381,7 @@ class WhisperTranscriber:
 
                 # Transcribe
                 result = self.transcribe_file(model, source.source_path, whisperLangCode, task, vadOptions, scaled_progress_listener, **decodeOptions)
-                if whisperLang is None and result["language"] is not None and len(result["language"]) > 0:
+                if translationModel is not None and whisperLang is None and result["language"] is not None and len(result["language"]) > 0:
                     whisperLang = get_lang_from_whisper_code(result["language"])
                     translationModel.whisperLang = whisperLang
 
@@ -407,7 +410,7 @@ class WhisperTranscriber:
                     out = ffmpeg.output(input_file, input_srt, output_with_srt, vcodec='copy', acodec='copy', scodec='mov_text')
                     outRsult = out.run(overwrite_output=True)
                 except Exception as e:
-
+                    print(traceback.format_exc())
                     print("Error merge subtitle with source file: \n" + source.source_path + ", \n" + str(e), outRsult)
             elif self.app_config.save_downloaded_files and self.app_config.output_dir is not None and urlData:
                 print("Saving downloaded file [" + source.source_name + "]")
@@ -415,7 +418,7 @@ class WhisperTranscriber:
                     save_path = os.path.join(self.app_config.output_dir, filePrefix)
                     shutil.copy(source.source_path, save_path + suffix)
                 except Exception as e:
-
+                    print(traceback.format_exc())
                     print("Error saving downloaded file: \n" + source.source_path + ", \n" + str(e))
 
         if len(sources) > 1:
@@ -467,7 +470,7 @@ class WhisperTranscriber:
                 try:
                     os.remove(source.source_path)
                 except Exception as e:
-
+                    print(traceback.format_exc())
                     print("Error deleting temporary source file: \n" + source.source_path + ", \n" + str(e))
 
         except ExceededMaximumDuration as e:
@@ -613,7 +616,7 @@ class WhisperTranscriber:
     def _create_silero_config(self, non_speech_strategy: NonSpeechStrategy, vadOptions: VadOptions):
        # Use Silero VAD
        if (self.vad_model is None):
-           self.vad_model = VadSileroTranscription()
+           self.vad_model = VadSileroTranscription() #vad_model is snakers4/silero-vad
 
        config = TranscriptionConfig(non_speech_strategy = non_speech_strategy,
            max_silent_period=vadOptions.vadMergeWindow, max_merge_size=vadOptions.vadMaxMergeSize,
@@ -655,7 +658,6 @@ class WhisperTranscriber:
 
            print("\n\nprocess segments took {} seconds.\n\n".format(perf_end_time - perf_start_time))
        except Exception as e:
-           # Ignore error - it's just a cleanup
            print(traceback.format_exc())
            print("Error process segments: " + str(e))
 
@@ -812,13 +814,13 @@ def create_ui(app_config: ApplicationConfig):
 
    uiArticle = "Read the [documentation here](https://gitlab.com/aadnk/whisper-webui/-/blob/main/docs/options.md)."
    uiArticle += "\n\nWhisper's Task 'translate' only implements the functionality of translating other languages into English. "
-   uiArticle += "OpenAI does not guarantee translations between arbitrary languages. In such cases, you can choose to use the
-   uiArticle += "However, it's important to note that the
-   uiArticle += "\n\nThe larger the parameters of the
+   uiArticle += "OpenAI does not guarantee translations between arbitrary languages. In such cases, you can choose to use the Translation Model to implement the translation task. "
+   uiArticle += "However, it's important to note that the Translation Model runs slowly(in CPU), and the completion time may be twice as long as usual. "
+   uiArticle += "\n\nThe larger the parameters of the Translation model, the better its performance is expected to be. "
    uiArticle += "However, it also requires higher computational resources, making it slower to operate. "
-   uiArticle += "On the other hand, the version converted from ct2 (CTranslate2) requires lower resources and operates at a faster speed."
-   uiArticle += "\n\nCurrently, enabling
-   uiArticle += "because
+   uiArticle += "On the other hand, the version converted from ct2 ([CTranslate2](https://opennmt.net/CTranslate2/guides/transformers.html)) requires lower resources and operates at a faster speed."
+   uiArticle += "\n\nCurrently, enabling `Highlight Words` timestamps cannot be used in conjunction with Translation Model translation "
+   uiArticle += "because Highlight Words will split the source text, and after translation, it becomes a non-word-level string. "
    uiArticle += "\n\nThe 'mt5-zh-ja-en-trimmed' model is finetuned from Google's 'mt5-base' model. "
    uiArticle += "This model has a relatively good translation speed, but it only supports three languages: Chinese, Japanese, and English. "
 
@@ -942,7 +944,7 @@ def create_ui(app_config: ApplicationConfig):
 
    fullInputDict = {}
    fullDescription = uiDescription + "\n\n\n\n" + "Be careful when changing some of the options in the full interface - this can cause the model to crash."
-
+
    with gr.Blocks() as fullTranscribe:
        fullTranslateInput = gr.State(value="m2m100", elem_id = "translateInput")
        fullSourceInput = gr.State(value="urlData", elem_id = "sourceInput")
@@ -994,7 +996,7 @@ def create_ui(app_config: ApplicationConfig):
            gr.Number(label="Best Of - Non-zero temperature", value=app_config.best_of, precision=0, elem_id = "best_of"),
            gr.Number(label="Beam Size - Zero temperature", value=app_config.beam_size, precision=0, elem_id = "beam_size"),
            gr.Number(label="Patience - Zero temperature", value=app_config.patience, elem_id = "patience"),
-           gr.Number(label="Length Penalty - Any temperature", value=app_config.length_penalty, elem_id = "length_penalty"),
+           gr.Number(label="Length Penalty - Any temperature", value=lambda : None if app_config.length_penalty is None else app_config.length_penalty, elem_id = "length_penalty"),
            gr.Text(label="Suppress Tokens - Comma-separated list of token IDs", value=app_config.suppress_tokens, elem_id = "suppress_tokens"),
            gr.Checkbox(label="Condition on previous text", value=app_config.condition_on_previous_text, elem_id = "condition_on_previous_text"),
            gr.Checkbox(label="FP16", value=app_config.fp16, elem_id = "fp16"),
```
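The last hunk above carries the actual Length Penalty fix from item 1. Gradio accepts a callable as a component's default value, and routing the config value through a lambda lets a genuine None reach the Number component as a blank field rather than being coerced to 0. A minimal sketch of the pattern, with a local `length_penalty` standing in for the app config value:

```python
import gradio as gr

length_penalty = None  # stand-in for the config default; None means "let Whisper decide"

with gr.Blocks() as demo:
    # Passing the value through a callable (as the commit does) keeps a
    # genuine None from being rendered as 0 by the Number component.
    gr.Number(
        label="Length Penalty - Any temperature",
        value=lambda: None if length_penalty is None else length_penalty,
        elem_id="length_penalty",
    )

# demo.launch()
```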
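Several except blocks in the app.py diff gain a `print(traceback.format_exc())` before the human-readable error line, so the full stack trace is preserved alongside the short summary. The pattern in isolation, using the stdlib only:

```python
import os
import traceback

def remove_temp_file(path: str):
    try:
        os.remove(path)
    except Exception as e:
        # Full stack trace first, then the short summary message.
        print(traceback.format_exc())
        print("Error deleting temporary source file: \n" + path + ", \n" + str(e))

remove_temp_file("/nonexistent/file.wav")  # prints the trace instead of raising
```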
config.json5

```diff
@@ -23,6 +23,10 @@
         "name": "large",
         "url": "large"
     },
+    {
+        "name": "large-v1",
+        "url": "large-v1"
+    },
     {
         "name": "large-v2",
         "url": "large-v2"
```
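With the new entry in config.json5, large-v1 becomes selectable like the other checkpoints (item 4). Assuming the web UI passes the "large-v1" name through to the backend unchanged, loading that checkpoint directly looks like this (openai-whisper shown; the audio path is a placeholder):

```python
import whisper

# "large-v1" is a standard openai-whisper checkpoint name, matching the
# url value added to config.json5.
model = whisper.load_model("large-v1")
result = model.transcribe("audio.mp3")  # placeholder input file
print(result["text"])
```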
docs/options.md

```diff
@@ -16,6 +16,7 @@ Select the model that Whisper will use to transcribe the audio:
 | medium | 769 M | medium.en | medium | ~5 GB | ~2x |
 | large | 1550 M | N/A | large | ~10 GB | 1x |
 | large-v2 | 1550 M | N/A | large | ~10 GB | 1x |
+| large-v3 | 1550 M | N/A | large | ~10 GB | 1x |
 
 ## Language
 
@@ -150,4 +151,32 @@ The minimum number of speakers for Pyannote to detect.
 
 ## Diarization - Max Speakers
 
-The maximum number of speakers for Pyannote to detect.
+The maximum number of speakers for Pyannote to detect.
+
+## Repetition Penalty
+- ctranslate2: repetition_penalty
+This parameter only takes effect in [faster-whisper (ctranslate2)](https://github.com/SYSTRAN/faster-whisper/issues/478).
+Penalty applied to the score of previously generated tokens (set > 1 to penalize).
+
+## No Repeat Ngram Size
+- ctranslate2: no_repeat_ngram_size
+This parameter only takes effect in [faster-whisper (ctranslate2)](https://github.com/SYSTRAN/faster-whisper/issues/478).
+Prevent repetitions of ngrams with this size (set 0 to disable).
+
+## Translation - Batch Size
+- transformers: batch_size
+When the pipeline will use DataLoader (when passing a dataset, on GPU for a Pytorch model), the size of the batch to use, for inference this is not always beneficial.
+- ctranslate2: max_batch_size
+The maximum batch size.
+
+## Translation - No Repeat Ngram Size
+- transformers: no_repeat_ngram_size
+Value that will be used by default in the generate method of the model for no_repeat_ngram_size. If set to int > 0, all ngrams of that size can only occur once.
+- ctranslate2: no_repeat_ngram_size
+Prevent repetitions of ngrams with this size (set 0 to disable).
+
+## Translation - Num Beams
+- transformers: num_beams
+Number of beams for beam search that will be used by default in the generate method of the model. 1 means no beam search.
+- ctranslate2: beam_size
+Beam size (1 for greedy search).
```
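The two newly documented transcription options map onto ctranslate2 sampling arguments, and per the linked issue they only take effect on the faster-whisper backend. A sketch of passing them directly, assuming a recent faster-whisper release where `transcribe` exposes both keyword arguments (model size and audio path are placeholders):

```python
from faster_whisper import WhisperModel

model = WhisperModel("large-v2", device="cpu", compute_type="int8")

# repetition_penalty > 1 penalizes already-generated tokens;
# no_repeat_ngram_size > 0 forbids repeating ngrams of that size.
segments, info = model.transcribe(
    "audio.mp3",  # placeholder input file
    repetition_penalty=1.1,
    no_repeat_ngram_size=3,
)
for segment in segments:
    print("[%.2fs -> %.2fs] %s" % (segment.start, segment.end, segment.text))
```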
src/vad.py

```diff
@@ -231,7 +231,7 @@ class AbstractTranscription(ABC):
                     # Add expand amount if the segment got expanded
                     if (adjusted_segment_end > segment_without_expansion):
                         adjusted_segment["expand_amount"] = adjusted_segment_end - segment_without_expansion
-
+
                 # Append to output
                 result['text'] += segment_result['text']
                 result['segments'].extend(adjusted_segments)
@@ -249,7 +249,7 @@ class AbstractTranscription(ABC):
         if progressListener is not None:
             progressListener.on_finished()
         return result
-
+
     def get_audio_duration(self, audio: str, config: TranscriptionConfig):
         return get_audio_duration(audio)
 
@@ -449,6 +449,10 @@ class VadSileroTranscription(AbstractTranscription):
             print("Created Silerio model")
 
     def _create_model(self):
+        """
+        (get_speech_timestamps, save_audio, read_audio, VADIterator, collect_chunks) = utils
+        https://github.com/snakers4/silero-vad
+        """
         repo_owner = "snakers4"
         repo_name = "silero-vad"
         ref = "master"
```
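The new docstring in `_create_model` documents what the tuple returned by the snakers4/silero-vad hub entry point unpacks to. For reference, the upstream loading pattern it refers to (the wav file is a placeholder):

```python
import torch

# Load the Silero VAD model and its helper utilities from torch.hub,
# mirroring the repo_owner/repo_name values used in _create_model.
model, utils = torch.hub.load(repo_or_dir="snakers4/silero-vad", model="silero_vad")
(get_speech_timestamps, save_audio, read_audio, VADIterator, collect_chunks) = utils

wav = read_audio("audio.wav", sampling_rate=16000)  # placeholder file
speech_timestamps = get_speech_timestamps(wav, model, sampling_rate=16000)
print(speech_timestamps)
```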
src/whisper/fasterWhisperContainer.py

```diff
@@ -160,13 +160,17 @@ class FasterWhisperCallback(AbstractWhisperCallback):
             "text": segment.text,
             "start": segment.start,
             "end": segment.end,
+            "temperature": segment.temperature,
+            "avg_logprob": segment.avg_logprob,
+            "compression_ratio": segment.compression_ratio,
+            "no_speech_prob": segment.no_speech_prob,
 
             # Extra fields added by faster-whisper
             "words": [{
                 "start": word.start,
                 "end": word.end,
                 "word": word.word,
-                "probability": word.probability
+                "probability": word.probability,
             } for word in (segment.words if segment.words is not None else []) ]
         } for segment in segments]
 
```
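With item 3 in place, downstream consumers can read quality signals straight off each segment dict. A minimal sketch of filtering on the new fields; the threshold values below are illustrative examples, not part of the commit:

```python
# Illustrative consumer of the enriched segment dicts produced by
# FasterWhisperCallback; threshold values are arbitrary examples.
def filter_segments(segments: list[dict]) -> list[dict]:
    kept = []
    for seg in segments:
        if seg["no_speech_prob"] > 0.6:      # probably silence or noise
            continue
        if seg["avg_logprob"] < -1.0:        # low-confidence decode
            continue
        if seg["compression_ratio"] > 2.4:   # likely repetitive output
            continue
        kept.append(seg)
    return kept

example = [{"text": "hello", "start": 0.0, "end": 1.2, "temperature": 0.0,
            "avg_logprob": -0.25, "compression_ratio": 1.3, "no_speech_prob": 0.02,
            "words": []}]
print(filter_segments(example))  # keeps the one confident segment
```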