jhj0517 committed
Commit ed6e918 · unverified · 2 Parent(s): 8dc115b 84fd983

Merge pull request #341 from jhj0517/feature/support-i18n

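This PR wires gradio_i18n through the whole Gradio UI: the layout is built inside a Translate context bound to the new configs/translation.yaml, and user-facing labels are passed through gettext so they resolve to the viewer's locale. The YAML maps each locale code to a flat dictionary keyed by the literal English strings. A minimal, hypothetical sketch of the pattern (a standalone demo, not the app's actual wiring; the dropdown choices and labels here are placeholders):

import gradio as gr
from gradio_i18n import Translate, gettext as _

# Hypothetical demo of the i18n pattern used in this PR. The YAML file has the
# same shape as configs/translation.yaml below: locale code -> {English source
# string: translated text}.
I18N_YAML_PATH = "configs/translation.yaml"

with gr.Blocks() as demo:
    # Strings wrapped with _() inside the Translate context are looked up in
    # the YAML for the active locale when the page is rendered.
    with Translate(I18N_YAML_PATH):
        dd_model = gr.Dropdown(choices=["base", "large-v2"], label=_("Model"))
        btn_run = gr.Button(_("GENERATE SUBTITLE FILE"), variant="primary")

if __name__ == "__main__":
    demo.launch()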
app.py CHANGED
@@ -1,11 +1,13 @@
1
  import os
2
  import argparse
3
  import gradio as gr
 
4
  import yaml
5
 
6
  from modules.utils.paths import (FASTER_WHISPER_MODELS_DIR, DIARIZATION_MODELS_DIR, OUTPUT_DIR, WHISPER_MODELS_DIR,
7
  INSANELY_FAST_WHISPER_MODELS_DIR, NLLB_MODELS_DIR, DEFAULT_PARAMETERS_CONFIG_PATH,
8
- UVR_MODELS_DIR)
 
9
  from modules.utils.files_manager import load_yaml
10
  from modules.whisper.whisper_factory import WhisperFactory
11
  from modules.whisper.faster_whisper_inference import FasterWhisperInference
@@ -22,6 +24,7 @@ class App:
22
  def __init__(self, args):
23
  self.args = args
24
  self.app = gr.Blocks(css=CSS, theme=self.args.theme, delete_cache=(60, 3600))
 
25
  self.whisper_inf = WhisperFactory.create_whisper_inference(
26
  whisper_type=self.args.whisper_type,
27
  whisper_model_dir=self.args.whisper_model_dir,
@@ -38,8 +41,8 @@ class App:
38
  output_dir=os.path.join(self.args.output_dir, "translations")
39
  )
40
  self.default_params = load_yaml(DEFAULT_PARAMETERS_CONFIG_PATH)
41
- print(f"Use \"{self.args.whisper_type}\" implementation")
42
- print(f"Device \"{self.whisper_inf.device}\" is detected")
43
 
44
  def create_whisper_parameters(self):
45
  whisper_params = self.default_params["whisper"]
@@ -49,23 +52,28 @@ class App:
49
 
50
  with gr.Row():
51
  dd_model = gr.Dropdown(choices=self.whisper_inf.available_models, value=whisper_params["model_size"],
52
- label="Model")
53
- dd_lang = gr.Dropdown(choices=["Automatic Detection"] + self.whisper_inf.available_langs,
54
- value=whisper_params["lang"], label="Language")
55
- dd_file_format = gr.Dropdown(choices=["SRT", "WebVTT", "txt"], value="SRT", label="File Format")
 
56
  with gr.Row():
57
- cb_translate = gr.Checkbox(value=whisper_params["is_translate"], label="Translate to English?",
58
  interactive=True)
59
  with gr.Row():
60
- cb_timestamp = gr.Checkbox(value=whisper_params["add_timestamp"], label="Add a timestamp to the end of the filename",
 
61
  interactive=True)
62
 
63
- with gr.Accordion("Advanced Parameters", open=False):
64
- nb_beam_size = gr.Number(label="Beam Size", value=whisper_params["beam_size"], precision=0, interactive=True,
 
65
  info="Beam size to use for decoding.")
66
- nb_log_prob_threshold = gr.Number(label="Log Probability Threshold", value=whisper_params["log_prob_threshold"], interactive=True,
 
67
  info="If the average log probability over sampled tokens is below this value, treat as failed.")
68
- nb_no_speech_threshold = gr.Number(label="No Speech Threshold", value=whisper_params["no_speech_threshold"], interactive=True,
 
69
  info="If the no speech probability is higher than this value AND the average log probability over sampled tokens is below 'Log Prob Threshold', consider the segment as silent.")
70
  dd_compute_type = gr.Dropdown(label="Compute Type", choices=self.whisper_inf.available_compute_types,
71
  value=self.whisper_inf.current_compute_type, interactive=True,
@@ -75,10 +83,12 @@ class App:
75
  info="Number of candidates when sampling with non-zero temperature.")
76
  nb_patience = gr.Number(label="Patience", value=whisper_params["patience"], interactive=True,
77
  info="Beam search patience factor.")
78
- cb_condition_on_previous_text = gr.Checkbox(label="Condition On Previous Text", value=whisper_params["condition_on_previous_text"],
 
79
  interactive=True,
80
  info="Condition on previous text during decoding.")
81
- sld_prompt_reset_on_temperature = gr.Slider(label="Prompt Reset On Temperature", value=whisper_params["prompt_reset_on_temperature"],
 
82
  minimum=0, maximum=1, step=0.01, interactive=True,
83
  info="Resets prompt if temperature is above this value."
84
  " Arg has effect only if 'Condition On Previous Text' is True.")
@@ -87,7 +97,8 @@ class App:
87
  sd_temperature = gr.Slider(label="Temperature", value=whisper_params["temperature"], minimum=0.0,
88
  step=0.01, maximum=1.0, interactive=True,
89
  info="Temperature for sampling. It can be a tuple of temperatures, which will be successively used upon failures according to either `Compression Ratio Threshold` or `Log Prob Threshold`.")
90
- nb_compression_ratio_threshold = gr.Number(label="Compression Ratio Threshold", value=whisper_params["compression_ratio_threshold"],
 
91
  interactive=True,
92
  info="If the gzip compression ratio is above this value, treat as failed.")
93
  nb_chunk_length = gr.Number(label="Chunk Length (s)", value=lambda: whisper_params["chunk_length"],
@@ -96,9 +107,11 @@ class App:
96
  with gr.Group(visible=isinstance(self.whisper_inf, FasterWhisperInference)):
97
  nb_length_penalty = gr.Number(label="Length Penalty", value=whisper_params["length_penalty"],
98
  info="Exponential length penalty constant.")
99
- nb_repetition_penalty = gr.Number(label="Repetition Penalty", value=whisper_params["repetition_penalty"],
 
100
  info="Penalty applied to the score of previously generated tokens (set > 1 to penalize).")
101
- nb_no_repeat_ngram_size = gr.Number(label="No Repeat N-gram Size", value=whisper_params["no_repeat_ngram_size"],
 
102
  precision=0,
103
  info="Prevent repetitions of n-grams with this size (set 0 to disable).")
104
  tb_prefix = gr.Textbox(label="Prefix", value=lambda: whisper_params["prefix"],
@@ -107,48 +120,55 @@ class App:
107
  info="Suppress blank outputs at the beginning of the sampling.")
108
  tb_suppress_tokens = gr.Textbox(label="Suppress Tokens", value=whisper_params["suppress_tokens"],
109
  info="List of token IDs to suppress. -1 will suppress a default set of symbols as defined in the model config.json file.")
110
- nb_max_initial_timestamp = gr.Number(label="Max Initial Timestamp", value=whisper_params["max_initial_timestamp"],
 
111
  info="The initial timestamp cannot be later than this.")
112
  cb_word_timestamps = gr.Checkbox(label="Word Timestamps", value=whisper_params["word_timestamps"],
113
  info="Extract word-level timestamps using the cross-attention pattern and dynamic time warping, and include the timestamps for each word in each segment.")
114
- tb_prepend_punctuations = gr.Textbox(label="Prepend Punctuations", value=whisper_params["prepend_punctuations"],
 
115
  info="If 'Word Timestamps' is True, merge these punctuation symbols with the next word.")
116
- tb_append_punctuations = gr.Textbox(label="Append Punctuations", value=whisper_params["append_punctuations"],
 
117
  info="If 'Word Timestamps' is True, merge these punctuation symbols with the previous word.")
118
  nb_max_new_tokens = gr.Number(label="Max New Tokens", value=lambda: whisper_params["max_new_tokens"],
119
  precision=0,
120
  info="Maximum number of new tokens to generate per-chunk. If not set, the maximum will be set by the default max_length.")
121
  nb_hallucination_silence_threshold = gr.Number(label="Hallucination Silence Threshold (sec)",
122
- value=lambda: whisper_params["hallucination_silence_threshold"],
 
123
  info="When 'Word Timestamps' is True, skip silent periods longer than this threshold (in seconds) when a possible hallucination is detected.")
124
  tb_hotwords = gr.Textbox(label="Hotwords", value=lambda: whisper_params["hotwords"],
125
  info="Hotwords/hint phrases to provide the model with. Has no effect if prefix is not None.")
126
- nb_language_detection_threshold = gr.Number(label="Language Detection Threshold", value=lambda: whisper_params["language_detection_threshold"],
 
 
127
  info="If the maximum probability of the language tokens is higher than this value, the language is detected.")
128
- nb_language_detection_segments = gr.Number(label="Language Detection Segments", value=lambda: whisper_params["language_detection_segments"],
 
129
  precision=0,
130
  info="Number of segments to consider for the language detection.")
131
  with gr.Group(visible=isinstance(self.whisper_inf, InsanelyFastWhisperInference)):
132
  nb_batch_size = gr.Number(label="Batch Size", value=whisper_params["batch_size"], precision=0)
133
 
134
- with gr.Accordion("Background Music Remover Filter", open=False):
135
- cb_bgm_separation = gr.Checkbox(label="Enable Background Music Remover Filter", value=uvr_params["is_separate_bgm"],
 
136
  interactive=True,
137
- info="Enabling this will remove background music by submodel before"
138
- " transcribing ")
139
- dd_uvr_device = gr.Dropdown(label="Device", value=self.whisper_inf.music_separator.device,
140
  choices=self.whisper_inf.music_separator.available_devices)
141
- dd_uvr_model_size = gr.Dropdown(label="Model", value=uvr_params["model_size"],
142
  choices=self.whisper_inf.music_separator.available_models)
143
  nb_uvr_segment_size = gr.Number(label="Segment Size", value=uvr_params["segment_size"], precision=0)
144
- cb_uvr_save_file = gr.Checkbox(label="Save separated files to output", value=uvr_params["save_file"])
145
- cb_uvr_enable_offload = gr.Checkbox(label="Offload sub model after removing background music",
146
  value=uvr_params["enable_offload"])
147
 
148
- with gr.Accordion("Voice Detection Filter", open=False):
149
- cb_vad_filter = gr.Checkbox(label="Enable Silero VAD Filter", value=vad_params["vad_filter"],
150
  interactive=True,
151
- info="Enable this to transcribe only detected voice parts by submodel.")
152
  sd_threshold = gr.Slider(minimum=0.0, maximum=1.0, step=0.01, label="Speech Threshold",
153
  value=vad_params["threshold"],
154
  info="Lower it to be more sensitive to small sounds.")
@@ -165,15 +185,11 @@ class App:
165
  nb_speech_pad_ms = gr.Number(label="Speech Padding (ms)", precision=0, value=vad_params["speech_pad_ms"],
166
  info="Final speech chunks are padded by this time each side")
167
 
168
- with gr.Accordion("Diarization", open=False):
169
- cb_diarize = gr.Checkbox(label="Enable Diarization", value=diarization_params["is_diarize"])
170
- tb_hf_token = gr.Text(label="HuggingFace Token", value=diarization_params["hf_token"],
171
- info="This is only needed the first time you download the model. If you already have"
172
- " models, you don't need to enter. To download the model, you must manually go "
173
- "to \"https://huggingface.co/pyannote/speaker-diarization-3.1\" and "
174
- "\"https://huggingface.co/pyannote/segmentation-3.0\" and agree to"
175
- " their requirement.")
176
- dd_diarization_device = gr.Dropdown(label="Device",
177
  choices=self.whisper_inf.diarizer.get_available_device(),
178
  value=self.whisper_inf.diarizer.get_device())
179
 
@@ -213,179 +229,191 @@ class App:
213
  uvr_params = self.default_params["bgm_separation"]
214
 
215
  with self.app:
216
- with gr.Row():
217
- with gr.Column():
218
- gr.Markdown(MARKDOWN, elem_id="md_project")
219
- with gr.Tabs():
220
- with gr.TabItem("File"): # tab1
221
  with gr.Column():
222
- input_file = gr.Files(type="filepath", label="Upload File here")
223
- tb_input_folder = gr.Textbox(label="Input Folder Path (Optional)",
224
- info="Optional: Specify the folder path where the input files are located, if you prefer to use local files instead of uploading them."
225
- " Leave this field empty if you do not wish to use a local path.",
226
- visible=self.args.colab,
227
- value="")
228
-
229
- whisper_params, dd_file_format, cb_timestamp = self.create_whisper_parameters()
230
-
231
- with gr.Row():
232
- btn_run = gr.Button("GENERATE SUBTITLE FILE", variant="primary")
233
- with gr.Row():
234
- tb_indicator = gr.Textbox(label="Output", scale=5)
235
- files_subtitles = gr.Files(label="Downloadable output file", scale=3, interactive=False)
236
- btn_openfolder = gr.Button('📂', scale=1)
237
-
238
- params = [input_file, tb_input_folder, dd_file_format, cb_timestamp]
239
- btn_run.click(fn=self.whisper_inf.transcribe_file,
240
- inputs=params + whisper_params.as_list(),
241
- outputs=[tb_indicator, files_subtitles])
242
- btn_openfolder.click(fn=lambda: self.open_folder("outputs"), inputs=None, outputs=None)
243
-
244
- with gr.TabItem("Youtube"): # tab2
245
- with gr.Row():
246
- tb_youtubelink = gr.Textbox(label="Youtube Link")
247
- with gr.Row(equal_height=True):
248
  with gr.Column():
249
- img_thumbnail = gr.Image(label="Youtube Thumbnail")
250
- with gr.Column():
251
- tb_title = gr.Label(label="Youtube Title")
252
- tb_description = gr.Textbox(label="Youtube Description", max_lines=15)
 
 
253
 
254
- whisper_params, dd_file_format, cb_timestamp = self.create_whisper_parameters()
255
 
256
- with gr.Row():
257
- btn_run = gr.Button("GENERATE SUBTITLE FILE", variant="primary")
258
- with gr.Row():
259
- tb_indicator = gr.Textbox(label="Output", scale=5)
260
- files_subtitles = gr.Files(label="Downloadable output file", scale=3)
261
- btn_openfolder = gr.Button('📂', scale=1)
262
 
263
- params = [tb_youtubelink, dd_file_format, cb_timestamp]
 
 
 
 
264
 
265
- btn_run.click(fn=self.whisper_inf.transcribe_youtube,
266
- inputs=params + whisper_params.as_list(),
267
- outputs=[tb_indicator, files_subtitles])
268
- tb_youtubelink.change(get_ytmetas, inputs=[tb_youtubelink],
269
- outputs=[img_thumbnail, tb_title, tb_description])
270
- btn_openfolder.click(fn=lambda: self.open_folder("outputs"), inputs=None, outputs=None)
 
 
 
271
 
272
- with gr.TabItem("Mic"): # tab3
273
- with gr.Row():
274
- mic_input = gr.Microphone(label="Record with Mic", type="filepath", interactive=True)
275
 
276
- whisper_params, dd_file_format, cb_timestamp = self.create_whisper_parameters()
 
 
 
 
 
277
 
278
- with gr.Row():
279
- btn_run = gr.Button("GENERATE SUBTITLE FILE", variant="primary")
280
- with gr.Row():
281
- tb_indicator = gr.Textbox(label="Output", scale=5)
282
- files_subtitles = gr.Files(label="Downloadable output file", scale=3)
283
- btn_openfolder = gr.Button('📂', scale=1)
284
 
285
- params = [mic_input, dd_file_format, cb_timestamp]
 
 
 
 
 
286
 
287
- btn_run.click(fn=self.whisper_inf.transcribe_mic,
288
- inputs=params + whisper_params.as_list(),
289
- outputs=[tb_indicator, files_subtitles])
290
- btn_openfolder.click(fn=lambda: self.open_folder("outputs"), inputs=None, outputs=None)
291
 
292
- with gr.TabItem("T2T Translation"): # tab 4
293
- with gr.Row():
294
- file_subs = gr.Files(type="filepath", label="Upload Subtitle Files to translate here")
295
 
296
- with gr.TabItem("DeepL API"): # sub tab1
297
- with gr.Row():
298
- tb_api_key = gr.Textbox(label="Your Auth Key (API KEY)", value=deepl_params["api_key"])
299
- with gr.Row():
300
- dd_source_lang = gr.Dropdown(label="Source Language", value=deepl_params["source_lang"],
301
- choices=list(self.deepl_api.available_source_langs.keys()))
302
- dd_target_lang = gr.Dropdown(label="Target Language", value=deepl_params["target_lang"],
303
- choices=list(self.deepl_api.available_target_langs.keys()))
304
  with gr.Row():
305
- cb_is_pro = gr.Checkbox(label="Pro User?", value=deepl_params["is_pro"])
306
  with gr.Row():
307
- cb_timestamp = gr.Checkbox(value=translation_params["add_timestamp"], label="Add a timestamp to the end of the filename",
308
- interactive=True)
309
- with gr.Row():
310
- btn_run = gr.Button("TRANSLATE SUBTITLE FILE", variant="primary")
311
- with gr.Row():
312
- tb_indicator = gr.Textbox(label="Output", scale=5)
313
- files_subtitles = gr.Files(label="Downloadable output file", scale=3)
314
  btn_openfolder = gr.Button('📂', scale=1)
315
 
316
- btn_run.click(fn=self.deepl_api.translate_deepl,
317
- inputs=[tb_api_key, file_subs, dd_source_lang, dd_target_lang,
318
- cb_is_pro, cb_timestamp],
319
- outputs=[tb_indicator, files_subtitles])
320
 
321
- btn_openfolder.click(fn=lambda: self.open_folder(os.path.join(self.args.output_dir, "translations")),
322
- inputs=None,
323
- outputs=None)
 
324
 
325
- with gr.TabItem("NLLB"): # sub tab2
326
- with gr.Row():
327
- dd_model_size = gr.Dropdown(label="Model", value=nllb_params["model_size"],
328
- choices=self.nllb_inf.available_models)
329
- dd_source_lang = gr.Dropdown(label="Source Language", value=nllb_params["source_lang"],
330
- choices=self.nllb_inf.available_source_langs)
331
- dd_target_lang = gr.Dropdown(label="Target Language", value=nllb_params["target_lang"],
332
- choices=self.nllb_inf.available_target_langs)
333
- with gr.Row():
334
- nb_max_length = gr.Number(label="Max Length Per Line", value=nllb_params["max_length"],
335
- precision=0)
336
- with gr.Row():
337
- cb_timestamp = gr.Checkbox(value=translation_params["add_timestamp"], label="Add a timestamp to the end of the filename",
338
- interactive=True)
339
  with gr.Row():
340
- btn_run = gr.Button("TRANSLATE SUBTITLE FILE", variant="primary")
341
- with gr.Row():
342
- tb_indicator = gr.Textbox(label="Output", scale=5)
343
- files_subtitles = gr.Files(label="Downloadable output file", scale=3)
344
- btn_openfolder = gr.Button('📂', scale=1)
345
  with gr.Column():
346
- md_vram_table = gr.HTML(NLLB_VRAM_TABLE, elem_id="md_nllb_vram_table")
347
-
348
- btn_run.click(fn=self.nllb_inf.translate_file,
349
- inputs=[file_subs, dd_model_size, dd_source_lang, dd_target_lang,
350
- nb_max_length, cb_timestamp],
351
- outputs=[tb_indicator, files_subtitles])
352
-
353
- btn_openfolder.click(fn=lambda: self.open_folder(os.path.join(self.args.output_dir, "translations")),
354
- inputs=None,
355
- outputs=None)
356
-
357
- with gr.TabItem("BGM Separation"):
358
- files_audio = gr.Files(type="filepath", label="Upload Audio Files to separate background music")
359
- dd_uvr_device = gr.Dropdown(label="Device", value=self.whisper_inf.music_separator.device,
360
- choices=self.whisper_inf.music_separator.available_devices)
361
- dd_uvr_model_size = gr.Dropdown(label="Model", value=uvr_params["model_size"],
362
- choices=self.whisper_inf.music_separator.available_models)
363
- nb_uvr_segment_size = gr.Number(label="Segment Size", value=uvr_params["segment_size"], precision=0)
364
- cb_uvr_save_file = gr.Checkbox(label="Save separated files to output",
365
- value=True, visible=False)
366
- btn_run = gr.Button("SEPARATE BACKGROUND MUSIC", variant="primary")
367
- with gr.Column():
368
- with gr.Row():
369
- ad_instrumental = gr.Audio(label="Instrumental", scale=8)
370
- btn_open_instrumental_folder = gr.Button('📂', scale=1)
371
- with gr.Row():
372
- ad_vocals = gr.Audio(label="Vocals", scale=8)
373
- btn_open_vocals_folder = gr.Button('📂', scale=1)
374
-
375
- btn_run.click(fn=self.whisper_inf.music_separator.separate_files,
376
- inputs=[files_audio, dd_uvr_model_size, dd_uvr_device, nb_uvr_segment_size,
377
- cb_uvr_save_file],
378
- outputs=[ad_instrumental, ad_vocals])
379
- btn_open_instrumental_folder.click(inputs=None,
380
- outputs=None,
381
- fn=lambda: self.open_folder(os.path.join(
382
- self.args.output_dir, "UVR", "instrumental"
383
- )))
384
- btn_open_vocals_folder.click(inputs=None,
385
- outputs=None,
386
- fn=lambda: self.open_folder(os.path.join(
387
- self.args.output_dir, "UVR", "vocals"
388
- )))
389
 
390
  # Launch the app with optional gradio settings
391
  args = self.args
@@ -418,10 +446,10 @@ class App:
418
  return gr.Checkbox(visible=True, value=False, label="Translate to English?", interactive=True)
419
 
420
 
421
- # Create the parser for command-line arguments
422
  parser = argparse.ArgumentParser()
423
  parser.add_argument('--whisper_type', type=str, default="faster-whisper",
424
- help='A type of the whisper implementation between: ["whisper", "faster-whisper", "insanely-fast-whisper"]')
 
425
  parser.add_argument('--share', type=str2bool, default=False, nargs='?', const=True, help='Gradio share value')
426
  parser.add_argument('--server_name', type=str, default=None, help='Gradio server host')
427
  parser.add_argument('--server_port', type=int, default=None, help='Gradio server port')
@@ -430,8 +458,10 @@ parser.add_argument('--username', type=str, default=None, help='Gradio authentic
430
  parser.add_argument('--password', type=str, default=None, help='Gradio authentication password')
431
  parser.add_argument('--theme', type=str, default=None, help='Gradio Blocks theme')
432
  parser.add_argument('--colab', type=str2bool, default=False, nargs='?', const=True, help='Is colab user or not')
433
- parser.add_argument('--api_open', type=str2bool, default=False, nargs='?', const=True, help='Enable api or not in Gradio')
434
- parser.add_argument('--inbrowser', type=str2bool, default=True, nargs='?', const=True, help='Whether to automatically start Gradio app or not')
 
 
435
  parser.add_argument('--whisper_model_dir', type=str, default=WHISPER_MODELS_DIR,
436
  help='Directory path of the whisper model')
437
  parser.add_argument('--faster_whisper_model_dir', type=str, default=FASTER_WHISPER_MODELS_DIR,
 
1
  import os
2
  import argparse
3
  import gradio as gr
4
+ from gradio_i18n import Translate, gettext as _
5
  import yaml
6
 
7
  from modules.utils.paths import (FASTER_WHISPER_MODELS_DIR, DIARIZATION_MODELS_DIR, OUTPUT_DIR, WHISPER_MODELS_DIR,
8
  INSANELY_FAST_WHISPER_MODELS_DIR, NLLB_MODELS_DIR, DEFAULT_PARAMETERS_CONFIG_PATH,
9
+ UVR_MODELS_DIR, I18N_YAML_PATH)
10
+ from modules.utils.constants import AUTOMATIC_DETECTION
11
  from modules.utils.files_manager import load_yaml
12
  from modules.whisper.whisper_factory import WhisperFactory
13
  from modules.whisper.faster_whisper_inference import FasterWhisperInference
 
24
  def __init__(self, args):
25
  self.args = args
26
  self.app = gr.Blocks(css=CSS, theme=self.args.theme, delete_cache=(60, 3600))
27
+ self.i18n = Translate(I18N_YAML_PATH)
28
  self.whisper_inf = WhisperFactory.create_whisper_inference(
29
  whisper_type=self.args.whisper_type,
30
  whisper_model_dir=self.args.whisper_model_dir,
 
41
  output_dir=os.path.join(self.args.output_dir, "translations")
42
  )
43
  self.default_params = load_yaml(DEFAULT_PARAMETERS_CONFIG_PATH)
44
+ print(f"Use \"{self.args.whisper_type}\" implementation\n"
45
+ f"Device \"{self.whisper_inf.device}\" is detected")
46
 
47
  def create_whisper_parameters(self):
48
  whisper_params = self.default_params["whisper"]
 
52
 
53
  with gr.Row():
54
  dd_model = gr.Dropdown(choices=self.whisper_inf.available_models, value=whisper_params["model_size"],
55
+ label=_("Model"))
56
+ dd_lang = gr.Dropdown(choices=self.whisper_inf.available_langs + [AUTOMATIC_DETECTION],
57
+ value=AUTOMATIC_DETECTION if whisper_params["lang"] == AUTOMATIC_DETECTION.unwrap()
58
+ else whisper_params["lang"], label=_("Language"))
59
+ dd_file_format = gr.Dropdown(choices=["SRT", "WebVTT", "txt"], value="SRT", label=_("File Format"))
60
  with gr.Row():
61
+ cb_translate = gr.Checkbox(value=whisper_params["is_translate"], label=_("Translate to English?"),
62
  interactive=True)
63
  with gr.Row():
64
+ cb_timestamp = gr.Checkbox(value=whisper_params["add_timestamp"],
65
+ label=_("Add a timestamp to the end of the filename"),
66
  interactive=True)
67
 
68
+ with gr.Accordion(_("Advanced Parameters"), open=False):
69
+ nb_beam_size = gr.Number(label="Beam Size", value=whisper_params["beam_size"], precision=0,
70
+ interactive=True,
71
  info="Beam size to use for decoding.")
72
+ nb_log_prob_threshold = gr.Number(label="Log Probability Threshold",
73
+ value=whisper_params["log_prob_threshold"], interactive=True,
74
  info="If the average log probability over sampled tokens is below this value, treat as failed.")
75
+ nb_no_speech_threshold = gr.Number(label="No Speech Threshold", value=whisper_params["no_speech_threshold"],
76
+ interactive=True,
77
  info="If the no speech probability is higher than this value AND the average log probability over sampled tokens is below 'Log Prob Threshold', consider the segment as silent.")
78
  dd_compute_type = gr.Dropdown(label="Compute Type", choices=self.whisper_inf.available_compute_types,
79
  value=self.whisper_inf.current_compute_type, interactive=True,
 
83
  info="Number of candidates when sampling with non-zero temperature.")
84
  nb_patience = gr.Number(label="Patience", value=whisper_params["patience"], interactive=True,
85
  info="Beam search patience factor.")
86
+ cb_condition_on_previous_text = gr.Checkbox(label="Condition On Previous Text",
87
+ value=whisper_params["condition_on_previous_text"],
88
  interactive=True,
89
  info="Condition on previous text during decoding.")
90
+ sld_prompt_reset_on_temperature = gr.Slider(label="Prompt Reset On Temperature",
91
+ value=whisper_params["prompt_reset_on_temperature"],
92
  minimum=0, maximum=1, step=0.01, interactive=True,
93
  info="Resets prompt if temperature is above this value."
94
  " Arg has effect only if 'Condition On Previous Text' is True.")
 
97
  sd_temperature = gr.Slider(label="Temperature", value=whisper_params["temperature"], minimum=0.0,
98
  step=0.01, maximum=1.0, interactive=True,
99
  info="Temperature for sampling. It can be a tuple of temperatures, which will be successively used upon failures according to either `Compression Ratio Threshold` or `Log Prob Threshold`.")
100
+ nb_compression_ratio_threshold = gr.Number(label="Compression Ratio Threshold",
101
+ value=whisper_params["compression_ratio_threshold"],
102
  interactive=True,
103
  info="If the gzip compression ratio is above this value, treat as failed.")
104
  nb_chunk_length = gr.Number(label="Chunk Length (s)", value=lambda: whisper_params["chunk_length"],
 
107
  with gr.Group(visible=isinstance(self.whisper_inf, FasterWhisperInference)):
108
  nb_length_penalty = gr.Number(label="Length Penalty", value=whisper_params["length_penalty"],
109
  info="Exponential length penalty constant.")
110
+ nb_repetition_penalty = gr.Number(label="Repetition Penalty",
111
+ value=whisper_params["repetition_penalty"],
112
  info="Penalty applied to the score of previously generated tokens (set > 1 to penalize).")
113
+ nb_no_repeat_ngram_size = gr.Number(label="No Repeat N-gram Size",
114
+ value=whisper_params["no_repeat_ngram_size"],
115
  precision=0,
116
  info="Prevent repetitions of n-grams with this size (set 0 to disable).")
117
  tb_prefix = gr.Textbox(label="Prefix", value=lambda: whisper_params["prefix"],
 
120
  info="Suppress blank outputs at the beginning of the sampling.")
121
  tb_suppress_tokens = gr.Textbox(label="Suppress Tokens", value=whisper_params["suppress_tokens"],
122
  info="List of token IDs to suppress. -1 will suppress a default set of symbols as defined in the model config.json file.")
123
+ nb_max_initial_timestamp = gr.Number(label="Max Initial Timestamp",
124
+ value=whisper_params["max_initial_timestamp"],
125
  info="The initial timestamp cannot be later than this.")
126
  cb_word_timestamps = gr.Checkbox(label="Word Timestamps", value=whisper_params["word_timestamps"],
127
  info="Extract word-level timestamps using the cross-attention pattern and dynamic time warping, and include the timestamps for each word in each segment.")
128
+ tb_prepend_punctuations = gr.Textbox(label="Prepend Punctuations",
129
+ value=whisper_params["prepend_punctuations"],
130
  info="If 'Word Timestamps' is True, merge these punctuation symbols with the next word.")
131
+ tb_append_punctuations = gr.Textbox(label="Append Punctuations",
132
+ value=whisper_params["append_punctuations"],
133
  info="If 'Word Timestamps' is True, merge these punctuation symbols with the previous word.")
134
  nb_max_new_tokens = gr.Number(label="Max New Tokens", value=lambda: whisper_params["max_new_tokens"],
135
  precision=0,
136
  info="Maximum number of new tokens to generate per-chunk. If not set, the maximum will be set by the default max_length.")
137
  nb_hallucination_silence_threshold = gr.Number(label="Hallucination Silence Threshold (sec)",
138
+ value=lambda: whisper_params[
139
+ "hallucination_silence_threshold"],
140
  info="When 'Word Timestamps' is True, skip silent periods longer than this threshold (in seconds) when a possible hallucination is detected.")
141
  tb_hotwords = gr.Textbox(label="Hotwords", value=lambda: whisper_params["hotwords"],
142
  info="Hotwords/hint phrases to provide the model with. Has no effect if prefix is not None.")
143
+ nb_language_detection_threshold = gr.Number(label="Language Detection Threshold",
144
+ value=lambda: whisper_params[
145
+ "language_detection_threshold"],
146
  info="If the maximum probability of the language tokens is higher than this value, the language is detected.")
147
+ nb_language_detection_segments = gr.Number(label="Language Detection Segments",
148
+ value=lambda: whisper_params["language_detection_segments"],
149
  precision=0,
150
  info="Number of segments to consider for the language detection.")
151
  with gr.Group(visible=isinstance(self.whisper_inf, InsanelyFastWhisperInference)):
152
  nb_batch_size = gr.Number(label="Batch Size", value=whisper_params["batch_size"], precision=0)
153
 
154
+ with gr.Accordion(_("Background Music Remover Filter"), open=False):
155
+ cb_bgm_separation = gr.Checkbox(label=_("Enable Background Music Remover Filter"),
156
+ value=uvr_params["is_separate_bgm"],
157
  interactive=True,
158
+ info=_("Enabling this will remove background music"))
159
+ dd_uvr_device = gr.Dropdown(label=_("Device"), value=self.whisper_inf.music_separator.device,
 
160
  choices=self.whisper_inf.music_separator.available_devices)
161
+ dd_uvr_model_size = gr.Dropdown(label=_("Model"), value=uvr_params["model_size"],
162
  choices=self.whisper_inf.music_separator.available_models)
163
  nb_uvr_segment_size = gr.Number(label="Segment Size", value=uvr_params["segment_size"], precision=0)
164
+ cb_uvr_save_file = gr.Checkbox(label=_("Save separated files to output"), value=uvr_params["save_file"])
165
+ cb_uvr_enable_offload = gr.Checkbox(label=_("Offload sub model after removing background music"),
166
  value=uvr_params["enable_offload"])
167
 
168
+ with gr.Accordion(_("Voice Detection Filter"), open=False):
169
+ cb_vad_filter = gr.Checkbox(label=_("Enable Silero VAD Filter"), value=vad_params["vad_filter"],
170
  interactive=True,
171
+ info=_("Enable this to transcribe only detected voice"))
172
  sd_threshold = gr.Slider(minimum=0.0, maximum=1.0, step=0.01, label="Speech Threshold",
173
  value=vad_params["threshold"],
174
  info="Lower it to be more sensitive to small sounds.")
 
185
  nb_speech_pad_ms = gr.Number(label="Speech Padding (ms)", precision=0, value=vad_params["speech_pad_ms"],
186
  info="Final speech chunks are padded by this time each side")
187
 
188
+ with gr.Accordion(_("Diarization"), open=False):
189
+ cb_diarize = gr.Checkbox(label=_("Enable Diarization"), value=diarization_params["is_diarize"])
190
+ tb_hf_token = gr.Text(label=_("HuggingFace Token"), value=diarization_params["hf_token"],
191
+ info=_("This is only needed the first time you download the model"))
192
+ dd_diarization_device = gr.Dropdown(label=_("Device"),
 
 
 
 
193
  choices=self.whisper_inf.diarizer.get_available_device(),
194
  value=self.whisper_inf.diarizer.get_device())
195
 
 
229
  uvr_params = self.default_params["bgm_separation"]
230
 
231
  with self.app:
232
+ with self.i18n:
233
+ with gr.Row():
 
 
 
234
  with gr.Column():
235
+ gr.Markdown(MARKDOWN, elem_id="md_project")
236
+ with gr.Tabs():
237
+ with gr.TabItem(_("File")): # tab1
238
  with gr.Column():
239
+ input_file = gr.Files(type="filepath", label=_("Upload File here"))
240
+ tb_input_folder = gr.Textbox(label="Input Folder Path (Optional)",
241
+ info="Optional: Specify the folder path where the input files are located, if you prefer to use local files instead of uploading them."
242
+ " Leave this field empty if you do not wish to use a local path.",
243
+ visible=self.args.colab,
244
+ value="")
245
 
246
+ whisper_params, dd_file_format, cb_timestamp = self.create_whisper_parameters()
247
 
248
+ with gr.Row():
249
+ btn_run = gr.Button(_("GENERATE SUBTITLE FILE"), variant="primary")
250
+ with gr.Row():
251
+ tb_indicator = gr.Textbox(label=_("Output"), scale=5)
252
+ files_subtitles = gr.Files(label=_("Downloadable output file"), scale=3, interactive=False)
253
+ btn_openfolder = gr.Button('📂', scale=1)
254
 
255
+ params = [input_file, tb_input_folder, dd_file_format, cb_timestamp]
256
+ btn_run.click(fn=self.whisper_inf.transcribe_file,
257
+ inputs=params + whisper_params.as_list(),
258
+ outputs=[tb_indicator, files_subtitles])
259
+ btn_openfolder.click(fn=lambda: self.open_folder("outputs"), inputs=None, outputs=None)
260
 
261
+ with gr.TabItem(_("Youtube")): # tab2
262
+ with gr.Row():
263
+ tb_youtubelink = gr.Textbox(label=_("Youtube Link"))
264
+ with gr.Row(equal_height=True):
265
+ with gr.Column():
266
+ img_thumbnail = gr.Image(label=_("Youtube Thumbnail"))
267
+ with gr.Column():
268
+ tb_title = gr.Label(label=_("Youtube Title"))
269
+ tb_description = gr.Textbox(label=_("Youtube Description"), max_lines=15)
270
 
271
+ whisper_params, dd_file_format, cb_timestamp = self.create_whisper_parameters()
 
 
272
 
273
+ with gr.Row():
274
+ btn_run = gr.Button(_("GENERATE SUBTITLE FILE"), variant="primary")
275
+ with gr.Row():
276
+ tb_indicator = gr.Textbox(label=_("Output"), scale=5)
277
+ files_subtitles = gr.Files(label=_("Downloadable output file"), scale=3)
278
+ btn_openfolder = gr.Button('📂', scale=1)
279
 
280
+ params = [tb_youtubelink, dd_file_format, cb_timestamp]
 
 
 
 
 
281
 
282
+ btn_run.click(fn=self.whisper_inf.transcribe_youtube,
283
+ inputs=params + whisper_params.as_list(),
284
+ outputs=[tb_indicator, files_subtitles])
285
+ tb_youtubelink.change(get_ytmetas, inputs=[tb_youtubelink],
286
+ outputs=[img_thumbnail, tb_title, tb_description])
287
+ btn_openfolder.click(fn=lambda: self.open_folder("outputs"), inputs=None, outputs=None)
288
 
289
+ with gr.TabItem(_("Mic")): # tab3
290
+ with gr.Row():
291
+ mic_input = gr.Microphone(label=_("Record with Mic"), type="filepath", interactive=True)
 
292
 
293
+ whisper_params, dd_file_format, cb_timestamp = self.create_whisper_parameters()
 
 
294
 
 
 
 
 
 
 
 
 
295
  with gr.Row():
296
+ btn_run = gr.Button(_("GENERATE SUBTITLE FILE"), variant="primary")
297
  with gr.Row():
298
+ tb_indicator = gr.Textbox(label=_("Output"), scale=5)
299
+ files_subtitles = gr.Files(label=_("Downloadable output file"), scale=3)
 
 
 
 
 
300
  btn_openfolder = gr.Button('📂', scale=1)
301
 
302
+ params = [mic_input, dd_file_format, cb_timestamp]
 
 
 
303
 
304
+ btn_run.click(fn=self.whisper_inf.transcribe_mic,
305
+ inputs=params + whisper_params.as_list(),
306
+ outputs=[tb_indicator, files_subtitles])
307
+ btn_openfolder.click(fn=lambda: self.open_folder("outputs"), inputs=None, outputs=None)
308
 
309
+ with gr.TabItem(_("T2T Translation")): # tab 4
 
 
 
 
 
 
 
 
 
 
 
 
 
310
  with gr.Row():
311
+ file_subs = gr.Files(type="filepath", label=_("Upload Subtitle Files to translate here"))
312
+
313
+ with gr.TabItem(_("DeepL API")): # sub tab1
314
+ with gr.Row():
315
+ tb_api_key = gr.Textbox(label=_("Your Auth Key (API KEY)"),
316
+ value=deepl_params["api_key"])
317
+ with gr.Row():
318
+ dd_source_lang = gr.Dropdown(label=_("Source Language"),
319
+ value=AUTOMATIC_DETECTION if deepl_params["source_lang"] == AUTOMATIC_DETECTION.unwrap()
320
+ else deepl_params["source_lang"],
321
+ choices=list(self.deepl_api.available_source_langs.keys()))
322
+ dd_target_lang = gr.Dropdown(label=_("Target Language"),
323
+ value=deepl_params["target_lang"],
324
+ choices=list(self.deepl_api.available_target_langs.keys()))
325
+ with gr.Row():
326
+ cb_is_pro = gr.Checkbox(label=_("Pro User?"), value=deepl_params["is_pro"])
327
+ with gr.Row():
328
+ cb_timestamp = gr.Checkbox(value=translation_params["add_timestamp"],
329
+ label=_("Add a timestamp to the end of the filename"),
330
+ interactive=True)
331
+ with gr.Row():
332
+ btn_run = gr.Button(_("TRANSLATE SUBTITLE FILE"), variant="primary")
333
+ with gr.Row():
334
+ tb_indicator = gr.Textbox(label=_("Output"), scale=5)
335
+ files_subtitles = gr.Files(label=_("Downloadable output file"), scale=3)
336
+ btn_openfolder = gr.Button('📂', scale=1)
337
+
338
+ btn_run.click(fn=self.deepl_api.translate_deepl,
339
+ inputs=[tb_api_key, file_subs, dd_source_lang, dd_target_lang,
340
+ cb_is_pro, cb_timestamp],
341
+ outputs=[tb_indicator, files_subtitles])
342
+
343
+ btn_openfolder.click(
344
+ fn=lambda: self.open_folder(os.path.join(self.args.output_dir, "translations")),
345
+ inputs=None,
346
+ outputs=None)
347
+
348
+ with gr.TabItem(_("NLLB")): # sub tab2
349
+ with gr.Row():
350
+ dd_model_size = gr.Dropdown(label=_("Model"), value=nllb_params["model_size"],
351
+ choices=self.nllb_inf.available_models)
352
+ dd_source_lang = gr.Dropdown(label=_("Source Language"),
353
+ value=nllb_params["source_lang"],
354
+ choices=self.nllb_inf.available_source_langs)
355
+ dd_target_lang = gr.Dropdown(label=_("Target Language"),
356
+ value=nllb_params["target_lang"],
357
+ choices=self.nllb_inf.available_target_langs)
358
+ with gr.Row():
359
+ nb_max_length = gr.Number(label="Max Length Per Line", value=nllb_params["max_length"],
360
+ precision=0)
361
+ with gr.Row():
362
+ cb_timestamp = gr.Checkbox(value=translation_params["add_timestamp"],
363
+ label=_("Add a timestamp to the end of the filename"),
364
+ interactive=True)
365
+ with gr.Row():
366
+ btn_run = gr.Button(_("TRANSLATE SUBTITLE FILE"), variant="primary")
367
+ with gr.Row():
368
+ tb_indicator = gr.Textbox(label=_("Output"), scale=5)
369
+ files_subtitles = gr.Files(label=_("Downloadable output file"), scale=3)
370
+ btn_openfolder = gr.Button('📂', scale=1)
371
+ with gr.Column():
372
+ md_vram_table = gr.HTML(NLLB_VRAM_TABLE, elem_id="md_nllb_vram_table")
373
+
374
+ btn_run.click(fn=self.nllb_inf.translate_file,
375
+ inputs=[file_subs, dd_model_size, dd_source_lang, dd_target_lang,
376
+ nb_max_length, cb_timestamp],
377
+ outputs=[tb_indicator, files_subtitles])
378
+
379
+ btn_openfolder.click(
380
+ fn=lambda: self.open_folder(os.path.join(self.args.output_dir, "translations")),
381
+ inputs=None,
382
+ outputs=None)
383
+
384
+ with gr.TabItem(_("BGM Separation")):
385
+ files_audio = gr.Files(type="filepath", label=_("Upload Audio Files to separate background music"))
386
+ dd_uvr_device = gr.Dropdown(label=_("Device"), value=self.whisper_inf.music_separator.device,
387
+ choices=self.whisper_inf.music_separator.available_devices)
388
+ dd_uvr_model_size = gr.Dropdown(label=_("Model"), value=uvr_params["model_size"],
389
+ choices=self.whisper_inf.music_separator.available_models)
390
+ nb_uvr_segment_size = gr.Number(label="Segment Size", value=uvr_params["segment_size"],
391
+ precision=0)
392
+ cb_uvr_save_file = gr.Checkbox(label=_("Save separated files to output"),
393
+ value=True, visible=False)
394
+ btn_run = gr.Button(_("SEPARATE BACKGROUND MUSIC"), variant="primary")
395
  with gr.Column():
396
+ with gr.Row():
397
+ ad_instrumental = gr.Audio(label=_("Instrumental"), scale=8)
398
+ btn_open_instrumental_folder = gr.Button('📂', scale=1)
399
+ with gr.Row():
400
+ ad_vocals = gr.Audio(label=_("Vocals"), scale=8)
401
+ btn_open_vocals_folder = gr.Button('📂', scale=1)
402
+
403
+ btn_run.click(fn=self.whisper_inf.music_separator.separate_files,
404
+ inputs=[files_audio, dd_uvr_model_size, dd_uvr_device, nb_uvr_segment_size,
405
+ cb_uvr_save_file],
406
+ outputs=[ad_instrumental, ad_vocals])
407
+ btn_open_instrumental_folder.click(inputs=None,
408
+ outputs=None,
409
+ fn=lambda: self.open_folder(os.path.join(
410
+ self.args.output_dir, "UVR", "instrumental"
411
+ )))
412
+ btn_open_vocals_folder.click(inputs=None,
413
+ outputs=None,
414
+ fn=lambda: self.open_folder(os.path.join(
415
+ self.args.output_dir, "UVR", "vocals"
416
+ )))
417
 
418
  # Launch the app with optional gradio settings
419
  args = self.args
 
446
  return gr.Checkbox(visible=True, value=False, label="Translate to English?", interactive=True)
447
 
448
 
 
449
  parser = argparse.ArgumentParser()
450
  parser.add_argument('--whisper_type', type=str, default="faster-whisper",
451
+ choices=["whisper", "faster-whisper", "insanely-fast-whisper"],
452
+ help='A type of the whisper implementation (Github repo name)')
453
  parser.add_argument('--share', type=str2bool, default=False, nargs='?', const=True, help='Gradio share value')
454
  parser.add_argument('--server_name', type=str, default=None, help='Gradio server host')
455
  parser.add_argument('--server_port', type=int, default=None, help='Gradio server port')
 
458
  parser.add_argument('--password', type=str, default=None, help='Gradio authentication password')
459
  parser.add_argument('--theme', type=str, default=None, help='Gradio Blocks theme')
460
  parser.add_argument('--colab', type=str2bool, default=False, nargs='?', const=True, help='Is colab user or not')
461
+ parser.add_argument('--api_open', type=str2bool, default=False, nargs='?', const=True,
462
+ help='Enable api or not in Gradio')
463
+ parser.add_argument('--inbrowser', type=str2bool, default=True, nargs='?', const=True,
464
+ help='Whether to automatically start Gradio app or not')
465
  parser.add_argument('--whisper_model_dir', type=str, default=WHISPER_MODELS_DIR,
466
  help='Directory path of the whisper model')
467
  parser.add_argument('--faster_whisper_model_dir', type=str, default=FASTER_WHISPER_MODELS_DIR,
configs/translation.yaml ADDED
@@ -0,0 +1,321 @@
1
+ en: # English
2
+ Language: Language
3
+ File: File
4
+ Youtube: Youtube
5
+ Mic: Mic
6
+ T2T Translation: T2T Translation
7
+ BGM Separation: BGM Separation
8
+ GENERATE SUBTITLE FILE: GENERATE SUBTITLE FILE
9
+ Output: Output
10
+ Downloadable output file: Downloadable output file
11
+ Upload File here: Upload File here
12
+ Model: Model
13
+ Automatic Detection: Automatic Detection
14
+ File Format: File Format
15
+ Translate to English?: Translate to English?
16
+ Add a timestamp to the end of the filename: Add a timestamp to the end of the filename
17
+ Advanced Parameters: Advanced Parameters
18
+ Background Music Remover Filter: Background Music Remover Filter
19
+ Enabling this will remove background music: Enabling this will remove background music by submodel before transcribing
20
+ Enable Background Music Remover Filter: Enable Background Music Remover Filter
21
+ Save separated files to output: Save separated files to output
22
+ Offload sub model after removing background music: Offload sub model after removing background music
23
+ Voice Detection Filter: Voice Detection Filter
24
+ Enable this to transcribe only detected voice: Enable this to transcribe only detected voice parts by submodel.
25
+ Enable Silero VAD Filter: Enable Silero VAD Filter
26
+ Diarization: Diarization
27
+ Enable Diarization: Enable Diarization
28
+ HuggingFace Token: HuggingFace Token
29
+ This is only needed the first time you download the model: This is only needed the first time you download the model. If you already have models, you don't need to enter. To download the model, you must manually go to "https://huggingface.co/pyannote/speaker-diarization-3.1" and "https://huggingface.co/pyannote/segmentation-3.0" and agree to their requirement.
30
+ Device: Device
31
+ Youtube Link: Youtube Link
32
+ Youtube Thumbnail: Youtube Thumbnail
33
+ Youtube Title: Youtube Title
34
+ Youtube Description: Youtube Description
35
+ Record with Mic: Record with Mic
36
+ Upload Subtitle Files to translate here: Upload Subtitle Files to translate here
37
+ Your Auth Key (API KEY): Your Auth Key (API KEY)
38
+ Source Language: Source Language
39
+ Target Language: Target Language
40
+ Pro User?: Pro User?
41
+ TRANSLATE SUBTITLE FILE: TRANSLATE SUBTITLE FILE
42
+ Upload Audio Files to separate background music: Upload Audio Files to separate background music
43
+ Instrumental: Instrumental
44
+ Vocals: Vocals
45
+ SEPARATE BACKGROUND MUSIC: SEPARATE BACKGROUND MUSIC
46
+
47
+ ko: # Korean
48
+ Language: 언어
49
+ File: 파일
50
+ Youtube: 유튜브
51
+ Mic: 마이크
52
+ T2T Translation: T2T 자막 번역
53
+ BGM Separation: 배경 음악 분리
54
+ GENERATE SUBTITLE FILE: 자막 파일 생성
55
+ Output: 결과물
56
+ Downloadable output file: 결과물 파일 다운로드
57
+ Upload File here: 파일을 업로드 하세요
58
+ Model: 모델
59
+ Automatic Detection: 자동 감지
60
+ File Format: 파일 형식
61
+ Translate to English?: 영어로 번역합니까? (위스퍼 모델 자체 번역 기능)
62
+ Add a timestamp to the end of the filename: 파일 이름 끝에 타임스태프 붙이기
63
+ Advanced Parameters: 고급 변수
64
+ Background Music Remover Filter: 배경 음악 제거 필터
65
+ Enabling this will remove background music: 받아쓰기 이전에 먼저 배경 음악 제거용 서브 모델을 활성화 합니다.
66
+ Enable Background Music Remover Filter: 배경 음악 제거 필터 활성화
67
+ Save separated files to output: 분리된 배경 음악 & 음성 파일 따로 출력 폴더에 저장
68
+ Offload sub model after removing background music: 배경 음악 제거 후 서브 모델을 비활성화 합니다. (VRAM 이 부족할 시 체크하세요.)
69
+ Voice Detection Filter: 목소리 감지 필터
70
+ Enable this to transcribe only detected voice: 서브 모델에 의해 목소리라고 판단된 부분만 받아쓰기를 진행합니다.
71
+ Enable Silero VAD Filter: Silero VAD 필터 활성화
72
+ Diarization: 화자 구분
73
+ Enable Diarization: 화자 구분 활성화
74
+ HuggingFace Token: 허깅페이스 토큰
75
+ This is only needed the first time you download the model: 모델을 처음 다운받을 때만 토큰이 필요합니다. 이미 다운로드 받으신 상태라면 입력하지 않아도 됩니다. 모델을 다운 받기 위해선 "https://huggingface.co/pyannote/speaker-diarization-3.1" 와 "https://huggingface.co/pyannote/segmentation-3.0" 에서 먼저 사용 지침에 동의하셔야 합니다.
76
+ Device: 디바이스
77
+ Youtube Link: 유튜브 링크
78
+ Youtube Thumbnail: 유튜브 썸네일
79
+ Youtube Title: 유튜브 제목
80
+ Youtube Description: 유튜브 설명
81
+ Record with Mic: 마이크로 녹음하세요
82
+ Upload Subtitle Files to translate here: 번역할 자막 파일을 업로드 하세요
83
+ Your Auth Key (API KEY): DeepL API 키
84
+ Source Language: 원본 언어
85
+ Target Language: 대상 언어
86
+ Pro User?: Pro 버전 사용자
87
+ TRANSLATE SUBTITLE FILE: 자막 파일 번역
88
+ Upload Audio Files to separate background music: 배경 음악을 분리할 오디오 파일을 업로드 하세요
89
+ Instrumental: 악기
90
+ Vocals: 보컬
91
+ SEPARATE BACKGROUND MUSIC: 배경 음악 분리
92
+
93
+ ja: # Japanese
94
+ Language: 言語
95
+ File: File
96
+ Youtube: Youtube
97
+ Mic: Mic
98
+ T2T Translation: T2T Translation
99
+ BGM Separation: BGM Separation
100
+ GENERATE SUBTITLE FILE: GENERATE SUBTITLE FILE
101
+ Output: Output
102
+ Downloadable output file: Downloadable output file
103
+ Upload File here: Upload File here
104
+ Model: Model
105
+ Automatic Detection: Automatic Detection
106
+ File Format: File Format
107
+ Translate to English?: Translate to English?
108
+ Add a timestamp to the end of the filename: Add a timestamp to the end of the filename
109
+ Advanced Parameters: Advanced Parameters
110
+ Background Music Remover Filter: Background Music Remover Filter
111
+ Enabling this will remove background music: Enabling this will remove background music by submodel before transcribing
112
+ Enable Background Music Remover Filter: Enable Background Music Remover Filter
113
+ Save separated files to output: Save separated files to output
114
+ Offload sub model after removing background music: Offload sub model after removing background music
115
+ Voice Detection Filter: Voice Detection Filter
116
+ Enable this to transcribe only detected voice: Enable this to transcribe only detected voice parts by submodel.
117
+ Enable Silero VAD Filter: Enable Silero VAD Filter
118
+ Diarization: Diarization
119
+ Enable Diarization: Enable Diarization
120
+ HuggingFace Token: HuggingFace Token
121
+ This is only needed the first time you download the model: This is only needed the first time you download the model. If you already have models, you don't need to enter. To download the model, you must manually go to "https://huggingface.co/pyannote/speaker-diarization-3.1" and "https://huggingface.co/pyannote/segmentation-3.0" and agree to their requirement.
122
+ Device: Device
123
+ Youtube Link: Youtube Link
124
+ Youtube Thumbnail: Youtube Thumbnail
125
+ Youtube Title: Youtube Title
126
+ Youtube Description: Youtube Description
127
+ Record with Mic: Record with Mic
128
+ Upload Subtitle Files to translate here: Upload Subtitle Files to translate here
129
+ Your Auth Key (API KEY): Your Auth Key (API KEY)
130
+ Source Language: Source Language
131
+ Target Language: Target Language
132
+ Pro User?: Pro User?
133
+ TRANSLATE SUBTITLE FILE: TRANSLATE SUBTITLE FILE
134
+ Upload Audio Files to separate background music: Upload Audio Files to separate background music
135
+ Instrumental: Instrumental
136
+ Vocals: Vocals
137
+ SEPARATE BACKGROUND MUSIC: SEPARATE BACKGROUND MUSIC
138
+
139
+ es: # Spanish
140
+ Language: Idioma
141
+ File: File
142
+ Youtube: Youtube
143
+ Mic: Mic
144
+ T2T Translation: T2T Translation
145
+ BGM Separation: BGM Separation
146
+ GENERATE SUBTITLE FILE: GENERATE SUBTITLE FILE
147
+ Output: Output
148
+ Downloadable output file: Downloadable output file
149
+ Upload File here: Upload File here
150
+ Model: Model
151
+ Automatic Detection: Automatic Detection
152
+ File Format: File Format
153
+ Translate to English?: Translate to English?
154
+ Add a timestamp to the end of the filename: Add a timestamp to the end of the filename
155
+ Advanced Parameters: Advanced Parameters
156
+ Background Music Remover Filter: Background Music Remover Filter
157
+ Enabling this will remove background music: Enabling this will remove background music by submodel before transcribing
158
+ Enable Background Music Remover Filter: Enable Background Music Remover Filter
159
+ Save separated files to output: Save separated files to output
160
+ Offload sub model after removing background music: Offload sub model after removing background music
161
+ Voice Detection Filter: Voice Detection Filter
162
+ Enable this to transcribe only detected voice: Enable this to transcribe only detected voice parts by submodel.
163
+ Enable Silero VAD Filter: Enable Silero VAD Filter
164
+ Diarization: Diarization
165
+ Enable Diarization: Enable Diarization
166
+ HuggingFace Token: HuggingFace Token
167
+ This is only needed the first time you download the model: This is only needed the first time you download the model. If you already have models, you don't need to enter. To download the model, you must manually go to "https://huggingface.co/pyannote/speaker-diarization-3.1" and "https://huggingface.co/pyannote/segmentation-3.0" and agree to their requirement.
168
+ Device: Device
169
+ Youtube Link: Youtube Link
170
+ Youtube Thumbnail: Youtube Thumbnail
171
+ Youtube Title: Youtube Title
172
+ Youtube Description: Youtube Description
173
+ Record with Mic: Record with Mic
174
+ Upload Subtitle Files to translate here: Upload Subtitle Files to translate here
175
+ Your Auth Key (API KEY): Your Auth Key (API KEY)
176
+ Source Language: Source Language
177
+ Target Language: Target Language
178
+ Pro User?: Pro User?
179
+ TRANSLATE SUBTITLE FILE: TRANSLATE SUBTITLE FILE
180
+ Upload Audio Files to separate background music: Upload Audio Files to separate background music
181
+ Instrumental: Instrumental
182
+ Vocals: Vocals
183
+ SEPARATE BACKGROUND MUSIC: SEPARATE BACKGROUND MUSIC
184
+
185
+ fr: # French
186
+ Language: Langue
187
+ File: File
188
+ Youtube: Youtube
189
+ Mic: Mic
190
+ T2T Translation: T2T Translation
191
+ BGM Separation: BGM Separation
192
+ GENERATE SUBTITLE FILE: GENERATE SUBTITLE FILE
193
+ Output: Output
194
+ Downloadable output file: Downloadable output file
195
+ Upload File here: Upload File here
196
+ Model: Model
197
+ Automatic Detection: Automatic Detection
198
+ File Format: File Format
199
+ Translate to English?: Translate to English?
200
+ Add a timestamp to the end of the filename: Add a timestamp to the end of the filename
201
+ Advanced Parameters: Advanced Parameters
202
+ Background Music Remover Filter: Background Music Remover Filter
203
+ Enabling this will remove background music: Enabling this will remove background music by submodel before transcribing
204
+ Enable Background Music Remover Filter: Enable Background Music Remover Filter
205
+ Save separated files to output: Save separated files to output
206
+ Offload sub model after removing background music: Offload sub model after removing background music
207
+ Voice Detection Filter: Voice Detection Filter
208
+ Enable this to transcribe only detected voice: Enable this to transcribe only detected voice parts by submodel.
209
+ Enable Silero VAD Filter: Enable Silero VAD Filter
210
+ Diarization: Diarization
211
+ Enable Diarization: Enable Diarization
212
+ HuggingFace Token: HuggingFace Token
213
+ This is only needed the first time you download the model: This is only needed the first time you download the model. If you already have models, you don't need to enter. To download the model, you must manually go to "https://huggingface.co/pyannote/speaker-diarization-3.1" and "https://huggingface.co/pyannote/segmentation-3.0" and agree to their requirement.
214
+ Device: Device
215
+ Youtube Link: Youtube Link
216
+ Youtube Thumbnail: Youtube Thumbnail
217
+ Youtube Title: Youtube Title
218
+ Youtube Description: Youtube Description
219
+ Record with Mic: Record with Mic
220
+ Upload Subtitle Files to translate here: Upload Subtitle Files to translate here
221
+ Your Auth Key (API KEY): Your Auth Key (API KEY)
222
+ Source Language: Source Language
223
+ Target Language: Target Language
224
+ Pro User?: Pro User?
225
+ TRANSLATE SUBTITLE FILE: TRANSLATE SUBTITLE FILE
226
+ Upload Audio Files to separate background music: Upload Audio Files to separate background music
227
+ Instrumental: Instrumental
228
+ Vocals: Vocals
229
+ SEPARATE BACKGROUND MUSIC: SEPARATE BACKGROUND MUSIC
230
+
231
+ de: # German
232
+ Language: Sprache
233
+ File: File
234
+ Youtube: Youtube
235
+ Mic: Mic
236
+ T2T Translation: T2T Translation
237
+ BGM Separation: BGM Separation
238
+ GENERATE SUBTITLE FILE: GENERATE SUBTITLE FILE
239
+ Output: Output
240
+ Downloadable output file: Downloadable output file
241
+ Upload File here: Upload File here
242
+ Model: Model
243
+ Automatic Detection: Automatic Detection
244
+ File Format: File Format
245
+ Translate to English?: Translate to English?
246
+ Add a timestamp to the end of the filename: Add a timestamp to the end of the filename
247
+ Advanced Parameters: Advanced Parameters
248
+ Background Music Remover Filter: Background Music Remover Filter
249
+ Enabling this will remove background music: Enabling this will remove background music by submodel before transcribing
250
+ Enable Background Music Remover Filter: Enable Background Music Remover Filter
251
+ Save separated files to output: Save separated files to output
252
+ Offload sub model after removing background music: Offload sub model after removing background music
253
+ Voice Detection Filter: Voice Detection Filter
254
+ Enable this to transcribe only detected voice: Enable this to transcribe only detected voice parts by submodel.
255
+ Enable Silero VAD Filter: Enable Silero VAD Filter
256
+ Diarization: Diarization
257
+ Enable Diarization: Enable Diarization
258
+ HuggingFace Token: HuggingFace Token
259
+ This is only needed the first time you download the model: This is only needed the first time you download the model. If you already have models, you don't need to enter. To download the model, you must manually go to "https://huggingface.co/pyannote/speaker-diarization-3.1" and "https://huggingface.co/pyannote/segmentation-3.0" and agree to their requirement.
260
+ Device: Device
261
+ Youtube Link: Youtube Link
262
+ Youtube Thumbnail: Youtube Thumbnail
263
+ Youtube Title: Youtube Title
264
+ Youtube Description: Youtube Description
265
+ Record with Mic: Record with Mic
266
+ Upload Subtitle Files to translate here: Upload Subtitle Files to translate here
267
+ Your Auth Key (API KEY): Your Auth Key (API KEY)
268
+ Source Language: Source Language
269
+ Target Language: Target Language
270
+ Pro User?: Pro User?
271
+ TRANSLATE SUBTITLE FILE: TRANSLATE SUBTITLE FILE
272
+ Upload Audio Files to separate background music: Upload Audio Files to separate background music
273
+ Instrumental: Instrumental
274
+ Vocals: Vocals
275
+ SEPARATE BACKGROUND MUSIC: SEPARATE BACKGROUND MUSIC
276
+
277
+ zh: # Chinese
278
+ Language: 语言
279
+ File: File
280
+ Youtube: Youtube
281
+ Mic: Mic
282
+ T2T Translation: T2T Translation
283
+ BGM Separation: BGM Separation
284
+ GENERATE SUBTITLE FILE: GENERATE SUBTITLE FILE
285
+ Output: Output
286
+ Downloadable output file: Downloadable output file
287
+ Upload File here: Upload File here
288
+ Model: Model
289
+ Automatic Detection: Automatic Detection
290
+ File Format: File Format
291
+ Translate to English?: Translate to English?
292
+ Add a timestamp to the end of the filename: Add a timestamp to the end of the filename
293
+ Advanced Parameters: Advanced Parameters
294
+ Background Music Remover Filter: Background Music Remover Filter
295
+ Enabling this will remove background music: Enabling this will remove background music with the sub model before transcribing
296
+ Enable Background Music Remover Filter: Enable Background Music Remover Filter
297
+ Save separated files to output: Save separated files to output
298
+ Offload sub model after removing background music: Offload sub model after removing background music
299
+ Voice Detection Filter: Voice Detection Filter
300
+ Enable this to transcribe only detected voice: Enable this to transcribe only the voice parts detected by the sub model.
301
+ Enable Silero VAD Filter: Enable Silero VAD Filter
302
+ Diarization: Diarization
303
+ Enable Diarization: Enable Diarization
304
+ HuggingFace Token: HuggingFace Token
305
+ This is only needed the first time you download the model: This is only needed the first time you download the model. If you already have the models, you don't need to enter a token. To download the models, you must manually go to "https://huggingface.co/pyannote/speaker-diarization-3.1" and "https://huggingface.co/pyannote/segmentation-3.0" and agree to their requirements.
306
+ Device: Device
307
+ Youtube Link: Youtube Link
308
+ Youtube Thumbnail: Youtube Thumbnail
309
+ Youtube Title: Youtube Title
310
+ Youtube Description: Youtube Description
311
+ Record with Mic: Record with Mic
312
+ Upload Subtitle Files to translate here: Upload Subtitle Files to translate here
313
+ Your Auth Key (API KEY): Your Auth Key (API KEY)
314
+ Source Language: Source Language
315
+ Target Language: Target Language
316
+ Pro User?: Pro User?
317
+ TRANSLATE SUBTITLE FILE: TRANSLATE SUBTITLE FILE
318
+ Upload Audio Files to separate background music: Upload Audio Files to separate background music
319
+ Instrumental: Instrumental
320
+ Vocals: Vocals
321
+ SEPARATE BACKGROUND MUSIC: SEPARATE BACKGROUND MUSIC
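Each language block in configs/translation.yaml maps an English source string (the exact text wrapped with gettext in the UI code) to its localized label; entries whose value still equals the key simply fall back to English, which is why most of the de and zh blocks above remain untranslated for now. The sketch below shows how such a lookup resolves; the localized() helper and the hard-coded relative path are illustrative assumptions rather than code from this PR, since gradio_i18n performs the real lookup at runtime.

```python
import yaml

def localized(label: str, lang: str, yaml_path: str = "configs/translation.yaml") -> str:
    # Illustrative lookup against the file added above (I18N_YAML_PATH in this PR).
    with open(yaml_path, encoding="utf-8") as f:
        translations = yaml.safe_load(f)
    # Top-level keys are language codes; each maps the English source string
    # to its translation. Unknown languages or labels fall back to the key itself.
    return translations.get(lang, {}).get(label, label)

print(localized("Language", "de"))  # "Sprache"
print(localized("Language", "zh"))  # "语言"
```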
modules/translation/deepl_api.py CHANGED
@@ -5,6 +5,7 @@ from datetime import datetime
5
  import gradio as gr
6
 
7
  from modules.utils.paths import TRANSLATION_OUTPUT_DIR, DEFAULT_PARAMETERS_CONFIG_PATH
 
8
  from modules.utils.subtitle_manager import *
9
  from modules.utils.files_manager import load_yaml, save_yaml
10
 
@@ -50,7 +51,7 @@ DEEPL_AVAILABLE_TARGET_LANGS = {
50
  }
51
 
52
  DEEPL_AVAILABLE_SOURCE_LANGS = {
53
- 'Automatic Detection': None,
54
  'Bulgarian': 'BG',
55
  'Czech': 'CS',
56
  'Danish': 'DA',
 
5
  import gradio as gr
6
 
7
  from modules.utils.paths import TRANSLATION_OUTPUT_DIR, DEFAULT_PARAMETERS_CONFIG_PATH
8
+ from modules.utils.constants import AUTOMATIC_DETECTION
9
  from modules.utils.subtitle_manager import *
10
  from modules.utils.files_manager import load_yaml, save_yaml
11
 
 
51
  }
52
 
53
  DEEPL_AVAILABLE_SOURCE_LANGS = {
54
+ AUTOMATIC_DETECTION: None,
55
  'Bulgarian': 'BG',
56
  'Czech': 'CS',
57
  'Danish': 'DA',
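In DEEPL_AVAILABLE_SOURCE_LANGS the localized Automatic Detection entry maps to None, which downstream code can treat as "do not send a source language". Below is a hedged sketch of that pattern against DeepL's documented v2 endpoint; the translate() helper, the free-tier URL, and the form-data auth are illustration only and are not taken from this module.

```python
import requests

def translate(text: str, target_lang: str, source_lang_code, auth_key: str) -> dict:
    # When the selected source language resolved to None, omit source_lang so
    # DeepL auto-detects it. Pro keys use api.deepl.com instead of api-free.
    data = {"auth_key": auth_key, "text": text, "target_lang": target_lang}
    if source_lang_code is not None:
        data["source_lang"] = source_lang_code
    return requests.post("https://api-free.deepl.com/v2/translate", data=data).json()
```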
modules/utils/constants.py ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ from gradio_i18n import Translate, gettext as _
2
+
3
+ AUTOMATIC_DETECTION = _("Automatic Detection")
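Defining AUTOMATIC_DETECTION once in modules/utils/constants.py lets every module compare the dropdown value against the same translatable marker instead of repeating the raw string; the same constant is later unwrapped to plain text when parameters are written back to YAML (see the whisper_parameter.py hunk further down). A minimal sketch of the comparison pattern; normalize_lang() is a hypothetical helper, not part of the PR.

```python
from modules.utils.constants import AUTOMATIC_DETECTION

def normalize_lang(lang):
    # The "Automatic Detection" dropdown choice means "do not force a language",
    # which the inference code represents as None.
    if lang is None or lang == AUTOMATIC_DETECTION:
        return None
    return lang
```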
modules/utils/paths.py CHANGED
@@ -10,6 +10,7 @@ DIARIZATION_MODELS_DIR = os.path.join(MODELS_DIR, "Diarization")
10
  UVR_MODELS_DIR = os.path.join(MODELS_DIR, "UVR", "MDX_Net_Models")
11
  CONFIGS_DIR = os.path.join(WEBUI_DIR, "configs")
12
  DEFAULT_PARAMETERS_CONFIG_PATH = os.path.join(CONFIGS_DIR, "default_parameters.yaml")
 
13
  OUTPUT_DIR = os.path.join(WEBUI_DIR, "outputs")
14
  TRANSLATION_OUTPUT_DIR = os.path.join(OUTPUT_DIR, "translations")
15
  UVR_OUTPUT_DIR = os.path.join(OUTPUT_DIR, "UVR")
 
10
  UVR_MODELS_DIR = os.path.join(MODELS_DIR, "UVR", "MDX_Net_Models")
11
  CONFIGS_DIR = os.path.join(WEBUI_DIR, "configs")
12
  DEFAULT_PARAMETERS_CONFIG_PATH = os.path.join(CONFIGS_DIR, "default_parameters.yaml")
13
+ I18N_YAML_PATH = os.path.join(CONFIGS_DIR, "translation.yaml")
14
  OUTPUT_DIR = os.path.join(WEBUI_DIR, "outputs")
15
  TRANSLATION_OUTPUT_DIR = os.path.join(OUTPUT_DIR, "translations")
16
  UVR_OUTPUT_DIR = os.path.join(OUTPUT_DIR, "UVR")
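I18N_YAML_PATH points the app at configs/translation.yaml. The snippet below is a small consistency check, not part of the PR, that warns when a language block is missing labels another block defines; it assumes the repo's load_yaml helper returns the parsed file as nested dictionaries.

```python
from modules.utils.paths import I18N_YAML_PATH
from modules.utils.files_manager import load_yaml

translations = load_yaml(I18N_YAML_PATH)
# Use the first language block as the reference key set.
reference = set(next(iter(translations.values())).keys())
for lang, labels in translations.items():
    missing = reference - set(labels.keys())
    if missing:
        # Missing entries silently fall back to the English key at runtime.
        print(f"{lang}: missing {sorted(missing)}")
```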
modules/whisper/whisper_base.py CHANGED
@@ -14,6 +14,7 @@ from dataclasses import astuple
14
  from modules.uvr.music_separator import MusicSeparator
15
  from modules.utils.paths import (WHISPER_MODELS_DIR, DIARIZATION_MODELS_DIR, OUTPUT_DIR, DEFAULT_PARAMETERS_CONFIG_PATH,
16
  UVR_MODELS_DIR)
 
17
  from modules.utils.subtitle_manager import get_srt, get_vtt, get_txt, write_file, safe_filename
18
  from modules.utils.youtube_manager import get_ytdata, get_ytaudio
19
  from modules.utils.files_manager import get_media_files, format_gradio_files, load_yaml, save_yaml
@@ -107,7 +108,7 @@ class WhisperBase(ABC):
107
 
108
  if params.lang is None:
109
  pass
110
- elif params.lang == "Automatic Detection":
111
  params.lang = None
112
  else:
113
  language_code_dict = {value: key for key, value in whisper.tokenizer.LANGUAGES.items()}
 
14
  from modules.uvr.music_separator import MusicSeparator
15
  from modules.utils.paths import (WHISPER_MODELS_DIR, DIARIZATION_MODELS_DIR, OUTPUT_DIR, DEFAULT_PARAMETERS_CONFIG_PATH,
16
  UVR_MODELS_DIR)
17
+ from modules.utils.constants import AUTOMATIC_DETECTION
18
  from modules.utils.subtitle_manager import get_srt, get_vtt, get_txt, write_file, safe_filename
19
  from modules.utils.youtube_manager import get_ytdata, get_ytaudio
20
  from modules.utils.files_manager import get_media_files, format_gradio_files, load_yaml, save_yaml
 
108
 
109
  if params.lang is None:
110
  pass
111
+ elif params.lang == AUTOMATIC_DETECTION:
112
  params.lang = None
113
  else:
114
  language_code_dict = {value: key for key, value in whisper.tokenizer.LANGUAGES.items()}
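The hunk above only swaps the literal string for the shared constant; when a concrete language is chosen, the surrounding code still resolves the display name to a Whisper language code by inverting whisper.tokenizer.LANGUAGES. For reference, that inversion looks like this, assuming the upstream openai-whisper tokenizer layout.

```python
import whisper

# whisper.tokenizer.LANGUAGES maps ISO codes to lowercase English names,
# e.g. {"en": "english", "de": "german", ...}. Inverting it lets the UI pass
# a display name and recover the code the model expects.
language_code_dict = {name: code for code, name in whisper.tokenizer.LANGUAGES.items()}
print(language_code_dict["german"])  # "de"
```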
modules/whisper/whisper_parameter.py CHANGED
@@ -3,6 +3,8 @@ import gradio as gr
3
  from typing import Optional, Dict
4
  import yaml
5
 
 
 
6
 
7
  @dataclass
8
  class WhisperParameters:
@@ -306,7 +308,7 @@ class WhisperValues:
306
  data = {
307
  "whisper": {
308
  "model_size": self.model_size,
309
- "lang": "Automatic Detection" if self.lang is None else self.lang,
310
  "is_translate": self.is_translate,
311
  "beam_size": self.beam_size,
312
  "log_prob_threshold": self.log_prob_threshold,
 
3
  from typing import Optional, Dict
4
  import yaml
5
 
6
+ from modules.utils.constants import AUTOMATIC_DETECTION
7
+
8
 
9
  @dataclass
10
  class WhisperParameters:
 
308
  data = {
309
  "whisper": {
310
  "model_size": self.model_size,
311
+ "lang": AUTOMATIC_DETECTION.unwrap() if self.lang is None else self.lang,
312
  "is_translate": self.is_translate,
313
  "beam_size": self.beam_size,
314
  "log_prob_threshold": self.log_prob_threshold,
requirements.txt CHANGED
@@ -11,6 +11,7 @@ git+https://github.com/jhj0517/jhj0517-whisper.git
11
  faster-whisper==1.0.3
12
  transformers
13
  gradio
 
14
  pytubefix
15
  ruamel.yaml==0.18.6
16
  pyannote.audio==3.3.1
 
11
  faster-whisper==1.0.3
12
  transformers
13
  gradio
14
+ git+https://github.com/jhj0517/gradio-i18n.git@fix/encoding-error
15
  pytubefix
16
  ruamel.yaml==0.18.6
17
  pyannote.audio==3.3.1