jhj0517 committed
Commit ed6e918 · unverified · 2 Parent(s): 8dc115b 84fd983

Merge pull request #341 from jhj0517/feature/support-i18n

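This PR wires gradio_i18n through the whole Gradio UI: the layout is built inside a Translate context bound to the new configs/translation.yaml, and user-facing labels are passed through gettext so they resolve to the viewer's locale. The YAML maps each locale code to a flat dictionary keyed by the literal English strings. A minimal, hypothetical sketch of the pattern (a standalone demo, not the app's actual wiring; the dropdown choices and labels here are placeholders):

import gradio as gr
from gradio_i18n import Translate, gettext as _

# Hypothetical demo of the i18n pattern used in this PR. The YAML file has the
# same shape as configs/translation.yaml below: locale code -> {English source
# string: translated text}.
I18N_YAML_PATH = "configs/translation.yaml"

with gr.Blocks() as demo:
    # Strings wrapped with _() inside the Translate context are looked up in
    # the YAML for the active locale when the page is rendered.
    with Translate(I18N_YAML_PATH):
        dd_model = gr.Dropdown(choices=["base", "large-v2"], label=_("Model"))
        btn_run = gr.Button(_("GENERATE SUBTITLE FILE"), variant="primary")

if __name__ == "__main__":
    demo.launch()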
app.py CHANGED
@@ -1,11 +1,13 @@
1
  import os
2
  import argparse
3
  import gradio as gr
 
4
  import yaml
5
 
6
  from modules.utils.paths import (FASTER_WHISPER_MODELS_DIR, DIARIZATION_MODELS_DIR, OUTPUT_DIR, WHISPER_MODELS_DIR,
7
  INSANELY_FAST_WHISPER_MODELS_DIR, NLLB_MODELS_DIR, DEFAULT_PARAMETERS_CONFIG_PATH,
8
- UVR_MODELS_DIR)
 
9
  from modules.utils.files_manager import load_yaml
10
  from modules.whisper.whisper_factory import WhisperFactory
11
  from modules.whisper.faster_whisper_inference import FasterWhisperInference
@@ -22,6 +24,7 @@ class App:
22
  def __init__(self, args):
23
  self.args = args
24
  self.app = gr.Blocks(css=CSS, theme=self.args.theme, delete_cache=(60, 3600))
 
25
  self.whisper_inf = WhisperFactory.create_whisper_inference(
26
  whisper_type=self.args.whisper_type,
27
  whisper_model_dir=self.args.whisper_model_dir,
@@ -38,8 +41,8 @@ class App:
38
  output_dir=os.path.join(self.args.output_dir, "translations")
39
  )
40
  self.default_params = load_yaml(DEFAULT_PARAMETERS_CONFIG_PATH)
41
- print(f"Use \"{self.args.whisper_type}\" implementation")
42
- print(f"Device \"{self.whisper_inf.device}\" is detected")
43
 
44
  def create_whisper_parameters(self):
45
  whisper_params = self.default_params["whisper"]
@@ -49,23 +52,28 @@ class App:
49
 
50
  with gr.Row():
51
  dd_model = gr.Dropdown(choices=self.whisper_inf.available_models, value=whisper_params["model_size"],
52
- label="Model")
53
- dd_lang = gr.Dropdown(choices=["Automatic Detection"] + self.whisper_inf.available_langs,
54
- value=whisper_params["lang"], label="Language")
55
- dd_file_format = gr.Dropdown(choices=["SRT", "WebVTT", "txt"], value="SRT", label="File Format")
 
56
  with gr.Row():
57
- cb_translate = gr.Checkbox(value=whisper_params["is_translate"], label="Translate to English?",
58
  interactive=True)
59
  with gr.Row():
60
- cb_timestamp = gr.Checkbox(value=whisper_params["add_timestamp"], label="Add a timestamp to the end of the filename",
 
61
  interactive=True)
62
 
63
- with gr.Accordion("Advanced Parameters", open=False):
64
- nb_beam_size = gr.Number(label="Beam Size", value=whisper_params["beam_size"], precision=0, interactive=True,
 
65
  info="Beam size to use for decoding.")
66
- nb_log_prob_threshold = gr.Number(label="Log Probability Threshold", value=whisper_params["log_prob_threshold"], interactive=True,
 
67
  info="If the average log probability over sampled tokens is below this value, treat as failed.")
68
- nb_no_speech_threshold = gr.Number(label="No Speech Threshold", value=whisper_params["no_speech_threshold"], interactive=True,
 
69
  info="If the no speech probability is higher than this value AND the average log probability over sampled tokens is below 'Log Prob Threshold', consider the segment as silent.")
70
  dd_compute_type = gr.Dropdown(label="Compute Type", choices=self.whisper_inf.available_compute_types,
71
  value=self.whisper_inf.current_compute_type, interactive=True,
@@ -75,10 +83,12 @@ class App:
75
  info="Number of candidates when sampling with non-zero temperature.")
76
  nb_patience = gr.Number(label="Patience", value=whisper_params["patience"], interactive=True,
77
  info="Beam search patience factor.")
78
- cb_condition_on_previous_text = gr.Checkbox(label="Condition On Previous Text", value=whisper_params["condition_on_previous_text"],
 
79
  interactive=True,
80
  info="Condition on previous text during decoding.")
81
- sld_prompt_reset_on_temperature = gr.Slider(label="Prompt Reset On Temperature", value=whisper_params["prompt_reset_on_temperature"],
 
82
  minimum=0, maximum=1, step=0.01, interactive=True,
83
  info="Resets prompt if temperature is above this value."
84
  " Arg has effect only if 'Condition On Previous Text' is True.")
@@ -87,7 +97,8 @@ class App:
87
  sd_temperature = gr.Slider(label="Temperature", value=whisper_params["temperature"], minimum=0.0,
88
  step=0.01, maximum=1.0, interactive=True,
89
  info="Temperature for sampling. It can be a tuple of temperatures, which will be successively used upon failures according to either `Compression Ratio Threshold` or `Log Prob Threshold`.")
90
- nb_compression_ratio_threshold = gr.Number(label="Compression Ratio Threshold", value=whisper_params["compression_ratio_threshold"],
 
91
  interactive=True,
92
  info="If the gzip compression ratio is above this value, treat as failed.")
93
  nb_chunk_length = gr.Number(label="Chunk Length (s)", value=lambda: whisper_params["chunk_length"],
@@ -96,9 +107,11 @@ class App:
96
  with gr.Group(visible=isinstance(self.whisper_inf, FasterWhisperInference)):
97
  nb_length_penalty = gr.Number(label="Length Penalty", value=whisper_params["length_penalty"],
98
  info="Exponential length penalty constant.")
99
- nb_repetition_penalty = gr.Number(label="Repetition Penalty", value=whisper_params["repetition_penalty"],
 
100
  info="Penalty applied to the score of previously generated tokens (set > 1 to penalize).")
101
- nb_no_repeat_ngram_size = gr.Number(label="No Repeat N-gram Size", value=whisper_params["no_repeat_ngram_size"],
 
102
  precision=0,
103
  info="Prevent repetitions of n-grams with this size (set 0 to disable).")
104
  tb_prefix = gr.Textbox(label="Prefix", value=lambda: whisper_params["prefix"],
@@ -107,48 +120,55 @@ class App:
107
  info="Suppress blank outputs at the beginning of the sampling.")
108
  tb_suppress_tokens = gr.Textbox(label="Suppress Tokens", value=whisper_params["suppress_tokens"],
109
  info="List of token IDs to suppress. -1 will suppress a default set of symbols as defined in the model config.json file.")
110
- nb_max_initial_timestamp = gr.Number(label="Max Initial Timestamp", value=whisper_params["max_initial_timestamp"],
 
111
  info="The initial timestamp cannot be later than this.")
112
  cb_word_timestamps = gr.Checkbox(label="Word Timestamps", value=whisper_params["word_timestamps"],
113
  info="Extract word-level timestamps using the cross-attention pattern and dynamic time warping, and include the timestamps for each word in each segment.")
114
- tb_prepend_punctuations = gr.Textbox(label="Prepend Punctuations", value=whisper_params["prepend_punctuations"],
 
115
  info="If 'Word Timestamps' is True, merge these punctuation symbols with the next word.")
116
- tb_append_punctuations = gr.Textbox(label="Append Punctuations", value=whisper_params["append_punctuations"],
 
117
  info="If 'Word Timestamps' is True, merge these punctuation symbols with the previous word.")
118
  nb_max_new_tokens = gr.Number(label="Max New Tokens", value=lambda: whisper_params["max_new_tokens"],
119
  precision=0,
120
  info="Maximum number of new tokens to generate per-chunk. If not set, the maximum will be set by the default max_length.")
121
  nb_hallucination_silence_threshold = gr.Number(label="Hallucination Silence Threshold (sec)",
122
- value=lambda: whisper_params["hallucination_silence_threshold"],
 
123
  info="When 'Word Timestamps' is True, skip silent periods longer than this threshold (in seconds) when a possible hallucination is detected.")
124
  tb_hotwords = gr.Textbox(label="Hotwords", value=lambda: whisper_params["hotwords"],
125
  info="Hotwords/hint phrases to provide the model with. Has no effect if prefix is not None.")
126
- nb_language_detection_threshold = gr.Number(label="Language Detection Threshold", value=lambda: whisper_params["language_detection_threshold"],
 
 
127
  info="If the maximum probability of the language tokens is higher than this value, the language is detected.")
128
- nb_language_detection_segments = gr.Number(label="Language Detection Segments", value=lambda: whisper_params["language_detection_segments"],
 
129
  precision=0,
130
  info="Number of segments to consider for the language detection.")
131
  with gr.Group(visible=isinstance(self.whisper_inf, InsanelyFastWhisperInference)):
132
  nb_batch_size = gr.Number(label="Batch Size", value=whisper_params["batch_size"], precision=0)
133
 
134
- with gr.Accordion("Background Music Remover Filter", open=False):
135
- cb_bgm_separation = gr.Checkbox(label="Enable Background Music Remover Filter", value=uvr_params["is_separate_bgm"],
 
136
  interactive=True,
137
- info="Enabling this will remove background music by submodel before"
138
- " transcribing ")
139
- dd_uvr_device = gr.Dropdown(label="Device", value=self.whisper_inf.music_separator.device,
140
  choices=self.whisper_inf.music_separator.available_devices)
141
- dd_uvr_model_size = gr.Dropdown(label="Model", value=uvr_params["model_size"],
142
  choices=self.whisper_inf.music_separator.available_models)
143
  nb_uvr_segment_size = gr.Number(label="Segment Size", value=uvr_params["segment_size"], precision=0)
144
- cb_uvr_save_file = gr.Checkbox(label="Save separated files to output", value=uvr_params["save_file"])
145
- cb_uvr_enable_offload = gr.Checkbox(label="Offload sub model after removing background music",
146
  value=uvr_params["enable_offload"])
147
 
148
- with gr.Accordion("Voice Detection Filter", open=False):
149
- cb_vad_filter = gr.Checkbox(label="Enable Silero VAD Filter", value=vad_params["vad_filter"],
150
  interactive=True,
151
- info="Enable this to transcribe only detected voice parts by submodel.")
152
  sd_threshold = gr.Slider(minimum=0.0, maximum=1.0, step=0.01, label="Speech Threshold",
153
  value=vad_params["threshold"],
154
  info="Lower it to be more sensitive to small sounds.")
@@ -165,15 +185,11 @@ class App:
165
  nb_speech_pad_ms = gr.Number(label="Speech Padding (ms)", precision=0, value=vad_params["speech_pad_ms"],
166
  info="Final speech chunks are padded by this time each side")
167
 
168
- with gr.Accordion("Diarization", open=False):
169
- cb_diarize = gr.Checkbox(label="Enable Diarization", value=diarization_params["is_diarize"])
170
- tb_hf_token = gr.Text(label="HuggingFace Token", value=diarization_params["hf_token"],
171
- info="This is only needed the first time you download the model. If you already have"
172
- " models, you don't need to enter. To download the model, you must manually go "
173
- "to \"https://huggingface.co/pyannote/speaker-diarization-3.1\" and "
174
- "\"https://huggingface.co/pyannote/segmentation-3.0\" and agree to"
175
- " their requirement.")
176
- dd_diarization_device = gr.Dropdown(label="Device",
177
  choices=self.whisper_inf.diarizer.get_available_device(),
178
  value=self.whisper_inf.diarizer.get_device())
179
 
@@ -213,179 +229,191 @@ class App:
213
  uvr_params = self.default_params["bgm_separation"]
214
 
215
  with self.app:
216
- with gr.Row():
217
- with gr.Column():
218
- gr.Markdown(MARKDOWN, elem_id="md_project")
219
- with gr.Tabs():
220
- with gr.TabItem("File"): # tab1
221
  with gr.Column():
222
- input_file = gr.Files(type="filepath", label="Upload File here")
223
- tb_input_folder = gr.Textbox(label="Input Folder Path (Optional)",
224
- info="Optional: Specify the folder path where the input files are located, if you prefer to use local files instead of uploading them."
225
- " Leave this field empty if you do not wish to use a local path.",
226
- visible=self.args.colab,
227
- value="")
228
-
229
- whisper_params, dd_file_format, cb_timestamp = self.create_whisper_parameters()
230
-
231
- with gr.Row():
232
- btn_run = gr.Button("GENERATE SUBTITLE FILE", variant="primary")
233
- with gr.Row():
234
- tb_indicator = gr.Textbox(label="Output", scale=5)
235
- files_subtitles = gr.Files(label="Downloadable output file", scale=3, interactive=False)
236
- btn_openfolder = gr.Button('📂', scale=1)
237
-
238
- params = [input_file, tb_input_folder, dd_file_format, cb_timestamp]
239
- btn_run.click(fn=self.whisper_inf.transcribe_file,
240
- inputs=params + whisper_params.as_list(),
241
- outputs=[tb_indicator, files_subtitles])
242
- btn_openfolder.click(fn=lambda: self.open_folder("outputs"), inputs=None, outputs=None)
243
-
244
- with gr.TabItem("Youtube"): # tab2
245
- with gr.Row():
246
- tb_youtubelink = gr.Textbox(label="Youtube Link")
247
- with gr.Row(equal_height=True):
248
  with gr.Column():
249
- img_thumbnail = gr.Image(label="Youtube Thumbnail")
250
- with gr.Column():
251
- tb_title = gr.Label(label="Youtube Title")
252
- tb_description = gr.Textbox(label="Youtube Description", max_lines=15)
 
 
253
 
254
- whisper_params, dd_file_format, cb_timestamp = self.create_whisper_parameters()
255
 
256
- with gr.Row():
257
- btn_run = gr.Button("GENERATE SUBTITLE FILE", variant="primary")
258
- with gr.Row():
259
- tb_indicator = gr.Textbox(label="Output", scale=5)
260
- files_subtitles = gr.Files(label="Downloadable output file", scale=3)
261
- btn_openfolder = gr.Button('📂', scale=1)
262
 
263
- params = [tb_youtubelink, dd_file_format, cb_timestamp]
 
 
 
 
264
 
265
- btn_run.click(fn=self.whisper_inf.transcribe_youtube,
266
- inputs=params + whisper_params.as_list(),
267
- outputs=[tb_indicator, files_subtitles])
268
- tb_youtubelink.change(get_ytmetas, inputs=[tb_youtubelink],
269
- outputs=[img_thumbnail, tb_title, tb_description])
270
- btn_openfolder.click(fn=lambda: self.open_folder("outputs"), inputs=None, outputs=None)
 
 
 
271
 
272
- with gr.TabItem("Mic"): # tab3
273
- with gr.Row():
274
- mic_input = gr.Microphone(label="Record with Mic", type="filepath", interactive=True)
275
 
276
- whisper_params, dd_file_format, cb_timestamp = self.create_whisper_parameters()
 
 
 
 
 
277
 
278
- with gr.Row():
279
- btn_run = gr.Button("GENERATE SUBTITLE FILE", variant="primary")
280
- with gr.Row():
281
- tb_indicator = gr.Textbox(label="Output", scale=5)
282
- files_subtitles = gr.Files(label="Downloadable output file", scale=3)
283
- btn_openfolder = gr.Button('📂', scale=1)
284
 
285
- params = [mic_input, dd_file_format, cb_timestamp]
 
 
 
 
 
286
 
287
- btn_run.click(fn=self.whisper_inf.transcribe_mic,
288
- inputs=params + whisper_params.as_list(),
289
- outputs=[tb_indicator, files_subtitles])
290
- btn_openfolder.click(fn=lambda: self.open_folder("outputs"), inputs=None, outputs=None)
291
 
292
- with gr.TabItem("T2T Translation"): # tab 4
293
- with gr.Row():
294
- file_subs = gr.Files(type="filepath", label="Upload Subtitle Files to translate here")
295
 
296
- with gr.TabItem("DeepL API"): # sub tab1
297
- with gr.Row():
298
- tb_api_key = gr.Textbox(label="Your Auth Key (API KEY)", value=deepl_params["api_key"])
299
- with gr.Row():
300
- dd_source_lang = gr.Dropdown(label="Source Language", value=deepl_params["source_lang"],
301
- choices=list(self.deepl_api.available_source_langs.keys()))
302
- dd_target_lang = gr.Dropdown(label="Target Language", value=deepl_params["target_lang"],
303
- choices=list(self.deepl_api.available_target_langs.keys()))
304
  with gr.Row():
305
- cb_is_pro = gr.Checkbox(label="Pro User?", value=deepl_params["is_pro"])
306
  with gr.Row():
307
- cb_timestamp = gr.Checkbox(value=translation_params["add_timestamp"], label="Add a timestamp to the end of the filename",
308
- interactive=True)
309
- with gr.Row():
310
- btn_run = gr.Button("TRANSLATE SUBTITLE FILE", variant="primary")
311
- with gr.Row():
312
- tb_indicator = gr.Textbox(label="Output", scale=5)
313
- files_subtitles = gr.Files(label="Downloadable output file", scale=3)
314
  btn_openfolder = gr.Button('📂', scale=1)
315
 
316
- btn_run.click(fn=self.deepl_api.translate_deepl,
317
- inputs=[tb_api_key, file_subs, dd_source_lang, dd_target_lang,
318
- cb_is_pro, cb_timestamp],
319
- outputs=[tb_indicator, files_subtitles])
320
 
321
- btn_openfolder.click(fn=lambda: self.open_folder(os.path.join(self.args.output_dir, "translations")),
322
- inputs=None,
323
- outputs=None)
 
324
 
325
- with gr.TabItem("NLLB"): # sub tab2
326
- with gr.Row():
327
- dd_model_size = gr.Dropdown(label="Model", value=nllb_params["model_size"],
328
- choices=self.nllb_inf.available_models)
329
- dd_source_lang = gr.Dropdown(label="Source Language", value=nllb_params["source_lang"],
330
- choices=self.nllb_inf.available_source_langs)
331
- dd_target_lang = gr.Dropdown(label="Target Language", value=nllb_params["target_lang"],
332
- choices=self.nllb_inf.available_target_langs)
333
- with gr.Row():
334
- nb_max_length = gr.Number(label="Max Length Per Line", value=nllb_params["max_length"],
335
- precision=0)
336
- with gr.Row():
337
- cb_timestamp = gr.Checkbox(value=translation_params["add_timestamp"], label="Add a timestamp to the end of the filename",
338
- interactive=True)
339
  with gr.Row():
340
- btn_run = gr.Button("TRANSLATE SUBTITLE FILE", variant="primary")
341
- with gr.Row():
342
- tb_indicator = gr.Textbox(label="Output", scale=5)
343
- files_subtitles = gr.Files(label="Downloadable output file", scale=3)
344
- btn_openfolder = gr.Button('📂', scale=1)
345
  with gr.Column():
346
- md_vram_table = gr.HTML(NLLB_VRAM_TABLE, elem_id="md_nllb_vram_table")
347
-
348
- btn_run.click(fn=self.nllb_inf.translate_file,
349
- inputs=[file_subs, dd_model_size, dd_source_lang, dd_target_lang,
350
- nb_max_length, cb_timestamp],
351
- outputs=[tb_indicator, files_subtitles])
352
-
353
- btn_openfolder.click(fn=lambda: self.open_folder(os.path.join(self.args.output_dir, "translations")),
354
- inputs=None,
355
- outputs=None)
356
-
357
- with gr.TabItem("BGM Separation"):
358
- files_audio = gr.Files(type="filepath", label="Upload Audio Files to separate background music")
359
- dd_uvr_device = gr.Dropdown(label="Device", value=self.whisper_inf.music_separator.device,
360
- choices=self.whisper_inf.music_separator.available_devices)
361
- dd_uvr_model_size = gr.Dropdown(label="Model", value=uvr_params["model_size"],
362
- choices=self.whisper_inf.music_separator.available_models)
363
- nb_uvr_segment_size = gr.Number(label="Segment Size", value=uvr_params["segment_size"], precision=0)
364
- cb_uvr_save_file = gr.Checkbox(label="Save separated files to output",
365
- value=True, visible=False)
366
- btn_run = gr.Button("SEPARATE BACKGROUND MUSIC", variant="primary")
367
- with gr.Column():
368
- with gr.Row():
369
- ad_instrumental = gr.Audio(label="Instrumental", scale=8)
370
- btn_open_instrumental_folder = gr.Button('📂', scale=1)
371
- with gr.Row():
372
- ad_vocals = gr.Audio(label="Vocals", scale=8)
373
- btn_open_vocals_folder = gr.Button('📂', scale=1)
374
-
375
- btn_run.click(fn=self.whisper_inf.music_separator.separate_files,
376
- inputs=[files_audio, dd_uvr_model_size, dd_uvr_device, nb_uvr_segment_size,
377
- cb_uvr_save_file],
378
- outputs=[ad_instrumental, ad_vocals])
379
- btn_open_instrumental_folder.click(inputs=None,
380
- outputs=None,
381
- fn=lambda: self.open_folder(os.path.join(
382
- self.args.output_dir, "UVR", "instrumental"
383
- )))
384
- btn_open_vocals_folder.click(inputs=None,
385
- outputs=None,
386
- fn=lambda: self.open_folder(os.path.join(
387
- self.args.output_dir, "UVR", "vocals"
388
- )))
389
 
390
  # Launch the app with optional gradio settings
391
  args = self.args
@@ -418,10 +446,10 @@ class App:
418
  return gr.Checkbox(visible=True, value=False, label="Translate to English?", interactive=True)
419
 
420
 
421
- # Create the parser for command-line arguments
422
  parser = argparse.ArgumentParser()
423
  parser.add_argument('--whisper_type', type=str, default="faster-whisper",
424
- help='A type of the whisper implementation between: ["whisper", "faster-whisper", "insanely-fast-whisper"]')
 
425
  parser.add_argument('--share', type=str2bool, default=False, nargs='?', const=True, help='Gradio share value')
426
  parser.add_argument('--server_name', type=str, default=None, help='Gradio server host')
427
  parser.add_argument('--server_port', type=int, default=None, help='Gradio server port')
@@ -430,8 +458,10 @@ parser.add_argument('--username', type=str, default=None, help='Gradio authentic
430
  parser.add_argument('--password', type=str, default=None, help='Gradio authentication password')
431
  parser.add_argument('--theme', type=str, default=None, help='Gradio Blocks theme')
432
  parser.add_argument('--colab', type=str2bool, default=False, nargs='?', const=True, help='Is colab user or not')
433
- parser.add_argument('--api_open', type=str2bool, default=False, nargs='?', const=True, help='Enable api or not in Gradio')
434
- parser.add_argument('--inbrowser', type=str2bool, default=True, nargs='?', const=True, help='Whether to automatically start Gradio app or not')
 
 
435
  parser.add_argument('--whisper_model_dir', type=str, default=WHISPER_MODELS_DIR,
436
  help='Directory path of the whisper model')
437
  parser.add_argument('--faster_whisper_model_dir', type=str, default=FASTER_WHISPER_MODELS_DIR,
 
1
  import os
2
  import argparse
3
  import gradio as gr
4
+ from gradio_i18n import Translate, gettext as _
5
  import yaml
6
 
7
  from modules.utils.paths import (FASTER_WHISPER_MODELS_DIR, DIARIZATION_MODELS_DIR, OUTPUT_DIR, WHISPER_MODELS_DIR,
8
  INSANELY_FAST_WHISPER_MODELS_DIR, NLLB_MODELS_DIR, DEFAULT_PARAMETERS_CONFIG_PATH,
9
+ UVR_MODELS_DIR, I18N_YAML_PATH)
10
+ from modules.utils.constants import AUTOMATIC_DETECTION
11
  from modules.utils.files_manager import load_yaml
12
  from modules.whisper.whisper_factory import WhisperFactory
13
  from modules.whisper.faster_whisper_inference import FasterWhisperInference
 
24
  def __init__(self, args):
25
  self.args = args
26
  self.app = gr.Blocks(css=CSS, theme=self.args.theme, delete_cache=(60, 3600))
27
+ self.i18n = Translate(I18N_YAML_PATH)
28
  self.whisper_inf = WhisperFactory.create_whisper_inference(
29
  whisper_type=self.args.whisper_type,
30
  whisper_model_dir=self.args.whisper_model_dir,
 
41
  output_dir=os.path.join(self.args.output_dir, "translations")
42
  )
43
  self.default_params = load_yaml(DEFAULT_PARAMETERS_CONFIG_PATH)
44
+ print(f"Use \"{self.args.whisper_type}\" implementation\n"
45
+ f"Device \"{self.whisper_inf.device}\" is detected")
46
 
47
  def create_whisper_parameters(self):
48
  whisper_params = self.default_params["whisper"]
 
52
 
53
  with gr.Row():
54
  dd_model = gr.Dropdown(choices=self.whisper_inf.available_models, value=whisper_params["model_size"],
55
+ label=_("Model"))
56
+ dd_lang = gr.Dropdown(choices=self.whisper_inf.available_langs + [AUTOMATIC_DETECTION],
57
+ value=AUTOMATIC_DETECTION if whisper_params["lang"] == AUTOMATIC_DETECTION.unwrap()
58
+ else whisper_params["lang"], label=_("Language"))
59
+ dd_file_format = gr.Dropdown(choices=["SRT", "WebVTT", "txt"], value="SRT", label=_("File Format"))
60
  with gr.Row():
61
+ cb_translate = gr.Checkbox(value=whisper_params["is_translate"], label=_("Translate to English?"),
62
  interactive=True)
63
  with gr.Row():
64
+ cb_timestamp = gr.Checkbox(value=whisper_params["add_timestamp"],
65
+ label=_("Add a timestamp to the end of the filename"),
66
  interactive=True)
67
 
68
+ with gr.Accordion(_("Advanced Parameters"), open=False):
69
+ nb_beam_size = gr.Number(label="Beam Size", value=whisper_params["beam_size"], precision=0,
70
+ interactive=True,
71
  info="Beam size to use for decoding.")
72
+ nb_log_prob_threshold = gr.Number(label="Log Probability Threshold",
73
+ value=whisper_params["log_prob_threshold"], interactive=True,
74
  info="If the average log probability over sampled tokens is below this value, treat as failed.")
75
+ nb_no_speech_threshold = gr.Number(label="No Speech Threshold", value=whisper_params["no_speech_threshold"],
76
+ interactive=True,
77
  info="If the no speech probability is higher than this value AND the average log probability over sampled tokens is below 'Log Prob Threshold', consider the segment as silent.")
78
  dd_compute_type = gr.Dropdown(label="Compute Type", choices=self.whisper_inf.available_compute_types,
79
  value=self.whisper_inf.current_compute_type, interactive=True,
 
83
  info="Number of candidates when sampling with non-zero temperature.")
84
  nb_patience = gr.Number(label="Patience", value=whisper_params["patience"], interactive=True,
85
  info="Beam search patience factor.")
86
+ cb_condition_on_previous_text = gr.Checkbox(label="Condition On Previous Text",
87
+ value=whisper_params["condition_on_previous_text"],
88
  interactive=True,
89
  info="Condition on previous text during decoding.")
90
+ sld_prompt_reset_on_temperature = gr.Slider(label="Prompt Reset On Temperature",
91
+ value=whisper_params["prompt_reset_on_temperature"],
92
  minimum=0, maximum=1, step=0.01, interactive=True,
93
  info="Resets prompt if temperature is above this value."
94
  " Arg has effect only if 'Condition On Previous Text' is True.")
 
97
  sd_temperature = gr.Slider(label="Temperature", value=whisper_params["temperature"], minimum=0.0,
98
  step=0.01, maximum=1.0, interactive=True,
99
  info="Temperature for sampling. It can be a tuple of temperatures, which will be successively used upon failures according to either `Compression Ratio Threshold` or `Log Prob Threshold`.")
100
+ nb_compression_ratio_threshold = gr.Number(label="Compression Ratio Threshold",
101
+ value=whisper_params["compression_ratio_threshold"],
102
  interactive=True,
103
  info="If the gzip compression ratio is above this value, treat as failed.")
104
  nb_chunk_length = gr.Number(label="Chunk Length (s)", value=lambda: whisper_params["chunk_length"],
 
107
  with gr.Group(visible=isinstance(self.whisper_inf, FasterWhisperInference)):
108
  nb_length_penalty = gr.Number(label="Length Penalty", value=whisper_params["length_penalty"],
109
  info="Exponential length penalty constant.")
110
+ nb_repetition_penalty = gr.Number(label="Repetition Penalty",
111
+ value=whisper_params["repetition_penalty"],
112
  info="Penalty applied to the score of previously generated tokens (set > 1 to penalize).")
113
+ nb_no_repeat_ngram_size = gr.Number(label="No Repeat N-gram Size",
114
+ value=whisper_params["no_repeat_ngram_size"],
115
  precision=0,
116
  info="Prevent repetitions of n-grams with this size (set 0 to disable).")
117
  tb_prefix = gr.Textbox(label="Prefix", value=lambda: whisper_params["prefix"],
 
120
  info="Suppress blank outputs at the beginning of the sampling.")
121
  tb_suppress_tokens = gr.Textbox(label="Suppress Tokens", value=whisper_params["suppress_tokens"],
122
  info="List of token IDs to suppress. -1 will suppress a default set of symbols as defined in the model config.json file.")
123
+ nb_max_initial_timestamp = gr.Number(label="Max Initial Timestamp",
124
+ value=whisper_params["max_initial_timestamp"],
125
  info="The initial timestamp cannot be later than this.")
126
  cb_word_timestamps = gr.Checkbox(label="Word Timestamps", value=whisper_params["word_timestamps"],
127
  info="Extract word-level timestamps using the cross-attention pattern and dynamic time warping, and include the timestamps for each word in each segment.")
128
+ tb_prepend_punctuations = gr.Textbox(label="Prepend Punctuations",
129
+ value=whisper_params["prepend_punctuations"],
130
  info="If 'Word Timestamps' is True, merge these punctuation symbols with the next word.")
131
+ tb_append_punctuations = gr.Textbox(label="Append Punctuations",
132
+ value=whisper_params["append_punctuations"],
133
  info="If 'Word Timestamps' is True, merge these punctuation symbols with the previous word.")
134
  nb_max_new_tokens = gr.Number(label="Max New Tokens", value=lambda: whisper_params["max_new_tokens"],
135
  precision=0,
136
  info="Maximum number of new tokens to generate per-chunk. If not set, the maximum will be set by the default max_length.")
137
  nb_hallucination_silence_threshold = gr.Number(label="Hallucination Silence Threshold (sec)",
138
+ value=lambda: whisper_params[
139
+ "hallucination_silence_threshold"],
140
  info="When 'Word Timestamps' is True, skip silent periods longer than this threshold (in seconds) when a possible hallucination is detected.")
141
  tb_hotwords = gr.Textbox(label="Hotwords", value=lambda: whisper_params["hotwords"],
142
  info="Hotwords/hint phrases to provide the model with. Has no effect if prefix is not None.")
143
+ nb_language_detection_threshold = gr.Number(label="Language Detection Threshold",
144
+ value=lambda: whisper_params[
145
+ "language_detection_threshold"],
146
  info="If the maximum probability of the language tokens is higher than this value, the language is detected.")
147
+ nb_language_detection_segments = gr.Number(label="Language Detection Segments",
148
+ value=lambda: whisper_params["language_detection_segments"],
149
  precision=0,
150
  info="Number of segments to consider for the language detection.")
151
  with gr.Group(visible=isinstance(self.whisper_inf, InsanelyFastWhisperInference)):
152
  nb_batch_size = gr.Number(label="Batch Size", value=whisper_params["batch_size"], precision=0)
153
 
154
+ with gr.Accordion(_("Background Music Remover Filter"), open=False):
155
+ cb_bgm_separation = gr.Checkbox(label=_("Enable Background Music Remover Filter"),
156
+ value=uvr_params["is_separate_bgm"],
157
  interactive=True,
158
+ info=_("Enabling this will remove background music"))
159
+ dd_uvr_device = gr.Dropdown(label=_("Device"), value=self.whisper_inf.music_separator.device,
 
160
  choices=self.whisper_inf.music_separator.available_devices)
161
+ dd_uvr_model_size = gr.Dropdown(label=_("Model"), value=uvr_params["model_size"],
162
  choices=self.whisper_inf.music_separator.available_models)
163
  nb_uvr_segment_size = gr.Number(label="Segment Size", value=uvr_params["segment_size"], precision=0)
164
+ cb_uvr_save_file = gr.Checkbox(label=_("Save separated files to output"), value=uvr_params["save_file"])
165
+ cb_uvr_enable_offload = gr.Checkbox(label=_("Offload sub model after removing background music"),
166
  value=uvr_params["enable_offload"])
167
 
168
+ with gr.Accordion(_("Voice Detection Filter"), open=False):
169
+ cb_vad_filter = gr.Checkbox(label=_("Enable Silero VAD Filter"), value=vad_params["vad_filter"],
170
  interactive=True,
171
+ info=_("Enable this to transcribe only detected voice"))
172
  sd_threshold = gr.Slider(minimum=0.0, maximum=1.0, step=0.01, label="Speech Threshold",
173
  value=vad_params["threshold"],
174
  info="Lower it to be more sensitive to small sounds.")
 
185
  nb_speech_pad_ms = gr.Number(label="Speech Padding (ms)", precision=0, value=vad_params["speech_pad_ms"],
186
  info="Final speech chunks are padded by this time each side")
187
 
188
+ with gr.Accordion(_("Diarization"), open=False):
189
+ cb_diarize = gr.Checkbox(label=_("Enable Diarization"), value=diarization_params["is_diarize"])
190
+ tb_hf_token = gr.Text(label=_("HuggingFace Token"), value=diarization_params["hf_token"],
191
+ info=_("This is only needed the first time you download the model"))
192
+ dd_diarization_device = gr.Dropdown(label=_("Device"),
 
 
 
 
193
  choices=self.whisper_inf.diarizer.get_available_device(),
194
  value=self.whisper_inf.diarizer.get_device())
195
 
 
229
  uvr_params = self.default_params["bgm_separation"]
230
 
231
  with self.app:
232
+ with self.i18n:
233
+ with gr.Row():
 
 
 
234
  with gr.Column():
235
+ gr.Markdown(MARKDOWN, elem_id="md_project")
236
+ with gr.Tabs():
237
+ with gr.TabItem(_("File")): # tab1
238
  with gr.Column():
239
+ input_file = gr.Files(type="filepath", label=_("Upload File here"))
240
+ tb_input_folder = gr.Textbox(label="Input Folder Path (Optional)",
241
+ info="Optional: Specify the folder path where the input files are located, if you prefer to use local files instead of uploading them."
242
+ " Leave this field empty if you do not wish to use a local path.",
243
+ visible=self.args.colab,
244
+ value="")
245
 
246
+ whisper_params, dd_file_format, cb_timestamp = self.create_whisper_parameters()
247
 
248
+ with gr.Row():
249
+ btn_run = gr.Button(_("GENERATE SUBTITLE FILE"), variant="primary")
250
+ with gr.Row():
251
+ tb_indicator = gr.Textbox(label=_("Output"), scale=5)
252
+ files_subtitles = gr.Files(label=_("Downloadable output file"), scale=3, interactive=False)
253
+ btn_openfolder = gr.Button('📂', scale=1)
254
 
255
+ params = [input_file, tb_input_folder, dd_file_format, cb_timestamp]
256
+ btn_run.click(fn=self.whisper_inf.transcribe_file,
257
+ inputs=params + whisper_params.as_list(),
258
+ outputs=[tb_indicator, files_subtitles])
259
+ btn_openfolder.click(fn=lambda: self.open_folder("outputs"), inputs=None, outputs=None)
260
 
261
+ with gr.TabItem(_("Youtube")): # tab2
262
+ with gr.Row():
263
+ tb_youtubelink = gr.Textbox(label=_("Youtube Link"))
264
+ with gr.Row(equal_height=True):
265
+ with gr.Column():
266
+ img_thumbnail = gr.Image(label=_("Youtube Thumbnail"))
267
+ with gr.Column():
268
+ tb_title = gr.Label(label=_("Youtube Title"))
269
+ tb_description = gr.Textbox(label=_("Youtube Description"), max_lines=15)
270
 
271
+ whisper_params, dd_file_format, cb_timestamp = self.create_whisper_parameters()
 
 
272
 
273
+ with gr.Row():
274
+ btn_run = gr.Button(_("GENERATE SUBTITLE FILE"), variant="primary")
275
+ with gr.Row():
276
+ tb_indicator = gr.Textbox(label=_("Output"), scale=5)
277
+ files_subtitles = gr.Files(label=_("Downloadable output file"), scale=3)
278
+ btn_openfolder = gr.Button('📂', scale=1)
279
 
280
+ params = [tb_youtubelink, dd_file_format, cb_timestamp]
 
 
 
 
 
281
 
282
+ btn_run.click(fn=self.whisper_inf.transcribe_youtube,
283
+ inputs=params + whisper_params.as_list(),
284
+ outputs=[tb_indicator, files_subtitles])
285
+ tb_youtubelink.change(get_ytmetas, inputs=[tb_youtubelink],
286
+ outputs=[img_thumbnail, tb_title, tb_description])
287
+ btn_openfolder.click(fn=lambda: self.open_folder("outputs"), inputs=None, outputs=None)
288
 
289
+ with gr.TabItem(_("Mic")): # tab3
290
+ with gr.Row():
291
+ mic_input = gr.Microphone(label=_("Record with Mic"), type="filepath", interactive=True)
 
292
 
293
+ whisper_params, dd_file_format, cb_timestamp = self.create_whisper_parameters()
 
 
294
 
 
 
 
 
 
 
 
 
295
  with gr.Row():
296
+ btn_run = gr.Button(_("GENERATE SUBTITLE FILE"), variant="primary")
297
  with gr.Row():
298
+ tb_indicator = gr.Textbox(label=_("Output"), scale=5)
299
+ files_subtitles = gr.Files(label=_("Downloadable output file"), scale=3)
 
 
 
 
 
300
  btn_openfolder = gr.Button('📂', scale=1)
301
 
302
+ params = [mic_input, dd_file_format, cb_timestamp]
 
 
 
303
 
304
+ btn_run.click(fn=self.whisper_inf.transcribe_mic,
305
+ inputs=params + whisper_params.as_list(),
306
+ outputs=[tb_indicator, files_subtitles])
307
+ btn_openfolder.click(fn=lambda: self.open_folder("outputs"), inputs=None, outputs=None)
308
 
309
+ with gr.TabItem(_("T2T Translation")): # tab 4
 
 
 
 
 
 
 
 
 
 
 
 
 
310
  with gr.Row():
311
+ file_subs = gr.Files(type="filepath", label=_("Upload Subtitle Files to translate here"))
312
+
313
+ with gr.TabItem(_("DeepL API")): # sub tab1
314
+ with gr.Row():
315
+ tb_api_key = gr.Textbox(label=_("Your Auth Key (API KEY)"),
316
+ value=deepl_params["api_key"])
317
+ with gr.Row():
318
+ dd_source_lang = gr.Dropdown(label=_("Source Language"),
319
+ value=AUTOMATIC_DETECTION if deepl_params["source_lang"] == AUTOMATIC_DETECTION.unwrap()
320
+ else deepl_params["source_lang"],
321
+ choices=list(self.deepl_api.available_source_langs.keys()))
322
+ dd_target_lang = gr.Dropdown(label=_("Target Language"),
323
+ value=deepl_params["target_lang"],
324
+ choices=list(self.deepl_api.available_target_langs.keys()))
325
+ with gr.Row():
326
+ cb_is_pro = gr.Checkbox(label=_("Pro User?"), value=deepl_params["is_pro"])
327
+ with gr.Row():
328
+ cb_timestamp = gr.Checkbox(value=translation_params["add_timestamp"],
329
+ label=_("Add a timestamp to the end of the filename"),
330
+ interactive=True)
331
+ with gr.Row():
332
+ btn_run = gr.Button(_("TRANSLATE SUBTITLE FILE"), variant="primary")
333
+ with gr.Row():
334
+ tb_indicator = gr.Textbox(label=_("Output"), scale=5)
335
+ files_subtitles = gr.Files(label=_("Downloadable output file"), scale=3)
336
+ btn_openfolder = gr.Button('📂', scale=1)
337
+
338
+ btn_run.click(fn=self.deepl_api.translate_deepl,
339
+ inputs=[tb_api_key, file_subs, dd_source_lang, dd_target_lang,
340
+ cb_is_pro, cb_timestamp],
341
+ outputs=[tb_indicator, files_subtitles])
342
+
343
+ btn_openfolder.click(
344
+ fn=lambda: self.open_folder(os.path.join(self.args.output_dir, "translations")),
345
+ inputs=None,
346
+ outputs=None)
347
+
348
+ with gr.TabItem(_("NLLB")): # sub tab2
349
+ with gr.Row():
350
+ dd_model_size = gr.Dropdown(label=_("Model"), value=nllb_params["model_size"],
351
+ choices=self.nllb_inf.available_models)
352
+ dd_source_lang = gr.Dropdown(label=_("Source Language"),
353
+ value=nllb_params["source_lang"],
354
+ choices=self.nllb_inf.available_source_langs)
355
+ dd_target_lang = gr.Dropdown(label=_("Target Language"),
356
+ value=nllb_params["target_lang"],
357
+ choices=self.nllb_inf.available_target_langs)
358
+ with gr.Row():
359
+ nb_max_length = gr.Number(label="Max Length Per Line", value=nllb_params["max_length"],
360
+ precision=0)
361
+ with gr.Row():
362
+ cb_timestamp = gr.Checkbox(value=translation_params["add_timestamp"],
363
+ label=_("Add a timestamp to the end of the filename"),
364
+ interactive=True)
365
+ with gr.Row():
366
+ btn_run = gr.Button(_("TRANSLATE SUBTITLE FILE"), variant="primary")
367
+ with gr.Row():
368
+ tb_indicator = gr.Textbox(label=_("Output"), scale=5)
369
+ files_subtitles = gr.Files(label=_("Downloadable output file"), scale=3)
370
+ btn_openfolder = gr.Button('📂', scale=1)
371
+ with gr.Column():
372
+ md_vram_table = gr.HTML(NLLB_VRAM_TABLE, elem_id="md_nllb_vram_table")
373
+
374
+ btn_run.click(fn=self.nllb_inf.translate_file,
375
+ inputs=[file_subs, dd_model_size, dd_source_lang, dd_target_lang,
376
+ nb_max_length, cb_timestamp],
377
+ outputs=[tb_indicator, files_subtitles])
378
+
379
+ btn_openfolder.click(
380
+ fn=lambda: self.open_folder(os.path.join(self.args.output_dir, "translations")),
381
+ inputs=None,
382
+ outputs=None)
383
+
384
+ with gr.TabItem(_("BGM Separation")):
385
+ files_audio = gr.Files(type="filepath", label=_("Upload Audio Files to separate background music"))
386
+ dd_uvr_device = gr.Dropdown(label=_("Device"), value=self.whisper_inf.music_separator.device,
387
+ choices=self.whisper_inf.music_separator.available_devices)
388
+ dd_uvr_model_size = gr.Dropdown(label=_("Model"), value=uvr_params["model_size"],
389
+ choices=self.whisper_inf.music_separator.available_models)
390
+ nb_uvr_segment_size = gr.Number(label="Segment Size", value=uvr_params["segment_size"],
391
+ precision=0)
392
+ cb_uvr_save_file = gr.Checkbox(label=_("Save separated files to output"),
393
+ value=True, visible=False)
394
+ btn_run = gr.Button(_("SEPARATE BACKGROUND MUSIC"), variant="primary")
395
  with gr.Column():
396
+ with gr.Row():
397
+ ad_instrumental = gr.Audio(label=_("Instrumental"), scale=8)
398
+ btn_open_instrumental_folder = gr.Button('📂', scale=1)
399
+ with gr.Row():
400
+ ad_vocals = gr.Audio(label=_("Vocals"), scale=8)
401
+ btn_open_vocals_folder = gr.Button('📂', scale=1)
402
+
403
+ btn_run.click(fn=self.whisper_inf.music_separator.separate_files,
404
+ inputs=[files_audio, dd_uvr_model_size, dd_uvr_device, nb_uvr_segment_size,
405
+ cb_uvr_save_file],
406
+ outputs=[ad_instrumental, ad_vocals])
407
+ btn_open_instrumental_folder.click(inputs=None,
408
+ outputs=None,
409
+ fn=lambda: self.open_folder(os.path.join(
410
+ self.args.output_dir, "UVR", "instrumental"
411
+ )))
412
+ btn_open_vocals_folder.click(inputs=None,
413
+ outputs=None,
414
+ fn=lambda: self.open_folder(os.path.join(
415
+ self.args.output_dir, "UVR", "vocals"
416
+ )))
417
 
418
  # Launch the app with optional gradio settings
419
  args = self.args
 
446
  return gr.Checkbox(visible=True, value=False, label="Translate to English?", interactive=True)
447
 
448
 
 
449
  parser = argparse.ArgumentParser()
450
  parser.add_argument('--whisper_type', type=str, default="faster-whisper",
451
+ choices=["whisper", "faster-whisper", "insanely-fast-whisper"],
452
+ help='A type of the whisper implementation (Github repo name)')
453
  parser.add_argument('--share', type=str2bool, default=False, nargs='?', const=True, help='Gradio share value')
454
  parser.add_argument('--server_name', type=str, default=None, help='Gradio server host')
455
  parser.add_argument('--server_port', type=int, default=None, help='Gradio server port')
 
458
  parser.add_argument('--password', type=str, default=None, help='Gradio authentication password')
459
  parser.add_argument('--theme', type=str, default=None, help='Gradio Blocks theme')
460
  parser.add_argument('--colab', type=str2bool, default=False, nargs='?', const=True, help='Is colab user or not')
461
+ parser.add_argument('--api_open', type=str2bool, default=False, nargs='?', const=True,
462
+ help='Enable api or not in Gradio')
463
+ parser.add_argument('--inbrowser', type=str2bool, default=True, nargs='?', const=True,
464
+ help='Whether to automatically start Gradio app or not')
465
  parser.add_argument('--whisper_model_dir', type=str, default=WHISPER_MODELS_DIR,
466
  help='Directory path of the whisper model')
467
  parser.add_argument('--faster_whisper_model_dir', type=str, default=FASTER_WHISPER_MODELS_DIR,
configs/translation.yaml ADDED
@@ -0,0 +1,321 @@
1
+ en: # English
2
+ Language: Language
3
+ File: File
4
+ Youtube: Youtube
5
+ Mic: Mic
6
+ T2T Translation: T2T Translation
7
+ BGM Separation: BGM Separation
8
+ GENERATE SUBTITLE FILE: GENERATE SUBTITLE FILE
9
+ Output: Output
10
+ Downloadable output file: Downloadable output file
11
+ Upload File here: Upload File here
12
+ Model: Model
13
+ Automatic Detection: Automatic Detection
14
+ File Format: File Format
15
+ Translate to English?: Translate to English?
16
+ Add a timestamp to the end of the filename: Add a timestamp to the end of the filename
17
+ Advanced Parameters: Advanced Parameters
18
+ Background Music Remover Filter: Background Music Remover Filter
19
+ Enabling this will remove background music: Enabling this will remove background music by submodel before transcribing
20
+ Enable Background Music Remover Filter: Enable Background Music Remover Filter
21
+ Save separated files to output: Save separated files to output
22
+ Offload sub model after removing background music: Offload sub model after removing background music
23
+ Voice Detection Filter: Voice Detection Filter
24
+ Enable this to transcribe only detected voice: Enable this to transcribe only detected voice parts by submodel.
25
+ Enable Silero VAD Filter: Enable Silero VAD Filter
26
+ Diarization: Diarization
27
+ Enable Diarization: Enable Diarization
28
+ HuggingFace Token: HuggingFace Token
29
+ This is only needed the first time you download the model: This is only needed the first time you download the model. If you already have models, you don't need to enter. To download the model, you must manually go to "https://huggingface.co/pyannote/speaker-diarization-3.1" and "https://huggingface.co/pyannote/segmentation-3.0" and agree to their requirement.
30
+ Device: Device
31
+ Youtube Link: Youtube Link
32
+ Youtube Thumbnail: Youtube Thumbnail
33
+ Youtube Title: Youtube Title
34
+ Youtube Description: Youtube Description
35
+ Record with Mic: Record with Mic
36
+ Upload Subtitle Files to translate here: Upload Subtitle Files to translate here
37
+ Your Auth Key (API KEY): Your Auth Key (API KEY)
38
+ Source Language: Source Language
39
+ Target Language: Target Language
40
+ Pro User?: Pro User?
41
+ TRANSLATE SUBTITLE FILE: TRANSLATE SUBTITLE FILE
42
+ Upload Audio Files to separate background music: Upload Audio Files to separate background music
43
+ Instrumental: Instrumental
44
+ Vocals: Vocals
45
+ SEPARATE BACKGROUND MUSIC: SEPARATE BACKGROUND MUSIC
46
+
47
+ ko: # Korean
48
+ Language: 언어
49
+ File: 파일
50
+ Youtube: 유튜브
51
+ Mic: 마이크
52
+ T2T Translation: T2T 자막 번역
53
+ BGM Separation: 배경 음악 분리
54
+ GENERATE SUBTITLE FILE: 자막 파일 생성
55
+ Output: 결과물
56
+ Downloadable output file: 결과물 파일 다운로드
57
+ Upload File here: 파일을 업로드 하세요
58
+ Model: 모델
59
+ Automatic Detection: 자동 감지
60
+ File Format: 파일 형식
61
+ Translate to English?: 영어로 번역합니까? (위스퍼 모델 자체 번역 기능)
62
+ Add a timestamp to the end of the filename: 파일 이름 끝에 타임스태프 붙이기
63
+ Advanced Parameters: 고급 변수
64
+ Background Music Remover Filter: 배경 음악 제거 필터
65
+ Enabling this will remove background music: 받아쓰기 이전에 먼저 배경 음악 제거용 서브 모델을 활성화 합니다.
66
+ Enable Background Music Remover Filter: 배경 음악 제거 필터 활성화
67
+ Save separated files to output: 분리된 배경 음악 & 음성 파일 따로 출력 폴더에 저장
68
+ Offload sub model after removing background music: 배경 음악 제거 후 서브 모델을 비활성화 합니다. (VRAM 이 부족할 시 체크하세요.)
69
+ Voice Detection Filter: 목소리 감지 필터
70
+ Enable this to transcribe only detected voice: 서브 모델에 의해 목소리라고 판단된 부분만 받아쓰기를 진행합니다.
71
+ Enable Silero VAD Filter: Silero VAD 필터 활성화
72
+ Diarization: 화자 구분
73
+ Enable Diarization: 화자 구분 활성화
74
+ HuggingFace Token: 허깅페이스 토큰
75
+ This is only needed the first time you download the model: 모델을 처음 다운받을 때만 토큰이 필요합니다. 이미 다운로드 받으신 상태라면 입력하지 않아도 됩니다. 모델을 다운 받기 위해선 "https://huggingface.co/pyannote/speaker-diarization-3.1" 와 "https://huggingface.co/pyannote/segmentation-3.0" 에서 먼저 사용 지침에 동의하셔야 합니다.
76
+ Device: 디바이스
77
+ Youtube Link: 유튜브 링크
78
+ Youtube Thumbnail: 유튜브 썸네일
79
+ Youtube Title: 유튜브 제목
80
+ Youtube Description: 유튜브 설명
81
+ Record with Mic: 마이크로 녹음하세요
82
+ Upload Subtitle Files to translate here: 번역할 자막 파일을 업로드 하세요
83
+ Your Auth Key (API KEY): DeepL API 키
84
+ Source Language: 원본 언어
85
+ Target Language: 대상 언어
86
+ Pro User?: Pro 버전 사용자
87
+ TRANSLATE SUBTITLE FILE: 자막 파일 번역
88
+ Upload Audio Files to separate background music: 배경 음악을 분리할 오디오 파일을 업로드 하세요
89
+ Instrumental: 악기
90
+ Vocals: 보컬
91
+ SEPARATE BACKGROUND MUSIC: 배경 음악 분리
92
+
93
+ ja: # Japanese
94
+ Language: 言語
95
+ File: File
96
+ Youtube: Youtube
97
+ Mic: Mic
98
+ T2T Translation: T2T Translation
99
+ BGM Separation: BGM Separation
100
+ GENERATE SUBTITLE FILE: GENERATE SUBTITLE FILE
101
+ Output: Output
102
+ Downloadable output file: Downloadable output file
103
+ Upload File here: Upload File here
104
+ Model: Model
105
+ Automatic Detection: Automatic Detection
106
+ File Format: File Format
107
+ Translate to English?: Translate to English?
108
+ Add a timestamp to the end of the filename: Add a timestamp to the end of the filename
109
+ Advanced Parameters: Advanced Parameters
110
+ Background Music Remover Filter: Background Music Remover Filter
111
+ Enabling this will remove background music: Enabling this will remove background music by submodel before transcribing
112
+ Enable Background Music Remover Filter: Enable Background Music Remover Filter
113
+ Save separated files to output: Save separated files to output
114
+ Offload sub model after removing background music: Offload sub model after removing background music
115
+ Voice Detection Filter: Voice Detection Filter
116
+ Enable this to transcribe only detected voice: Enable this to transcribe only detected voice parts by submodel.
117
+ Enable Silero VAD Filter: Enable Silero VAD Filter
118
+ Diarization: Diarization
119
+ Enable Diarization: Enable Diarization
120
+ HuggingFace Token: HuggingFace Token
121
+ This is only needed the first time you download the model: This is only needed the first time you download the model. If you already have models, you don't need to enter. To download the model, you must manually go to "https://huggingface.co/pyannote/speaker-diarization-3.1" and "https://huggingface.co/pyannote/segmentation-3.0" and agree to their requirement.
122
+ Device: Device
123
+ Youtube Link: Youtube Link
124
+ Youtube Thumbnail: Youtube Thumbnail
125
+ Youtube Title: Youtube Title
126
+ Youtube Description: Youtube Description
127
+ Record with Mic: Record with Mic
128
+ Upload Subtitle Files to translate here: Upload Subtitle Files to translate here
129
+ Your Auth Key (API KEY): Your Auth Key (API KEY)
130
+ Source Language: Source Language
131
+ Target Language: Target Language
132
+ Pro User?: Pro User?
133
+ TRANSLATE SUBTITLE FILE: TRANSLATE SUBTITLE FILE
134
+ Upload Audio Files to separate background music: Upload Audio Files to separate background music
135
+ Instrumental: Instrumental
136
+ Vocals: Vocals
137
+ SEPARATE BACKGROUND MUSIC: SEPARATE BACKGROUND MUSIC
138
+
139
+ es: # Spanish
140
+ Language: Idioma
141
+ File: File
142
+ Youtube: Youtube
143
+ Mic: Mic
144
+ T2T Translation: T2T Translation
145
+ BGM Separation: BGM Separation
146
+ GENERATE SUBTITLE FILE: GENERATE SUBTITLE FILE
147
+ Output: Output
148
+ Downloadable output file: Downloadable output file
149
+ Upload File here: Upload File here
150
+ Model: Model
151
+ Automatic Detection: Automatic Detection
152
+ File Format: File Format
153
+ Translate to English?: Translate to English?
154
+ Add a timestamp to the end of the filename: Add a timestamp to the end of the filename
155
+ Advanced Parameters: Advanced Parameters
156
+ Background Music Remover Filter: Background Music Remover Filter
157
+ Enabling this will remove background music: Enabling this will remove background music by submodel before transcribing
158
+ Enable Background Music Remover Filter: Enable Background Music Remover Filter
159
+ Save separated files to output: Save separated files to output
160
+ Offload sub model after removing background music: Offload sub model after removing background music
161
+ Voice Detection Filter: Voice Detection Filter
162
+ Enable this to transcribe only detected voice: Enable this to transcribe only detected voice parts by submodel.
163
+ Enable Silero VAD Filter: Enable Silero VAD Filter
164
+ Diarization: Diarization
165
+ Enable Diarization: Enable Diarization
166
+ HuggingFace Token: HuggingFace Token
167
+ This is only needed the first time you download the model: This is only needed the first time you download the model. If you already have models, you don't need to enter. To download the model, you must manually go to "https://huggingface.co/pyannote/speaker-diarization-3.1" and "https://huggingface.co/pyannote/segmentation-3.0" and agree to their requirement.
168
+ Device: Device
169
+ Youtube Link: Youtube Link
170
+ Youtube Thumbnail: Youtube Thumbnail
171
+ Youtube Title: Youtube Title
172
+ Youtube Description: Youtube Description
173
+ Record with Mic: Record with Mic
174
+ Upload Subtitle Files to translate here: Upload Subtitle Files to translate here
175
+ Your Auth Key (API KEY): Your Auth Key (API KEY)
176
+ Source Language: Source Language
177
+ Target Language: Target Language
178
+ Pro User?: Pro User?
179
+ TRANSLATE SUBTITLE FILE: TRANSLATE SUBTITLE FILE
180
+ Upload Audio Files to separate background music: Upload Audio Files to separate background music
181
+ Instrumental: Instrumental
182
+ Vocals: Vocals
183
+ SEPARATE BACKGROUND MUSIC: SEPARATE BACKGROUND MUSIC
184
+
185
+ fr: # French
186
+ Language: Langue
187
+ File: File
188
+ Youtube: Youtube
189
+ Mic: Mic
190
+ T2T Translation: T2T Translation
191
+ BGM Separation: BGM Separation
192
+ GENERATE SUBTITLE FILE: GENERATE SUBTITLE FILE
193
+ Output: Output
194
+ Downloadable output file: Downloadable output file
195
+ Upload File here: Upload File here
196
+ Model: Model
197
+ Automatic Detection: Automatic Detection
198
+ File Format: File Format
199
+ Translate to English?: Translate to English?
200
+ Add a timestamp to the end of the filename: Add a timestamp to the end of the filename
201
+ Advanced Parameters: Advanced Parameters
202
+ Background Music Remover Filter: Background Music Remover Filter
203
+ Enabling this will remove background music: Enabling this will remove background music by submodel before transcribing
204
+ Enable Background Music Remover Filter: Enable Background Music Remover Filter
205
+ Save separated files to output: Save separated files to output
206
+ Offload sub model after removing background music: Offload sub model after removing background music
207
+ Voice Detection Filter: Voice Detection Filter
208
+ Enable this to transcribe only detected voice: Enable this to transcribe only detected voice parts by submodel.
209
+ Enable Silero VAD Filter: Enable Silero VAD Filter
210
+ Diarization: Diarization
211
+ Enable Diarization: Enable Diarization
212
+ HuggingFace Token: HuggingFace Token
213
+ This is only needed the first time you download the model: This is only needed the first time you download the model. If you already have models, you don't need to enter. To download the model, you must manually go to "https://huggingface.co/pyannote/speaker-diarization-3.1" and "https://huggingface.co/pyannote/segmentation-3.0" and agree to their requirement.
214
+ Device: Device
215
+ Youtube Link: Youtube Link
216
+ Youtube Thumbnail: Youtube Thumbnail
217
+ Youtube Title: Youtube Title
218
+ Youtube Description: Youtube Description
219
+ Record with Mic: Record with Mic
220
+ Upload Subtitle Files to translate here: Upload Subtitle Files to translate here
221
+ Your Auth Key (API KEY): Your Auth Key (API KEY)
222
+ Source Language: Source Language
223
+ Target Language: Target Language
224
+ Pro User?: Pro User?
225
+ TRANSLATE SUBTITLE FILE: TRANSLATE SUBTITLE FILE
226
+ Upload Audio Files to separate background music: Upload Audio Files to separate background music
227
+ Instrumental: Instrumental
228
+ Vocals: Vocals
229
+ SEPARATE BACKGROUND MUSIC: SEPARATE BACKGROUND MUSIC
230
+
231
+ de: # German
232
+ Language: Sprache
233
+ File: File
234
+ Youtube: Youtube
235
+ Mic: Mic
236
+ T2T Translation: T2T Translation
237
+ BGM Separation: BGM Separation
238
+ GENERATE SUBTITLE FILE: GENERATE SUBTITLE FILE
239
+ Output: Output
240
+ Downloadable output file: Downloadable output file
241
+ Upload File here: Upload File here
242
+ Model: Model
243
+ Automatic Detection: Automatic Detection
244
+ File Format: File Format
245
+ Translate to English?: Translate to English?
246
+ Add a timestamp to the end of the filename: Add a timestamp to the end of the filename
247
+ Advanced Parameters: Advanced Parameters
248
+ Background Music Remover Filter: Background Music Remover Filter
249
+ Enabling this will remove background music: Enabling this will remove background music by submodel before transcribing
250
+ Enable Background Music Remover Filter: Enable Background Music Remover Filter
251
+ Save separated files to output: Save separated files to output
252
+ Offload sub model after removing background music: Offload sub model after removing background music
253
+ Voice Detection Filter: Voice Detection Filter
254
+ Enable this to transcribe only detected voice: Enable this to transcribe only detected voice parts by submodel.
255
+ Enable Silero VAD Filter: Enable Silero VAD Filter
256
+ Diarization: Diarization
257
+ Enable Diarization: Enable Diarization
258
+ HuggingFace Token: HuggingFace Token
259
+ This is only needed the first time you download the model: This is only needed the first time you download the model. If you already have models, you don't need to enter. To download the model, you must manually go to "https://huggingface.co/pyannote/speaker-diarization-3.1" and "https://huggingface.co/pyannote/segmentation-3.0" and agree to their requirement.
260
+ Device: Device
261
+ Youtube Link: Youtube Link
262
+ Youtube Thumbnail: Youtube Thumbnail
263
+ Youtube Title: Youtube Title
264
+ Youtube Description: Youtube Description
265
+ Record with Mic: Record with Mic
266
+ Upload Subtitle Files to translate here: Upload Subtitle Files to translate here
267
+ Your Auth Key (API KEY): Your Auth Key (API KEY)
268
+ Source Language: Source Language
269
+ Target Language: Target Language
270
+ Pro User?: Pro User?
271
+ TRANSLATE SUBTITLE FILE: TRANSLATE SUBTITLE FILE
272
+ Upload Audio Files to separate background music: Upload Audio Files to separate background music
273
+ Instrumental: Instrumental
274
+ Vocals: Vocals
275
+ SEPARATE BACKGROUND MUSIC: SEPARATE BACKGROUND MUSIC
276
+
277
+ zh: # Chinese
278
+ Language: 语言
279
+ File: File
280
+ Youtube: Youtube
281
+ Mic: Mic
282
+ T2T Translation: T2T Translation
283
+ BGM Separation: BGM Separation
284
+ GENERATE SUBTITLE FILE: GENERATE SUBTITLE FILE
285
+ Output: Output
286
+ Downloadable output file: Downloadable output file
287
+ Upload File here: Upload File here
288
+ Model: Model
289
+ Automatic Detection: Automatic Detection
290
+ File Format: File Format
291
+ Translate to English?: Translate to English?
292
+ Add a timestamp to the end of the filename: Add a timestamp to the end of the filename
293
+ Advanced Parameters: Advanced Parameters
294
+ Background Music Remover Filter: Background Music Remover Filter
295
+ Enabling this will remove background music: Enabling this will remove background music with the sub model before transcribing
296
+ Enable Background Music Remover Filter: Enable Background Music Remover Filter
297
+ Save separated files to output: Save separated files to output
298
+ Offload sub model after removing background music: Offload sub model after removing background music
299
+ Voice Detection Filter: Voice Detection Filter
300
+ Enable this to transcribe only detected voice: Enable this to transcribe only the voice parts detected by the sub model.
301
+ Enable Silero VAD Filter: Enable Silero VAD Filter
302
+ Diarization: Diarization
303
+ Enable Diarization: Enable Diarization
304
+ HuggingFace Token: HuggingFace Token
305
+ This is only needed the first time you download the model: This is only needed the first time you download the model. If you already have the models, you don't need to enter a token. To download the models, you must manually go to "https://huggingface.co/pyannote/speaker-diarization-3.1" and "https://huggingface.co/pyannote/segmentation-3.0" and agree to their requirements.
306
+ Device: Device
307
+ Youtube Link: Youtube Link
308
+ Youtube Thumbnail: Youtube Thumbnail
309
+ Youtube Title: Youtube Title
310
+ Youtube Description: Youtube Description
311
+ Record with Mic: Record with Mic
312
+ Upload Subtitle Files to translate here: Upload Subtitle Files to translate here
313
+ Your Auth Key (API KEY): Your Auth Key (API KEY)
314
+ Source Language: Source Language
315
+ Target Language: Target Language
316
+ Pro User?: Pro User?
317
+ TRANSLATE SUBTITLE FILE: TRANSLATE SUBTITLE FILE
318
+ Upload Audio Files to separate background music: Upload Audio Files to separate background music
319
+ Instrumental: Instrumental
320
+ Vocals: Vocals
321
+ SEPARATE BACKGROUND MUSIC: SEPARATE BACKGROUND MUSIC
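Each language block in configs/translation.yaml maps an English source string (the exact text wrapped with gettext in the UI code) to its localized label; entries whose value still equals the key simply fall back to English, which is why most of the de and zh blocks above remain untranslated for now. The sketch below shows how such a lookup resolves; the localized() helper and the hard-coded relative path are illustrative assumptions rather than code from this PR, since gradio_i18n performs the real lookup at runtime.

```python
import yaml

def localized(label: str, lang: str, yaml_path: str = "configs/translation.yaml") -> str:
    # Illustrative lookup against the file added above (I18N_YAML_PATH in this PR).
    with open(yaml_path, encoding="utf-8") as f:
        translations = yaml.safe_load(f)
    # Top-level keys are language codes; each maps the English source string
    # to its translation. Unknown languages or labels fall back to the key itself.
    return translations.get(lang, {}).get(label, label)

print(localized("Language", "de"))  # "Sprache"
print(localized("Language", "zh"))  # "语言"
```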
modules/translation/deepl_api.py CHANGED
@@ -5,6 +5,7 @@ from datetime import datetime
5
  import gradio as gr
6
 
7
  from modules.utils.paths import TRANSLATION_OUTPUT_DIR, DEFAULT_PARAMETERS_CONFIG_PATH
 
8
  from modules.utils.subtitle_manager import *
9
  from modules.utils.files_manager import load_yaml, save_yaml
10
 
@@ -50,7 +51,7 @@ DEEPL_AVAILABLE_TARGET_LANGS = {
50
  }
51
 
52
  DEEPL_AVAILABLE_SOURCE_LANGS = {
53
- 'Automatic Detection': None,
54
  'Bulgarian': 'BG',
55
  'Czech': 'CS',
56
  'Danish': 'DA',
 
5
  import gradio as gr
6
 
7
  from modules.utils.paths import TRANSLATION_OUTPUT_DIR, DEFAULT_PARAMETERS_CONFIG_PATH
8
+ from modules.utils.constants import AUTOMATIC_DETECTION
9
  from modules.utils.subtitle_manager import *
10
  from modules.utils.files_manager import load_yaml, save_yaml
11
 
 
51
  }
52
 
53
  DEEPL_AVAILABLE_SOURCE_LANGS = {
54
+ AUTOMATIC_DETECTION: None,
55
  'Bulgarian': 'BG',
56
  'Czech': 'CS',
57
  'Danish': 'DA',
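In DEEPL_AVAILABLE_SOURCE_LANGS the localized Automatic Detection entry maps to None, which downstream code can treat as "do not send a source language". Below is a hedged sketch of that pattern against DeepL's documented v2 endpoint; the translate() helper, the free-tier URL, and the form-data auth are illustration only and are not taken from this module.

```python
import requests

def translate(text: str, target_lang: str, source_lang_code, auth_key: str) -> dict:
    # When the selected source language resolved to None, omit source_lang so
    # DeepL auto-detects it. Pro keys use api.deepl.com instead of api-free.
    data = {"auth_key": auth_key, "text": text, "target_lang": target_lang}
    if source_lang_code is not None:
        data["source_lang"] = source_lang_code
    return requests.post("https://api-free.deepl.com/v2/translate", data=data).json()
```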
modules/utils/constants.py ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ from gradio_i18n import Translate, gettext as _
2
+
3
+ AUTOMATIC_DETECTION = _("Automatic Detection")
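Defining AUTOMATIC_DETECTION once in modules/utils/constants.py lets every module compare the dropdown value against the same translatable marker instead of repeating the raw string; the same constant is later unwrapped to plain text when parameters are written back to YAML (see the whisper_parameter.py hunk further down). A minimal sketch of the comparison pattern; normalize_lang() is a hypothetical helper, not part of the PR.

```python
from modules.utils.constants import AUTOMATIC_DETECTION

def normalize_lang(lang):
    # The "Automatic Detection" dropdown choice means "do not force a language",
    # which the inference code represents as None.
    if lang is None or lang == AUTOMATIC_DETECTION:
        return None
    return lang
```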
modules/utils/paths.py CHANGED
@@ -10,6 +10,7 @@ DIARIZATION_MODELS_DIR = os.path.join(MODELS_DIR, "Diarization")
10
  UVR_MODELS_DIR = os.path.join(MODELS_DIR, "UVR", "MDX_Net_Models")
11
  CONFIGS_DIR = os.path.join(WEBUI_DIR, "configs")
12
  DEFAULT_PARAMETERS_CONFIG_PATH = os.path.join(CONFIGS_DIR, "default_parameters.yaml")
 
13
  OUTPUT_DIR = os.path.join(WEBUI_DIR, "outputs")
14
  TRANSLATION_OUTPUT_DIR = os.path.join(OUTPUT_DIR, "translations")
15
  UVR_OUTPUT_DIR = os.path.join(OUTPUT_DIR, "UVR")
 
10
  UVR_MODELS_DIR = os.path.join(MODELS_DIR, "UVR", "MDX_Net_Models")
11
  CONFIGS_DIR = os.path.join(WEBUI_DIR, "configs")
12
  DEFAULT_PARAMETERS_CONFIG_PATH = os.path.join(CONFIGS_DIR, "default_parameters.yaml")
13
+ I18N_YAML_PATH = os.path.join(CONFIGS_DIR, "translation.yaml")
14
  OUTPUT_DIR = os.path.join(WEBUI_DIR, "outputs")
15
  TRANSLATION_OUTPUT_DIR = os.path.join(OUTPUT_DIR, "translations")
16
  UVR_OUTPUT_DIR = os.path.join(OUTPUT_DIR, "UVR")
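I18N_YAML_PATH points the app at configs/translation.yaml. The snippet below is a small consistency check, not part of the PR, that warns when a language block is missing labels another block defines; it assumes the repo's load_yaml helper returns the parsed file as nested dictionaries.

```python
from modules.utils.paths import I18N_YAML_PATH
from modules.utils.files_manager import load_yaml

translations = load_yaml(I18N_YAML_PATH)
# Use the first language block as the reference key set.
reference = set(next(iter(translations.values())).keys())
for lang, labels in translations.items():
    missing = reference - set(labels.keys())
    if missing:
        # Missing entries silently fall back to the English key at runtime.
        print(f"{lang}: missing {sorted(missing)}")
```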
modules/whisper/whisper_base.py CHANGED
@@ -14,6 +14,7 @@ from dataclasses import astuple
14
  from modules.uvr.music_separator import MusicSeparator
15
  from modules.utils.paths import (WHISPER_MODELS_DIR, DIARIZATION_MODELS_DIR, OUTPUT_DIR, DEFAULT_PARAMETERS_CONFIG_PATH,
16
  UVR_MODELS_DIR)
 
17
  from modules.utils.subtitle_manager import get_srt, get_vtt, get_txt, write_file, safe_filename
18
  from modules.utils.youtube_manager import get_ytdata, get_ytaudio
19
  from modules.utils.files_manager import get_media_files, format_gradio_files, load_yaml, save_yaml
@@ -107,7 +108,7 @@ class WhisperBase(ABC):
107
 
108
  if params.lang is None:
109
  pass
110
- elif params.lang == "Automatic Detection":
111
  params.lang = None
112
  else:
113
  language_code_dict = {value: key for key, value in whisper.tokenizer.LANGUAGES.items()}
 
14
  from modules.uvr.music_separator import MusicSeparator
15
  from modules.utils.paths import (WHISPER_MODELS_DIR, DIARIZATION_MODELS_DIR, OUTPUT_DIR, DEFAULT_PARAMETERS_CONFIG_PATH,
16
  UVR_MODELS_DIR)
17
+ from modules.utils.constants import AUTOMATIC_DETECTION
18
  from modules.utils.subtitle_manager import get_srt, get_vtt, get_txt, write_file, safe_filename
19
  from modules.utils.youtube_manager import get_ytdata, get_ytaudio
20
  from modules.utils.files_manager import get_media_files, format_gradio_files, load_yaml, save_yaml
 
108
 
109
  if params.lang is None:
110
  pass
111
+ elif params.lang == AUTOMATIC_DETECTION:
112
  params.lang = None
113
  else:
114
  language_code_dict = {value: key for key, value in whisper.tokenizer.LANGUAGES.items()}
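The hunk above only swaps the literal string for the shared constant; when a concrete language is chosen, the surrounding code still resolves the display name to a Whisper language code by inverting whisper.tokenizer.LANGUAGES. For reference, that inversion looks like this, assuming the upstream openai-whisper tokenizer layout.

```python
import whisper

# whisper.tokenizer.LANGUAGES maps ISO codes to lowercase English names,
# e.g. {"en": "english", "de": "german", ...}. Inverting it lets the UI pass
# a display name and recover the code the model expects.
language_code_dict = {name: code for code, name in whisper.tokenizer.LANGUAGES.items()}
print(language_code_dict["german"])  # "de"
```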
modules/whisper/whisper_parameter.py CHANGED
@@ -3,6 +3,8 @@ import gradio as gr
3
  from typing import Optional, Dict
4
  import yaml
5
 
 
 
6
 
7
  @dataclass
8
  class WhisperParameters:
@@ -306,7 +308,7 @@ class WhisperValues:
306
  data = {
307
  "whisper": {
308
  "model_size": self.model_size,
309
- "lang": "Automatic Detection" if self.lang is None else self.lang,
310
  "is_translate": self.is_translate,
311
  "beam_size": self.beam_size,
312
  "log_prob_threshold": self.log_prob_threshold,
 
3
  from typing import Optional, Dict
4
  import yaml
5
 
6
+ from modules.utils.constants import AUTOMATIC_DETECTION
7
+
8
 
9
  @dataclass
10
  class WhisperParameters:
 
308
  data = {
309
  "whisper": {
310
  "model_size": self.model_size,
311
+ "lang": AUTOMATIC_DETECTION.unwrap() if self.lang is None else self.lang,
312
  "is_translate": self.is_translate,
313
  "beam_size": self.beam_size,
314
  "log_prob_threshold": self.log_prob_threshold,
requirements.txt CHANGED
@@ -11,6 +11,7 @@ git+https://github.com/jhj0517/jhj0517-whisper.git
11
  faster-whisper==1.0.3
12
  transformers
13
  gradio
 
14
  pytubefix
15
  ruamel.yaml==0.18.6
16
  pyannote.audio==3.3.1
 
11
  faster-whisper==1.0.3
12
  transformers
13
  gradio
14
+ git+https://github.com/jhj0517/gradio-i18n.git@fix/encoding-error
15
  pytubefix
16
  ruamel.yaml==0.18.6
17
  pyannote.audio==3.3.1