jhj0517 committed on
Commit 60fd426
2 Parent(s): 451ca33 1189737

Merge branch 'master' into huggingface

.dockerignore CHANGED
@@ -1,10 +1,10 @@
  # from .gitignore
- venv/
- ui/__pycache__/
- outputs/
- modules/__pycache__/
- models/
  modules/yt_tmp.wav
+ **/venv/
+ **/__pycache__/
+ **/outputs/
+ **/models/

- .git
- .github
+ **/.idea
+ **/.git
+ **/.github
.github/pull_request_template.md CHANGED
@@ -1,5 +1,5 @@
- ## Related issues
- - #0
+ ## Related issues / PRs
+ - #

- ## Changed
- 1. Changes
+ ## Summarize Changes
+ 1.
.github/workflows/ci-shell.yml CHANGED
@@ -6,9 +6,11 @@ on:
  push:
    branches:
      - master
+     - intel-gpu
  pull_request:
    branches:
      - master
+     - intel-gpu

jobs:
  test-shell-script:
@@ -16,7 +18,7 @@ jobs:
    runs-on: ubuntu-latest
    strategy:
      matrix:
-       python: [ "3.10" ]
+       python: ["3.10", "3.11", "3.12"]

    steps:
      - name: Clean up space for action
.github/workflows/ci.yml CHANGED
@@ -6,9 +6,11 @@ on:
  push:
    branches:
      - master
+     - intel-gpu
  pull_request:
    branches:
      - master
+     - intel-gpu

jobs:
  build:
@@ -16,7 +18,7 @@ jobs:
    runs-on: ubuntu-latest
    strategy:
      matrix:
-       python: ["3.10"]
+       python: ["3.10", "3.11", "3.12"]

    env:
      DEEPL_API_KEY: ${{ secrets.DEEPL_API_KEY }}
@@ -35,7 +37,7 @@
        run: sudo apt-get update && sudo apt-get install -y git ffmpeg

      - name: Install dependencies
-       run: pip install -r requirements.txt pytest
+       run: pip install -r requirements.txt pytest jiwer

      - name: Run test
        run: python -m pytest -rs tests
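
The test step now installs jiwer alongside pytest, presumably so the transcription tests can score output against reference text by word error rate. A minimal illustration of typical jiwer usage (an assumption; not taken from this repo's tests):

import jiwer

reference = "the quick brown fox"
hypothesis = "the quick brown box"

# Word error rate: word-level edits divided by the number of reference words,
# so one substitution out of four words gives 0.25 here.
print(jiwer.wer(reference, hypothesis))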
.gitignore CHANGED
@@ -10,4 +10,5 @@ outputs/
  modules/__pycache__/
  models/
  modules/yt_tmp.wav
- configs/default_parameters.yaml
+ configs/default_parameters.yaml
+ __pycache__/
Install.bat CHANGED
@@ -8,6 +8,7 @@ echo checked the venv folder. now installing requirements..

  call "%~dp0\venv\scripts\activate"

+ python -m pip install -U pip
  pip install -r requirements.txt

  if errorlevel 1 (
Install.sh CHANGED
@@ -7,6 +7,7 @@ fi

  source venv/bin/activate

+ python -m pip install -U pip
  pip install -r requirements.txt && echo "Requirements installed successfully." || {
    echo ""
    echo "Requirements installation failed. Please remove the venv folder and run the script again."
app.py CHANGED
@@ -1,27 +1,27 @@
1
  import os
2
  import argparse
3
  import gradio as gr
 
4
  import yaml
5
 
6
  from modules.utils.paths import (FASTER_WHISPER_MODELS_DIR, DIARIZATION_MODELS_DIR, OUTPUT_DIR, WHISPER_MODELS_DIR,
7
  INSANELY_FAST_WHISPER_MODELS_DIR, NLLB_MODELS_DIR, DEFAULT_PARAMETERS_CONFIG_PATH,
8
- UVR_MODELS_DIR)
9
  from modules.utils.files_manager import load_yaml
10
  from modules.whisper.whisper_factory import WhisperFactory
11
- from modules.whisper.faster_whisper_inference import FasterWhisperInference
12
- from modules.whisper.insanely_fast_whisper_inference import InsanelyFastWhisperInference
13
  from modules.translation.nllb_inference import NLLBInference
14
  from modules.ui.htmls import *
15
  from modules.utils.cli_manager import str2bool
16
  from modules.utils.youtube_manager import get_ytmetas
17
  from modules.translation.deepl_api import DeepLAPI
18
- from modules.whisper.whisper_parameter import *
19
 
20
 
21
  class App:
22
  def __init__(self, args):
23
  self.args = args
24
  self.app = gr.Blocks(css=CSS, theme=self.args.theme, delete_cache=(60, 3600))
 
25
  self.whisper_inf = WhisperFactory.create_whisper_inference(
26
  whisper_type=self.args.whisper_type,
27
  whisper_model_dir=self.args.whisper_model_dir,
@@ -38,10 +38,10 @@ class App:
38
  output_dir=os.path.join(self.args.output_dir, "translations")
39
  )
40
  self.default_params = load_yaml(DEFAULT_PARAMETERS_CONFIG_PATH)
41
- print(f"Use \"{self.args.whisper_type}\" implementation")
42
- print(f"Device \"{self.whisper_inf.device}\" is detected")
43
 
44
- def create_whisper_parameters(self):
45
  whisper_params = self.default_params["whisper"]
46
  vad_params = self.default_params["vad"]
47
  diarization_params = self.default_params["diarization"]
@@ -49,158 +49,45 @@ class App:
49
 
50
  with gr.Row():
51
  dd_model = gr.Dropdown(choices=self.whisper_inf.available_models, value=whisper_params["model_size"],
52
- label="Model")
53
- dd_lang = gr.Dropdown(choices=["Automatic Detection"] + self.whisper_inf.available_langs,
54
- value=whisper_params["lang"], label="Language")
55
- dd_file_format = gr.Dropdown(choices=["SRT", "WebVTT", "txt"], value="SRT", label="File Format")
 
56
  with gr.Row():
57
- cb_translate = gr.Checkbox(value=whisper_params["is_translate"], label="Translate to English?",
58
  interactive=True)
59
  with gr.Row():
60
- cb_timestamp = gr.Checkbox(value=whisper_params["add_timestamp"], label="Add a timestamp to the end of the filename",
 
61
  interactive=True)
62
 
63
- with gr.Accordion("Advanced Parameters", open=False):
64
- nb_beam_size = gr.Number(label="Beam Size", value=whisper_params["beam_size"], precision=0, interactive=True,
65
- info="Beam size to use for decoding.")
66
- nb_log_prob_threshold = gr.Number(label="Log Probability Threshold", value=whisper_params["log_prob_threshold"], interactive=True,
67
- info="If the average log probability over sampled tokens is below this value, treat as failed.")
68
- nb_no_speech_threshold = gr.Number(label="No Speech Threshold", value=whisper_params["no_speech_threshold"], interactive=True,
69
- info="If the no speech probability is higher than this value AND the average log probability over sampled tokens is below 'Log Prob Threshold', consider the segment as silent.")
70
- dd_compute_type = gr.Dropdown(label="Compute Type", choices=self.whisper_inf.available_compute_types,
71
- value=self.whisper_inf.current_compute_type, interactive=True,
72
- allow_custom_value=True,
73
- info="Select the type of computation to perform.")
74
- nb_best_of = gr.Number(label="Best Of", value=whisper_params["best_of"], interactive=True,
75
- info="Number of candidates when sampling with non-zero temperature.")
76
- nb_patience = gr.Number(label="Patience", value=whisper_params["patience"], interactive=True,
77
- info="Beam search patience factor.")
78
- cb_condition_on_previous_text = gr.Checkbox(label="Condition On Previous Text", value=whisper_params["condition_on_previous_text"],
79
- interactive=True,
80
- info="Condition on previous text during decoding.")
81
- sld_prompt_reset_on_temperature = gr.Slider(label="Prompt Reset On Temperature", value=whisper_params["prompt_reset_on_temperature"],
82
- minimum=0, maximum=1, step=0.01, interactive=True,
83
- info="Resets prompt if temperature is above this value."
84
- " Arg has effect only if 'Condition On Previous Text' is True.")
85
- tb_initial_prompt = gr.Textbox(label="Initial Prompt", value=None, interactive=True,
86
- info="Initial prompt to use for decoding.")
87
- sd_temperature = gr.Slider(label="Temperature", value=whisper_params["temperature"], minimum=0.0,
88
- step=0.01, maximum=1.0, interactive=True,
89
- info="Temperature for sampling. It can be a tuple of temperatures, which will be successively used upon failures according to either `Compression Ratio Threshold` or `Log Prob Threshold`.")
90
- nb_compression_ratio_threshold = gr.Number(label="Compression Ratio Threshold", value=whisper_params["compression_ratio_threshold"],
91
- interactive=True,
92
- info="If the gzip compression ratio is above this value, treat as failed.")
93
- nb_chunk_length = gr.Number(label="Chunk Length (s)", value=lambda: whisper_params["chunk_length"],
94
- precision=0,
95
- info="The length of audio segments. If it is not None, it will overwrite the default chunk_length of the FeatureExtractor.")
96
- with gr.Group(visible=isinstance(self.whisper_inf, FasterWhisperInference)):
97
- nb_length_penalty = gr.Number(label="Length Penalty", value=whisper_params["length_penalty"],
98
- info="Exponential length penalty constant.")
99
- nb_repetition_penalty = gr.Number(label="Repetition Penalty", value=whisper_params["repetition_penalty"],
100
- info="Penalty applied to the score of previously generated tokens (set > 1 to penalize).")
101
- nb_no_repeat_ngram_size = gr.Number(label="No Repeat N-gram Size", value=whisper_params["no_repeat_ngram_size"],
102
- precision=0,
103
- info="Prevent repetitions of n-grams with this size (set 0 to disable).")
104
- tb_prefix = gr.Textbox(label="Prefix", value=lambda: whisper_params["prefix"],
105
- info="Optional text to provide as a prefix for the first window.")
106
- cb_suppress_blank = gr.Checkbox(label="Suppress Blank", value=whisper_params["suppress_blank"],
107
- info="Suppress blank outputs at the beginning of the sampling.")
108
- tb_suppress_tokens = gr.Textbox(label="Suppress Tokens", value=whisper_params["suppress_tokens"],
109
- info="List of token IDs to suppress. -1 will suppress a default set of symbols as defined in the model config.json file.")
110
- nb_max_initial_timestamp = gr.Number(label="Max Initial Timestamp", value=whisper_params["max_initial_timestamp"],
111
- info="The initial timestamp cannot be later than this.")
112
- cb_word_timestamps = gr.Checkbox(label="Word Timestamps", value=whisper_params["word_timestamps"],
113
- info="Extract word-level timestamps using the cross-attention pattern and dynamic time warping, and include the timestamps for each word in each segment.")
114
- tb_prepend_punctuations = gr.Textbox(label="Prepend Punctuations", value=whisper_params["prepend_punctuations"],
115
- info="If 'Word Timestamps' is True, merge these punctuation symbols with the next word.")
116
- tb_append_punctuations = gr.Textbox(label="Append Punctuations", value=whisper_params["append_punctuations"],
117
- info="If 'Word Timestamps' is True, merge these punctuation symbols with the previous word.")
118
- nb_max_new_tokens = gr.Number(label="Max New Tokens", value=lambda: whisper_params["max_new_tokens"],
119
- precision=0,
120
- info="Maximum number of new tokens to generate per-chunk. If not set, the maximum will be set by the default max_length.")
121
- nb_hallucination_silence_threshold = gr.Number(label="Hallucination Silence Threshold (sec)",
122
- value=lambda: whisper_params["hallucination_silence_threshold"],
123
- info="When 'Word Timestamps' is True, skip silent periods longer than this threshold (in seconds) when a possible hallucination is detected.")
124
- tb_hotwords = gr.Textbox(label="Hotwords", value=lambda: whisper_params["hotwords"],
125
- info="Hotwords/hint phrases to provide the model with. Has no effect if prefix is not None.")
126
- nb_language_detection_threshold = gr.Number(label="Language Detection Threshold", value=lambda: whisper_params["language_detection_threshold"],
127
- info="If the maximum probability of the language tokens is higher than this value, the language is detected.")
128
- nb_language_detection_segments = gr.Number(label="Language Detection Segments", value=lambda: whisper_params["language_detection_segments"],
129
- precision=0,
130
- info="Number of segments to consider for the language detection.")
131
- with gr.Group(visible=isinstance(self.whisper_inf, InsanelyFastWhisperInference)):
132
- nb_batch_size = gr.Number(label="Batch Size", value=whisper_params["batch_size"], precision=0)
133
-
134
- with gr.Accordion("Background Music Remover Filter", open=False):
135
- cb_bgm_separation = gr.Checkbox(label="Enable Background Music Remover Filter", value=uvr_params["is_separate_bgm"],
136
- interactive=True,
137
- info="Enabling this will remove background music by submodel before"
138
- " transcribing ")
139
- dd_uvr_device = gr.Dropdown(label="Device", value=self.whisper_inf.music_separator.device,
140
- choices=self.whisper_inf.music_separator.available_devices)
141
- dd_uvr_model_size = gr.Dropdown(label="Model", value=uvr_params["model_size"],
142
- choices=self.whisper_inf.music_separator.available_models)
143
- nb_uvr_segment_size = gr.Number(label="Segment Size", value=uvr_params["segment_size"], precision=0)
144
- cb_uvr_save_file = gr.Checkbox(label="Save separated files to output", value=uvr_params["save_file"])
145
- cb_uvr_enable_offload = gr.Checkbox(label="Offload sub model after removing background music",
146
- value=uvr_params["enable_offload"])
147
-
148
- with gr.Accordion("Voice Detection Filter", open=False):
149
- cb_vad_filter = gr.Checkbox(label="Enable Silero VAD Filter", value=vad_params["vad_filter"],
150
- interactive=True,
151
- info="Enable this to transcribe only detected voice parts by submodel.")
152
- sd_threshold = gr.Slider(minimum=0.0, maximum=1.0, step=0.01, label="Speech Threshold",
153
- value=vad_params["threshold"],
154
- info="Lower it to be more sensitive to small sounds.")
155
- nb_min_speech_duration_ms = gr.Number(label="Minimum Speech Duration (ms)", precision=0,
156
- value=vad_params["min_speech_duration_ms"],
157
- info="Final speech chunks shorter than this time are thrown out")
158
- nb_max_speech_duration_s = gr.Number(label="Maximum Speech Duration (s)",
159
- value=vad_params["max_speech_duration_s"],
160
- info="Maximum duration of speech chunks in \"seconds\".")
161
- nb_min_silence_duration_ms = gr.Number(label="Minimum Silence Duration (ms)", precision=0,
162
- value=vad_params["min_silence_duration_ms"],
163
- info="In the end of each speech chunk wait for this time"
164
- " before separating it")
165
- nb_speech_pad_ms = gr.Number(label="Speech Padding (ms)", precision=0, value=vad_params["speech_pad_ms"],
166
- info="Final speech chunks are padded by this time each side")
167
-
168
- with gr.Accordion("Diarization", open=False):
169
- cb_diarize = gr.Checkbox(label="Enable Diarization", value=diarization_params["is_diarize"])
170
- tb_hf_token = gr.Text(label="HuggingFace Token", value=diarization_params["hf_token"],
171
- info="This is only needed the first time you download the model. If you already have"
172
- " models, you don't need to enter. To download the model, you must manually go "
173
- "to \"https://huggingface.co/pyannote/speaker-diarization-3.1\" and agree to"
174
- " their requirement.")
175
- dd_diarization_device = gr.Dropdown(label="Device",
176
- choices=self.whisper_inf.diarizer.get_available_device(),
177
- value=self.whisper_inf.diarizer.get_device())
178
 
179
  dd_model.change(fn=self.on_change_models, inputs=[dd_model], outputs=[cb_translate])
180
 
 
 
181
  return (
182
- WhisperParameters(
183
- model_size=dd_model, lang=dd_lang, is_translate=cb_translate, beam_size=nb_beam_size,
184
- log_prob_threshold=nb_log_prob_threshold, no_speech_threshold=nb_no_speech_threshold,
185
- compute_type=dd_compute_type, best_of=nb_best_of, patience=nb_patience,
186
- condition_on_previous_text=cb_condition_on_previous_text, initial_prompt=tb_initial_prompt,
187
- temperature=sd_temperature, compression_ratio_threshold=nb_compression_ratio_threshold,
188
- vad_filter=cb_vad_filter, threshold=sd_threshold, min_speech_duration_ms=nb_min_speech_duration_ms,
189
- max_speech_duration_s=nb_max_speech_duration_s, min_silence_duration_ms=nb_min_silence_duration_ms,
190
- speech_pad_ms=nb_speech_pad_ms, chunk_length=nb_chunk_length, batch_size=nb_batch_size,
191
- is_diarize=cb_diarize, hf_token=tb_hf_token, diarization_device=dd_diarization_device,
192
- length_penalty=nb_length_penalty, repetition_penalty=nb_repetition_penalty,
193
- no_repeat_ngram_size=nb_no_repeat_ngram_size, prefix=tb_prefix, suppress_blank=cb_suppress_blank,
194
- suppress_tokens=tb_suppress_tokens, max_initial_timestamp=nb_max_initial_timestamp,
195
- word_timestamps=cb_word_timestamps, prepend_punctuations=tb_prepend_punctuations,
196
- append_punctuations=tb_append_punctuations, max_new_tokens=nb_max_new_tokens,
197
- hallucination_silence_threshold=nb_hallucination_silence_threshold, hotwords=tb_hotwords,
198
- language_detection_threshold=nb_language_detection_threshold,
199
- language_detection_segments=nb_language_detection_segments,
200
- prompt_reset_on_temperature=sld_prompt_reset_on_temperature, is_bgm_separate=cb_bgm_separation,
201
- uvr_device=dd_uvr_device, uvr_model_size=dd_uvr_model_size, uvr_segment_size=nb_uvr_segment_size,
202
- uvr_save_file=cb_uvr_save_file, uvr_enable_offload=cb_uvr_enable_offload
203
- ),
204
  dd_file_format,
205
  cb_timestamp
206
  )
@@ -212,185 +99,194 @@ class App:
212
  uvr_params = self.default_params["bgm_separation"]
213
 
214
  with self.app:
215
- with gr.Row():
216
- with gr.Column():
217
- gr.Markdown(MARKDOWN, elem_id="md_project")
218
- with gr.Tabs():
219
- with gr.TabItem("File"): # tab1
220
  with gr.Column():
221
- input_file = gr.Files(type="filepath", label="Upload File here")
222
- tb_input_folder = gr.Textbox(label="Input Folder Path (Optional)",
223
- info="Optional: Specify the folder path where the input files are located, if you prefer to use local files instead of uploading them."
224
- " Leave this field empty if you do not wish to use a local path.",
225
- visible=self.args.colab,
226
- value="")
227
-
228
- whisper_params, dd_file_format, cb_timestamp = self.create_whisper_parameters()
229
-
230
- with gr.Row():
231
- btn_run = gr.Button("GENERATE SUBTITLE FILE", variant="primary")
232
- with gr.Row():
233
- tb_indicator = gr.Textbox(label="Output", scale=5)
234
- files_subtitles = gr.Files(label="Downloadable output file", scale=3, interactive=False)
235
- btn_openfolder = gr.Button('📂', scale=1)
236
-
237
- params = [input_file, tb_input_folder, dd_file_format, cb_timestamp]
238
- btn_run.click(fn=self.whisper_inf.transcribe_file,
239
- inputs=params + whisper_params.as_list(),
240
- outputs=[tb_indicator, files_subtitles])
241
- btn_openfolder.click(fn=lambda: self.open_folder("outputs"), inputs=None, outputs=None)
242
-
243
- with gr.TabItem("Youtube"): # tab2
244
- with gr.Row():
245
- tb_youtubelink = gr.Textbox(label="Youtube Link")
246
- with gr.Row(equal_height=True):
247
  with gr.Column():
248
- img_thumbnail = gr.Image(label="Youtube Thumbnail")
249
- with gr.Column():
250
- tb_title = gr.Label(label="Youtube Title")
251
- tb_description = gr.Textbox(label="Youtube Description", max_lines=15)
 
 
252
 
253
- whisper_params, dd_file_format, cb_timestamp = self.create_whisper_parameters()
254
 
255
- with gr.Row():
256
- btn_run = gr.Button("GENERATE SUBTITLE FILE", variant="primary")
257
- with gr.Row():
258
- tb_indicator = gr.Textbox(label="Output", scale=5)
259
- files_subtitles = gr.Files(label="Downloadable output file", scale=3)
260
- btn_openfolder = gr.Button('📂', scale=1)
261
 
262
- params = [tb_youtubelink, dd_file_format, cb_timestamp]
 
 
 
 
263
 
264
- btn_run.click(fn=self.whisper_inf.transcribe_youtube,
265
- inputs=params + whisper_params.as_list(),
266
- outputs=[tb_indicator, files_subtitles])
267
- tb_youtubelink.change(get_ytmetas, inputs=[tb_youtubelink],
268
- outputs=[img_thumbnail, tb_title, tb_description])
269
- btn_openfolder.click(fn=lambda: self.open_folder("outputs"), inputs=None, outputs=None)
 
 
 
270
 
271
- with gr.TabItem("Mic"): # tab3
272
- with gr.Row():
273
- mic_input = gr.Microphone(label="Record with Mic", type="filepath", interactive=True)
274
 
275
- whisper_params, dd_file_format, cb_timestamp = self.create_whisper_parameters()
 
 
 
 
 
276
 
277
- with gr.Row():
278
- btn_run = gr.Button("GENERATE SUBTITLE FILE", variant="primary")
279
- with gr.Row():
280
- tb_indicator = gr.Textbox(label="Output", scale=5)
281
- files_subtitles = gr.Files(label="Downloadable output file", scale=3)
282
- btn_openfolder = gr.Button('📂', scale=1)
283
 
284
- params = [mic_input, dd_file_format, cb_timestamp]
 
 
 
 
 
285
 
286
- btn_run.click(fn=self.whisper_inf.transcribe_mic,
287
- inputs=params + whisper_params.as_list(),
288
- outputs=[tb_indicator, files_subtitles])
289
- btn_openfolder.click(fn=lambda: self.open_folder("outputs"), inputs=None, outputs=None)
290
 
291
- with gr.TabItem("T2T Translation"): # tab 4
292
- with gr.Row():
293
- file_subs = gr.Files(type="filepath", label="Upload Subtitle Files to translate here",
294
- file_types=['.vtt', '.srt'])
295
 
296
- with gr.TabItem("DeepL API"): # sub tab1
297
  with gr.Row():
298
- tb_api_key = gr.Textbox(label="Your Auth Key (API KEY)", value=deepl_params["api_key"])
299
  with gr.Row():
300
- dd_source_lang = gr.Dropdown(label="Source Language", value=deepl_params["source_lang"],
301
- choices=list(
302
- self.deepl_api.available_source_langs.keys()))
303
- dd_target_lang = gr.Dropdown(label="Target Language", value=deepl_params["target_lang"],
304
- choices=list(self.deepl_api.available_target_langs.keys()))
305
- with gr.Row():
306
- cb_is_pro = gr.Checkbox(label="Pro User?", value=deepl_params["is_pro"])
307
- with gr.Row():
308
- cb_timestamp = gr.Checkbox(value=translation_params["add_timestamp"], label="Add a timestamp to the end of the filename",
309
- interactive=True)
310
- with gr.Row():
311
- btn_run = gr.Button("TRANSLATE SUBTITLE FILE", variant="primary")
312
- with gr.Row():
313
- tb_indicator = gr.Textbox(label="Output", scale=5)
314
- files_subtitles = gr.Files(label="Downloadable output file", scale=3)
315
  btn_openfolder = gr.Button('📂', scale=1)
316
 
317
- btn_run.click(fn=self.deepl_api.translate_deepl,
318
- inputs=[tb_api_key, file_subs, dd_source_lang, dd_target_lang,
319
- cb_is_pro, cb_timestamp],
320
- outputs=[tb_indicator, files_subtitles])
321
 
322
- btn_openfolder.click(fn=lambda: self.open_folder(os.path.join(self.args.output_dir, "translations")),
323
- inputs=None,
324
- outputs=None)
 
325
 
326
- with gr.TabItem("NLLB"): # sub tab2
327
- with gr.Row():
328
- dd_model_size = gr.Dropdown(label="Model", value=nllb_params["model_size"],
329
- choices=self.nllb_inf.available_models)
330
- dd_source_lang = gr.Dropdown(label="Source Language", value=nllb_params["source_lang"],
331
- choices=self.nllb_inf.available_source_langs)
332
- dd_target_lang = gr.Dropdown(label="Target Language", value=nllb_params["target_lang"],
333
- choices=self.nllb_inf.available_target_langs)
334
- with gr.Row():
335
- nb_max_length = gr.Number(label="Max Length Per Line", value=nllb_params["max_length"],
336
- precision=0)
337
- with gr.Row():
338
- cb_timestamp = gr.Checkbox(value=translation_params["add_timestamp"], label="Add a timestamp to the end of the filename",
339
- interactive=True)
340
  with gr.Row():
341
- btn_run = gr.Button("TRANSLATE SUBTITLE FILE", variant="primary")
342
- with gr.Row():
343
- tb_indicator = gr.Textbox(label="Output", scale=5)
344
- files_subtitles = gr.Files(label="Downloadable output file", scale=3)
345
- btn_openfolder = gr.Button('📂', scale=1)
 
 
 
 
 
 
 
 
346
  with gr.Column():
347
- md_vram_table = gr.HTML(NLLB_VRAM_TABLE, elem_id="md_nllb_vram_table")
348
-
349
- btn_run.click(fn=self.nllb_inf.translate_file,
350
- inputs=[file_subs, dd_model_size, dd_source_lang, dd_target_lang,
351
- nb_max_length, cb_timestamp],
352
- outputs=[tb_indicator, files_subtitles])
353
-
354
- btn_openfolder.click(fn=lambda: self.open_folder(os.path.join(self.args.output_dir, "translations")),
355
- inputs=None,
356
- outputs=None)
357
-
358
- with gr.TabItem("BGM Separation"):
359
- files_audio = gr.Files(type="filepath", label="Upload Audio Files to separate background music")
360
- dd_uvr_device = gr.Dropdown(label="Device", value=self.whisper_inf.music_separator.device,
361
- choices=self.whisper_inf.music_separator.available_devices)
362
- dd_uvr_model_size = gr.Dropdown(label="Model", value=uvr_params["model_size"],
363
- choices=self.whisper_inf.music_separator.available_models)
364
- nb_uvr_segment_size = gr.Number(label="Segment Size", value=uvr_params["segment_size"], precision=0)
365
- cb_uvr_save_file = gr.Checkbox(label="Save separated files to output",
366
- value=True, visible=False)
367
- btn_run = gr.Button("SEPARATE BACKGROUND MUSIC", variant="primary")
368
- with gr.Column():
369
- with gr.Row():
370
- ad_instrumental = gr.Audio(label="Instrumental", scale=8)
371
- btn_open_instrumental_folder = gr.Button('📂', scale=1)
372
- with gr.Row():
373
- ad_vocals = gr.Audio(label="Vocals", scale=8)
374
- btn_open_vocals_folder = gr.Button('📂', scale=1)
375
-
376
- btn_run.click(fn=self.whisper_inf.music_separator.separate_files,
377
- inputs=[files_audio, dd_uvr_model_size, dd_uvr_device, nb_uvr_segment_size,
378
- cb_uvr_save_file],
379
- outputs=[ad_instrumental, ad_vocals])
380
- btn_open_instrumental_folder.click(inputs=None,
381
- outputs=None,
382
- fn=lambda: self.open_folder(os.path.join(
383
- self.args.output_dir, "UVR", "instrumental"
384
- )))
385
- btn_open_vocals_folder.click(inputs=None,
386
- outputs=None,
387
- fn=lambda: self.open_folder(os.path.join(
388
- self.args.output_dir, "UVR", "vocals"
389
- )))
390
 
391
  # Launch the app with optional gradio settings
392
  args = self.args
393
-
394
  self.app.queue(
395
  api_open=args.api_open
396
  ).launch(
@@ -419,10 +315,10 @@ class App:
419
  return gr.Checkbox(visible=True, value=False, label="Translate to English?", interactive=True)
420
 
421
 
422
- # Create the parser for command-line arguments
423
  parser = argparse.ArgumentParser()
424
- parser.add_argument('--whisper_type', type=str, default="faster-whisper",
425
- help='A type of the whisper implementation between: ["whisper", "faster-whisper", "insanely-fast-whisper"]')
 
426
  parser.add_argument('--share', type=str2bool, default=False, nargs='?', const=True, help='Gradio share value')
427
  parser.add_argument('--server_name', type=str, default=None, help='Gradio server host')
428
  parser.add_argument('--server_port', type=int, default=None, help='Gradio server port')
@@ -431,8 +327,10 @@ parser.add_argument('--username', type=str, default=None, help='Gradio authentic
431
  parser.add_argument('--password', type=str, default=None, help='Gradio authentication password')
432
  parser.add_argument('--theme', type=str, default=None, help='Gradio Blocks theme')
433
  parser.add_argument('--colab', type=str2bool, default=False, nargs='?', const=True, help='Is colab user or not')
434
- parser.add_argument('--api_open', type=str2bool, default=False, nargs='?', const=True, help='Enable api or not in Gradio')
435
- parser.add_argument('--inbrowser', type=str2bool, default=True, nargs='?', const=True, help='Whether to automatically start Gradio app or not')
 
 
436
  parser.add_argument('--whisper_model_dir', type=str, default=WHISPER_MODELS_DIR,
437
  help='Directory path of the whisper model')
438
  parser.add_argument('--faster_whisper_model_dir', type=str, default=FASTER_WHISPER_MODELS_DIR,
 
1
  import os
2
  import argparse
3
  import gradio as gr
4
+ from gradio_i18n import Translate, gettext as _
5
  import yaml
6
 
7
  from modules.utils.paths import (FASTER_WHISPER_MODELS_DIR, DIARIZATION_MODELS_DIR, OUTPUT_DIR, WHISPER_MODELS_DIR,
8
  INSANELY_FAST_WHISPER_MODELS_DIR, NLLB_MODELS_DIR, DEFAULT_PARAMETERS_CONFIG_PATH,
9
+ UVR_MODELS_DIR, I18N_YAML_PATH)
10
  from modules.utils.files_manager import load_yaml
11
  from modules.whisper.whisper_factory import WhisperFactory
 
 
12
  from modules.translation.nllb_inference import NLLBInference
13
  from modules.ui.htmls import *
14
  from modules.utils.cli_manager import str2bool
15
  from modules.utils.youtube_manager import get_ytmetas
16
  from modules.translation.deepl_api import DeepLAPI
17
+ from modules.whisper.data_classes import *
18
 
19
 
20
  class App:
21
  def __init__(self, args):
22
  self.args = args
23
  self.app = gr.Blocks(css=CSS, theme=self.args.theme, delete_cache=(60, 3600))
24
+ self.i18n = Translate(I18N_YAML_PATH)
25
  self.whisper_inf = WhisperFactory.create_whisper_inference(
26
  whisper_type=self.args.whisper_type,
27
  whisper_model_dir=self.args.whisper_model_dir,
 
38
  output_dir=os.path.join(self.args.output_dir, "translations")
39
  )
40
  self.default_params = load_yaml(DEFAULT_PARAMETERS_CONFIG_PATH)
41
+ print(f"Use \"{self.args.whisper_type}\" implementation\n"
42
+ f"Device \"{self.whisper_inf.device}\" is detected")
43
 
44
+ def create_pipeline_inputs(self):
45
  whisper_params = self.default_params["whisper"]
46
  vad_params = self.default_params["vad"]
47
  diarization_params = self.default_params["diarization"]
 
49
 
50
  with gr.Row():
51
  dd_model = gr.Dropdown(choices=self.whisper_inf.available_models, value=whisper_params["model_size"],
52
+ label=_("Model"))
53
+ dd_lang = gr.Dropdown(choices=self.whisper_inf.available_langs + [AUTOMATIC_DETECTION],
54
+ value=AUTOMATIC_DETECTION if whisper_params["lang"] == AUTOMATIC_DETECTION.unwrap()
55
+ else whisper_params["lang"], label=_("Language"))
56
+ dd_file_format = gr.Dropdown(choices=["SRT", "WebVTT", "txt", "LRC"], value=whisper_params["file_format"], label=_("File Format"))
57
  with gr.Row():
58
+ cb_translate = gr.Checkbox(value=whisper_params["is_translate"], label=_("Translate to English?"),
59
  interactive=True)
60
  with gr.Row():
61
+ cb_timestamp = gr.Checkbox(value=whisper_params["add_timestamp"],
62
+ label=_("Add a timestamp to the end of the filename"),
63
  interactive=True)
64
 
65
+ with gr.Accordion(_("Advanced Parameters"), open=False):
66
+ whisper_inputs = WhisperParams.to_gradio_inputs(defaults=whisper_params, only_advanced=True,
67
+ whisper_type=self.args.whisper_type,
68
+ available_compute_types=self.whisper_inf.available_compute_types,
69
+ compute_type=self.whisper_inf.current_compute_type)
70
+
71
+ with gr.Accordion(_("Background Music Remover Filter"), open=False):
72
+ uvr_inputs = BGMSeparationParams.to_gradio_input(defaults=uvr_params,
73
+ available_models=self.whisper_inf.music_separator.available_models,
74
+ available_devices=self.whisper_inf.music_separator.available_devices,
75
+ device=self.whisper_inf.music_separator.device)
76
+
77
+ with gr.Accordion(_("Voice Detection Filter"), open=False):
78
+ vad_inputs = VadParams.to_gradio_inputs(defaults=vad_params)
79
+
80
+ with gr.Accordion(_("Diarization"), open=False):
81
+ diarization_inputs = DiarizationParams.to_gradio_inputs(defaults=diarization_params,
82
+ available_devices=self.whisper_inf.diarizer.available_device,
83
+ device=self.whisper_inf.diarizer.device)
 
 
 
 
 
 
 
 
 
84
 
85
  dd_model.change(fn=self.on_change_models, inputs=[dd_model], outputs=[cb_translate])
86
 
87
+ pipeline_inputs = [dd_model, dd_lang, cb_translate] + whisper_inputs + vad_inputs + diarization_inputs + uvr_inputs
88
+
89
  return (
90
+ pipeline_inputs,
 
 
 
 
91
  dd_file_format,
92
  cb_timestamp
93
  )
 
99
  uvr_params = self.default_params["bgm_separation"]
100
 
101
  with self.app:
102
+ with self.i18n:
103
+ with gr.Row():
 
 
 
104
  with gr.Column():
105
+ gr.Markdown(MARKDOWN, elem_id="md_project")
106
+ with gr.Tabs():
107
+ with gr.TabItem(_("File")): # tab1
 
 
 
 
108
  with gr.Column():
109
+ input_file = gr.Files(type="filepath", label=_("Upload File here"))
110
+ tb_input_folder = gr.Textbox(label="Input Folder Path (Optional)",
111
+ info="Optional: Specify the folder path where the input files are located, if you prefer to use local files instead of uploading them."
112
+ " Leave this field empty if you do not wish to use a local path.",
113
+ visible=self.args.colab,
114
+ value="")
115
 
116
+ pipeline_params, dd_file_format, cb_timestamp = self.create_pipeline_inputs()
117
 
118
+ with gr.Row():
119
+ btn_run = gr.Button(_("GENERATE SUBTITLE FILE"), variant="primary")
120
+ with gr.Row():
121
+ tb_indicator = gr.Textbox(label=_("Output"), scale=5)
122
+ files_subtitles = gr.Files(label=_("Downloadable output file"), scale=3, interactive=False)
123
+ btn_openfolder = gr.Button('📂', scale=1)
124
 
125
+ params = [input_file, tb_input_folder, dd_file_format, cb_timestamp]
126
+ btn_run.click(fn=self.whisper_inf.transcribe_file,
127
+ inputs=params + pipeline_params,
128
+ outputs=[tb_indicator, files_subtitles])
129
+ btn_openfolder.click(fn=lambda: self.open_folder("outputs"), inputs=None, outputs=None)
130
 
131
+ with gr.TabItem(_("Youtube")): # tab2
132
+ with gr.Row():
133
+ tb_youtubelink = gr.Textbox(label=_("Youtube Link"))
134
+ with gr.Row(equal_height=True):
135
+ with gr.Column():
136
+ img_thumbnail = gr.Image(label=_("Youtube Thumbnail"))
137
+ with gr.Column():
138
+ tb_title = gr.Label(label=_("Youtube Title"))
139
+ tb_description = gr.Textbox(label=_("Youtube Description"), max_lines=15)
140
 
141
+ pipeline_params, dd_file_format, cb_timestamp = self.create_pipeline_inputs()
 
 
142
 
143
+ with gr.Row():
144
+ btn_run = gr.Button(_("GENERATE SUBTITLE FILE"), variant="primary")
145
+ with gr.Row():
146
+ tb_indicator = gr.Textbox(label=_("Output"), scale=5)
147
+ files_subtitles = gr.Files(label=_("Downloadable output file"), scale=3)
148
+ btn_openfolder = gr.Button('📂', scale=1)
149
 
150
+ params = [tb_youtubelink, dd_file_format, cb_timestamp]
 
 
 
 
 
151
 
152
+ btn_run.click(fn=self.whisper_inf.transcribe_youtube,
153
+ inputs=params + pipeline_params,
154
+ outputs=[tb_indicator, files_subtitles])
155
+ tb_youtubelink.change(get_ytmetas, inputs=[tb_youtubelink],
156
+ outputs=[img_thumbnail, tb_title, tb_description])
157
+ btn_openfolder.click(fn=lambda: self.open_folder("outputs"), inputs=None, outputs=None)
158
 
159
+ with gr.TabItem(_("Mic")): # tab3
160
+ with gr.Row():
161
+ mic_input = gr.Microphone(label=_("Record with Mic"), type="filepath", interactive=True)
 
162
 
163
+ pipeline_params, dd_file_format, cb_timestamp = self.create_pipeline_inputs()
 
 
 
164
 
 
165
  with gr.Row():
166
+ btn_run = gr.Button(_("GENERATE SUBTITLE FILE"), variant="primary")
167
  with gr.Row():
168
+ tb_indicator = gr.Textbox(label=_("Output"), scale=5)
169
+ files_subtitles = gr.Files(label=_("Downloadable output file"), scale=3)
 
 
 
 
170
  btn_openfolder = gr.Button('📂', scale=1)
171
 
172
+ params = [mic_input, dd_file_format, cb_timestamp]
 
 
 
173
 
174
+ btn_run.click(fn=self.whisper_inf.transcribe_mic,
175
+ inputs=params + pipeline_params,
176
+ outputs=[tb_indicator, files_subtitles])
177
+ btn_openfolder.click(fn=lambda: self.open_folder("outputs"), inputs=None, outputs=None)
178
 
179
+ with gr.TabItem(_("T2T Translation")): # tab 4
 
 
 
 
180
  with gr.Row():
181
+ file_subs = gr.Files(type="filepath", label=_("Upload Subtitle Files to translate here"))
182
+
183
+ with gr.TabItem(_("DeepL API")): # sub tab1
184
+ with gr.Row():
185
+ tb_api_key = gr.Textbox(label=_("Your Auth Key (API KEY)"),
186
+ value=deepl_params["api_key"])
187
+ with gr.Row():
188
+ dd_source_lang = gr.Dropdown(label=_("Source Language"),
189
+ value=AUTOMATIC_DETECTION if deepl_params["source_lang"] == AUTOMATIC_DETECTION.unwrap()
190
+ else deepl_params["source_lang"],
191
+ choices=list(self.deepl_api.available_source_langs.keys()))
192
+ dd_target_lang = gr.Dropdown(label=_("Target Language"),
193
+ value=deepl_params["target_lang"],
194
+ choices=list(self.deepl_api.available_target_langs.keys()))
195
+ with gr.Row():
196
+ cb_is_pro = gr.Checkbox(label=_("Pro User?"), value=deepl_params["is_pro"])
197
+ with gr.Row():
198
+ cb_timestamp = gr.Checkbox(value=translation_params["add_timestamp"],
199
+ label=_("Add a timestamp to the end of the filename"),
200
+ interactive=True)
201
+ with gr.Row():
202
+ btn_run = gr.Button(_("TRANSLATE SUBTITLE FILE"), variant="primary")
203
+ with gr.Row():
204
+ tb_indicator = gr.Textbox(label=_("Output"), scale=5)
205
+ files_subtitles = gr.Files(label=_("Downloadable output file"), scale=3)
206
+ btn_openfolder = gr.Button('📂', scale=1)
207
+
208
+ btn_run.click(fn=self.deepl_api.translate_deepl,
209
+ inputs=[tb_api_key, file_subs, dd_source_lang, dd_target_lang,
210
+ cb_is_pro, cb_timestamp],
211
+ outputs=[tb_indicator, files_subtitles])
212
+
213
+ btn_openfolder.click(
214
+ fn=lambda: self.open_folder(os.path.join(self.args.output_dir, "translations")),
215
+ inputs=None,
216
+ outputs=None)
217
+
218
+ with gr.TabItem(_("NLLB")): # sub tab2
219
+ with gr.Row():
220
+ dd_model_size = gr.Dropdown(label=_("Model"), value=nllb_params["model_size"],
221
+ choices=self.nllb_inf.available_models)
222
+ dd_source_lang = gr.Dropdown(label=_("Source Language"),
223
+ value=nllb_params["source_lang"],
224
+ choices=self.nllb_inf.available_source_langs)
225
+ dd_target_lang = gr.Dropdown(label=_("Target Language"),
226
+ value=nllb_params["target_lang"],
227
+ choices=self.nllb_inf.available_target_langs)
228
+ with gr.Row():
229
+ nb_max_length = gr.Number(label="Max Length Per Line", value=nllb_params["max_length"],
230
+ precision=0)
231
+ with gr.Row():
232
+ cb_timestamp = gr.Checkbox(value=translation_params["add_timestamp"],
233
+ label=_("Add a timestamp to the end of the filename"),
234
+ interactive=True)
235
+ with gr.Row():
236
+ btn_run = gr.Button(_("TRANSLATE SUBTITLE FILE"), variant="primary")
237
+ with gr.Row():
238
+ tb_indicator = gr.Textbox(label=_("Output"), scale=5)
239
+ files_subtitles = gr.Files(label=_("Downloadable output file"), scale=3)
240
+ btn_openfolder = gr.Button('📂', scale=1)
241
+ with gr.Column():
242
+ md_vram_table = gr.HTML(NLLB_VRAM_TABLE, elem_id="md_nllb_vram_table")
243
+
244
+ btn_run.click(fn=self.nllb_inf.translate_file,
245
+ inputs=[file_subs, dd_model_size, dd_source_lang, dd_target_lang,
246
+ nb_max_length, cb_timestamp],
247
+ outputs=[tb_indicator, files_subtitles])
248
+
249
+ btn_openfolder.click(
250
+ fn=lambda: self.open_folder(os.path.join(self.args.output_dir, "translations")),
251
+ inputs=None,
252
+ outputs=None)
253
+
254
+ with gr.TabItem(_("BGM Separation")):
255
+ files_audio = gr.Files(type="filepath", label=_("Upload Audio Files to separate background music"))
256
+ dd_uvr_device = gr.Dropdown(label=_("Device"), value=self.whisper_inf.music_separator.device,
257
+ choices=self.whisper_inf.music_separator.available_devices)
258
+ dd_uvr_model_size = gr.Dropdown(label=_("Model"), value=uvr_params["model_size"],
259
+ choices=self.whisper_inf.music_separator.available_models)
260
+ nb_uvr_segment_size = gr.Number(label="Segment Size", value=uvr_params["segment_size"],
261
+ precision=0)
262
+ cb_uvr_save_file = gr.Checkbox(label=_("Save separated files to output"),
263
+ value=True, visible=False)
264
+ btn_run = gr.Button(_("SEPARATE BACKGROUND MUSIC"), variant="primary")
265
  with gr.Column():
266
+ with gr.Row():
267
+ ad_instrumental = gr.Audio(label=_("Instrumental"), scale=8)
268
+ btn_open_instrumental_folder = gr.Button('📂', scale=1)
269
+ with gr.Row():
270
+ ad_vocals = gr.Audio(label=_("Vocals"), scale=8)
271
+ btn_open_vocals_folder = gr.Button('📂', scale=1)
272
+
273
+ btn_run.click(fn=self.whisper_inf.music_separator.separate_files,
274
+ inputs=[files_audio, dd_uvr_model_size, dd_uvr_device, nb_uvr_segment_size,
275
+ cb_uvr_save_file],
276
+ outputs=[ad_instrumental, ad_vocals])
277
+ btn_open_instrumental_folder.click(inputs=None,
278
+ outputs=None,
279
+ fn=lambda: self.open_folder(os.path.join(
280
+ self.args.output_dir, "UVR", "instrumental"
281
+ )))
282
+ btn_open_vocals_folder.click(inputs=None,
283
+ outputs=None,
284
+ fn=lambda: self.open_folder(os.path.join(
285
+ self.args.output_dir, "UVR", "vocals"
286
+ )))
 
 
 
 
 
287
 
288
  # Launch the app with optional gradio settings
289
  args = self.args
 
290
  self.app.queue(
291
  api_open=args.api_open
292
  ).launch(
 
315
  return gr.Checkbox(visible=True, value=False, label="Translate to English?", interactive=True)
316
 
317
 
 
318
  parser = argparse.ArgumentParser()
319
+ parser.add_argument('--whisper_type', type=str, default=WhisperImpl.FASTER_WHISPER.value,
320
+ choices=[item.value for item in WhisperImpl],
321
+ help='A type of the whisper implementation (Github repo name)')
322
  parser.add_argument('--share', type=str2bool, default=False, nargs='?', const=True, help='Gradio share value')
323
  parser.add_argument('--server_name', type=str, default=None, help='Gradio server host')
324
  parser.add_argument('--server_port', type=int, default=None, help='Gradio server port')
 
327
  parser.add_argument('--password', type=str, default=None, help='Gradio authentication password')
328
  parser.add_argument('--theme', type=str, default=None, help='Gradio Blocks theme')
329
  parser.add_argument('--colab', type=str2bool, default=False, nargs='?', const=True, help='Is colab user or not')
330
+ parser.add_argument('--api_open', type=str2bool, default=False, nargs='?', const=True,
331
+ help='Enable api or not in Gradio')
332
+ parser.add_argument('--inbrowser', type=str2bool, default=True, nargs='?', const=True,
333
+ help='Whether to automatically start Gradio app or not')
334
  parser.add_argument('--whisper_model_dir', type=str, default=WHISPER_MODELS_DIR,
335
  help='Directory path of the whisper model')
336
  parser.add_argument('--faster_whisper_model_dir', type=str, default=FASTER_WHISPER_MODELS_DIR,
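
The rewritten --whisper_type argument takes its default and choices from a WhisperImpl enum imported via modules.whisper.data_classes. A hedged sketch of what that enum likely contains, reconstructed from the old help text; the real definition may differ:

from enum import Enum

class WhisperImpl(Enum):
    # Assumed members, based on the implementations named in the old help string.
    WHISPER = "whisper"
    FASTER_WHISPER = "faster-whisper"
    INSANELY_FAST_WHISPER = "insanely-fast-whisper"

# app.py then derives the CLI surface from the enum values:
#   default=WhisperImpl.FASTER_WHISPER.value
#   choices=[item.value for item in WhisperImpl]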
configs/default_parameters.yaml CHANGED
@@ -1,5 +1,6 @@
  whisper:
    model_size: "large-v2"
+   file_format: "SRT"
    lang: "Automatic Detection"
    is_translate: false
    beam_size: 5
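
A short sketch of how the new file_format default is consumed at startup, reusing load_yaml and DEFAULT_PARAMETERS_CONFIG_PATH exactly as the app.py diff above does; the print is only illustrative:

from modules.utils.files_manager import load_yaml
from modules.utils.paths import DEFAULT_PARAMETERS_CONFIG_PATH

default_params = load_yaml(DEFAULT_PARAMETERS_CONFIG_PATH)
whisper_params = default_params["whisper"]

# "SRT" unless the YAML above is edited; create_pipeline_inputs() uses this
# value as the initial selection of the File Format dropdown.
print(whisper_params["file_format"])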
configs/translation.yaml ADDED
@@ -0,0 +1,459 @@
 
 
 
1
+ en: # English
2
+ Language: Language
3
+ File: File
4
+ Youtube: Youtube
5
+ Mic: Mic
6
+ T2T Translation: T2T Translation
7
+ BGM Separation: BGM Separation
8
+ GENERATE SUBTITLE FILE: GENERATE SUBTITLE FILE
9
+ Output: Output
10
+ Downloadable output file: Downloadable output file
11
+ Upload File here: Upload File here
12
+ Model: Model
13
+ Automatic Detection: Automatic Detection
14
+ File Format: File Format
15
+ Translate to English?: Translate to English?
16
+ Add a timestamp to the end of the filename: Add a timestamp to the end of the filename
17
+ Advanced Parameters: Advanced Parameters
18
+ Background Music Remover Filter: Background Music Remover Filter
19
+ Enabling this will remove background music: Enabling this will remove background music by submodel before transcribing
20
+ Enable Background Music Remover Filter: Enable Background Music Remover Filter
21
+ Save separated files to output: Save separated files to output
22
+ Offload sub model after removing background music: Offload sub model after removing background music
23
+ Voice Detection Filter: Voice Detection Filter
24
+ Enable this to transcribe only detected voice: Enable this to transcribe only detected voice parts by submodel.
25
+ Enable Silero VAD Filter: Enable Silero VAD Filter
26
+ Diarization: Diarization
27
+ Enable Diarization: Enable Diarization
28
+ HuggingFace Token: HuggingFace Token
29
+ This is only needed the first time you download the model: This is only needed the first time you download the model. If you already have models, you don't need to enter. To download the model, you must manually go to "https://huggingface.co/pyannote/speaker-diarization-3.1" and "https://huggingface.co/pyannote/segmentation-3.0" and agree to their requirement.
30
+ Device: Device
31
+ Youtube Link: Youtube Link
32
+ Youtube Thumbnail: Youtube Thumbnail
33
+ Youtube Title: Youtube Title
34
+ Youtube Description: Youtube Description
35
+ Record with Mic: Record with Mic
36
+ Upload Subtitle Files to translate here: Upload Subtitle Files to translate here
37
+ Your Auth Key (API KEY): Your Auth Key (API KEY)
38
+ Source Language: Source Language
39
+ Target Language: Target Language
40
+ Pro User?: Pro User?
41
+ TRANSLATE SUBTITLE FILE: TRANSLATE SUBTITLE FILE
42
+ Upload Audio Files to separate background music: Upload Audio Files to separate background music
43
+ Instrumental: Instrumental
44
+ Vocals: Vocals
45
+ SEPARATE BACKGROUND MUSIC: SEPARATE BACKGROUND MUSIC
46
+
47
+ ko: # Korean
48
+ Language: 언어
49
+ File: 파일
50
+ Youtube: 유튜브
51
+ Mic: 마이크
52
+ T2T Translation: T2T 자막 번역
53
+ BGM Separation: 배경 음악 분리
54
+ GENERATE SUBTITLE FILE: 자막 파일 생성
55
+ Output: 결과물
56
+ Downloadable output file: 결과물 파일 다운로드
57
+ Upload File here: 파일을 업로드 하세요
58
+ Model: 모델
59
+ Automatic Detection: 자동 감지
60
+ File Format: 파일 형식
61
+ Translate to English?: 영어로 번역합니까? (위스퍼 모델 자체 번역 기능)
62
+ Add a timestamp to the end of the filename: 파일 이름 끝에 타임스태프 붙이기
63
+ Advanced Parameters: 고급 변수
64
+ Background Music Remover Filter: 배경 음악 제거 필터
65
+ Enabling this will remove background music: 받아쓰기 이전에 먼저 배경 음악 제거용 서브 모델을 활성화 합니다.
66
+ Enable Background Music Remover Filter: 배경 음악 제거 필터 활성화
67
+ Save separated files to output: 분리된 배경 음악 & 음성 파일 따로 출력 폴더에 저장
68
+ Offload sub model after removing background music: 배경 음악 제거 후 서브 모델을 비활성화 합니다. (VRAM 이 부족할 시 체크하세요.)
69
+ Voice Detection Filter: 목소리 감지 필터
70
+ Enable this to transcribe only detected voice: 서브 모델에 의해 목소리라고 판단된 부분만 받아쓰기를 진행합니다.
71
+ Enable Silero VAD Filter: Silero VAD 필터 활성화
72
+ Diarization: 화자 구분
73
+ Enable Diarization: 화자 구분 활성화
74
+ HuggingFace Token: 허깅페이스 토큰
75
+ This is only needed the first time you download the model: 모델을 처음 다운받을 때만 토큰이 필요합니다. 이미 다운로드 받으신 상태라면 입력하지 않아도 됩니다. 모델을 다운 받기 위해선 "https://huggingface.co/pyannote/speaker-diarization-3.1" 와 "https://huggingface.co/pyannote/segmentation-3.0" 에서 먼저 사용 지침에 동의하셔야 합니다.
76
+ Device: 디바이스
77
+ Youtube Link: 유튜브 링크
78
+ Youtube Thumbnail: 유튜브 썸네일
79
+ Youtube Title: 유튜브 제목
80
+ Youtube Description: 유튜브 설명
81
+ Record with Mic: 마이크로 녹음하세요
82
+ Upload Subtitle Files to translate here: 번역할 자막 파일을 업로드 하세요
83
+ Your Auth Key (API KEY): DeepL API 키
84
+ Source Language: 원본 언어
85
+ Target Language: 대상 언어
86
+ Pro User?: Pro 버전 사용자
87
+ TRANSLATE SUBTITLE FILE: 자막 파일 번역
88
+ Upload Audio Files to separate background music: 배경 음악을 분리할 오디오 파일을 업로드 하세요
89
+ Instrumental: 악기
90
+ Vocals: 보컬
91
+ SEPARATE BACKGROUND MUSIC: 배경 음악 분리
92
+
93
+ ja: # Japanese
94
+ Language: 言語
95
+ File: File
96
+ Youtube: Youtube
97
+ Mic: Mic
98
+ T2T Translation: T2T Translation
99
+ BGM Separation: BGM Separation
100
+ GENERATE SUBTITLE FILE: GENERATE SUBTITLE FILE
101
+ Output: Output
102
+ Downloadable output file: Downloadable output file
103
+ Upload File here: Upload File here
104
+ Model: Model
105
+ Automatic Detection: Automatic Detection
106
+ File Format: File Format
107
+ Translate to English?: Translate to English?
108
+ Add a timestamp to the end of the filename: Add a timestamp to the end of the filename
109
+ Advanced Parameters: Advanced Parameters
110
+ Background Music Remover Filter: Background Music Remover Filter
111
+ Enabling this will remove background music: Enabling this will remove background music by submodel before transcribing
112
+ Enable Background Music Remover Filter: Enable Background Music Remover Filter
113
+ Save separated files to output: Save separated files to output
114
+ Offload sub model after removing background music: Offload sub model after removing background music
115
+ Voice Detection Filter: Voice Detection Filter
116
+ Enable this to transcribe only detected voice: Enable this to transcribe only detected voice parts by submodel.
117
+ Enable Silero VAD Filter: Enable Silero VAD Filter
118
+ Diarization: Diarization
119
+ Enable Diarization: Enable Diarization
120
+ HuggingFace Token: HuggingFace Token
121
+ This is only needed the first time you download the model: This is only needed the first time you download the model. If you already have models, you don't need to enter. To download the model, you must manually go to "https://huggingface.co/pyannote/speaker-diarization-3.1" and "https://huggingface.co/pyannote/segmentation-3.0" and agree to their requirement.
122
+ Device: Device
123
+ Youtube Link: Youtube Link
124
+ Youtube Thumbnail: Youtube Thumbnail
125
+ Youtube Title: Youtube Title
126
+ Youtube Description: Youtube Description
127
+ Record with Mic: Record with Mic
128
+ Upload Subtitle Files to translate here: Upload Subtitle Files to translate here
129
+ Your Auth Key (API KEY): Your Auth Key (API KEY)
130
+ Source Language: Source Language
131
+ Target Language: Target Language
132
+ Pro User?: Pro User?
133
+ TRANSLATE SUBTITLE FILE: TRANSLATE SUBTITLE FILE
134
+ Upload Audio Files to separate background music: Upload Audio Files to separate background music
135
+ Instrumental: Instrumental
136
+ Vocals: Vocals
137
+ SEPARATE BACKGROUND MUSIC: SEPARATE BACKGROUND MUSIC
138
+
139
+ es: # Spanish
140
+ Language: Idioma
141
+ File: File
142
+ Youtube: Youtube
143
+ Mic: Mic
144
+ T2T Translation: T2T Translation
145
+ BGM Separation: BGM Separation
146
+ GENERATE SUBTITLE FILE: GENERATE SUBTITLE FILE
147
+ Output: Output
148
+ Downloadable output file: Downloadable output file
149
+ Upload File here: Upload File here
150
+ Model: Model
151
+ Automatic Detection: Automatic Detection
152
+ File Format: File Format
153
+ Translate to English?: Translate to English?
154
+ Add a timestamp to the end of the filename: Add a timestamp to the end of the filename
155
+ Advanced Parameters: Advanced Parameters
156
+ Background Music Remover Filter: Background Music Remover Filter
157
+ Enabling this will remove background music: Enabling this will remove background music by submodel before transcribing
158
+ Enable Background Music Remover Filter: Enable Background Music Remover Filter
159
+ Save separated files to output: Save separated files to output
160
+ Offload sub model after removing background music: Offload sub model after removing background music
161
+ Voice Detection Filter: Voice Detection Filter
162
+ Enable this to transcribe only detected voice: Enable this to transcribe only detected voice parts by submodel.
163
+ Enable Silero VAD Filter: Enable Silero VAD Filter
164
+ Diarization: Diarization
165
+ Enable Diarization: Enable Diarization
166
+ HuggingFace Token: HuggingFace Token
167
+ This is only needed the first time you download the model: This is only needed the first time you download the model. If you already have models, you don't need to enter. To download the model, you must manually go to "https://huggingface.co/pyannote/speaker-diarization-3.1" and "https://huggingface.co/pyannote/segmentation-3.0" and agree to their requirement.
168
+ Device: Device
169
+ Youtube Link: Youtube Link
170
+ Youtube Thumbnail: Youtube Thumbnail
171
+ Youtube Title: Youtube Title
172
+ Youtube Description: Youtube Description
173
+ Record with Mic: Record with Mic
174
+ Upload Subtitle Files to translate here: Upload Subtitle Files to translate here
175
+ Your Auth Key (API KEY): Your Auth Key (API KEY)
176
+ Source Language: Source Language
177
+ Target Language: Target Language
178
+ Pro User?: Pro User?
179
+ TRANSLATE SUBTITLE FILE: TRANSLATE SUBTITLE FILE
180
+ Upload Audio Files to separate background music: Upload Audio Files to separate background music
181
+ Instrumental: Instrumental
182
+ Vocals: Vocals
183
+ SEPARATE BACKGROUND MUSIC: SEPARATE BACKGROUND MUSIC
184
+
185
+ fr: # French
186
+ Language: Langue
187
+ File: File
188
+ Youtube: Youtube
189
+ Mic: Mic
190
+ T2T Translation: T2T Translation
191
+ BGM Separation: BGM Separation
192
+ GENERATE SUBTITLE FILE: GENERATE SUBTITLE FILE
193
+ Output: Output
194
+ Downloadable output file: Downloadable output file
195
+ Upload File here: Upload File here
196
+ Model: Model
197
+ Automatic Detection: Automatic Detection
198
+ File Format: File Format
199
+ Translate to English?: Translate to English?
200
+ Add a timestamp to the end of the filename: Add a timestamp to the end of the filename
201
+ Advanced Parameters: Advanced Parameters
202
+ Background Music Remover Filter: Background Music Remover Filter
203
+ Enabling this will remove background music: Enabling this will remove background music by submodel before transcribing
204
+ Enable Background Music Remover Filter: Enable Background Music Remover Filter
205
+ Save separated files to output: Save separated files to output
206
+ Offload sub model after removing background music: Offload sub model after removing background music
207
+ Voice Detection Filter: Voice Detection Filter
208
+ Enable this to transcribe only detected voice: Enable this to transcribe only detected voice parts by submodel.
209
+ Enable Silero VAD Filter: Enable Silero VAD Filter
210
+ Diarization: Diarization
211
+ Enable Diarization: Enable Diarization
212
+ HuggingFace Token: HuggingFace Token
213
+ This is only needed the first time you download the model: This is only needed the first time you download the model. If you already have models, you don't need to enter. To download the model, you must manually go to "https://huggingface.co/pyannote/speaker-diarization-3.1" and "https://huggingface.co/pyannote/segmentation-3.0" and agree to their requirement.
214
+ Device: Device
215
+ Youtube Link: Youtube Link
216
+ Youtube Thumbnail: Youtube Thumbnail
217
+ Youtube Title: Youtube Title
218
+ Youtube Description: Youtube Description
219
+ Record with Mic: Record with Mic
220
+ Upload Subtitle Files to translate here: Upload Subtitle Files to translate here
221
+ Your Auth Key (API KEY): Your Auth Key (API KEY)
222
+ Source Language: Source Language
223
+ Target Language: Target Language
224
+ Pro User?: Pro User?
225
+ TRANSLATE SUBTITLE FILE: TRANSLATE SUBTITLE FILE
226
+ Upload Audio Files to separate background music: Upload Audio Files to separate background music
227
+ Instrumental: Instrumental
228
+ Vocals: Vocals
229
+ SEPARATE BACKGROUND MUSIC: SEPARATE BACKGROUND MUSIC
230
+
231
+ de: # German
232
+ Language: Sprache
233
+ File: File
234
+ Youtube: Youtube
235
+ Mic: Mic
236
+ T2T Translation: T2T Translation
237
+ BGM Separation: BGM Separation
238
+ GENERATE SUBTITLE FILE: GENERATE SUBTITLE FILE
239
+ Output: Output
240
+ Downloadable output file: Downloadable output file
241
+ Upload File here: Upload File here
242
+ Model: Model
243
+ Automatic Detection: Automatic Detection
244
+ File Format: File Format
245
+ Translate to English?: Translate to English?
246
+ Add a timestamp to the end of the filename: Add a timestamp to the end of the filename
247
+ Advanced Parameters: Advanced Parameters
248
+ Background Music Remover Filter: Background Music Remover Filter
249
+ Enabling this will remove background music: Enabling this will remove background music by submodel before transcribing
250
+ Enable Background Music Remover Filter: Enable Background Music Remover Filter
251
+ Save separated files to output: Save separated files to output
252
+ Offload sub model after removing background music: Offload sub model after removing background music
253
+ Voice Detection Filter: Voice Detection Filter
254
+ Enable this to transcribe only detected voice: Enable this to transcribe only detected voice parts by submodel.
255
+ Enable Silero VAD Filter: Enable Silero VAD Filter
256
+ Diarization: Diarization
257
+ Enable Diarization: Enable Diarization
258
+ HuggingFace Token: HuggingFace Token
259
+ This is only needed the first time you download the model: This is only needed the first time you download the model. If you already have the models, you don't need to enter it. To download the model, you must manually go to "https://huggingface.co/pyannote/speaker-diarization-3.1" and "https://huggingface.co/pyannote/segmentation-3.0" and agree to their requirements.
260
+ Device: Device
261
+ Youtube Link: Youtube Link
262
+ Youtube Thumbnail: Youtube Thumbnail
263
+ Youtube Title: Youtube Title
264
+ Youtube Description: Youtube Description
265
+ Record with Mic: Record with Mic
266
+ Upload Subtitle Files to translate here: Upload Subtitle Files to translate here
267
+ Your Auth Key (API KEY): Your Auth Key (API KEY)
268
+ Source Language: Source Language
269
+ Target Language: Target Language
270
+ Pro User?: Pro User?
271
+ TRANSLATE SUBTITLE FILE: TRANSLATE SUBTITLE FILE
272
+ Upload Audio Files to separate background music: Upload Audio Files to separate background music
273
+ Instrumental: Instrumental
274
+ Vocals: Vocals
275
+ SEPARATE BACKGROUND MUSIC: SEPARATE BACKGROUND MUSIC
276
+
277
+ zh: # Chinese
278
+ Language: 语言
279
+ File: File
280
+ Youtube: Youtube
281
+ Mic: Mic
282
+ T2T Translation: T2T Translation
283
+ BGM Separation: BGM Separation
284
+ GENERATE SUBTITLE FILE: GENERATE SUBTITLE FILE
285
+ Output: Output
286
+ Downloadable output file: Downloadable output file
287
+ Upload File here: Upload File here
288
+ Model: Model
289
+ Automatic Detection: Automatic Detection
290
+ File Format: File Format
291
+ Translate to English?: Translate to English?
292
+ Add a timestamp to the end of the filename: Add a timestamp to the end of the filename
293
+ Advanced Parameters: Advanced Parameters
294
+ Background Music Remover Filter: Background Music Remover Filter
295
+ Enabling this will remove background music: Enabling this will remove background music by submodel before transcribing
296
+ Enable Background Music Remover Filter: Enable Background Music Remover Filter
297
+ Save separated files to output: Save separated files to output
298
+ Offload sub model after removing background music: Offload sub model after removing background music
299
+ Voice Detection Filter: Voice Detection Filter
300
+ Enable this to transcribe only detected voice: Enable this to transcribe only detected voice parts by submodel.
301
+ Enable Silero VAD Filter: Enable Silero VAD Filter
302
+ Diarization: Diarization
303
+ Enable Diarization: Enable Diarization
304
+ HuggingFace Token: HuggingFace Token
305
+ This is only needed the first time you download the model: This is only needed the first time you download the model. If you already have the models, you don't need to enter it. To download the model, you must manually go to "https://huggingface.co/pyannote/speaker-diarization-3.1" and "https://huggingface.co/pyannote/segmentation-3.0" and agree to their requirements.
306
+ Device: Device
307
+ Youtube Link: Youtube Link
308
+ Youtube Thumbnail: Youtube Thumbnail
309
+ Youtube Title: Youtube Title
310
+ Youtube Description: Youtube Description
311
+ Record with Mic: Record with Mic
312
+ Upload Subtitle Files to translate here: Upload Subtitle Files to translate here
313
+ Your Auth Key (API KEY): Your Auth Key (API KEY)
314
+ Source Language: Source Language
315
+ Target Language: Target Language
316
+ Pro User?: Pro User?
317
+ TRANSLATE SUBTITLE FILE: TRANSLATE SUBTITLE FILE
318
+ Upload Audio Files to separate background music: Upload Audio Files to separate background music
319
+ Instrumental: Instrumental
320
+ Vocals: Vocals
321
+ SEPARATE BACKGROUND MUSIC: SEPARATE BACKGROUND MUSIC
322
+
323
+ uk: # Ukrainian
324
+ Language: Мова
325
+ File: Файл
326
+ Youtube: Youtube
327
+ Mic: Мікрофон
328
+ T2T Translation: T2T Переклад
329
+ BGM Separation: Розділення фонової музики
330
+ GENERATE SUBTITLE FILE: СТВОРИТИ ФАЙЛ СУБТИТРІВ
331
+ Output: Результат
332
+ Downloadable output file: Завантажуваний файл результату
333
+ Upload File here: Завантажте файл тут
334
+ Model: Модель
335
+ Automatic Detection: Автоматичне визначення
336
+ File Format: Формат файлу
337
+ Translate to English?: Перекласти на англійську?
338
+ Add a timestamp to the end of the filename: Додати мітку часу до кінця імені файлу
339
+ Advanced Parameters: Розширені параметри
340
+ Background Music Remover Filter: Фільтр видалення фонової музики
341
+ Enabling this will remove background music: Увімкнення цього видалить фонову музику за допомогою підмоделі перед транскрипцією
342
+ Enable Background Music Remover Filter: Увімкнути фільтр видалення фонової музики
343
+ Save separated files to output: Зберегти розділені файли до вихідної папки
344
+ Offload sub model after removing background music: Вивантажити підмодель після видалення фонової музики
345
+ Voice Detection Filter: Фільтр розпізнавання голосу
346
+ Enable this to transcribe only detected voice: Увімкніть це, щоб транскрибувати лише розпізнані голосові частини за допомогою підмоделі
347
+ Enable Silero VAD Filter: Увімкнути фільтр Silero VAD
348
+ Diarization: Діаризація
349
+ Enable Diarization: Увімкнути діаризацію
350
+ HuggingFace Token: Токен HuggingFace
351
+ This is only needed the first time you download the model: Це потрібно лише при першому завантаженні моделі. Якщо у вас вже є моделі, вводити не потрібно. Щоб завантажити модель, потрібно вручну перейти на "https://huggingface.co/pyannote/speaker-diarization-3.1" та "https://huggingface.co/pyannote/segmentation-3.0" і погодитися з їхніми вимогами.
352
+ Device: Пристрій
353
+ Youtube Link: Посилання на Youtube
354
+ Youtube Thumbnail: Ескіз Youtube
355
+ Youtube Title: Назва Youtube
356
+ Youtube Description: Опис Youtube
357
+ Record with Mic: Записати з мікрофона
358
+ Upload Subtitle Files to translate here: Завантажте файли субтитрів для перекладу тут
359
+ Your Auth Key (API KEY): Ваш ключ авторизації (API KEY)
360
+ Source Language: Мова джерела
361
+ Target Language: Мова перекладу
362
+ Pro User?: Професійний користувач?
363
+ TRANSLATE SUBTITLE FILE: ПЕРЕКЛАСТИ ФАЙЛ СУБТИТРІВ
364
+ Upload Audio Files to separate background music: Завантажте аудіофайли для розділення фонової музики
365
+ Instrumental: Інструментал
366
+ Vocals: Вокал
367
+ SEPARATE BACKGROUND MUSIC: РОЗДІЛИТИ ФОНОВУ МУЗИКУ
368
+
369
+ ru: # Russian
370
+ Language: Язык
371
+ File: Файл
372
+ Youtube: Youtube
373
+ Mic: Микрофон
374
+ T2T Translation: Перевод T2T
375
+ BGM Separation: Разделение фоновой музыки
376
+ GENERATE SUBTITLE FILE: СГЕНЕРИРОВАТЬ ФАЙЛ СУБТИТРОВ
377
+ Output: Результат
378
+ Downloadable output file: Загружаемый файл результата
379
+ Upload File here: Загрузите файл здесь
380
+ Model: Модель
381
+ Automatic Detection: Автоматическое определение
382
+ File Format: Формат файла
383
+ Translate to English?: Перевести на английский?
384
+ Add a timestamp to the end of the filename: Добавить метку времени в конец имени файла
385
+ Advanced Parameters: Расширенные параметры
386
+ Background Music Remover Filter: Фильтр удаления фоновой музыки
387
+ Enabling this will remove background music: Включение этого удалит фоновую музыку с помощью подмодели перед транскрипцией
388
+ Enable Background Music Remover Filter: Включить фильтр удаления фоновой музыки
389
+ Save separated files to output: Сохранить разделенные файлы в выходную папку
390
+ Offload sub model after removing background music: Выгрузить подмодель после удаления фоновой музыки
391
+ Voice Detection Filter: Фильтр обнаружения голоса
392
+ Enable this to transcribe only detected voice: Включите это, чтобы транскрибировать только обнаруженные голосовые части с помощью подмодели
393
+ Enable Silero VAD Filter: Включить фильтр Silero VAD
394
+ Diarization: Диаризация
395
+ Enable Diarization: Включить диаризацию
396
+ HuggingFace Token: Токен HuggingFace
397
+ This is only needed the first time you download the model: Это нужно только при первом скачивании модели. Если у вас уже есть модели, вводить не нужно. Чтобы скачать модель, нужно вручную перейти на "https://huggingface.co/pyannote/speaker-diarization-3.1" и "https://huggingface.co/pyannote/segmentation-3.0" и согласиться с их требованиями.
398
+ Device: Устройство
399
+ Youtube Link: Ссылка на Youtube
400
+ Youtube Thumbnail: Миниатюра Youtube
401
+ Youtube Title: Название Youtube
402
+ Youtube Description: Описание Youtube
403
+ Record with Mic: Записать с микрофона
404
+ Upload Subtitle Files to translate here: Загрузите файлы субтитров для перевода здесь
405
+ Your Auth Key (API KEY): Ваш Auth Key (API KEY)
406
+ Source Language: Исходный язык
407
+ Target Language: Целевой язык
408
+ Pro User?: Профессиональный пользователь?
409
+ TRANSLATE SUBTITLE FILE: ПЕРЕВЕСТИ ФАЙЛ СУБТИТРОВ
410
+ Upload Audio Files to separate background music: Загрузите аудиофайлы для разделения фоновой музыки
411
+ Instrumental: Инструментал
412
+ Vocals: Вокал
413
+ SEPARATE BACKGROUND MUSIC: РАЗДЕЛИТЬ ФОНОВУЮ МУЗЫКУ
414
+
415
+ tr: # Turkish
416
+ Language: Dil
417
+ File: Dosya
418
+ Youtube: Youtube
419
+ Mic: Mikrofon
420
+ T2T Translation: T2T Çeviri
421
+ BGM Separation: Arka Plan Müziği Ayırma
422
+ GENERATE SUBTITLE FILE: ALTYAZI DOSYASI OLUŞTUR
423
+ Output: Çıktı
424
+ Downloadable output file: İndirilebilir çıktı dosyası
425
+ Upload File here: Dosya Yükle
426
+ Model: Model
427
+ Automatic Detection: Otomatik Algılama
428
+ File Format: Dosya Formatı
429
+ Translate to English?: İngilizceye Çevir?
430
+ Add a timestamp to the end of the filename: Dosya adının sonuna zaman damgası ekle
431
+ Advanced Parameters: Gelişmiş Parametreler
432
+ Background Music Remover Filter: Arka Plan Müziği Kaldırma Filtresi
433
+ Enabling this will remove background music: Bunu etkinleştirmek, arka plan müziğini alt model tarafından transkripsiyondan önce kaldıracaktır
434
+ Enable Background Music Remover Filter: Arka Plan Müziği Kaldırma Filtresini Etkinleştir
435
+ Save separated files to output: Ayrılmış dosyaları çıktıya kaydet
436
+ Offload sub model after removing background music: Arka plan müziği kaldırıldıktan sonra alt modeli devre dışı bırak
437
+ Voice Detection Filter: Ses Algılama Filtresi
438
+ Enable this to transcribe only detected voice: Bunu etkinleştirerek yalnızca alt model tarafından algılanan ses kısımlarını transkribe et
439
+ Enable Silero VAD Filter: Silero VAD Filtresini Etkinleştir
440
+ Diarization: Konuşmacı Ayrımı
441
+ Enable Diarization: Konuşmacı Ayrımını Etkinleştir
442
+ HuggingFace Token: HuggingFace Anahtarı
443
+ This is only needed the first time you download the model: Bu, modeli ilk kez indirirken gereklidir. Zaten modelleriniz varsa girmenize gerek yok. Modeli indirmek için "https://huggingface.co/pyannote/speaker-diarization-3.1" ve "https://huggingface.co/pyannote/segmentation-3.0" adreslerine gidip gereksinimlerini kabul etmeniz gerekiyor
444
+ Device: Cihaz
445
+ Youtube Link: Youtube Bağlantısı
446
+ Youtube Thumbnail: Youtube Küçük Resmi
447
+ Youtube Title: Youtube Başlığı
448
+ Youtube Description: Youtube Açıklaması
449
+ Record with Mic: Mikrofonla Kaydet
450
+ Upload Subtitle Files to translate here: Çeviri için altyazı dosyalarını buraya yükle
451
+ Your Auth Key (API KEY): Yetki Anahtarınız (API ANAHTARI)
452
+ Source Language: Kaynak Dil
453
+ Target Language: Hedef Dil
454
+ Pro User?: Pro Kullanıcı?
455
+ TRANSLATE SUBTITLE FILE: ALTYAZI DOSYASINI ÇEVİR
456
+ Upload Audio Files to separate background music: Arka plan müziğini ayırmak için ses dosyalarını yükle
457
+ Instrumental: Enstrümantal
458
+ Vocals: Vokal
459
+ SEPARATE BACKGROUND MUSIC: ARKA PLAN MÜZİĞİNİ AYIR
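Note: in the fr, de, and zh blocks above, most values still mirror their English keys, i.e. they are placeholders awaiting translation, while the uk, ru, and tr blocks are fully localized. Below is a small illustrative helper, not part of this commit, for listing such placeholders; it assumes the language blocks sit at the top level of configs/translation.yaml as the hunk suggests, and that PyYAML is available.

import yaml

def untranslated_keys(i18n_yaml_path: str, lang: str) -> list:
    # Entries whose value still equals the English key are untranslated placeholders.
    with open(i18n_yaml_path, "r", encoding="utf-8") as f:
        data = yaml.safe_load(f) or {}
    entries = data.get(lang, {}) or {}
    return [key for key, value in entries.items() if key == value]

# e.g. untranslated_keys("configs/translation.yaml", "fr") would report
# "File", "Youtube", "Mic", ... for the partially translated French block.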
modules/diarize/diarize_pipeline.py CHANGED
@@ -7,6 +7,7 @@ from pyannote.audio import Pipeline
7
  from typing import Optional, Union
8
  import torch
9
 
 
10
  from modules.utils.paths import DIARIZATION_MODELS_DIR
11
  from modules.diarize.audio_loader import load_audio, SAMPLE_RATE
12
 
@@ -43,6 +44,8 @@ class DiarizationPipeline:
43
 
44
  def assign_word_speakers(diarize_df, transcript_result, fill_nearest=False):
45
  transcript_segments = transcript_result["segments"]
 
 
46
  for seg in transcript_segments:
47
  # assign speaker to segment (if any)
48
  diarize_df['intersection'] = np.minimum(diarize_df['end'], seg['end']) - np.maximum(diarize_df['start'],
@@ -63,7 +66,7 @@ def assign_word_speakers(diarize_df, transcript_result, fill_nearest=False):
63
  seg["speaker"] = speaker
64
 
65
  # assign speaker to words
66
- if 'words' in seg:
67
  for word in seg['words']:
68
  if 'start' in word:
69
  diarize_df['intersection'] = np.minimum(diarize_df['end'], word['end']) - np.maximum(
@@ -85,10 +88,10 @@ def assign_word_speakers(diarize_df, transcript_result, fill_nearest=False):
85
  if word_speaker is not None:
86
  word["speaker"] = word_speaker
87
 
88
- return transcript_result
89
 
90
 
91
- class Segment:
92
  def __init__(self, start, end, speaker=None):
93
  self.start = start
94
  self.end = end
 
7
  from typing import Optional, Union
8
  import torch
9
 
10
+ from modules.whisper.data_classes import *
11
  from modules.utils.paths import DIARIZATION_MODELS_DIR
12
  from modules.diarize.audio_loader import load_audio, SAMPLE_RATE
13
 
 
44
 
45
  def assign_word_speakers(diarize_df, transcript_result, fill_nearest=False):
46
  transcript_segments = transcript_result["segments"]
47
+ if transcript_segments and isinstance(transcript_segments[0], Segment):
48
+ transcript_segments = [seg.model_dump() for seg in transcript_segments]
49
  for seg in transcript_segments:
50
  # assign speaker to segment (if any)
51
  diarize_df['intersection'] = np.minimum(diarize_df['end'], seg['end']) - np.maximum(diarize_df['start'],
 
66
  seg["speaker"] = speaker
67
 
68
  # assign speaker to words
69
+ if 'words' in seg and seg['words'] is not None:
70
  for word in seg['words']:
71
  if 'start' in word:
72
  diarize_df['intersection'] = np.minimum(diarize_df['end'], word['end']) - np.maximum(
 
88
  if word_speaker is not None:
89
  word["speaker"] = word_speaker
90
 
91
+ return {"segments": transcript_segments}
92
 
93
 
94
+ class DiarizationSegment:
95
  def __init__(self, start, end, speaker=None):
96
  self.start = start
97
  self.end = end
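The reworked assign_word_speakers above now accepts either plain dicts or the new pydantic Segment models (dumping the latter via model_dump) and always returns a plain {"segments": [...]} dict; the old Segment helper class is renamed to DiarizationSegment to avoid the name clash. A minimal sketch of the call pattern with hypothetical values, assuming a pandas DataFrame with start/end/speaker columns as produced by the diarization pipeline:

import pandas as pd
from modules.whisper.data_classes import Segment
from modules.diarize.diarize_pipeline import assign_word_speakers

# One diarized speaker turn covering the whole clip (hypothetical values).
diarize_df = pd.DataFrame([{"start": 0.0, "end": 3.0, "speaker": "SPEAKER_00"}])
transcript = {"segments": [Segment(start=0.5, end=2.5, text="hello there")]}

result = assign_word_speakers(diarize_df, transcript)
print(result["segments"][0].get("speaker"))  # expected: SPEAKER_00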
modules/diarize/diarizer.py CHANGED
@@ -1,6 +1,6 @@
1
  import os
2
  import torch
3
- from typing import List, Union, BinaryIO, Optional
4
  import numpy as np
5
  import time
6
  import logging
@@ -9,6 +9,7 @@ import spaces
9
  from modules.utils.paths import DIARIZATION_MODELS_DIR
10
  from modules.diarize.diarize_pipeline import DiarizationPipeline, assign_word_speakers
11
  from modules.diarize.audio_loader import load_audio
 
12
 
13
 
14
  class Diarizer:
@@ -25,10 +26,10 @@ class Diarizer:
25
  @spaces.GPU
26
  def run(self,
27
  audio: Union[str, BinaryIO, np.ndarray],
28
- transcribed_result: List[dict],
29
  use_auth_token: str,
30
  device: Optional[str] = None
31
- ):
32
  """
33
  Diarize transcribed result as a post-processing
34
 
@@ -36,7 +37,7 @@ class Diarizer:
36
  ----------
37
  audio: Union[str, BinaryIO, np.ndarray]
38
  Audio input. This can be file path or binary type.
39
- transcribed_result: List[dict]
40
  transcribed result through whisper.
41
  use_auth_token: str
42
  Huggingface token with READ permission. This is only needed the first time you download the model.
@@ -46,8 +47,8 @@ class Diarizer:
46
 
47
  Returns
48
  ----------
49
- segments_result: List[dict]
50
- list of dicts that includes start, end timestamps and transcribed text
51
  elapsed_time: float
52
  elapsed time for running
53
  """
@@ -70,14 +71,20 @@ class Diarizer:
70
  {"segments": transcribed_result}
71
  )
72
 
 
73
  for segment in diarized_result["segments"]:
74
  speaker = "None"
75
  if "speaker" in segment:
76
  speaker = segment["speaker"]
77
- segment["text"] = speaker + "|" + segment["text"].strip()
 
 
 
 
 
78
 
79
  elapsed_time = time.time() - start_time
80
- return diarized_result["segments"], elapsed_time
81
 
82
  @spaces.GPU
83
  def update_pipe(self,
 
1
  import os
2
  import torch
3
+ from typing import List, Union, BinaryIO, Optional, Tuple
4
  import numpy as np
5
  import time
6
  import logging
 
9
  from modules.utils.paths import DIARIZATION_MODELS_DIR
10
  from modules.diarize.diarize_pipeline import DiarizationPipeline, assign_word_speakers
11
  from modules.diarize.audio_loader import load_audio
12
+ from modules.whisper.data_classes import *
13
 
14
 
15
  class Diarizer:
 
26
  @spaces.GPU
27
  def run(self,
28
  audio: Union[str, BinaryIO, np.ndarray],
29
+ transcribed_result: List[Segment],
30
  use_auth_token: str,
31
  device: Optional[str] = None
32
+ ) -> Tuple[List[Segment], float]:
33
  """
34
  Diarize transcribed result as a post-processing
35
 
 
37
  ----------
38
  audio: Union[str, BinaryIO, np.ndarray]
39
  Audio input. This can be file path or binary type.
40
+ transcribed_result: List[Segment]
41
  transcribed result through whisper.
42
  use_auth_token: str
43
  Huggingface token with READ permission. This is only needed the first time you download the model.
 
47
 
48
  Returns
49
  ----------
50
+ segments_result: List[Segment]
51
+ list of Segment that includes start, end timestamps and transcribed text
52
  elapsed_time: float
53
  elapsed time for running
54
  """
 
71
  {"segments": transcribed_result}
72
  )
73
 
74
+ segments_result = []
75
  for segment in diarized_result["segments"]:
76
  speaker = "None"
77
  if "speaker" in segment:
78
  speaker = segment["speaker"]
79
+ diarized_text = speaker + "|" + segment["text"].strip()
80
+ segments_result.append(Segment(
81
+ start=segment["start"],
82
+ end=segment["end"],
83
+ text=diarized_text
84
+ ))
85
 
86
  elapsed_time = time.time() - start_time
87
+ return segments_result, elapsed_time
88
 
89
  @spaces.GPU
90
  def update_pipe(self,
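Diarizer.run now takes and returns List[Segment] instead of lists of dicts, and the detected speaker is folded into each segment's text as a "SPEAKER_XX|..." prefix. A sketch of the new call contract, assuming the default constructor arguments and a hypothetical input file and token:

from modules.diarize.diarizer import Diarizer
from modules.whisper.data_classes import Segment

diarizer = Diarizer()
segments = [Segment(start=0.5, end=2.5, text="hello there")]

diarized, elapsed = diarizer.run(
    audio="sample.wav",          # hypothetical input file
    transcribed_result=segments,
    use_auth_token="hf_xxx",     # only needed for the first model download
)
print(diarized[0].text)          # e.g. "SPEAKER_00|hello there"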
modules/translation/deepl_api.py CHANGED
@@ -5,6 +5,7 @@ from datetime import datetime
5
  import gradio as gr
6
 
7
  from modules.utils.paths import TRANSLATION_OUTPUT_DIR, DEFAULT_PARAMETERS_CONFIG_PATH
 
8
  from modules.utils.subtitle_manager import *
9
  from modules.utils.files_manager import load_yaml, save_yaml
10
 
@@ -50,7 +51,7 @@ DEEPL_AVAILABLE_TARGET_LANGS = {
50
  }
51
 
52
  DEEPL_AVAILABLE_SOURCE_LANGS = {
53
- 'Automatic Detection': None,
54
  'Bulgarian': 'BG',
55
  'Czech': 'CS',
56
  'Danish': 'DA',
@@ -138,37 +139,27 @@ class DeepLAPI:
138
  )
139
 
140
  files_info = {}
141
- for fileobj in fileobjs:
142
- file_path = fileobj
143
- file_name, file_ext = os.path.splitext(os.path.basename(fileobj))
144
-
145
- if file_ext == ".srt":
146
- parsed_dicts = parse_srt(file_path=file_path)
147
-
148
- elif file_ext == ".vtt":
149
- parsed_dicts = parse_vtt(file_path=file_path)
150
 
151
  batch_size = self.max_text_batch_size
152
- for batch_start in range(0, len(parsed_dicts), batch_size):
153
- batch_end = min(batch_start + batch_size, len(parsed_dicts))
154
- sentences_to_translate = [dic["sentence"] for dic in parsed_dicts[batch_start:batch_end]]
155
  translated_texts = self.request_deepl_translate(auth_key, sentences_to_translate, source_lang,
156
  target_lang, is_pro)
157
  for i, translated_text in enumerate(translated_texts):
158
- parsed_dicts[batch_start + i]["sentence"] = translated_text["text"]
159
- progress(batch_end / len(parsed_dicts), desc="Translating..")
160
-
161
- if file_ext == ".srt":
162
- subtitle = get_serialized_srt(parsed_dicts)
163
- elif file_ext == ".vtt":
164
- subtitle = get_serialized_vtt(parsed_dicts)
165
-
166
- if add_timestamp:
167
- timestamp = datetime.now().strftime("%m%d%H%M%S")
168
- file_name += f"-{timestamp}"
169
-
170
- output_path = os.path.join(self.output_dir, f"{file_name}{file_ext}")
171
- write_file(subtitle, output_path)
172
 
173
  files_info[file_name] = {"subtitle": subtitle, "path": output_path}
174
 
 
5
  import gradio as gr
6
 
7
  from modules.utils.paths import TRANSLATION_OUTPUT_DIR, DEFAULT_PARAMETERS_CONFIG_PATH
8
+ from modules.utils.constants import AUTOMATIC_DETECTION
9
  from modules.utils.subtitle_manager import *
10
  from modules.utils.files_manager import load_yaml, save_yaml
11
 
 
51
  }
52
 
53
  DEEPL_AVAILABLE_SOURCE_LANGS = {
54
+ AUTOMATIC_DETECTION: None,
55
  'Bulgarian': 'BG',
56
  'Czech': 'CS',
57
  'Danish': 'DA',
 
139
  )
140
 
141
  files_info = {}
142
+ for file_path in fileobjs:
143
+ file_name, file_ext = os.path.splitext(os.path.basename(file_path))
144
+ writer = get_writer(file_ext, self.output_dir)
145
+ segments = writer.to_segments(file_path)
 
 
 
 
 
146
 
147
  batch_size = self.max_text_batch_size
148
+ for batch_start in range(0, len(segments), batch_size):
149
+ progress(batch_start / len(segments), desc="Translating..")
150
+ sentences_to_translate = [seg.text for seg in segments[batch_start:batch_start+batch_size]]
151
  translated_texts = self.request_deepl_translate(auth_key, sentences_to_translate, source_lang,
152
  target_lang, is_pro)
153
  for i, translated_text in enumerate(translated_texts):
154
+ segments[batch_start + i].text = translated_text["text"]
155
+
156
+ subtitle, output_path = generate_file(
157
+ output_dir=self.output_dir,
158
+ output_file_name=file_name,
159
+ output_format=file_ext,
160
+ result=segments,
161
+ add_timestamp=add_timestamp
162
+ )
 
 
 
 
 
163
 
164
  files_info[file_name] = {"subtitle": subtitle, "path": output_path}
165
 
modules/translation/nllb_inference.py CHANGED
@@ -4,10 +4,10 @@ import os
4
  import spaces
5
 
6
  from modules.utils.paths import TRANSLATION_OUTPUT_DIR, NLLB_MODELS_DIR
7
- from modules.translation.translation_base import TranslationBase
8
 
9
 
10
- class NLLBInference(TranslationBase):
11
  def __init__(self,
12
  model_dir: str = NLLB_MODELS_DIR,
13
  output_dir: str = TRANSLATION_OUTPUT_DIR
@@ -31,7 +31,7 @@ class NLLBInference(TranslationBase):
31
  text,
32
  max_length=max_length
33
  )
34
- return result[0]['translation_text']
35
 
36
  @spaces.GPU(duration=120)
37
  def update_model(self,
@@ -44,8 +44,7 @@ class NLLBInference(TranslationBase):
44
  if lang in NLLB_AVAILABLE_LANGS:
45
  return NLLB_AVAILABLE_LANGS[lang]
46
  elif lang not in NLLB_AVAILABLE_LANGS.values():
47
- raise ValueError(
48
- f"Language '{lang}' is not supported. Use one of: {list(NLLB_AVAILABLE_LANGS.keys())}")
49
  return lang
50
 
51
  src_lang = validate_language(src_lang)
 
4
  import spaces
5
 
6
  from modules.utils.paths import TRANSLATION_OUTPUT_DIR, NLLB_MODELS_DIR
7
+ import modules.translation.translation_base as base
8
 
9
 
10
+ class NLLBInference(base.TranslationBase):
11
  def __init__(self,
12
  model_dir: str = NLLB_MODELS_DIR,
13
  output_dir: str = TRANSLATION_OUTPUT_DIR
 
31
  text,
32
  max_length=max_length
33
  )
34
+ return result[0]["translation_text"]
35
 
36
  @spaces.GPU(duration=120)
37
  def update_model(self,
 
44
  if lang in NLLB_AVAILABLE_LANGS:
45
  return NLLB_AVAILABLE_LANGS[lang]
46
  elif lang not in NLLB_AVAILABLE_LANGS.values():
47
+ raise ValueError(f"Language '{lang}' is not supported. Use one of: {list(NLLB_AVAILABLE_LANGS.keys())}")
 
48
  return lang
49
 
50
  src_lang = validate_language(src_lang)
modules/translation/translation_base.py CHANGED
@@ -6,7 +6,8 @@ from typing import List
6
  from datetime import datetime
7
  import spaces
8
 
9
- from modules.whisper.whisper_parameter import *
 
10
  from modules.utils.subtitle_manager import *
11
  from modules.utils.files_manager import load_yaml, save_yaml
12
  from modules.utils.paths import DEFAULT_PARAMETERS_CONFIG_PATH, NLLB_MODELS_DIR, TRANSLATION_OUTPUT_DIR
@@ -98,32 +99,22 @@ class TranslationBase(ABC):
98
  files_info = {}
99
  for fileobj in fileobjs:
100
  file_name, file_ext = os.path.splitext(os.path.basename(fileobj))
101
- if file_ext == ".srt":
102
- parsed_dicts = parse_srt(file_path=fileobj)
103
- total_progress = len(parsed_dicts)
104
- for index, dic in enumerate(parsed_dicts):
105
- progress(index / total_progress, desc="Translating..")
106
- translated_text = self.translate(dic["sentence"], max_length=max_length)
107
- dic["sentence"] = translated_text
108
- subtitle = get_serialized_srt(parsed_dicts)
109
-
110
- elif file_ext == ".vtt":
111
- parsed_dicts = parse_vtt(file_path=fileobj)
112
- total_progress = len(parsed_dicts)
113
- for index, dic in enumerate(parsed_dicts):
114
- progress(index / total_progress, desc="Translating..")
115
- translated_text = self.translate(dic["sentence"], max_length=max_length)
116
- dic["sentence"] = translated_text
117
- subtitle = get_serialized_vtt(parsed_dicts)
118
-
119
- if add_timestamp:
120
- timestamp = datetime.now().strftime("%m%d%H%M%S")
121
- file_name += f"-{timestamp}"
122
-
123
- output_path = os.path.join(self.output_dir, f"{file_name}{file_ext}")
124
- write_file(subtitle, output_path)
125
-
126
- files_info[file_name] = {"subtitle": subtitle, "path": output_path}
127
 
128
  total_result = ''
129
  for file_name, info in files_info.items():
@@ -136,7 +127,8 @@ class TranslationBase(ABC):
136
  return [gr_str, output_file_paths]
137
 
138
  except Exception as e:
139
- print(f"Error: {str(e)}")
 
140
  finally:
141
  self.release_cuda_memory()
142
 
@@ -172,11 +164,17 @@ class TranslationBase(ABC):
172
  tgt_lang: str,
173
  max_length: int,
174
  add_timestamp: bool):
 
 
 
 
 
 
175
  cached_params = load_yaml(DEFAULT_PARAMETERS_CONFIG_PATH)
176
  cached_params["translation"]["nllb"] = {
177
  "model_size": model_size,
178
- "source_lang": src_lang,
179
- "target_lang": tgt_lang,
180
  "max_length": max_length,
181
  }
182
  cached_params["translation"]["add_timestamp"] = add_timestamp
 
6
  from datetime import datetime
7
  import spaces
8
 
9
+ import modules.translation.nllb_inference as nllb
10
+ from modules.whisper.data_classes import *
11
  from modules.utils.subtitle_manager import *
12
  from modules.utils.files_manager import load_yaml, save_yaml
13
  from modules.utils.paths import DEFAULT_PARAMETERS_CONFIG_PATH, NLLB_MODELS_DIR, TRANSLATION_OUTPUT_DIR
 
99
  files_info = {}
100
  for fileobj in fileobjs:
101
  file_name, file_ext = os.path.splitext(os.path.basename(fileobj))
102
+ writer = get_writer(file_ext, self.output_dir)
103
+ segments = writer.to_segments(fileobj)
104
+ for i, segment in enumerate(segments):
105
+ progress(i / len(segments), desc="Translating..")
106
+ translated_text = self.translate(segment.text, max_length=max_length)
107
+ segment.text = translated_text
108
+
109
+ subtitle, file_path = generate_file(
110
+ output_dir=self.output_dir,
111
+ output_file_name=file_name,
112
+ output_format=file_ext,
113
+ result=segments,
114
+ add_timestamp=add_timestamp
115
+ )
116
+
117
+ files_info[file_name] = {"subtitle": subtitle, "path": file_path}
 
 
 
 
 
 
 
 
 
 
118
 
119
  total_result = ''
120
  for file_name, info in files_info.items():
 
127
  return [gr_str, output_file_paths]
128
 
129
  except Exception as e:
130
+ print(f"Error translating file: {e}")
131
+ raise
132
  finally:
133
  self.release_cuda_memory()
134
 
 
164
  tgt_lang: str,
165
  max_length: int,
166
  add_timestamp: bool):
167
+ def validate_lang(lang: str):
168
+ if lang in list(nllb.NLLB_AVAILABLE_LANGS.values()):
169
+ flipped = {value: key for key, value in nllb.NLLB_AVAILABLE_LANGS.items()}
170
+ return flipped[lang]
171
+ return lang
172
+
173
  cached_params = load_yaml(DEFAULT_PARAMETERS_CONFIG_PATH)
174
  cached_params["translation"]["nllb"] = {
175
  "model_size": model_size,
176
+ "source_lang": validate_lang(src_lang),
177
+ "target_lang": validate_lang(tgt_lang),
178
  "max_length": max_length,
179
  }
180
  cached_params["translation"]["add_timestamp"] = add_timestamp
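The nested validate_lang helper ensures that default_parameters.yaml always caches the display name rather than the raw NLLB code, whichever of the two the UI hands over. A self-contained sketch of that mapping, assuming NLLB_AVAILABLE_LANGS maps display names to codes (e.g. "English": "eng_Latn"):

# Stand-in for modules.translation.nllb_inference.NLLB_AVAILABLE_LANGS
NLLB_AVAILABLE_LANGS = {"English": "eng_Latn", "Korean": "kor_Hang"}

def validate_lang(lang: str) -> str:
    # Flip a code such as "eng_Latn" back to its display name; pass names through.
    if lang in NLLB_AVAILABLE_LANGS.values():
        flipped = {code: name for name, code in NLLB_AVAILABLE_LANGS.items()}
        return flipped[lang]
    return lang

assert validate_lang("eng_Latn") == "English"
assert validate_lang("Korean") == "Korean"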
modules/utils/constants.py ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ from gradio_i18n import Translate, gettext as _
2
+
3
+ AUTOMATIC_DETECTION = _("Automatic Detection")
4
+ GRADIO_NONE_STR = ""
5
+ GRADIO_NONE_NUMBER_MAX = 9999
6
+ GRADIO_NONE_NUMBER_MIN = 0
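The GRADIO_NONE_* values appear intended as sentinels for gradio components that cannot represent None; the renamed base pipeline further below previously hard-coded the same 9999 check to turn max_speech_duration_s back into float('inf'). A hedged sketch of that conversion (resolve_max_speech_duration is a hypothetical helper, not part of the commit):

GRADIO_NONE_NUMBER_MAX = 9999  # mirrors the constant added above

def resolve_max_speech_duration(value):
    # gr.Number() cannot hold None, so the UI passes a large sentinel instead;
    # anything at or above the sentinel is treated as "no limit".
    return float("inf") if value is None or value >= GRADIO_NONE_NUMBER_MAX else value

assert resolve_max_speech_duration(9999) == float("inf")
assert resolve_max_speech_duration(30.0) == 30.0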
modules/utils/files_manager.py CHANGED
@@ -67,3 +67,9 @@ def is_video(file_path):
67
  video_extensions = ['.mp4', '.mkv', '.avi', '.mov', '.flv', '.wmv', '.webm', '.m4v', '.mpeg', '.mpg', '.3gp']
68
  extension = os.path.splitext(file_path)[1].lower()
69
  return extension in video_extensions
 
 
 
 
 
 
 
67
  video_extensions = ['.mp4', '.mkv', '.avi', '.mov', '.flv', '.wmv', '.webm', '.m4v', '.mpeg', '.mpg', '.3gp']
68
  extension = os.path.splitext(file_path)[1].lower()
69
  return extension in video_extensions
70
+
71
+
72
+ def read_file(file_path):
73
+ with open(file_path, "r", encoding="utf-8") as f:
74
+ subtitle_content = f.read()
75
+ return subtitle_content
modules/utils/paths.py CHANGED
@@ -10,6 +10,7 @@ DIARIZATION_MODELS_DIR = os.path.join(MODELS_DIR, "Diarization")
10
  UVR_MODELS_DIR = os.path.join(MODELS_DIR, "UVR", "MDX_Net_Models")
11
  CONFIGS_DIR = os.path.join(WEBUI_DIR, "configs")
12
  DEFAULT_PARAMETERS_CONFIG_PATH = os.path.join(CONFIGS_DIR, "default_parameters.yaml")
 
13
  OUTPUT_DIR = os.path.join(WEBUI_DIR, "outputs")
14
  TRANSLATION_OUTPUT_DIR = os.path.join(OUTPUT_DIR, "translations")
15
  UVR_OUTPUT_DIR = os.path.join(OUTPUT_DIR, "UVR")
 
10
  UVR_MODELS_DIR = os.path.join(MODELS_DIR, "UVR", "MDX_Net_Models")
11
  CONFIGS_DIR = os.path.join(WEBUI_DIR, "configs")
12
  DEFAULT_PARAMETERS_CONFIG_PATH = os.path.join(CONFIGS_DIR, "default_parameters.yaml")
13
+ I18N_YAML_PATH = os.path.join(CONFIGS_DIR, "translation.yaml")
14
  OUTPUT_DIR = os.path.join(WEBUI_DIR, "outputs")
15
  TRANSLATION_OUTPUT_DIR = os.path.join(OUTPUT_DIR, "translations")
16
  UVR_OUTPUT_DIR = os.path.join(OUTPUT_DIR, "UVR")
modules/utils/subtitle_manager.py CHANGED
@@ -1,123 +1,427 @@
 
 
 
 
1
  import re
 
 
 
 
 
 
 
2
 
3
  # Zero GPU
4
  import spaces
5
 
6
- def timeformat_srt(time):
7
- hours = time // 3600
8
- minutes = (time - hours * 3600) // 60
9
- seconds = time - hours * 3600 - minutes * 60
10
- milliseconds = (time - int(time)) * 1000
11
- return f"{int(hours):02d}:{int(minutes):02d}:{int(seconds):02d},{int(milliseconds):03d}"
12
-
13
-
14
- def timeformat_vtt(time):
15
- hours = time // 3600
16
- minutes = (time - hours * 3600) // 60
17
- seconds = time - hours * 3600 - minutes * 60
18
- milliseconds = (time - int(time)) * 1000
19
- return f"{int(hours):02d}:{int(minutes):02d}:{int(seconds):02d}.{int(milliseconds):03d}"
20
-
21
-
22
- def write_file(subtitle, output_file):
23
- with open(output_file, 'w', encoding='utf-8') as f:
24
- f.write(subtitle)
25
-
26
-
27
- def get_srt(segments):
28
- output = ""
29
- for i, segment in enumerate(segments):
30
- output += f"{i + 1}\n"
31
- output += f"{timeformat_srt(segment['start'])} --> {timeformat_srt(segment['end'])}\n"
32
- if segment['text'].startswith(' '):
33
- segment['text'] = segment['text'][1:]
34
- output += f"{segment['text']}\n\n"
35
- return output
36
-
37
-
38
- def get_vtt(segments):
39
- output = "WebVTT\n\n"
40
- for i, segment in enumerate(segments):
41
- output += f"{i + 1}\n"
42
- output += f"{timeformat_vtt(segment['start'])} --> {timeformat_vtt(segment['end'])}\n"
43
- if segment['text'].startswith(' '):
44
- segment['text'] = segment['text'][1:]
45
- output += f"{segment['text']}\n\n"
46
- return output
47
-
48
-
49
- def get_txt(segments):
50
- output = ""
51
- for i, segment in enumerate(segments):
52
- if segment['text'].startswith(' '):
53
- segment['text'] = segment['text'][1:]
54
- output += f"{segment['text']}\n"
55
- return output
56
-
57
-
58
- def parse_srt(file_path):
59
- """Reads SRT file and returns as dict"""
60
- with open(file_path, 'r', encoding='utf-8') as file:
61
- srt_data = file.read()
62
-
63
- data = []
64
- blocks = srt_data.split('\n\n')
65
-
66
- for block in blocks:
67
- if block.strip() != '':
68
- lines = block.strip().split('\n')
69
- index = lines[0]
70
- timestamp = lines[1]
71
- sentence = ' '.join(lines[2:])
72
-
73
- data.append({
74
- "index": index,
75
- "timestamp": timestamp,
76
- "sentence": sentence
77
- })
78
- return data
79
-
80
-
81
- def parse_vtt(file_path):
82
- """Reads WebVTT file and returns as dict"""
83
- with open(file_path, 'r', encoding='utf-8') as file:
84
- webvtt_data = file.read()
85
-
86
- data = []
87
- blocks = webvtt_data.split('\n\n')
88
-
89
- for block in blocks:
90
- if block.strip() != '' and not block.strip().startswith("WebVTT"):
91
- lines = block.strip().split('\n')
92
- index = lines[0]
93
- timestamp = lines[1]
94
- sentence = ' '.join(lines[2:])
95
-
96
- data.append({
97
- "index": index,
98
- "timestamp": timestamp,
99
- "sentence": sentence
100
- })
101
-
102
- return data
103
-
104
-
105
- def get_serialized_srt(dicts):
106
- output = ""
107
- for dic in dicts:
108
- output += f'{dic["index"]}\n'
109
- output += f'{dic["timestamp"]}\n'
110
- output += f'{dic["sentence"]}\n\n'
111
- return output
112
-
113
-
114
- def get_serialized_vtt(dicts):
115
- output = "WebVTT\n\n"
116
- for dic in dicts:
117
- output += f'{dic["index"]}\n'
118
- output += f'{dic["timestamp"]}\n'
119
- output += f'{dic["sentence"]}\n\n'
120
- return output
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
121
 
122
  @spaces.GPU(duration=120)
123
  def safe_filename(name):
 
1
+ # Ported from https://github.com/openai/whisper/blob/main/whisper/utils.py
2
+
3
+ import json
4
+ import os
5
  import re
6
+ import sys
7
+ import zlib
8
+ from typing import Callable, List, Optional, TextIO, Union, Dict, Tuple
9
+ from datetime import datetime
10
+
11
+ from modules.whisper.data_classes import Segment, Word
12
+ from .files_manager import read_file
13
 
14
  # Zero GPU
15
  import spaces
16
 
17
+ def format_timestamp(
18
+ seconds: float, always_include_hours: bool = True, decimal_marker: str = ","
19
+ ) -> str:
20
+ assert seconds >= 0, "non-negative timestamp expected"
21
+ milliseconds = round(seconds * 1000.0)
22
+
23
+ hours = milliseconds // 3_600_000
24
+ milliseconds -= hours * 3_600_000
25
+
26
+ minutes = milliseconds // 60_000
27
+ milliseconds -= minutes * 60_000
28
+
29
+ seconds = milliseconds // 1_000
30
+ milliseconds -= seconds * 1_000
31
+
32
+ hours_marker = f"{hours:02d}:" if always_include_hours or hours > 0 else ""
33
+ return (
34
+ f"{hours_marker}{minutes:02d}:{seconds:02d}{decimal_marker}{milliseconds:03d}"
35
+ )
36
+
37
+
38
+ def time_str_to_seconds(time_str: str, decimal_marker: str = ",") -> float:
39
+ times = time_str.split(":")
40
+
41
+ if len(times) == 3:
42
+ hours, minutes, rest = times
43
+ hours = int(hours)
44
+ else:
45
+ hours = 0
46
+ minutes, rest = times
47
+
48
+ seconds, fractional = rest.split(decimal_marker)
49
+
50
+ minutes = int(minutes)
51
+ seconds = int(seconds)
52
+ fractional_seconds = float("0." + fractional)
53
+
54
+ return hours * 3600 + minutes * 60 + seconds + fractional_seconds
55
+
56
+
57
+ def get_start(segments: List[dict]) -> Optional[float]:
58
+ return next(
59
+ (w["start"] for s in segments for w in s["words"]),
60
+ segments[0]["start"] if segments else None,
61
+ )
62
+
63
+
64
+ def get_end(segments: List[dict]) -> Optional[float]:
65
+ return next(
66
+ (w["end"] for s in reversed(segments) for w in reversed(s["words"])),
67
+ segments[-1]["end"] if segments else None,
68
+ )
69
+
70
+
71
+ class ResultWriter:
72
+ extension: str
73
+
74
+ def __init__(self, output_dir: str):
75
+ self.output_dir = output_dir
76
+
77
+ def __call__(
78
+ self, result: Union[dict, List[Segment]], output_file_name: str,
79
+ options: Optional[dict] = None, **kwargs
80
+ ):
81
+ if isinstance(result, List) and result and isinstance(result[0], Segment):
82
+ result = {"segments": [seg.model_dump() for seg in result]}
83
+
84
+ output_path = os.path.join(
85
+ self.output_dir, output_file_name + "." + self.extension
86
+ )
87
+
88
+ with open(output_path, "w", encoding="utf-8") as f:
89
+ self.write_result(result, file=f, options=options, **kwargs)
90
+
91
+ def write_result(
92
+ self, result: dict, file: TextIO, options: Optional[dict] = None, **kwargs
93
+ ):
94
+ raise NotImplementedError
95
+
96
+
97
+ class WriteTXT(ResultWriter):
98
+ extension: str = "txt"
99
+
100
+ def write_result(
101
+ self, result: Union[Dict, List[Segment]], file: TextIO, options: Optional[dict] = None, **kwargs
102
+ ):
103
+ for segment in result["segments"]:
104
+ print(segment["text"].strip(), file=file, flush=True)
105
+
106
+
107
+ class SubtitlesWriter(ResultWriter):
108
+ always_include_hours: bool
109
+ decimal_marker: str
110
+
111
+ def iterate_result(
112
+ self,
113
+ result: dict,
114
+ options: Optional[dict] = None,
115
+ *,
116
+ max_line_width: Optional[int] = None,
117
+ max_line_count: Optional[int] = None,
118
+ highlight_words: bool = False,
119
+ align_lrc_words: bool = False,
120
+ max_words_per_line: Optional[int] = None,
121
+ ):
122
+ options = options or {}
123
+ max_line_width = max_line_width or options.get("max_line_width")
124
+ max_line_count = max_line_count or options.get("max_line_count")
125
+ highlight_words = highlight_words or options.get("highlight_words", False)
126
+ align_lrc_words = align_lrc_words or options.get("align_lrc_words", False)
127
+ max_words_per_line = max_words_per_line or options.get("max_words_per_line")
128
+ preserve_segments = max_line_count is None or max_line_width is None
129
+ max_line_width = max_line_width or 1000
130
+ max_words_per_line = max_words_per_line or 1000
131
+
132
+ def iterate_subtitles():
133
+ line_len = 0
134
+ line_count = 1
135
+ # the next subtitle to yield (a list of word timings with whitespace)
136
+ subtitle: List[dict] = []
137
+ last: float = get_start(result["segments"]) or 0.0
138
+ for segment in result["segments"]:
139
+ chunk_index = 0
140
+ words_count = max_words_per_line
141
+ while chunk_index < len(segment["words"]):
142
+ remaining_words = len(segment["words"]) - chunk_index
143
+ if max_words_per_line > len(segment["words"]) - chunk_index:
144
+ words_count = remaining_words
145
+ for i, original_timing in enumerate(
146
+ segment["words"][chunk_index : chunk_index + words_count]
147
+ ):
148
+ timing = original_timing.copy()
149
+ long_pause = (
150
+ not preserve_segments and timing["start"] - last > 3.0
151
+ )
152
+ has_room = line_len + len(timing["word"]) <= max_line_width
153
+ seg_break = i == 0 and len(subtitle) > 0 and preserve_segments
154
+ if (
155
+ line_len > 0
156
+ and has_room
157
+ and not long_pause
158
+ and not seg_break
159
+ ):
160
+ # line continuation
161
+ line_len += len(timing["word"])
162
+ else:
163
+ # new line
164
+ timing["word"] = timing["word"].strip()
165
+ if (
166
+ len(subtitle) > 0
167
+ and max_line_count is not None
168
+ and (long_pause or line_count >= max_line_count)
169
+ or seg_break
170
+ ):
171
+ # subtitle break
172
+ yield subtitle
173
+ subtitle = []
174
+ line_count = 1
175
+ elif line_len > 0:
176
+ # line break
177
+ line_count += 1
178
+ timing["word"] = "\n" + timing["word"]
179
+ line_len = len(timing["word"].strip())
180
+ subtitle.append(timing)
181
+ last = timing["start"]
182
+ chunk_index += max_words_per_line
183
+ if len(subtitle) > 0:
184
+ yield subtitle
185
+
186
+ if len(result["segments"]) > 0 and "words" in result["segments"][0] and result["segments"][0]["words"]:
187
+ for subtitle in iterate_subtitles():
188
+ subtitle_start = self.format_timestamp(subtitle[0]["start"])
189
+ subtitle_end = self.format_timestamp(subtitle[-1]["end"])
190
+ subtitle_text = "".join([word["word"] for word in subtitle])
191
+ if highlight_words:
192
+ last = subtitle_start
193
+ all_words = [timing["word"] for timing in subtitle]
194
+ for i, this_word in enumerate(subtitle):
195
+ start = self.format_timestamp(this_word["start"])
196
+ end = self.format_timestamp(this_word["end"])
197
+ if last != start:
198
+ yield last, start, subtitle_text
199
+
200
+ yield start, end, "".join(
201
+ [
202
+ re.sub(r"^(\s*)(.*)$", r"\1<u>\2</u>", word)
203
+ if j == i
204
+ else word
205
+ for j, word in enumerate(all_words)
206
+ ]
207
+ )
208
+ last = end
209
+
210
+ if align_lrc_words:
211
+ lrc_aligned_words = [f"[{self.format_timestamp(sub['start'])}]{sub['word']}" for sub in subtitle]
212
+ l_start, l_end = self.format_timestamp(subtitle[-1]['start']), self.format_timestamp(subtitle[-1]['end'])
213
+ lrc_aligned_words[-1] = f"[{l_start}]{subtitle[-1]['word']}[{l_end}]"
214
+ lrc_aligned_words = ' '.join(lrc_aligned_words)
215
+ yield None, None, lrc_aligned_words
216
+
217
+ else:
218
+ yield subtitle_start, subtitle_end, subtitle_text
219
+ else:
220
+ for segment in result["segments"]:
221
+ segment_start = self.format_timestamp(segment["start"])
222
+ segment_end = self.format_timestamp(segment["end"])
223
+ segment_text = segment["text"].strip().replace("-->", "->")
224
+ yield segment_start, segment_end, segment_text
225
+
226
+ def format_timestamp(self, seconds: float):
227
+ return format_timestamp(
228
+ seconds=seconds,
229
+ always_include_hours=self.always_include_hours,
230
+ decimal_marker=self.decimal_marker,
231
+ )
232
+
233
+
234
+ class WriteVTT(SubtitlesWriter):
235
+ extension: str = "vtt"
236
+ always_include_hours: bool = False
237
+ decimal_marker: str = "."
238
+
239
+ def write_result(
240
+ self, result: dict, file: TextIO, options: Optional[dict] = None, **kwargs
241
+ ):
242
+ print("WEBVTT\n", file=file)
243
+ for start, end, text in self.iterate_result(result, options, **kwargs):
244
+ print(f"{start} --> {end}\n{text}\n", file=file, flush=True)
245
+
246
+ def to_segments(self, file_path: str) -> List[Segment]:
247
+ segments = []
248
+
249
+ blocks = read_file(file_path).split('\n\n')
250
+
251
+ for block in blocks:
252
+ if block.strip() != '' and not block.strip().startswith("WEBVTT"):
253
+ lines = block.strip().split('\n')
254
+ time_line = lines[0].split(" --> ")
255
+ start, end = time_str_to_seconds(time_line[0], self.decimal_marker), time_str_to_seconds(time_line[1], self.decimal_marker)
256
+ sentence = ' '.join(lines[1:])
257
+
258
+ segments.append(Segment(
259
+ start=start,
260
+ end=end,
261
+ text=sentence
262
+ ))
263
+
264
+ return segments
265
+
266
+
267
+ class WriteSRT(SubtitlesWriter):
268
+ extension: str = "srt"
269
+ always_include_hours: bool = True
270
+ decimal_marker: str = ","
271
+
272
+ def write_result(
273
+ self, result: dict, file: TextIO, options: Optional[dict] = None, **kwargs
274
+ ):
275
+ for i, (start, end, text) in enumerate(
276
+ self.iterate_result(result, options, **kwargs), start=1
277
+ ):
278
+ print(f"{i}\n{start} --> {end}\n{text}\n", file=file, flush=True)
279
+
280
+ def to_segments(self, file_path: str) -> List[Segment]:
281
+ segments = []
282
+
283
+ blocks = read_file(file_path).split('\n\n')
284
+
285
+ for block in blocks:
286
+ if block.strip() != '':
287
+ lines = block.strip().split('\n')
288
+ index = lines[0]
289
+ time_line = lines[1].split(" --> ")
290
+ start, end = time_str_to_seconds(time_line[0], self.decimal_marker), time_str_to_seconds(time_line[1], self.decimal_marker)
291
+ sentence = ' '.join(lines[2:])
292
+
293
+ segments.append(Segment(
294
+ start=start,
295
+ end=end,
296
+ text=sentence
297
+ ))
298
+
299
+ return segments
300
+
301
+
302
+ class WriteLRC(SubtitlesWriter):
303
+ extension: str = "lrc"
304
+ always_include_hours: bool = False
305
+ decimal_marker: str = "."
306
+
307
+ def write_result(
308
+ self, result: dict, file: TextIO, options: Optional[dict] = None, **kwargs
309
+ ):
310
+ for i, (start, end, text) in enumerate(
311
+ self.iterate_result(result, options, **kwargs), start=1
312
+ ):
313
+ if "align_lrc_words" in kwargs and kwargs["align_lrc_words"]:
314
+ print(f"{text}\n", file=file, flush=True)
315
+ else:
316
+ print(f"[{start}]{text}[{end}]\n", file=file, flush=True)
317
+
318
+ def to_segments(self, file_path: str) -> List[Segment]:
319
+ segments = []
320
+
321
+ blocks = read_file(file_path).split('\n')
322
+
323
+ for block in blocks:
324
+ if block.strip() != '':
325
+ lines = block.strip()
326
+ pattern = r'(\[.*?\])'
327
+ parts = re.split(pattern, lines)
328
+ parts = [part.strip() for part in parts if part]
329
+
330
+ for i, part in enumerate(parts):
331
+ sentence_i = i%2
332
+ if sentence_i == 1:
333
+ start_str, text, end_str = parts[sentence_i-1], parts[sentence_i], parts[sentence_i+1]
334
+ start_str, end_str = start_str.replace("[", "").replace("]", ""), end_str.replace("[", "").replace("]", "")
335
+ start, end = time_str_to_seconds(start_str, self.decimal_marker), time_str_to_seconds(end_str, self.decimal_marker)
336
+
337
+ segments.append(Segment(
338
+ start=start,
339
+ end=end,
340
+ text=text,
341
+ ))
342
+
343
+ return segments
344
+
345
+
346
+ class WriteTSV(ResultWriter):
347
+ """
348
+ Write a transcript to a file in TSV (tab-separated values) format containing lines like:
349
+ <start time in integer milliseconds>\t<end time in integer milliseconds>\t<transcript text>
350
+
351
+ Using integer milliseconds as start and end times means there's no chance of interference from
352
+ an environment setting a language encoding that causes the decimal in a floating point number
353
+ to appear as a comma; also is faster and more efficient to parse & store, e.g., in C++.
354
+ """
355
+
356
+ extension: str = "tsv"
357
+
358
+ def write_result(
359
+ self, result: dict, file: TextIO, options: Optional[dict] = None, **kwargs
360
+ ):
361
+ print("start", "end", "text", sep="\t", file=file)
362
+ for segment in result["segments"]:
363
+ print(round(1000 * segment["start"]), file=file, end="\t")
364
+ print(round(1000 * segment["end"]), file=file, end="\t")
365
+ print(segment["text"].strip().replace("\t", " "), file=file, flush=True)
366
+
367
+
368
+ class WriteJSON(ResultWriter):
369
+ extension: str = "json"
370
+
371
+ def write_result(
372
+ self, result: dict, file: TextIO, options: Optional[dict] = None, **kwargs
373
+ ):
374
+ json.dump(result, file)
375
+
376
+
377
+ def get_writer(
378
+ output_format: str, output_dir: str
379
+ ) -> Callable[[dict, TextIO, dict], None]:
380
+ output_format = output_format.strip().lower().replace(".", "")
381
+
382
+ writers = {
383
+ "txt": WriteTXT,
384
+ "vtt": WriteVTT,
385
+ "srt": WriteSRT,
386
+ "tsv": WriteTSV,
387
+ "json": WriteJSON,
388
+ "lrc": WriteLRC
389
+ }
390
+
391
+ if output_format == "all":
392
+ all_writers = [writer(output_dir) for writer in writers.values()]
393
+
394
+ def write_all(
395
+ result: dict, file: TextIO, options: Optional[dict] = None, **kwargs
396
+ ):
397
+ for writer in all_writers:
398
+ writer(result, file, options, **kwargs)
399
+
400
+ return write_all
401
+
402
+ return writers[output_format](output_dir)
403
+
404
+
405
+ def generate_file(
406
+ output_format: str, output_dir: str, result: Union[dict, List[Segment]], output_file_name: str,
407
+ add_timestamp: bool = True, **kwargs
408
+ ) -> Tuple[str, str]:
409
+ output_format = output_format.strip().lower().replace(".", "")
410
+ output_format = "vtt" if output_format == "webvtt" else output_format
411
+
412
+ if add_timestamp:
413
+ timestamp = datetime.now().strftime("%m%d%H%M%S")
414
+ output_file_name += f"-{timestamp}"
415
+
416
+ file_path = os.path.join(output_dir, f"{output_file_name}.{output_format}")
417
+ file_writer = get_writer(output_format=output_format, output_dir=output_dir)
418
+
419
+ if isinstance(file_writer, WriteLRC) and kwargs.get("highlight_words", False):
420
+ kwargs["highlight_words"], kwargs["align_lrc_words"] = False, True
421
+
422
+ file_writer(result=result, output_file_name=output_file_name, **kwargs)
423
+ content = read_file(file_path)
424
+ return content, file_path
425
 
426
  @spaces.GPU(duration=120)
427
  def safe_filename(name):
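The module is now largely ported from openai/whisper's writer classes: get_writer/generate_file plus the new to_segments parsers replace the old parse_srt/parse_vtt and get_serialized_* helpers. A minimal round-trip sketch with hypothetical paths and values (the output directory is assumed to exist):

from modules.whisper.data_classes import Segment
from modules.utils.subtitle_manager import generate_file, get_writer

segments = [Segment(start=0.0, end=2.5, text="Hello world")]

content, path = generate_file(
    output_format="srt",
    output_dir="outputs",      # assumed to exist
    result=segments,
    output_file_name="demo",
    add_timestamp=False,       # keeps the name deterministic: outputs/demo.srt
)

parsed = get_writer("srt", "outputs").to_segments(path)
print(parsed[0].start, parsed[0].text)   # 0.0 "Hello world"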
modules/vad/silero_vad.py CHANGED
@@ -5,7 +5,8 @@ import numpy as np
5
  from typing import BinaryIO, Union, List, Optional, Tuple
6
  import warnings
7
  import faster_whisper
8
- from faster_whisper.transcribe import SpeechTimestampsMap, Segment
 
9
  import gradio as gr
10
 
11
 
@@ -247,18 +248,18 @@ class SileroVAD:
247
 
248
  def restore_speech_timestamps(
249
  self,
250
- segments: List[dict],
251
  speech_chunks: List[dict],
252
  sampling_rate: Optional[int] = None,
253
- ) -> List[dict]:
254
  if sampling_rate is None:
255
  sampling_rate = self.sampling_rate
256
 
257
  ts_map = SpeechTimestampsMap(speech_chunks, sampling_rate)
258
 
259
  for segment in segments:
260
- segment["start"] = ts_map.get_original_time(segment["start"])
261
- segment["end"] = ts_map.get_original_time(segment["end"])
262
 
263
  return segments
264
 
 
5
  from typing import BinaryIO, Union, List, Optional, Tuple
6
  import warnings
7
  import faster_whisper
8
+ from modules.whisper.data_classes import *
9
+ from faster_whisper.transcribe import SpeechTimestampsMap
10
  import gradio as gr
11
 
12
 
 
248
 
249
  def restore_speech_timestamps(
250
  self,
251
+ segments: List[Segment],
252
  speech_chunks: List[dict],
253
  sampling_rate: Optional[int] = None,
254
+ ) -> List[Segment]:
255
  if sampling_rate is None:
256
  sampling_rate = self.sampling_rate
257
 
258
  ts_map = SpeechTimestampsMap(speech_chunks, sampling_rate)
259
 
260
  for segment in segments:
261
+ segment.start = ts_map.get_original_time(segment.start)
262
+ segment.end = ts_map.get_original_time(segment.end)
263
 
264
  return segments
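restore_speech_timestamps now mutates Segment attributes instead of dict keys. A sketch of the VAD round trip as the base pipeline uses it, with a hypothetical audio file and a stand-in transcription result; the default SileroVAD constructor is assumed:

from faster_whisper.vad import VadOptions
from modules.vad.silero_vad import SileroVAD
from modules.whisper.data_classes import Segment

vad = SileroVAD()
audio, speech_chunks = vad.run(
    audio="sample.wav",                       # hypothetical input file
    vad_parameters=VadOptions(threshold=0.5),
)

# Stand-in for the transcription step that would normally produce these:
segments = [Segment(start=0.0, end=1.0, text="hi")]

segments = vad.restore_speech_timestamps(
    segments=segments,
    speech_chunks=speech_chunks,
)
print(segments[0].start, segments[0].end)     # mapped back to the original timeline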
265
 
modules/whisper/{whisper_base.py → base_transcription_pipeline.py} RENAMED
@@ -1,6 +1,6 @@
1
  import os
2
- import torch
3
  import whisper
 
4
  import gradio as gr
5
  import torchaudio
6
  from abc import ABC, abstractmethod
@@ -8,20 +8,20 @@ from typing import BinaryIO, Union, Tuple, List
8
  import numpy as np
9
  from datetime import datetime
10
  from faster_whisper.vad import VadOptions
11
- from dataclasses import astuple
12
 
13
  from modules.uvr.music_separator import MusicSeparator
14
  from modules.utils.paths import (WHISPER_MODELS_DIR, DIARIZATION_MODELS_DIR, OUTPUT_DIR, DEFAULT_PARAMETERS_CONFIG_PATH,
15
  UVR_MODELS_DIR)
16
- from modules.utils.subtitle_manager import get_srt, get_vtt, get_txt, write_file, safe_filename
 
17
  from modules.utils.youtube_manager import get_ytdata, get_ytaudio
18
- from modules.utils.files_manager import get_media_files, format_gradio_files, load_yaml, save_yaml
19
- from modules.whisper.whisper_parameter import *
20
  from modules.diarize.diarizer import Diarizer
21
  from modules.vad.silero_vad import SileroVAD
22
 
23
 
24
- class WhisperBase(ABC):
25
  def __init__(self,
26
  model_dir: str = WHISPER_MODELS_DIR,
27
  diarization_model_dir: str = DIARIZATION_MODELS_DIR,
@@ -47,8 +47,8 @@ class WhisperBase(ABC):
47
  self.available_langs = sorted(list(whisper.tokenizer.LANGUAGES.values()))
48
  self.translatable_models = ["large", "large-v1", "large-v2", "large-v3"]
49
  self.device = self.get_device()
50
- self.available_compute_types = ["float16", "float32"]
51
- self.current_compute_type = "float16" if self.device == "cuda" else "float32"
52
 
53
  @abstractmethod
54
  def transcribe(self,
@@ -71,13 +71,15 @@ class WhisperBase(ABC):
71
  def run(self,
72
  audio: Union[str, BinaryIO, np.ndarray],
73
  progress: gr.Progress = gr.Progress(),
 
74
  add_timestamp: bool = True,
75
- *whisper_params,
76
- ) -> Tuple[List[dict], float]:
77
  """
78
  Run transcription with conditional pre-processing and post-processing.
79
  The VAD will be performed to remove noise from the audio input in pre-processing, if enabled.
80
  The diarization will be performed in post-processing, if enabled.
 
81
 
82
  Parameters
83
  ----------
@@ -85,40 +87,33 @@ class WhisperBase(ABC):
85
  Audio input. This can be file path or binary type.
86
  progress: gr.Progress
87
  Indicator to show progress directly in gradio.
 
 
88
  add_timestamp: bool
89
  Whether to add a timestamp at the end of the filename.
90
- *whisper_params: tuple
91
- Parameters related with whisper. This will be dealt with "WhisperParameters" data class
 
 
92
 
93
  Returns
94
  ----------
95
- segments_result: List[dict]
96
- list of dicts that includes start, end timestamps and transcribed text
97
  elapsed_time: float
98
  elapsed time for running
99
  """
100
- params = WhisperParameters.as_value(*whisper_params)
 
 
101
 
102
- self.cache_parameters(
103
- whisper_params=params,
104
- add_timestamp=add_timestamp
105
- )
106
-
107
- if params.lang is None:
108
- pass
109
- elif params.lang == "Automatic Detection":
110
- params.lang = None
111
- else:
112
- language_code_dict = {value: key for key, value in whisper.tokenizer.LANGUAGES.items()}
113
- params.lang = language_code_dict[params.lang]
114
-
115
- if params.is_bgm_separate:
116
  music, audio, _ = self.music_separator.separate(
117
  audio=audio,
118
- model_name=params.uvr_model_size,
119
- device=params.uvr_device,
120
- segment_size=params.uvr_segment_size,
121
- save_file=params.uvr_save_file,
122
  progress=progress
123
  )
124
 
@@ -130,47 +125,55 @@ class WhisperBase(ABC):
130
  origin_sample_rate = self.music_separator.audio_info.sample_rate
131
  audio = self.resample_audio(audio=audio, original_sample_rate=origin_sample_rate)
132
 
133
- if params.uvr_enable_offload:
134
  self.music_separator.offload()
135
 
136
- if params.vad_filter:
137
- # Explicit value set for float('inf') from gr.Number()
138
- if params.max_speech_duration_s is None or params.max_speech_duration_s >= 9999:
139
- params.max_speech_duration_s = float('inf')
140
-
141
  vad_options = VadOptions(
142
- threshold=params.threshold,
143
- min_speech_duration_ms=params.min_speech_duration_ms,
144
- max_speech_duration_s=params.max_speech_duration_s,
145
- min_silence_duration_ms=params.min_silence_duration_ms,
146
- speech_pad_ms=params.speech_pad_ms
147
  )
148
 
149
- audio, speech_chunks = self.vad.run(
150
  audio=audio,
151
  vad_parameters=vad_options,
152
  progress=progress
153
  )
154
 
 
 
 
 
 
155
  result, elapsed_time = self.transcribe(
156
  audio,
157
  progress,
158
- *astuple(params)
159
  )
160
 
161
- if params.vad_filter:
162
  result = self.vad.restore_speech_timestamps(
163
  segments=result,
164
  speech_chunks=speech_chunks,
165
  )
166
 
167
- if params.is_diarize:
168
  result, elapsed_time_diarization = self.diarizer.run(
169
  audio=audio,
170
- use_auth_token=params.hf_token,
171
  transcribed_result=result,
 
172
  )
173
  elapsed_time += elapsed_time_diarization
 
 
 
 
 
 
174
  return result, elapsed_time
175
 
176
  def transcribe_file(self,
@@ -179,8 +182,8 @@ class WhisperBase(ABC):
179
  file_format: str = "SRT",
180
  add_timestamp: bool = True,
181
  progress=gr.Progress(),
182
- *whisper_params,
183
- ) -> list:
184
  """
185
  Write subtitle file from Files
186
 
@@ -197,8 +200,8 @@ class WhisperBase(ABC):
197
  Boolean value from gr.Checkbox() that determines whether to add a timestamp at the end of the subtitle filename.
198
  progress: gr.Progress
199
  Indicator to show progress directly in gradio.
200
- *whisper_params: tuple
201
- Parameters related with whisper. This will be dealt with "WhisperParameters" data class
202
 
203
  Returns
204
  ----------
@@ -208,6 +211,11 @@ class WhisperBase(ABC):
208
  Output file path to return to gr.Files()
209
  """
210
  try:
 
 
 
 
 
211
  if input_folder_path:
212
  files = get_media_files(input_folder_path)
213
  if isinstance(files, str):
@@ -220,19 +228,21 @@ class WhisperBase(ABC):
220
  transcribed_segments, time_for_task = self.run(
221
  file,
222
  progress,
 
223
  add_timestamp,
224
- *whisper_params,
225
  )
226
 
227
  file_name, file_ext = os.path.splitext(os.path.basename(file))
228
- subtitle, file_path = self.generate_and_write_file(
229
- file_name=file_name,
230
- transcribed_segments=transcribed_segments,
 
 
231
  add_timestamp=add_timestamp,
232
- file_format=file_format,
233
- output_dir=self.output_dir
234
  )
235
- files_info[file_name] = {"subtitle": subtitle, "time_for_task": time_for_task, "path": file_path}
236
 
237
  total_result = ''
238
  total_time = 0
@@ -245,10 +255,11 @@ class WhisperBase(ABC):
245
  result_str = f"Done in {self.format_time(total_time)}! Subtitle is in the outputs folder.\n\n{total_result}"
246
  result_file_path = [info['path'] for info in files_info.values()]
247
 
248
- return [result_str, result_file_path]
249
 
250
  except Exception as e:
251
  print(f"Error transcribing file: {e}")
 
252
  finally:
253
  self.release_cuda_memory()
254
 
@@ -257,8 +268,8 @@ class WhisperBase(ABC):
257
  file_format: str = "SRT",
258
  add_timestamp: bool = True,
259
  progress=gr.Progress(),
260
- *whisper_params,
261
- ) -> list:
262
  """
263
  Write subtitle file from microphone
264
 
@@ -272,7 +283,7 @@ class WhisperBase(ABC):
272
  Boolean value from gr.Checkbox() that determines whether to add a timestamp at the end of the filename.
273
  progress: gr.Progress
274
  Indicator to show progress directly in gradio.
275
- *whisper_params: tuple
276
  Parameters related with whisper. This will be dealt with "WhisperParameters" data class
277
 
278
  Returns
@@ -283,27 +294,36 @@ class WhisperBase(ABC):
283
  Output file path to return to gr.Files()
284
  """
285
  try:
 
 
 
 
 
286
  progress(0, desc="Loading Audio..")
287
  transcribed_segments, time_for_task = self.run(
288
  mic_audio,
289
  progress,
 
290
  add_timestamp,
291
- *whisper_params,
292
  )
293
  progress(1, desc="Completed!")
294
 
295
- subtitle, result_file_path = self.generate_and_write_file(
296
- file_name="Mic",
297
- transcribed_segments=transcribed_segments,
 
 
 
298
  add_timestamp=add_timestamp,
299
- file_format=file_format,
300
- output_dir=self.output_dir
301
  )
302
 
303
  result_str = f"Done in {self.format_time(time_for_task)}! Subtitle file is in the outputs folder.\n\n{subtitle}"
304
- return [result_str, result_file_path]
305
  except Exception as e:
306
- print(f"Error transcribing file: {e}")
 
307
  finally:
308
  self.release_cuda_memory()
309
 
@@ -312,8 +332,8 @@ class WhisperBase(ABC):
312
  file_format: str = "SRT",
313
  add_timestamp: bool = True,
314
  progress=gr.Progress(),
315
- *whisper_params,
316
- ) -> list:
317
  """
318
  Write subtitle file from Youtube
319
 
@@ -327,7 +347,7 @@ class WhisperBase(ABC):
327
  Boolean value from gr.Checkbox() that determines whether to add a timestamp at the end of the filename.
328
  progress: gr.Progress
329
  Indicator to show progress directly in gradio.
330
- *whisper_params: tuple
331
  Parameters related with whisper. This will be dealt with "WhisperParameters" data class
332
 
333
  Returns
@@ -338,6 +358,11 @@ class WhisperBase(ABC):
338
  Output file path to return to gr.Files()
339
  """
340
  try:
 
 
 
 
 
341
  progress(0, desc="Loading Audio from Youtube..")
342
  yt = get_ytdata(youtube_link)
343
  audio = get_ytaudio(yt)
@@ -345,83 +370,49 @@ class WhisperBase(ABC):
345
  transcribed_segments, time_for_task = self.run(
346
  audio,
347
  progress,
 
348
  add_timestamp,
349
- *whisper_params,
350
  )
351
 
352
  progress(1, desc="Completed!")
353
 
354
  file_name = safe_filename(yt.title)
355
- subtitle, result_file_path = self.generate_and_write_file(
356
- file_name=file_name,
357
- transcribed_segments=transcribed_segments,
 
 
358
  add_timestamp=add_timestamp,
359
- file_format=file_format,
360
- output_dir=self.output_dir
361
  )
 
362
  result_str = f"Done in {self.format_time(time_for_task)}! Subtitle file is in the outputs folder.\n\n{subtitle}"
363
 
364
  if os.path.exists(audio):
365
  os.remove(audio)
366
 
367
- return [result_str, result_file_path]
368
 
369
  except Exception as e:
370
- print(f"Error transcribing file: {e}")
 
371
  finally:
372
  self.release_cuda_memory()
373
 
374
- @staticmethod
375
- def generate_and_write_file(file_name: str,
376
- transcribed_segments: list,
377
- add_timestamp: bool,
378
- file_format: str,
379
- output_dir: str
380
- ) -> str:
381
- """
382
- Writes subtitle file
383
-
384
- Parameters
385
- ----------
386
- file_name: str
387
- Output file name
388
- transcribed_segments: list
389
- Text segments transcribed from audio
390
- add_timestamp: bool
391
- Determines whether to add a timestamp to the end of the filename.
392
- file_format: str
393
- File format to write. Supported formats: [SRT, WebVTT, txt]
394
- output_dir: str
395
- Directory path of the output
396
-
397
- Returns
398
- ----------
399
- content: str
400
- Result of the transcription
401
- output_path: str
402
- output file path
403
- """
404
- if add_timestamp:
405
- timestamp = datetime.now().strftime("%m%d%H%M%S")
406
- output_path = os.path.join(output_dir, f"{file_name}-{timestamp}")
407
  else:
408
- output_path = os.path.join(output_dir, f"{file_name}")
409
-
410
- file_format = file_format.strip().lower()
411
- if file_format == "srt":
412
- content = get_srt(transcribed_segments)
413
- output_path += '.srt'
414
-
415
- elif file_format == "webvtt":
416
- content = get_vtt(transcribed_segments)
417
- output_path += '.vtt'
418
 
419
- elif file_format == "txt":
420
- content = get_txt(transcribed_segments)
421
- output_path += '.txt'
422
-
423
- write_file(content, output_path)
424
- return content, output_path
425
 
426
  @staticmethod
427
  def format_time(elapsed_time: float) -> str:
@@ -455,7 +446,7 @@ class WhisperBase(ABC):
455
  if torch.cuda.is_available():
456
  return "cuda"
457
  elif torch.backends.mps.is_available():
458
- if not WhisperBase.is_sparse_api_supported():
459
  # Device `SparseMPS` is not supported for now. See : https://github.com/pytorch/pytorch/issues/87886
460
  return "cpu"
461
  return "mps"
@@ -496,18 +487,65 @@ class WhisperBase(ABC):
496
  if file_path and os.path.exists(file_path):
497
  os.remove(file_path)
498
499
  @staticmethod
500
  def cache_parameters(
501
- whisper_params: WhisperValues,
502
- add_timestamp: bool
 
503
  ):
504
- """cache parameters to the yaml file"""
505
  cached_params = load_yaml(DEFAULT_PARAMETERS_CONFIG_PATH)
506
- cached_whisper_param = whisper_params.to_yaml()
507
- cached_yaml = {**cached_params, **cached_whisper_param}
 
508
  cached_yaml["whisper"]["add_timestamp"] = add_timestamp
 
 
 
 
 
 
 
 
 
 
 
 
 
 
509
 
510
- save_yaml(cached_yaml, DEFAULT_PARAMETERS_CONFIG_PATH)
 
511
 
512
  @staticmethod
513
  def resample_audio(audio: Union[str, np.ndarray],
 
1
  import os
 
2
  import whisper
3
+ import ctranslate2
4
  import gradio as gr
5
  import torchaudio
6
  from abc import ABC, abstractmethod
 
8
  import numpy as np
9
  from datetime import datetime
10
  from faster_whisper.vad import VadOptions
 
11
 
12
  from modules.uvr.music_separator import MusicSeparator
13
  from modules.utils.paths import (WHISPER_MODELS_DIR, DIARIZATION_MODELS_DIR, OUTPUT_DIR, DEFAULT_PARAMETERS_CONFIG_PATH,
14
  UVR_MODELS_DIR)
15
+ from modules.utils.constants import *
16
+ from modules.utils.subtitle_manager import *
17
  from modules.utils.youtube_manager import get_ytdata, get_ytaudio
18
+ from modules.utils.files_manager import get_media_files, format_gradio_files, load_yaml, save_yaml, read_file
19
+ from modules.whisper.data_classes import *
20
  from modules.diarize.diarizer import Diarizer
21
  from modules.vad.silero_vad import SileroVAD
22
 
23
 
24
+ class BaseTranscriptionPipeline(ABC):
25
  def __init__(self,
26
  model_dir: str = WHISPER_MODELS_DIR,
27
  diarization_model_dir: str = DIARIZATION_MODELS_DIR,
 
47
  self.available_langs = sorted(list(whisper.tokenizer.LANGUAGES.values()))
48
  self.translatable_models = ["large", "large-v1", "large-v2", "large-v3"]
49
  self.device = self.get_device()
50
+ self.available_compute_types = self.get_available_compute_type()
51
+ self.current_compute_type = self.get_compute_type()
52
 
53
  @abstractmethod
54
  def transcribe(self,
 
71
  def run(self,
72
  audio: Union[str, BinaryIO, np.ndarray],
73
  progress: gr.Progress = gr.Progress(),
74
+ file_format: str = "SRT",
75
  add_timestamp: bool = True,
76
+ *pipeline_params,
77
+ ) -> Tuple[List[Segment], float]:
78
  """
79
  Run transcription with conditional pre-processing and post-processing.
80
  The VAD will be performed to remove noise from the audio input in pre-processing, if enabled.
81
  The diarization will be performed in post-processing, if enabled.
82
+ Due to the integration with gradio, the parameters have to be specified with a `*` wildcard.
83
 
84
  Parameters
85
  ----------
 
87
  Audio input. This can be file path or binary type.
88
  progress: gr.Progress
89
  Indicator to show progress directly in gradio.
90
+ file_format: str
91
+ Subtitle file format, one of ["SRT", "WebVTT", "txt", "lrc"]
92
  add_timestamp: bool
93
  Whether to add a timestamp at the end of the filename.
94
+ *pipeline_params: tuple
95
+ Parameters for the transcription pipeline, handled by the "TranscriptionPipelineParams" data class.
96
+ They must be provided as a flat list with the * wildcard because of the Gradio integration.
97
+ See more info at : https://github.com/gradio-app/gradio/issues/2471
98
 
99
  Returns
100
  ----------
101
+ segments_result: List[Segment]
102
+ list of Segment that includes start, end timestamps and transcribed text
103
  elapsed_time: float
104
  elapsed time for running
105
  """
106
+ params = TranscriptionPipelineParams.from_list(list(pipeline_params))
107
+ params = self.validate_gradio_values(params)
108
+ bgm_params, vad_params, whisper_params, diarization_params = params.bgm_separation, params.vad, params.whisper, params.diarization
109
 
110
+ if bgm_params.is_separate_bgm:
 
 
 
 
 
 
 
 
 
 
 
 
 
111
  music, audio, _ = self.music_separator.separate(
112
  audio=audio,
113
+ model_name=bgm_params.model_size,
114
+ device=bgm_params.device,
115
+ segment_size=bgm_params.segment_size,
116
+ save_file=bgm_params.save_file,
117
  progress=progress
118
  )
119
 
 
125
  origin_sample_rate = self.music_separator.audio_info.sample_rate
126
  audio = self.resample_audio(audio=audio, original_sample_rate=origin_sample_rate)
127
 
128
+ if bgm_params.enable_offload:
129
  self.music_separator.offload()
130
 
131
+ if vad_params.vad_filter:
 
 
 
 
132
  vad_options = VadOptions(
133
+ threshold=vad_params.threshold,
134
+ min_speech_duration_ms=vad_params.min_speech_duration_ms,
135
+ max_speech_duration_s=vad_params.max_speech_duration_s,
136
+ min_silence_duration_ms=vad_params.min_silence_duration_ms,
137
+ speech_pad_ms=vad_params.speech_pad_ms
138
  )
139
 
140
+ vad_processed, speech_chunks = self.vad.run(
141
  audio=audio,
142
  vad_parameters=vad_options,
143
  progress=progress
144
  )
145
 
146
+ if vad_processed.size > 0:
147
+ audio = vad_processed
148
+ else:
149
+ vad_params.vad_filter = False
150
+
151
  result, elapsed_time = self.transcribe(
152
  audio,
153
  progress,
154
+ *whisper_params.to_list()
155
  )
156
 
157
+ if vad_params.vad_filter:
158
  result = self.vad.restore_speech_timestamps(
159
  segments=result,
160
  speech_chunks=speech_chunks,
161
  )
162
 
163
+ if diarization_params.is_diarize:
164
  result, elapsed_time_diarization = self.diarizer.run(
165
  audio=audio,
166
+ use_auth_token=diarization_params.hf_token,
167
  transcribed_result=result,
168
+ device=diarization_params.device
169
  )
170
  elapsed_time += elapsed_time_diarization
171
+
172
+ self.cache_parameters(
173
+ params=params,
174
+ file_format=file_format,
175
+ add_timestamp=add_timestamp
176
+ )
177
  return result, elapsed_time
178
 
179
  def transcribe_file(self,
 
182
  file_format: str = "SRT",
183
  add_timestamp: bool = True,
184
  progress=gr.Progress(),
185
+ *pipeline_params,
186
+ ) -> Tuple[str, List]:
187
  """
188
  Write subtitle file from Files
189
 
 
200
  Boolean value from gr.Checkbox() that determines whether to add a timestamp at the end of the subtitle filename.
201
  progress: gr.Progress
202
  Indicator to show progress directly in gradio.
203
+ *pipeline_params: tuple
204
+ Parameters for the transcription pipeline, handled by the "TranscriptionPipelineParams" data class
205
 
206
  Returns
207
  ----------
 
211
  Output file path to return to gr.Files()
212
  """
213
  try:
214
+ params = TranscriptionPipelineParams.from_list(list(pipeline_params))
215
+ writer_options = {
216
+ "highlight_words": True if params.whisper.word_timestamps else False
217
+ }
218
+
219
  if input_folder_path:
220
  files = get_media_files(input_folder_path)
221
  if isinstance(files, str):
 
228
  transcribed_segments, time_for_task = self.run(
229
  file,
230
  progress,
231
+ file_format,
232
  add_timestamp,
233
+ *pipeline_params,
234
  )
235
 
236
  file_name, file_ext = os.path.splitext(os.path.basename(file))
237
+ subtitle, file_path = generate_file(
238
+ output_dir=self.output_dir,
239
+ output_file_name=file_name,
240
+ output_format=file_format,
241
+ result=transcribed_segments,
242
  add_timestamp=add_timestamp,
243
+ **writer_options
 
244
  )
245
+ files_info[file_name] = {"subtitle": read_file(file_path), "time_for_task": time_for_task, "path": file_path}
246
 
247
  total_result = ''
248
  total_time = 0
 
255
  result_str = f"Done in {self.format_time(total_time)}! Subtitle is in the outputs folder.\n\n{total_result}"
256
  result_file_path = [info['path'] for info in files_info.values()]
257
 
258
+ return result_str, result_file_path
259
 
260
  except Exception as e:
261
  print(f"Error transcribing file: {e}")
262
+ raise
263
  finally:
264
  self.release_cuda_memory()
265
 
 
268
  file_format: str = "SRT",
269
  add_timestamp: bool = True,
270
  progress=gr.Progress(),
271
+ *pipeline_params,
272
+ ) -> Tuple[str, str]:
273
  """
274
  Write subtitle file from microphone
275
 
 
283
  Boolean value from gr.Checkbox() that determines whether to add a timestamp at the end of the filename.
284
  progress: gr.Progress
285
  Indicator to show progress directly in gradio.
286
+ *pipeline_params: tuple
287
  Parameters for the transcription pipeline, handled by the "TranscriptionPipelineParams" data class
288
 
289
  Returns
 
294
  Output file path to return to gr.Files()
295
  """
296
  try:
297
+ params = TranscriptionPipelineParams.from_list(list(pipeline_params))
298
+ writer_options = {
299
+ "highlight_words": True if params.whisper.word_timestamps else False
300
+ }
301
+
302
  progress(0, desc="Loading Audio..")
303
  transcribed_segments, time_for_task = self.run(
304
  mic_audio,
305
  progress,
306
+ file_format,
307
  add_timestamp,
308
+ *pipeline_params,
309
  )
310
  progress(1, desc="Completed!")
311
 
312
+ file_name = "Mic"
313
+ subtitle, file_path = generate_file(
314
+ output_dir=self.output_dir,
315
+ output_file_name=file_name,
316
+ output_format=file_format,
317
+ result=transcribed_segments,
318
  add_timestamp=add_timestamp,
319
+ **writer_options
 
320
  )
321
 
322
  result_str = f"Done in {self.format_time(time_for_task)}! Subtitle file is in the outputs folder.\n\n{subtitle}"
323
+ return result_str, file_path
324
  except Exception as e:
325
+ print(f"Error transcribing mic: {e}")
326
+ raise
327
  finally:
328
  self.release_cuda_memory()
329
 
 
332
  file_format: str = "SRT",
333
  add_timestamp: bool = True,
334
  progress=gr.Progress(),
335
+ *pipeline_params,
336
+ ) -> Tuple[str, str]:
337
  """
338
  Write subtitle file from Youtube
339
 
 
347
  Boolean value from gr.Checkbox() that determines whether to add a timestamp at the end of the filename.
348
  progress: gr.Progress
349
  Indicator to show progress directly in gradio.
350
+ *pipeline_params: tuple
351
  Parameters for the transcription pipeline, handled by the "TranscriptionPipelineParams" data class
352
 
353
  Returns
 
358
  Output file path to return to gr.Files()
359
  """
360
  try:
361
+ params = TranscriptionPipelineParams.from_list(list(pipeline_params))
362
+ writer_options = {
363
+ "highlight_words": True if params.whisper.word_timestamps else False
364
+ }
365
+
366
  progress(0, desc="Loading Audio from Youtube..")
367
  yt = get_ytdata(youtube_link)
368
  audio = get_ytaudio(yt)
 
370
  transcribed_segments, time_for_task = self.run(
371
  audio,
372
  progress,
373
+ file_format,
374
  add_timestamp,
375
+ *pipeline_params,
376
  )
377
 
378
  progress(1, desc="Completed!")
379
 
380
  file_name = safe_filename(yt.title)
381
+ subtitle, file_path = generate_file(
382
+ output_dir=self.output_dir,
383
+ output_file_name=file_name,
384
+ output_format=file_format,
385
+ result=transcribed_segments,
386
  add_timestamp=add_timestamp,
387
+ **writer_options
 
388
  )
389
+
390
  result_str = f"Done in {self.format_time(time_for_task)}! Subtitle file is in the outputs folder.\n\n{subtitle}"
391
 
392
  if os.path.exists(audio):
393
  os.remove(audio)
394
 
395
+ return result_str, file_path
396
 
397
  except Exception as e:
398
+ print(f"Error transcribing youtube: {e}")
399
+ raise
400
  finally:
401
  self.release_cuda_memory()
402
 
403
+ def get_compute_type(self):
404
+ if "float16" in self.available_compute_types:
405
+ return "float16"
406
+ if "float32" in self.available_compute_types:
407
+ return "float32"
408
  else:
409
+ return self.available_compute_types[0]
 
 
 
 
 
 
 
 
 
410
 
411
+ def get_available_compute_type(self):
412
+ if self.device == "cuda":
413
+ return list(ctranslate2.get_supported_compute_types("cuda"))
414
+ else:
415
+ return list(ctranslate2.get_supported_compute_types("cpu"))
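The two helpers above pick the default compute type from what ctranslate2 reports for the current device. A quick standalone illustration of that selection logic, not part of the commit, with the fallback order float16, then float32, then whatever the backend lists first:

import ctranslate2

def pick_default_compute_type(device: str) -> str:
    # Mirrors get_available_compute_type() + get_compute_type() above.
    available = list(ctranslate2.get_supported_compute_types("cuda" if device == "cuda" else "cpu"))
    if "float16" in available:
        return "float16"
    if "float32" in available:
        return "float32"
    return available[0]

print(pick_default_compute_type("cpu"))  # e.g. "float32" or "int8", depending on the build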
 
416
 
417
  @staticmethod
418
  def format_time(elapsed_time: float) -> str:
 
446
  if torch.cuda.is_available():
447
  return "cuda"
448
  elif torch.backends.mps.is_available():
449
+ if not BaseTranscriptionPipeline.is_sparse_api_supported():
450
  # Device `SparseMPS` is not supported for now. See : https://github.com/pytorch/pytorch/issues/87886
451
  return "cpu"
452
  return "mps"
 
487
  if file_path and os.path.exists(file_path):
488
  os.remove(file_path)
489
 
490
+ @staticmethod
491
+ def validate_gradio_values(params: TranscriptionPipelineParams):
492
+ """
493
+ Validate gradio specific values that can't be displayed as None in the UI.
494
+ Related issue : https://github.com/gradio-app/gradio/issues/8723
495
+ """
496
+ if params.whisper.lang is None:
497
+ pass
498
+ elif params.whisper.lang == AUTOMATIC_DETECTION:
499
+ params.whisper.lang = None
500
+ else:
501
+ language_code_dict = {value: key for key, value in whisper.tokenizer.LANGUAGES.items()}
502
+ params.whisper.lang = language_code_dict[params.whisper.lang]
503
+
504
+ if params.whisper.initial_prompt == GRADIO_NONE_STR:
505
+ params.whisper.initial_prompt = None
506
+ if params.whisper.prefix == GRADIO_NONE_STR:
507
+ params.whisper.prefix = None
508
+ if params.whisper.hotwords == GRADIO_NONE_STR:
509
+ params.whisper.hotwords = None
510
+ if params.whisper.max_new_tokens == GRADIO_NONE_NUMBER_MIN:
511
+ params.whisper.max_new_tokens = None
512
+ if params.whisper.hallucination_silence_threshold == GRADIO_NONE_NUMBER_MIN:
513
+ params.whisper.hallucination_silence_threshold = None
514
+ if params.whisper.language_detection_threshold == GRADIO_NONE_NUMBER_MIN:
515
+ params.whisper.language_detection_threshold = None
516
+ if params.vad.max_speech_duration_s == GRADIO_NONE_NUMBER_MAX:
517
+ params.vad.max_speech_duration_s = float('inf')
518
+ return params
519
+
520
  @staticmethod
521
  def cache_parameters(
522
+ params: TranscriptionPipelineParams,
523
+ file_format: str = "SRT",
524
+ add_timestamp: bool = True
525
  ):
526
+ """Cache parameters to the yaml file"""
527
  cached_params = load_yaml(DEFAULT_PARAMETERS_CONFIG_PATH)
528
+ param_to_cache = params.to_dict()
529
+
530
+ cached_yaml = {**cached_params, **param_to_cache}
531
  cached_yaml["whisper"]["add_timestamp"] = add_timestamp
532
+ cached_yaml["whisper"]["file_format"] = file_format
533
+
534
+ supress_token = cached_yaml["whisper"].get("suppress_tokens", None)
535
+ if supress_token and isinstance(supress_token, list):
536
+ cached_yaml["whisper"]["suppress_tokens"] = str(supress_token)
537
+
538
+ if cached_yaml["whisper"].get("lang", None) is None:
539
+ cached_yaml["whisper"]["lang"] = AUTOMATIC_DETECTION.unwrap()
540
+ else:
541
+ language_dict = whisper.tokenizer.LANGUAGES
542
+ cached_yaml["whisper"]["lang"] = language_dict[cached_yaml["whisper"]["lang"]]
543
+
544
+ if cached_yaml["vad"].get("max_speech_duration_s", float('inf')) == float('inf'):
545
+ cached_yaml["vad"]["max_speech_duration_s"] = GRADIO_NONE_NUMBER_MAX
546
 
547
+ if cached_yaml is not None and cached_yaml:
548
+ save_yaml(cached_yaml, DEFAULT_PARAMETERS_CONFIG_PATH)
549
 
550
  @staticmethod
551
  def resample_audio(audio: Union[str, np.ndarray],
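With this refactor, callers no longer pass a loose tuple of Whisper arguments; they flatten a TranscriptionPipelineParams and splat it into run(), which rebuilds the object with from_list() and normalizes Gradio sentinel values via validate_gradio_values() before caching them. A minimal usage sketch, assuming the repository modules are importable and using a hypothetical "sample.wav" path:

import gradio as gr
from modules.whisper.data_classes import TranscriptionPipelineParams, WhisperParams, VadParams
from modules.whisper.faster_whisper_inference import FasterWhisperInference

pipeline = FasterWhisperInference()

# Structured parameters, flattened exactly the way the Gradio UI passes them.
params = TranscriptionPipelineParams(
    whisper=WhisperParams(model_size="large-v2"),
    vad=VadParams(vad_filter=True),
)

segments, elapsed_time = pipeline.run(
    "sample.wav",        # audio file path (hypothetical)
    gr.Progress(),       # progress indicator
    "SRT",               # file_format, cached alongside the other parameters
    True,                # add_timestamp
    *params.to_list(),   # rebuilt inside run() via TranscriptionPipelineParams.from_list()
)
print(f"{len(segments)} segments in {elapsed_time:.1f}s")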
modules/whisper/data_classes.py ADDED
@@ -0,0 +1,608 @@
1
+ import faster_whisper.transcribe
2
+ import gradio as gr
3
+ import torch
4
+ from typing import Optional, Dict, List, Union, NamedTuple
5
+ from pydantic import BaseModel, Field, field_validator, ConfigDict
6
+ from gradio_i18n import Translate, gettext as _
7
+ from enum import Enum
8
+ from copy import deepcopy
9
+
10
+ import yaml
11
+
12
+ from modules.utils.constants import *
13
+
14
+
15
+ class WhisperImpl(Enum):
16
+ WHISPER = "whisper"
17
+ FASTER_WHISPER = "faster-whisper"
18
+ INSANELY_FAST_WHISPER = "insanely_fast_whisper"
19
+
20
+
21
+ class Segment(BaseModel):
22
+ id: Optional[int] = Field(default=None, description="Incremental id for the segment")
23
+ seek: Optional[int] = Field(default=None, description="Seek of the segment from chunked audio")
24
+ text: Optional[str] = Field(default=None, description="Transcription text of the segment")
25
+ start: Optional[float] = Field(default=None, description="Start time of the segment")
26
+ end: Optional[float] = Field(default=None, description="End time of the segment")
27
+ tokens: Optional[List[int]] = Field(default=None, description="List of token IDs")
28
+ temperature: Optional[float] = Field(default=None, description="Temperature used during the decoding process")
29
+ avg_logprob: Optional[float] = Field(default=None, description="Average log probability of the tokens")
30
+ compression_ratio: Optional[float] = Field(default=None, description="Compression ratio of the segment")
31
+ no_speech_prob: Optional[float] = Field(default=None, description="Probability that it's not speech")
32
+ words: Optional[List['Word']] = Field(default=None, description="List of words contained in the segment")
33
+
34
+ @classmethod
35
+ def from_faster_whisper(cls,
36
+ seg: faster_whisper.transcribe.Segment):
37
+ if seg.words is not None:
38
+ words = [
39
+ Word(
40
+ start=w.start,
41
+ end=w.end,
42
+ word=w.word,
43
+ probability=w.probability
44
+ ) for w in seg.words
45
+ ]
46
+ else:
47
+ words = None
48
+
49
+ return cls(
50
+ id=seg.id,
51
+ seek=seg.seek,
52
+ text=seg.text,
53
+ start=seg.start,
54
+ end=seg.end,
55
+ tokens=seg.tokens,
56
+ temperature=seg.temperature,
57
+ avg_logprob=seg.avg_logprob,
58
+ compression_ratio=seg.compression_ratio,
59
+ no_speech_prob=seg.no_speech_prob,
60
+ words=words
61
+ )
62
+
63
+
64
+ class Word(BaseModel):
65
+ start: Optional[float] = Field(default=None, description="Start time of the word")
66
+ end: Optional[float] = Field(default=None, description="End time of the word")
67
+ word: Optional[str] = Field(default=None, description="Word text")
68
+ probability: Optional[float] = Field(default=None, description="Probability of the word")
69
+
70
+
71
+ class BaseParams(BaseModel):
72
+ model_config = ConfigDict(protected_namespaces=())
73
+
74
+ def to_dict(self) -> Dict:
75
+ return self.model_dump()
76
+
77
+ def to_list(self) -> List:
78
+ return list(self.model_dump().values())
79
+
80
+ @classmethod
81
+ def from_list(cls, data_list: List) -> 'BaseParams':
82
+ field_names = list(cls.model_fields.keys())
83
+ return cls(**dict(zip(field_names, data_list)))
84
+
85
+
86
+ class VadParams(BaseParams):
87
+ """Voice Activity Detection parameters"""
88
+ vad_filter: bool = Field(default=False, description="Enable voice activity detection to filter out non-speech parts")
89
+ threshold: float = Field(
90
+ default=0.5,
91
+ ge=0.0,
92
+ le=1.0,
93
+ description="Speech threshold for Silero VAD. Probabilities above this value are considered speech"
94
+ )
95
+ min_speech_duration_ms: int = Field(
96
+ default=250,
97
+ ge=0,
98
+ description="Final speech chunks shorter than this are discarded"
99
+ )
100
+ max_speech_duration_s: float = Field(
101
+ default=float("inf"),
102
+ gt=0,
103
+ description="Maximum duration of speech chunks in seconds"
104
+ )
105
+ min_silence_duration_ms: int = Field(
106
+ default=2000,
107
+ ge=0,
108
+ description="Minimum silence duration between speech chunks"
109
+ )
110
+ speech_pad_ms: int = Field(
111
+ default=400,
112
+ ge=0,
113
+ description="Padding added to each side of speech chunks"
114
+ )
115
+
116
+ @classmethod
117
+ def to_gradio_inputs(cls, defaults: Optional[Dict] = None) -> List[gr.components.base.FormComponent]:
118
+ return [
119
+ gr.Checkbox(
120
+ label=_("Enable Silero VAD Filter"),
121
+ value=defaults.get("vad_filter", cls.__fields__["vad_filter"].default),
122
+ interactive=True,
123
+ info=_("Enable this to transcribe only detected voice")
124
+ ),
125
+ gr.Slider(
126
+ minimum=0.0, maximum=1.0, step=0.01, label="Speech Threshold",
127
+ value=defaults.get("threshold", cls.__fields__["threshold"].default),
128
+ info="Lower it to be more sensitive to small sounds."
129
+ ),
130
+ gr.Number(
131
+ label="Minimum Speech Duration (ms)", precision=0,
132
+ value=defaults.get("min_speech_duration_ms", cls.__fields__["min_speech_duration_ms"].default),
133
+ info="Final speech chunks shorter than this time are thrown out"
134
+ ),
135
+ gr.Number(
136
+ label="Maximum Speech Duration (s)",
137
+ value=defaults.get("max_speech_duration_s", GRADIO_NONE_NUMBER_MAX),
138
+ info="Maximum duration of speech chunks in \"seconds\"."
139
+ ),
140
+ gr.Number(
141
+ label="Minimum Silence Duration (ms)", precision=0,
142
+ value=defaults.get("min_silence_duration_ms", cls.__fields__["min_silence_duration_ms"].default),
143
+ info="In the end of each speech chunk wait for this time before separating it"
144
+ ),
145
+ gr.Number(
146
+ label="Speech Padding (ms)", precision=0,
147
+ value=defaults.get("speech_pad_ms", cls.__fields__["speech_pad_ms"].default),
148
+ info="Final speech chunks are padded by this time each side"
149
+ )
150
+ ]
151
+
152
+
153
+ class DiarizationParams(BaseParams):
154
+ """Speaker diarization parameters"""
155
+ is_diarize: bool = Field(default=False, description="Enable speaker diarization")
156
+ device: str = Field(default="cuda", description="Device to run Diarization model.")
157
+ hf_token: str = Field(
158
+ default="",
159
+ description="Hugging Face token for downloading diarization models"
160
+ )
161
+
162
+ @classmethod
163
+ def to_gradio_inputs(cls,
164
+ defaults: Optional[Dict] = None,
165
+ available_devices: Optional[List] = None,
166
+ device: Optional[str] = None) -> List[gr.components.base.FormComponent]:
167
+ return [
168
+ gr.Checkbox(
169
+ label=_("Enable Diarization"),
170
+ value=defaults.get("is_diarize", cls.__fields__["is_diarize"].default),
171
+ ),
172
+ gr.Dropdown(
173
+ label=_("Device"),
174
+ choices=["cpu", "cuda"] if available_devices is None else available_devices,
175
+ value=defaults.get("device", device),
176
+ ),
177
+ gr.Textbox(
178
+ label=_("HuggingFace Token"),
179
+ value=defaults.get("hf_token", cls.__fields__["hf_token"].default),
180
+ info=_("This is only needed the first time you download the model")
181
+ ),
182
+ ]
183
+
184
+
185
+ class BGMSeparationParams(BaseParams):
186
+ """Background music separation parameters"""
187
+ is_separate_bgm: bool = Field(default=False, description="Enable background music separation")
188
+ model_size: str = Field(
189
+ default="UVR-MDX-NET-Inst_HQ_4",
190
+ description="UVR model size"
191
+ )
192
+ device: str = Field(default="cuda", description="Device to run UVR model.")
193
+ segment_size: int = Field(
194
+ default=256,
195
+ gt=0,
196
+ description="Segment size for UVR model"
197
+ )
198
+ save_file: bool = Field(
199
+ default=False,
200
+ description="Whether to save separated audio files"
201
+ )
202
+ enable_offload: bool = Field(
203
+ default=True,
204
+ description="Offload UVR model after transcription"
205
+ )
206
+
207
+ @classmethod
208
+ def to_gradio_input(cls,
209
+ defaults: Optional[Dict] = None,
210
+ available_devices: Optional[List] = None,
211
+ device: Optional[str] = None,
212
+ available_models: Optional[List] = None) -> List[gr.components.base.FormComponent]:
213
+ return [
214
+ gr.Checkbox(
215
+ label=_("Enable Background Music Remover Filter"),
216
+ value=defaults.get("is_separate_bgm", cls.__fields__["is_separate_bgm"].default),
217
+ interactive=True,
218
+ info=_("Enabling this will remove background music")
219
+ ),
220
+ gr.Dropdown(
221
+ label=_("Model"),
222
+ choices=["UVR-MDX-NET-Inst_HQ_4",
223
+ "UVR-MDX-NET-Inst_3"] if available_models is None else available_models,
224
+ value=defaults.get("model_size", cls.__fields__["model_size"].default),
225
+ ),
226
+ gr.Dropdown(
227
+ label=_("Device"),
228
+ choices=["cpu", "cuda"] if available_devices is None else available_devices,
229
+ value=defaults.get("device", device),
230
+ ),
231
+ gr.Number(
232
+ label="Segment Size",
233
+ value=defaults.get("segment_size", cls.__fields__["segment_size"].default),
234
+ precision=0,
235
+ info="Segment size for UVR model"
236
+ ),
237
+ gr.Checkbox(
238
+ label=_("Save separated files to output"),
239
+ value=defaults.get("save_file", cls.__fields__["save_file"].default),
240
+ ),
241
+ gr.Checkbox(
242
+ label=_("Offload sub model after removing background music"),
243
+ value=defaults.get("enable_offload", cls.__fields__["enable_offload"].default),
244
+ )
245
+ ]
246
+
247
+
248
+ class WhisperParams(BaseParams):
249
+ """Whisper parameters"""
250
+ model_size: str = Field(default="large-v2", description="Whisper model size")
251
+ lang: Optional[str] = Field(default=None, description="Source language of the file to transcribe")
252
+ is_translate: bool = Field(default=False, description="Translate speech to English end-to-end")
253
+ beam_size: int = Field(default=5, ge=1, description="Beam size for decoding")
254
+ log_prob_threshold: float = Field(
255
+ default=-1.0,
256
+ description="Threshold for average log probability of sampled tokens"
257
+ )
258
+ no_speech_threshold: float = Field(
259
+ default=0.6,
260
+ ge=0.0,
261
+ le=1.0,
262
+ description="Threshold for detecting silence"
263
+ )
264
+ compute_type: str = Field(default="float16", description="Computation type for transcription")
265
+ best_of: int = Field(default=5, ge=1, description="Number of candidates when sampling")
266
+ patience: float = Field(default=1.0, gt=0, description="Beam search patience factor")
267
+ condition_on_previous_text: bool = Field(
268
+ default=True,
269
+ description="Use previous output as prompt for next window"
270
+ )
271
+ prompt_reset_on_temperature: float = Field(
272
+ default=0.5,
273
+ ge=0.0,
274
+ le=1.0,
275
+ description="Temperature threshold for resetting prompt"
276
+ )
277
+ initial_prompt: Optional[str] = Field(default=None, description="Initial prompt for first window")
278
+ temperature: float = Field(
279
+ default=0.0,
280
+ ge=0.0,
281
+ description="Temperature for sampling"
282
+ )
283
+ compression_ratio_threshold: float = Field(
284
+ default=2.4,
285
+ gt=0,
286
+ description="Threshold for gzip compression ratio"
287
+ )
288
+ length_penalty: float = Field(default=1.0, gt=0, description="Exponential length penalty")
289
+ repetition_penalty: float = Field(default=1.0, gt=0, description="Penalty for repeated tokens")
290
+ no_repeat_ngram_size: int = Field(default=0, ge=0, description="Size of n-grams to prevent repetition")
291
+ prefix: Optional[str] = Field(default=None, description="Prefix text for first window")
292
+ suppress_blank: bool = Field(
293
+ default=True,
294
+ description="Suppress blank outputs at start of sampling"
295
+ )
296
+ suppress_tokens: Optional[Union[List[int], str]] = Field(default=[-1], description="Token IDs to suppress")
297
+ max_initial_timestamp: float = Field(
298
+ default=1.0,
299
+ ge=0.0,
300
+ description="Maximum initial timestamp"
301
+ )
302
+ word_timestamps: bool = Field(default=False, description="Extract word-level timestamps")
303
+ prepend_punctuations: Optional[str] = Field(
304
+ default="\"'“¿([{-",
305
+ description="Punctuations to merge with next word"
306
+ )
307
+ append_punctuations: Optional[str] = Field(
308
+ default="\"'.。,,!!??::”)]}、",
309
+ description="Punctuations to merge with previous word"
310
+ )
311
+ max_new_tokens: Optional[int] = Field(default=None, description="Maximum number of new tokens per chunk")
312
+ chunk_length: Optional[int] = Field(default=30, description="Length of audio segments in seconds")
313
+ hallucination_silence_threshold: Optional[float] = Field(
314
+ default=None,
315
+ description="Threshold for skipping silent periods in hallucination detection"
316
+ )
317
+ hotwords: Optional[str] = Field(default=None, description="Hotwords/hint phrases for the model")
318
+ language_detection_threshold: Optional[float] = Field(
319
+ default=None,
320
+ description="Threshold for language detection probability"
321
+ )
322
+ language_detection_segments: int = Field(
323
+ default=1,
324
+ gt=0,
325
+ description="Number of segments for language detection"
326
+ )
327
+ batch_size: int = Field(default=24, gt=0, description="Batch size for processing")
328
+
329
+ @field_validator('lang')
330
+ def validate_lang(cls, v):
331
+ from modules.utils.constants import AUTOMATIC_DETECTION
332
+ return None if v == AUTOMATIC_DETECTION.unwrap() else v
333
+
334
+ @field_validator('suppress_tokens')
335
+ def validate_supress_tokens(cls, v):
336
+ import ast
337
+ try:
338
+ if isinstance(v, str):
339
+ suppress_tokens = ast.literal_eval(v)
340
+ if not isinstance(suppress_tokens, list):
341
+ raise ValueError("Invalid Suppress Tokens. The value must be type of List[int]")
342
+ return suppress_tokens
343
+ if isinstance(v, list):
344
+ return v
345
+ except Exception as e:
346
+ raise ValueError(f"Invalid Suppress Tokens. The value must be type of List[int]: {e}")
347
+
348
+ @classmethod
349
+ def to_gradio_inputs(cls,
350
+ defaults: Optional[Dict] = None,
351
+ only_advanced: Optional[bool] = True,
352
+ whisper_type: Optional[str] = None,
353
+ available_models: Optional[List] = None,
354
+ available_langs: Optional[List] = None,
355
+ available_compute_types: Optional[List] = None,
356
+ compute_type: Optional[str] = None):
357
+ whisper_type = WhisperImpl.FASTER_WHISPER.value if whisper_type is None else whisper_type.strip().lower()
358
+
359
+ inputs = []
360
+ if not only_advanced:
361
+ inputs += [
362
+ gr.Dropdown(
363
+ label=_("Model"),
364
+ choices=available_models,
365
+ value=defaults.get("model_size", cls.__fields__["model_size"].default),
366
+ ),
367
+ gr.Dropdown(
368
+ label=_("Language"),
369
+ choices=available_langs,
370
+ value=defaults.get("lang", AUTOMATIC_DETECTION),
371
+ ),
372
+ gr.Checkbox(
373
+ label=_("Translate to English?"),
374
+ value=defaults.get("is_translate", cls.__fields__["is_translate"].default),
375
+ ),
376
+ ]
377
+
378
+ inputs += [
379
+ gr.Number(
380
+ label="Beam Size",
381
+ value=defaults.get("beam_size", cls.__fields__["beam_size"].default),
382
+ precision=0,
383
+ info="Beam size for decoding"
384
+ ),
385
+ gr.Number(
386
+ label="Log Probability Threshold",
387
+ value=defaults.get("log_prob_threshold", cls.__fields__["log_prob_threshold"].default),
388
+ info="Threshold for average log probability of sampled tokens"
389
+ ),
390
+ gr.Number(
391
+ label="No Speech Threshold",
392
+ value=defaults.get("no_speech_threshold", cls.__fields__["no_speech_threshold"].default),
393
+ info="Threshold for detecting silence"
394
+ ),
395
+ gr.Dropdown(
396
+ label="Compute Type",
397
+ choices=["float16", "int8", "int16"] if available_compute_types is None else available_compute_types,
398
+ value=defaults.get("compute_type", compute_type),
399
+ info="Computation type for transcription"
400
+ ),
401
+ gr.Number(
402
+ label="Best Of",
403
+ value=defaults.get("best_of", cls.__fields__["best_of"].default),
404
+ precision=0,
405
+ info="Number of candidates when sampling"
406
+ ),
407
+ gr.Number(
408
+ label="Patience",
409
+ value=defaults.get("patience", cls.__fields__["patience"].default),
410
+ info="Beam search patience factor"
411
+ ),
412
+ gr.Checkbox(
413
+ label="Condition On Previous Text",
414
+ value=defaults.get("condition_on_previous_text", cls.__fields__["condition_on_previous_text"].default),
415
+ info="Use previous output as prompt for next window"
416
+ ),
417
+ gr.Slider(
418
+ label="Prompt Reset On Temperature",
419
+ value=defaults.get("prompt_reset_on_temperature",
420
+ cls.__fields__["prompt_reset_on_temperature"].default),
421
+ minimum=0,
422
+ maximum=1,
423
+ step=0.01,
424
+ info="Temperature threshold for resetting prompt"
425
+ ),
426
+ gr.Textbox(
427
+ label="Initial Prompt",
428
+ value=defaults.get("initial_prompt", GRADIO_NONE_STR),
429
+ info="Initial prompt for first window"
430
+ ),
431
+ gr.Slider(
432
+ label="Temperature",
433
+ value=defaults.get("temperature", cls.__fields__["temperature"].default),
434
+ minimum=0.0,
435
+ step=0.01,
436
+ maximum=1.0,
437
+ info="Temperature for sampling"
438
+ ),
439
+ gr.Number(
440
+ label="Compression Ratio Threshold",
441
+ value=defaults.get("compression_ratio_threshold",
442
+ cls.__fields__["compression_ratio_threshold"].default),
443
+ info="Threshold for gzip compression ratio"
444
+ )
445
+ ]
446
+
447
+ faster_whisper_inputs = [
448
+ gr.Number(
449
+ label="Length Penalty",
450
+ value=defaults.get("length_penalty", cls.__fields__["length_penalty"].default),
451
+ info="Exponential length penalty",
452
+ ),
453
+ gr.Number(
454
+ label="Repetition Penalty",
455
+ value=defaults.get("repetition_penalty", cls.__fields__["repetition_penalty"].default),
456
+ info="Penalty for repeated tokens"
457
+ ),
458
+ gr.Number(
459
+ label="No Repeat N-gram Size",
460
+ value=defaults.get("no_repeat_ngram_size", cls.__fields__["no_repeat_ngram_size"].default),
461
+ precision=0,
462
+ info="Size of n-grams to prevent repetition"
463
+ ),
464
+ gr.Textbox(
465
+ label="Prefix",
466
+ value=defaults.get("prefix", GRADIO_NONE_STR),
467
+ info="Prefix text for first window"
468
+ ),
469
+ gr.Checkbox(
470
+ label="Suppress Blank",
471
+ value=defaults.get("suppress_blank", cls.__fields__["suppress_blank"].default),
472
+ info="Suppress blank outputs at start of sampling"
473
+ ),
474
+ gr.Textbox(
475
+ label="Suppress Tokens",
476
+ value=defaults.get("suppress_tokens", "[-1]"),
477
+ info="Token IDs to suppress"
478
+ ),
479
+ gr.Number(
480
+ label="Max Initial Timestamp",
481
+ value=defaults.get("max_initial_timestamp", cls.__fields__["max_initial_timestamp"].default),
482
+ info="Maximum initial timestamp"
483
+ ),
484
+ gr.Checkbox(
485
+ label="Word Timestamps",
486
+ value=defaults.get("word_timestamps", cls.__fields__["word_timestamps"].default),
487
+ info="Extract word-level timestamps"
488
+ ),
489
+ gr.Textbox(
490
+ label="Prepend Punctuations",
491
+ value=defaults.get("prepend_punctuations", cls.__fields__["prepend_punctuations"].default),
492
+ info="Punctuations to merge with next word"
493
+ ),
494
+ gr.Textbox(
495
+ label="Append Punctuations",
496
+ value=defaults.get("append_punctuations", cls.__fields__["append_punctuations"].default),
497
+ info="Punctuations to merge with previous word"
498
+ ),
499
+ gr.Number(
500
+ label="Max New Tokens",
501
+ value=defaults.get("max_new_tokens", GRADIO_NONE_NUMBER_MIN),
502
+ precision=0,
503
+ info="Maximum number of new tokens per chunk"
504
+ ),
505
+ gr.Number(
506
+ label="Chunk Length (s)",
507
+ value=defaults.get("chunk_length", cls.__fields__["chunk_length"].default),
508
+ precision=0,
509
+ info="Length of audio segments in seconds"
510
+ ),
511
+ gr.Number(
512
+ label="Hallucination Silence Threshold (sec)",
513
+ value=defaults.get("hallucination_silence_threshold",
514
+ GRADIO_NONE_NUMBER_MIN),
515
+ info="Threshold for skipping silent periods in hallucination detection"
516
+ ),
517
+ gr.Textbox(
518
+ label="Hotwords",
519
+ value=defaults.get("hotwords", cls.__fields__["hotwords"].default),
520
+ info="Hotwords/hint phrases for the model"
521
+ ),
522
+ gr.Number(
523
+ label="Language Detection Threshold",
524
+ value=defaults.get("language_detection_threshold",
525
+ GRADIO_NONE_NUMBER_MIN),
526
+ info="Threshold for language detection probability"
527
+ ),
528
+ gr.Number(
529
+ label="Language Detection Segments",
530
+ value=defaults.get("language_detection_segments",
531
+ cls.__fields__["language_detection_segments"].default),
532
+ precision=0,
533
+ info="Number of segments for language detection"
534
+ )
535
+ ]
536
+
537
+ insanely_fast_whisper_inputs = [
538
+ gr.Number(
539
+ label="Batch Size",
540
+ value=defaults.get("batch_size", cls.__fields__["batch_size"].default),
541
+ precision=0,
542
+ info="Batch size for processing"
543
+ )
544
+ ]
545
+
546
+ if whisper_type != WhisperImpl.FASTER_WHISPER.value:
547
+ for input_component in faster_whisper_inputs:
548
+ input_component.visible = False
549
+
550
+ if whisper_type != WhisperImpl.INSANELY_FAST_WHISPER.value:
551
+ for input_component in insanely_fast_whisper_inputs:
552
+ input_component.visible = False
553
+
554
+ inputs += faster_whisper_inputs + insanely_fast_whisper_inputs
555
+
556
+ return inputs
557
+
558
+
559
+ class TranscriptionPipelineParams(BaseModel):
560
+ """Transcription pipeline parameters"""
561
+ whisper: WhisperParams = Field(default_factory=WhisperParams)
562
+ vad: VadParams = Field(default_factory=VadParams)
563
+ diarization: DiarizationParams = Field(default_factory=DiarizationParams)
564
+ bgm_separation: BGMSeparationParams = Field(default_factory=BGMSeparationParams)
565
+
566
+ def to_dict(self) -> Dict:
567
+ data = {
568
+ "whisper": self.whisper.to_dict(),
569
+ "vad": self.vad.to_dict(),
570
+ "diarization": self.diarization.to_dict(),
571
+ "bgm_separation": self.bgm_separation.to_dict()
572
+ }
573
+ return data
574
+
575
+ def to_list(self) -> List:
576
+ """
577
+ Convert the data class to a flat list, because the parameters have to be passed to Gradio as a list.
578
+ Related Gradio issue: https://github.com/gradio-app/gradio/issues/2471
579
+ See more about Gradio pre-processing: https://www.gradio.app/docs/components
580
+ """
581
+ whisper_list = self.whisper.to_list()
582
+ vad_list = self.vad.to_list()
583
+ diarization_list = self.diarization.to_list()
584
+ bgm_sep_list = self.bgm_separation.to_list()
585
+ return whisper_list + vad_list + diarization_list + bgm_sep_list
586
+
587
+ @staticmethod
588
+ def from_list(pipeline_list: List) -> 'TranscriptionPipelineParams':
589
+ """Convert list to the data class again to use it in a function."""
590
+ data_list = deepcopy(pipeline_list)
591
+
592
+ whisper_list = data_list[0:len(WhisperParams.__annotations__)]
593
+ data_list = data_list[len(WhisperParams.__annotations__):]
594
+
595
+ vad_list = data_list[0:len(VadParams.__annotations__)]
596
+ data_list = data_list[len(VadParams.__annotations__):]
597
+
598
+ diarization_list = data_list[0:len(DiarizationParams.__annotations__)]
599
+ data_list = data_list[len(DiarizationParams.__annotations__):]
600
+
601
+ bgm_sep_list = data_list[0:len(BGMSeparationParams.__annotations__)]
602
+
603
+ return TranscriptionPipelineParams(
604
+ whisper=WhisperParams.from_list(whisper_list),
605
+ vad=VadParams.from_list(vad_list),
606
+ diarization=DiarizationParams.from_list(diarization_list),
607
+ bgm_separation=BGMSeparationParams.from_list(bgm_sep_list)
608
+ )
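The flat-list round trip is the contract the rest of the refactor relies on: to_list() emits fields in declaration order, and from_list() slices the combined list back into the four sub-models by their field counts. A small sketch of that round trip, assuming the repository is on PYTHONPATH:

from modules.whisper.data_classes import (
    TranscriptionPipelineParams, WhisperParams, VadParams,
    DiarizationParams, BGMSeparationParams,
)

params = TranscriptionPipelineParams(
    whisper=WhisperParams(model_size="medium", beam_size=3),
    vad=VadParams(vad_filter=True, threshold=0.6),
)

# whisper fields first, then vad, diarization and bgm_separation fields.
flat = params.to_list()
expected_length = sum(
    len(cls.model_fields)
    for cls in (WhisperParams, VadParams, DiarizationParams, BGMSeparationParams)
)
assert len(flat) == expected_length

# Slicing the flat list back restores the same values.
restored = TranscriptionPipelineParams.from_list(flat)
assert restored.whisper.beam_size == 3
assert restored.vad.threshold == 0.6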
modules/whisper/faster_whisper_inference.py CHANGED
@@ -12,11 +12,11 @@ import gradio as gr
12
  from argparse import Namespace
13
 
14
  from modules.utils.paths import (FASTER_WHISPER_MODELS_DIR, DIARIZATION_MODELS_DIR, UVR_MODELS_DIR, OUTPUT_DIR)
15
- from modules.whisper.whisper_parameter import *
16
- from modules.whisper.whisper_base import WhisperBase
17
 
18
 
19
- class FasterWhisperInference(WhisperBase):
20
  def __init__(self,
21
  model_dir: str = FASTER_WHISPER_MODELS_DIR,
22
  diarization_model_dir: str = DIARIZATION_MODELS_DIR,
@@ -35,14 +35,12 @@ class FasterWhisperInference(WhisperBase):
35
  self.model_paths = self.get_model_paths()
36
  self.device = self.get_device()
37
  self.available_models = self.model_paths.keys()
38
- self.available_compute_types = ctranslate2.get_supported_compute_types(
39
- "cuda") if self.device == "cuda" else ctranslate2.get_supported_compute_types("cpu")
40
 
41
  def transcribe(self,
42
  audio: Union[str, BinaryIO, np.ndarray],
43
  progress: gr.Progress = gr.Progress(),
44
  *whisper_params,
45
- ) -> Tuple[List[dict], float]:
46
  """
47
  transcribe method for faster-whisper.
48
 
@@ -57,28 +55,18 @@ class FasterWhisperInference(WhisperBase):
57
 
58
  Returns
59
  ----------
60
- segments_result: List[dict]
61
- list of dicts that includes start, end timestamps and transcribed text
62
  elapsed_time: float
63
  elapsed time for transcription
64
  """
65
  start_time = time.time()
66
 
67
- params = WhisperParameters.as_value(*whisper_params)
68
 
69
  if params.model_size != self.current_model_size or self.model is None or self.current_compute_type != params.compute_type:
70
  self.update_model(params.model_size, params.compute_type, progress)
71
 
72
- # None parameters with Textboxes: https://github.com/gradio-app/gradio/issues/8723
73
- if not params.initial_prompt:
74
- params.initial_prompt = None
75
- if not params.prefix:
76
- params.prefix = None
77
- if not params.hotwords:
78
- params.hotwords = None
79
-
80
- params.suppress_tokens = self.format_suppress_tokens_str(params.suppress_tokens)
81
-
82
  segments, info = self.model.transcribe(
83
  audio=audio,
84
  language=params.lang,
@@ -114,11 +102,7 @@ class FasterWhisperInference(WhisperBase):
114
  segments_result = []
115
  for segment in segments:
116
  progress(segment.start / info.duration, desc="Transcribing..")
117
- segments_result.append({
118
- "start": segment.start,
119
- "end": segment.end,
120
- "text": segment.text
121
- })
122
 
123
  elapsed_time = time.time() - start_time
124
  return segments_result, elapsed_time
 
12
  from argparse import Namespace
13
 
14
  from modules.utils.paths import (FASTER_WHISPER_MODELS_DIR, DIARIZATION_MODELS_DIR, UVR_MODELS_DIR, OUTPUT_DIR)
15
+ from modules.whisper.data_classes import *
16
+ from modules.whisper.base_transcription_pipeline import BaseTranscriptionPipeline
17
 
18
 
19
+ class FasterWhisperInference(BaseTranscriptionPipeline):
20
  def __init__(self,
21
  model_dir: str = FASTER_WHISPER_MODELS_DIR,
22
  diarization_model_dir: str = DIARIZATION_MODELS_DIR,
 
35
  self.model_paths = self.get_model_paths()
36
  self.device = self.get_device()
37
  self.available_models = self.model_paths.keys()
 
 
38
 
39
  def transcribe(self,
40
  audio: Union[str, BinaryIO, np.ndarray],
41
  progress: gr.Progress = gr.Progress(),
42
  *whisper_params,
43
+ ) -> Tuple[List[Segment], float]:
44
  """
45
  transcribe method for faster-whisper.
46
 
 
55
 
56
  Returns
57
  ----------
58
+ segments_result: List[Segment]
59
+ list of Segment that includes start, end timestamps and transcribed text
60
  elapsed_time: float
61
  elapsed time for transcription
62
  """
63
  start_time = time.time()
64
 
65
+ params = WhisperParams.from_list(list(whisper_params))
66
 
67
  if params.model_size != self.current_model_size or self.model is None or self.current_compute_type != params.compute_type:
68
  self.update_model(params.model_size, params.compute_type, progress)
69
 
 
 
 
 
 
 
 
 
 
 
70
  segments, info = self.model.transcribe(
71
  audio=audio,
72
  language=params.lang,
 
102
  segments_result = []
103
  for segment in segments:
104
  progress(segment.start / info.duration, desc="Transcribing..")
105
+ segments_result.append(Segment.from_faster_whisper(segment))
 
 
 
 
106
 
107
  elapsed_time = time.time() - start_time
108
  return segments_result, elapsed_time
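At the implementation level the pattern is the same but narrower: transcribe() receives only the flattened WhisperParams and now returns Segment models instead of plain dicts. A hedged sketch of a direct call, assuming the modules are importable; "sample.wav" is a placeholder path:

import gradio as gr
from modules.whisper.data_classes import WhisperParams
from modules.whisper.faster_whisper_inference import FasterWhisperInference

inference = FasterWhisperInference()
whisper_params = WhisperParams(model_size="large-v2", word_timestamps=True)

segments, elapsed_time = inference.transcribe(
    "sample.wav",
    gr.Progress(),
    *whisper_params.to_list(),  # rebuilt inside transcribe() via WhisperParams.from_list()
)

for seg in segments:  # each item is a Segment pydantic model, not a dict
    print(f"[{seg.start:.2f} -> {seg.end:.2f}]{seg.text}")
    for word in seg.words or []:  # word-level timestamps when word_timestamps=True
        print(f"    {word.word}: {word.start:.2f}-{word.end:.2f}")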
modules/whisper/insanely_fast_whisper_inference.py CHANGED
@@ -12,11 +12,11 @@ from rich.progress import Progress, TimeElapsedColumn, BarColumn, TextColumn
12
  from argparse import Namespace
13
 
14
  from modules.utils.paths import (INSANELY_FAST_WHISPER_MODELS_DIR, DIARIZATION_MODELS_DIR, UVR_MODELS_DIR, OUTPUT_DIR)
15
- from modules.whisper.whisper_parameter import *
16
- from modules.whisper.whisper_base import WhisperBase
17
 
18
 
19
- class InsanelyFastWhisperInference(WhisperBase):
20
  def __init__(self,
21
  model_dir: str = INSANELY_FAST_WHISPER_MODELS_DIR,
22
  diarization_model_dir: str = DIARIZATION_MODELS_DIR,
@@ -32,16 +32,13 @@ class InsanelyFastWhisperInference(WhisperBase):
32
  self.model_dir = model_dir
33
  os.makedirs(self.model_dir, exist_ok=True)
34
 
35
- openai_models = whisper.available_models()
36
- distil_models = ["distil-large-v2", "distil-large-v3", "distil-medium.en", "distil-small.en"]
37
- self.available_models = openai_models + distil_models
38
- self.available_compute_types = ["float16"]
39
 
40
  def transcribe(self,
41
  audio: Union[str, np.ndarray, torch.Tensor],
42
  progress: gr.Progress = gr.Progress(),
43
  *whisper_params,
44
- ) -> Tuple[List[dict], float]:
45
  """
46
  transcribe method for faster-whisper.
47
 
@@ -56,13 +53,13 @@ class InsanelyFastWhisperInference(WhisperBase):
56
 
57
  Returns
58
  ----------
59
- segments_result: List[dict]
60
- list of dicts that includes start, end timestamps and transcribed text
61
  elapsed_time: float
62
  elapsed time for transcription
63
  """
64
  start_time = time.time()
65
- params = WhisperParameters.as_value(*whisper_params)
66
 
67
  if params.model_size != self.current_model_size or self.model is None or self.current_compute_type != params.compute_type:
68
  self.update_model(params.model_size, params.compute_type, progress)
@@ -96,9 +93,17 @@ class InsanelyFastWhisperInference(WhisperBase):
96
  generate_kwargs=kwargs
97
  )
98
 
99
- segments_result = self.format_result(
100
- transcribed_result=segments,
101
- )
 
 
 
 
 
 
 
 
102
  elapsed_time = time.time() - start_time
103
  return segments_result, elapsed_time
104
 
@@ -139,31 +144,26 @@ class InsanelyFastWhisperInference(WhisperBase):
139
  model_kwargs={"attn_implementation": "flash_attention_2"} if is_flash_attn_2_available() else {"attn_implementation": "sdpa"},
140
  )
141
 
142
- @staticmethod
143
- def format_result(
144
- transcribed_result: dict
145
- ) -> List[dict]:
146
  """
147
- Format the transcription result of insanely_fast_whisper as the same with other implementation.
148
-
149
- Parameters
150
- ----------
151
- transcribed_result: dict
152
- Transcription result of the insanely_fast_whisper
153
 
154
  Returns
155
  ----------
156
- result: List[dict]
157
- Formatted result as the same with other implementation
158
  """
159
- result = transcribed_result["chunks"]
160
- for item in result:
161
- start, end = item["timestamp"][0], item["timestamp"][1]
162
- if end is None:
163
- end = start
164
- item["start"] = start
165
- item["end"] = end
166
- return result
 
 
 
 
167
 
168
  @staticmethod
169
  def download_model(
 
12
  from argparse import Namespace
13
 
14
  from modules.utils.paths import (INSANELY_FAST_WHISPER_MODELS_DIR, DIARIZATION_MODELS_DIR, UVR_MODELS_DIR, OUTPUT_DIR)
15
+ from modules.whisper.data_classes import *
16
+ from modules.whisper.base_transcription_pipeline import BaseTranscriptionPipeline
17
 
18
 
19
+ class InsanelyFastWhisperInference(BaseTranscriptionPipeline):
20
  def __init__(self,
21
  model_dir: str = INSANELY_FAST_WHISPER_MODELS_DIR,
22
  diarization_model_dir: str = DIARIZATION_MODELS_DIR,
 
32
  self.model_dir = model_dir
33
  os.makedirs(self.model_dir, exist_ok=True)
34
 
35
+ self.available_models = self.get_model_paths()
 
 
 
36
 
37
  def transcribe(self,
38
  audio: Union[str, np.ndarray, torch.Tensor],
39
  progress: gr.Progress = gr.Progress(),
40
  *whisper_params,
41
+ ) -> Tuple[List[Segment], float]:
42
  """
43
  transcribe method for insanely-fast-whisper.
44
 
 
53
 
54
  Returns
55
  ----------
56
+ segments_result: List[Segment]
57
+ list of Segment that includes start, end timestamps and transcribed text
58
  elapsed_time: float
59
  elapsed time for transcription
60
  """
61
  start_time = time.time()
62
+ params = WhisperParams.from_list(list(whisper_params))
63
 
64
  if params.model_size != self.current_model_size or self.model is None or self.current_compute_type != params.compute_type:
65
  self.update_model(params.model_size, params.compute_type, progress)
 
93
  generate_kwargs=kwargs
94
  )
95
 
96
+ segments_result = []
97
+ for item in segments["chunks"]:
98
+ start, end = item["timestamp"][0], item["timestamp"][1]
99
+ if end is None:
100
+ end = start
101
+ segments_result.append(Segment(
102
+ text=item["text"],
103
+ start=start,
104
+ end=end
105
+ ))
106
+
107
  elapsed_time = time.time() - start_time
108
  return segments_result, elapsed_time
109
 
 
144
  model_kwargs={"attn_implementation": "flash_attention_2"} if is_flash_attn_2_available() else {"attn_implementation": "sdpa"},
145
  )
146
 
147
+ def get_model_paths(self):
 
 
 
148
  """
149
+ Get available models from the models directory, including fine-tuned models.
 
 
 
 
 
150
 
151
  Returns
152
  ----------
153
+ List of available model names
 
154
  """
155
+ openai_models = whisper.available_models()
156
+ distil_models = ["distil-large-v2", "distil-large-v3", "distil-medium.en", "distil-small.en"]
157
+ default_models = openai_models + distil_models
158
+
159
+ existing_models = os.listdir(self.model_dir)
160
+ wrong_dirs = [".locks"]
161
+
162
+ available_models = default_models + existing_models
163
+ available_models = [model for model in available_models if model not in wrong_dirs]
164
+ available_models = sorted(set(available_models), key=available_models.index)
165
+
166
+ return available_models
167
 
168
  @staticmethod
169
  def download_model(
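get_model_paths() above merges the built-in model names with whatever folders already exist under the model directory, then de-duplicates while keeping the original order. A standalone illustration of that idiom; the folder names here are made up:

default_models = ["tiny", "base", "small", "distil-large-v3"]
existing_models = ["small", "my-finetuned-whisper", ".locks"]  # hypothetical local folders

wrong_dirs = [".locks"]
available_models = default_models + existing_models
available_models = [m for m in available_models if m not in wrong_dirs]

# sorted(set(xs), key=xs.index) keeps the first occurrence of each name in place.
available_models = sorted(set(available_models), key=available_models.index)
print(available_models)
# ['tiny', 'base', 'small', 'distil-large-v3', 'my-finetuned-whisper']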
modules/whisper/whisper_Inference.py CHANGED
@@ -8,11 +8,11 @@ import os
8
  from argparse import Namespace
9
 
10
  from modules.utils.paths import (WHISPER_MODELS_DIR, DIARIZATION_MODELS_DIR, OUTPUT_DIR, UVR_MODELS_DIR)
11
- from modules.whisper.whisper_base import WhisperBase
12
- from modules.whisper.whisper_parameter import *
13
 
14
 
15
- class WhisperInference(WhisperBase):
16
  def __init__(self,
17
  model_dir: str = WHISPER_MODELS_DIR,
18
  diarization_model_dir: str = DIARIZATION_MODELS_DIR,
@@ -30,7 +30,7 @@ class WhisperInference(WhisperBase):
30
  audio: Union[str, np.ndarray, torch.Tensor],
31
  progress: gr.Progress = gr.Progress(),
32
  *whisper_params,
33
- ) -> Tuple[List[dict], float]:
34
  """
35
  transcribe method for whisper.
36
 
@@ -45,13 +45,13 @@ class WhisperInference(WhisperBase):
45
 
46
  Returns
47
  ----------
48
- segments_result: List[dict]
49
- list of dicts that includes start, end timestamps and transcribed text
50
  elapsed_time: float
51
  elapsed time for transcription
52
  """
53
  start_time = time.time()
54
- params = WhisperParameters.as_value(*whisper_params)
55
 
56
  if params.model_size != self.current_model_size or self.model is None or self.current_compute_type != params.compute_type:
57
  self.update_model(params.model_size, params.compute_type, progress)
@@ -59,21 +59,28 @@ class WhisperInference(WhisperBase):
59
  def progress_callback(progress_value):
60
  progress(progress_value, desc="Transcribing..")
61
 
62
- segments_result = self.model.transcribe(audio=audio,
63
- language=params.lang,
64
- verbose=False,
65
- beam_size=params.beam_size,
66
- logprob_threshold=params.log_prob_threshold,
67
- no_speech_threshold=params.no_speech_threshold,
68
- task="translate" if params.is_translate and self.current_model_size in self.translatable_models else "transcribe",
69
- fp16=True if params.compute_type == "float16" else False,
70
- best_of=params.best_of,
71
- patience=params.patience,
72
- temperature=params.temperature,
73
- compression_ratio_threshold=params.compression_ratio_threshold,
74
- progress_callback=progress_callback,)["segments"]
75
- elapsed_time = time.time() - start_time
 
 
 
 
 
 
76
 
 
77
  return segments_result, elapsed_time
78
 
79
  def update_model(self,
 
8
  from argparse import Namespace
9
 
10
  from modules.utils.paths import (WHISPER_MODELS_DIR, DIARIZATION_MODELS_DIR, OUTPUT_DIR, UVR_MODELS_DIR)
11
+ from modules.whisper.base_transcription_pipeline import BaseTranscriptionPipeline
12
+ from modules.whisper.data_classes import *
13
 
14
 
15
+ class WhisperInference(BaseTranscriptionPipeline):
16
  def __init__(self,
17
  model_dir: str = WHISPER_MODELS_DIR,
18
  diarization_model_dir: str = DIARIZATION_MODELS_DIR,
 
30
  audio: Union[str, np.ndarray, torch.Tensor],
31
  progress: gr.Progress = gr.Progress(),
32
  *whisper_params,
33
+ ) -> Tuple[List[Segment], float]:
34
  """
35
  transcribe method for Whisper.
36
 
 
45
 
46
  Returns
47
  ----------
48
+ segments_result: List[Segment]
49
+ list of Segment that includes start, end timestamps and transcribed text
50
  elapsed_time: float
51
  elapsed time for transcription
52
  """
53
  start_time = time.time()
54
+ params = WhisperParams.from_list(list(whisper_params))
55
 
56
  if params.model_size != self.current_model_size or self.model is None or self.current_compute_type != params.compute_type:
57
  self.update_model(params.model_size, params.compute_type, progress)
 
59
  def progress_callback(progress_value):
60
  progress(progress_value, desc="Transcribing..")
61
 
62
+ result = self.model.transcribe(audio=audio,
63
+ language=params.lang,
64
+ verbose=False,
65
+ beam_size=params.beam_size,
66
+ logprob_threshold=params.log_prob_threshold,
67
+ no_speech_threshold=params.no_speech_threshold,
68
+ task="translate" if params.is_translate and self.current_model_size in self.translatable_models else "transcribe",
69
+ fp16=True if params.compute_type == "float16" else False,
70
+ best_of=params.best_of,
71
+ patience=params.patience,
72
+ temperature=params.temperature,
73
+ compression_ratio_threshold=params.compression_ratio_threshold,
74
+ progress_callback=progress_callback,)["segments"]
75
+ segments_result = []
76
+ for segment in result:
77
+ segments_result.append(Segment(
78
+ start=segment["start"],
79
+ end=segment["end"],
80
+ text=segment["text"]
81
+ ))
82
 
83
+ elapsed_time = time.time() - start_time
84
  return segments_result, elapsed_time
85
 
86
  def update_model(self,
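For context on the new return type above: openai-whisper's transcribe() returns a dict whose "segments" entries are plain dicts with "start", "end" and "text" keys, and the loop above repacks them into the shared Segment data class. Segment itself lives in modules.whisper.data_classes and is not shown in this diff, so the sketch below uses a stand-in with only the three fields the loop relies on:

    from dataclasses import dataclass
    from typing import List

    @dataclass
    class Segment:  # stand-in for modules.whisper.data_classes.Segment (assumed fields only)
        start: float
        end: float
        text: str

    def to_segments(result: dict) -> List[Segment]:
        # result is the dict returned by whisper's model.transcribe(...)
        return [Segment(start=s["start"], end=s["end"], text=s["text"])
                for s in result["segments"]]

    # Example with a fabricated transcription result:
    fake_result = {"segments": [{"start": 0.0, "end": 2.5, "text": " Hello world."}]}
    print(to_segments(fake_result))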
modules/whisper/whisper_factory.py CHANGED
@@ -6,7 +6,8 @@ from modules.utils.paths import (FASTER_WHISPER_MODELS_DIR, DIARIZATION_MODELS_D
6
  from modules.whisper.faster_whisper_inference import FasterWhisperInference
7
  from modules.whisper.whisper_Inference import WhisperInference
8
  from modules.whisper.insanely_fast_whisper_inference import InsanelyFastWhisperInference
9
- from modules.whisper.whisper_base import WhisperBase
 
10
 
11
 
12
  class WhisperFactory:
@@ -19,7 +20,7 @@ class WhisperFactory:
19
  diarization_model_dir: str = DIARIZATION_MODELS_DIR,
20
  uvr_model_dir: str = UVR_MODELS_DIR,
21
  output_dir: str = OUTPUT_DIR,
22
- ) -> "WhisperBase":
23
  """
24
  Create a whisper inference class based on the provided whisper_type.
25
 
@@ -45,36 +46,29 @@ class WhisperFactory:
45
 
46
  Returns
47
  -------
48
- WhisperBase
49
  An instance of the appropriate whisper inference class based on the whisper_type.
50
  """
51
  # Temporal fix of the bug : https://github.com/jhj0517/Whisper-WebUI/issues/144
52
  os.environ['KMP_DUPLICATE_LIB_OK'] = 'True'
53
 
54
- whisper_type = whisper_type.lower().strip()
55
 
56
- faster_whisper_typos = ["faster_whisper", "faster-whisper", "fasterwhisper"]
57
- whisper_typos = ["whisper"]
58
- insanely_fast_whisper_typos = [
59
- "insanely_fast_whisper", "insanely-fast-whisper", "insanelyfastwhisper",
60
- "insanely_faster_whisper", "insanely-faster-whisper", "insanelyfasterwhisper"
61
- ]
62
-
63
- if whisper_type in faster_whisper_typos:
64
  return FasterWhisperInference(
65
  model_dir=faster_whisper_model_dir,
66
  output_dir=output_dir,
67
  diarization_model_dir=diarization_model_dir,
68
  uvr_model_dir=uvr_model_dir
69
  )
70
- elif whisper_type in whisper_typos:
71
  return WhisperInference(
72
  model_dir=whisper_model_dir,
73
  output_dir=output_dir,
74
  diarization_model_dir=diarization_model_dir,
75
  uvr_model_dir=uvr_model_dir
76
  )
77
- elif whisper_type in insanely_fast_whisper_typos:
78
  return InsanelyFastWhisperInference(
79
  model_dir=insanely_fast_whisper_model_dir,
80
  output_dir=output_dir,
 
6
  from modules.whisper.faster_whisper_inference import FasterWhisperInference
7
  from modules.whisper.whisper_Inference import WhisperInference
8
  from modules.whisper.insanely_fast_whisper_inference import InsanelyFastWhisperInference
9
+ from modules.whisper.base_transcription_pipeline import BaseTranscriptionPipeline
10
+ from modules.whisper.data_classes import *
11
 
12
 
13
  class WhisperFactory:
 
20
  diarization_model_dir: str = DIARIZATION_MODELS_DIR,
21
  uvr_model_dir: str = UVR_MODELS_DIR,
22
  output_dir: str = OUTPUT_DIR,
23
+ ) -> "BaseTranscriptionPipeline":
24
  """
25
  Create a whisper inference class based on the provided whisper_type.
26
 
 
46
 
47
  Returns
48
  -------
49
+ BaseTranscriptionPipeline
50
  An instance of the appropriate whisper inference class based on the whisper_type.
51
  """
52
  # Temporal fix of the bug : https://github.com/jhj0517/Whisper-WebUI/issues/144
53
  os.environ['KMP_DUPLICATE_LIB_OK'] = 'True'
54
 
55
+ whisper_type = whisper_type.strip().lower()
56
 
57
+ if whisper_type == WhisperImpl.FASTER_WHISPER.value:
58
  return FasterWhisperInference(
59
  model_dir=faster_whisper_model_dir,
60
  output_dir=output_dir,
61
  diarization_model_dir=diarization_model_dir,
62
  uvr_model_dir=uvr_model_dir
63
  )
64
+ elif whisper_type == WhisperImpl.WHISPER.value:
65
  return WhisperInference(
66
  model_dir=whisper_model_dir,
67
  output_dir=output_dir,
68
  diarization_model_dir=diarization_model_dir,
69
  uvr_model_dir=uvr_model_dir
70
  )
71
+ elif whisper_type == WhisperImpl.INSANELY_FAST_WHISPER.value:
72
  return InsanelyFastWhisperInference(
73
  model_dir=insanely_fast_whisper_model_dir,
74
  output_dir=output_dir,
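The typo lists above are replaced by comparisons against WhisperImpl enum values. WhisperImpl comes from modules.whisper.data_classes and its exact string values are not visible in this diff, so the values in the sketch below are assumptions for illustration; only the dispatch shape mirrors the new factory code:

    from enum import Enum

    class WhisperImpl(Enum):  # assumed values; the real enum lives in modules.whisper.data_classes
        WHISPER = "whisper"
        FASTER_WHISPER = "faster-whisper"
        INSANELY_FAST_WHISPER = "insanely_fast_whisper"

    def resolve_impl(whisper_type: str) -> WhisperImpl:
        whisper_type = whisper_type.strip().lower()
        for impl in WhisperImpl:
            if whisper_type == impl.value:
                return impl
        # The factory's fallback branch is not shown in this diff, so raise here instead.
        raise ValueError(f"Unknown whisper_type: {whisper_type}")

    print(resolve_impl(" Faster-Whisper "))   # WhisperImpl.FASTER_WHISPER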
modules/whisper/whisper_parameter.py DELETED
@@ -1,369 +0,0 @@
1
- from dataclasses import dataclass, fields
2
- import gradio as gr
3
- from typing import Optional, Dict
4
- import yaml
5
-
6
-
7
- @dataclass
8
- class WhisperParameters:
9
- model_size: gr.Dropdown
10
- lang: gr.Dropdown
11
- is_translate: gr.Checkbox
12
- beam_size: gr.Number
13
- log_prob_threshold: gr.Number
14
- no_speech_threshold: gr.Number
15
- compute_type: gr.Dropdown
16
- best_of: gr.Number
17
- patience: gr.Number
18
- condition_on_previous_text: gr.Checkbox
19
- prompt_reset_on_temperature: gr.Slider
20
- initial_prompt: gr.Textbox
21
- temperature: gr.Slider
22
- compression_ratio_threshold: gr.Number
23
- vad_filter: gr.Checkbox
24
- threshold: gr.Slider
25
- min_speech_duration_ms: gr.Number
26
- max_speech_duration_s: gr.Number
27
- min_silence_duration_ms: gr.Number
28
- speech_pad_ms: gr.Number
29
- batch_size: gr.Number
30
- is_diarize: gr.Checkbox
31
- hf_token: gr.Textbox
32
- diarization_device: gr.Dropdown
33
- length_penalty: gr.Number
34
- repetition_penalty: gr.Number
35
- no_repeat_ngram_size: gr.Number
36
- prefix: gr.Textbox
37
- suppress_blank: gr.Checkbox
38
- suppress_tokens: gr.Textbox
39
- max_initial_timestamp: gr.Number
40
- word_timestamps: gr.Checkbox
41
- prepend_punctuations: gr.Textbox
42
- append_punctuations: gr.Textbox
43
- max_new_tokens: gr.Number
44
- chunk_length: gr.Number
45
- hallucination_silence_threshold: gr.Number
46
- hotwords: gr.Textbox
47
- language_detection_threshold: gr.Number
48
- language_detection_segments: gr.Number
49
- is_bgm_separate: gr.Checkbox
50
- uvr_model_size: gr.Dropdown
51
- uvr_device: gr.Dropdown
52
- uvr_segment_size: gr.Number
53
- uvr_save_file: gr.Checkbox
54
- uvr_enable_offload: gr.Checkbox
55
- """
56
- A data class for Gradio components of the Whisper Parameters. Use "before" Gradio pre-processing.
57
- This data class is used to mitigate the key-value problem between Gradio components and function parameters.
58
- Related Gradio issue: https://github.com/gradio-app/gradio/issues/2471
59
- See more about Gradio pre-processing: https://www.gradio.app/docs/components
60
-
61
- Attributes
62
- ----------
63
- model_size: gr.Dropdown
64
- Whisper model size.
65
-
66
- lang: gr.Dropdown
67
- Source language of the file to transcribe.
68
-
69
- is_translate: gr.Checkbox
70
- Boolean value that determines whether to translate to English.
71
- It's Whisper's feature to translate speech from another language directly into English end-to-end.
72
-
73
- beam_size: gr.Number
74
- Int value that is used for decoding option.
75
-
76
- log_prob_threshold: gr.Number
77
- If the average log probability over sampled tokens is below this value, treat as failed.
78
-
79
- no_speech_threshold: gr.Number
80
- If the no_speech probability is higher than this value AND
81
- the average log probability over sampled tokens is below `log_prob_threshold`,
82
- consider the segment as silent.
83
-
84
- compute_type: gr.Dropdown
85
- compute type for transcription.
86
- see more info : https://opennmt.net/CTranslate2/quantization.html
87
-
88
- best_of: gr.Number
89
- Number of candidates when sampling with non-zero temperature.
90
-
91
- patience: gr.Number
92
- Beam search patience factor.
93
-
94
- condition_on_previous_text: gr.Checkbox
95
- if True, the previous output of the model is provided as a prompt for the next window;
96
- disabling may make the text inconsistent across windows, but the model becomes less prone to
97
- getting stuck in a failure loop, such as repetition looping or timestamps going out of sync.
98
-
99
- initial_prompt: gr.Textbox
100
- Optional text to provide as a prompt for the first window. This can be used to provide, or
101
- "prompt-engineer" a context for transcription, e.g. custom vocabularies or proper nouns
102
- to make it more likely to predict those word correctly.
103
-
104
- temperature: gr.Slider
105
- Temperature for sampling. It can be a tuple of temperatures,
106
- which will be successively used upon failures according to either
107
- `compression_ratio_threshold` or `log_prob_threshold`.
108
-
109
- compression_ratio_threshold: gr.Number
110
- If the gzip compression ratio is above this value, treat as failed
111
-
112
- vad_filter: gr.Checkbox
113
- Enable the voice activity detection (VAD) to filter out parts of the audio
114
- without speech. This step is using the Silero VAD model
115
- https://github.com/snakers4/silero-vad.
116
-
117
- threshold: gr.Slider
118
- This parameter is related with Silero VAD. Speech threshold.
119
- Silero VAD outputs speech probabilities for each audio chunk,
120
- probabilities ABOVE this value are considered as SPEECH. It is better to tune this
121
- parameter for each dataset separately, but "lazy" 0.5 is pretty good for most datasets.
122
-
123
- min_speech_duration_ms: gr.Number
124
- This parameter is related with Silero VAD. Final speech chunks shorter min_speech_duration_ms are thrown out.
125
-
126
- max_speech_duration_s: gr.Number
127
- This parameter is related with Silero VAD. Maximum duration of speech chunks in seconds. Chunks longer
128
- than max_speech_duration_s will be split at the timestamp of the last silence that
129
- lasts more than 100ms (if any), to prevent aggressive cutting. Otherwise, they will be
130
- split aggressively just before max_speech_duration_s.
131
-
132
- min_silence_duration_ms: gr.Number
133
- This parameter is related with Silero VAD. In the end of each speech chunk wait for min_silence_duration_ms
134
- before separating it
135
-
136
- speech_pad_ms: gr.Number
137
- This parameter is related with Silero VAD. Final speech chunks are padded by speech_pad_ms each side
138
-
139
- batch_size: gr.Number
140
- This parameter is related with insanely-fast-whisper pipe. Batch size to pass to the pipe
141
-
142
- is_diarize: gr.Checkbox
143
- This parameter is related with whisperx. Boolean value that determines whether to diarize or not.
144
-
145
- hf_token: gr.Textbox
146
- This parameter is related with whisperx. Huggingface token is needed to download diarization models.
147
- Read more about : https://huggingface.co/pyannote/speaker-diarization-3.1#requirements
148
-
149
- diarization_device: gr.Dropdown
150
- This parameter is related with whisperx. Device to run diarization model
151
-
152
- length_penalty: gr.Number
153
- This parameter is related to faster-whisper. Exponential length penalty constant.
154
-
155
- repetition_penalty: gr.Number
156
- This parameter is related to faster-whisper. Penalty applied to the score of previously generated tokens
157
- (set > 1 to penalize).
158
-
159
- no_repeat_ngram_size: gr.Number
160
- This parameter is related to faster-whisper. Prevent repetitions of n-grams with this size (set 0 to disable).
161
-
162
- prefix: gr.Textbox
163
- This parameter is related to faster-whisper. Optional text to provide as a prefix for the first window.
164
-
165
- suppress_blank: gr.Checkbox
166
- This parameter is related to faster-whisper. Suppress blank outputs at the beginning of the sampling.
167
-
168
- suppress_tokens: gr.Textbox
169
- This parameter is related to faster-whisper. List of token IDs to suppress. -1 will suppress a default set
170
- of symbols as defined in the model config.json file.
171
-
172
- max_initial_timestamp: gr.Number
173
- This parameter is related to faster-whisper. The initial timestamp cannot be later than this.
174
-
175
- word_timestamps: gr.Checkbox
176
- This parameter is related to faster-whisper. Extract word-level timestamps using the cross-attention pattern
177
- and dynamic time warping, and include the timestamps for each word in each segment.
178
-
179
- prepend_punctuations: gr.Textbox
180
- This parameter is related to faster-whisper. If word_timestamps is True, merge these punctuation symbols
181
- with the next word.
182
-
183
- append_punctuations: gr.Textbox
184
- This parameter is related to faster-whisper. If word_timestamps is True, merge these punctuation symbols
185
- with the previous word.
186
-
187
- max_new_tokens: gr.Number
188
- This parameter is related to faster-whisper. Maximum number of new tokens to generate per-chunk. If not set,
189
- the maximum will be set by the default max_length.
190
-
191
- chunk_length: gr.Number
192
- This parameter is related to faster-whisper and insanely-fast-whisper. The length of audio segments in seconds.
193
- If it is not None, it will overwrite the default chunk_length of the FeatureExtractor.
194
-
195
- hallucination_silence_threshold: gr.Number
196
- This parameter is related to faster-whisper. When word_timestamps is True, skip silent periods longer than this threshold
197
- (in seconds) when a possible hallucination is detected.
198
-
199
- hotwords: gr.Textbox
200
- This parameter is related to faster-whisper. Hotwords/hint phrases to provide the model with. Has no effect if prefix is not None.
201
-
202
- language_detection_threshold: gr.Number
203
- This parameter is related to faster-whisper. If the maximum probability of the language tokens is higher than this value, the language is detected.
204
-
205
- language_detection_segments: gr.Number
206
- This parameter is related to faster-whisper. Number of segments to consider for the language detection.
207
-
208
- is_separate_bgm: gr.Checkbox
209
- This parameter is related to UVR. Boolean value that determines whether to separate bgm or not.
210
-
211
- uvr_model_size: gr.Dropdown
212
- This parameter is related to UVR. UVR model size.
213
-
214
- uvr_device: gr.Dropdown
215
- This parameter is related to UVR. Device to run UVR model.
216
-
217
- uvr_segment_size: gr.Number
218
- This parameter is related to UVR. Segment size for UVR model.
219
-
220
- uvr_save_file: gr.Checkbox
221
- This parameter is related to UVR. Boolean value that determines whether to save the file or not.
222
-
223
- uvr_enable_offload: gr.Checkbox
224
- This parameter is related to UVR. Boolean value that determines whether to offload the UVR model or not
225
- after each transcription.
226
- """
227
-
228
- def as_list(self) -> list:
229
- """
230
- Converts the data class attributes into a list, Use in Gradio UI before Gradio pre-processing.
231
- See more about Gradio pre-processing: : https://www.gradio.app/docs/components
232
-
233
- Returns
234
- ----------
235
- A list of Gradio components
236
- """
237
- return [getattr(self, f.name) for f in fields(self)]
238
-
239
- @staticmethod
240
- def as_value(*args) -> 'WhisperValues':
241
- """
242
- To use Whisper parameters in function after Gradio post-processing.
243
- See more about Gradio post-processing: : https://www.gradio.app/docs/components
244
-
245
- Returns
246
- ----------
247
- WhisperValues
248
- Data class that has values of parameters
249
- """
250
- return WhisperValues(*args)
251
-
252
-
253
- @dataclass
254
- class WhisperValues:
255
- model_size: str = "large-v2"
256
- lang: Optional[str] = None
257
- is_translate: bool = False
258
- beam_size: int = 5
259
- log_prob_threshold: float = -1.0
260
- no_speech_threshold: float = 0.6
261
- compute_type: str = "float16"
262
- best_of: int = 5
263
- patience: float = 1.0
264
- condition_on_previous_text: bool = True
265
- prompt_reset_on_temperature: float = 0.5
266
- initial_prompt: Optional[str] = None
267
- temperature: float = 0.0
268
- compression_ratio_threshold: float = 2.4
269
- vad_filter: bool = False
270
- threshold: float = 0.5
271
- min_speech_duration_ms: int = 250
272
- max_speech_duration_s: float = float("inf")
273
- min_silence_duration_ms: int = 2000
274
- speech_pad_ms: int = 400
275
- batch_size: int = 24
276
- is_diarize: bool = False
277
- hf_token: str = ""
278
- diarization_device: str = "cuda"
279
- length_penalty: float = 1.0
280
- repetition_penalty: float = 1.0
281
- no_repeat_ngram_size: int = 0
282
- prefix: Optional[str] = None
283
- suppress_blank: bool = True
284
- suppress_tokens: Optional[str] = "[-1]"
285
- max_initial_timestamp: float = 0.0
286
- word_timestamps: bool = False
287
- prepend_punctuations: Optional[str] = "\"'“¿([{-"
288
- append_punctuations: Optional[str] = "\"'.。,,!!??::”)]}、"
289
- max_new_tokens: Optional[int] = None
290
- chunk_length: Optional[int] = 30
291
- hallucination_silence_threshold: Optional[float] = None
292
- hotwords: Optional[str] = None
293
- language_detection_threshold: Optional[float] = None
294
- language_detection_segments: int = 1
295
- is_bgm_separate: bool = False
296
- uvr_model_size: str = "UVR-MDX-NET-Inst_HQ_4"
297
- uvr_device: str = "cuda"
298
- uvr_segment_size: int = 256
299
- uvr_save_file: bool = False
300
- uvr_enable_offload: bool = True
301
- """
302
- A data class to use Whisper parameters.
303
- """
304
-
305
- def to_yaml(self) -> Dict:
306
- data = {
307
- "whisper": {
308
- "model_size": self.model_size,
309
- "lang": "Automatic Detection" if self.lang is None else self.lang,
310
- "is_translate": self.is_translate,
311
- "beam_size": self.beam_size,
312
- "log_prob_threshold": self.log_prob_threshold,
313
- "no_speech_threshold": self.no_speech_threshold,
314
- "best_of": self.best_of,
315
- "patience": self.patience,
316
- "condition_on_previous_text": self.condition_on_previous_text,
317
- "prompt_reset_on_temperature": self.prompt_reset_on_temperature,
318
- "initial_prompt": None if not self.initial_prompt else self.initial_prompt,
319
- "temperature": self.temperature,
320
- "compression_ratio_threshold": self.compression_ratio_threshold,
321
- "batch_size": self.batch_size,
322
- "length_penalty": self.length_penalty,
323
- "repetition_penalty": self.repetition_penalty,
324
- "no_repeat_ngram_size": self.no_repeat_ngram_size,
325
- "prefix": None if not self.prefix else self.prefix,
326
- "suppress_blank": self.suppress_blank,
327
- "suppress_tokens": self.suppress_tokens,
328
- "max_initial_timestamp": self.max_initial_timestamp,
329
- "word_timestamps": self.word_timestamps,
330
- "prepend_punctuations": self.prepend_punctuations,
331
- "append_punctuations": self.append_punctuations,
332
- "max_new_tokens": self.max_new_tokens,
333
- "chunk_length": self.chunk_length,
334
- "hallucination_silence_threshold": self.hallucination_silence_threshold,
335
- "hotwords": None if not self.hotwords else self.hotwords,
336
- "language_detection_threshold": self.language_detection_threshold,
337
- "language_detection_segments": self.language_detection_segments,
338
- },
339
- "vad": {
340
- "vad_filter": self.vad_filter,
341
- "threshold": self.threshold,
342
- "min_speech_duration_ms": self.min_speech_duration_ms,
343
- "max_speech_duration_s": self.max_speech_duration_s,
344
- "min_silence_duration_ms": self.min_silence_duration_ms,
345
- "speech_pad_ms": self.speech_pad_ms,
346
- },
347
- "diarization": {
348
- "is_diarize": self.is_diarize,
349
- "hf_token": self.hf_token
350
- },
351
- "bgm_separation": {
352
- "is_separate_bgm": self.is_bgm_separate,
353
- "model_size": self.uvr_model_size,
354
- "segment_size": self.uvr_segment_size,
355
- "save_file": self.uvr_save_file,
356
- "enable_offload": self.uvr_enable_offload
357
- },
358
- }
359
- return data
360
-
361
- def as_list(self) -> list:
362
- """
363
- Converts the data class attributes into a list
364
-
365
- Returns
366
- ----------
367
- A list of Whisper parameters
368
- """
369
- return [getattr(self, f.name) for f in fields(self)]
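The deleted WhisperParameters/WhisperValues pair exists because Gradio passes component values to event handlers positionally (see the issue linked in the docstring above, gradio-app/gradio#2471), so the UI builds a flat list of components and the handler rebuilds a typed object from *whisper_params. The new WhisperParams.from_list()/to_list() API in modules.whisper.data_classes keeps the same round-trip; a trimmed-down sketch of the pattern (field names reduced for illustration, not the real parameter set):

    from dataclasses import dataclass, fields

    @dataclass
    class DemoParams:  # illustrative stand-in, not the real WhisperParams
        model_size: str = "large-v2"
        beam_size: int = 5
        vad_filter: bool = False

        def to_list(self) -> list:
            # Ordered values, matching the order of the Gradio inputs list
            return [getattr(self, f.name) for f in fields(self)]

        @classmethod
        def from_list(cls, values: list) -> "DemoParams":
            # Rebuild the typed object from the positional values Gradio hands the handler
            return cls(*values)

    flat = DemoParams(model_size="tiny", vad_filter=True).to_list()
    print(DemoParams.from_list(flat))   # DemoParams(model_size='tiny', beam_size=5, vad_filter=True)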
notebook/whisper-webui.ipynb CHANGED
@@ -54,7 +54,9 @@
54
  "%cd Whisper-WebUI\n",
55
  "!pip install git+https://github.com/jhj0517/jhj0517-whisper.git\n",
56
  "!pip install faster-whisper==1.0.3\n",
57
- "!pip install gradio==4.43.0\n",
 
 
58
  "# Temporal bug fix from https://github.com/jhj0517/Whisper-WebUI/issues/256\n",
59
  "!pip install git+https://github.com/JuanBindez/pytubefix.git\n",
60
  "!pip install tokenizers==0.19.1\n",
 
54
  "%cd Whisper-WebUI\n",
55
  "!pip install git+https://github.com/jhj0517/jhj0517-whisper.git\n",
56
  "!pip install faster-whisper==1.0.3\n",
57
+ "!pip install ctranslate2==4.4.0\n",
58
+ "!pip install gradio\n",
59
+ "!pip install gradio-i18n\n",
60
  "# Temporal bug fix from https://github.com/jhj0517/Whisper-WebUI/issues/256\n",
61
  "!pip install git+https://github.com/JuanBindez/pytubefix.git\n",
62
  "!pip install tokenizers==0.19.1\n",
requirements.txt CHANGED
@@ -2,15 +2,16 @@
2
  # If you're using it, update url to your CUDA version (CUDA 12.1 is minimum requirement):
3
  # For CUDA 12.1, use : https://download.pytorch.org/whl/cu121
4
  # For CUDA 12.4, use : https://download.pytorch.org/whl/cu124
5
- --extra-index-url https://download.pytorch.org/whl/cu121
6
 
7
 
8
- torch==2.3.1
9
- torchaudio==2.3.1
10
  git+https://github.com/jhj0517/jhj0517-whisper.git
11
  faster-whisper==1.0.3
12
  transformers
13
  gradio
 
14
  pytubefix
15
  ruamel.yaml==0.18.6
16
  pyannote.audio==3.3.1
 
2
  # If you're using it, update url to your CUDA version (CUDA 12.1 is minimum requirement):
3
  # For CUDA 12.1, use : https://download.pytorch.org/whl/cu121
4
  # For CUDA 12.4, use : https://download.pytorch.org/whl/cu124
5
+ --extra-index-url https://download.pytorch.org/whl/cu124
6
 
7
 
8
+ torch
9
+ torchaudio
10
  git+https://github.com/jhj0517/jhj0517-whisper.git
11
  faster-whisper==1.0.3
12
  transformers
13
  gradio
14
+ gradio-i18n
15
  pytubefix
16
  ruamel.yaml==0.18.6
17
  pyannote.audio==3.3.1
screenshot.png CHANGED
tests/test_bgm_separation.py CHANGED
@@ -1,6 +1,6 @@
1
  from modules.utils.paths import *
2
  from modules.whisper.whisper_factory import WhisperFactory
3
- from modules.whisper.whisper_parameter import WhisperValues
4
  from test_config import *
5
  from test_transcription import download_file, test_transcribe
6
 
@@ -17,9 +17,9 @@ import os
17
  @pytest.mark.parametrize(
18
  "whisper_type,vad_filter,bgm_separation,diarization",
19
  [
20
- ("whisper", False, True, False),
21
- ("faster-whisper", False, True, False),
22
- ("insanely_fast_whisper", False, True, False)
23
  ]
24
  )
25
  def test_bgm_separation_pipeline(
@@ -38,9 +38,9 @@ def test_bgm_separation_pipeline(
38
  @pytest.mark.parametrize(
39
  "whisper_type,vad_filter,bgm_separation,diarization",
40
  [
41
- ("whisper", True, True, False),
42
- ("faster-whisper", True, True, False),
43
- ("insanely_fast_whisper", True, True, False)
44
  ]
45
  )
46
  def test_bgm_separation_with_vad_pipeline(
 
1
  from modules.utils.paths import *
2
  from modules.whisper.whisper_factory import WhisperFactory
3
+ from modules.whisper.data_classes import *
4
  from test_config import *
5
  from test_transcription import download_file, test_transcribe
6
 
 
17
  @pytest.mark.parametrize(
18
  "whisper_type,vad_filter,bgm_separation,diarization",
19
  [
20
+ (WhisperImpl.WHISPER.value, False, True, False),
21
+ (WhisperImpl.FASTER_WHISPER.value, False, True, False),
22
+ (WhisperImpl.INSANELY_FAST_WHISPER.value, False, True, False)
23
  ]
24
  )
25
  def test_bgm_separation_pipeline(
 
38
  @pytest.mark.parametrize(
39
  "whisper_type,vad_filter,bgm_separation,diarization",
40
  [
41
+ (WhisperImpl.WHISPER.value, True, True, False),
42
+ (WhisperImpl.FASTER_WHISPER.value, True, True, False),
43
+ (WhisperImpl.INSANELY_FAST_WHISPER.value, True, True, False)
44
  ]
45
  )
46
  def test_bgm_separation_with_vad_pipeline(
tests/test_config.py CHANGED
@@ -1,10 +1,14 @@
1
- from modules.utils.paths import *
2
-
3
  import os
4
  import torch
5
 
 
 
 
6
  TEST_FILE_DOWNLOAD_URL = "https://github.com/jhj0517/whisper_flutter_new/raw/main/example/assets/jfk.wav"
7
  TEST_FILE_PATH = os.path.join(WEBUI_DIR, "tests", "jfk.wav")
 
8
  TEST_YOUTUBE_URL = "https://www.youtube.com/watch?v=4WEQtgnBu0I&ab_channel=AndriaFitzer"
9
  TEST_WHISPER_MODEL = "tiny"
10
  TEST_UVR_MODEL = "UVR-MDX-NET-Inst_HQ_4"
@@ -13,5 +17,24 @@ TEST_SUBTITLE_SRT_PATH = os.path.join(WEBUI_DIR, "tests", "test_srt.srt")
13
  TEST_SUBTITLE_VTT_PATH = os.path.join(WEBUI_DIR, "tests", "test_vtt.vtt")
14
 
15
 
 
16
  def is_cuda_available():
17
  return torch.cuda.is_available()
 
1
+ import functools
2
+ import jiwer
3
  import os
4
  import torch
5
 
6
+ from modules.utils.paths import *
7
+ from modules.utils.youtube_manager import *
8
+
9
  TEST_FILE_DOWNLOAD_URL = "https://github.com/jhj0517/whisper_flutter_new/raw/main/example/assets/jfk.wav"
10
  TEST_FILE_PATH = os.path.join(WEBUI_DIR, "tests", "jfk.wav")
11
+ TEST_ANSWER = "And so my fellow Americans ask not what your country can do for you ask what you can do for your country"
12
  TEST_YOUTUBE_URL = "https://www.youtube.com/watch?v=4WEQtgnBu0I&ab_channel=AndriaFitzer"
13
  TEST_WHISPER_MODEL = "tiny"
14
  TEST_UVR_MODEL = "UVR-MDX-NET-Inst_HQ_4"
 
17
  TEST_SUBTITLE_VTT_PATH = os.path.join(WEBUI_DIR, "tests", "test_vtt.vtt")
18
 
19
 
20
+ @functools.lru_cache
21
  def is_cuda_available():
22
  return torch.cuda.is_available()
23
+
24
+
25
+ @functools.lru_cache
26
+ def is_pytube_detected_bot(url: str = TEST_YOUTUBE_URL):
27
+ try:
28
+ yt_temp_path = os.path.join("modules", "yt_tmp.wav")
29
+ if os.path.exists(yt_temp_path):
30
+ return False
31
+ yt = get_ytdata(url)
32
+ audio = get_ytaudio(yt)
33
+ return False
34
+ except Exception as e:
35
+ print(f"Pytube has detected as a bot: {e}")
36
+ return True
37
+
38
+
39
+ def calculate_wer(answer, prediction):
40
+ return jiwer.wer(answer, prediction)
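calculate_wer() above wraps jiwer.wer(reference, hypothesis), which returns the word error rate as a float; the transcription tests strip commas and periods from the prediction before comparing it against TEST_ANSWER. A minimal sketch of that check:

    import jiwer

    answer = ("And so my fellow Americans ask not what your country can do for you "
              "ask what you can do for your country")
    prediction = ("And so, my fellow Americans, ask not what your country can do for you, "
                  "ask what you can do for your country.")

    # Mirror the tests: drop commas and periods before measuring WER.
    normalized = prediction.strip().replace(",", "").replace(".", "")
    assert jiwer.wer(answer, normalized) < 0.1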
tests/test_diarization.py CHANGED
@@ -1,6 +1,6 @@
1
  from modules.utils.paths import *
2
  from modules.whisper.whisper_factory import WhisperFactory
3
- from modules.whisper.whisper_parameter import WhisperValues
4
  from test_config import *
5
  from test_transcription import download_file, test_transcribe
6
 
@@ -16,9 +16,9 @@ import os
16
  @pytest.mark.parametrize(
17
  "whisper_type,vad_filter,bgm_separation,diarization",
18
  [
19
- ("whisper", False, False, True),
20
- ("faster-whisper", False, False, True),
21
- ("insanely_fast_whisper", False, False, True)
22
  ]
23
  )
24
  def test_diarization_pipeline(
 
1
  from modules.utils.paths import *
2
  from modules.whisper.whisper_factory import WhisperFactory
3
+ from modules.whisper.data_classes import *
4
  from test_config import *
5
  from test_transcription import download_file, test_transcribe
6
 
 
16
  @pytest.mark.parametrize(
17
  "whisper_type,vad_filter,bgm_separation,diarization",
18
  [
19
+ (WhisperImpl.WHISPER.value, False, False, True),
20
+ (WhisperImpl.FASTER_WHISPER.value, False, False, True),
21
+ (WhisperImpl.INSANELY_FAST_WHISPER.value, False, False, True)
22
  ]
23
  )
24
  def test_diarization_pipeline(
tests/test_transcription.py CHANGED
@@ -1,5 +1,6 @@
1
  from modules.whisper.whisper_factory import WhisperFactory
2
- from modules.whisper.whisper_parameter import WhisperValues
 
3
  from modules.utils.paths import WEBUI_DIR
4
  from test_config import *
5
 
@@ -12,9 +13,9 @@ import os
12
  @pytest.mark.parametrize(
13
  "whisper_type,vad_filter,bgm_separation,diarization",
14
  [
15
- ("whisper", False, False, False),
16
- ("faster-whisper", False, False, False),
17
- ("insanely_fast_whisper", False, False, False)
18
  ]
19
  )
20
  def test_transcribe(
@@ -28,6 +29,10 @@ def test_transcribe(
28
  if not os.path.exists(audio_path):
29
  download_file(TEST_FILE_DOWNLOAD_URL, audio_path_dir)
30
 
 
 
 
 
31
  whisper_inferencer = WhisperFactory.create_whisper_inference(
32
  whisper_type=whisper_type,
33
  )
@@ -37,16 +42,24 @@ def test_transcribe(
37
  f"""Diarization Device: {whisper_inferencer.diarizer.device}"""
38
  )
39
 
40
- hparams = WhisperValues(
41
- model_size=TEST_WHISPER_MODEL,
42
- vad_filter=vad_filter,
43
- is_bgm_separate=bgm_separation,
44
- compute_type=whisper_inferencer.current_compute_type,
45
- uvr_enable_offload=True,
46
- is_diarize=diarization,
47
- ).as_list()
48
-
49
- subtitle_str, file_path = whisper_inferencer.transcribe_file(
50
  [audio_path],
51
  None,
52
  "SRT",
@@ -54,29 +67,29 @@ def test_transcribe(
54
  gr.Progress(),
55
  *hparams,
56
  )
57
-
58
- assert isinstance(subtitle_str, str) and subtitle_str
59
- assert isinstance(file_path[0], str) and file_path
60
-
61
- whisper_inferencer.transcribe_youtube(
62
- TEST_YOUTUBE_URL,
63
- "SRT",
64
- False,
65
- gr.Progress(),
66
- *hparams,
67
- )
68
- assert isinstance(subtitle_str, str) and subtitle_str
69
- assert isinstance(file_path[0], str) and file_path
70
-
71
- whisper_inferencer.transcribe_mic(
72
  audio_path,
73
  "SRT",
74
  False,
75
  gr.Progress(),
76
  *hparams,
77
  )
78
- assert isinstance(subtitle_str, str) and subtitle_str
79
- assert isinstance(file_path[0], str) and file_path
80
 
81
 
82
  def download_file(url, save_dir):
 
1
  from modules.whisper.whisper_factory import WhisperFactory
2
+ from modules.whisper.data_classes import *
3
+ from modules.utils.subtitle_manager import read_file
4
  from modules.utils.paths import WEBUI_DIR
5
  from test_config import *
6
 
 
13
  @pytest.mark.parametrize(
14
  "whisper_type,vad_filter,bgm_separation,diarization",
15
  [
16
+ (WhisperImpl.WHISPER.value, False, False, False),
17
+ (WhisperImpl.FASTER_WHISPER.value, False, False, False),
18
+ (WhisperImpl.INSANELY_FAST_WHISPER.value, False, False, False)
19
  ]
20
  )
21
  def test_transcribe(
 
29
  if not os.path.exists(audio_path):
30
  download_file(TEST_FILE_DOWNLOAD_URL, audio_path_dir)
31
 
32
+ answer = TEST_ANSWER
33
+ if diarization:
34
+ answer = "SPEAKER_00|"+TEST_ANSWER
35
+
36
  whisper_inferencer = WhisperFactory.create_whisper_inference(
37
  whisper_type=whisper_type,
38
  )
 
42
  f"""Diarization Device: {whisper_inferencer.diarizer.device}"""
43
  )
44
 
45
+ hparams = TranscriptionPipelineParams(
46
+ whisper=WhisperParams(
47
+ model_size=TEST_WHISPER_MODEL,
48
+ compute_type=whisper_inferencer.current_compute_type
49
+ ),
50
+ vad=VadParams(
51
+ vad_filter=vad_filter
52
+ ),
53
+ bgm_separation=BGMSeparationParams(
54
+ is_separate_bgm=bgm_separation,
55
+ enable_offload=True
56
+ ),
57
+ diarization=DiarizationParams(
58
+ is_diarize=diarization
59
+ ),
60
+ ).to_list()
61
+
62
+ subtitle_str, file_paths = whisper_inferencer.transcribe_file(
63
  [audio_path],
64
  None,
65
  "SRT",
 
67
  gr.Progress(),
68
  *hparams,
69
  )
70
+ subtitle = read_file(file_paths[0]).split("\n")
71
+ assert calculate_wer(answer, subtitle[2].strip().replace(",", "").replace(".", "")) < 0.1
72
+
73
+ if not is_pytube_detected_bot():
74
+ subtitle_str, file_path = whisper_inferencer.transcribe_youtube(
75
+ TEST_YOUTUBE_URL,
76
+ "SRT",
77
+ False,
78
+ gr.Progress(),
79
+ *hparams,
80
+ )
81
+ assert isinstance(subtitle_str, str) and subtitle_str
82
+ assert os.path.exists(file_path)
83
+
84
+ subtitle_str, file_path = whisper_inferencer.transcribe_mic(
85
  audio_path,
86
  "SRT",
87
  False,
88
  gr.Progress(),
89
  *hparams,
90
  )
91
+ subtitle = read_file(file_path).split("\n")
92
+ assert calculate_wer(answer, subtitle[2].strip().replace(",", "").replace(".", "")) < 0.1
93
 
94
 
95
  def download_file(url, save_dir):
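Why the assertions above read subtitle[2]: each SRT caption block starts with a numeric index line, then the timestamp line, then the caption text, so after splitting the file contents on newlines the first caption's text is at index 2. read_file() from modules.utils.subtitle_manager is not shown in this diff and is assumed here to simply return the file contents as a string; a plain-Python sketch of the same lookup:

    # Layout of the first SRT block:
    #   1
    #   00:00:00,000 --> 00:00:11,000
    #   <caption text>
    srt_content = "1\n00:00:00,000 --> 00:00:11,000\nAnd so my fellow Americans, ask not...\n"
    lines = srt_content.split("\n")
    first_caption_text = lines[2].strip()
    print(first_caption_text)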
tests/test_translation.py CHANGED
@@ -28,6 +28,10 @@ def test_nllb_inference(
28
  assert isinstance(file_paths[0], str)
29
 
30
 
 
 
 
 
31
  @pytest.mark.parametrize("file_path", [
32
  TEST_SUBTITLE_SRT_PATH,
33
  TEST_SUBTITLE_VTT_PATH,
 
28
  assert isinstance(file_paths[0], str)
29
 
30
 
31
+ @pytest.mark.skipif(
32
+ os.getenv("DEEPL_API_KEY") is None or not os.getenv("DEEPL_API_KEY"),
33
+ reason="DeepL API key is unavailable"
34
+ )
35
  @pytest.mark.parametrize("file_path", [
36
  TEST_SUBTITLE_SRT_PATH,
37
  TEST_SUBTITLE_VTT_PATH,
tests/test_vad.py CHANGED
@@ -1,6 +1,6 @@
1
  from modules.utils.paths import *
2
  from modules.whisper.whisper_factory import WhisperFactory
3
- from modules.whisper.whisper_parameter import WhisperValues
4
  from test_config import *
5
  from test_transcription import download_file, test_transcribe
6
 
@@ -12,9 +12,9 @@ import os
12
  @pytest.mark.parametrize(
13
  "whisper_type,vad_filter,bgm_separation,diarization",
14
  [
15
- ("whisper", True, False, False),
16
- ("faster-whisper", True, False, False),
17
- ("insanely_fast_whisper", True, False, False)
18
  ]
19
  )
20
  def test_vad_pipeline(
 
1
  from modules.utils.paths import *
2
  from modules.whisper.whisper_factory import WhisperFactory
3
+ from modules.whisper.data_classes import *
4
  from test_config import *
5
  from test_transcription import download_file, test_transcribe
6
 
 
12
  @pytest.mark.parametrize(
13
  "whisper_type,vad_filter,bgm_separation,diarization",
14
  [
15
+ (WhisperImpl.WHISPER.value, True, False, False),
16
+ (WhisperImpl.FASTER_WHISPER.value, True, False, False),
17
+ (WhisperImpl.INSANELY_FAST_WHISPER.value, True, False, False)
18
  ]
19
  )
20
  def test_vad_pipeline(