Merge branch 'master' into huggingface
- .dockerignore +7 -7
- .github/pull_request_template.md +4 -4
- .github/workflows/ci-shell.yml +3 -1
- .github/workflows/ci.yml +4 -2
- .gitignore +2 -1
- Install.bat +1 -0
- Install.sh +1 -0
- app.py +208 -310
- configs/default_parameters.yaml +1 -0
- configs/translation.yaml +459 -0
- modules/diarize/diarize_pipeline.py +6 -3
- modules/diarize/diarizer.py +15 -8
- modules/translation/deepl_api.py +18 -27
- modules/translation/nllb_inference.py +4 -5
- modules/translation/translation_base.py +28 -30
- modules/utils/constants.py +6 -0
- modules/utils/files_manager.py +6 -0
- modules/utils/paths.py +1 -0
- modules/utils/subtitle_manager.py +419 -115
- modules/vad/silero_vad.py +6 -5
- modules/whisper/{whisper_base.py → base_transcription_pipeline.py} +177 -139
- modules/whisper/data_classes.py +608 -0
- modules/whisper/faster_whisper_inference.py +8 -24
- modules/whisper/insanely_fast_whisper_inference.py +34 -34
- modules/whisper/whisper_Inference.py +28 -21
- modules/whisper/whisper_factory.py +8 -14
- modules/whisper/whisper_parameter.py +0 -369
- notebook/whisper-webui.ipynb +3 -1
- requirements.txt +4 -3
- screenshot.png +0 -0
- tests/test_bgm_separation.py +7 -7
- tests/test_config.py +25 -2
- tests/test_diarization.py +4 -4
- tests/test_transcription.py +44 -31
- tests/test_translation.py +4 -0
- tests/test_vad.py +4 -4
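
As the list above shows, modules/whisper/whisper_parameter.py (-369) is replaced by modules/whisper/data_classes.py (+608), and app.py now builds its advanced-parameter UI by asking each params class for its own Gradio components (WhisperParams.to_gradio_inputs, VadParams.to_gradio_inputs, BGMSeparationParams.to_gradio_input, DiarizationParams.to_gradio_inputs) instead of declaring every widget inline. A minimal sketch of that pattern; the field names and defaults below are illustrative, not the real data_classes.py contents:

```python
from dataclasses import dataclass
from typing import List, Optional

import gradio as gr


@dataclass
class VadParams:
    """Illustrative only: fields and defaults are made up, not the actual data_classes.py."""
    vad_filter: bool = False
    threshold: float = 0.5
    min_speech_duration_ms: int = 250

    @classmethod
    def to_gradio_inputs(cls, defaults: Optional[dict] = None) -> List[gr.components.Component]:
        # One Gradio component per field, pre-filled from the YAML defaults.
        defaults = defaults or {}
        return [
            gr.Checkbox(label="Enable Silero VAD Filter",
                        value=defaults.get("vad_filter", cls.vad_filter)),
            gr.Slider(label="Speech Threshold", minimum=0.0, maximum=1.0, step=0.01,
                      value=defaults.get("threshold", cls.threshold)),
            gr.Number(label="Minimum Speech Duration (ms)", precision=0,
                      value=defaults.get("min_speech_duration_ms", cls.min_speech_duration_ms)),
        ]


# The UI can then concatenate the component lists from each params class,
# which is what app.py now does to build `pipeline_inputs`.
```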
.dockerignore CHANGED

```diff
@@ -1,10 +1,10 @@
 # from .gitignore
-venv/
-ui/__pycache__/
-outputs/
-modules/__pycache__/
-models/
 modules/yt_tmp.wav
+**/venv/
+**/__pycache__/
+**/outputs/
+**/models/
 
+**/.idea
+**/.git
+**/.github
```
.github/pull_request_template.md CHANGED

```diff
@@ -1,5 +1,5 @@
-## Related issues
+## Related issues / PRs
 - #
 
-##
+## Summarize Changes
 1.
```
.github/workflows/ci-shell.yml CHANGED

```diff
@@ -6,9 +6,11 @@ on:
   push:
     branches:
       - master
+      - intel-gpu
   pull_request:
     branches:
       - master
+      - intel-gpu
 
 jobs:
   test-shell-script:
@@ -16,7 +18,7 @@ jobs:
     runs-on: ubuntu-latest
     strategy:
       matrix:
-        python: [
+        python: ["3.10", "3.11", "3.12"]
 
     steps:
       - name: Clean up space for action
```
.github/workflows/ci.yml CHANGED

```diff
@@ -6,9 +6,11 @@ on:
   push:
     branches:
       - master
+      - intel-gpu
   pull_request:
     branches:
       - master
+      - intel-gpu
 
 jobs:
   build:
@@ -16,7 +18,7 @@ jobs:
     runs-on: ubuntu-latest
     strategy:
       matrix:
-        python: ["3.10"]
+        python: ["3.10", "3.11", "3.12"]
 
     env:
       DEEPL_API_KEY: ${{ secrets.DEEPL_API_KEY }}
@@ -35,7 +37,7 @@ jobs:
       run: sudo apt-get update && sudo apt-get install -y git ffmpeg
 
     - name: Install dependencies
-      run: pip install -r requirements.txt pytest
+      run: pip install -r requirements.txt pytest jiwer
 
     - name: Run test
       run: python -m pytest -rs tests
```
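
The dependency step now also installs jiwer alongside pytest, and the test matrix covers Python 3.10 through 3.12. jiwer computes word error rate, so the expanded transcription tests presumably score output text against a reference; the repository's tests are not shown here, but this is the kind of check jiwer enables (strings and threshold are illustrative):

```python
import jiwer

reference = "the quick brown fox jumps over the lazy dog"
hypothesis = "the quick brown fox jumped over a lazy dog"

# jiwer.wer = (substitutions + deletions + insertions) / number of reference words
error_rate = jiwer.wer(reference, hypothesis)
assert error_rate < 0.5, f"WER too high: {error_rate:.2f}"
print(f"WER: {error_rate:.2f}")
```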
.gitignore CHANGED

```diff
@@ -10,4 +10,5 @@ outputs/
 modules/__pycache__/
 models/
 modules/yt_tmp.wav
-configs/default_parameters.yaml
+configs/default_parameters.yaml
+__pycache__/
```
Install.bat CHANGED

```diff
@@ -8,6 +8,7 @@ echo checked the venv folder. now installing requirements..
 
 call "%~dp0\venv\scripts\activate"
 
+python -m pip install -U pip
 pip install -r requirements.txt
 
 if errorlevel 1 (
```
Install.sh CHANGED

```diff
@@ -7,6 +7,7 @@ fi
 
 source venv/bin/activate
 
+python -m pip install -U pip
 pip install -r requirements.txt && echo "Requirements installed successfully." || {
   echo ""
   echo "Requirements installation failed. Please remove the venv folder and run the script again."
```
app.py CHANGED

```diff
@@ -1,27 +1,27 @@
 import os
 import argparse
 import gradio as gr
+from gradio_i18n import Translate, gettext as _
 import yaml
 
 from modules.utils.paths import (FASTER_WHISPER_MODELS_DIR, DIARIZATION_MODELS_DIR, OUTPUT_DIR, WHISPER_MODELS_DIR,
                                  INSANELY_FAST_WHISPER_MODELS_DIR, NLLB_MODELS_DIR, DEFAULT_PARAMETERS_CONFIG_PATH,
-                                 UVR_MODELS_DIR)
+                                 UVR_MODELS_DIR, I18N_YAML_PATH)
 from modules.utils.files_manager import load_yaml
 from modules.whisper.whisper_factory import WhisperFactory
-from modules.whisper.faster_whisper_inference import FasterWhisperInference
-from modules.whisper.insanely_fast_whisper_inference import InsanelyFastWhisperInference
 from modules.translation.nllb_inference import NLLBInference
 from modules.ui.htmls import *
 from modules.utils.cli_manager import str2bool
 from modules.utils.youtube_manager import get_ytmetas
 from modules.translation.deepl_api import DeepLAPI
-from modules.whisper.
+from modules.whisper.data_classes import *
 
 
 class App:
     def __init__(self, args):
         self.args = args
         self.app = gr.Blocks(css=CSS, theme=self.args.theme, delete_cache=(60, 3600))
+        self.i18n = Translate(I18N_YAML_PATH)
         self.whisper_inf = WhisperFactory.create_whisper_inference(
             whisper_type=self.args.whisper_type,
             whisper_model_dir=self.args.whisper_model_dir,
@@ -38,10 +38,10 @@ class App:
             output_dir=os.path.join(self.args.output_dir, "translations")
         )
         self.default_params = load_yaml(DEFAULT_PARAMETERS_CONFIG_PATH)
-        print(f"Use \"{self.args.whisper_type}\" implementation"
+        print(f"Use \"{self.args.whisper_type}\" implementation\n"
+              f"Device \"{self.whisper_inf.device}\" is detected")
 
-    def create_whisper_parameters(self):
+    def create_pipeline_inputs(self):
         whisper_params = self.default_params["whisper"]
         vad_params = self.default_params["vad"]
         diarization_params = self.default_params["diarization"]
@@ -49,158 +49,45 @@ class App:
 
         with gr.Row():
             dd_model = gr.Dropdown(choices=self.whisper_inf.available_models, value=whisper_params["model_size"],
-                                   label="Model")
-            dd_lang = gr.Dropdown(choices=
-                                  value=whisper_params["lang"]
+                                   label=_("Model"))
+            dd_lang = gr.Dropdown(choices=self.whisper_inf.available_langs + [AUTOMATIC_DETECTION],
+                                  value=AUTOMATIC_DETECTION if whisper_params["lang"] == AUTOMATIC_DETECTION.unwrap()
+                                  else whisper_params["lang"], label=_("Language"))
+            dd_file_format = gr.Dropdown(choices=["SRT", "WebVTT", "txt", "LRC"], value=whisper_params["file_format"], label=_("File Format"))
         with gr.Row():
-            cb_translate = gr.Checkbox(value=whisper_params["is_translate"], label="Translate to English?",
+            cb_translate = gr.Checkbox(value=whisper_params["is_translate"], label=_("Translate to English?"),
                                        interactive=True)
         with gr.Row():
-            cb_timestamp = gr.Checkbox(value=whisper_params["add_timestamp"],
+            cb_timestamp = gr.Checkbox(value=whisper_params["add_timestamp"],
+                                       label=_("Add a timestamp to the end of the filename"),
                                        interactive=True)
 
-        with gr.Accordion("Advanced Parameters", open=False):
-            …
-                                                     minimum=0, maximum=1, step=0.01, interactive=True,
-                                                     info="Resets prompt if temperature is above this value."
-                                                          " Arg has effect only if 'Condition On Previous Text' is True.")
-            tb_initial_prompt = gr.Textbox(label="Initial Prompt", value=None, interactive=True,
-                                           info="Initial prompt to use for decoding.")
-            sd_temperature = gr.Slider(label="Temperature", value=whisper_params["temperature"], minimum=0.0,
-                                       step=0.01, maximum=1.0, interactive=True,
-                                       info="Temperature for sampling. It can be a tuple of temperatures, which will be successively used upon failures according to either `Compression Ratio Threshold` or `Log Prob Threshold`.")
-            nb_compression_ratio_threshold = gr.Number(label="Compression Ratio Threshold", value=whisper_params["compression_ratio_threshold"],
-                                                       interactive=True,
-                                                       info="If the gzip compression ratio is above this value, treat as failed.")
-            nb_chunk_length = gr.Number(label="Chunk Length (s)", value=lambda: whisper_params["chunk_length"],
-                                        precision=0,
-                                        info="The length of audio segments. If it is not None, it will overwrite the default chunk_length of the FeatureExtractor.")
-            with gr.Group(visible=isinstance(self.whisper_inf, FasterWhisperInference)):
-                nb_length_penalty = gr.Number(label="Length Penalty", value=whisper_params["length_penalty"],
-                                              info="Exponential length penalty constant.")
-                nb_repetition_penalty = gr.Number(label="Repetition Penalty", value=whisper_params["repetition_penalty"],
-                                                  info="Penalty applied to the score of previously generated tokens (set > 1 to penalize).")
-                nb_no_repeat_ngram_size = gr.Number(label="No Repeat N-gram Size", value=whisper_params["no_repeat_ngram_size"],
-                                                    precision=0,
-                                                    info="Prevent repetitions of n-grams with this size (set 0 to disable).")
-                tb_prefix = gr.Textbox(label="Prefix", value=lambda: whisper_params["prefix"],
-                                       info="Optional text to provide as a prefix for the first window.")
-                cb_suppress_blank = gr.Checkbox(label="Suppress Blank", value=whisper_params["suppress_blank"],
-                                                info="Suppress blank outputs at the beginning of the sampling.")
-                tb_suppress_tokens = gr.Textbox(label="Suppress Tokens", value=whisper_params["suppress_tokens"],
-                                                info="List of token IDs to suppress. -1 will suppress a default set of symbols as defined in the model config.json file.")
-                nb_max_initial_timestamp = gr.Number(label="Max Initial Timestamp", value=whisper_params["max_initial_timestamp"],
-                                                     info="The initial timestamp cannot be later than this.")
-                cb_word_timestamps = gr.Checkbox(label="Word Timestamps", value=whisper_params["word_timestamps"],
-                                                 info="Extract word-level timestamps using the cross-attention pattern and dynamic time warping, and include the timestamps for each word in each segment.")
-                tb_prepend_punctuations = gr.Textbox(label="Prepend Punctuations", value=whisper_params["prepend_punctuations"],
-                                                     info="If 'Word Timestamps' is True, merge these punctuation symbols with the next word.")
-                tb_append_punctuations = gr.Textbox(label="Append Punctuations", value=whisper_params["append_punctuations"],
-                                                    info="If 'Word Timestamps' is True, merge these punctuation symbols with the previous word.")
-                nb_max_new_tokens = gr.Number(label="Max New Tokens", value=lambda: whisper_params["max_new_tokens"],
-                                              precision=0,
-                                              info="Maximum number of new tokens to generate per-chunk. If not set, the maximum will be set by the default max_length.")
-                nb_hallucination_silence_threshold = gr.Number(label="Hallucination Silence Threshold (sec)",
-                                                               value=lambda: whisper_params["hallucination_silence_threshold"],
-                                                               info="When 'Word Timestamps' is True, skip silent periods longer than this threshold (in seconds) when a possible hallucination is detected.")
-                tb_hotwords = gr.Textbox(label="Hotwords", value=lambda: whisper_params["hotwords"],
-                                         info="Hotwords/hint phrases to provide the model with. Has no effect if prefix is not None.")
-                nb_language_detection_threshold = gr.Number(label="Language Detection Threshold", value=lambda: whisper_params["language_detection_threshold"],
-                                                            info="If the maximum probability of the language tokens is higher than this value, the language is detected.")
-                nb_language_detection_segments = gr.Number(label="Language Detection Segments", value=lambda: whisper_params["language_detection_segments"],
-                                                           precision=0,
-                                                           info="Number of segments to consider for the language detection.")
-            with gr.Group(visible=isinstance(self.whisper_inf, InsanelyFastWhisperInference)):
-                nb_batch_size = gr.Number(label="Batch Size", value=whisper_params["batch_size"], precision=0)
-
-        with gr.Accordion("Background Music Remover Filter", open=False):
-            cb_bgm_separation = gr.Checkbox(label="Enable Background Music Remover Filter", value=uvr_params["is_separate_bgm"],
-                                            interactive=True,
-                                            info="Enabling this will remove background music by submodel before"
-                                                 " transcribing ")
-            dd_uvr_device = gr.Dropdown(label="Device", value=self.whisper_inf.music_separator.device,
-                                        choices=self.whisper_inf.music_separator.available_devices)
-            dd_uvr_model_size = gr.Dropdown(label="Model", value=uvr_params["model_size"],
-                                            choices=self.whisper_inf.music_separator.available_models)
-            nb_uvr_segment_size = gr.Number(label="Segment Size", value=uvr_params["segment_size"], precision=0)
-            cb_uvr_save_file = gr.Checkbox(label="Save separated files to output", value=uvr_params["save_file"])
-            cb_uvr_enable_offload = gr.Checkbox(label="Offload sub model after removing background music",
-                                                value=uvr_params["enable_offload"])
-
-        with gr.Accordion("Voice Detection Filter", open=False):
-            cb_vad_filter = gr.Checkbox(label="Enable Silero VAD Filter", value=vad_params["vad_filter"],
-                                        interactive=True,
-                                        info="Enable this to transcribe only detected voice parts by submodel.")
-            sd_threshold = gr.Slider(minimum=0.0, maximum=1.0, step=0.01, label="Speech Threshold",
-                                     value=vad_params["threshold"],
-                                     info="Lower it to be more sensitive to small sounds.")
-            nb_min_speech_duration_ms = gr.Number(label="Minimum Speech Duration (ms)", precision=0,
-                                                  value=vad_params["min_speech_duration_ms"],
-                                                  info="Final speech chunks shorter than this time are thrown out")
-            nb_max_speech_duration_s = gr.Number(label="Maximum Speech Duration (s)",
-                                                 value=vad_params["max_speech_duration_s"],
-                                                 info="Maximum duration of speech chunks in \"seconds\".")
-            nb_min_silence_duration_ms = gr.Number(label="Minimum Silence Duration (ms)", precision=0,
-                                                   value=vad_params["min_silence_duration_ms"],
-                                                   info="In the end of each speech chunk wait for this time"
-                                                        " before separating it")
-            nb_speech_pad_ms = gr.Number(label="Speech Padding (ms)", precision=0, value=vad_params["speech_pad_ms"],
-                                         info="Final speech chunks are padded by this time each side")
-
-        with gr.Accordion("Diarization", open=False):
-            cb_diarize = gr.Checkbox(label="Enable Diarization", value=diarization_params["is_diarize"])
-            tb_hf_token = gr.Text(label="HuggingFace Token", value=diarization_params["hf_token"],
-                                  info="This is only needed the first time you download the model. If you already have"
-                                       " models, you don't need to enter. To download the model, you must manually go "
-                                       "to \"https://huggingface.co/pyannote/speaker-diarization-3.1\" and agree to"
-                                       " their requirement.")
-            dd_diarization_device = gr.Dropdown(label="Device",
-                                                choices=self.whisper_inf.diarizer.get_available_device(),
-                                                value=self.whisper_inf.diarizer.get_device())
+        with gr.Accordion(_("Advanced Parameters"), open=False):
+            whisper_inputs = WhisperParams.to_gradio_inputs(defaults=whisper_params, only_advanced=True,
+                                                            whisper_type=self.args.whisper_type,
+                                                            available_compute_types=self.whisper_inf.available_compute_types,
+                                                            compute_type=self.whisper_inf.current_compute_type)
+
+        with gr.Accordion(_("Background Music Remover Filter"), open=False):
+            uvr_inputs = BGMSeparationParams.to_gradio_input(defaults=uvr_params,
+                                                             available_models=self.whisper_inf.music_separator.available_models,
+                                                             available_devices=self.whisper_inf.music_separator.available_devices,
+                                                             device=self.whisper_inf.music_separator.device)
+
+        with gr.Accordion(_("Voice Detection Filter"), open=False):
+            vad_inputs = VadParams.to_gradio_inputs(defaults=vad_params)
+
+        with gr.Accordion(_("Diarization"), open=False):
+            diarization_inputs = DiarizationParams.to_gradio_inputs(defaults=diarization_params,
+                                                                    available_devices=self.whisper_inf.diarizer.available_device,
+                                                                    device=self.whisper_inf.diarizer.device)
 
         dd_model.change(fn=self.on_change_models, inputs=[dd_model], outputs=[cb_translate])
 
+        pipeline_inputs = [dd_model, dd_lang, cb_translate] + whisper_inputs + vad_inputs + diarization_inputs + uvr_inputs
+
         return (
-            model_size=dd_model, lang=dd_lang, is_translate=cb_translate, beam_size=nb_beam_size,
-            log_prob_threshold=nb_log_prob_threshold, no_speech_threshold=nb_no_speech_threshold,
-            compute_type=dd_compute_type, best_of=nb_best_of, patience=nb_patience,
-            condition_on_previous_text=cb_condition_on_previous_text, initial_prompt=tb_initial_prompt,
-            temperature=sd_temperature, compression_ratio_threshold=nb_compression_ratio_threshold,
-            vad_filter=cb_vad_filter, threshold=sd_threshold, min_speech_duration_ms=nb_min_speech_duration_ms,
-            max_speech_duration_s=nb_max_speech_duration_s, min_silence_duration_ms=nb_min_silence_duration_ms,
-            speech_pad_ms=nb_speech_pad_ms, chunk_length=nb_chunk_length, batch_size=nb_batch_size,
-            is_diarize=cb_diarize, hf_token=tb_hf_token, diarization_device=dd_diarization_device,
-            length_penalty=nb_length_penalty, repetition_penalty=nb_repetition_penalty,
-            no_repeat_ngram_size=nb_no_repeat_ngram_size, prefix=tb_prefix, suppress_blank=cb_suppress_blank,
-            suppress_tokens=tb_suppress_tokens, max_initial_timestamp=nb_max_initial_timestamp,
-            word_timestamps=cb_word_timestamps, prepend_punctuations=tb_prepend_punctuations,
-            append_punctuations=tb_append_punctuations, max_new_tokens=nb_max_new_tokens,
-            hallucination_silence_threshold=nb_hallucination_silence_threshold, hotwords=tb_hotwords,
-            language_detection_threshold=nb_language_detection_threshold,
-            language_detection_segments=nb_language_detection_segments,
-            prompt_reset_on_temperature=sld_prompt_reset_on_temperature, is_bgm_separate=cb_bgm_separation,
-            uvr_device=dd_uvr_device, uvr_model_size=dd_uvr_model_size, uvr_segment_size=nb_uvr_segment_size,
-            uvr_save_file=cb_uvr_save_file, uvr_enable_offload=cb_uvr_enable_offload
-            ),
+            pipeline_inputs,
             dd_file_format,
             cb_timestamp
         )
@@ -212,185 +99,194 @@ class App:
         uvr_params = self.default_params["bgm_separation"]
 
         with self.app:
-            with
-            with gr.
-            gr.Markdown(MARKDOWN, elem_id="md_project")
-            with gr.Tabs():
-                with gr.TabItem("File"):  # tab1
-                    with gr.Column():
-                        …
-                                                     " Leave this field empty if you do not wish to use a local path.",
-                                                     visible=self.args.colab,
-                                                     value="")
-
-                        whisper_params, dd_file_format, cb_timestamp = self.create_whisper_parameters()
-
-                    with gr.Row():
-                        btn_run = gr.Button("GENERATE SUBTITLE FILE", variant="primary")
-                    with gr.Row():
-                        tb_indicator = gr.Textbox(label="Output", scale=5)
-                        files_subtitles = gr.Files(label="Downloadable output file", scale=3, interactive=False)
-                        btn_openfolder = gr.Button('📂', scale=1)
-
-                    params = [input_file, tb_input_folder, dd_file_format, cb_timestamp]
-                    btn_run.click(fn=self.whisper_inf.transcribe_file,
-                                  inputs=params + whisper_params.as_list(),
-                                  outputs=[tb_indicator, files_subtitles])
-                    btn_openfolder.click(fn=lambda: self.open_folder("outputs"), inputs=None, outputs=None)
-
-                with gr.TabItem("Youtube"):  # tab2
-                    with gr.Row():
-                        tb_youtubelink = gr.Textbox(label="Youtube Link")
-                    with gr.Row(equal_height=True):
-                        with gr.Column():
-                            …
-                    with gr.Row():
-                        mic_input = gr.Microphone(label="Record with Mic", type="filepath", interactive=True)
-                    …
-                        btn_run = gr.Button("GENERATE SUBTITLE FILE", variant="primary")
-                    with gr.Row():
-                        tb_indicator = gr.Textbox(label="Output", scale=5)
-                        files_subtitles = gr.Files(label="Downloadable output file", scale=3)
-                        btn_openfolder = gr.Button('📂', scale=1)
-                    …
-                    btn_openfolder.click(fn=lambda: self.open_folder("outputs"), inputs=None, outputs=None)
-
-                    with gr.Row():
-                        file_subs = gr.Files(type="filepath", label="Upload Subtitle Files to translate here",
-                                             file_types=['.vtt', '.srt'])
-
-                    with gr.TabItem("DeepL API"):  # sub tab1
-                        with gr.Row():
-                            …
-                        with gr.Row():
-                            …
-                                                         self.deepl_api.available_source_langs.keys()))
-                            dd_target_lang = gr.Dropdown(label="Target Language", value=deepl_params["target_lang"],
-                                                         choices=list(self.deepl_api.available_target_langs.keys()))
-                        with gr.Row():
-                            cb_is_pro = gr.Checkbox(label="Pro User?", value=deepl_params["is_pro"])
-                        with gr.Row():
-                            cb_timestamp = gr.Checkbox(value=translation_params["add_timestamp"], label="Add a timestamp to the end of the filename",
-                                                       interactive=True)
-                        with gr.Row():
-                            btn_run = gr.Button("TRANSLATE SUBTITLE FILE", variant="primary")
-                        with gr.Row():
-                            tb_indicator = gr.Textbox(label="Output", scale=5)
-                            files_subtitles = gr.Files(label="Downloadable output file", scale=3)
-                            btn_openfolder = gr.Button('📂', scale=1)
-
-                        …
-                                      inputs=[tb_api_key, file_subs, dd_source_lang, dd_target_lang,
-                                              cb_is_pro, cb_timestamp],
-                                      outputs=[tb_indicator, files_subtitles])
-
-                    with gr.TabItem("
-                        with gr.Row():
-                            dd_model_size = gr.Dropdown(label="Model", value=nllb_params["model_size"],
-                                                        choices=self.nllb_inf.available_models)
-                            dd_source_lang = gr.Dropdown(label="Source Language", value=nllb_params["source_lang"],
-                                                         choices=self.nllb_inf.available_source_langs)
-                            dd_target_lang = gr.Dropdown(label="Target Language", value=nllb_params["target_lang"],
-                                                         choices=self.nllb_inf.available_target_langs)
-                        with gr.Row():
-                            nb_max_length = gr.Number(label="Max Length Per Line", value=nllb_params["max_length"],
-                                                      precision=0)
-                        with gr.Row():
-                            cb_timestamp = gr.Checkbox(value=translation_params["add_timestamp"], label="Add a timestamp to the end of the filename",
-                                                       interactive=True)
-                        with gr.Row():
-                            …
-                        with gr.Column():
-                            …
-                        with gr.Column():
-                            with gr.Row():
-                                ad_instrumental = gr.Audio(label="Instrumental", scale=8)
-                                btn_open_instrumental_folder = gr.Button('📂', scale=1)
-                            with gr.Row():
-                                ad_vocals = gr.Audio(label="Vocals", scale=8)
-                                btn_open_vocals_folder = gr.Button('📂', scale=1)
-
-                        btn_run.click(fn=self.whisper_inf.music_separator.separate_files,
-                                      inputs=[files_audio, dd_uvr_model_size, dd_uvr_device, nb_uvr_segment_size,
-                                              cb_uvr_save_file],
-                                      outputs=[ad_instrumental, ad_vocals])
-                        btn_open_instrumental_folder.click(inputs=None,
-                                                           outputs=None,
-                                                           fn=lambda: self.open_folder(os.path.join(
-                                                               self.args.output_dir, "UVR", "instrumental"
-                                                           )))
-                        btn_open_vocals_folder.click(inputs=None,
-                                                     outputs=None,
-                                                     fn=lambda: self.open_folder(os.path.join(
-                                                         self.args.output_dir, "UVR", "vocals"
-                                                     )))
+            with self.i18n:
+                with gr.Row():
+                    with gr.Column():
+                        gr.Markdown(MARKDOWN, elem_id="md_project")
+                with gr.Tabs():
+                    with gr.TabItem(_("File")):  # tab1
+                        with gr.Column():
+                            input_file = gr.Files(type="filepath", label=_("Upload File here"))
+                            tb_input_folder = gr.Textbox(label="Input Folder Path (Optional)",
+                                                         info="Optional: Specify the folder path where the input files are located, if you prefer to use local files instead of uploading them."
+                                                              " Leave this field empty if you do not wish to use a local path.",
+                                                         visible=self.args.colab,
+                                                         value="")
+
+                        pipeline_params, dd_file_format, cb_timestamp = self.create_pipeline_inputs()
+
+                        with gr.Row():
+                            btn_run = gr.Button(_("GENERATE SUBTITLE FILE"), variant="primary")
+                        with gr.Row():
+                            tb_indicator = gr.Textbox(label=_("Output"), scale=5)
+                            files_subtitles = gr.Files(label=_("Downloadable output file"), scale=3, interactive=False)
+                            btn_openfolder = gr.Button('📂', scale=1)
+
+                        params = [input_file, tb_input_folder, dd_file_format, cb_timestamp]
+                        btn_run.click(fn=self.whisper_inf.transcribe_file,
+                                      inputs=params + pipeline_params,
+                                      outputs=[tb_indicator, files_subtitles])
+                        btn_openfolder.click(fn=lambda: self.open_folder("outputs"), inputs=None, outputs=None)
+
+                    with gr.TabItem(_("Youtube")):  # tab2
+                        with gr.Row():
+                            tb_youtubelink = gr.Textbox(label=_("Youtube Link"))
+                        with gr.Row(equal_height=True):
+                            with gr.Column():
+                                img_thumbnail = gr.Image(label=_("Youtube Thumbnail"))
+                            with gr.Column():
+                                tb_title = gr.Label(label=_("Youtube Title"))
+                                tb_description = gr.Textbox(label=_("Youtube Description"), max_lines=15)
+
+                        pipeline_params, dd_file_format, cb_timestamp = self.create_pipeline_inputs()
+
+                        with gr.Row():
+                            btn_run = gr.Button(_("GENERATE SUBTITLE FILE"), variant="primary")
+                        with gr.Row():
+                            tb_indicator = gr.Textbox(label=_("Output"), scale=5)
+                            files_subtitles = gr.Files(label=_("Downloadable output file"), scale=3)
+                            btn_openfolder = gr.Button('📂', scale=1)
+
+                        params = [tb_youtubelink, dd_file_format, cb_timestamp]
+
+                        btn_run.click(fn=self.whisper_inf.transcribe_youtube,
+                                      inputs=params + pipeline_params,
+                                      outputs=[tb_indicator, files_subtitles])
+                        tb_youtubelink.change(get_ytmetas, inputs=[tb_youtubelink],
+                                              outputs=[img_thumbnail, tb_title, tb_description])
+                        btn_openfolder.click(fn=lambda: self.open_folder("outputs"), inputs=None, outputs=None)
+
+                    with gr.TabItem(_("Mic")):  # tab3
+                        with gr.Row():
+                            mic_input = gr.Microphone(label=_("Record with Mic"), type="filepath", interactive=True)
+
+                        pipeline_params, dd_file_format, cb_timestamp = self.create_pipeline_inputs()
+
+                        with gr.Row():
+                            btn_run = gr.Button(_("GENERATE SUBTITLE FILE"), variant="primary")
+                        with gr.Row():
+                            tb_indicator = gr.Textbox(label=_("Output"), scale=5)
+                            files_subtitles = gr.Files(label=_("Downloadable output file"), scale=3)
+                            btn_openfolder = gr.Button('📂', scale=1)
+
+                        params = [mic_input, dd_file_format, cb_timestamp]
+
+                        btn_run.click(fn=self.whisper_inf.transcribe_mic,
+                                      inputs=params + pipeline_params,
+                                      outputs=[tb_indicator, files_subtitles])
+                        btn_openfolder.click(fn=lambda: self.open_folder("outputs"), inputs=None, outputs=None)
+
+                    with gr.TabItem(_("T2T Translation")):  # tab 4
+                        with gr.Row():
+                            file_subs = gr.Files(type="filepath", label=_("Upload Subtitle Files to translate here"))
+
+                        with gr.TabItem(_("DeepL API")):  # sub tab1
+                            with gr.Row():
+                                tb_api_key = gr.Textbox(label=_("Your Auth Key (API KEY)"),
+                                                        value=deepl_params["api_key"])
+                            with gr.Row():
+                                dd_source_lang = gr.Dropdown(label=_("Source Language"),
+                                                             value=AUTOMATIC_DETECTION if deepl_params["source_lang"] == AUTOMATIC_DETECTION.unwrap()
+                                                             else deepl_params["source_lang"],
+                                                             choices=list(self.deepl_api.available_source_langs.keys()))
+                                dd_target_lang = gr.Dropdown(label=_("Target Language"),
+                                                             value=deepl_params["target_lang"],
+                                                             choices=list(self.deepl_api.available_target_langs.keys()))
+                            with gr.Row():
+                                cb_is_pro = gr.Checkbox(label=_("Pro User?"), value=deepl_params["is_pro"])
+                            with gr.Row():
+                                cb_timestamp = gr.Checkbox(value=translation_params["add_timestamp"],
+                                                           label=_("Add a timestamp to the end of the filename"),
+                                                           interactive=True)
+                            with gr.Row():
+                                btn_run = gr.Button(_("TRANSLATE SUBTITLE FILE"), variant="primary")
+                            with gr.Row():
+                                tb_indicator = gr.Textbox(label=_("Output"), scale=5)
+                                files_subtitles = gr.Files(label=_("Downloadable output file"), scale=3)
+                                btn_openfolder = gr.Button('📂', scale=1)
+
+                            btn_run.click(fn=self.deepl_api.translate_deepl,
+                                          inputs=[tb_api_key, file_subs, dd_source_lang, dd_target_lang,
+                                                  cb_is_pro, cb_timestamp],
+                                          outputs=[tb_indicator, files_subtitles])
+
+                            btn_openfolder.click(
+                                fn=lambda: self.open_folder(os.path.join(self.args.output_dir, "translations")),
+                                inputs=None,
+                                outputs=None)
+
+                        with gr.TabItem(_("NLLB")):  # sub tab2
+                            with gr.Row():
+                                dd_model_size = gr.Dropdown(label=_("Model"), value=nllb_params["model_size"],
+                                                            choices=self.nllb_inf.available_models)
+                                dd_source_lang = gr.Dropdown(label=_("Source Language"),
+                                                             value=nllb_params["source_lang"],
+                                                             choices=self.nllb_inf.available_source_langs)
+                                dd_target_lang = gr.Dropdown(label=_("Target Language"),
+                                                             value=nllb_params["target_lang"],
+                                                             choices=self.nllb_inf.available_target_langs)
+                            with gr.Row():
+                                nb_max_length = gr.Number(label="Max Length Per Line", value=nllb_params["max_length"],
+                                                          precision=0)
+                            with gr.Row():
+                                cb_timestamp = gr.Checkbox(value=translation_params["add_timestamp"],
+                                                           label=_("Add a timestamp to the end of the filename"),
+                                                           interactive=True)
+                            with gr.Row():
+                                btn_run = gr.Button(_("TRANSLATE SUBTITLE FILE"), variant="primary")
+                            with gr.Row():
+                                tb_indicator = gr.Textbox(label=_("Output"), scale=5)
+                                files_subtitles = gr.Files(label=_("Downloadable output file"), scale=3)
+                                btn_openfolder = gr.Button('📂', scale=1)
+                            with gr.Column():
+                                md_vram_table = gr.HTML(NLLB_VRAM_TABLE, elem_id="md_nllb_vram_table")
+
+                            btn_run.click(fn=self.nllb_inf.translate_file,
+                                          inputs=[file_subs, dd_model_size, dd_source_lang, dd_target_lang,
+                                                  nb_max_length, cb_timestamp],
+                                          outputs=[tb_indicator, files_subtitles])
+
+                            btn_openfolder.click(
+                                fn=lambda: self.open_folder(os.path.join(self.args.output_dir, "translations")),
+                                inputs=None,
+                                outputs=None)
+
+                    with gr.TabItem(_("BGM Separation")):
+                        files_audio = gr.Files(type="filepath", label=_("Upload Audio Files to separate background music"))
+                        dd_uvr_device = gr.Dropdown(label=_("Device"), value=self.whisper_inf.music_separator.device,
+                                                    choices=self.whisper_inf.music_separator.available_devices)
+                        dd_uvr_model_size = gr.Dropdown(label=_("Model"), value=uvr_params["model_size"],
+                                                        choices=self.whisper_inf.music_separator.available_models)
+                        nb_uvr_segment_size = gr.Number(label="Segment Size", value=uvr_params["segment_size"],
+                                                        precision=0)
+                        cb_uvr_save_file = gr.Checkbox(label=_("Save separated files to output"),
+                                                       value=True, visible=False)
+                        btn_run = gr.Button(_("SEPARATE BACKGROUND MUSIC"), variant="primary")
+                        with gr.Column():
+                            with gr.Row():
+                                ad_instrumental = gr.Audio(label=_("Instrumental"), scale=8)
+                                btn_open_instrumental_folder = gr.Button('📂', scale=1)
+                            with gr.Row():
+                                ad_vocals = gr.Audio(label=_("Vocals"), scale=8)
+                                btn_open_vocals_folder = gr.Button('📂', scale=1)
+
+                        btn_run.click(fn=self.whisper_inf.music_separator.separate_files,
+                                      inputs=[files_audio, dd_uvr_model_size, dd_uvr_device, nb_uvr_segment_size,
+                                              cb_uvr_save_file],
+                                      outputs=[ad_instrumental, ad_vocals])
+                        btn_open_instrumental_folder.click(inputs=None,
+                                                           outputs=None,
+                                                           fn=lambda: self.open_folder(os.path.join(
+                                                               self.args.output_dir, "UVR", "instrumental"
+                                                           )))
+                        btn_open_vocals_folder.click(inputs=None,
+                                                     outputs=None,
+                                                     fn=lambda: self.open_folder(os.path.join(
+                                                         self.args.output_dir, "UVR", "vocals"
+                                                     )))
 
         # Launch the app with optional gradio settings
         args = self.args
         self.app.queue(
             api_open=args.api_open
         ).launch(
@@ -419,10 +315,10 @@ class App:
         return gr.Checkbox(visible=True, value=False, label="Translate to English?", interactive=True)
 
 
-# Create the parser for command-line arguments
 parser = argparse.ArgumentParser()
-parser.add_argument('--whisper_type', type=str, default=
+parser.add_argument('--whisper_type', type=str, default=WhisperImpl.FASTER_WHISPER.value,
+                    choices=[item.value for item in WhisperImpl],
+                    help='A type of the whisper implementation (Github repo name)')
 parser.add_argument('--share', type=str2bool, default=False, nargs='?', const=True, help='Gradio share value')
 parser.add_argument('--server_name', type=str, default=None, help='Gradio server host')
 parser.add_argument('--server_port', type=int, default=None, help='Gradio server port')
@@ -431,8 +327,10 @@ parser.add_argument('--username', type=str, default=None, help='Gradio authentic
 parser.add_argument('--password', type=str, default=None, help='Gradio authentication password')
 parser.add_argument('--theme', type=str, default=None, help='Gradio Blocks theme')
 parser.add_argument('--colab', type=str2bool, default=False, nargs='?', const=True, help='Is colab user or not')
-parser.add_argument('--api_open', type=str2bool, default=False, nargs='?', const=True,
+parser.add_argument('--api_open', type=str2bool, default=False, nargs='?', const=True,
+                    help='Enable api or not in Gradio')
+parser.add_argument('--inbrowser', type=str2bool, default=True, nargs='?', const=True,
+                    help='Whether to automatically start Gradio app or not')
 parser.add_argument('--whisper_model_dir', type=str, default=WHISPER_MODELS_DIR,
                     help='Directory path of the whisper model')
 parser.add_argument('--faster_whisper_model_dir', type=str, default=FASTER_WHISPER_MODELS_DIR,
```
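
app.py now wraps the whole UI in a gradio_i18n Translate context built from I18N_YAML_PATH and marks every visible label with gettext as _. Stripped of the application specifics, the wiring looks roughly like this; the YAML path is an assumption (whatever I18N_YAML_PATH points to, presumably the new configs/translation.yaml):

```python
import gradio as gr
from gradio_i18n import Translate, gettext as _

I18N_YAML_PATH = "configs/translation.yaml"  # assumed path, matching the new config file

with gr.Blocks() as demo:
    # Labels passed through _() are looked up in the YAML for the visitor's language.
    with Translate(I18N_YAML_PATH):
        gr.Markdown("# Whisper-WebUI")
        btn = gr.Button(_("GENERATE SUBTITLE FILE"), variant="primary")
        out = gr.Textbox(label=_("Output"))

if __name__ == "__main__":
    demo.launch()
```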
configs/default_parameters.yaml CHANGED

```diff
@@ -1,5 +1,6 @@
 whisper:
   model_size: "large-v2"
+  file_format: "SRT"
   lang: "Automatic Detection"
   is_translate: false
   beam_size: 5
```
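
The new file_format key supplies the default for the File Format dropdown, read through the same defaults YAML that app.py already loads via load_yaml(DEFAULT_PARAMETERS_CONFIG_PATH). A minimal way to read it, using plain PyYAML rather than the project's load_yaml helper:

```python
import yaml

with open("configs/default_parameters.yaml", encoding="utf-8") as f:
    default_params = yaml.safe_load(f)

whisper_params = default_params["whisper"]
print(whisper_params["file_format"])  # "SRT" with the defaults above
print(whisper_params["lang"])         # "Automatic Detection"
```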
configs/translation.yaml
ADDED
@@ -0,0 +1,459 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
en: # English
|
2 |
+
Language: Language
|
3 |
+
File: File
|
4 |
+
Youtube: Youtube
|
5 |
+
Mic: Mic
|
6 |
+
T2T Translation: T2T Translation
|
7 |
+
BGM Separation: BGM Separation
|
8 |
+
GENERATE SUBTITLE FILE: GENERATE SUBTITLE FILE
|
9 |
+
Output: Output
|
10 |
+
Downloadable output file: Downloadable output file
|
11 |
+
Upload File here: Upload File here
|
12 |
+
Model: Model
|
13 |
+
Automatic Detection: Automatic Detection
|
14 |
+
File Format: File Format
|
15 |
+
Translate to English?: Translate to English?
|
16 |
+
Add a timestamp to the end of the filename: Add a timestamp to the end of the filename
|
17 |
+
Advanced Parameters: Advanced Parameters
|
18 |
+
Background Music Remover Filter: Background Music Remover Filter
|
19 |
+
Enabling this will remove background music: Enabling this will remove background music by submodel before transcribing
|
20 |
+
Enable Background Music Remover Filter: Enable Background Music Remover Filter
|
21 |
+
Save separated files to output: Save separated files to output
|
22 |
+
Offload sub model after removing background music: Offload sub model after removing background music
|
23 |
+
Voice Detection Filter: Voice Detection Filter
|
24 |
+
Enable this to transcribe only detected voice: Enable this to transcribe only detected voice parts by submodel.
|
25 |
+
Enable Silero VAD Filter: Enable Silero VAD Filter
|
26 |
+
Diarization: Diarization
|
27 |
+
Enable Diarization: Enable Diarization
|
28 |
+
HuggingFace Token: HuggingFace Token
|
29 |
+
This is only needed the first time you download the model: This is only needed the first time you download the model. If you already have models, you don't need to enter. To download the model, you must manually go to "https://huggingface.co/pyannote/speaker-diarization-3.1" and "https://huggingface.co/pyannote/segmentation-3.0" and agree to their requirement.
|
30 |
+
Device: Device
|
31 |
+
Youtube Link: Youtube Link
|
32 |
+
Youtube Thumbnail: Youtube Thumbnail
|
33 |
+
Youtube Title: Youtube Title
|
34 |
+
Youtube Description: Youtube Description
|
35 |
+
Record with Mic: Record with Mic
|
36 |
+
Upload Subtitle Files to translate here: Upload Subtitle Files to translate here
|
37 |
+
Your Auth Key (API KEY): Your Auth Key (API KEY)
|
38 |
+
Source Language: Source Language
|
39 |
+
Target Language: Target Language
|
40 |
+
Pro User?: Pro User?
|
41 |
+
TRANSLATE SUBTITLE FILE: TRANSLATE SUBTITLE FILE
|
42 |
+
Upload Audio Files to separate background music: Upload Audio Files to separate background music
|
43 |
+
Instrumental: Instrumental
|
44 |
+
Vocals: Vocals
|
45 |
+
SEPARATE BACKGROUND MUSIC: SEPARATE BACKGROUND MUSIC
|
46 |
+
|
47 |
+
ko: # Korean
|
48 |
+
Language: 언어
|
49 |
+
File: 파일
|
50 |
+
Youtube: 유튜브
|
51 |
+
Mic: 마이크
|
52 |
+
T2T Translation: T2T 자막 번역
|
53 |
+
BGM Separation: 배경 음악 분리
|
54 |
+
GENERATE SUBTITLE FILE: 자막 파일 생성
|
55 |
+
Output: 결과물
|
56 |
+
Downloadable output file: 결과물 파일 다운로드
|
57 |
+
Upload File here: 파일을 업로드 하세요
|
58 |
+
Model: 모델
|
59 |
+
Automatic Detection: 자동 감지
|
60 |
+
File Format: 파일 형식
|
61 |
+
Translate to English?: 영어로 번역합니까? (위스퍼 모델 자체 번역 기능)
|
62 |
+
Add a timestamp to the end of the filename: 파일 이름 끝에 타임스태프 붙이기
|
63 |
+
Advanced Parameters: 고급 변수
|
64 |
+
Background Music Remover Filter: 배경 음악 제거 필터
|
65 |
+
Enabling this will remove background music: 받아쓰기 이전에 먼저 배경 음악 제거용 서브 모델을 활성화 합니다.
|
66 |
+
Enable Background Music Remover Filter: 배경 음악 제거 필터 활성화
|
67 |
+
Save separated files to output: 분리된 배경 음악 & 음성 파일 따로 출력 폴더에 저장
|
68 |
+
Offload sub model after removing background music: 배경 음악 제거 후 서브 모델을 비활성화 합니다. (VRAM 이 부족할 시 체크하세요.)
|
69 |
+
Voice Detection Filter: 목소리 감지 필터
|
70 |
+
Enable this to transcribe only detected voice: 서브 모델에 의해 목소리라고 판단된 부분만 받아쓰기를 진행합니다.
|
71 |
+
Enable Silero VAD Filter: Silero VAD 필터 활성화
|
72 |
+
Diarization: 화자 구분
|
73 |
+
Enable Diarization: 화자 구분 활성화
|
74 |
+
HuggingFace Token: 허깅페이스 토큰
|
75 |
+
This is only needed the first time you download the model: 모델을 처음 다운받을 때만 토큰이 필요합니다. 이미 다운로드 받으신 상태라면 입력하지 않아도 됩니다. 모델을 다운 받기 위해선 "https://huggingface.co/pyannote/speaker-diarization-3.1" 와 "https://huggingface.co/pyannote/segmentation-3.0" 에서 먼저 사용 지침에 동의하셔야 합니다.
|
76 |
+
Device: 디바이스
|
77 |
+
Youtube Link: 유튜브 링크
|
78 |
+
Youtube Thumbnail: 유튜브 썸네일
|
79 |
+
Youtube Title: 유튜브 제목
|
80 |
+
Youtube Description: 유튜브 설명
|
81 |
+
Record with Mic: 마이크로 녹음하세요
|
82 |
+
Upload Subtitle Files to translate here: 번역할 자막 파일을 업로드 하세요
|
83 |
+
Your Auth Key (API KEY): DeepL API 키
|
84 |
+
Source Language: 원본 언어
|
85 |
+
Target Language: 대상 언어
|
86 |
+
Pro User?: Pro 버전 사용자
|
87 |
+
TRANSLATE SUBTITLE FILE: 자막 파일 번역
|
88 |
+
Upload Audio Files to separate background music: 배경 음악을 분리할 오디오 파일을 업로드 하세요
|
89 |
+
Instrumental: 악기
|
90 |
+
Vocals: 보컬
|
91 |
+
SEPARATE BACKGROUND MUSIC: 배경 음악 분리
|
92 |
+
|
93 |
+
ja: # Japanese
|
94 |
+
Language: 言語
|
95 |
+
File: File
|
96 |
+
Youtube: Youtube
|
97 |
+
Mic: Mic
|
98 |
+
T2T Translation: T2T Translation
|
99 |
+
BGM Separation: BGM Separation
|
100 |
+
GENERATE SUBTITLE FILE: GENERATE SUBTITLE FILE
|
101 |
+
Output: Output
|
102 |
+
Downloadable output file: Downloadable output file
|
103 |
+
Upload File here: Upload File here
|
104 |
+
Model: Model
|
105 |
+
Automatic Detection: Automatic Detection
|
106 |
+
File Format: File Format
|
107 |
+
Translate to English?: Translate to English?
|
108 |
+
Add a timestamp to the end of the filename: Add a timestamp to the end of the filename
|
109 |
+
Advanced Parameters: Advanced Parameters
|
110 |
+
Background Music Remover Filter: Background Music Remover Filter
|
111 |
+
Enabling this will remove background music: Enabling this will remove background music by submodel before transcribing
|
112 |
+
Enable Background Music Remover Filter: Enable Background Music Remover Filter
|
113 |
+
Save separated files to output: Save separated files to output
|
114 |
+
Offload sub model after removing background music: Offload sub model after removing background music
|
115 |
+
Voice Detection Filter: Voice Detection Filter
|
116 |
+
Enable this to transcribe only detected voice: Enable this to transcribe only detected voice parts by submodel.
|
117 |
+
Enable Silero VAD Filter: Enable Silero VAD Filter
|
118 |
+
Diarization: Diarization
|
119 |
+
Enable Diarization: Enable Diarization
|
120 |
+
HuggingFace Token: HuggingFace Token
|
121 |
+
This is only needed the first time you download the model: This is only needed the first time you download the model. If you already have models, you don't need to enter. To download the model, you must manually go to "https://huggingface.co/pyannote/speaker-diarization-3.1" and "https://huggingface.co/pyannote/segmentation-3.0" and agree to their requirement.
|
122 |
+
Device: Device
|
123 |
+
Youtube Link: Youtube Link
|
124 |
+
Youtube Thumbnail: Youtube Thumbnail
|
125 |
+
Youtube Title: Youtube Title
|
126 |
+
Youtube Description: Youtube Description
|
127 |
+
Record with Mic: Record with Mic
|
128 |
+
Upload Subtitle Files to translate here: Upload Subtitle Files to translate here
|
129 |
+
Your Auth Key (API KEY): Your Auth Key (API KEY)
|
130 |
+
Source Language: Source Language
|
131 |
+
Target Language: Target Language
|
132 |
+
Pro User?: Pro User?
|
133 |
+
TRANSLATE SUBTITLE FILE: TRANSLATE SUBTITLE FILE
|
134 |
+
Upload Audio Files to separate background music: Upload Audio Files to separate background music
|
135 |
+
Instrumental: Instrumental
|
136 |
+
Vocals: Vocals
|
137 |
+
SEPARATE BACKGROUND MUSIC: SEPARATE BACKGROUND MUSIC
|
138 |
+
|
139 |
+
es: # Spanish
|
140 |
+
Language: Idioma
|
141 |
+
File: File
|
142 |
+
Youtube: Youtube
|
143 |
+
Mic: Mic
|
144 |
+
T2T Translation: T2T Translation
|
145 |
+
BGM Separation: BGM Separation
|
146 |
+
GENERATE SUBTITLE FILE: GENERATE SUBTITLE FILE
|
147 |
+
Output: Output
|
148 |
+
Downloadable output file: Downloadable output file
|
149 |
+
Upload File here: Upload File here
|
150 |
+
Model: Model
|
151 |
+
Automatic Detection: Automatic Detection
|
152 |
+
File Format: File Format
|
153 |
+
Translate to English?: Translate to English?
|
154 |
+
Add a timestamp to the end of the filename: Add a timestamp to the end of the filename
|
155 |
+
Advanced Parameters: Advanced Parameters
|
156 |
+
Background Music Remover Filter: Background Music Remover Filter
|
157 |
+
Enabling this will remove background music: Enabling this will remove background music by submodel before transcribing
|
158 |
+
Enable Background Music Remover Filter: Enable Background Music Remover Filter
|
159 |
+
Save separated files to output: Save separated files to output
|
160 |
+
Offload sub model after removing background music: Offload sub model after removing background music
|
161 |
+
Voice Detection Filter: Voice Detection Filter
|
162 |
+
Enable this to transcribe only detected voice: Enable this to transcribe only detected voice parts by submodel.
|
163 |
+
Enable Silero VAD Filter: Enable Silero VAD Filter
|
164 |
+
Diarization: Diarization
|
165 |
+
Enable Diarization: Enable Diarization
|
166 |
+
HuggingFace Token: HuggingFace Token
|
167 |
+
This is only needed the first time you download the model: This is only needed the first time you download the model. If you already have models, you don't need to enter. To download the model, you must manually go to "https://huggingface.co/pyannote/speaker-diarization-3.1" and "https://huggingface.co/pyannote/segmentation-3.0" and agree to their requirement.
|
168 |
+
Device: Device
|
169 |
+
Youtube Link: Youtube Link
|
170 |
+
Youtube Thumbnail: Youtube Thumbnail
|
171 |
+
Youtube Title: Youtube Title
|
172 |
+
Youtube Description: Youtube Description
|
173 |
+
Record with Mic: Record with Mic
|
174 |
+
Upload Subtitle Files to translate here: Upload Subtitle Files to translate here
|
175 |
+
Your Auth Key (API KEY): Your Auth Key (API KEY)
|
176 |
+
Source Language: Source Language
|
177 |
+
Target Language: Target Language
|
178 |
+
Pro User?: Pro User?
|
179 |
+
TRANSLATE SUBTITLE FILE: TRANSLATE SUBTITLE FILE
|
180 |
+
Upload Audio Files to separate background music: Upload Audio Files to separate background music
|
181 |
+
Instrumental: Instrumental
|
182 |
+
Vocals: Vocals
|
183 |
+
SEPARATE BACKGROUND MUSIC: SEPARATE BACKGROUND MUSIC
|
184 |
+
|
185 |
+
fr: # French
  Language: Langue
  File: File
  Youtube: Youtube
  Mic: Mic
  T2T Translation: T2T Translation
  BGM Separation: BGM Separation
  GENERATE SUBTITLE FILE: GENERATE SUBTITLE FILE
  Output: Output
  Downloadable output file: Downloadable output file
  Upload File here: Upload File here
  Model: Model
  Automatic Detection: Automatic Detection
  File Format: File Format
  Translate to English?: Translate to English?
  Add a timestamp to the end of the filename: Add a timestamp to the end of the filename
  Advanced Parameters: Advanced Parameters
  Background Music Remover Filter: Background Music Remover Filter
  Enabling this will remove background music: Enabling this will remove background music by submodel before transcribing
  Enable Background Music Remover Filter: Enable Background Music Remover Filter
  Save separated files to output: Save separated files to output
  Offload sub model after removing background music: Offload sub model after removing background music
  Voice Detection Filter: Voice Detection Filter
  Enable this to transcribe only detected voice: Enable this to transcribe only detected voice parts by submodel.
  Enable Silero VAD Filter: Enable Silero VAD Filter
  Diarization: Diarization
  Enable Diarization: Enable Diarization
  HuggingFace Token: HuggingFace Token
  This is only needed the first time you download the model: This is only needed the first time you download the model. If you already have models, you don't need to enter. To download the model, you must manually go to "https://huggingface.co/pyannote/speaker-diarization-3.1" and "https://huggingface.co/pyannote/segmentation-3.0" and agree to their requirement.
  Device: Device
  Youtube Link: Youtube Link
  Youtube Thumbnail: Youtube Thumbnail
  Youtube Title: Youtube Title
  Youtube Description: Youtube Description
  Record with Mic: Record with Mic
  Upload Subtitle Files to translate here: Upload Subtitle Files to translate here
  Your Auth Key (API KEY): Your Auth Key (API KEY)
  Source Language: Source Language
  Target Language: Target Language
  Pro User?: Pro User?
  TRANSLATE SUBTITLE FILE: TRANSLATE SUBTITLE FILE
  Upload Audio Files to separate background music: Upload Audio Files to separate background music
  Instrumental: Instrumental
  Vocals: Vocals
  SEPARATE BACKGROUND MUSIC: SEPARATE BACKGROUND MUSIC

de: # German
  Language: Sprache
  File: File
  Youtube: Youtube
  Mic: Mic
  T2T Translation: T2T Translation
  BGM Separation: BGM Separation
  GENERATE SUBTITLE FILE: GENERATE SUBTITLE FILE
  Output: Output
  Downloadable output file: Downloadable output file
  Upload File here: Upload File here
  Model: Model
  Automatic Detection: Automatic Detection
  File Format: File Format
  Translate to English?: Translate to English?
  Add a timestamp to the end of the filename: Add a timestamp to the end of the filename
  Advanced Parameters: Advanced Parameters
  Background Music Remover Filter: Background Music Remover Filter
  Enabling this will remove background music: Enabling this will remove background music by submodel before transcribing
  Enable Background Music Remover Filter: Enable Background Music Remover Filter
  Save separated files to output: Save separated files to output
  Offload sub model after removing background music: Offload sub model after removing background music
  Voice Detection Filter: Voice Detection Filter
  Enable this to transcribe only detected voice: Enable this to transcribe only detected voice parts by submodel.
  Enable Silero VAD Filter: Enable Silero VAD Filter
  Diarization: Diarization
  Enable Diarization: Enable Diarization
  HuggingFace Token: HuggingFace Token
  This is only needed the first time you download the model: This is only needed the first time you download the model. If you already have models, you don't need to enter. To download the model, you must manually go to "https://huggingface.co/pyannote/speaker-diarization-3.1" and "https://huggingface.co/pyannote/segmentation-3.0" and agree to their requirement.
  Device: Device
  Youtube Link: Youtube Link
  Youtube Thumbnail: Youtube Thumbnail
  Youtube Title: Youtube Title
  Youtube Description: Youtube Description
  Record with Mic: Record with Mic
  Upload Subtitle Files to translate here: Upload Subtitle Files to translate here
  Your Auth Key (API KEY): Your Auth Key (API KEY)
  Source Language: Source Language
  Target Language: Target Language
  Pro User?: Pro User?
  TRANSLATE SUBTITLE FILE: TRANSLATE SUBTITLE FILE
  Upload Audio Files to separate background music: Upload Audio Files to separate background music
  Instrumental: Instrumental
  Vocals: Vocals
  SEPARATE BACKGROUND MUSIC: SEPARATE BACKGROUND MUSIC

zh: # Chinese
  Language: 语言
  File: File
  Youtube: Youtube
  Mic: Mic
  T2T Translation: T2T Translation
  BGM Separation: BGM Separation
  GENERATE SUBTITLE FILE: GENERATE SUBTITLE FILE
  Output: Output
  Downloadable output file: Downloadable output file
  Upload File here: Upload File here
  Model: Model
  Automatic Detection: Automatic Detection
  File Format: File Format
  Translate to English?: Translate to English?
  Add a timestamp to the end of the filename: Add a timestamp to the end of the filename
  Advanced Parameters: Advanced Parameters
  Background Music Remover Filter: Background Music Remover Filter
  Enabling this will remove background music: Enabling this will remove background music by submodel before transcribing
  Enable Background Music Remover Filter: Enable Background Music Remover Filter
  Save separated files to output: Save separated files to output
  Offload sub model after removing background music: Offload sub model after removing background music
  Voice Detection Filter: Voice Detection Filter
  Enable this to transcribe only detected voice: Enable this to transcribe only detected voice parts by submodel.
  Enable Silero VAD Filter: Enable Silero VAD Filter
  Diarization: Diarization
  Enable Diarization: Enable Diarization
  HuggingFace Token: HuggingFace Token
  This is only needed the first time you download the model: This is only needed the first time you download the model. If you already have models, you don't need to enter. To download the model, you must manually go to "https://huggingface.co/pyannote/speaker-diarization-3.1" and "https://huggingface.co/pyannote/segmentation-3.0" and agree to their requirement.
  Device: Device
  Youtube Link: Youtube Link
  Youtube Thumbnail: Youtube Thumbnail
  Youtube Title: Youtube Title
  Youtube Description: Youtube Description
  Record with Mic: Record with Mic
  Upload Subtitle Files to translate here: Upload Subtitle Files to translate here
  Your Auth Key (API KEY): Your Auth Key (API KEY)
  Source Language: Source Language
  Target Language: Target Language
  Pro User?: Pro User?
  TRANSLATE SUBTITLE FILE: TRANSLATE SUBTITLE FILE
  Upload Audio Files to separate background music: Upload Audio Files to separate background music
  Instrumental: Instrumental
  Vocals: Vocals
  SEPARATE BACKGROUND MUSIC: SEPARATE BACKGROUND MUSIC

uk: # Ukrainian
  Language: Мова
  File: Файл
  Youtube: Youtube
  Mic: Мікрофон
  T2T Translation: T2T Переклад
  BGM Separation: Розділення фонової музики
  GENERATE SUBTITLE FILE: СТВОРИТИ ФАЙЛ СУБТИТРІВ
  Output: Результат
  Downloadable output file: Завантажуваний файл результату
  Upload File here: Завантажте файл тут
  Model: Модель
  Automatic Detection: Автоматичне визначення
  File Format: Формат файлу
  Translate to English?: Перекласти на англійську?
  Add a timestamp to the end of the filename: Додати мітку часу до кінця імені файлу
  Advanced Parameters: Розширені параметри
  Background Music Remover Filter: Фільтр видалення фонової музики
  Enabling this will remove background music: Увімкнення цього видалить фонову музику за допомогою підмоделі перед транскрипцією
  Enable Background Music Remover Filter: Увімкнути фільтр видалення фонової музики
  Save separated files to output: Зберегти розділені файли до вихідної папки
  Offload sub model after removing background music: Вивантажити підмодель після видалення фонової музики
  Voice Detection Filter: Фільтр розпізнавання голосу
  Enable this to transcribe only detected voice: Увімкніть це, щоб транскрибувати лише розпізнані голосові частини за допомогою підмоделі
  Enable Silero VAD Filter: Увімкнути фільтр Silero VAD
  Diarization: Діаризація
  Enable Diarization: Увімкнути діаризацію
  HuggingFace Token: Токен HuggingFace
  This is only needed the first time you download the model: Це потрібно лише при першому завантаженні моделі. Якщо у вас вже є моделі, вводити не потрібно. Щоб завантажити модель, потрібно вручну перейти на "https://huggingface.co/pyannote/speaker-diarization-3.1" та "https://huggingface.co/pyannote/segmentation-3.0" і погодитися з їхніми вимогами.
  Device: Пристрій
  Youtube Link: Посилання на Youtube
  Youtube Thumbnail: Ескіз Youtube
  Youtube Title: Назва Youtube
  Youtube Description: Опис Youtube
  Record with Mic: Записати з мікрофона
  Upload Subtitle Files to translate here: Завантажте файли субтитрів для перекладу тут
  Your Auth Key (API KEY): Ваш ключ авторизації (API KEY)
  Source Language: Мова джерела
  Target Language: Мова перекладу
  Pro User?: Професійний користувач?
  TRANSLATE SUBTITLE FILE: ПЕРЕКЛАСТИ ФАЙЛ СУБТИТРІВ
  Upload Audio Files to separate background music: Завантажте аудіофайли для розділення фонової музики
  Instrumental: Інструментал
  Vocals: Вокал
  SEPARATE BACKGROUND MUSIC: РОЗДІЛИТИ ФОНОВУ МУЗИКУ

ru: # Russian
  Language: Язык
  File: Файл
  Youtube: Youtube
  Mic: Микрофон
  T2T Translation: Перевод T2T
  BGM Separation: Разделение фоновой музыки
  GENERATE SUBTITLE FILE: СГЕНЕРИРОВАТЬ ФАЙЛ СУБТИТРОВ
  Output: Результат
  Downloadable output file: Загружаемый файл результата
  Upload File here: Загрузите файл здесь
  Model: Модель
  Automatic Detection: Автоматическое определение
  File Format: Формат файла
  Translate to English?: Перевести на английский?
  Add a timestamp to the end of the filename: Добавить метку времени в конец имени файла
  Advanced Parameters: Расширенные параметры
  Background Music Remover Filter: Фильтр удаления фоновой музыки
  Enabling this will remove background music: Включение этого удалит фоновую музыку с помощью подмодели перед транскрипцией
  Enable Background Music Remover Filter: Включить фильтр удаления фоновой музыки
  Save separated files to output: Сохранить разделенные файлы в выходную папку
  Offload sub model after removing background music: Выгрузить подмодель после удаления фоновой музыки
  Voice Detection Filter: Фильтр обнаружения голоса
  Enable this to transcribe only detected voice: Включите это, чтобы транскрибировать только обнаруженные голосовые части с помощью подмодели
  Enable Silero VAD Filter: Включить фильтр Silero VAD
  Diarization: Диаризация
  Enable Diarization: Включить диаризацию
  HuggingFace Token: Токен HuggingFace
  This is only needed the first time you download the model: Это нужно только при первом скачивании модели. Если у вас уже есть модели, вводить не нужно. Чтобы скачать модель, нужно вручную перейти на "https://huggingface.co/pyannote/speaker-diarization-3.1" и "https://huggingface.co/pyannote/segmentation-3.0" и согласиться с их требованиями.
  Device: Устройство
  Youtube Link: Ссылка на Youtube
  Youtube Thumbnail: Миниатюра Youtube
  Youtube Title: Название Youtube
  Youtube Description: Описание Youtube
  Record with Mic: Записать с микрофона
  Upload Subtitle Files to translate here: Загрузите файлы субтитров для перевода здесь
  Your Auth Key (API KEY): Ваш Auth Key (API KEY)
  Source Language: Исходный язык
  Target Language: Целевой язык
  Pro User?: Профессиональный пользователь?
  TRANSLATE SUBTITLE FILE: ПЕРЕВЕСТИ ФАЙЛ СУБТИТРОВ
  Upload Audio Files to separate background music: Загрузите аудиофайлы для разделения фоновой музыки
  Instrumental: Инструментал
  Vocals: Вокал
  SEPARATE BACKGROUND MUSIC: РАЗДЕЛИТЬ ФОНОВУЮ МУЗЫКУ

tr: # Turkish
  Language: Dil
  File: Dosya
  Youtube: Youtube
  Mic: Mikrofon
  T2T Translation: T2T Çeviri
  BGM Separation: Arka Plan Müziği Ayırma
  GENERATE SUBTITLE FILE: ALTYAZI DOSYASI OLUŞTUR
  Output: Çıktı
  Downloadable output file: İndirilebilir çıktı dosyası
  Upload File here: Dosya Yükle
  Model: Model
  Automatic Detection: Otomatik Algılama
  File Format: Dosya Formatı
  Translate to English?: İngilizceye Çevir?
  Add a timestamp to the end of the filename: Dosya adının sonuna zaman damgası ekle
  Advanced Parameters: Gelişmiş Parametreler
  Background Music Remover Filter: Arka Plan Müziği Kaldırma Filtresi
  Enabling this will remove background music: Bunu etkinleştirmek, arka plan müziğini alt model tarafından transkripsiyondan önce kaldıracaktır
  Enable Background Music Remover Filter: Arka Plan Müziği Kaldırma Filtresini Etkinleştir
  Save separated files to output: Ayrılmış dosyaları çıktıya kaydet
  Offload sub model after removing background music: Arka plan müziği kaldırıldıktan sonra alt modeli devre dışı bırak
  Voice Detection Filter: Ses Algılama Filtresi
  Enable this to transcribe only detected voice: Bunu etkinleştirerek yalnızca alt model tarafından algılanan ses kısımlarını transkribe et
  Enable Silero VAD Filter: Silero VAD Filtresini Etkinleştir
  Diarization: Konuşmacı Ayrımı
  Enable Diarization: Konuşmacı Ayrımını Etkinleştir
  HuggingFace Token: HuggingFace Anahtarı
  This is only needed the first time you download the model: Bu, modeli ilk kez indirirken gereklidir. Zaten modelleriniz varsa girmenize gerek yok. Modeli indirmek için "https://huggingface.co/pyannote/speaker-diarization-3.1" ve "https://huggingface.co/pyannote/segmentation-3.0" adreslerine gidip gereksinimlerini kabul etmeniz gerekiyor
  Device: Cihaz
  Youtube Link: Youtube Bağlantısı
  Youtube Thumbnail: Youtube Küçük Resmi
  Youtube Title: Youtube Başlığı
  Youtube Description: Youtube Açıklaması
  Record with Mic: Mikrofonla Kaydet
  Upload Subtitle Files to translate here: Çeviri için altyazı dosyalarını buraya yükle
  Your Auth Key (API KEY): Yetki Anahtarınız (API ANAHTARI)
  Source Language: Kaynak Dil
  Target Language: Hedef Dil
  Pro User?: Pro Kullanıcı?
  TRANSLATE SUBTITLE FILE: ALTYAZI DOSYASINI ÇEVİR
  Upload Audio Files to separate background music: Arka plan müziğini ayırmak için ses dosyalarını yükle
  Instrumental: Enstrümantal
  Vocals: Vokal
  SEPARATE BACKGROUND MUSIC: ARKA PLAN MÜZİĞİNİ AYIR

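For reference, a minimal sketch of how a localization table like the one above could be inspected. It assumes PyYAML is installed and that the table above is the configs/translation.yaml file referenced by I18N_YAML_PATH in modules/utils/paths.py later in this diff; the app itself appears to consume it through gradio_i18n, so this snippet is illustrative only and is not part of the PR.

# Sketch: load and inspect the i18n table (assumes PyYAML; path from I18N_YAML_PATH).
import yaml

with open("configs/translation.yaml", "r", encoding="utf-8") as f:
    i18n = yaml.safe_load(f)

print(sorted(i18n.keys()))            # e.g. ['de', 'en', 'fr', 'ru', 'tr', 'uk', 'zh']
print(i18n["uk"]["Source Language"])  # "Мова джерела"
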
modules/diarize/diarize_pipeline.py
CHANGED
@@ -7,6 +7,7 @@ from pyannote.audio import Pipeline
 from typing import Optional, Union
 import torch
 
+from modules.whisper.data_classes import *
 from modules.utils.paths import DIARIZATION_MODELS_DIR
 from modules.diarize.audio_loader import load_audio, SAMPLE_RATE
 
@@ -43,6 +44,8 @@ class DiarizationPipeline:
 
 def assign_word_speakers(diarize_df, transcript_result, fill_nearest=False):
     transcript_segments = transcript_result["segments"]
+    if transcript_segments and isinstance(transcript_segments[0], Segment):
+        transcript_segments = [seg.model_dump() for seg in transcript_segments]
     for seg in transcript_segments:
         # assign speaker to segment (if any)
         diarize_df['intersection'] = np.minimum(diarize_df['end'], seg['end']) - np.maximum(diarize_df['start'],
@@ -63,7 +66,7 @@ def assign_word_speakers(diarize_df, transcript_result, fill_nearest=False):
             seg["speaker"] = speaker
 
         # assign speaker to words
-        if 'words' in seg:
+        if 'words' in seg and seg['words'] is not None:
             for word in seg['words']:
                 if 'start' in word:
                     diarize_df['intersection'] = np.minimum(diarize_df['end'], word['end']) - np.maximum(
@@ -85,10 +88,10 @@ def assign_word_speakers(diarize_df, transcript_result, fill_nearest=False):
                     if word_speaker is not None:
                         word["speaker"] = word_speaker
 
-    return
+    return {"segments": transcript_segments}
 
 
-class
+class DiarizationSegment:
     def __init__(self, start, end, speaker=None):
         self.start = start
         self.end = end
modules/diarize/diarizer.py
CHANGED
@@ -1,6 +1,6 @@
 import os
 import torch
-from typing import List, Union, BinaryIO, Optional
+from typing import List, Union, BinaryIO, Optional, Tuple
 import numpy as np
 import time
 import logging
@@ -9,6 +9,7 @@ import spaces
 from modules.utils.paths import DIARIZATION_MODELS_DIR
 from modules.diarize.diarize_pipeline import DiarizationPipeline, assign_word_speakers
 from modules.diarize.audio_loader import load_audio
+from modules.whisper.data_classes import *
 
 
 class Diarizer:
@@ -25,10 +26,10 @@ class Diarizer:
     @spaces.GPU
     def run(self,
             audio: Union[str, BinaryIO, np.ndarray],
-            transcribed_result: List[
+            transcribed_result: List[Segment],
             use_auth_token: str,
             device: Optional[str] = None
-            ):
+            ) -> Tuple[List[Segment], float]:
         """
         Diarize transcribed result as a post-processing
 
@@ -36,7 +37,7 @@ class Diarizer:
         ----------
         audio: Union[str, BinaryIO, np.ndarray]
             Audio input. This can be file path or binary type.
-        transcribed_result: List[
+        transcribed_result: List[Segment]
            transcribed result through whisper.
         use_auth_token: str
             Huggingface token with READ permission. This is only needed the first time you download the model.
@@ -46,8 +47,8 @@ class Diarizer:
 
         Returns
         ----------
-        segments_result: List[
-            list of
+        segments_result: List[Segment]
+            list of Segment that includes start, end timestamps and transcribed text
         elapsed_time: float
             elapsed time for running
         """
@@ -70,14 +71,20 @@ class Diarizer:
             {"segments": transcribed_result}
         )
 
+        segments_result = []
         for segment in diarized_result["segments"]:
             speaker = "None"
             if "speaker" in segment:
                 speaker = segment["speaker"]
-
+            diarized_text = speaker + "|" + segment["text"].strip()
+            segments_result.append(Segment(
+                start=segment["start"],
+                end=segment["end"],
+                text=diarized_text
+            ))
 
         elapsed_time = time.time() - start_time
-        return
+        return segments_result, elapsed_time
 
     @spaces.GPU
     def update_pipe(self,
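A note on the new diarized output above: each returned Segment packs the speaker label and the transcribed text into one string separated by "|". A small illustrative sketch of consuming that format follows; the split_speaker helper is hypothetical and not part of this PR, but the Segment construction mirrors the diff above.

# Hypothetical helper: split "SPEAKER_00|hello there" back into (speaker, text).
from modules.whisper.data_classes import Segment

def split_speaker(segment: Segment):
    speaker, _, text = segment.text.partition("|")
    return speaker, text

seg = Segment(start=0.0, end=1.5, text="SPEAKER_00|hello there")
print(split_speaker(seg))  # ('SPEAKER_00', 'hello there')
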
modules/translation/deepl_api.py
CHANGED
@@ -5,6 +5,7 @@ from datetime import datetime
 import gradio as gr
 
 from modules.utils.paths import TRANSLATION_OUTPUT_DIR, DEFAULT_PARAMETERS_CONFIG_PATH
+from modules.utils.constants import AUTOMATIC_DETECTION
 from modules.utils.subtitle_manager import *
 from modules.utils.files_manager import load_yaml, save_yaml
 
@@ -50,7 +51,7 @@ DEEPL_AVAILABLE_TARGET_LANGS = {
 }
 
 DEEPL_AVAILABLE_SOURCE_LANGS = {
-
+    AUTOMATIC_DETECTION: None,
     'Bulgarian': 'BG',
     'Czech': 'CS',
     'Danish': 'DA',
@@ -138,37 +139,27 @@ class DeepLAPI:
         )
 
         files_info = {}
-        for
-            if file_ext == ".srt":
-                parsed_dicts = parse_srt(file_path=file_path)
-            elif file_ext == ".vtt":
-                parsed_dicts = parse_vtt(file_path=file_path)
+        for file_path in fileobjs:
+            file_name, file_ext = os.path.splitext(os.path.basename(file_path))
+            writer = get_writer(file_ext, self.output_dir)
+            segments = writer.to_segments(file_path)
 
             batch_size = self.max_text_batch_size
-            for batch_start in range(0, len(
-                sentences_to_translate = [
+            for batch_start in range(0, len(segments), batch_size):
+                progress(batch_start / len(segments), desc="Translating..")
+                sentences_to_translate = [seg.text for seg in segments[batch_start:batch_start+batch_size]]
                 translated_texts = self.request_deepl_translate(auth_key, sentences_to_translate, source_lang,
                                                                 target_lang, is_pro)
                 for i, translated_text in enumerate(translated_texts):
-            timestamp = datetime.now().strftime("%m%d%H%M%S")
-            file_name += f"-{timestamp}"
-            output_path = os.path.join(self.output_dir, f"{file_name}{file_ext}")
-            write_file(subtitle, output_path)
+                    segments[batch_start + i].text = translated_text["text"]
+
+            subtitle, output_path = generate_file(
+                output_dir=self.output_dir,
+                output_file_name=file_name,
+                output_format=file_ext,
+                result=segments,
+                add_timestamp=add_timestamp
+            )
 
             files_info[file_name] = {"subtitle": subtitle, "path": output_path}
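The DeepL path above translates segment texts in fixed-size batches via range(0, len(segments), batch_size). A tiny self-contained sketch of the same slicing pattern, with made-up data:

# Illustrative only: the batch-slicing pattern used in deepl_api.py above.
texts = [f"line {i}" for i in range(7)]
batch_size = 3

for batch_start in range(0, len(texts), batch_size):
    batch = texts[batch_start:batch_start + batch_size]
    print(batch_start, batch)  # 0 ['line 0'...'line 2'], 3 [...], 6 ['line 6']
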
modules/translation/nllb_inference.py
CHANGED
@@ -4,10 +4,10 @@ import os
 import spaces
 
 from modules.utils.paths import TRANSLATION_OUTPUT_DIR, NLLB_MODELS_DIR
-
+import modules.translation.translation_base as base
 
 
-class NLLBInference(TranslationBase):
+class NLLBInference(base.TranslationBase):
     def __init__(self,
                  model_dir: str = NLLB_MODELS_DIR,
                  output_dir: str = TRANSLATION_OUTPUT_DIR
@@ -31,7 +31,7 @@ class NLLBInference(TranslationBase):
             text,
             max_length=max_length
         )
-        return result[0][
+        return result[0]["translation_text"]
 
     @spaces.GPU(duration=120)
     def update_model(self,
@@ -44,8 +44,7 @@ class NLLBInference(TranslationBase):
             if lang in NLLB_AVAILABLE_LANGS:
                 return NLLB_AVAILABLE_LANGS[lang]
             elif lang not in NLLB_AVAILABLE_LANGS.values():
-                raise ValueError(
-                    f"Language '{lang}' is not supported. Use one of: {list(NLLB_AVAILABLE_LANGS.keys())}")
+                raise ValueError(f"Language '{lang}' is not supported. Use one of: {list(NLLB_AVAILABLE_LANGS.keys())}")
             return lang
 
         src_lang = validate_language(src_lang)
modules/translation/translation_base.py
CHANGED
@@ -6,7 +6,8 @@ from typing import List
 from datetime import datetime
 import spaces
 
-
+import modules.translation.nllb_inference as nllb
+from modules.whisper.data_classes import *
 from modules.utils.subtitle_manager import *
 from modules.utils.files_manager import load_yaml, save_yaml
 from modules.utils.paths import DEFAULT_PARAMETERS_CONFIG_PATH, NLLB_MODELS_DIR, TRANSLATION_OUTPUT_DIR
@@ -98,32 +99,22 @@ class TranslationBase(ABC):
             files_info = {}
             for fileobj in fileobjs:
                 file_name, file_ext = os.path.splitext(os.path.basename(fileobj))
-                subtitle = get_serialized_vtt(parsed_dicts)
-
-                if add_timestamp:
-                    timestamp = datetime.now().strftime("%m%d%H%M%S")
-                    file_name += f"-{timestamp}"
-
-                output_path = os.path.join(self.output_dir, f"{file_name}{file_ext}")
-                write_file(subtitle, output_path)
-
-                files_info[file_name] = {"subtitle": subtitle, "path": output_path}
+                writer = get_writer(file_ext, self.output_dir)
+                segments = writer.to_segments(fileobj)
+                for i, segment in enumerate(segments):
+                    progress(i / len(segments), desc="Translating..")
+                    translated_text = self.translate(segment.text, max_length=max_length)
+                    segment.text = translated_text
+
+                subtitle, file_path = generate_file(
+                    output_dir=self.output_dir,
+                    output_file_name=file_name,
+                    output_format=file_ext,
+                    result=segments,
+                    add_timestamp=add_timestamp
+                )
+
+                files_info[file_name] = {"subtitle": subtitle, "path": file_path}
 
             total_result = ''
             for file_name, info in files_info.items():
@@ -136,7 +127,8 @@ class TranslationBase(ABC):
             return [gr_str, output_file_paths]
 
         except Exception as e:
-            print(f"Error: {
+            print(f"Error translating file: {e}")
+            raise
         finally:
             self.release_cuda_memory()
 
@@ -172,11 +164,17 @@ class TranslationBase(ABC):
                             tgt_lang: str,
                             max_length: int,
                             add_timestamp: bool):
+        def validate_lang(lang: str):
+            if lang in list(nllb.NLLB_AVAILABLE_LANGS.values()):
+                flipped = {value: key for key, value in nllb.NLLB_AVAILABLE_LANGS.items()}
+                return flipped[lang]
+            return lang
+
         cached_params = load_yaml(DEFAULT_PARAMETERS_CONFIG_PATH)
         cached_params["translation"]["nllb"] = {
             "model_size": model_size,
-            "source_lang": src_lang,
-            "target_lang": tgt_lang,
+            "source_lang": validate_lang(src_lang),
+            "target_lang": validate_lang(tgt_lang),
             "max_length": max_length,
         }
         cached_params["translation"]["add_timestamp"] = add_timestamp
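The validate_lang helper above caches human-readable language names rather than raw NLLB codes by flipping the NLLB_AVAILABLE_LANGS mapping. A minimal sketch of that dictionary inversion with a made-up two-entry mapping (the real table lives in modules/translation/nllb_inference.py and is much larger):

# Illustrative mapping only; flores-style codes such as "eng_Latn" are what NLLB expects.
NLLB_AVAILABLE_LANGS = {"English": "eng_Latn", "Korean": "kor_Hang"}

def validate_lang(lang: str) -> str:
    # If a raw NLLB code is passed in, convert it back to its display name.
    if lang in NLLB_AVAILABLE_LANGS.values():
        flipped = {code: name for name, code in NLLB_AVAILABLE_LANGS.items()}
        return flipped[lang]
    return lang

print(validate_lang("kor_Hang"))  # Korean
print(validate_lang("English"))   # English
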
modules/utils/constants.py
ADDED
@@ -0,0 +1,6 @@
+from gradio_i18n import Translate, gettext as _
+
+AUTOMATIC_DETECTION = _("Automatic Detection")
+GRADIO_NONE_STR = ""
+GRADIO_NONE_NUMBER_MAX = 9999
+GRADIO_NONE_NUMBER_MIN = 0
modules/utils/files_manager.py
CHANGED
@@ -67,3 +67,9 @@ def is_video(file_path):
     video_extensions = ['.mp4', '.mkv', '.avi', '.mov', '.flv', '.wmv', '.webm', '.m4v', '.mpeg', '.mpg', '.3gp']
     extension = os.path.splitext(file_path)[1].lower()
     return extension in video_extensions
+
+
+def read_file(file_path):
+    with open(file_path, "r", encoding="utf-8") as f:
+        subtitle_content = f.read()
+    return subtitle_content
modules/utils/paths.py
CHANGED
@@ -10,6 +10,7 @@ DIARIZATION_MODELS_DIR = os.path.join(MODELS_DIR, "Diarization")
 UVR_MODELS_DIR = os.path.join(MODELS_DIR, "UVR", "MDX_Net_Models")
 CONFIGS_DIR = os.path.join(WEBUI_DIR, "configs")
 DEFAULT_PARAMETERS_CONFIG_PATH = os.path.join(CONFIGS_DIR, "default_parameters.yaml")
+I18N_YAML_PATH = os.path.join(CONFIGS_DIR, "translation.yaml")
 OUTPUT_DIR = os.path.join(WEBUI_DIR, "outputs")
 TRANSLATION_OUTPUT_DIR = os.path.join(OUTPUT_DIR, "translations")
 UVR_OUTPUT_DIR = os.path.join(OUTPUT_DIR, "UVR")
modules/utils/subtitle_manager.py
CHANGED
@@ -1,123 +1,427 @@
|
|
|
|
|
|
|
|
|
|
1 |
import re
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
2 |
|
3 |
# Zero GPU
|
4 |
import spaces
|
5 |
|
6 |
-
def
|
7 |
-
|
8 |
-
|
9 |
-
seconds
|
10 |
-
milliseconds = (
|
11 |
-
|
12 |
-
|
13 |
-
|
14 |
-
|
15 |
-
|
16 |
-
|
17 |
-
|
18 |
-
|
19 |
-
|
20 |
-
|
21 |
-
|
22 |
-
|
23 |
-
|
24 |
-
|
25 |
-
|
26 |
-
|
27 |
-
def
|
28 |
-
|
29 |
-
|
30 |
-
|
31 |
-
|
32 |
-
|
33 |
-
|
34 |
-
|
35 |
-
|
36 |
-
|
37 |
-
|
38 |
-
|
39 |
-
|
40 |
-
|
41 |
-
|
42 |
-
|
43 |
-
|
44 |
-
|
45 |
-
|
46 |
-
|
47 |
-
|
48 |
-
|
49 |
-
|
50 |
-
|
51 |
-
|
52 |
-
|
53 |
-
|
54 |
-
|
55 |
-
|
56 |
-
|
57 |
-
|
58 |
-
|
59 |
-
|
60 |
-
|
61 |
-
|
62 |
-
|
63 |
-
|
64 |
-
|
65 |
-
|
66 |
-
|
67 |
-
|
68 |
-
|
69 |
-
|
70 |
-
|
71 |
-
|
72 |
-
|
73 |
-
|
74 |
-
|
75 |
-
|
76 |
-
|
77 |
-
|
78 |
-
|
79 |
-
|
80 |
-
|
81 |
-
|
82 |
-
|
83 |
-
|
84 |
-
|
85 |
-
|
86 |
-
|
87 |
-
|
88 |
-
|
89 |
-
|
90 |
-
|
91 |
-
|
92 |
-
|
93 |
-
|
94 |
-
|
95 |
-
|
96 |
-
|
97 |
-
|
98 |
-
|
99 |
-
|
100 |
-
|
101 |
-
|
102 |
-
|
103 |
-
|
104 |
-
|
105 |
-
|
106 |
-
|
107 |
-
|
108 |
-
|
109 |
-
|
110 |
-
|
111 |
-
|
112 |
-
|
113 |
-
|
114 |
-
|
115 |
-
|
116 |
-
|
117 |
-
|
118 |
-
|
119 |
-
|
120 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
121 |
|
122 |
@spaces.GPU(duration=120)
|
123 |
def safe_filename(name):
|
|
|
1 |
+
# Ported from https://github.com/openai/whisper/blob/main/whisper/utils.py
|
2 |
+
|
3 |
+
import json
|
4 |
+
import os
|
5 |
import re
|
6 |
+
import sys
|
7 |
+
import zlib
|
8 |
+
from typing import Callable, List, Optional, TextIO, Union, Dict, Tuple
|
9 |
+
from datetime import datetime
|
10 |
+
|
11 |
+
from modules.whisper.data_classes import Segment, Word
|
12 |
+
from .files_manager import read_file
|
13 |
|
14 |
# Zero GPU
|
15 |
import spaces
|
16 |
|
17 |
+
def format_timestamp(
|
18 |
+
seconds: float, always_include_hours: bool = True, decimal_marker: str = ","
|
19 |
+
) -> str:
|
20 |
+
assert seconds >= 0, "non-negative timestamp expected"
|
21 |
+
milliseconds = round(seconds * 1000.0)
|
22 |
+
|
23 |
+
hours = milliseconds // 3_600_000
|
24 |
+
milliseconds -= hours * 3_600_000
|
25 |
+
|
26 |
+
minutes = milliseconds // 60_000
|
27 |
+
milliseconds -= minutes * 60_000
|
28 |
+
|
29 |
+
seconds = milliseconds // 1_000
|
30 |
+
milliseconds -= seconds * 1_000
|
31 |
+
|
32 |
+
hours_marker = f"{hours:02d}:" if always_include_hours or hours > 0 else ""
|
33 |
+
return (
|
34 |
+
f"{hours_marker}{minutes:02d}:{seconds:02d}{decimal_marker}{milliseconds:03d}"
|
35 |
+
)
|
36 |
+
|
37 |
+
|
38 |
+
def time_str_to_seconds(time_str: str, decimal_marker: str = ",") -> float:
|
39 |
+
times = time_str.split(":")
|
40 |
+
|
41 |
+
if len(times) == 3:
|
42 |
+
hours, minutes, rest = times
|
43 |
+
hours = int(hours)
|
44 |
+
else:
|
45 |
+
hours = 0
|
46 |
+
minutes, rest = times
|
47 |
+
|
48 |
+
seconds, fractional = rest.split(decimal_marker)
|
49 |
+
|
50 |
+
minutes = int(minutes)
|
51 |
+
seconds = int(seconds)
|
52 |
+
fractional_seconds = float("0." + fractional)
|
53 |
+
|
54 |
+
return hours * 3600 + minutes * 60 + seconds + fractional_seconds
|
55 |
+
|
56 |
+
|
57 |
+
def get_start(segments: List[dict]) -> Optional[float]:
|
58 |
+
return next(
|
59 |
+
(w["start"] for s in segments for w in s["words"]),
|
60 |
+
segments[0]["start"] if segments else None,
|
61 |
+
)
|
62 |
+
|
63 |
+
|
64 |
+
def get_end(segments: List[dict]) -> Optional[float]:
|
65 |
+
return next(
|
66 |
+
(w["end"] for s in reversed(segments) for w in reversed(s["words"])),
|
67 |
+
segments[-1]["end"] if segments else None,
|
68 |
+
)
|
69 |
+
|
70 |
+
|
71 |
+
class ResultWriter:
|
72 |
+
extension: str
|
73 |
+
|
74 |
+
def __init__(self, output_dir: str):
|
75 |
+
self.output_dir = output_dir
|
76 |
+
|
77 |
+
def __call__(
|
78 |
+
self, result: Union[dict, List[Segment]], output_file_name: str,
|
79 |
+
options: Optional[dict] = None, **kwargs
|
80 |
+
):
|
81 |
+
if isinstance(result, List) and result and isinstance(result[0], Segment):
|
82 |
+
result = {"segments": [seg.model_dump() for seg in result]}
|
83 |
+
|
84 |
+
output_path = os.path.join(
|
85 |
+
self.output_dir, output_file_name + "." + self.extension
|
86 |
+
)
|
87 |
+
|
88 |
+
with open(output_path, "w", encoding="utf-8") as f:
|
89 |
+
self.write_result(result, file=f, options=options, **kwargs)
|
90 |
+
|
91 |
+
def write_result(
|
92 |
+
self, result: dict, file: TextIO, options: Optional[dict] = None, **kwargs
|
93 |
+
):
|
94 |
+
raise NotImplementedError
|
95 |
+
|
96 |
+
|
97 |
+
class WriteTXT(ResultWriter):
|
98 |
+
extension: str = "txt"
|
99 |
+
|
100 |
+
def write_result(
|
101 |
+
self, result: Union[Dict, List[Segment]], file: TextIO, options: Optional[dict] = None, **kwargs
|
102 |
+
):
|
103 |
+
for segment in result["segments"]:
|
104 |
+
print(segment["text"].strip(), file=file, flush=True)
|
105 |
+
|
106 |
+
|
107 |
+
class SubtitlesWriter(ResultWriter):
|
108 |
+
always_include_hours: bool
|
109 |
+
decimal_marker: str
|
110 |
+
|
111 |
+
def iterate_result(
|
112 |
+
self,
|
113 |
+
result: dict,
|
114 |
+
options: Optional[dict] = None,
|
115 |
+
*,
|
116 |
+
max_line_width: Optional[int] = None,
|
117 |
+
max_line_count: Optional[int] = None,
|
118 |
+
highlight_words: bool = False,
|
119 |
+
align_lrc_words: bool = False,
|
120 |
+
max_words_per_line: Optional[int] = None,
|
121 |
+
):
|
122 |
+
options = options or {}
|
123 |
+
max_line_width = max_line_width or options.get("max_line_width")
|
124 |
+
max_line_count = max_line_count or options.get("max_line_count")
|
125 |
+
highlight_words = highlight_words or options.get("highlight_words", False)
|
126 |
+
align_lrc_words = align_lrc_words or options.get("align_lrc_words", False)
|
127 |
+
max_words_per_line = max_words_per_line or options.get("max_words_per_line")
|
128 |
+
preserve_segments = max_line_count is None or max_line_width is None
|
129 |
+
max_line_width = max_line_width or 1000
|
130 |
+
max_words_per_line = max_words_per_line or 1000
|
131 |
+
|
132 |
+
def iterate_subtitles():
|
133 |
+
line_len = 0
|
134 |
+
line_count = 1
|
135 |
+
# the next subtitle to yield (a list of word timings with whitespace)
|
136 |
+
subtitle: List[dict] = []
|
137 |
+
last: float = get_start(result["segments"]) or 0.0
|
138 |
+
for segment in result["segments"]:
|
139 |
+
chunk_index = 0
|
140 |
+
words_count = max_words_per_line
|
141 |
+
while chunk_index < len(segment["words"]):
|
142 |
+
remaining_words = len(segment["words"]) - chunk_index
|
143 |
+
if max_words_per_line > len(segment["words"]) - chunk_index:
|
144 |
+
words_count = remaining_words
|
145 |
+
for i, original_timing in enumerate(
|
146 |
+
segment["words"][chunk_index : chunk_index + words_count]
|
147 |
+
):
|
148 |
+
timing = original_timing.copy()
|
149 |
+
long_pause = (
|
150 |
+
not preserve_segments and timing["start"] - last > 3.0
|
151 |
+
)
|
152 |
+
has_room = line_len + len(timing["word"]) <= max_line_width
|
153 |
+
seg_break = i == 0 and len(subtitle) > 0 and preserve_segments
|
154 |
+
if (
|
155 |
+
line_len > 0
|
156 |
+
and has_room
|
157 |
+
and not long_pause
|
158 |
+
and not seg_break
|
159 |
+
):
|
160 |
+
# line continuation
|
161 |
+
line_len += len(timing["word"])
|
162 |
+
else:
|
163 |
+
# new line
|
164 |
+
timing["word"] = timing["word"].strip()
|
165 |
+
if (
|
166 |
+
len(subtitle) > 0
|
167 |
+
and max_line_count is not None
|
168 |
+
and (long_pause or line_count >= max_line_count)
|
169 |
+
or seg_break
|
170 |
+
):
|
171 |
+
# subtitle break
|
172 |
+
yield subtitle
|
173 |
+
subtitle = []
|
174 |
+
line_count = 1
|
175 |
+
elif line_len > 0:
|
176 |
+
# line break
|
177 |
+
line_count += 1
|
178 |
+
timing["word"] = "\n" + timing["word"]
|
179 |
+
line_len = len(timing["word"].strip())
|
180 |
+
subtitle.append(timing)
|
181 |
+
last = timing["start"]
|
182 |
+
chunk_index += max_words_per_line
|
183 |
+
if len(subtitle) > 0:
|
184 |
+
yield subtitle
|
185 |
+
|
186 |
+
if len(result["segments"]) > 0 and "words" in result["segments"][0] and result["segments"][0]["words"]:
|
187 |
+
for subtitle in iterate_subtitles():
|
188 |
+
subtitle_start = self.format_timestamp(subtitle[0]["start"])
|
189 |
+
subtitle_end = self.format_timestamp(subtitle[-1]["end"])
|
190 |
+
subtitle_text = "".join([word["word"] for word in subtitle])
|
191 |
+
if highlight_words:
|
192 |
+
last = subtitle_start
|
193 |
+
all_words = [timing["word"] for timing in subtitle]
|
194 |
+
for i, this_word in enumerate(subtitle):
|
195 |
+
start = self.format_timestamp(this_word["start"])
|
196 |
+
end = self.format_timestamp(this_word["end"])
|
197 |
+
if last != start:
|
198 |
+
yield last, start, subtitle_text
|
199 |
+
|
200 |
+
yield start, end, "".join(
|
201 |
+
[
|
202 |
+
re.sub(r"^(\s*)(.*)$", r"\1<u>\2</u>", word)
|
203 |
+
if j == i
|
204 |
+
else word
|
205 |
+
for j, word in enumerate(all_words)
|
206 |
+
]
|
207 |
+
)
|
208 |
+
last = end
|
209 |
+
|
210 |
+
if align_lrc_words:
|
211 |
+
lrc_aligned_words = [f"[{self.format_timestamp(sub['start'])}]{sub['word']}" for sub in subtitle]
|
212 |
+
l_start, l_end = self.format_timestamp(subtitle[-1]['start']), self.format_timestamp(subtitle[-1]['end'])
|
213 |
+
lrc_aligned_words[-1] = f"[{l_start}]{subtitle[-1]['word']}[{l_end}]"
|
214 |
+
lrc_aligned_words = ' '.join(lrc_aligned_words)
|
215 |
+
yield None, None, lrc_aligned_words
|
216 |
+
|
217 |
+
else:
|
218 |
+
yield subtitle_start, subtitle_end, subtitle_text
|
219 |
+
else:
|
220 |
+
for segment in result["segments"]:
|
221 |
+
segment_start = self.format_timestamp(segment["start"])
|
222 |
+
segment_end = self.format_timestamp(segment["end"])
|
223 |
+
segment_text = segment["text"].strip().replace("-->", "->")
|
224 |
+
yield segment_start, segment_end, segment_text
|
225 |
+
|
226 |
+
def format_timestamp(self, seconds: float):
|
227 |
+
return format_timestamp(
|
228 |
+
seconds=seconds,
|
229 |
+
always_include_hours=self.always_include_hours,
|
230 |
+
decimal_marker=self.decimal_marker,
|
231 |
+
)
|
232 |
+
|
233 |
+
|
234 |
+
class WriteVTT(SubtitlesWriter):
|
235 |
+
extension: str = "vtt"
|
236 |
+
always_include_hours: bool = False
|
237 |
+
decimal_marker: str = "."
|
238 |
+
|
239 |
+
def write_result(
|
240 |
+
self, result: dict, file: TextIO, options: Optional[dict] = None, **kwargs
|
241 |
+
):
|
242 |
+
print("WEBVTT\n", file=file)
|
243 |
+
for start, end, text in self.iterate_result(result, options, **kwargs):
|
244 |
+
print(f"{start} --> {end}\n{text}\n", file=file, flush=True)
|
245 |
+
|
246 |
+
def to_segments(self, file_path: str) -> List[Segment]:
|
247 |
+
segments = []
|
248 |
+
|
249 |
+
blocks = read_file(file_path).split('\n\n')
|
250 |
+
|
251 |
+
for block in blocks:
|
252 |
+
if block.strip() != '' and not block.strip().startswith("WEBVTT"):
|
253 |
+
lines = block.strip().split('\n')
|
254 |
+
time_line = lines[0].split(" --> ")
|
255 |
+
start, end = time_str_to_seconds(time_line[0], self.decimal_marker), time_str_to_seconds(time_line[1], self.decimal_marker)
|
256 |
+
sentence = ' '.join(lines[1:])
|
257 |
+
|
258 |
+
segments.append(Segment(
|
259 |
+
start=start,
|
260 |
+
end=end,
|
261 |
+
text=sentence
|
262 |
+
))
|
263 |
+
|
264 |
+
return segments
|
265 |
+
|
266 |
+
|
267 |
+
class WriteSRT(SubtitlesWriter):
|
268 |
+
extension: str = "srt"
|
269 |
+
always_include_hours: bool = True
|
270 |
+
decimal_marker: str = ","
|
271 |
+
|
272 |
+
def write_result(
|
273 |
+
self, result: dict, file: TextIO, options: Optional[dict] = None, **kwargs
|
274 |
+
):
|
275 |
+
for i, (start, end, text) in enumerate(
|
276 |
+
self.iterate_result(result, options, **kwargs), start=1
|
277 |
+
):
|
278 |
+
print(f"{i}\n{start} --> {end}\n{text}\n", file=file, flush=True)
|
279 |
+
|
280 |
+
def to_segments(self, file_path: str) -> List[Segment]:
|
281 |
+
segments = []
|
282 |
+
|
283 |
+
blocks = read_file(file_path).split('\n\n')
|
284 |
+
|
285 |
+
for block in blocks:
|
286 |
+
if block.strip() != '':
|
287 |
+
lines = block.strip().split('\n')
|
288 |
+
index = lines[0]
|
289 |
+
time_line = lines[1].split(" --> ")
|
290 |
+
start, end = time_str_to_seconds(time_line[0], self.decimal_marker), time_str_to_seconds(time_line[1], self.decimal_marker)
|
291 |
+
sentence = ' '.join(lines[2:])
|
292 |
+
|
293 |
+
segments.append(Segment(
|
294 |
+
start=start,
|
295 |
+
end=end,
|
296 |
+
text=sentence
|
297 |
+
))
|
298 |
+
|
299 |
+
return segments
|
300 |
+
|
301 |
+
|
302 |
+
class WriteLRC(SubtitlesWriter):
|
303 |
+
extension: str = "lrc"
|
304 |
+
always_include_hours: bool = False
|
305 |
+
decimal_marker: str = "."
|
306 |
+
|
307 |
+
def write_result(
|
308 |
+
self, result: dict, file: TextIO, options: Optional[dict] = None, **kwargs
|
309 |
+
):
|
310 |
+
for i, (start, end, text) in enumerate(
|
311 |
+
self.iterate_result(result, options, **kwargs), start=1
|
312 |
+
):
|
313 |
+
if "align_lrc_words" in kwargs and kwargs["align_lrc_words"]:
|
314 |
+
print(f"{text}\n", file=file, flush=True)
|
315 |
+
else:
|
316 |
+
print(f"[{start}]{text}[{end}]\n", file=file, flush=True)
|
317 |
+
|
318 |
+
def to_segments(self, file_path: str) -> List[Segment]:
|
319 |
+
segments = []
|
320 |
+
|
321 |
+
blocks = read_file(file_path).split('\n')
|
322 |
+
|
323 |
+
for block in blocks:
|
324 |
+
if block.strip() != '':
|
325 |
+
lines = block.strip()
|
326 |
+
pattern = r'(\[.*?\])'
|
327 |
+
parts = re.split(pattern, lines)
|
328 |
+
parts = [part.strip() for part in parts if part]
|
329 |
+
|
330 |
+
for i, part in enumerate(parts):
|
331 |
+
sentence_i = i%2
|
332 |
+
if sentence_i == 1:
|
333 |
+
start_str, text, end_str = parts[sentence_i-1], parts[sentence_i], parts[sentence_i+1]
|
334 |
+
start_str, end_str = start_str.replace("[", "").replace("]", ""), end_str.replace("[", "").replace("]", "")
|
335 |
+
start, end = time_str_to_seconds(start_str, self.decimal_marker), time_str_to_seconds(end_str, self.decimal_marker)
|
336 |
+
|
337 |
+
segments.append(Segment(
|
338 |
+
start=start,
|
339 |
+
end=end,
|
340 |
+
text=text,
|
341 |
+
))
|
342 |
+
|
343 |
+
return segments
|
344 |
+
|
345 |
+
|
346 |
+
class WriteTSV(ResultWriter):
|
347 |
+
"""
|
348 |
+
Write a transcript to a file in TSV (tab-separated values) format containing lines like:
|
349 |
+
<start time in integer milliseconds>\t<end time in integer milliseconds>\t<transcript text>
|
350 |
+
|
351 |
+
Using integer milliseconds as start and end times means there's no chance of interference from
|
352 |
+
an environment setting a language encoding that causes the decimal in a floating point number
|
353 |
+
to appear as a comma; also is faster and more efficient to parse & store, e.g., in C++.
|
354 |
+
"""
|
355 |
+
|
356 |
+
extension: str = "tsv"
|
357 |
+
|
358 |
+
def write_result(
|
359 |
+
self, result: dict, file: TextIO, options: Optional[dict] = None, **kwargs
|
360 |
+
):
|
361 |
+
print("start", "end", "text", sep="\t", file=file)
|
362 |
+
for segment in result["segments"]:
|
363 |
+
print(round(1000 * segment["start"]), file=file, end="\t")
|
364 |
+
print(round(1000 * segment["end"]), file=file, end="\t")
|
365 |
+
print(segment["text"].strip().replace("\t", " "), file=file, flush=True)
|
366 |
+
|
367 |
+
|
368 |
+
class WriteJSON(ResultWriter):
|
369 |
+
extension: str = "json"
|
370 |
+
|
371 |
+
def write_result(
|
372 |
+
self, result: dict, file: TextIO, options: Optional[dict] = None, **kwargs
|
373 |
+
):
|
374 |
+
json.dump(result, file)
|
375 |
+
|
376 |
+
|
377 |
+
def get_writer(
|
378 |
+
output_format: str, output_dir: str
|
379 |
+
) -> Callable[[dict, TextIO, dict], None]:
|
380 |
+
output_format = output_format.strip().lower().replace(".", "")
|
381 |
+
|
382 |
+
writers = {
|
383 |
+
"txt": WriteTXT,
|
384 |
+
"vtt": WriteVTT,
|
385 |
+
"srt": WriteSRT,
|
386 |
+
"tsv": WriteTSV,
|
387 |
+
"json": WriteJSON,
|
388 |
+
"lrc": WriteLRC
|
389 |
+
}
|
390 |
+
|
391 |
+
if output_format == "all":
|
392 |
+
all_writers = [writer(output_dir) for writer in writers.values()]
|
393 |
+
|
394 |
+
def write_all(
|
395 |
+
result: dict, file: TextIO, options: Optional[dict] = None, **kwargs
|
396 |
+
):
|
397 |
+
for writer in all_writers:
|
398 |
+
writer(result, file, options, **kwargs)
|
399 |
+
|
400 |
+
return write_all
|
401 |
+
|
402 |
+
return writers[output_format](output_dir)
|
403 |
+
|
404 |
+
|
405 |
+
def generate_file(
|
406 |
+
output_format: str, output_dir: str, result: Union[dict, List[Segment]], output_file_name: str,
|
407 |
+
add_timestamp: bool = True, **kwargs
|
408 |
+
) -> Tuple[str, str]:
|
409 |
+
output_format = output_format.strip().lower().replace(".", "")
|
410 |
+
output_format = "vtt" if output_format == "webvtt" else output_format
|
411 |
+
|
412 |
+
if add_timestamp:
|
413 |
+
timestamp = datetime.now().strftime("%m%d%H%M%S")
|
414 |
+
output_file_name += f"-{timestamp}"
|
415 |
+
|
416 |
+
file_path = os.path.join(output_dir, f"{output_file_name}.{output_format}")
|
417 |
+
file_writer = get_writer(output_format=output_format, output_dir=output_dir)
|
418 |
+
|
419 |
+
if isinstance(file_writer, WriteLRC) and kwargs.get("highlight_words", False):
|
420 |
+
kwargs["highlight_words"], kwargs["align_lrc_words"] = False, True
|
421 |
+
|
422 |
+
file_writer(result=result, output_file_name=output_file_name, **kwargs)
|
423 |
+
content = read_file(file_path)
|
424 |
+
return content, file_path
|
425 |
|
426 |
@spaces.GPU(duration=120)
|
427 |
def safe_filename(name):
|
modules/vad/silero_vad.py
CHANGED
@@ -5,7 +5,8 @@ import numpy as np
 from typing import BinaryIO, Union, List, Optional, Tuple
 import warnings
 import faster_whisper
-from
+from modules.whisper.data_classes import *
+from faster_whisper.transcribe import SpeechTimestampsMap
 import gradio as gr
 
 
@@ -247,18 +248,18 @@ class SileroVAD:
 
     def restore_speech_timestamps(
         self,
-        segments: List[
+        segments: List[Segment],
         speech_chunks: List[dict],
         sampling_rate: Optional[int] = None,
-    ) -> List[
+    ) -> List[Segment]:
         if sampling_rate is None:
             sampling_rate = self.sampling_rate
 
         ts_map = SpeechTimestampsMap(speech_chunks, sampling_rate)
 
         for segment in segments:
-            segment
-            segment
+            segment.start = ts_map.get_original_time(segment.start)
+            segment.end = ts_map.get_original_time(segment.end)
 
         return segments
 
modules/whisper/{whisper_base.py → base_transcription_pipeline.py}
RENAMED
@@ -1,6 +1,6 @@
|
|
1 |
import os
|
2 |
-
import torch
|
3 |
import whisper
|
|
|
4 |
import gradio as gr
|
5 |
import torchaudio
|
6 |
from abc import ABC, abstractmethod
|
@@ -8,20 +8,20 @@ from typing import BinaryIO, Union, Tuple, List
|
|
8 |
import numpy as np
|
9 |
from datetime import datetime
|
10 |
from faster_whisper.vad import VadOptions
|
11 |
-
from dataclasses import astuple
|
12 |
|
13 |
from modules.uvr.music_separator import MusicSeparator
|
14 |
from modules.utils.paths import (WHISPER_MODELS_DIR, DIARIZATION_MODELS_DIR, OUTPUT_DIR, DEFAULT_PARAMETERS_CONFIG_PATH,
|
15 |
UVR_MODELS_DIR)
|
16 |
-
from modules.utils.
|
|
|
17 |
from modules.utils.youtube_manager import get_ytdata, get_ytaudio
|
18 |
-
from modules.utils.files_manager import get_media_files, format_gradio_files, load_yaml, save_yaml
|
19 |
-
from modules.whisper.
|
20 |
from modules.diarize.diarizer import Diarizer
|
21 |
from modules.vad.silero_vad import SileroVAD
|
22 |
|
23 |
|
24 |
-
class
|
25 |
def __init__(self,
|
26 |
model_dir: str = WHISPER_MODELS_DIR,
|
27 |
diarization_model_dir: str = DIARIZATION_MODELS_DIR,
|
@@ -47,8 +47,8 @@ class WhisperBase(ABC):
|
|
47 |
self.available_langs = sorted(list(whisper.tokenizer.LANGUAGES.values()))
|
48 |
self.translatable_models = ["large", "large-v1", "large-v2", "large-v3"]
|
49 |
self.device = self.get_device()
|
50 |
-
self.available_compute_types =
|
51 |
-
self.current_compute_type =
|
52 |
|
53 |
@abstractmethod
|
54 |
def transcribe(self,
|
@@ -71,13 +71,15 @@ class WhisperBase(ABC):
|
|
71 |
def run(self,
|
72 |
audio: Union[str, BinaryIO, np.ndarray],
|
73 |
progress: gr.Progress = gr.Progress(),
|
|
|
74 |
add_timestamp: bool = True,
|
75 |
-
*
|
76 |
-
) -> Tuple[List[
|
77 |
"""
|
78 |
Run transcription with conditional pre-processing and post-processing.
|
79 |
The VAD will be performed to remove noise from the audio input in pre-processing, if enabled.
|
80 |
The diarization will be performed in post-processing, if enabled.
|
|
|
81 |
|
82 |
Parameters
|
83 |
----------
|
@@ -85,40 +87,33 @@ class WhisperBase(ABC):
|
|
85 |
Audio input. This can be file path or binary type.
|
86 |
progress: gr.Progress
|
87 |
Indicator to show progress directly in gradio.
|
|
|
|
|
88 |
add_timestamp: bool
|
89 |
Whether to add a timestamp at the end of the filename.
|
90 |
-
*
|
91 |
-
Parameters
|
|
|
|
|
92 |
|
93 |
Returns
|
94 |
----------
|
95 |
-
segments_result: List[
|
96 |
-
list of
|
97 |
elapsed_time: float
|
98 |
elapsed time for running
|
99 |
"""
|
100 |
-
params =
|
|
|
|
|
101 |
|
102 |
-
|
103 |
-
whisper_params=params,
|
104 |
-
add_timestamp=add_timestamp
|
105 |
-
)
|
106 |
-
|
107 |
-
if params.lang is None:
|
108 |
-
pass
|
109 |
-
elif params.lang == "Automatic Detection":
|
110 |
-
params.lang = None
|
111 |
-
else:
|
112 |
-
language_code_dict = {value: key for key, value in whisper.tokenizer.LANGUAGES.items()}
|
113 |
-
params.lang = language_code_dict[params.lang]
|
114 |
-
|
115 |
-
if params.is_bgm_separate:
|
116 |
music, audio, _ = self.music_separator.separate(
|
117 |
audio=audio,
|
118 |
-
model_name=
|
119 |
-
device=
|
120 |
-
segment_size=
|
121 |
-
save_file=
|
122 |
progress=progress
|
123 |
)
|
124 |
|
@@ -130,47 +125,55 @@ class WhisperBase(ABC):
|
|
130 |
origin_sample_rate = self.music_separator.audio_info.sample_rate
|
131 |
audio = self.resample_audio(audio=audio, original_sample_rate=origin_sample_rate)
|
132 |
|
133 |
-
if
|
134 |
self.music_separator.offload()
|
135 |
|
136 |
-
if
|
137 |
-
# Explicit value set for float('inf') from gr.Number()
|
138 |
-
if params.max_speech_duration_s is None or params.max_speech_duration_s >= 9999:
|
139 |
-
params.max_speech_duration_s = float('inf')
|
140 |
-
|
141 |
vad_options = VadOptions(
|
142 |
-
threshold=
|
143 |
-
min_speech_duration_ms=
|
144 |
-
max_speech_duration_s=
|
145 |
-
min_silence_duration_ms=
|
146 |
-
speech_pad_ms=
|
147 |
)
|
148 |
|
149 |
-
|
150 |
audio=audio,
|
151 |
vad_parameters=vad_options,
|
152 |
progress=progress
|
153 |
)
|
154 |
|
|
|
|
|
|
|
|
|
|
|
155 |
result, elapsed_time = self.transcribe(
|
156 |
audio,
|
157 |
progress,
|
158 |
-
*
|
159 |
)
|
160 |
|
161 |
-
if
|
162 |
result = self.vad.restore_speech_timestamps(
|
163 |
segments=result,
|
164 |
speech_chunks=speech_chunks,
|
165 |
)
|
166 |
|
167 |
-
if
|
168 |
result, elapsed_time_diarization = self.diarizer.run(
|
169 |
audio=audio,
|
170 |
-
use_auth_token=
|
171 |
transcribed_result=result,
|
|
|
172 |
)
|
173 |
elapsed_time += elapsed_time_diarization
|
|
|
|
174 |
return result, elapsed_time
|
175 |
|
176 |
def transcribe_file(self,
|
@@ -179,8 +182,8 @@ class WhisperBase(ABC):
|
|
179 |
file_format: str = "SRT",
|
180 |
add_timestamp: bool = True,
|
181 |
progress=gr.Progress(),
|
182 |
-
*
|
183 |
-
) ->
|
184 |
"""
|
185 |
Write subtitle file from Files
|
186 |
|
@@ -197,8 +200,8 @@ class WhisperBase(ABC):
|
|
197 |
Boolean value from gr.Checkbox() that determines whether to add a timestamp at the end of the subtitle filename.
|
198 |
progress: gr.Progress
|
199 |
Indicator to show progress directly in gradio.
|
200 |
-
*
|
201 |
-
Parameters
|
202 |
|
203 |
Returns
|
204 |
----------
|
@@ -208,6 +211,11 @@ class WhisperBase(ABC):
|
|
208 |
Output file path to return to gr.Files()
|
209 |
"""
|
210 |
try:
|
|
|
|
211 |
if input_folder_path:
|
212 |
files = get_media_files(input_folder_path)
|
213 |
if isinstance(files, str):
|
@@ -220,19 +228,21 @@ class WhisperBase(ABC):
|
|
220 |
transcribed_segments, time_for_task = self.run(
|
221 |
file,
|
222 |
progress,
|
|
|
223 |
add_timestamp,
|
224 |
-
*
|
225 |
)
|
226 |
|
227 |
file_name, file_ext = os.path.splitext(os.path.basename(file))
|
228 |
-
subtitle, file_path =
|
229 |
-
|
230 |
-
|
|
|
|
|
231 |
add_timestamp=add_timestamp,
|
232 |
-
|
233 |
-
output_dir=self.output_dir
|
234 |
)
|
235 |
-
files_info[file_name] = {"subtitle":
|
236 |
|
237 |
total_result = ''
|
238 |
total_time = 0
|
@@ -245,10 +255,11 @@ class WhisperBase(ABC):
|
|
245 |
result_str = f"Done in {self.format_time(total_time)}! Subtitle is in the outputs folder.\n\n{total_result}"
|
246 |
result_file_path = [info['path'] for info in files_info.values()]
|
247 |
|
248 |
-
return
|
249 |
|
250 |
except Exception as e:
|
251 |
print(f"Error transcribing file: {e}")
|
|
|
252 |
finally:
|
253 |
self.release_cuda_memory()
|
254 |
|
@@ -257,8 +268,8 @@ class WhisperBase(ABC):
|
|
257 |
file_format: str = "SRT",
|
258 |
add_timestamp: bool = True,
|
259 |
progress=gr.Progress(),
|
260 |
-
*
|
261 |
-
) ->
|
262 |
"""
|
263 |
Write subtitle file from microphone
|
264 |
|
@@ -272,7 +283,7 @@ class WhisperBase(ABC):
|
|
272 |
Boolean value from gr.Checkbox() that determines whether to add a timestamp at the end of the filename.
|
273 |
progress: gr.Progress
|
274 |
Indicator to show progress directly in gradio.
|
275 |
-
*
|
276 |
Parameters related with whisper. This will be dealt with "WhisperParameters" data class
|
277 |
|
278 |
Returns
|
@@ -283,27 +294,36 @@ class WhisperBase(ABC):
|
|
283 |
Output file path to return to gr.Files()
|
284 |
"""
|
285 |
try:
|
|
|
|
286 |
progress(0, desc="Loading Audio..")
|
287 |
transcribed_segments, time_for_task = self.run(
|
288 |
mic_audio,
|
289 |
progress,
|
|
|
290 |
add_timestamp,
|
291 |
-
*
|
292 |
)
|
293 |
progress(1, desc="Completed!")
|
294 |
|
295 |
-
|
296 |
-
|
297 |
-
|
|
|
|
|
|
|
298 |
add_timestamp=add_timestamp,
|
299 |
-
|
300 |
-
output_dir=self.output_dir
|
301 |
)
|
302 |
|
303 |
result_str = f"Done in {self.format_time(time_for_task)}! Subtitle file is in the outputs folder.\n\n{subtitle}"
|
304 |
-
return
|
305 |
except Exception as e:
|
306 |
-
print(f"Error transcribing
|
|
|
307 |
finally:
|
308 |
self.release_cuda_memory()
|
309 |
|
@@ -312,8 +332,8 @@ class WhisperBase(ABC):
|
|
312 |
file_format: str = "SRT",
|
313 |
add_timestamp: bool = True,
|
314 |
progress=gr.Progress(),
|
315 |
-
*
|
316 |
-
) ->
|
317 |
"""
|
318 |
Write subtitle file from Youtube
|
319 |
|
@@ -327,7 +347,7 @@ class WhisperBase(ABC):
|
|
327 |
Boolean value from gr.Checkbox() that determines whether to add a timestamp at the end of the filename.
|
328 |
progress: gr.Progress
|
329 |
Indicator to show progress directly in gradio.
|
330 |
-
*
|
331 |
Parameters related with whisper. This will be dealt with "WhisperParameters" data class
|
332 |
|
333 |
Returns
|
@@ -338,6 +358,11 @@ class WhisperBase(ABC):
|
|
338 |
Output file path to return to gr.Files()
|
339 |
"""
|
340 |
try:
|
|
|
|
341 |
progress(0, desc="Loading Audio from Youtube..")
|
342 |
yt = get_ytdata(youtube_link)
|
343 |
audio = get_ytaudio(yt)
|
@@ -345,83 +370,49 @@ class WhisperBase(ABC):
|
|
345 |
transcribed_segments, time_for_task = self.run(
|
346 |
audio,
|
347 |
progress,
|
|
|
348 |
add_timestamp,
|
349 |
-
*
|
350 |
)
|
351 |
|
352 |
progress(1, desc="Completed!")
|
353 |
|
354 |
file_name = safe_filename(yt.title)
|
355 |
-
subtitle,
|
356 |
-
|
357 |
-
|
|
|
|
|
358 |
add_timestamp=add_timestamp,
|
359 |
-
|
360 |
-
output_dir=self.output_dir
|
361 |
)
|
|
|
362 |
result_str = f"Done in {self.format_time(time_for_task)}! Subtitle file is in the outputs folder.\n\n{subtitle}"
|
363 |
|
364 |
if os.path.exists(audio):
|
365 |
os.remove(audio)
|
366 |
|
367 |
-
return
|
368 |
|
369 |
except Exception as e:
|
370 |
-
print(f"Error transcribing
|
|
|
371 |
finally:
|
372 |
self.release_cuda_memory()
|
373 |
|
374 |
-
|
375 |
-
|
376 |
-
|
377 |
-
|
378 |
-
|
379 |
-
output_dir: str
|
380 |
-
) -> str:
|
381 |
-
"""
|
382 |
-
Writes subtitle file
|
383 |
-
|
384 |
-
Parameters
|
385 |
-
----------
|
386 |
-
file_name: str
|
387 |
-
Output file name
|
388 |
-
transcribed_segments: list
|
389 |
-
Text segments transcribed from audio
|
390 |
-
add_timestamp: bool
|
391 |
-
Determines whether to add a timestamp to the end of the filename.
|
392 |
-
file_format: str
|
393 |
-
File format to write. Supported formats: [SRT, WebVTT, txt]
|
394 |
-
output_dir: str
|
395 |
-
Directory path of the output
|
396 |
-
|
397 |
-
Returns
|
398 |
-
----------
|
399 |
-
content: str
|
400 |
-
Result of the transcription
|
401 |
-
output_path: str
|
402 |
-
output file path
|
403 |
-
"""
|
404 |
-
if add_timestamp:
|
405 |
-
timestamp = datetime.now().strftime("%m%d%H%M%S")
|
406 |
-
output_path = os.path.join(output_dir, f"{file_name}-{timestamp}")
|
407 |
else:
|
408 |
-
|
409 |
-
|
410 |
-
file_format = file_format.strip().lower()
|
411 |
-
if file_format == "srt":
|
412 |
-
content = get_srt(transcribed_segments)
|
413 |
-
output_path += '.srt'
|
414 |
-
|
415 |
-
elif file_format == "webvtt":
|
416 |
-
content = get_vtt(transcribed_segments)
|
417 |
-
output_path += '.vtt'
|
418 |
|
419 |
-
|
420 |
-
|
421 |
-
|
422 |
-
|
423 |
-
|
424 |
-
return content, output_path
|
425 |
|
426 |
@staticmethod
|
427 |
def format_time(elapsed_time: float) -> str:
|
@@ -455,7 +446,7 @@ class WhisperBase(ABC):
|
|
455 |
if torch.cuda.is_available():
|
456 |
return "cuda"
|
457 |
elif torch.backends.mps.is_available():
|
458 |
-
if not
|
459 |
# Device `SparseMPS` is not supported for now. See : https://github.com/pytorch/pytorch/issues/87886
|
460 |
return "cpu"
|
461 |
return "mps"
|
@@ -496,18 +487,65 @@ class WhisperBase(ABC):
|
|
496 |
if file_path and os.path.exists(file_path):
|
497 |
os.remove(file_path)
|
498 |
|
|
|
|
499 |
@staticmethod
|
500 |
def cache_parameters(
|
501 |
-
|
502 |
-
|
|
|
503 |
):
|
504 |
-
"""
|
505 |
cached_params = load_yaml(DEFAULT_PARAMETERS_CONFIG_PATH)
|
506 |
-
|
507 |
-
|
|
|
508 |
cached_yaml["whisper"]["add_timestamp"] = add_timestamp
|
|
|
|
509 |
|
510 |
-
|
|
|
511 |
|
512 |
@staticmethod
|
513 |
def resample_audio(audio: Union[str, np.ndarray],
|
|
|
1 |
import os
|
|
|
2 |
import whisper
|
3 |
+
import ctranslate2
|
4 |
import gradio as gr
|
5 |
import torchaudio
|
6 |
from abc import ABC, abstractmethod
|
|
|
8 |
import numpy as np
|
9 |
from datetime import datetime
|
10 |
from faster_whisper.vad import VadOptions
|
|
|
11 |
|
12 |
from modules.uvr.music_separator import MusicSeparator
|
13 |
from modules.utils.paths import (WHISPER_MODELS_DIR, DIARIZATION_MODELS_DIR, OUTPUT_DIR, DEFAULT_PARAMETERS_CONFIG_PATH,
|
14 |
UVR_MODELS_DIR)
|
15 |
+
from modules.utils.constants import *
|
16 |
+
from modules.utils.subtitle_manager import *
|
17 |
from modules.utils.youtube_manager import get_ytdata, get_ytaudio
|
18 |
+
from modules.utils.files_manager import get_media_files, format_gradio_files, load_yaml, save_yaml, read_file
|
19 |
+
from modules.whisper.data_classes import *
|
20 |
from modules.diarize.diarizer import Diarizer
|
21 |
from modules.vad.silero_vad import SileroVAD
|
22 |
|
23 |
|
24 |
+
class BaseTranscriptionPipeline(ABC):
|
25 |
def __init__(self,
|
26 |
model_dir: str = WHISPER_MODELS_DIR,
|
27 |
diarization_model_dir: str = DIARIZATION_MODELS_DIR,
|
|
|
47 |
self.available_langs = sorted(list(whisper.tokenizer.LANGUAGES.values()))
|
48 |
self.translatable_models = ["large", "large-v1", "large-v2", "large-v3"]
|
49 |
self.device = self.get_device()
|
50 |
+
self.available_compute_types = self.get_available_compute_type()
|
51 |
+
self.current_compute_type = self.get_compute_type()
|
52 |
|
53 |
@abstractmethod
|
54 |
def transcribe(self,
|
|
|
71 |
def run(self,
|
72 |
audio: Union[str, BinaryIO, np.ndarray],
|
73 |
progress: gr.Progress = gr.Progress(),
|
74 |
+
file_format: str = "SRT",
|
75 |
add_timestamp: bool = True,
|
76 |
+
*pipeline_params,
|
77 |
+
) -> Tuple[List[Segment], float]:
|
78 |
"""
|
79 |
Run transcription with conditional pre-processing and post-processing.
|
80 |
The VAD will be performed to remove noise from the audio input in pre-processing, if enabled.
|
81 |
The diarization will be performed in post-processing, if enabled.
|
82 |
+
Due to the integration with gradio, the parameters have to be specified with a `*` wildcard.
|
83 |
|
84 |
Parameters
|
85 |
----------
|
|
|
87 |
Audio input. This can be file path or binary type.
|
88 |
progress: gr.Progress
|
89 |
Indicator to show progress directly in gradio.
|
90 |
+
file_format: str
|
91 |
+
Subtitle file format between ["SRT", "WebVTT", "txt", "lrc"]
|
92 |
add_timestamp: bool
|
93 |
Whether to add a timestamp at the end of the filename.
|
94 |
+
*pipeline_params: tuple
|
95 |
+
Parameters for the transcription pipeline. This will be dealt with "TranscriptionPipelineParams" data class.
|
96 |
+
This must be provided as a List with * wildcard because of the integration with gradio.
|
97 |
+
See more info at : https://github.com/gradio-app/gradio/issues/2471
|
98 |
|
99 |
Returns
|
100 |
----------
|
101 |
+
segments_result: List[Segment]
|
102 |
+
list of Segment that includes start, end timestamps and transcribed text
|
103 |
elapsed_time: float
|
104 |
elapsed time for running
|
105 |
"""
|
106 |
+
params = TranscriptionPipelineParams.from_list(list(pipeline_params))
|
107 |
+
params = self.validate_gradio_values(params)
|
108 |
+
bgm_params, vad_params, whisper_params, diarization_params = params.bgm_separation, params.vad, params.whisper, params.diarization
|
109 |
|
110 |
+
if bgm_params.is_separate_bgm:
|
|
|
|
111 |
music, audio, _ = self.music_separator.separate(
|
112 |
audio=audio,
|
113 |
+
model_name=bgm_params.model_size,
|
114 |
+
device=bgm_params.device,
|
115 |
+
segment_size=bgm_params.segment_size,
|
116 |
+
save_file=bgm_params.save_file,
|
117 |
progress=progress
|
118 |
)
|
119 |
|
|
|
125 |
origin_sample_rate = self.music_separator.audio_info.sample_rate
|
126 |
audio = self.resample_audio(audio=audio, original_sample_rate=origin_sample_rate)
|
127 |
|
128 |
+
if bgm_params.enable_offload:
|
129 |
self.music_separator.offload()
|
130 |
|
131 |
+
if vad_params.vad_filter:
|
|
|
|
132 |
vad_options = VadOptions(
|
133 |
+
threshold=vad_params.threshold,
|
134 |
+
min_speech_duration_ms=vad_params.min_speech_duration_ms,
|
135 |
+
max_speech_duration_s=vad_params.max_speech_duration_s,
|
136 |
+
min_silence_duration_ms=vad_params.min_silence_duration_ms,
|
137 |
+
speech_pad_ms=vad_params.speech_pad_ms
|
138 |
)
|
139 |
|
140 |
+
vad_processed, speech_chunks = self.vad.run(
|
141 |
audio=audio,
|
142 |
vad_parameters=vad_options,
|
143 |
progress=progress
|
144 |
)
|
145 |
|
146 |
+
if vad_processed.size > 0:
|
147 |
+
audio = vad_processed
|
148 |
+
else:
|
149 |
+
vad_params.vad_filter = False
|
150 |
+
|
151 |
result, elapsed_time = self.transcribe(
|
152 |
audio,
|
153 |
progress,
|
154 |
+
*whisper_params.to_list()
|
155 |
)
|
156 |
|
157 |
+
if vad_params.vad_filter:
|
158 |
result = self.vad.restore_speech_timestamps(
|
159 |
segments=result,
|
160 |
speech_chunks=speech_chunks,
|
161 |
)
|
162 |
|
163 |
+
if diarization_params.is_diarize:
|
164 |
result, elapsed_time_diarization = self.diarizer.run(
|
165 |
audio=audio,
|
166 |
+
use_auth_token=diarization_params.hf_token,
|
167 |
transcribed_result=result,
|
168 |
+
device=diarization_params.device
|
169 |
)
|
170 |
elapsed_time += elapsed_time_diarization
|
171 |
+
|
172 |
+
self.cache_parameters(
|
173 |
+
params=params,
|
174 |
+
file_format=file_format,
|
175 |
+
add_timestamp=add_timestamp
|
176 |
+
)
|
177 |
return result, elapsed_time
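
The VAD branch above trims the audio to the detected speech regions before transcription and then maps the segment timestamps back onto the original timeline with `restore_speech_timestamps`. A minimal standalone sketch of that mapping idea, assuming simple `{"start", "end"}` chunks in samples rather than the project's SileroVAD wrapper:

```python
# Sketch only: chunk format and helper name are illustrative assumptions,
# not the project's actual SileroVAD / faster-whisper API.
SAMPLE_RATE = 16000


def to_original_time(t: float, chunks: list, sample_rate: int = SAMPLE_RATE) -> float:
    """Convert a time (seconds) measured on concatenated speech-only audio
    back to a time on the original, untrimmed audio."""
    elapsed = 0.0  # trimmed-audio seconds covered by previous chunks
    for chunk in chunks:
        start = chunk["start"] / sample_rate
        end = chunk["end"] / sample_rate
        duration = end - start
        if t <= elapsed + duration:
            return start + (t - elapsed)
        elapsed += duration
    # Past the last chunk: clamp to the end of the last speech region.
    return chunks[-1]["end"] / sample_rate if chunks else t


if __name__ == "__main__":
    chunks = [{"start": 16000, "end": 48000},   # speech from 1.0s to 3.0s
              {"start": 80000, "end": 112000}]  # speech from 5.0s to 7.0s
    # 2.5s into the trimmed audio falls 0.5s into the second chunk -> 5.5s originally.
    print(to_original_time(2.5, chunks))  # 5.5
```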
|
178 |
|
179 |
def transcribe_file(self,
|
|
|
182 |
file_format: str = "SRT",
|
183 |
add_timestamp: bool = True,
|
184 |
progress=gr.Progress(),
|
185 |
+
*pipeline_params,
|
186 |
+
) -> Tuple[str, List]:
|
187 |
"""
|
188 |
Write subtitle file from Files
|
189 |
|
|
|
200 |
Boolean value from gr.Checkbox() that determines whether to add a timestamp at the end of the subtitle filename.
|
201 |
progress: gr.Progress
|
202 |
Indicator to show progress directly in gradio.
|
203 |
+
*pipeline_params: tuple
|
204 |
+
Parameters for the transcription pipeline. This will be dealt with "TranscriptionPipelineParams" data class
|
205 |
|
206 |
Returns
|
207 |
----------
|
|
|
211 |
Output file path to return to gr.Files()
|
212 |
"""
|
213 |
try:
|
214 |
+
params = TranscriptionPipelineParams.from_list(list(pipeline_params))
|
215 |
+
writer_options = {
|
216 |
+
"highlight_words": True if params.whisper.word_timestamps else False
|
217 |
+
}
|
218 |
+
|
219 |
if input_folder_path:
|
220 |
files = get_media_files(input_folder_path)
|
221 |
if isinstance(files, str):
|
|
|
228 |
transcribed_segments, time_for_task = self.run(
|
229 |
file,
|
230 |
progress,
|
231 |
+
file_format,
|
232 |
add_timestamp,
|
233 |
+
*pipeline_params,
|
234 |
)
|
235 |
|
236 |
file_name, file_ext = os.path.splitext(os.path.basename(file))
|
237 |
+
subtitle, file_path = generate_file(
|
238 |
+
output_dir=self.output_dir,
|
239 |
+
output_file_name=file_name,
|
240 |
+
output_format=file_format,
|
241 |
+
result=transcribed_segments,
|
242 |
add_timestamp=add_timestamp,
|
243 |
+
**writer_options
|
|
|
244 |
)
|
245 |
+
files_info[file_name] = {"subtitle": read_file(file_path), "time_for_task": time_for_task, "path": file_path}
|
246 |
|
247 |
total_result = ''
|
248 |
total_time = 0
|
|
|
255 |
result_str = f"Done in {self.format_time(total_time)}! Subtitle is in the outputs folder.\n\n{total_result}"
|
256 |
result_file_path = [info['path'] for info in files_info.values()]
|
257 |
|
258 |
+
return result_str, result_file_path
|
259 |
|
260 |
except Exception as e:
|
261 |
print(f"Error transcribing file: {e}")
|
262 |
+
raise
|
263 |
finally:
|
264 |
self.release_cuda_memory()
|
265 |
|
|
|
268 |
file_format: str = "SRT",
|
269 |
add_timestamp: bool = True,
|
270 |
progress=gr.Progress(),
|
271 |
+
*pipeline_params,
|
272 |
+
) -> Tuple[str, str]:
|
273 |
"""
|
274 |
Write subtitle file from microphone
|
275 |
|
|
|
283 |
Boolean value from gr.Checkbox() that determines whether to add a timestamp at the end of the filename.
|
284 |
progress: gr.Progress
|
285 |
Indicator to show progress directly in gradio.
|
286 |
+
*pipeline_params: tuple
|
287 |
Parameters related with whisper. This will be dealt with "WhisperParameters" data class
|
288 |
|
289 |
Returns
|
|
|
294 |
Output file path to return to gr.Files()
|
295 |
"""
|
296 |
try:
|
297 |
+
params = TranscriptionPipelineParams.from_list(list(pipeline_params))
|
298 |
+
writer_options = {
|
299 |
+
"highlight_words": True if params.whisper.word_timestamps else False
|
300 |
+
}
|
301 |
+
|
302 |
progress(0, desc="Loading Audio..")
|
303 |
transcribed_segments, time_for_task = self.run(
|
304 |
mic_audio,
|
305 |
progress,
|
306 |
+
file_format,
|
307 |
add_timestamp,
|
308 |
+
*pipeline_params,
|
309 |
)
|
310 |
progress(1, desc="Completed!")
|
311 |
|
312 |
+
file_name = "Mic"
|
313 |
+
subtitle, file_path = generate_file(
|
314 |
+
output_dir=self.output_dir,
|
315 |
+
output_file_name=file_name,
|
316 |
+
output_format=file_format,
|
317 |
+
result=transcribed_segments,
|
318 |
add_timestamp=add_timestamp,
|
319 |
+
**writer_options
|
|
|
320 |
)
|
321 |
|
322 |
result_str = f"Done in {self.format_time(time_for_task)}! Subtitle file is in the outputs folder.\n\n{subtitle}"
|
323 |
+
return result_str, file_path
|
324 |
except Exception as e:
|
325 |
+
print(f"Error transcribing mic: {e}")
|
326 |
+
raise
|
327 |
finally:
|
328 |
self.release_cuda_memory()
|
329 |
|
|
|
332 |
file_format: str = "SRT",
|
333 |
add_timestamp: bool = True,
|
334 |
progress=gr.Progress(),
|
335 |
+
*pipeline_params,
|
336 |
+
) -> Tuple[str, str]:
|
337 |
"""
|
338 |
Write subtitle file from Youtube
|
339 |
|
|
|
347 |
Boolean value from gr.Checkbox() that determines whether to add a timestamp at the end of the filename.
|
348 |
progress: gr.Progress
|
349 |
Indicator to show progress directly in gradio.
|
350 |
+
*pipeline_params: tuple
|
351 |
Parameters related with whisper. This will be dealt with "WhisperParameters" data class
|
352 |
|
353 |
Returns
|
|
|
358 |
Output file path to return to gr.Files()
|
359 |
"""
|
360 |
try:
|
361 |
+
params = TranscriptionPipelineParams.from_list(list(pipeline_params))
|
362 |
+
writer_options = {
|
363 |
+
"highlight_words": True if params.whisper.word_timestamps else False
|
364 |
+
}
|
365 |
+
|
366 |
progress(0, desc="Loading Audio from Youtube..")
|
367 |
yt = get_ytdata(youtube_link)
|
368 |
audio = get_ytaudio(yt)
|
|
|
370 |
transcribed_segments, time_for_task = self.run(
|
371 |
audio,
|
372 |
progress,
|
373 |
+
file_format,
|
374 |
add_timestamp,
|
375 |
+
*pipeline_params,
|
376 |
)
|
377 |
|
378 |
progress(1, desc="Completed!")
|
379 |
|
380 |
file_name = safe_filename(yt.title)
|
381 |
+
subtitle, file_path = generate_file(
|
382 |
+
output_dir=self.output_dir,
|
383 |
+
output_file_name=file_name,
|
384 |
+
output_format=file_format,
|
385 |
+
result=transcribed_segments,
|
386 |
add_timestamp=add_timestamp,
|
387 |
+
**writer_options
|
|
|
388 |
)
|
389 |
+
|
390 |
result_str = f"Done in {self.format_time(time_for_task)}! Subtitle file is in the outputs folder.\n\n{subtitle}"
|
391 |
|
392 |
if os.path.exists(audio):
|
393 |
os.remove(audio)
|
394 |
|
395 |
+
return result_str, file_path
|
396 |
|
397 |
except Exception as e:
|
398 |
+
print(f"Error transcribing youtube: {e}")
|
399 |
+
raise
|
400 |
finally:
|
401 |
self.release_cuda_memory()
|
402 |
|
403 |
+
def get_compute_type(self):
|
404 |
+
if "float16" in self.available_compute_types:
|
405 |
+
return "float16"
|
406 |
+
if "float32" in self.available_compute_types:
|
407 |
+
return "float32"
|
|
|
|
|
408 |
else:
|
409 |
+
return self.available_compute_types[0]
|
|
|
|
410 |
|
411 |
+
def get_available_compute_type(self):
|
412 |
+
if self.device == "cuda":
|
413 |
+
return list(ctranslate2.get_supported_compute_types("cuda"))
|
414 |
+
else:
|
415 |
+
return list(ctranslate2.get_supported_compute_types("cpu"))
|
|
|
416 |
|
417 |
@staticmethod
|
418 |
def format_time(elapsed_time: float) -> str:
|
|
|
446 |
if torch.cuda.is_available():
|
447 |
return "cuda"
|
448 |
elif torch.backends.mps.is_available():
|
449 |
+
if not BaseTranscriptionPipeline.is_sparse_api_supported():
|
450 |
# Device `SparseMPS` is not supported for now. See : https://github.com/pytorch/pytorch/issues/87886
|
451 |
return "cpu"
|
452 |
return "mps"
|
|
|
487 |
if file_path and os.path.exists(file_path):
|
488 |
os.remove(file_path)
|
489 |
|
490 |
+
@staticmethod
|
491 |
+
def validate_gradio_values(params: TranscriptionPipelineParams):
|
492 |
+
"""
|
493 |
+
Validate gradio specific values that can't be displayed as None in the UI.
|
494 |
+
Related issue : https://github.com/gradio-app/gradio/issues/8723
|
495 |
+
"""
|
496 |
+
if params.whisper.lang is None:
|
497 |
+
pass
|
498 |
+
elif params.whisper.lang == AUTOMATIC_DETECTION:
|
499 |
+
params.whisper.lang = None
|
500 |
+
else:
|
501 |
+
language_code_dict = {value: key for key, value in whisper.tokenizer.LANGUAGES.items()}
|
502 |
+
params.whisper.lang = language_code_dict[params.whisper.lang]
|
503 |
+
|
504 |
+
if params.whisper.initial_prompt == GRADIO_NONE_STR:
|
505 |
+
params.whisper.initial_prompt = None
|
506 |
+
if params.whisper.prefix == GRADIO_NONE_STR:
|
507 |
+
params.whisper.prefix = None
|
508 |
+
if params.whisper.hotwords == GRADIO_NONE_STR:
|
509 |
+
params.whisper.hotwords = None
|
510 |
+
if params.whisper.max_new_tokens == GRADIO_NONE_NUMBER_MIN:
|
511 |
+
params.whisper.max_new_tokens = None
|
512 |
+
if params.whisper.hallucination_silence_threshold == GRADIO_NONE_NUMBER_MIN:
|
513 |
+
params.whisper.hallucination_silence_threshold = None
|
514 |
+
if params.whisper.language_detection_threshold == GRADIO_NONE_NUMBER_MIN:
|
515 |
+
params.whisper.language_detection_threshold = None
|
516 |
+
if params.vad.max_speech_duration_s == GRADIO_NONE_NUMBER_MAX:
|
517 |
+
params.vad.max_speech_duration_s = float('inf')
|
518 |
+
return params
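
`validate_gradio_values` exists because gradio components cannot display `None`, so sentinel values stand in on the UI side and are mapped back before the parameters reach the backend. A tiny self-contained sketch of the pattern (the sentinel values here are assumptions for illustration, not the project's actual constants):

```python
GRADIO_NONE_STR = ""        # assumed sentinel for "no text entered"
GRADIO_NONE_NUMBER_MIN = 0  # assumed sentinel for "number not set"


def normalize_optional(value, sentinel):
    """Return None when the UI handed back its 'empty' sentinel, else the value itself."""
    return None if value == sentinel else value


print(normalize_optional("", GRADIO_NONE_STR))            # None
print(normalize_optional("my prompt", GRADIO_NONE_STR))   # 'my prompt'
print(normalize_optional(0, GRADIO_NONE_NUMBER_MIN))      # None
```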
|
519 |
+
|
520 |
@staticmethod
|
521 |
def cache_parameters(
|
522 |
+
params: TranscriptionPipelineParams,
|
523 |
+
file_format: str = "SRT",
|
524 |
+
add_timestamp: bool = True
|
525 |
):
|
526 |
+
"""Cache parameters to the yaml file"""
|
527 |
cached_params = load_yaml(DEFAULT_PARAMETERS_CONFIG_PATH)
|
528 |
+
param_to_cache = params.to_dict()
|
529 |
+
|
530 |
+
cached_yaml = {**cached_params, **param_to_cache}
|
531 |
cached_yaml["whisper"]["add_timestamp"] = add_timestamp
|
532 |
+
cached_yaml["whisper"]["file_format"] = file_format
|
533 |
+
|
534 |
+
supress_token = cached_yaml["whisper"].get("suppress_tokens", None)
|
535 |
+
if supress_token and isinstance(supress_token, list):
|
536 |
+
cached_yaml["whisper"]["suppress_tokens"] = str(supress_token)
|
537 |
+
|
538 |
+
if cached_yaml["whisper"].get("lang", None) is None:
|
539 |
+
cached_yaml["whisper"]["lang"] = AUTOMATIC_DETECTION.unwrap()
|
540 |
+
else:
|
541 |
+
language_dict = whisper.tokenizer.LANGUAGES
|
542 |
+
cached_yaml["whisper"]["lang"] = language_dict[cached_yaml["whisper"]["lang"]]
|
543 |
+
|
544 |
+
if cached_yaml["vad"].get("max_speech_duration_s", float('inf')) == float('inf'):
|
545 |
+
cached_yaml["vad"]["max_speech_duration_s"] = GRADIO_NONE_NUMBER_MAX
|
546 |
|
547 |
+
if cached_yaml is not None and cached_yaml:
|
548 |
+
save_yaml(cached_yaml, DEFAULT_PARAMETERS_CONFIG_PATH)
|
549 |
|
550 |
@staticmethod
|
551 |
def resample_audio(audio: Union[str, np.ndarray],
|
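
An illustration (not the app's actual UI wiring) of how parameters travel through this pipeline: a `TranscriptionPipelineParams` instance is flattened with `to_list()` on the gradio side, rebuilt with `from_list()` inside `run()`, and only the whisper block is re-flattened for the concrete `transcribe()` implementation. Assumes the package is importable from the repository root:

```python
from modules.whisper.data_classes import TranscriptionPipelineParams, WhisperParams

params = TranscriptionPipelineParams(
    whisper=WhisperParams(model_size="large-v2", lang=None, is_translate=False),
)

flat = params.to_list()                                # what gradio actually passes around
rebuilt = TranscriptionPipelineParams.from_list(flat)  # what run() reconstructs
assert rebuilt.whisper.model_size == "large-v2"

# run() then forwards only the whisper sub-parameters to transcribe():
whisper_flat = rebuilt.whisper.to_list()
assert WhisperParams.from_list(whisper_flat).model_size == "large-v2"
```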
modules/whisper/data_classes.py
ADDED
@@ -0,0 +1,608 @@
|
|
|
1 |
+
import faster_whisper.transcribe
|
2 |
+
import gradio as gr
|
3 |
+
import torch
|
4 |
+
from typing import Optional, Dict, List, Union, NamedTuple
|
5 |
+
from pydantic import BaseModel, Field, field_validator, ConfigDict
|
6 |
+
from gradio_i18n import Translate, gettext as _
|
7 |
+
from enum import Enum
|
8 |
+
from copy import deepcopy
|
9 |
+
|
10 |
+
import yaml
|
11 |
+
|
12 |
+
from modules.utils.constants import *
|
13 |
+
|
14 |
+
|
15 |
+
class WhisperImpl(Enum):
|
16 |
+
WHISPER = "whisper"
|
17 |
+
FASTER_WHISPER = "faster-whisper"
|
18 |
+
INSANELY_FAST_WHISPER = "insanely_fast_whisper"
|
19 |
+
|
20 |
+
|
21 |
+
class Segment(BaseModel):
|
22 |
+
id: Optional[int] = Field(default=None, description="Incremental id for the segment")
|
23 |
+
seek: Optional[int] = Field(default=None, description="Seek of the segment from chunked audio")
|
24 |
+
text: Optional[str] = Field(default=None, description="Transcription text of the segment")
|
25 |
+
start: Optional[float] = Field(default=None, description="Start time of the segment")
|
26 |
+
end: Optional[float] = Field(default=None, description="End time of the segment")
|
27 |
+
tokens: Optional[List[int]] = Field(default=None, description="List of token IDs")
|
28 |
+
temperature: Optional[float] = Field(default=None, description="Temperature used during the decoding process")
|
29 |
+
avg_logprob: Optional[float] = Field(default=None, description="Average log probability of the tokens")
|
30 |
+
compression_ratio: Optional[float] = Field(default=None, description="Compression ratio of the segment")
|
31 |
+
no_speech_prob: Optional[float] = Field(default=None, description="Probability that it's not speech")
|
32 |
+
words: Optional[List['Word']] = Field(default=None, description="List of words contained in the segment")
|
33 |
+
|
34 |
+
@classmethod
|
35 |
+
def from_faster_whisper(cls,
|
36 |
+
seg: faster_whisper.transcribe.Segment):
|
37 |
+
if seg.words is not None:
|
38 |
+
words = [
|
39 |
+
Word(
|
40 |
+
start=w.start,
|
41 |
+
end=w.end,
|
42 |
+
word=w.word,
|
43 |
+
probability=w.probability
|
44 |
+
) for w in seg.words
|
45 |
+
]
|
46 |
+
else:
|
47 |
+
words = None
|
48 |
+
|
49 |
+
return cls(
|
50 |
+
id=seg.id,
|
51 |
+
seek=seg.seek,
|
52 |
+
text=seg.text,
|
53 |
+
start=seg.start,
|
54 |
+
end=seg.end,
|
55 |
+
tokens=seg.tokens,
|
56 |
+
temperature=seg.temperature,
|
57 |
+
avg_logprob=seg.avg_logprob,
|
58 |
+
compression_ratio=seg.compression_ratio,
|
59 |
+
no_speech_prob=seg.no_speech_prob,
|
60 |
+
words=words
|
61 |
+
)
|
62 |
+
|
63 |
+
|
64 |
+
class Word(BaseModel):
|
65 |
+
start: Optional[float] = Field(default=None, description="Start time of the word")
|
66 |
+
end: Optional[float] = Field(default=None, description="End time of the word")
|
67 |
+
word: Optional[str] = Field(default=None, description="Word text")
|
68 |
+
probability: Optional[float] = Field(default=None, description="Probability of the word")
|
69 |
+
|
70 |
+
|
71 |
+
class BaseParams(BaseModel):
|
72 |
+
model_config = ConfigDict(protected_namespaces=())
|
73 |
+
|
74 |
+
def to_dict(self) -> Dict:
|
75 |
+
return self.model_dump()
|
76 |
+
|
77 |
+
def to_list(self) -> List:
|
78 |
+
return list(self.model_dump().values())
|
79 |
+
|
80 |
+
@classmethod
|
81 |
+
def from_list(cls, data_list: List) -> 'BaseParams':
|
82 |
+
field_names = list(cls.model_fields.keys())
|
83 |
+
return cls(**dict(zip(field_names, data_list)))
|
84 |
+
|
85 |
+
|
86 |
+
class VadParams(BaseParams):
|
87 |
+
"""Voice Activity Detection parameters"""
|
88 |
+
vad_filter: bool = Field(default=False, description="Enable voice activity detection to filter out non-speech parts")
|
89 |
+
threshold: float = Field(
|
90 |
+
default=0.5,
|
91 |
+
ge=0.0,
|
92 |
+
le=1.0,
|
93 |
+
description="Speech threshold for Silero VAD. Probabilities above this value are considered speech"
|
94 |
+
)
|
95 |
+
min_speech_duration_ms: int = Field(
|
96 |
+
default=250,
|
97 |
+
ge=0,
|
98 |
+
description="Final speech chunks shorter than this are discarded"
|
99 |
+
)
|
100 |
+
max_speech_duration_s: float = Field(
|
101 |
+
default=float("inf"),
|
102 |
+
gt=0,
|
103 |
+
description="Maximum duration of speech chunks in seconds"
|
104 |
+
)
|
105 |
+
min_silence_duration_ms: int = Field(
|
106 |
+
default=2000,
|
107 |
+
ge=0,
|
108 |
+
description="Minimum silence duration between speech chunks"
|
109 |
+
)
|
110 |
+
speech_pad_ms: int = Field(
|
111 |
+
default=400,
|
112 |
+
ge=0,
|
113 |
+
description="Padding added to each side of speech chunks"
|
114 |
+
)
|
115 |
+
|
116 |
+
@classmethod
|
117 |
+
def to_gradio_inputs(cls, defaults: Optional[Dict] = None) -> List[gr.components.base.FormComponent]:
|
118 |
+
return [
|
119 |
+
gr.Checkbox(
|
120 |
+
label=_("Enable Silero VAD Filter"),
|
121 |
+
value=defaults.get("vad_filter", cls.__fields__["vad_filter"].default),
|
122 |
+
interactive=True,
|
123 |
+
info=_("Enable this to transcribe only detected voice")
|
124 |
+
),
|
125 |
+
gr.Slider(
|
126 |
+
minimum=0.0, maximum=1.0, step=0.01, label="Speech Threshold",
|
127 |
+
value=defaults.get("threshold", cls.__fields__["threshold"].default),
|
128 |
+
info="Lower it to be more sensitive to small sounds."
|
129 |
+
),
|
130 |
+
gr.Number(
|
131 |
+
label="Minimum Speech Duration (ms)", precision=0,
|
132 |
+
value=defaults.get("min_speech_duration_ms", cls.__fields__["min_speech_duration_ms"].default),
|
133 |
+
info="Final speech chunks shorter than this time are thrown out"
|
134 |
+
),
|
135 |
+
gr.Number(
|
136 |
+
label="Maximum Speech Duration (s)",
|
137 |
+
value=defaults.get("max_speech_duration_s", GRADIO_NONE_NUMBER_MAX),
|
138 |
+
info="Maximum duration of speech chunks in \"seconds\"."
|
139 |
+
),
|
140 |
+
gr.Number(
|
141 |
+
label="Minimum Silence Duration (ms)", precision=0,
|
142 |
+
value=defaults.get("min_silence_duration_ms", cls.__fields__["min_silence_duration_ms"].default),
|
143 |
+
info="In the end of each speech chunk wait for this time before separating it"
|
144 |
+
),
|
145 |
+
gr.Number(
|
146 |
+
label="Speech Padding (ms)", precision=0,
|
147 |
+
value=defaults.get("speech_pad_ms", cls.__fields__["speech_pad_ms"].default),
|
148 |
+
info="Final speech chunks are padded by this time each side"
|
149 |
+
)
|
150 |
+
]
|
151 |
+
|
152 |
+
|
153 |
+
class DiarizationParams(BaseParams):
|
154 |
+
"""Speaker diarization parameters"""
|
155 |
+
is_diarize: bool = Field(default=False, description="Enable speaker diarization")
|
156 |
+
device: str = Field(default="cuda", description="Device to run Diarization model.")
|
157 |
+
hf_token: str = Field(
|
158 |
+
default="",
|
159 |
+
description="Hugging Face token for downloading diarization models"
|
160 |
+
)
|
161 |
+
|
162 |
+
@classmethod
|
163 |
+
def to_gradio_inputs(cls,
|
164 |
+
defaults: Optional[Dict] = None,
|
165 |
+
available_devices: Optional[List] = None,
|
166 |
+
device: Optional[str] = None) -> List[gr.components.base.FormComponent]:
|
167 |
+
return [
|
168 |
+
gr.Checkbox(
|
169 |
+
label=_("Enable Diarization"),
|
170 |
+
value=defaults.get("is_diarize", cls.__fields__["is_diarize"].default),
|
171 |
+
),
|
172 |
+
gr.Dropdown(
|
173 |
+
label=_("Device"),
|
174 |
+
choices=["cpu", "cuda"] if available_devices is None else available_devices,
|
175 |
+
value=defaults.get("device", device),
|
176 |
+
),
|
177 |
+
gr.Textbox(
|
178 |
+
label=_("HuggingFace Token"),
|
179 |
+
value=defaults.get("hf_token", cls.__fields__["hf_token"].default),
|
180 |
+
info=_("This is only needed the first time you download the model")
|
181 |
+
),
|
182 |
+
]
|
183 |
+
|
184 |
+
|
185 |
+
class BGMSeparationParams(BaseParams):
|
186 |
+
"""Background music separation parameters"""
|
187 |
+
is_separate_bgm: bool = Field(default=False, description="Enable background music separation")
|
188 |
+
model_size: str = Field(
|
189 |
+
default="UVR-MDX-NET-Inst_HQ_4",
|
190 |
+
description="UVR model size"
|
191 |
+
)
|
192 |
+
device: str = Field(default="cuda", description="Device to run UVR model.")
|
193 |
+
segment_size: int = Field(
|
194 |
+
default=256,
|
195 |
+
gt=0,
|
196 |
+
description="Segment size for UVR model"
|
197 |
+
)
|
198 |
+
save_file: bool = Field(
|
199 |
+
default=False,
|
200 |
+
description="Whether to save separated audio files"
|
201 |
+
)
|
202 |
+
enable_offload: bool = Field(
|
203 |
+
default=True,
|
204 |
+
description="Offload UVR model after transcription"
|
205 |
+
)
|
206 |
+
|
207 |
+
@classmethod
|
208 |
+
def to_gradio_input(cls,
|
209 |
+
defaults: Optional[Dict] = None,
|
210 |
+
available_devices: Optional[List] = None,
|
211 |
+
device: Optional[str] = None,
|
212 |
+
available_models: Optional[List] = None) -> List[gr.components.base.FormComponent]:
|
213 |
+
return [
|
214 |
+
gr.Checkbox(
|
215 |
+
label=_("Enable Background Music Remover Filter"),
|
216 |
+
value=defaults.get("is_separate_bgm", cls.__fields__["is_separate_bgm"].default),
|
217 |
+
interactive=True,
|
218 |
+
info=_("Enabling this will remove background music")
|
219 |
+
),
|
220 |
+
gr.Dropdown(
|
221 |
+
label=_("Model"),
|
222 |
+
choices=["UVR-MDX-NET-Inst_HQ_4",
|
223 |
+
"UVR-MDX-NET-Inst_3"] if available_models is None else available_models,
|
224 |
+
value=defaults.get("model_size", cls.__fields__["model_size"].default),
|
225 |
+
),
|
226 |
+
gr.Dropdown(
|
227 |
+
label=_("Device"),
|
228 |
+
choices=["cpu", "cuda"] if available_devices is None else available_devices,
|
229 |
+
value=defaults.get("device", device),
|
230 |
+
),
|
231 |
+
gr.Number(
|
232 |
+
label="Segment Size",
|
233 |
+
value=defaults.get("segment_size", cls.__fields__["segment_size"].default),
|
234 |
+
precision=0,
|
235 |
+
info="Segment size for UVR model"
|
236 |
+
),
|
237 |
+
gr.Checkbox(
|
238 |
+
label=_("Save separated files to output"),
|
239 |
+
value=defaults.get("save_file", cls.__fields__["save_file"].default),
|
240 |
+
),
|
241 |
+
gr.Checkbox(
|
242 |
+
label=_("Offload sub model after removing background music"),
|
243 |
+
value=defaults.get("enable_offload", cls.__fields__["enable_offload"].default),
|
244 |
+
)
|
245 |
+
]
|
246 |
+
|
247 |
+
|
248 |
+
class WhisperParams(BaseParams):
|
249 |
+
"""Whisper parameters"""
|
250 |
+
model_size: str = Field(default="large-v2", description="Whisper model size")
|
251 |
+
lang: Optional[str] = Field(default=None, description="Source language of the file to transcribe")
|
252 |
+
is_translate: bool = Field(default=False, description="Translate speech to English end-to-end")
|
253 |
+
beam_size: int = Field(default=5, ge=1, description="Beam size for decoding")
|
254 |
+
log_prob_threshold: float = Field(
|
255 |
+
default=-1.0,
|
256 |
+
description="Threshold for average log probability of sampled tokens"
|
257 |
+
)
|
258 |
+
no_speech_threshold: float = Field(
|
259 |
+
default=0.6,
|
260 |
+
ge=0.0,
|
261 |
+
le=1.0,
|
262 |
+
description="Threshold for detecting silence"
|
263 |
+
)
|
264 |
+
compute_type: str = Field(default="float16", description="Computation type for transcription")
|
265 |
+
best_of: int = Field(default=5, ge=1, description="Number of candidates when sampling")
|
266 |
+
patience: float = Field(default=1.0, gt=0, description="Beam search patience factor")
|
267 |
+
condition_on_previous_text: bool = Field(
|
268 |
+
default=True,
|
269 |
+
description="Use previous output as prompt for next window"
|
270 |
+
)
|
271 |
+
prompt_reset_on_temperature: float = Field(
|
272 |
+
default=0.5,
|
273 |
+
ge=0.0,
|
274 |
+
le=1.0,
|
275 |
+
description="Temperature threshold for resetting prompt"
|
276 |
+
)
|
277 |
+
initial_prompt: Optional[str] = Field(default=None, description="Initial prompt for first window")
|
278 |
+
temperature: float = Field(
|
279 |
+
default=0.0,
|
280 |
+
ge=0.0,
|
281 |
+
description="Temperature for sampling"
|
282 |
+
)
|
283 |
+
compression_ratio_threshold: float = Field(
|
284 |
+
default=2.4,
|
285 |
+
gt=0,
|
286 |
+
description="Threshold for gzip compression ratio"
|
287 |
+
)
|
288 |
+
length_penalty: float = Field(default=1.0, gt=0, description="Exponential length penalty")
|
289 |
+
repetition_penalty: float = Field(default=1.0, gt=0, description="Penalty for repeated tokens")
|
290 |
+
no_repeat_ngram_size: int = Field(default=0, ge=0, description="Size of n-grams to prevent repetition")
|
291 |
+
prefix: Optional[str] = Field(default=None, description="Prefix text for first window")
|
292 |
+
suppress_blank: bool = Field(
|
293 |
+
default=True,
|
294 |
+
description="Suppress blank outputs at start of sampling"
|
295 |
+
)
|
296 |
+
suppress_tokens: Optional[Union[List[int], str]] = Field(default=[-1], description="Token IDs to suppress")
|
297 |
+
max_initial_timestamp: float = Field(
|
298 |
+
default=1.0,
|
299 |
+
ge=0.0,
|
300 |
+
description="Maximum initial timestamp"
|
301 |
+
)
|
302 |
+
word_timestamps: bool = Field(default=False, description="Extract word-level timestamps")
|
303 |
+
prepend_punctuations: Optional[str] = Field(
|
304 |
+
default="\"'“¿([{-",
|
305 |
+
description="Punctuations to merge with next word"
|
306 |
+
)
|
307 |
+
append_punctuations: Optional[str] = Field(
|
308 |
+
default="\"'.。,,!!??::”)]}、",
|
309 |
+
description="Punctuations to merge with previous word"
|
310 |
+
)
|
311 |
+
max_new_tokens: Optional[int] = Field(default=None, description="Maximum number of new tokens per chunk")
|
312 |
+
chunk_length: Optional[int] = Field(default=30, description="Length of audio segments in seconds")
|
313 |
+
hallucination_silence_threshold: Optional[float] = Field(
|
314 |
+
default=None,
|
315 |
+
description="Threshold for skipping silent periods in hallucination detection"
|
316 |
+
)
|
317 |
+
hotwords: Optional[str] = Field(default=None, description="Hotwords/hint phrases for the model")
|
318 |
+
language_detection_threshold: Optional[float] = Field(
|
319 |
+
default=None,
|
320 |
+
description="Threshold for language detection probability"
|
321 |
+
)
|
322 |
+
language_detection_segments: int = Field(
|
323 |
+
default=1,
|
324 |
+
gt=0,
|
325 |
+
description="Number of segments for language detection"
|
326 |
+
)
|
327 |
+
batch_size: int = Field(default=24, gt=0, description="Batch size for processing")
|
328 |
+
|
329 |
+
@field_validator('lang')
|
330 |
+
def validate_lang(cls, v):
|
331 |
+
from modules.utils.constants import AUTOMATIC_DETECTION
|
332 |
+
return None if v == AUTOMATIC_DETECTION.unwrap() else v
|
333 |
+
|
334 |
+
@field_validator('suppress_tokens')
|
335 |
+
def validate_supress_tokens(cls, v):
|
336 |
+
import ast
|
337 |
+
try:
|
338 |
+
if isinstance(v, str):
|
339 |
+
suppress_tokens = ast.literal_eval(v)
|
340 |
+
if not isinstance(suppress_tokens, list):
|
341 |
+
raise ValueError("Invalid Suppress Tokens. The value must be type of List[int]")
|
342 |
+
return suppress_tokens
|
343 |
+
if isinstance(v, list):
|
344 |
+
return v
|
345 |
+
except Exception as e:
|
346 |
+
raise ValueError(f"Invalid Suppress Tokens. The value must be type of List[int]: {e}")
|
347 |
+
|
348 |
+
@classmethod
|
349 |
+
def to_gradio_inputs(cls,
|
350 |
+
defaults: Optional[Dict] = None,
|
351 |
+
only_advanced: Optional[bool] = True,
|
352 |
+
whisper_type: Optional[str] = None,
|
353 |
+
available_models: Optional[List] = None,
|
354 |
+
available_langs: Optional[List] = None,
|
355 |
+
available_compute_types: Optional[List] = None,
|
356 |
+
compute_type: Optional[str] = None):
|
357 |
+
whisper_type = WhisperImpl.FASTER_WHISPER.value if whisper_type is None else whisper_type.strip().lower()
|
358 |
+
|
359 |
+
inputs = []
|
360 |
+
if not only_advanced:
|
361 |
+
inputs += [
|
362 |
+
gr.Dropdown(
|
363 |
+
label=_("Model"),
|
364 |
+
choices=available_models,
|
365 |
+
value=defaults.get("model_size", cls.__fields__["model_size"].default),
|
366 |
+
),
|
367 |
+
gr.Dropdown(
|
368 |
+
label=_("Language"),
|
369 |
+
choices=available_langs,
|
370 |
+
value=defaults.get("lang", AUTOMATIC_DETECTION),
|
371 |
+
),
|
372 |
+
gr.Checkbox(
|
373 |
+
label=_("Translate to English?"),
|
374 |
+
value=defaults.get("is_translate", cls.__fields__["is_translate"].default),
|
375 |
+
),
|
376 |
+
]
|
377 |
+
|
378 |
+
inputs += [
|
379 |
+
gr.Number(
|
380 |
+
label="Beam Size",
|
381 |
+
value=defaults.get("beam_size", cls.__fields__["beam_size"].default),
|
382 |
+
precision=0,
|
383 |
+
info="Beam size for decoding"
|
384 |
+
),
|
385 |
+
gr.Number(
|
386 |
+
label="Log Probability Threshold",
|
387 |
+
value=defaults.get("log_prob_threshold", cls.__fields__["log_prob_threshold"].default),
|
388 |
+
info="Threshold for average log probability of sampled tokens"
|
389 |
+
),
|
390 |
+
gr.Number(
|
391 |
+
label="No Speech Threshold",
|
392 |
+
value=defaults.get("no_speech_threshold", cls.__fields__["no_speech_threshold"].default),
|
393 |
+
info="Threshold for detecting silence"
|
394 |
+
),
|
395 |
+
gr.Dropdown(
|
396 |
+
label="Compute Type",
|
397 |
+
choices=["float16", "int8", "int16"] if available_compute_types is None else available_compute_types,
|
398 |
+
value=defaults.get("compute_type", compute_type),
|
399 |
+
info="Computation type for transcription"
|
400 |
+
),
|
401 |
+
gr.Number(
|
402 |
+
label="Best Of",
|
403 |
+
value=defaults.get("best_of", cls.__fields__["best_of"].default),
|
404 |
+
precision=0,
|
405 |
+
info="Number of candidates when sampling"
|
406 |
+
),
|
407 |
+
gr.Number(
|
408 |
+
label="Patience",
|
409 |
+
value=defaults.get("patience", cls.__fields__["patience"].default),
|
410 |
+
info="Beam search patience factor"
|
411 |
+
),
|
412 |
+
gr.Checkbox(
|
413 |
+
label="Condition On Previous Text",
|
414 |
+
value=defaults.get("condition_on_previous_text", cls.__fields__["condition_on_previous_text"].default),
|
415 |
+
info="Use previous output as prompt for next window"
|
416 |
+
),
|
417 |
+
gr.Slider(
|
418 |
+
label="Prompt Reset On Temperature",
|
419 |
+
value=defaults.get("prompt_reset_on_temperature",
|
420 |
+
cls.__fields__["prompt_reset_on_temperature"].default),
|
421 |
+
minimum=0,
|
422 |
+
maximum=1,
|
423 |
+
step=0.01,
|
424 |
+
info="Temperature threshold for resetting prompt"
|
425 |
+
),
|
426 |
+
gr.Textbox(
|
427 |
+
label="Initial Prompt",
|
428 |
+
value=defaults.get("initial_prompt", GRADIO_NONE_STR),
|
429 |
+
info="Initial prompt for first window"
|
430 |
+
),
|
431 |
+
gr.Slider(
|
432 |
+
label="Temperature",
|
433 |
+
value=defaults.get("temperature", cls.__fields__["temperature"].default),
|
434 |
+
minimum=0.0,
|
435 |
+
step=0.01,
|
436 |
+
maximum=1.0,
|
437 |
+
info="Temperature for sampling"
|
438 |
+
),
|
439 |
+
gr.Number(
|
440 |
+
label="Compression Ratio Threshold",
|
441 |
+
value=defaults.get("compression_ratio_threshold",
|
442 |
+
cls.__fields__["compression_ratio_threshold"].default),
|
443 |
+
info="Threshold for gzip compression ratio"
|
444 |
+
)
|
445 |
+
]
|
446 |
+
|
447 |
+
faster_whisper_inputs = [
|
448 |
+
gr.Number(
|
449 |
+
label="Length Penalty",
|
450 |
+
value=defaults.get("length_penalty", cls.__fields__["length_penalty"].default),
|
451 |
+
info="Exponential length penalty",
|
452 |
+
),
|
453 |
+
gr.Number(
|
454 |
+
label="Repetition Penalty",
|
455 |
+
value=defaults.get("repetition_penalty", cls.__fields__["repetition_penalty"].default),
|
456 |
+
info="Penalty for repeated tokens"
|
457 |
+
),
|
458 |
+
gr.Number(
|
459 |
+
label="No Repeat N-gram Size",
|
460 |
+
value=defaults.get("no_repeat_ngram_size", cls.__fields__["no_repeat_ngram_size"].default),
|
461 |
+
precision=0,
|
462 |
+
info="Size of n-grams to prevent repetition"
|
463 |
+
),
|
464 |
+
gr.Textbox(
|
465 |
+
label="Prefix",
|
466 |
+
value=defaults.get("prefix", GRADIO_NONE_STR),
|
467 |
+
info="Prefix text for first window"
|
468 |
+
),
|
469 |
+
gr.Checkbox(
|
470 |
+
label="Suppress Blank",
|
471 |
+
value=defaults.get("suppress_blank", cls.__fields__["suppress_blank"].default),
|
472 |
+
info="Suppress blank outputs at start of sampling"
|
473 |
+
),
|
474 |
+
gr.Textbox(
|
475 |
+
label="Suppress Tokens",
|
476 |
+
value=defaults.get("suppress_tokens", "[-1]"),
|
477 |
+
info="Token IDs to suppress"
|
478 |
+
),
|
479 |
+
gr.Number(
|
480 |
+
label="Max Initial Timestamp",
|
481 |
+
value=defaults.get("max_initial_timestamp", cls.__fields__["max_initial_timestamp"].default),
|
482 |
+
info="Maximum initial timestamp"
|
483 |
+
),
|
484 |
+
gr.Checkbox(
|
485 |
+
label="Word Timestamps",
|
486 |
+
value=defaults.get("word_timestamps", cls.__fields__["word_timestamps"].default),
|
487 |
+
info="Extract word-level timestamps"
|
488 |
+
),
|
489 |
+
gr.Textbox(
|
490 |
+
label="Prepend Punctuations",
|
491 |
+
value=defaults.get("prepend_punctuations", cls.__fields__["prepend_punctuations"].default),
|
492 |
+
info="Punctuations to merge with next word"
|
493 |
+
),
|
494 |
+
gr.Textbox(
|
495 |
+
label="Append Punctuations",
|
496 |
+
value=defaults.get("append_punctuations", cls.__fields__["append_punctuations"].default),
|
497 |
+
info="Punctuations to merge with previous word"
|
498 |
+
),
|
499 |
+
gr.Number(
|
500 |
+
label="Max New Tokens",
|
501 |
+
value=defaults.get("max_new_tokens", GRADIO_NONE_NUMBER_MIN),
|
502 |
+
precision=0,
|
503 |
+
info="Maximum number of new tokens per chunk"
|
504 |
+
),
|
505 |
+
gr.Number(
|
506 |
+
label="Chunk Length (s)",
|
507 |
+
value=defaults.get("chunk_length", cls.__fields__["chunk_length"].default),
|
508 |
+
precision=0,
|
509 |
+
info="Length of audio segments in seconds"
|
510 |
+
),
|
511 |
+
gr.Number(
|
512 |
+
label="Hallucination Silence Threshold (sec)",
|
513 |
+
value=defaults.get("hallucination_silence_threshold",
|
514 |
+
GRADIO_NONE_NUMBER_MIN),
|
515 |
+
info="Threshold for skipping silent periods in hallucination detection"
|
516 |
+
),
|
517 |
+
gr.Textbox(
|
518 |
+
label="Hotwords",
|
519 |
+
value=defaults.get("hotwords", cls.__fields__["hotwords"].default),
|
520 |
+
info="Hotwords/hint phrases for the model"
|
521 |
+
),
|
522 |
+
gr.Number(
|
523 |
+
label="Language Detection Threshold",
|
524 |
+
value=defaults.get("language_detection_threshold",
|
525 |
+
GRADIO_NONE_NUMBER_MIN),
|
526 |
+
info="Threshold for language detection probability"
|
527 |
+
),
|
528 |
+
gr.Number(
|
529 |
+
label="Language Detection Segments",
|
530 |
+
value=defaults.get("language_detection_segments",
|
531 |
+
cls.__fields__["language_detection_segments"].default),
|
532 |
+
precision=0,
|
533 |
+
info="Number of segments for language detection"
|
534 |
+
)
|
535 |
+
]
|
536 |
+
|
537 |
+
insanely_fast_whisper_inputs = [
|
538 |
+
gr.Number(
|
539 |
+
label="Batch Size",
|
540 |
+
value=defaults.get("batch_size", cls.__fields__["batch_size"].default),
|
541 |
+
precision=0,
|
542 |
+
info="Batch size for processing"
|
543 |
+
)
|
544 |
+
]
|
545 |
+
|
546 |
+
if whisper_type != WhisperImpl.FASTER_WHISPER.value:
|
547 |
+
for input_component in faster_whisper_inputs:
|
548 |
+
input_component.visible = False
|
549 |
+
|
550 |
+
if whisper_type != WhisperImpl.INSANELY_FAST_WHISPER.value:
|
551 |
+
for input_component in insanely_fast_whisper_inputs:
|
552 |
+
input_component.visible = False
|
553 |
+
|
554 |
+
inputs += faster_whisper_inputs + insanely_fast_whisper_inputs
|
555 |
+
|
556 |
+
return inputs
|
557 |
+
|
558 |
+
|
559 |
+
class TranscriptionPipelineParams(BaseModel):
|
560 |
+
"""Transcription pipeline parameters"""
|
561 |
+
whisper: WhisperParams = Field(default_factory=WhisperParams)
|
562 |
+
vad: VadParams = Field(default_factory=VadParams)
|
563 |
+
diarization: DiarizationParams = Field(default_factory=DiarizationParams)
|
564 |
+
bgm_separation: BGMSeparationParams = Field(default_factory=BGMSeparationParams)
|
565 |
+
|
566 |
+
def to_dict(self) -> Dict:
|
567 |
+
data = {
|
568 |
+
"whisper": self.whisper.to_dict(),
|
569 |
+
"vad": self.vad.to_dict(),
|
570 |
+
"diarization": self.diarization.to_dict(),
|
571 |
+
"bgm_separation": self.bgm_separation.to_dict()
|
572 |
+
}
|
573 |
+
return data
|
574 |
+
|
575 |
+
def to_list(self) -> List:
|
576 |
+
"""
|
577 |
+
Convert data class to the list because I have to pass the parameters as a list in the gradio.
|
578 |
+
Related Gradio issue: https://github.com/gradio-app/gradio/issues/2471
|
579 |
+
See more about Gradio pre-processing: https://www.gradio.app/docs/components
|
580 |
+
"""
|
581 |
+
whisper_list = self.whisper.to_list()
|
582 |
+
vad_list = self.vad.to_list()
|
583 |
+
diarization_list = self.diarization.to_list()
|
584 |
+
bgm_sep_list = self.bgm_separation.to_list()
|
585 |
+
return whisper_list + vad_list + diarization_list + bgm_sep_list
|
586 |
+
|
587 |
+
@staticmethod
|
588 |
+
def from_list(pipeline_list: List) -> 'TranscriptionPipelineParams':
|
589 |
+
"""Convert list to the data class again to use it in a function."""
|
590 |
+
data_list = deepcopy(pipeline_list)
|
591 |
+
|
592 |
+
whisper_list = data_list[0:len(WhisperParams.__annotations__)]
|
593 |
+
data_list = data_list[len(WhisperParams.__annotations__):]
|
594 |
+
|
595 |
+
vad_list = data_list[0:len(VadParams.__annotations__)]
|
596 |
+
data_list = data_list[len(VadParams.__annotations__):]
|
597 |
+
|
598 |
+
diarization_list = data_list[0:len(DiarizationParams.__annotations__)]
|
599 |
+
data_list = data_list[len(DiarizationParams.__annotations__):]
|
600 |
+
|
601 |
+
bgm_sep_list = data_list[0:len(BGMSeparationParams.__annotations__)]
|
602 |
+
|
603 |
+
return TranscriptionPipelineParams(
|
604 |
+
whisper=WhisperParams.from_list(whisper_list),
|
605 |
+
vad=VadParams.from_list(vad_list),
|
606 |
+
diarization=DiarizationParams.from_list(diarization_list),
|
607 |
+
bgm_separation=BGMSeparationParams.from_list(bgm_sep_list)
|
608 |
+
)
|
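
A quick usage check of the validators defined above: `suppress_tokens` may arrive from a `gr.Textbox` as a string and is parsed back into a list, while only the automatic-detection label is normalized to `None` for `lang`. Assumes the package is importable from the repository root:

```python
from modules.whisper.data_classes import WhisperParams

p = WhisperParams(suppress_tokens="[-1, 50257]")
print(p.suppress_tokens)   # [-1, 50257]

q = WhisperParams(lang="english")
print(q.lang)              # 'english' (only the automatic-detection label maps to None)
```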
modules/whisper/faster_whisper_inference.py
CHANGED
@@ -12,11 +12,11 @@ import gradio as gr
|
|
12 |
from argparse import Namespace
|
13 |
|
14 |
from modules.utils.paths import (FASTER_WHISPER_MODELS_DIR, DIARIZATION_MODELS_DIR, UVR_MODELS_DIR, OUTPUT_DIR)
|
15 |
-
from modules.whisper.
|
16 |
-
from modules.whisper.
|
17 |
|
18 |
|
19 |
-
class FasterWhisperInference(
|
20 |
def __init__(self,
|
21 |
model_dir: str = FASTER_WHISPER_MODELS_DIR,
|
22 |
diarization_model_dir: str = DIARIZATION_MODELS_DIR,
|
@@ -35,14 +35,12 @@ class FasterWhisperInference(WhisperBase):
|
|
35 |
self.model_paths = self.get_model_paths()
|
36 |
self.device = self.get_device()
|
37 |
self.available_models = self.model_paths.keys()
|
38 |
-
self.available_compute_types = ctranslate2.get_supported_compute_types(
|
39 |
-
"cuda") if self.device == "cuda" else ctranslate2.get_supported_compute_types("cpu")
|
40 |
|
41 |
def transcribe(self,
|
42 |
audio: Union[str, BinaryIO, np.ndarray],
|
43 |
progress: gr.Progress = gr.Progress(),
|
44 |
*whisper_params,
|
45 |
-
) -> Tuple[List[
|
46 |
"""
|
47 |
transcribe method for faster-whisper.
|
48 |
|
@@ -57,28 +55,18 @@ class FasterWhisperInference(WhisperBase):
|
|
57 |
|
58 |
Returns
|
59 |
----------
|
60 |
-
segments_result: List[
|
61 |
-
list of
|
62 |
elapsed_time: float
|
63 |
elapsed time for transcription
|
64 |
"""
|
65 |
start_time = time.time()
|
66 |
|
67 |
-
params =
|
68 |
|
69 |
if params.model_size != self.current_model_size or self.model is None or self.current_compute_type != params.compute_type:
|
70 |
self.update_model(params.model_size, params.compute_type, progress)
|
71 |
|
72 |
-
# None parameters with Textboxes: https://github.com/gradio-app/gradio/issues/8723
|
73 |
-
if not params.initial_prompt:
|
74 |
-
params.initial_prompt = None
|
75 |
-
if not params.prefix:
|
76 |
-
params.prefix = None
|
77 |
-
if not params.hotwords:
|
78 |
-
params.hotwords = None
|
79 |
-
|
80 |
-
params.suppress_tokens = self.format_suppress_tokens_str(params.suppress_tokens)
|
81 |
-
|
82 |
segments, info = self.model.transcribe(
|
83 |
audio=audio,
|
84 |
language=params.lang,
|
@@ -114,11 +102,7 @@ class FasterWhisperInference(WhisperBase):
|
|
114 |
segments_result = []
|
115 |
for segment in segments:
|
116 |
progress(segment.start / info.duration, desc="Transcribing..")
|
117 |
-
segments_result.append(
|
118 |
-
"start": segment.start,
|
119 |
-
"end": segment.end,
|
120 |
-
"text": segment.text
|
121 |
-
})
|
122 |
|
123 |
elapsed_time = time.time() - start_time
|
124 |
return segments_result, elapsed_time
|
|
|
12 |
from argparse import Namespace
|
13 |
|
14 |
from modules.utils.paths import (FASTER_WHISPER_MODELS_DIR, DIARIZATION_MODELS_DIR, UVR_MODELS_DIR, OUTPUT_DIR)
|
15 |
+
from modules.whisper.data_classes import *
|
16 |
+
from modules.whisper.base_transcription_pipeline import BaseTranscriptionPipeline
|
17 |
|
18 |
|
19 |
+
class FasterWhisperInference(BaseTranscriptionPipeline):
|
20 |
def __init__(self,
|
21 |
model_dir: str = FASTER_WHISPER_MODELS_DIR,
|
22 |
diarization_model_dir: str = DIARIZATION_MODELS_DIR,
|
|
|
35 |
self.model_paths = self.get_model_paths()
|
36 |
self.device = self.get_device()
|
37 |
self.available_models = self.model_paths.keys()
|
|
|
|
|
38 |
|
39 |
def transcribe(self,
|
40 |
audio: Union[str, BinaryIO, np.ndarray],
|
41 |
progress: gr.Progress = gr.Progress(),
|
42 |
*whisper_params,
|
43 |
+
) -> Tuple[List[Segment], float]:
|
44 |
"""
|
45 |
transcribe method for faster-whisper.
|
46 |
|
|
|
55 |
|
56 |
Returns
|
57 |
----------
|
58 |
+
segments_result: List[Segment]
|
59 |
+
list of Segment that includes start, end timestamps and transcribed text
|
60 |
elapsed_time: float
|
61 |
elapsed time for transcription
|
62 |
"""
|
63 |
start_time = time.time()
|
64 |
|
65 |
+
params = WhisperParams.from_list(list(whisper_params))
|
66 |
|
67 |
if params.model_size != self.current_model_size or self.model is None or self.current_compute_type != params.compute_type:
|
68 |
self.update_model(params.model_size, params.compute_type, progress)
|
69 |
|
|
|
|
70 |
segments, info = self.model.transcribe(
|
71 |
audio=audio,
|
72 |
language=params.lang,
|
|
|
102 |
segments_result = []
|
103 |
for segment in segments:
|
104 |
progress(segment.start / info.duration, desc="Transcribing..")
|
105 |
+
segments_result.append(Segment.from_faster_whisper(segment))
|
|
|
|
106 |
|
107 |
elapsed_time = time.time() - start_time
|
108 |
return segments_result, elapsed_time
|
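
With this change, downstream consumers no longer index raw dicts; they read attributes on the shared `Segment` model. A small sketch of turning the `transcribe()` output into plain text lines (the timestamp formatting here is illustrative, not the project's subtitle writer):

```python
from modules.whisper.data_classes import Segment


def to_plain_lines(segments: list[Segment]) -> str:
    lines = []
    for seg in segments:
        start = seg.start if seg.start is not None else 0.0
        end = seg.end if seg.end is not None else start
        lines.append(f"[{start:7.2f} --> {end:7.2f}] {(seg.text or '').strip()}")
    return "\n".join(lines)


print(to_plain_lines([Segment(start=0.0, end=2.5, text=" Hello world")]))
# [   0.00 -->    2.50] Hello world
```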
modules/whisper/insanely_fast_whisper_inference.py
CHANGED
@@ -12,11 +12,11 @@ from rich.progress import Progress, TimeElapsedColumn, BarColumn, TextColumn
 from argparse import Namespace

 from modules.utils.paths import (INSANELY_FAST_WHISPER_MODELS_DIR, DIARIZATION_MODELS_DIR, UVR_MODELS_DIR, OUTPUT_DIR)
-from modules.whisper.whisper_parameter import *
-from modules.whisper.whisper_base import WhisperBase
+from modules.whisper.data_classes import *
+from modules.whisper.base_transcription_pipeline import BaseTranscriptionPipeline


-class InsanelyFastWhisperInference(WhisperBase):
+class InsanelyFastWhisperInference(BaseTranscriptionPipeline):
     def __init__(self,
                  model_dir: str = INSANELY_FAST_WHISPER_MODELS_DIR,
                  diarization_model_dir: str = DIARIZATION_MODELS_DIR,

@@ -32,16 +32,13 @@ class InsanelyFastWhisperInference(WhisperBase):
         self.model_dir = model_dir
         os.makedirs(self.model_dir, exist_ok=True)

-        openai_models = whisper.available_models()
-        distil_models = ["distil-large-v2", "distil-large-v3", "distil-medium.en", "distil-small.en"]
-        self.available_models = openai_models + distil_models
-        self.available_compute_types = ["float16"]
+        self.available_models = self.get_model_paths()

     def transcribe(self,
                    audio: Union[str, np.ndarray, torch.Tensor],
                    progress: gr.Progress = gr.Progress(),
                    *whisper_params,
-                   ) -> Tuple[List[dict], float]:
+                   ) -> Tuple[List[Segment], float]:
         """
         transcribe method for faster-whisper.

@@ -56,13 +53,13 @@ class InsanelyFastWhisperInference(WhisperBase):
         Returns
         ----------
-        segments_result: List[dict]
-            list of dicts that includes start, end timestamps and transcribed text
+        segments_result: List[Segment]
+            list of Segment that includes start, end timestamps and transcribed text
         elapsed_time: float
             elapsed time for transcription
         """
         start_time = time.time()
-        params = WhisperParameters.as_value(*whisper_params)
+        params = WhisperParams.from_list(list(whisper_params))

         if params.model_size != self.current_model_size or self.model is None or self.current_compute_type != params.compute_type:
             self.update_model(params.model_size, params.compute_type, progress)

@@ -96,9 +93,17 @@ class InsanelyFastWhisperInference(WhisperBase):
             generate_kwargs=kwargs
         )

-        segments_result = self.format_result(
-            transcribed_result=segments,
-        )
+        segments_result = []
+        for item in segments["chunks"]:
+            start, end = item["timestamp"][0], item["timestamp"][1]
+            if end is None:
+                end = start
+            segments_result.append(Segment(
+                text=item["text"],
+                start=start,
+                end=end
+            ))
+
         elapsed_time = time.time() - start_time
         return segments_result, elapsed_time

@@ -139,31 +144,26 @@ class InsanelyFastWhisperInference(WhisperBase):
             model_kwargs={"attn_implementation": "flash_attention_2"} if is_flash_attn_2_available() else {"attn_implementation": "sdpa"},
         )

-    @staticmethod
-    def format_result(
-        transcribed_result: dict
-    ) -> List[dict]:
+    def get_model_paths(self):
         """
-        Parameters
-        ----------
-        transcribed_result: dict
-            Transcription result of the insanely_fast_whisper
+        Get available models from models path including fine-tuned model.

         Returns
         ----------
-            Formatted result as the same with other implementation
+        Name set of models
         """
+        openai_models = whisper.available_models()
+        distil_models = ["distil-large-v2", "distil-large-v3", "distil-medium.en", "distil-small.en"]
+        default_models = openai_models + distil_models
+
+        existing_models = os.listdir(self.model_dir)
+        wrong_dirs = [".locks"]
+
+        available_models = default_models + existing_models
+        available_models = [model for model in available_models if model not in wrong_dirs]
+        available_models = sorted(set(available_models), key=available_models.index)
+
+        return available_models

     @staticmethod
     def download_model(
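The per-chunk loop above replaces the old format_result helper. Each entry of segments["chunks"] comes from the Hugging Face ASR pipeline that insanely-fast-whisper wraps, shaped like {"text": ..., "timestamp": (start, end)}, and the trailing chunk's end timestamp can be None. A small self-contained illustration with made-up data (not real model output):

# Hand-written chunks that mimic the shape of the ASR pipeline output.
chunks = [
    {"text": " And so my fellow Americans,", "timestamp": (0.0, 2.6)},
    {"text": " ask not what your country can do for you.", "timestamp": (2.6, None)},  # open-ended tail
]

segments = []
for item in chunks:
    start, end = item["timestamp"]
    if end is None:   # clamp a missing end time to the start, as the diff above does
        end = start
    segments.append({"start": start, "end": end, "text": item["text"]})

print(segments[-1])   # {'start': 2.6, 'end': 2.6, 'text': ' ask not what your country can do for you.'}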
modules/whisper/whisper_Inference.py
CHANGED
@@ -8,11 +8,11 @@ import os
 from argparse import Namespace

 from modules.utils.paths import (WHISPER_MODELS_DIR, DIARIZATION_MODELS_DIR, OUTPUT_DIR, UVR_MODELS_DIR)
-from modules.whisper.whisper_base import WhisperBase
-from modules.whisper.whisper_parameter import *
+from modules.whisper.base_transcription_pipeline import BaseTranscriptionPipeline
+from modules.whisper.data_classes import *


-class WhisperInference(WhisperBase):
+class WhisperInference(BaseTranscriptionPipeline):
     def __init__(self,
                  model_dir: str = WHISPER_MODELS_DIR,
                  diarization_model_dir: str = DIARIZATION_MODELS_DIR,

@@ -30,7 +30,7 @@ class WhisperInference(WhisperBase):
                    audio: Union[str, np.ndarray, torch.Tensor],
                    progress: gr.Progress = gr.Progress(),
                    *whisper_params,
-                   ) -> Tuple[List[dict], float]:
+                   ) -> Tuple[List[Segment], float]:
         """
         transcribe method for faster-whisper.

@@ -45,13 +45,13 @@ class WhisperInference(WhisperBase):
         Returns
         ----------
-        segments_result: List[dict]
-            list of dicts that includes start, end timestamps and transcribed text
+        segments_result: List[Segment]
+            list of Segment that includes start, end timestamps and transcribed text
         elapsed_time: float
             elapsed time for transcription
         """
         start_time = time.time()
-        params = WhisperParameters.as_value(*whisper_params)
+        params = WhisperParams.from_list(list(whisper_params))

         if params.model_size != self.current_model_size or self.model is None or self.current_compute_type != params.compute_type:
             self.update_model(params.model_size, params.compute_type, progress)

@@ -59,21 +59,28 @@ class WhisperInference(WhisperBase):
         def progress_callback(progress_value):
             progress(progress_value, desc="Transcribing..")

+        result = self.model.transcribe(audio=audio,
+                                       language=params.lang,
+                                       verbose=False,
+                                       beam_size=params.beam_size,
+                                       logprob_threshold=params.log_prob_threshold,
+                                       no_speech_threshold=params.no_speech_threshold,
+                                       task="translate" if params.is_translate and self.current_model_size in self.translatable_models else "transcribe",
+                                       fp16=True if params.compute_type == "float16" else False,
+                                       best_of=params.best_of,
+                                       patience=params.patience,
+                                       temperature=params.temperature,
+                                       compression_ratio_threshold=params.compression_ratio_threshold,
+                                       progress_callback=progress_callback,)["segments"]
+        segments_result = []
+        for segment in result:
+            segments_result.append(Segment(
+                start=segment["start"],
+                end=segment["end"],
+                text=segment["text"]
+            ))

+        elapsed_time = time.time() - start_time
         return segments_result, elapsed_time

     def update_model(self,
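For reference, openai-whisper's model.transcribe(...) returns a dict whose "segments" entries already carry start, end and text keys, which is exactly what the loop above repackages into Segment objects. A tiny illustration with fabricated data rather than a real model call:

# Fabricated example of the {"segments": [...]} structure returned by openai-whisper.
result = {
    "text": " Ask not what your country can do for you.",
    "segments": [
        {"id": 0, "start": 0.0, "end": 3.2, "text": " Ask not what your country can do for you."},
    ],
}

segments = [
    {"start": seg["start"], "end": seg["end"], "text": seg["text"]}
    for seg in result["segments"]
]
print(segments[0]["text"])  # " Ask not what your country can do for you."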
modules/whisper/whisper_factory.py
CHANGED
@@ -6,7 +6,8 @@ from modules.utils.paths import (FASTER_WHISPER_MODELS_DIR, DIARIZATION_MODELS_D
 from modules.whisper.faster_whisper_inference import FasterWhisperInference
 from modules.whisper.whisper_Inference import WhisperInference
 from modules.whisper.insanely_fast_whisper_inference import InsanelyFastWhisperInference
-from modules.whisper.whisper_base import WhisperBase
+from modules.whisper.base_transcription_pipeline import BaseTranscriptionPipeline
+from modules.whisper.data_classes import *


 class WhisperFactory:

@@ -19,7 +20,7 @@ class WhisperFactory:
                              diarization_model_dir: str = DIARIZATION_MODELS_DIR,
                              uvr_model_dir: str = UVR_MODELS_DIR,
                              output_dir: str = OUTPUT_DIR,
-                             ) -> "WhisperBase":
+                             ) -> "BaseTranscriptionPipeline":
         """
         Create a whisper inference class based on the provided whisper_type.

@@ -45,36 +46,29 @@ class WhisperFactory:
         Returns
         -------
-        WhisperBase
+        BaseTranscriptionPipeline
             An instance of the appropriate whisper inference class based on the whisper_type.
         """
         # Temporal fix of the bug : https://github.com/jhj0517/Whisper-WebUI/issues/144
         os.environ['KMP_DUPLICATE_LIB_OK'] = 'True'

+        whisper_type = whisper_type.strip().lower()

-        whisper_typos = ["whisper"]
-        insanely_fast_whisper_typos = [
-            "insanely_fast_whisper", "insanely-fast-whisper", "insanelyfastwhisper",
-            "insanely_faster_whisper", "insanely-faster-whisper", "insanelyfasterwhisper"
-        ]
-
-        if whisper_type in faster_whisper_typos:
+        if whisper_type == WhisperImpl.FASTER_WHISPER.value:
             return FasterWhisperInference(
                 model_dir=faster_whisper_model_dir,
                 output_dir=output_dir,
                 diarization_model_dir=diarization_model_dir,
                 uvr_model_dir=uvr_model_dir
             )
-        elif whisper_type in whisper_typos:
+        elif whisper_type == WhisperImpl.WHISPER.value:
             return WhisperInference(
                 model_dir=whisper_model_dir,
                 output_dir=output_dir,
                 diarization_model_dir=diarization_model_dir,
                 uvr_model_dir=uvr_model_dir
             )
-        elif whisper_type in insanely_fast_whisper_typos:
+        elif whisper_type == WhisperImpl.INSANELY_FAST_WHISPER.value:
             return InsanelyFastWhisperInference(
                 model_dir=insanely_fast_whisper_model_dir,
                 output_dir=output_dir,
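app.py drives this factory, but for orientation, creating a backend and inspecting it might look roughly like the sketch below. The factory signature and the WhisperImpl enum are taken from the diff; everything else (running it standalone, relying on the default model directories) is an assumption:

# Hedged usage sketch: create a backend through the factory and inspect it.
from modules.whisper.whisper_factory import WhisperFactory
from modules.whisper.data_classes import WhisperImpl

# After this change only an exact WhisperImpl value is matched; the old typo aliases were removed.
pipeline = WhisperFactory.create_whisper_inference(whisper_type=WhisperImpl.FASTER_WHISPER.value)
print(type(pipeline).__name__)  # FasterWhisperInference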
modules/whisper/whisper_parameter.py
DELETED
@@ -1,369 +0,0 @@
-from dataclasses import dataclass, fields
-import gradio as gr
-from typing import Optional, Dict
-import yaml
-
-
-@dataclass
-class WhisperParameters:
-    model_size: gr.Dropdown
-    lang: gr.Dropdown
-    is_translate: gr.Checkbox
-    beam_size: gr.Number
-    log_prob_threshold: gr.Number
-    no_speech_threshold: gr.Number
-    compute_type: gr.Dropdown
-    best_of: gr.Number
-    patience: gr.Number
-    condition_on_previous_text: gr.Checkbox
-    prompt_reset_on_temperature: gr.Slider
-    initial_prompt: gr.Textbox
-    temperature: gr.Slider
-    compression_ratio_threshold: gr.Number
-    vad_filter: gr.Checkbox
-    threshold: gr.Slider
-    min_speech_duration_ms: gr.Number
-    max_speech_duration_s: gr.Number
-    min_silence_duration_ms: gr.Number
-    speech_pad_ms: gr.Number
-    batch_size: gr.Number
-    is_diarize: gr.Checkbox
-    hf_token: gr.Textbox
-    diarization_device: gr.Dropdown
-    length_penalty: gr.Number
-    repetition_penalty: gr.Number
-    no_repeat_ngram_size: gr.Number
-    prefix: gr.Textbox
-    suppress_blank: gr.Checkbox
-    suppress_tokens: gr.Textbox
-    max_initial_timestamp: gr.Number
-    word_timestamps: gr.Checkbox
-    prepend_punctuations: gr.Textbox
-    append_punctuations: gr.Textbox
-    max_new_tokens: gr.Number
-    chunk_length: gr.Number
-    hallucination_silence_threshold: gr.Number
-    hotwords: gr.Textbox
-    language_detection_threshold: gr.Number
-    language_detection_segments: gr.Number
-    is_bgm_separate: gr.Checkbox
-    uvr_model_size: gr.Dropdown
-    uvr_device: gr.Dropdown
-    uvr_segment_size: gr.Number
-    uvr_save_file: gr.Checkbox
-    uvr_enable_offload: gr.Checkbox
-    """
-    A data class for Gradio components of the Whisper Parameters. Use "before" Gradio pre-processing.
-    This data class is used to mitigate the key-value problem between Gradio components and function parameters.
-    Related Gradio issue: https://github.com/gradio-app/gradio/issues/2471
-    See more about Gradio pre-processing: https://www.gradio.app/docs/components
-
-    Attributes
-    ----------
-    model_size: gr.Dropdown
-        Whisper model size.
-    lang: gr.Dropdown
-        Source language of the file to transcribe.
-    is_translate: gr.Checkbox
-        Boolean value that determines whether to translate to English.
-        It's Whisper's feature to translate speech from another language directly into English end-to-end.
-    beam_size: gr.Number
-        Int value that is used for decoding option.
-    log_prob_threshold: gr.Number
-        If the average log probability over sampled tokens is below this value, treat as failed.
-    no_speech_threshold: gr.Number
-        If the no_speech probability is higher than this value AND the average log probability
-        over sampled tokens is below `log_prob_threshold`, consider the segment as silent.
-    compute_type: gr.Dropdown
-        compute type for transcription. see more info : https://opennmt.net/CTranslate2/quantization.html
-    best_of: gr.Number
-        Number of candidates when sampling with non-zero temperature.
-    patience: gr.Number
-        Beam search patience factor.
-    condition_on_previous_text: gr.Checkbox
-        if True, the previous output of the model is provided as a prompt for the next window;
-        disabling may make the text inconsistent across windows, but the model becomes less prone to
-        getting stuck in a failure loop, such as repetition looping or timestamps going out of sync.
-    initial_prompt: gr.Textbox
-        Optional text to provide as a prompt for the first window. This can be used to provide, or
-        "prompt-engineer" a context for transcription, e.g. custom vocabularies or proper nouns
-        to make it more likely to predict those word correctly.
-    temperature: gr.Slider
-        Temperature for sampling. It can be a tuple of temperatures, which will be successively used
-        upon failures according to either `compression_ratio_threshold` or `log_prob_threshold`.
-    compression_ratio_threshold: gr.Number
-        If the gzip compression ratio is above this value, treat as failed
-    vad_filter: gr.Checkbox
-        Enable the voice activity detection (VAD) to filter out parts of the audio without speech.
-        This step is using the Silero VAD model https://github.com/snakers4/silero-vad.
-    threshold: gr.Slider
-        This parameter is related with Silero VAD. Speech threshold. Silero VAD outputs speech probabilities
-        for each audio chunk, probabilities ABOVE this value are considered as SPEECH. It is better to tune this
-        parameter for each dataset separately, but "lazy" 0.5 is pretty good for most datasets.
-    min_speech_duration_ms: gr.Number
-        This parameter is related with Silero VAD. Final speech chunks shorter min_speech_duration_ms are thrown out.
-    max_speech_duration_s: gr.Number
-        This parameter is related with Silero VAD. Maximum duration of speech chunks in seconds. Chunks longer
-        than max_speech_duration_s will be split at the timestamp of the last silence that lasts more than 100ms
-        (if any), to prevent aggressive cutting. Otherwise, they will be split aggressively just before max_speech_duration_s.
-    min_silence_duration_ms: gr.Number
-        This parameter is related with Silero VAD. In the end of each speech chunk wait for min_silence_duration_ms
-        before separating it
-    speech_pad_ms: gr.Number
-        This parameter is related with Silero VAD. Final speech chunks are padded by speech_pad_ms each side
-    batch_size: gr.Number
-        This parameter is related with insanely-fast-whisper pipe. Batch size to pass to the pipe
-    is_diarize: gr.Checkbox
-        This parameter is related with whisperx. Boolean value that determines whether to diarize or not.
-    hf_token: gr.Textbox
-        This parameter is related with whisperx. Huggingface token is needed to download diarization models.
-        Read more about : https://huggingface.co/pyannote/speaker-diarization-3.1#requirements
-    diarization_device: gr.Dropdown
-        This parameter is related with whisperx. Device to run diarization model
-    length_penalty: gr.Number
-        This parameter is related to faster-whisper. Exponential length penalty constant.
-    repetition_penalty: gr.Number
-        This parameter is related to faster-whisper. Penalty applied to the score of previously generated tokens
-        (set > 1 to penalize).
-    no_repeat_ngram_size: gr.Number
-        This parameter is related to faster-whisper. Prevent repetitions of n-grams with this size (set 0 to disable).
-    prefix: gr.Textbox
-        This parameter is related to faster-whisper. Optional text to provide as a prefix for the first window.
-    suppress_blank: gr.Checkbox
-        This parameter is related to faster-whisper. Suppress blank outputs at the beginning of the sampling.
-    suppress_tokens: gr.Textbox
-        This parameter is related to faster-whisper. List of token IDs to suppress. -1 will suppress a default set
-        of symbols as defined in the model config.json file.
-    max_initial_timestamp: gr.Number
-        This parameter is related to faster-whisper. The initial timestamp cannot be later than this.
-    word_timestamps: gr.Checkbox
-        This parameter is related to faster-whisper. Extract word-level timestamps using the cross-attention pattern
-        and dynamic time warping, and include the timestamps for each word in each segment.
-    prepend_punctuations: gr.Textbox
-        This parameter is related to faster-whisper. If word_timestamps is True, merge these punctuation symbols
-        with the next word.
-    append_punctuations: gr.Textbox
-        This parameter is related to faster-whisper. If word_timestamps is True, merge these punctuation symbols
-        with the previous word.
-    max_new_tokens: gr.Number
-        This parameter is related to faster-whisper. Maximum number of new tokens to generate per-chunk. If not set,
-        the maximum will be set by the default max_length.
-    chunk_length: gr.Number
-        This parameter is related to faster-whisper and insanely-fast-whisper. The length of audio segments in seconds.
-        If it is not None, it will overwrite the default chunk_length of the FeatureExtractor.
-    hallucination_silence_threshold: gr.Number
-        This parameter is related to faster-whisper. When word_timestamps is True, skip silent periods longer than
-        this threshold (in seconds) when a possible hallucination is detected.
-    hotwords: gr.Textbox
-        This parameter is related to faster-whisper. Hotwords/hint phrases to provide the model with. Has no effect if prefix is not None.
-    language_detection_threshold: gr.Number
-        This parameter is related to faster-whisper. If the maximum probability of the language tokens is higher than this value, the language is detected.
-    language_detection_segments: gr.Number
-        This parameter is related to faster-whisper. Number of segments to consider for the language detection.
-    is_separate_bgm: gr.Checkbox
-        This parameter is related to UVR. Boolean value that determines whether to separate bgm or not.
-    uvr_model_size: gr.Dropdown
-        This parameter is related to UVR. UVR model size.
-    uvr_device: gr.Dropdown
-        This parameter is related to UVR. Device to run UVR model.
-    uvr_segment_size: gr.Number
-        This parameter is related to UVR. Segment size for UVR model.
-    uvr_save_file: gr.Checkbox
-        This parameter is related to UVR. Boolean value that determines whether to save the file or not.
-    uvr_enable_offload: gr.Checkbox
-        This parameter is related to UVR. Boolean value that determines whether to offload the UVR model or not
-        after each transcription.
-    """
-
-    def as_list(self) -> list:
-        """
-        Converts the data class attributes into a list, Use in Gradio UI before Gradio pre-processing.
-        See more about Gradio pre-processing: : https://www.gradio.app/docs/components
-
-        Returns
-        ----------
-        A list of Gradio components
-        """
-        return [getattr(self, f.name) for f in fields(self)]
-
-    @staticmethod
-    def as_value(*args) -> 'WhisperValues':
-        """
-        To use Whisper parameters in function after Gradio post-processing.
-        See more about Gradio post-processing: : https://www.gradio.app/docs/components
-
-        Returns
-        ----------
-        WhisperValues
-            Data class that has values of parameters
-        """
-        return WhisperValues(*args)
-
-
-@dataclass
-class WhisperValues:
-    model_size: str = "large-v2"
-    lang: Optional[str] = None
-    is_translate: bool = False
-    beam_size: int = 5
-    log_prob_threshold: float = -1.0
-    no_speech_threshold: float = 0.6
-    compute_type: str = "float16"
-    best_of: int = 5
-    patience: float = 1.0
-    condition_on_previous_text: bool = True
-    prompt_reset_on_temperature: float = 0.5
-    initial_prompt: Optional[str] = None
-    temperature: float = 0.0
-    compression_ratio_threshold: float = 2.4
-    vad_filter: bool = False
-    threshold: float = 0.5
-    min_speech_duration_ms: int = 250
-    max_speech_duration_s: float = float("inf")
-    min_silence_duration_ms: int = 2000
-    speech_pad_ms: int = 400
-    batch_size: int = 24
-    is_diarize: bool = False
-    hf_token: str = ""
-    diarization_device: str = "cuda"
-    length_penalty: float = 1.0
-    repetition_penalty: float = 1.0
-    no_repeat_ngram_size: int = 0
-    prefix: Optional[str] = None
-    suppress_blank: bool = True
-    suppress_tokens: Optional[str] = "[-1]"
-    max_initial_timestamp: float = 0.0
-    word_timestamps: bool = False
-    prepend_punctuations: Optional[str] = "\"'“¿([{-"
-    append_punctuations: Optional[str] = "\"'.。,,!!??::”)]}、"
-    max_new_tokens: Optional[int] = None
-    chunk_length: Optional[int] = 30
-    hallucination_silence_threshold: Optional[float] = None
-    hotwords: Optional[str] = None
-    language_detection_threshold: Optional[float] = None
-    language_detection_segments: int = 1
-    is_bgm_separate: bool = False
-    uvr_model_size: str = "UVR-MDX-NET-Inst_HQ_4"
-    uvr_device: str = "cuda"
-    uvr_segment_size: int = 256
-    uvr_save_file: bool = False
-    uvr_enable_offload: bool = True
-    """
-    A data class to use Whisper parameters.
-    """
-
-    def to_yaml(self) -> Dict:
-        data = {
-            "whisper": {
-                "model_size": self.model_size,
-                "lang": "Automatic Detection" if self.lang is None else self.lang,
-                "is_translate": self.is_translate,
-                "beam_size": self.beam_size,
-                "log_prob_threshold": self.log_prob_threshold,
-                "no_speech_threshold": self.no_speech_threshold,
-                "best_of": self.best_of,
-                "patience": self.patience,
-                "condition_on_previous_text": self.condition_on_previous_text,
-                "prompt_reset_on_temperature": self.prompt_reset_on_temperature,
-                "initial_prompt": None if not self.initial_prompt else self.initial_prompt,
-                "temperature": self.temperature,
-                "compression_ratio_threshold": self.compression_ratio_threshold,
-                "batch_size": self.batch_size,
-                "length_penalty": self.length_penalty,
-                "repetition_penalty": self.repetition_penalty,
-                "no_repeat_ngram_size": self.no_repeat_ngram_size,
-                "prefix": None if not self.prefix else self.prefix,
-                "suppress_blank": self.suppress_blank,
-                "suppress_tokens": self.suppress_tokens,
-                "max_initial_timestamp": self.max_initial_timestamp,
-                "word_timestamps": self.word_timestamps,
-                "prepend_punctuations": self.prepend_punctuations,
-                "append_punctuations": self.append_punctuations,
-                "max_new_tokens": self.max_new_tokens,
-                "chunk_length": self.chunk_length,
-                "hallucination_silence_threshold": self.hallucination_silence_threshold,
-                "hotwords": None if not self.hotwords else self.hotwords,
-                "language_detection_threshold": self.language_detection_threshold,
-                "language_detection_segments": self.language_detection_segments,
-            },
-            "vad": {
-                "vad_filter": self.vad_filter,
-                "threshold": self.threshold,
-                "min_speech_duration_ms": self.min_speech_duration_ms,
-                "max_speech_duration_s": self.max_speech_duration_s,
-                "min_silence_duration_ms": self.min_silence_duration_ms,
-                "speech_pad_ms": self.speech_pad_ms,
-            },
-            "diarization": {
-                "is_diarize": self.is_diarize,
-                "hf_token": self.hf_token
-            },
-            "bgm_separation": {
-                "is_separate_bgm": self.is_bgm_separate,
-                "model_size": self.uvr_model_size,
-                "segment_size": self.uvr_segment_size,
-                "save_file": self.uvr_save_file,
-                "enable_offload": self.uvr_enable_offload
-            },
-        }
-        return data
-
-    def as_list(self) -> list:
-        """
-        Converts the data class attributes into a list
-
-        Returns
-        ----------
-        A list of Whisper parameters
-        """
-        return [getattr(self, f.name) for f in fields(self)]
notebook/whisper-webui.ipynb
CHANGED
@@ -54,7 +54,9 @@
 "%cd Whisper-WebUI\n",
 "!pip install git+https://github.com/jhj0517/jhj0517-whisper.git\n",
 "!pip install faster-whisper==1.0.3\n",
+"!pip install ctranslate2==4.4.0\n",
+"!pip install gradio\n",
+"!pip install gradio-i18n\n",
 "# Temporal bug fix from https://github.com/jhj0517/Whisper-WebUI/issues/256\n",
 "!pip install git+https://github.com/JuanBindez/pytubefix.git\n",
 "!pip install tokenizers==0.19.1\n",
requirements.txt
CHANGED
@@ -2,15 +2,16 @@
 # If you're using it, update url to your CUDA version (CUDA 12.1 is minimum requirement):
 # For CUDA 12.1, use : https://download.pytorch.org/whl/cu121
 # For CUDA 12.4, use : https://download.pytorch.org/whl/cu124
+--extra-index-url https://download.pytorch.org/whl/cu124

+torch
+torchaudio
 git+https://github.com/jhj0517/jhj0517-whisper.git
 faster-whisper==1.0.3
 transformers
 gradio
+gradio-i18n
 pytubefix
 ruamel.yaml==0.18.6
 pyannote.audio==3.3.1
screenshot.png
CHANGED
tests/test_bgm_separation.py
CHANGED
@@ -1,6 +1,6 @@
 from modules.utils.paths import *
 from modules.whisper.whisper_factory import WhisperFactory
-from modules.whisper.whisper_parameter import *
+from modules.whisper.data_classes import *
 from test_config import *
 from test_transcription import download_file, test_transcribe

@@ -17,9 +17,9 @@ import os
 @pytest.mark.parametrize(
     "whisper_type,vad_filter,bgm_separation,diarization",
     [
+        (WhisperImpl.WHISPER.value, False, True, False),
+        (WhisperImpl.FASTER_WHISPER.value, False, True, False),
+        (WhisperImpl.INSANELY_FAST_WHISPER.value, False, True, False)
     ]
 )
 def test_bgm_separation_pipeline(

@@ -38,9 +38,9 @@ def test_bgm_separation_pipeline(
 @pytest.mark.parametrize(
     "whisper_type,vad_filter,bgm_separation,diarization",
     [
+        (WhisperImpl.WHISPER.value, True, True, False),
+        (WhisperImpl.FASTER_WHISPER.value, True, True, False),
+        (WhisperImpl.INSANELY_FAST_WHISPER.value, True, True, False)
     ]
 )
 def test_bgm_separation_with_vad_pipeline(
tests/test_config.py
CHANGED
@@ -1,10 +1,14 @@
+import functools
+import jiwer
 import os
 import torch

+from modules.utils.paths import *
+from modules.utils.youtube_manager import *
+
 TEST_FILE_DOWNLOAD_URL = "https://github.com/jhj0517/whisper_flutter_new/raw/main/example/assets/jfk.wav"
 TEST_FILE_PATH = os.path.join(WEBUI_DIR, "tests", "jfk.wav")
+TEST_ANSWER = "And so my fellow Americans ask not what your country can do for you ask what you can do for your country"
 TEST_YOUTUBE_URL = "https://www.youtube.com/watch?v=4WEQtgnBu0I&ab_channel=AndriaFitzer"
 TEST_WHISPER_MODEL = "tiny"
 TEST_UVR_MODEL = "UVR-MDX-NET-Inst_HQ_4"

@@ -13,5 +17,24 @@ TEST_SUBTITLE_SRT_PATH = os.path.join(WEBUI_DIR, "tests", "test_srt.srt")
 TEST_SUBTITLE_VTT_PATH = os.path.join(WEBUI_DIR, "tests", "test_vtt.vtt")


+@functools.lru_cache
 def is_cuda_available():
     return torch.cuda.is_available()
+
+
+@functools.lru_cache
+def is_pytube_detected_bot(url: str = TEST_YOUTUBE_URL):
+    try:
+        yt_temp_path = os.path.join("modules", "yt_tmp.wav")
+        if os.path.exists(yt_temp_path):
+            return False
+        yt = get_ytdata(url)
+        audio = get_ytaudio(yt)
+        return False
+    except Exception as e:
+        print(f"Pytube has detected as a bot: {e}")
+        return True
+
+
+def calculate_wer(answer, prediction):
+    return jiwer.wer(answer, prediction)
tests/test_diarization.py
CHANGED
@@ -1,6 +1,6 @@
 from modules.utils.paths import *
 from modules.whisper.whisper_factory import WhisperFactory
-from modules.whisper.whisper_parameter import *
+from modules.whisper.data_classes import *
 from test_config import *
 from test_transcription import download_file, test_transcribe

@@ -16,9 +16,9 @@ import os
 @pytest.mark.parametrize(
     "whisper_type,vad_filter,bgm_separation,diarization",
     [
+        (WhisperImpl.WHISPER.value, False, False, True),
+        (WhisperImpl.FASTER_WHISPER.value, False, False, True),
+        (WhisperImpl.INSANELY_FAST_WHISPER.value, False, False, True)
     ]
 )
 def test_diarization_pipeline(
tests/test_transcription.py
CHANGED
@@ -1,5 +1,6 @@
 from modules.whisper.whisper_factory import WhisperFactory
-from modules.whisper.whisper_parameter import *
+from modules.whisper.data_classes import *
+from modules.utils.subtitle_manager import read_file
 from modules.utils.paths import WEBUI_DIR
 from test_config import *

@@ -12,9 +13,9 @@ import os
 @pytest.mark.parametrize(
     "whisper_type,vad_filter,bgm_separation,diarization",
     [
+        (WhisperImpl.WHISPER.value, False, False, False),
+        (WhisperImpl.FASTER_WHISPER.value, False, False, False),
+        (WhisperImpl.INSANELY_FAST_WHISPER.value, False, False, False)
     ]
 )
 def test_transcribe(

@@ -28,6 +29,10 @@ def test_transcribe(
     if not os.path.exists(audio_path):
         download_file(TEST_FILE_DOWNLOAD_URL, audio_path_dir)

+    answer = TEST_ANSWER
+    if diarization:
+        answer = "SPEAKER_00|"+TEST_ANSWER
+
     whisper_inferencer = WhisperFactory.create_whisper_inference(
         whisper_type=whisper_type,
     )

@@ -37,16 +42,24 @@ def test_transcribe(
         f"""Diarization Device: {whisper_inferencer.diarizer.device}"""
     )

+    hparams = TranscriptionPipelineParams(
+        whisper=WhisperParams(
+            model_size=TEST_WHISPER_MODEL,
+            compute_type=whisper_inferencer.current_compute_type
+        ),
+        vad=VadParams(
+            vad_filter=vad_filter
+        ),
+        bgm_separation=BGMSeparationParams(
+            is_separate_bgm=bgm_separation,
+            enable_offload=True
+        ),
+        diarization=DiarizationParams(
+            is_diarize=diarization
+        ),
+    ).to_list()
+
+    subtitle_str, file_paths = whisper_inferencer.transcribe_file(
         [audio_path],
         None,
         "SRT",

@@ -54,29 +67,29 @@ def test_transcribe(
         gr.Progress(),
         *hparams,
     )
+    subtitle = read_file(file_paths[0]).split("\n")
+    assert calculate_wer(answer, subtitle[2].strip().replace(",", "").replace(".", "")) < 0.1
+
+    if not is_pytube_detected_bot():
+        subtitle_str, file_path = whisper_inferencer.transcribe_youtube(
+            TEST_YOUTUBE_URL,
+            "SRT",
+            False,
+            gr.Progress(),
+            *hparams,
+        )
+        assert isinstance(subtitle_str, str) and subtitle_str
+        assert os.path.exists(file_path)
+
-    whisper_inferencer.transcribe_mic(
+    subtitle_str, file_path = whisper_inferencer.transcribe_mic(
         audio_path,
         "SRT",
         False,
         gr.Progress(),
         *hparams,
     )
+    subtitle = read_file(file_path).split("\n")
+    assert calculate_wer(answer, subtitle[2].strip().replace(",", "").replace(".", "")) < 0.1


 def download_file(url, save_dir):
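The subtitle[2] index in the assertions above works because an SRT file starts each cue with a counter line, then a timestamp line, then the text. A tiny sanity check of that layout, with a sample cue written by hand:

# Hand-written SRT snippet to show why index 2 is the first cue's text line.
srt = """1
00:00:00,000 --> 00:00:03,500
And so my fellow Americans ask not what your country can do for you
"""

lines = srt.split("\n")
print(lines[0])  # "1"            -> cue counter
print(lines[1])  # timestamp line
print(lines[2])  # transcribed text, the line the WER assertion compares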
tests/test_translation.py
CHANGED
@@ -28,6 +28,10 @@ def test_nllb_inference(
     assert isinstance(file_paths[0], str)


+@pytest.mark.skipif(
+    os.getenv("DEEPL_API_KEY") is None or not os.getenv("DEEPL_API_KEY"),
+    reason="DeepL API key is unavailable"
+)
 @pytest.mark.parametrize("file_path", [
     TEST_SUBTITLE_SRT_PATH,
     TEST_SUBTITLE_VTT_PATH,
tests/test_vad.py
CHANGED
@@ -1,6 +1,6 @@
 from modules.utils.paths import *
 from modules.whisper.whisper_factory import WhisperFactory
-from modules.whisper.whisper_parameter import *
+from modules.whisper.data_classes import *
 from test_config import *
 from test_transcription import download_file, test_transcribe

@@ -12,9 +12,9 @@ import os
 @pytest.mark.parametrize(
     "whisper_type,vad_filter,bgm_separation,diarization",
     [
+        (WhisperImpl.WHISPER.value, True, False, False),
+        (WhisperImpl.FASTER_WHISPER.value, True, False, False),
+        (WhisperImpl.INSANELY_FAST_WHISPER.value, True, False, False)
     ]
 )
 def test_vad_pipeline(