Spaces:
Running
on
T4
Running
on
T4
jhj0517
committed on
Commit
•
abd7185
1
Parent(s):
78d87d3
initial_commit
Browse files- .gitignore +7 -0
- Install.bat +22 -0
- Install.sh +21 -0
- LICENSE +201 -0
- app.py +241 -0
- modules/__init__.py +0 -0
- modules/base_interface.py +21 -0
- modules/deepl_api.py +196 -0
- modules/faster_whisper_inference.py +438 -0
- modules/nllb_inference.py +345 -0
- modules/subtitle_manager.py +135 -0
- modules/whisper_Inference.py +424 -0
- modules/youtube_manager.py +15 -0
- notebook/whisper-webui.ipynb +125 -0
- requirements.txt +7 -0
- screenshot.png +0 -0
- start-webui.bat +18 -0
- start-webui.sh +12 -0
- ui/__init__.py +0 -0
- ui/htmls.py +97 -0
- user-start-webui.bat +41 -0
.gitignore
ADDED
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
venv/
|
2 |
+
ui/__pycache__/
|
3 |
+
outputs/
|
4 |
+
modules/__pycache__/
|
5 |
+
models/
|
6 |
+
modules/yt_tmp.wav
|
7 |
+
.idea/
|
Install.bat
ADDED
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@echo off
REM Bootstrap script: create a local Python virtual environment (.\venv)
REM on first run, activate it, and install the project's requirements.

REM %~dp0 is the directory this script lives in, so the venv is created
REM next to the script regardless of the caller's working directory.
if not exist "%~dp0\venv\Scripts" (
    echo Creating venv...
    python -m venv venv
)

echo checked the venv folder. now installing requirements..
REM Activate the venv by calling its activate.bat from inside Scripts\.
cd /d "%~dp0\venv\Scripts"
call activate.bat

REM Back to the project root where requirements.txt resides.
cd /d "%~dp0"
pip install -r requirements.txt

REM errorlevel 1 is true when pip exited with status >= 1 (failure).
if errorlevel 1 (
    echo.
    echo Requirements installation failed. please remove venv folder and run install.bat again.
) else (
    echo.
    echo Requirements installed successfully.
)
pause
Install.sh
ADDED
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
#!/bin/bash
# Bootstrap script: create a Python virtual environment (./venv) if it is
# missing, activate it, and install the project's requirements into it.

if [ ! -d "venv" ]; then
    echo "Creating virtual environment..."
    # Abort if venv creation fails; otherwise the activate/pip steps below
    # would silently install into the system interpreter.
    python -m venv venv || { echo "Failed to create the virtual environment."; exit 1; }
fi

# Activation must succeed before installing, for the same reason as above.
source venv/bin/activate || { echo "Failed to activate the virtual environment."; exit 1; }

# Test the command directly instead of inspecting $? afterwards.
if ! pip install -r requirements.txt; then
    echo ""
    echo "Requirements installation failed. please remove venv folder and run install.sh again."
    deactivate
    exit 1
fi

echo ""
echo "Requirements installed successfully."

deactivate
LICENSE
ADDED
@@ -0,0 +1,201 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
Apache License
|
2 |
+
Version 2.0, January 2004
|
3 |
+
http://www.apache.org/licenses/
|
4 |
+
|
5 |
+
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
|
6 |
+
|
7 |
+
1. Definitions.
|
8 |
+
|
9 |
+
"License" shall mean the terms and conditions for use, reproduction,
|
10 |
+
and distribution as defined by Sections 1 through 9 of this document.
|
11 |
+
|
12 |
+
"Licensor" shall mean the copyright owner or entity authorized by
|
13 |
+
the copyright owner that is granting the License.
|
14 |
+
|
15 |
+
"Legal Entity" shall mean the union of the acting entity and all
|
16 |
+
other entities that control, are controlled by, or are under common
|
17 |
+
control with that entity. For the purposes of this definition,
|
18 |
+
"control" means (i) the power, direct or indirect, to cause the
|
19 |
+
direction or management of such entity, whether by contract or
|
20 |
+
otherwise, or (ii) ownership of fifty percent (50%) or more of the
|
21 |
+
outstanding shares, or (iii) beneficial ownership of such entity.
|
22 |
+
|
23 |
+
"You" (or "Your") shall mean an individual or Legal Entity
|
24 |
+
exercising permissions granted by this License.
|
25 |
+
|
26 |
+
"Source" form shall mean the preferred form for making modifications,
|
27 |
+
including but not limited to software source code, documentation
|
28 |
+
source, and configuration files.
|
29 |
+
|
30 |
+
"Object" form shall mean any form resulting from mechanical
|
31 |
+
transformation or translation of a Source form, including but
|
32 |
+
not limited to compiled object code, generated documentation,
|
33 |
+
and conversions to other media types.
|
34 |
+
|
35 |
+
"Work" shall mean the work of authorship, whether in Source or
|
36 |
+
Object form, made available under the License, as indicated by a
|
37 |
+
copyright notice that is included in or attached to the work
|
38 |
+
(an example is provided in the Appendix below).
|
39 |
+
|
40 |
+
"Derivative Works" shall mean any work, whether in Source or Object
|
41 |
+
form, that is based on (or derived from) the Work and for which the
|
42 |
+
editorial revisions, annotations, elaborations, or other modifications
|
43 |
+
represent, as a whole, an original work of authorship. For the purposes
|
44 |
+
of this License, Derivative Works shall not include works that remain
|
45 |
+
separable from, or merely link (or bind by name) to the interfaces of,
|
46 |
+
the Work and Derivative Works thereof.
|
47 |
+
|
48 |
+
"Contribution" shall mean any work of authorship, including
|
49 |
+
the original version of the Work and any modifications or additions
|
50 |
+
to that Work or Derivative Works thereof, that is intentionally
|
51 |
+
submitted to Licensor for inclusion in the Work by the copyright owner
|
52 |
+
or by an individual or Legal Entity authorized to submit on behalf of
|
53 |
+
the copyright owner. For the purposes of this definition, "submitted"
|
54 |
+
means any form of electronic, verbal, or written communication sent
|
55 |
+
to the Licensor or its representatives, including but not limited to
|
56 |
+
communication on electronic mailing lists, source code control systems,
|
57 |
+
and issue tracking systems that are managed by, or on behalf of, the
|
58 |
+
Licensor for the purpose of discussing and improving the Work, but
|
59 |
+
excluding communication that is conspicuously marked or otherwise
|
60 |
+
designated in writing by the copyright owner as "Not a Contribution."
|
61 |
+
|
62 |
+
"Contributor" shall mean Licensor and any individual or Legal Entity
|
63 |
+
on behalf of whom a Contribution has been received by Licensor and
|
64 |
+
subsequently incorporated within the Work.
|
65 |
+
|
66 |
+
2. Grant of Copyright License. Subject to the terms and conditions of
|
67 |
+
this License, each Contributor hereby grants to You a perpetual,
|
68 |
+
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
69 |
+
copyright license to reproduce, prepare Derivative Works of,
|
70 |
+
publicly display, publicly perform, sublicense, and distribute the
|
71 |
+
Work and such Derivative Works in Source or Object form.
|
72 |
+
|
73 |
+
3. Grant of Patent License. Subject to the terms and conditions of
|
74 |
+
this License, each Contributor hereby grants to You a perpetual,
|
75 |
+
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
76 |
+
(except as stated in this section) patent license to make, have made,
|
77 |
+
use, offer to sell, sell, import, and otherwise transfer the Work,
|
78 |
+
where such license applies only to those patent claims licensable
|
79 |
+
by such Contributor that are necessarily infringed by their
|
80 |
+
Contribution(s) alone or by combination of their Contribution(s)
|
81 |
+
with the Work to which such Contribution(s) was submitted. If You
|
82 |
+
institute patent litigation against any entity (including a
|
83 |
+
cross-claim or counterclaim in a lawsuit) alleging that the Work
|
84 |
+
or a Contribution incorporated within the Work constitutes direct
|
85 |
+
or contributory patent infringement, then any patent licenses
|
86 |
+
granted to You under this License for that Work shall terminate
|
87 |
+
as of the date such litigation is filed.
|
88 |
+
|
89 |
+
4. Redistribution. You may reproduce and distribute copies of the
|
90 |
+
Work or Derivative Works thereof in any medium, with or without
|
91 |
+
modifications, and in Source or Object form, provided that You
|
92 |
+
meet the following conditions:
|
93 |
+
|
94 |
+
(a) You must give any other recipients of the Work or
|
95 |
+
Derivative Works a copy of this License; and
|
96 |
+
|
97 |
+
(b) You must cause any modified files to carry prominent notices
|
98 |
+
stating that You changed the files; and
|
99 |
+
|
100 |
+
(c) You must retain, in the Source form of any Derivative Works
|
101 |
+
that You distribute, all copyright, patent, trademark, and
|
102 |
+
attribution notices from the Source form of the Work,
|
103 |
+
excluding those notices that do not pertain to any part of
|
104 |
+
the Derivative Works; and
|
105 |
+
|
106 |
+
(d) If the Work includes a "NOTICE" text file as part of its
|
107 |
+
distribution, then any Derivative Works that You distribute must
|
108 |
+
include a readable copy of the attribution notices contained
|
109 |
+
within such NOTICE file, excluding those notices that do not
|
110 |
+
pertain to any part of the Derivative Works, in at least one
|
111 |
+
of the following places: within a NOTICE text file distributed
|
112 |
+
as part of the Derivative Works; within the Source form or
|
113 |
+
documentation, if provided along with the Derivative Works; or,
|
114 |
+
within a display generated by the Derivative Works, if and
|
115 |
+
wherever such third-party notices normally appear. The contents
|
116 |
+
of the NOTICE file are for informational purposes only and
|
117 |
+
do not modify the License. You may add Your own attribution
|
118 |
+
notices within Derivative Works that You distribute, alongside
|
119 |
+
or as an addendum to the NOTICE text from the Work, provided
|
120 |
+
that such additional attribution notices cannot be construed
|
121 |
+
as modifying the License.
|
122 |
+
|
123 |
+
You may add Your own copyright statement to Your modifications and
|
124 |
+
may provide additional or different license terms and conditions
|
125 |
+
for use, reproduction, or distribution of Your modifications, or
|
126 |
+
for any such Derivative Works as a whole, provided Your use,
|
127 |
+
reproduction, and distribution of the Work otherwise complies with
|
128 |
+
the conditions stated in this License.
|
129 |
+
|
130 |
+
5. Submission of Contributions. Unless You explicitly state otherwise,
|
131 |
+
any Contribution intentionally submitted for inclusion in the Work
|
132 |
+
by You to the Licensor shall be under the terms and conditions of
|
133 |
+
this License, without any additional terms or conditions.
|
134 |
+
Notwithstanding the above, nothing herein shall supersede or modify
|
135 |
+
the terms of any separate license agreement you may have executed
|
136 |
+
with Licensor regarding such Contributions.
|
137 |
+
|
138 |
+
6. Trademarks. This License does not grant permission to use the trade
|
139 |
+
names, trademarks, service marks, or product names of the Licensor,
|
140 |
+
except as required for reasonable and customary use in describing the
|
141 |
+
origin of the Work and reproducing the content of the NOTICE file.
|
142 |
+
|
143 |
+
7. Disclaimer of Warranty. Unless required by applicable law or
|
144 |
+
agreed to in writing, Licensor provides the Work (and each
|
145 |
+
Contributor provides its Contributions) on an "AS IS" BASIS,
|
146 |
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
147 |
+
implied, including, without limitation, any warranties or conditions
|
148 |
+
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
|
149 |
+
PARTICULAR PURPOSE. You are solely responsible for determining the
|
150 |
+
appropriateness of using or redistributing the Work and assume any
|
151 |
+
risks associated with Your exercise of permissions under this License.
|
152 |
+
|
153 |
+
8. Limitation of Liability. In no event and under no legal theory,
|
154 |
+
whether in tort (including negligence), contract, or otherwise,
|
155 |
+
unless required by applicable law (such as deliberate and grossly
|
156 |
+
negligent acts) or agreed to in writing, shall any Contributor be
|
157 |
+
liable to You for damages, including any direct, indirect, special,
|
158 |
+
incidental, or consequential damages of any character arising as a
|
159 |
+
result of this License or out of the use or inability to use the
|
160 |
+
Work (including but not limited to damages for loss of goodwill,
|
161 |
+
work stoppage, computer failure or malfunction, or any and all
|
162 |
+
other commercial damages or losses), even if such Contributor
|
163 |
+
has been advised of the possibility of such damages.
|
164 |
+
|
165 |
+
9. Accepting Warranty or Additional Liability. While redistributing
|
166 |
+
the Work or Derivative Works thereof, You may choose to offer,
|
167 |
+
and charge a fee for, acceptance of support, warranty, indemnity,
|
168 |
+
or other liability obligations and/or rights consistent with this
|
169 |
+
License. However, in accepting such obligations, You may act only
|
170 |
+
on Your own behalf and on Your sole responsibility, not on behalf
|
171 |
+
of any other Contributor, and only if You agree to indemnify,
|
172 |
+
defend, and hold each Contributor harmless for any liability
|
173 |
+
incurred by, or claims asserted against, such Contributor by reason
|
174 |
+
of your accepting any such warranty or additional liability.
|
175 |
+
|
176 |
+
END OF TERMS AND CONDITIONS
|
177 |
+
|
178 |
+
APPENDIX: How to apply the Apache License to your work.
|
179 |
+
|
180 |
+
To apply the Apache License to your work, attach the following
|
181 |
+
boilerplate notice, with the fields enclosed by brackets "[]"
|
182 |
+
replaced with your own identifying information. (Don't include
|
183 |
+
the brackets!) The text should be enclosed in the appropriate
|
184 |
+
comment syntax for the file format. We also recommend that a
|
185 |
+
file or class name and description of purpose be included on the
|
186 |
+
same "printed page" as the copyright notice for easier
|
187 |
+
identification within third-party archives.
|
188 |
+
|
189 |
+
Copyright 2023 jhj0517
|
190 |
+
|
191 |
+
Licensed under the Apache License, Version 2.0 (the "License");
|
192 |
+
you may not use this file except in compliance with the License.
|
193 |
+
You may obtain a copy of the License at
|
194 |
+
|
195 |
+
http://www.apache.org/licenses/LICENSE-2.0
|
196 |
+
|
197 |
+
Unless required by applicable law or agreed to in writing, software
|
198 |
+
distributed under the License is distributed on an "AS IS" BASIS,
|
199 |
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
200 |
+
See the License for the specific language governing permissions and
|
201 |
+
limitations under the License.
|
app.py
ADDED
@@ -0,0 +1,241 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import gradio as gr
|
2 |
+
import os
|
3 |
+
import argparse
|
4 |
+
|
5 |
+
from modules.whisper_Inference import WhisperInference
|
6 |
+
from modules.faster_whisper_inference import FasterWhisperInference
|
7 |
+
from modules.nllb_inference import NLLBInference
|
8 |
+
from ui.htmls import *
|
9 |
+
from modules.youtube_manager import get_ytmetas
|
10 |
+
from modules.deepl_api import DeepLAPI
|
11 |
+
|
12 |
+
class App:
|
13 |
+
def __init__(self, args):
|
14 |
+
self.args = args
|
15 |
+
self.app = gr.Blocks(css=CSS, theme=self.args.theme)
|
16 |
+
self.whisper_inf = WhisperInference() if self.args.disable_faster_whisper else FasterWhisperInference()
|
17 |
+
if isinstance(self.whisper_inf, FasterWhisperInference):
|
18 |
+
print("Use Faster Whisper implementation")
|
19 |
+
else:
|
20 |
+
print("Use Open AI Whisper implementation")
|
21 |
+
print(f"Device \"{self.whisper_inf.device}\" is detected")
|
22 |
+
self.nllb_inf = NLLBInference()
|
23 |
+
self.deepl_api = DeepLAPI()
|
24 |
+
|
25 |
+
@staticmethod
|
26 |
+
def open_folder(folder_path: str):
|
27 |
+
if os.path.exists(folder_path):
|
28 |
+
os.system(f"start {folder_path}")
|
29 |
+
else:
|
30 |
+
print(f"The folder {folder_path} does not exist.")
|
31 |
+
|
32 |
+
@staticmethod
|
33 |
+
def on_change_models(model_size: str):
|
34 |
+
translatable_model = ["large", "large-v1", "large-v2", "large-v3"]
|
35 |
+
if model_size not in translatable_model:
|
36 |
+
return gr.Checkbox(visible=False, value=False, interactive=False)
|
37 |
+
else:
|
38 |
+
return gr.Checkbox(visible=True, value=False, label="Translate to English?", interactive=True)
|
39 |
+
|
40 |
+
def launch(self):
|
41 |
+
with self.app:
|
42 |
+
with gr.Row():
|
43 |
+
with gr.Column():
|
44 |
+
gr.Markdown(MARKDOWN, elem_id="md_project")
|
45 |
+
with gr.Tabs():
|
46 |
+
with gr.TabItem("File"): # tab1
|
47 |
+
with gr.Row():
|
48 |
+
input_file = gr.Files(type="filepath", label="Upload File here")
|
49 |
+
with gr.Row():
|
50 |
+
dd_model = gr.Dropdown(choices=self.whisper_inf.available_models, value="large-v3",
|
51 |
+
label="Model")
|
52 |
+
dd_lang = gr.Dropdown(choices=["Automatic Detection"] + self.whisper_inf.available_langs,
|
53 |
+
value="Automatic Detection", label="Language")
|
54 |
+
dd_file_format = gr.Dropdown(["SRT", "WebVTT", "txt"], value="SRT", label="File Format")
|
55 |
+
with gr.Row():
|
56 |
+
cb_translate = gr.Checkbox(value=False, label="Translate to English?", interactive=True)
|
57 |
+
with gr.Row():
|
58 |
+
cb_timestamp = gr.Checkbox(value=True, label="Add a timestamp to the end of the filename", interactive=True)
|
59 |
+
with gr.Accordion("Advanced_Parameters", open=False):
|
60 |
+
nb_beam_size = gr.Number(label="Beam Size", value=1, precision=0, interactive=True)
|
61 |
+
nb_log_prob_threshold = gr.Number(label="Log Probability Threshold", value=-1.0, interactive=True)
|
62 |
+
nb_no_speech_threshold = gr.Number(label="No Speech Threshold", value=0.6, interactive=True)
|
63 |
+
dd_compute_type = gr.Dropdown(label="Compute Type", choices=self.whisper_inf.available_compute_types, value=self.whisper_inf.current_compute_type, interactive=True)
|
64 |
+
with gr.Row():
|
65 |
+
btn_run = gr.Button("GENERATE SUBTITLE FILE", variant="primary")
|
66 |
+
with gr.Row():
|
67 |
+
tb_indicator = gr.Textbox(label="Output", scale=4)
|
68 |
+
files_subtitles = gr.Files(label="Downloadable output file", scale=4, interactive=False)
|
69 |
+
btn_openfolder = gr.Button('📂', scale=1)
|
70 |
+
|
71 |
+
params = [input_file, dd_model, dd_lang, dd_file_format, cb_translate, cb_timestamp]
|
72 |
+
advanced_params = [nb_beam_size, nb_log_prob_threshold, nb_no_speech_threshold, dd_compute_type]
|
73 |
+
btn_run.click(fn=self.whisper_inf.transcribe_file,
|
74 |
+
inputs=params + advanced_params,
|
75 |
+
outputs=[tb_indicator, files_subtitles])
|
76 |
+
btn_openfolder.click(fn=lambda: self.open_folder("outputs"), inputs=None, outputs=None)
|
77 |
+
dd_model.change(fn=self.on_change_models, inputs=[dd_model], outputs=[cb_translate])
|
78 |
+
|
79 |
+
with gr.TabItem("Youtube"): # tab2
|
80 |
+
with gr.Row():
|
81 |
+
tb_youtubelink = gr.Textbox(label="Youtube Link")
|
82 |
+
with gr.Row(equal_height=True):
|
83 |
+
with gr.Column():
|
84 |
+
img_thumbnail = gr.Image(label="Youtube Thumbnail")
|
85 |
+
with gr.Column():
|
86 |
+
tb_title = gr.Label(label="Youtube Title")
|
87 |
+
tb_description = gr.Textbox(label="Youtube Description", max_lines=15)
|
88 |
+
with gr.Row():
|
89 |
+
dd_model = gr.Dropdown(choices=self.whisper_inf.available_models, value="large-v3",
|
90 |
+
label="Model")
|
91 |
+
dd_lang = gr.Dropdown(choices=["Automatic Detection"] + self.whisper_inf.available_langs,
|
92 |
+
value="Automatic Detection", label="Language")
|
93 |
+
dd_file_format = gr.Dropdown(choices=["SRT", "WebVTT", "txt"], value="SRT", label="File Format")
|
94 |
+
with gr.Row():
|
95 |
+
cb_translate = gr.Checkbox(value=False, label="Translate to English?", interactive=True)
|
96 |
+
with gr.Row():
|
97 |
+
cb_timestamp = gr.Checkbox(value=True, label="Add a timestamp to the end of the filename",
|
98 |
+
interactive=True)
|
99 |
+
with gr.Accordion("Advanced_Parameters", open=False):
|
100 |
+
nb_beam_size = gr.Number(label="Beam Size", value=1, precision=0, interactive=True)
|
101 |
+
nb_log_prob_threshold = gr.Number(label="Log Probability Threshold", value=-1.0, interactive=True)
|
102 |
+
nb_no_speech_threshold = gr.Number(label="No Speech Threshold", value=0.6, interactive=True)
|
103 |
+
dd_compute_type = gr.Dropdown(label="Compute Type", choices=self.whisper_inf.available_compute_types, value=self.whisper_inf.current_compute_type, interactive=True)
|
104 |
+
with gr.Row():
|
105 |
+
btn_run = gr.Button("GENERATE SUBTITLE FILE", variant="primary")
|
106 |
+
with gr.Row():
|
107 |
+
tb_indicator = gr.Textbox(label="Output", scale=4)
|
108 |
+
files_subtitles = gr.Files(label="Downloadable output file", scale=4)
|
109 |
+
btn_openfolder = gr.Button('📂', scale=1)
|
110 |
+
|
111 |
+
params = [tb_youtubelink, dd_model, dd_lang, dd_file_format, cb_translate, cb_timestamp]
|
112 |
+
advanced_params = [nb_beam_size, nb_log_prob_threshold, nb_no_speech_threshold, dd_compute_type]
|
113 |
+
btn_run.click(fn=self.whisper_inf.transcribe_youtube,
|
114 |
+
inputs=params + advanced_params,
|
115 |
+
outputs=[tb_indicator, files_subtitles])
|
116 |
+
tb_youtubelink.change(get_ytmetas, inputs=[tb_youtubelink],
|
117 |
+
outputs=[img_thumbnail, tb_title, tb_description])
|
118 |
+
btn_openfolder.click(fn=lambda: self.open_folder("outputs"), inputs=None, outputs=None)
|
119 |
+
dd_model.change(fn=self.on_change_models, inputs=[dd_model], outputs=[cb_translate])
|
120 |
+
|
121 |
+
with gr.TabItem("Mic"): # tab3
|
122 |
+
with gr.Row():
|
123 |
+
mic_input = gr.Microphone(label="Record with Mic", type="filepath", interactive=True)
|
124 |
+
with gr.Row():
|
125 |
+
dd_model = gr.Dropdown(choices=self.whisper_inf.available_models, value="large-v3",
|
126 |
+
label="Model")
|
127 |
+
dd_lang = gr.Dropdown(choices=["Automatic Detection"] + self.whisper_inf.available_langs,
|
128 |
+
value="Automatic Detection", label="Language")
|
129 |
+
dd_file_format = gr.Dropdown(["SRT", "WebVTT", "txt"], value="SRT", label="File Format")
|
130 |
+
with gr.Row():
|
131 |
+
cb_translate = gr.Checkbox(value=False, label="Translate to English?", interactive=True)
|
132 |
+
with gr.Accordion("Advanced_Parameters", open=False):
|
133 |
+
nb_beam_size = gr.Number(label="Beam Size", value=1, precision=0, interactive=True)
|
134 |
+
nb_log_prob_threshold = gr.Number(label="Log Probability Threshold", value=-1.0, interactive=True)
|
135 |
+
nb_no_speech_threshold = gr.Number(label="No Speech Threshold", value=0.6, interactive=True)
|
136 |
+
dd_compute_type = gr.Dropdown(label="Compute Type", choices=self.whisper_inf.available_compute_types, value=self.whisper_inf.current_compute_type, interactive=True)
|
137 |
+
with gr.Row():
|
138 |
+
btn_run = gr.Button("GENERATE SUBTITLE FILE", variant="primary")
|
139 |
+
with gr.Row():
|
140 |
+
tb_indicator = gr.Textbox(label="Output", scale=4)
|
141 |
+
files_subtitles = gr.Files(label="Downloadable output file", scale=4)
|
142 |
+
btn_openfolder = gr.Button('📂', scale=1)
|
143 |
+
|
144 |
+
params = [mic_input, dd_model, dd_lang, dd_file_format, cb_translate]
|
145 |
+
advanced_params = [nb_beam_size, nb_log_prob_threshold, nb_no_speech_threshold, dd_compute_type]
|
146 |
+
btn_run.click(fn=self.whisper_inf.transcribe_mic,
|
147 |
+
inputs=params + advanced_params,
|
148 |
+
outputs=[tb_indicator, files_subtitles])
|
149 |
+
btn_openfolder.click(fn=lambda: self.open_folder("outputs"), inputs=None, outputs=None)
|
150 |
+
dd_model.change(fn=self.on_change_models, inputs=[dd_model], outputs=[cb_translate])
|
151 |
+
|
152 |
+
with gr.TabItem("T2T Translation"): # tab 4
|
153 |
+
with gr.Row():
|
154 |
+
file_subs = gr.Files(type="filepath", label="Upload Subtitle Files to translate here",
|
155 |
+
file_types=['.vtt', '.srt'])
|
156 |
+
|
157 |
+
with gr.TabItem("DeepL API"): # sub tab1
|
158 |
+
with gr.Row():
|
159 |
+
tb_authkey = gr.Textbox(label="Your Auth Key (API KEY)",
|
160 |
+
value="")
|
161 |
+
with gr.Row():
|
162 |
+
dd_deepl_sourcelang = gr.Dropdown(label="Source Language", value="Automatic Detection",
|
163 |
+
choices=list(
|
164 |
+
self.deepl_api.available_source_langs.keys()))
|
165 |
+
dd_deepl_targetlang = gr.Dropdown(label="Target Language", value="English",
|
166 |
+
choices=list(
|
167 |
+
self.deepl_api.available_target_langs.keys()))
|
168 |
+
with gr.Row():
|
169 |
+
cb_deepl_ispro = gr.Checkbox(label="Pro User?", value=False)
|
170 |
+
with gr.Row():
|
171 |
+
btn_run = gr.Button("TRANSLATE SUBTITLE FILE", variant="primary")
|
172 |
+
with gr.Row():
|
173 |
+
tb_indicator = gr.Textbox(label="Output", scale=4)
|
174 |
+
files_subtitles = gr.Files(label="Downloadable output file", scale=4)
|
175 |
+
btn_openfolder = gr.Button('📂', scale=1)
|
176 |
+
|
177 |
+
btn_run.click(fn=self.deepl_api.translate_deepl,
|
178 |
+
inputs=[tb_authkey, file_subs, dd_deepl_sourcelang, dd_deepl_targetlang,
|
179 |
+
cb_deepl_ispro],
|
180 |
+
outputs=[tb_indicator, files_subtitles])
|
181 |
+
|
182 |
+
btn_openfolder.click(fn=lambda: self.open_folder(os.path.join("outputs", "translations")),
|
183 |
+
inputs=None,
|
184 |
+
outputs=None)
|
185 |
+
|
186 |
+
with gr.TabItem("NLLB"): # sub tab2
|
187 |
+
with gr.Row():
|
188 |
+
dd_nllb_model = gr.Dropdown(label="Model", value=self.nllb_inf.default_model_size,
|
189 |
+
choices=self.nllb_inf.available_models)
|
190 |
+
dd_nllb_sourcelang = gr.Dropdown(label="Source Language",
|
191 |
+
choices=self.nllb_inf.available_source_langs)
|
192 |
+
dd_nllb_targetlang = gr.Dropdown(label="Target Language",
|
193 |
+
choices=self.nllb_inf.available_target_langs)
|
194 |
+
with gr.Row():
|
195 |
+
cb_timestamp = gr.Checkbox(value=True, label="Add a timestamp to the end of the filename",
|
196 |
+
interactive=True)
|
197 |
+
with gr.Row():
|
198 |
+
btn_run = gr.Button("TRANSLATE SUBTITLE FILE", variant="primary")
|
199 |
+
with gr.Row():
|
200 |
+
tb_indicator = gr.Textbox(label="Output", scale=4)
|
201 |
+
files_subtitles = gr.Files(label="Downloadable output file", scale=4)
|
202 |
+
btn_openfolder = gr.Button('📂', scale=1)
|
203 |
+
with gr.Column():
|
204 |
+
md_vram_table = gr.HTML(NLLB_VRAM_TABLE, elem_id="md_nllb_vram_table")
|
205 |
+
|
206 |
+
btn_run.click(fn=self.nllb_inf.translate_file,
|
207 |
+
inputs=[file_subs, dd_nllb_model, dd_nllb_sourcelang, dd_nllb_targetlang, cb_timestamp],
|
208 |
+
outputs=[tb_indicator, files_subtitles])
|
209 |
+
|
210 |
+
btn_openfolder.click(fn=lambda: self.open_folder(os.path.join("outputs", "translations")),
|
211 |
+
inputs=None,
|
212 |
+
outputs=None)
|
213 |
+
|
214 |
+
# Launch the app with optional gradio settings
|
215 |
+
launch_args = {}
|
216 |
+
if self.args.share:
|
217 |
+
launch_args['share'] = self.args.share
|
218 |
+
if self.args.server_name:
|
219 |
+
launch_args['server_name'] = self.args.server_name
|
220 |
+
if self.args.server_port:
|
221 |
+
launch_args['server_port'] = self.args.server_port
|
222 |
+
if self.args.username and self.args.password:
|
223 |
+
launch_args['auth'] = (self.args.username, self.args.password)
|
224 |
+
self.app.queue(api_open=False).launch(**launch_args)
|
225 |
+
|
226 |
+
|
227 |
+
# Create the parser for command-line arguments
|
228 |
+
parser = argparse.ArgumentParser()
|
229 |
+
parser.add_argument('--disable_faster_whisper', type=bool, default=False, nargs='?', const=True, help='Disable the faster_whisper implementation. faster_whipser is implemented by https://github.com/guillaumekln/faster-whisper')
|
230 |
+
parser.add_argument('--share', type=bool, default=False, nargs='?', const=True, help='Gradio share value')
|
231 |
+
parser.add_argument('--server_name', type=str, default=None, help='Gradio server host')
|
232 |
+
parser.add_argument('--server_port', type=int, default=None, help='Gradio server port')
|
233 |
+
parser.add_argument('--username', type=str, default=None, help='Gradio authentication username')
|
234 |
+
parser.add_argument('--password', type=str, default=None, help='Gradio authentication password')
|
235 |
+
parser.add_argument('--theme', type=str, default=None, help='Gradio Blocks theme')
|
236 |
+
parser.add_argument('--colab', type=bool, default=False, nargs='?', const=True, help='Is colab user or not')
|
237 |
+
_args = parser.parse_args()
|
238 |
+
|
239 |
+
if __name__ == "__main__":
|
240 |
+
app = App(args=_args)
|
241 |
+
app.launch()
|
modules/__init__.py
ADDED
File without changes
|
modules/base_interface.py
ADDED
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import torch
|
3 |
+
from typing import List
|
4 |
+
|
5 |
+
|
6 |
+
class BaseInterface:
|
7 |
+
def __init__(self):
|
8 |
+
pass
|
9 |
+
|
10 |
+
@staticmethod
|
11 |
+
def release_cuda_memory():
|
12 |
+
if torch.cuda.is_available():
|
13 |
+
torch.cuda.empty_cache()
|
14 |
+
torch.cuda.reset_max_memory_allocated()
|
15 |
+
|
16 |
+
@staticmethod
|
17 |
+
def remove_input_files(file_paths: List[str]):
|
18 |
+
for file_path in file_paths:
|
19 |
+
if file_path is None or not os.path.exists(file_path):
|
20 |
+
continue
|
21 |
+
os.remove(file_path)
|
modules/deepl_api.py
ADDED
@@ -0,0 +1,196 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import requests
|
2 |
+
import time
|
3 |
+
import os
|
4 |
+
from datetime import datetime
|
5 |
+
import gradio as gr
|
6 |
+
|
7 |
+
from modules.subtitle_manager import *
|
8 |
+
|
9 |
+
"""
|
10 |
+
This is written with reference to the DeepL API documentation.
|
11 |
+
If you want to know the information of the DeepL API, see here: https://www.deepl.com/docs-api/documents
|
12 |
+
"""
|
13 |
+
|
14 |
+
# Display name -> DeepL target-language code.
# Mirrors the "target_lang" values accepted by the DeepL v2 API
# (https://www.deepl.com/docs-api). Note that English and Portuguese
# have regional variants that only exist on the *target* side.
DEEPL_AVAILABLE_TARGET_LANGS = {
    'Bulgarian': 'BG',
    'Czech': 'CS',
    'Danish': 'DA',
    'German': 'DE',
    'Greek': 'EL',
    'English': 'EN',
    'English (British)': 'EN-GB',
    'English (American)': 'EN-US',
    'Spanish': 'ES',
    'Estonian': 'ET',
    'Finnish': 'FI',
    'French': 'FR',
    'Hungarian': 'HU',
    'Indonesian': 'ID',
    'Italian': 'IT',
    'Japanese': 'JA',
    'Korean': 'KO',
    'Lithuanian': 'LT',
    'Latvian': 'LV',
    'Norwegian (Bokmål)': 'NB',
    'Dutch': 'NL',
    'Polish': 'PL',
    'Portuguese': 'PT',
    'Portuguese (Brazilian)': 'PT-BR',
    'Portuguese (all Portuguese varieties excluding Brazilian Portuguese)': 'PT-PT',
    'Romanian': 'RO',
    'Russian': 'RU',
    'Slovak': 'SK',
    'Slovenian': 'SL',
    'Swedish': 'SV',
    'Turkish': 'TR',
    'Ukrainian': 'UK',
    'Chinese (simplified)': 'ZH'
}
|
49 |
+
|
50 |
+
# Display name -> DeepL source-language code.
# "Automatic Detection" maps to None: omitting "source_lang" makes the
# DeepL API auto-detect the input language. Source codes have no
# regional variants (EN/PT only).
DEEPL_AVAILABLE_SOURCE_LANGS = {
    'Automatic Detection': None,
    'Bulgarian': 'BG',
    'Czech': 'CS',
    'Danish': 'DA',
    'German': 'DE',
    'Greek': 'EL',
    'English': 'EN',
    'Spanish': 'ES',
    'Estonian': 'ET',
    'Finnish': 'FI',
    'French': 'FR',
    'Hungarian': 'HU',
    'Indonesian': 'ID',
    'Italian': 'IT',
    'Japanese': 'JA',
    'Korean': 'KO',
    'Lithuanian': 'LT',
    'Latvian': 'LV',
    'Norwegian (Bokmål)': 'NB',
    'Dutch': 'NL',
    'Polish': 'PL',
    'Portuguese (all Portuguese varieties mixed)': 'PT',
    'Romanian': 'RO',
    'Russian': 'RU',
    'Slovak': 'SK',
    'Slovenian': 'SL',
    'Swedish': 'SV',
    'Turkish': 'TR',
    'Ukrainian': 'UK',
    'Chinese': 'ZH'
}
|
82 |
+
|
83 |
+
|
84 |
+
class DeepLAPI:
    """Thin client around the DeepL v2 REST API for translating subtitle files."""

    def __init__(self):
        # Seconds to sleep between consecutive API requests (naive rate limiting).
        self.api_interval = 1
        # DeepL accepts at most 50 "text" entries per request.
        self.max_text_batch_size = 50
        self.available_target_langs = DEEPL_AVAILABLE_TARGET_LANGS
        self.available_source_langs = DEEPL_AVAILABLE_SOURCE_LANGS

    def translate_deepl(self,
                        auth_key: str,
                        fileobjs: list,
                        source_lang: str,
                        target_lang: str,
                        is_pro: bool,
                        progress=gr.Progress()) -> list:
        """
        Translate subtitle files using the DeepL API.

        Parameters
        ----------
        auth_key: str
            API Key for DeepL from gr.Textbox()
        fileobjs: list
            List of files to translate from gr.Files(); only ".srt" and
            ".vtt" files are processed, anything else is skipped.
        source_lang: str
            Source language display name from gr.Dropdown()
        target_lang: str
            Target language display name from gr.Dropdown()
        is_pro: bool
            Whether the key belongs to a DeepL Pro account, from gr.Checkbox().
        progress: gr.Progress
            Indicator to show progress directly in gradio.

        Returns
        ----------
        A List of
            String to return to gr.Textbox()
            Path of the last written file, to return to gr.Files()
        """
        files_info = {}
        output_path = None
        for fileobj in fileobjs:
            file_path = fileobj.name
            file_name, file_ext = os.path.splitext(os.path.basename(fileobj.name))

            if file_ext == ".srt":
                parsed_dicts = parse_srt(file_path=file_path)
            elif file_ext == ".vtt":
                parsed_dicts = parse_vtt(file_path=file_path)
            else:
                # BUGFIX: the original fell through with `subtitle` undefined
                # (NameError on the first file, stale content afterwards).
                continue

            self._translate_batches(auth_key, parsed_dicts, source_lang,
                                    target_lang, is_pro, progress)

            if file_ext == ".srt":
                subtitle = get_serialized_srt(parsed_dicts)
            else:
                subtitle = get_serialized_vtt(parsed_dicts)

            timestamp = datetime.now().strftime("%m%d%H%M%S")
            # NOTE(review): strips the last 9 characters of the stem —
            # presumably a previous "-<timestamp>" suffix; confirm upstream.
            file_name = file_name[:-9]
            output_path = os.path.join("outputs", "translations",
                                       f"{file_name}-{timestamp}{file_ext}")
            os.makedirs(os.path.dirname(output_path), exist_ok=True)
            write_file(subtitle, output_path)

            files_info[file_name] = subtitle

        total_result = ''
        for file_name, subtitle in files_info.items():
            total_result += '------------------------------------\n'
            total_result += f'{file_name}\n\n'
            total_result += f'{subtitle}'

        # BUGFIX: message said "outputs/translation" while files are written
        # to "outputs/translations".
        gr_str = f"Done! Subtitle is in the outputs/translations folder.\n\n{total_result}"
        return [gr_str, output_path]

    def _translate_batches(self,
                           auth_key: str,
                           parsed_dicts: list,
                           source_lang: str,
                           target_lang: str,
                           is_pro: bool,
                           progress):
        """Translate parsed subtitle dicts in place, max_text_batch_size at a time.

        Extracted helper: the original duplicated this loop verbatim for the
        ".srt" and ".vtt" branches.
        """
        batch_size = self.max_text_batch_size
        for batch_start in range(0, len(parsed_dicts), batch_size):
            batch_end = min(batch_start + batch_size, len(parsed_dicts))
            sentences_to_translate = [dic["sentence"] for dic in parsed_dicts[batch_start:batch_end]]
            translated_texts = self.request_deepl_translate(auth_key, sentences_to_translate,
                                                            source_lang, target_lang, is_pro)
            for i, translated_text in enumerate(translated_texts):
                parsed_dicts[batch_start + i]["sentence"] = translated_text["text"]
            progress(batch_end / len(parsed_dicts), desc="Translating..")

    def request_deepl_translate(self,
                                auth_key: str,
                                text: list,
                                source_lang: str,
                                target_lang: str,
                                is_pro: bool):
        """Send one translation request to the DeepL server.

        Returns the "translations" list from the JSON response; each entry
        is a dict with a "text" key. Sleeps `api_interval` seconds afterwards.
        """
        url = 'https://api.deepl.com/v2/translate' if is_pro else 'https://api-free.deepl.com/v2/translate'
        headers = {
            'Authorization': f'DeepL-Auth-Key {auth_key}'
        }
        data = {
            'text': text,
            'source_lang': DEEPL_AVAILABLE_SOURCE_LANGS[source_lang],
            'target_lang': DEEPL_AVAILABLE_TARGET_LANGS[target_lang]
        }
        response = requests.post(url, headers=headers, data=data).json()
        time.sleep(self.api_interval)
        return response["translations"]
|
modules/faster_whisper_inference.py
ADDED
@@ -0,0 +1,438 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
|
3 |
+
import tqdm
|
4 |
+
import time
|
5 |
+
import numpy as np
|
6 |
+
from typing import BinaryIO, Union, Tuple
|
7 |
+
from datetime import datetime, timedelta
|
8 |
+
|
9 |
+
import faster_whisper
|
10 |
+
import ctranslate2
|
11 |
+
import whisper
|
12 |
+
import torch
|
13 |
+
import gradio as gr
|
14 |
+
|
15 |
+
from .base_interface import BaseInterface
|
16 |
+
from modules.subtitle_manager import get_srt, get_vtt, get_txt, write_file, safe_filename
|
17 |
+
from modules.youtube_manager import get_ytdata, get_ytaudio
|
18 |
+
|
19 |
+
|
20 |
+
class FasterWhisperInference(BaseInterface):
    """Transcription interface backed by faster-whisper (CTranslate2 runtime)."""

    def __init__(self):
        super().__init__()
        self.current_model_size = None
        self.model = None
        self.available_models = whisper.available_models()
        self.available_langs = sorted(list(whisper.tokenizer.LANGUAGES.values()))
        # Only these sizes support Whisper's end-to-end speech->English "translate" task.
        self.translatable_models = ["large", "large-v1", "large-v2", "large-v3"]
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        # Simplified: the original branched on device just to pass the same string.
        self.available_compute_types = ctranslate2.get_supported_compute_types(self.device)
        self.current_compute_type = "float16" if self.device == "cuda" else "float32"
        self.default_beam_size = 1

    def transcribe_file(self,
                        fileobjs: list,
                        model_size: str,
                        lang: str,
                        file_format: str,
                        istranslate: bool,
                        add_timestamp: bool,
                        beam_size: int,
                        log_prob_threshold: float,
                        no_speech_threshold: float,
                        compute_type: str,
                        progress=gr.Progress()
                        ) -> list:
        """
        Write subtitle files for uploaded audio files.

        Parameters
        ----------
        fileobjs: list
            List of files to transcribe from gr.Files()
        model_size: str
            Whisper model size from gr.Dropdown()
        lang: str
            Source language from gr.Dropdown() ("Automatic Detection" or a name)
        file_format: str
            Output format from gr.Dropdown(): "SRT", "WebVTT" or "txt"
        istranslate: bool
            Whether to translate speech into English (Whisper end-to-end feature).
        add_timestamp: bool
            Whether to append a timestamp to the output filename.
        beam_size: int
            Decoding beam size from gr.Number().
        log_prob_threshold: float
            Treat decoding as failed when the average log probability over
            sampled tokens is below this value.
        no_speech_threshold: float
            Consider a segment silent when the no-speech probability exceeds
            this AND the average log probability is below log_prob_threshold.
        compute_type: str
            CTranslate2 compute type; see
            https://opennmt.net/CTranslate2/quantization.html
        progress: gr.Progress
            Indicator to show progress directly in gradio.

        Returns
        ----------
        A List of
            String to return to gr.Textbox()
            Output file paths to return to gr.Files()
        """
        try:
            self.update_model_if_needed(model_size=model_size, compute_type=compute_type, progress=progress)

            files_info = {}
            for fileobj in fileobjs:
                transcribed_segments, time_for_task = self.transcribe(
                    audio=fileobj.name,
                    lang=lang,
                    istranslate=istranslate,
                    beam_size=beam_size,
                    log_prob_threshold=log_prob_threshold,
                    no_speech_threshold=no_speech_threshold,
                    progress=progress
                )

                file_name, file_ext = os.path.splitext(os.path.basename(fileobj.name))
                file_name = safe_filename(file_name)
                subtitle, file_path = self.generate_and_write_file(
                    file_name=file_name,
                    transcribed_segments=transcribed_segments,
                    add_timestamp=add_timestamp,
                    file_format=file_format
                )
                files_info[file_name] = {"subtitle": subtitle, "time_for_task": time_for_task, "path": file_path}

            total_result = ''
            total_time = 0
            for file_name, info in files_info.items():
                total_result += '------------------------------------\n'
                total_result += f'{file_name}\n\n'
                total_result += f'{info["subtitle"]}'
                total_time += info["time_for_task"]

            gr_str = f"Done in {self.format_time(total_time)}! Subtitle is in the outputs folder.\n\n{total_result}"
            gr_file_path = [info['path'] for info in files_info.values()]

            return [gr_str, gr_file_path]

        except Exception as e:
            # BUGFIX: original message read "Error transcribing file on line {e}".
            print(f"Error transcribing file: {e}")
        finally:
            self.release_cuda_memory()
            self.remove_input_files([fileobj.name for fileobj in fileobjs])

    def transcribe_youtube(self,
                           youtubelink: str,
                           model_size: str,
                           lang: str,
                           file_format: str,
                           istranslate: bool,
                           add_timestamp: bool,
                           beam_size: int,
                           log_prob_threshold: float,
                           no_speech_threshold: float,
                           compute_type: str,
                           progress=gr.Progress()
                           ) -> list:
        """
        Write a subtitle file from a Youtube link.

        Parameters are identical to `transcribe_file`, except:

        youtubelink: str
            Link of Youtube video to transcribe from gr.Textbox()

        Returns
        ----------
        A List of
            String to return to gr.Textbox()
            Output file path to return to gr.Files()
        """
        try:
            self.update_model_if_needed(model_size=model_size, compute_type=compute_type, progress=progress)

            progress(0, desc="Loading Audio from Youtube..")
            yt = get_ytdata(youtubelink)
            audio = get_ytaudio(yt)

            transcribed_segments, time_for_task = self.transcribe(
                audio=audio,
                lang=lang,
                istranslate=istranslate,
                beam_size=beam_size,
                log_prob_threshold=log_prob_threshold,
                no_speech_threshold=no_speech_threshold,
                progress=progress
            )

            progress(1, desc="Completed!")

            file_name = safe_filename(yt.title)
            subtitle, file_path = self.generate_and_write_file(
                file_name=file_name,
                transcribed_segments=transcribed_segments,
                add_timestamp=add_timestamp,
                file_format=file_format
            )
            gr_str = f"Done in {self.format_time(time_for_task)}! Subtitle file is in the outputs folder.\n\n{subtitle}"

            return [gr_str, file_path]

        except Exception as e:
            print(f"Error transcribing file: {e}")
        finally:
            # Best effort: re-derive the downloaded audio path if the failure
            # happened before `yt` was assigned, then clean everything up.
            try:
                if 'yt' not in locals():
                    yt = get_ytdata(youtubelink)
                file_path = get_ytaudio(yt)

                self.release_cuda_memory()
                self.remove_input_files([file_path])
            except Exception:
                pass

    def transcribe_mic(self,
                       micaudio: str,
                       model_size: str,
                       lang: str,
                       file_format: str,
                       istranslate: bool,
                       beam_size: int,
                       log_prob_threshold: float,
                       no_speech_threshold: float,
                       compute_type: str,
                       progress=gr.Progress()
                       ) -> list:
        """
        Write a subtitle file from microphone audio.

        Parameters are identical to `transcribe_file`, except:

        micaudio: str
            Audio file path from gr.Microphone()

        Returns
        ----------
        A List of
            String to return to gr.Textbox()
            Output file path to return to gr.Files()
        """
        try:
            self.update_model_if_needed(model_size=model_size, compute_type=compute_type, progress=progress)

            progress(0, desc="Loading Audio..")

            transcribed_segments, time_for_task = self.transcribe(
                audio=micaudio,
                lang=lang,
                istranslate=istranslate,
                beam_size=beam_size,
                log_prob_threshold=log_prob_threshold,
                no_speech_threshold=no_speech_threshold,
                progress=progress
            )
            progress(1, desc="Completed!")

            subtitle, file_path = self.generate_and_write_file(
                file_name="Mic",
                transcribed_segments=transcribed_segments,
                add_timestamp=True,
                file_format=file_format
            )

            gr_str = f"Done in {self.format_time(time_for_task)}! Subtitle file is in the outputs folder.\n\n{subtitle}"
            return [gr_str, file_path]
        except Exception as e:
            print(f"Error transcribing file: {e}")
        finally:
            self.release_cuda_memory()
            self.remove_input_files([micaudio])

    def transcribe(self,
                   audio: Union[str, BinaryIO, np.ndarray],
                   lang: str,
                   istranslate: bool,
                   beam_size: int,
                   log_prob_threshold: float,
                   no_speech_threshold: float,
                   progress: gr.Progress
                   ) -> Tuple[list, float]:
        """
        Core transcribe call for faster-whisper.

        Parameters
        ----------
        audio: Union[str, BinaryIO, np.ndarray]
            Audio path, file binary, or audio numpy array.
        lang: str
            Source language display name, or "Automatic Detection".
        istranslate: bool
            Whether to translate speech into English (only honored for
            models listed in `translatable_models`).
        beam_size: int
            Decoding beam size.
        log_prob_threshold: float
            Treat decoding as failed below this average log probability.
        no_speech_threshold: float
            Silence threshold (combined with log_prob_threshold).
        progress: gr.Progress
            Indicator to show progress directly in gradio.

        Returns
        ----------
        segments_result: list[dict]
            Dicts with "start"/"end" timestamps and transcribed "text".
        elapsed_time: float
            Elapsed time for transcription in seconds.
        """
        start_time = time.time()

        if lang == "Automatic Detection":
            lang = None
        else:
            # UI exposes language display names; the model wants ISO codes.
            language_code_dict = {value: key for key, value in whisper.tokenizer.LANGUAGES.items()}
            lang = language_code_dict[lang]
        segments, info = self.model.transcribe(
            audio=audio,
            language=lang,
            task="translate" if istranslate and self.current_model_size in self.translatable_models else "transcribe",
            beam_size=beam_size,
            log_prob_threshold=log_prob_threshold,
            no_speech_threshold=no_speech_threshold,
        )
        progress(0, desc="Loading audio..")

        segments_result = []
        # `segments` is a lazy generator: decoding happens as we iterate,
        # which is why progress is updated per segment here.
        for segment in segments:
            progress(segment.start / info.duration, desc="Transcribing..")
            segments_result.append({
                "start": segment.start,
                "end": segment.end,
                "text": segment.text
            })

        elapsed_time = time.time() - start_time
        return segments_result, elapsed_time

    def update_model_if_needed(self,
                               model_size: str,
                               compute_type: str,
                               progress: gr.Progress
                               ):
        """
        (Re)initialize the faster-whisper model when the requested size or
        compute type differs from the currently loaded one.
        """
        if model_size != self.current_model_size or self.model is None or self.current_compute_type != compute_type:
            progress(0, desc="Initializing Model..")
            self.current_model_size = model_size
            self.current_compute_type = compute_type
            self.model = faster_whisper.WhisperModel(
                device=self.device,
                model_size_or_path=model_size,
                download_root=os.path.join("models", "Whisper", "faster-whisper"),
                compute_type=self.current_compute_type
            )

    @staticmethod
    def generate_and_write_file(file_name: str,
                                transcribed_segments: list,
                                add_timestamp: bool,
                                file_format: str,
                                ) -> Tuple[str, str]:
        """
        Serialize segments in the requested format, write the subtitle file
        under "outputs", and return (content, output_path).

        BUGFIX: the original was annotated `-> str` yet returned a tuple, and
        an unknown `file_format` fell through to a NameError on `content`.
        """
        timestamp = datetime.now().strftime("%m%d%H%M%S")
        if add_timestamp:
            output_path = os.path.join("outputs", f"{file_name}-{timestamp}")
        else:
            output_path = os.path.join("outputs", f"{file_name}")

        if file_format == "SRT":
            content = get_srt(transcribed_segments)
            output_path += '.srt'
        elif file_format == "WebVTT":
            content = get_vtt(transcribed_segments)
            output_path += '.vtt'
        elif file_format == "txt":
            content = get_txt(transcribed_segments)
            output_path += '.txt'
        else:
            raise ValueError(f"Unsupported file format: {file_format}")
        write_file(content, output_path)
        return content, output_path

    @staticmethod
    def format_time(elapsed_time: float) -> str:
        """Format a duration in seconds as e.g. "1 hours 2 minutes 3 seconds"."""
        hours, rem = divmod(elapsed_time, 3600)
        minutes, seconds = divmod(rem, 60)
        # BUGFIX: divmod on floats yields floats ("1.0 hours"); cast to int.
        hours = int(hours)
        minutes = int(minutes)

        time_str = ""
        if hours:
            time_str += f"{hours} hours "
        if minutes:
            time_str += f"{minutes} minutes "
        seconds = round(seconds)
        time_str += f"{seconds} seconds"

        return time_str.strip()
|
modules/nllb_inference.py
ADDED
@@ -0,0 +1,345 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
|
2 |
+
import gradio as gr
|
3 |
+
import torch
|
4 |
+
import os
|
5 |
+
from datetime import datetime
|
6 |
+
|
7 |
+
from .base_interface import BaseInterface
|
8 |
+
from modules.subtitle_manager import *
|
9 |
+
|
10 |
+
# NLLB checkpoint selected by default in the UI.
DEFAULT_MODEL_SIZE = "facebook/nllb-200-1.3B"
# Hugging Face NLLB-200 checkpoints offered for translation, largest first.
NLLB_MODELS = ["facebook/nllb-200-3.3B", "facebook/nllb-200-1.3B", "facebook/nllb-200-distilled-600M"]
|
12 |
+
|
13 |
+
|
14 |
+
class NLLBInference(BaseInterface):
    """Translate subtitle files (SRT / WebVTT) with Meta's NLLB-200 models."""

    def __init__(self):
        super().__init__()
        self.default_model_size = DEFAULT_MODEL_SIZE
        # Model/tokenizer are loaded lazily on the first translate_file() call.
        self.current_model_size = None
        self.model = None
        self.tokenizer = None
        self.available_models = NLLB_MODELS
        # NLLB_AVAILABLE_LANGS maps display names to FLORES-200 language codes.
        self.available_source_langs = list(NLLB_AVAILABLE_LANGS.keys())
        self.available_target_langs = list(NLLB_AVAILABLE_LANGS.keys())
        # transformers pipeline device convention: 0 -> first GPU, -1 -> CPU.
        self.device = 0 if torch.cuda.is_available() else -1
        self.pipeline = None

    def translate_text(self, text):
        """Translate a single sentence with the currently built pipeline.

        Assumes translate_file() already created `self.pipeline`.
        """
        result = self.pipeline(text)
        return result[0]['translation_text']

    def translate_file(self,
                       fileobjs: list,
                       model_size: str,
                       src_lang: str,
                       tgt_lang: str,
                       add_timestamp: bool,
                       progress=gr.Progress()) -> list:
        """
        Translate subtitle file(s) from source language to target language.

        Parameters
        ----------
        fileobjs: list
            Files to translate from gr.Files(). Only .srt and .vtt are supported.
        model_size: str
            NLLB checkpoint name from gr.Dropdown().
        src_lang: str
            Source language display name from gr.Dropdown().
        tgt_lang: str
            Target language display name from gr.Dropdown().
        add_timestamp: bool
            Whether to append a timestamp to each output filename (gr.Checkbox()).
        progress: gr.Progress
            Indicator to show progress directly in gradio.

        Returns
        ----------
        A List of
            String to return to gr.Textbox()
            File path(s) to return to gr.Files()
        """
        try:
            # (Re)load model + tokenizer only when the requested checkpoint changed.
            if model_size != self.current_model_size or self.model is None:
                print("\nInitializing NLLB Model..\n")
                progress(0, desc="Initializing NLLB Model..")
                self.current_model_size = model_size
                self.model = AutoModelForSeq2SeqLM.from_pretrained(pretrained_model_name_or_path=model_size,
                                                                   cache_dir=os.path.join("models", "NLLB"))
                self.tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=model_size,
                                                               cache_dir=os.path.join("models", "NLLB", "tokenizers"))

            # Map display names ("Korean") to FLORES-200 codes ("kor_Hang").
            src_lang = NLLB_AVAILABLE_LANGS[src_lang]
            tgt_lang = NLLB_AVAILABLE_LANGS[tgt_lang]

            # Rebuilt every call because the language pair may change between calls.
            self.pipeline = pipeline("translation",
                                     model=self.model,
                                     tokenizer=self.tokenizer,
                                     src_lang=src_lang,
                                     tgt_lang=tgt_lang,
                                     device=self.device)

            files_info = {}
            output_paths = []
            for fileobj in fileobjs:
                file_path = fileobj.name
                file_name, file_ext = os.path.splitext(os.path.basename(fileobj.name))
                file_ext = file_ext.lower()

                # BUGFIX: previously an unsupported extension silently reused the
                # previous file's `subtitle`/`output_path` (or raised NameError on
                # the first file). Skip unsupported files explicitly instead.
                if file_ext not in (".srt", ".vtt"):
                    print(f"Unsupported subtitle format, skipping: {file_path}")
                    continue

                if file_ext == ".srt":
                    parsed_dicts = parse_srt(file_path=file_path)
                    serialize = get_serialized_srt
                else:
                    parsed_dicts = parse_vtt(file_path=file_path)
                    serialize = get_serialized_vtt

                total_progress = len(parsed_dicts)
                for index, dic in enumerate(parsed_dicts):
                    progress(index / total_progress, desc="Translating..")
                    dic["sentence"] = self.translate_text(dic["sentence"])
                subtitle = serialize(parsed_dicts)

                timestamp = datetime.now().strftime("%m%d%H%M%S")
                if add_timestamp:
                    output_path = os.path.join("outputs", "translations", f"{file_name}-{timestamp}")
                else:
                    output_path = os.path.join("outputs", "translations", f"{file_name}")
                output_path += file_ext

                write_file(subtitle, output_path)
                output_paths.append(output_path)
                files_info[file_name] = subtitle

            total_result = ''
            for file_name, subtitle in files_info.items():
                total_result += '------------------------------------\n'
                total_result += f'{file_name}\n\n'
                total_result += f'{subtitle}'

            # BUGFIX: message previously said "outputs/translation" although files
            # are written to "outputs/translations".
            gr_str = f"Done! Subtitle is in the outputs/translations folder.\n\n{total_result}"
            # BUGFIX: return every written file, not only the last one.
            return [gr_str, output_paths]
        except Exception as e:
            # Errors are reported to the console; the UI receives no result.
            print(f"Error: {str(e)}")
        finally:
            self.release_cuda_memory()
            self.remove_input_files([fileobj.name for fileobj in fileobjs])
|
138 |
+
|
139 |
+
|
140 |
+
# Mapping from the human-readable language name shown in the UI dropdowns to the
# FLORES-200 language code expected by the NLLB tokenizer/pipeline
# (format: ISO-639-3 code + "_" + script code).
NLLB_AVAILABLE_LANGS = {
    "Acehnese (Arabic script)": "ace_Arab",
    "Acehnese (Latin script)": "ace_Latn",
    "Mesopotamian Arabic": "acm_Arab",
    "Ta’izzi-Adeni Arabic": "acq_Arab",
    "Tunisian Arabic": "aeb_Arab",
    "Afrikaans": "afr_Latn",
    "South Levantine Arabic": "ajp_Arab",
    "Akan": "aka_Latn",
    "Amharic": "amh_Ethi",
    "North Levantine Arabic": "apc_Arab",
    "Modern Standard Arabic": "arb_Arab",
    "Modern Standard Arabic (Romanized)": "arb_Latn",
    "Najdi Arabic": "ars_Arab",
    "Moroccan Arabic": "ary_Arab",
    "Egyptian Arabic": "arz_Arab",
    "Assamese": "asm_Beng",
    "Asturian": "ast_Latn",
    "Awadhi": "awa_Deva",
    "Central Aymara": "ayr_Latn",
    "South Azerbaijani": "azb_Arab",
    "North Azerbaijani": "azj_Latn",
    "Bashkir": "bak_Cyrl",
    "Bambara": "bam_Latn",
    "Balinese": "ban_Latn",
    "Belarusian": "bel_Cyrl",
    "Bemba": "bem_Latn",
    "Bengali": "ben_Beng",
    "Bhojpuri": "bho_Deva",
    "Banjar (Arabic script)": "bjn_Arab",
    "Banjar (Latin script)": "bjn_Latn",
    "Standard Tibetan": "bod_Tibt",
    "Bosnian": "bos_Latn",
    "Buginese": "bug_Latn",
    "Bulgarian": "bul_Cyrl",
    "Catalan": "cat_Latn",
    "Cebuano": "ceb_Latn",
    "Czech": "ces_Latn",
    "Chokwe": "cjk_Latn",
    "Central Kurdish": "ckb_Arab",
    "Crimean Tatar": "crh_Latn",
    "Welsh": "cym_Latn",
    "Danish": "dan_Latn",
    "German": "deu_Latn",
    "Southwestern Dinka": "dik_Latn",
    "Dyula": "dyu_Latn",
    "Dzongkha": "dzo_Tibt",
    "Greek": "ell_Grek",
    "English": "eng_Latn",
    "Esperanto": "epo_Latn",
    "Estonian": "est_Latn",
    "Basque": "eus_Latn",
    "Ewe": "ewe_Latn",
    "Faroese": "fao_Latn",
    "Fijian": "fij_Latn",
    "Finnish": "fin_Latn",
    "Fon": "fon_Latn",
    "French": "fra_Latn",
    "Friulian": "fur_Latn",
    "Nigerian Fulfulde": "fuv_Latn",
    "Scottish Gaelic": "gla_Latn",
    "Irish": "gle_Latn",
    "Galician": "glg_Latn",
    "Guarani": "grn_Latn",
    "Gujarati": "guj_Gujr",
    "Haitian Creole": "hat_Latn",
    "Hausa": "hau_Latn",
    "Hebrew": "heb_Hebr",
    "Hindi": "hin_Deva",
    "Chhattisgarhi": "hne_Deva",
    "Croatian": "hrv_Latn",
    "Hungarian": "hun_Latn",
    "Armenian": "hye_Armn",
    "Igbo": "ibo_Latn",
    "Ilocano": "ilo_Latn",
    "Indonesian": "ind_Latn",
    "Icelandic": "isl_Latn",
    "Italian": "ita_Latn",
    "Javanese": "jav_Latn",
    "Japanese": "jpn_Jpan",
    "Kabyle": "kab_Latn",
    "Jingpho": "kac_Latn",
    "Kamba": "kam_Latn",
    "Kannada": "kan_Knda",
    "Kashmiri (Arabic script)": "kas_Arab",
    "Kashmiri (Devanagari script)": "kas_Deva",
    "Georgian": "kat_Geor",
    "Central Kanuri (Arabic script)": "knc_Arab",
    "Central Kanuri (Latin script)": "knc_Latn",
    "Kazakh": "kaz_Cyrl",
    "Kabiyè": "kbp_Latn",
    "Kabuverdianu": "kea_Latn",
    "Khmer": "khm_Khmr",
    "Kikuyu": "kik_Latn",
    "Kinyarwanda": "kin_Latn",
    "Kyrgyz": "kir_Cyrl",
    "Kimbundu": "kmb_Latn",
    "Northern Kurdish": "kmr_Latn",
    "Kikongo": "kon_Latn",
    "Korean": "kor_Hang",
    "Lao": "lao_Laoo",
    "Ligurian": "lij_Latn",
    "Limburgish": "lim_Latn",
    "Lingala": "lin_Latn",
    "Lithuanian": "lit_Latn",
    "Lombard": "lmo_Latn",
    "Latgalian": "ltg_Latn",
    "Luxembourgish": "ltz_Latn",
    "Luba-Kasai": "lua_Latn",
    "Ganda": "lug_Latn",
    "Luo": "luo_Latn",
    "Mizo": "lus_Latn",
    "Standard Latvian": "lvs_Latn",
    "Magahi": "mag_Deva",
    "Maithili": "mai_Deva",
    "Malayalam": "mal_Mlym",
    "Marathi": "mar_Deva",
    "Minangkabau (Arabic script)": "min_Arab",
    "Minangkabau (Latin script)": "min_Latn",
    "Macedonian": "mkd_Cyrl",
    "Plateau Malagasy": "plt_Latn",
    "Maltese": "mlt_Latn",
    "Meitei (Bengali script)": "mni_Beng",
    "Halh Mongolian": "khk_Cyrl",
    "Mossi": "mos_Latn",
    "Maori": "mri_Latn",
    "Burmese": "mya_Mymr",
    "Dutch": "nld_Latn",
    "Norwegian Nynorsk": "nno_Latn",
    "Norwegian Bokmål": "nob_Latn",
    "Nepali": "npi_Deva",
    "Northern Sotho": "nso_Latn",
    "Nuer": "nus_Latn",
    "Nyanja": "nya_Latn",
    "Occitan": "oci_Latn",
    "West Central Oromo": "gaz_Latn",
    "Odia": "ory_Orya",
    "Pangasinan": "pag_Latn",
    "Eastern Panjabi": "pan_Guru",
    "Papiamento": "pap_Latn",
    "Western Persian": "pes_Arab",
    "Polish": "pol_Latn",
    "Portuguese": "por_Latn",
    "Dari": "prs_Arab",
    "Southern Pashto": "pbt_Arab",
    "Ayacucho Quechua": "quy_Latn",
    "Romanian": "ron_Latn",
    "Rundi": "run_Latn",
    "Russian": "rus_Cyrl",
    "Sango": "sag_Latn",
    "Sanskrit": "san_Deva",
    "Santali": "sat_Olck",
    "Sicilian": "scn_Latn",
    "Shan": "shn_Mymr",
    "Sinhala": "sin_Sinh",
    "Slovak": "slk_Latn",
    "Slovenian": "slv_Latn",
    "Samoan": "smo_Latn",
    "Shona": "sna_Latn",
    "Sindhi": "snd_Arab",
    "Somali": "som_Latn",
    "Southern Sotho": "sot_Latn",
    "Spanish": "spa_Latn",
    "Tosk Albanian": "als_Latn",
    "Sardinian": "srd_Latn",
    "Serbian": "srp_Cyrl",
    "Swati": "ssw_Latn",
    "Sundanese": "sun_Latn",
    "Swedish": "swe_Latn",
    "Swahili": "swh_Latn",
    "Silesian": "szl_Latn",
    "Tamil": "tam_Taml",
    "Tatar": "tat_Cyrl",
    "Telugu": "tel_Telu",
    "Tajik": "tgk_Cyrl",
    "Tagalog": "tgl_Latn",
    "Thai": "tha_Thai",
    "Tigrinya": "tir_Ethi",
    "Tamasheq (Latin script)": "taq_Latn",
    "Tamasheq (Tifinagh script)": "taq_Tfng",
    "Tok Pisin": "tpi_Latn",
    "Tswana": "tsn_Latn",
    "Tsonga": "tso_Latn",
    "Turkmen": "tuk_Latn",
    "Tumbuka": "tum_Latn",
    "Turkish": "tur_Latn",
    "Twi": "twi_Latn",
    "Central Atlas Tamazight": "tzm_Tfng",
    "Uyghur": "uig_Arab",
    "Ukrainian": "ukr_Cyrl",
    "Umbundu": "umb_Latn",
    "Urdu": "urd_Arab",
    "Northern Uzbek": "uzn_Latn",
    "Venetian": "vec_Latn",
    "Vietnamese": "vie_Latn",
    "Waray": "war_Latn",
    "Wolof": "wol_Latn",
    "Xhosa": "xho_Latn",
    "Eastern Yiddish": "ydd_Hebr",
    "Yoruba": "yor_Latn",
    "Yue Chinese": "yue_Hant",
    "Chinese (Simplified)": "zho_Hans",
    "Chinese (Traditional)": "zho_Hant",
    "Standard Malay": "zsm_Latn",
    "Zulu": "zul_Latn",
}
|
modules/subtitle_manager.py
ADDED
@@ -0,0 +1,135 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import re
|
2 |
+
|
3 |
+
|
4 |
+
def timeformat_srt(time):
    """Render a time in seconds as an SRT timestamp: HH:MM:SS,mmm."""
    hrs = time // 3600
    remainder = time - hrs * 3600
    mins = remainder // 60
    secs = remainder - mins * 60
    millis = (time - int(time)) * 1000
    return f"{int(hrs):02d}:{int(mins):02d}:{int(secs):02d},{int(millis):03d}"
|
10 |
+
|
11 |
+
|
12 |
+
def timeformat_vtt(time):
    """Render a time in seconds as a WebVTT timestamp: HH:MM:SS.mmm."""
    hrs = time // 3600
    remainder = time - hrs * 3600
    mins = remainder // 60
    secs = remainder - mins * 60
    millis = (time - int(time)) * 1000
    return f"{int(hrs):02d}:{int(mins):02d}:{int(secs):02d}.{int(millis):03d}"
|
18 |
+
|
19 |
+
|
20 |
+
def write_file(subtitle, output_file):
    """Write subtitle text to output_file, UTF-8 encoded."""
    with open(output_file, mode='w', encoding='utf-8') as out:
        out.write(subtitle)
|
23 |
+
|
24 |
+
|
25 |
+
def get_srt(segments):
    """Serialize transcription segments into SRT subtitle text.

    Parameters
    ----------
    segments: list of dict
        Each dict needs 'start'/'end' (seconds) and 'text' keys.

    Returns
    ----------
    str: the full SRT document.
    """
    output = ""
    for i, segment in enumerate(segments):
        # BUGFIX: strip the leading space on a local copy instead of mutating
        # the caller's segment dict in place.
        text = segment['text']
        if text.startswith(' '):
            text = text[1:]
        output += f"{i + 1}\n"
        output += f"{timeformat_srt(segment['start'])} --> {timeformat_srt(segment['end'])}\n"
        output += f"{text}\n\n"
    return output
|
34 |
+
|
35 |
+
|
36 |
+
def get_vtt(segments):
    """Serialize transcription segments into WebVTT subtitle text.

    NOTE(review): the header written is "WebVTT" (the spec uses "WEBVTT"); it is
    kept as-is because parse_vtt() matches this exact string.

    Parameters
    ----------
    segments: list of dict
        Each dict needs 'start'/'end' (seconds) and 'text' keys.

    Returns
    ----------
    str: the full WebVTT document.
    """
    output = "WebVTT\n\n"
    for i, segment in enumerate(segments):
        # BUGFIX: strip the leading space on a local copy instead of mutating
        # the caller's segment dict in place.
        text = segment['text']
        if text.startswith(' '):
            text = text[1:]
        output += f"{i + 1}\n"
        output += f"{timeformat_vtt(segment['start'])} --> {timeformat_vtt(segment['end'])}\n"
        output += f"{text}\n\n"
    return output
|
45 |
+
|
46 |
+
|
47 |
+
def get_txt(segments):
    """Serialize transcription segments into plain text, one line per segment.

    Parameters
    ----------
    segments: list of dict
        Each dict needs a 'text' key.

    Returns
    ----------
    str: newline-joined segment texts (trailing newline included).
    """
    output = ""
    # BUGFIX: drop the unused enumerate index and strip the leading space on a
    # local copy instead of mutating the caller's segment dicts.
    for segment in segments:
        text = segment['text']
        if text.startswith(' '):
            text = text[1:]
        output += f"{text}\n"
    return output
|
54 |
+
|
55 |
+
|
56 |
+
def parse_srt(file_path):
    """Read an SRT file and return its cues as a list of dicts.

    Each dict has keys "index", "timestamp" and "sentence" (multi-line cue
    text is joined with single spaces).
    """
    with open(file_path, 'r', encoding='utf-8') as srt_file:
        content = srt_file.read()

    entries = []
    for block in content.split('\n\n'):
        block = block.strip()
        if not block:
            continue
        lines = block.split('\n')
        entries.append({
            "index": lines[0],
            "timestamp": lines[1],
            "sentence": ' '.join(lines[2:]),
        })
    return entries
|
77 |
+
|
78 |
+
|
79 |
+
def parse_vtt(file_path):
    """Read a WebVTT file and return its cues as a list of dicts.

    The "WebVTT" header block is skipped. Each dict has keys "index",
    "timestamp" and "sentence" (multi-line cue text joined with spaces).
    """
    with open(file_path, 'r', encoding='utf-8') as vtt_file:
        content = vtt_file.read()

    entries = []
    for block in content.split('\n\n'):
        block = block.strip()
        if not block or block.startswith("WebVTT"):
            continue
        lines = block.split('\n')
        entries.append({
            "index": lines[0],
            "timestamp": lines[1],
            "sentence": ' '.join(lines[2:]),
        })

    return entries
|
101 |
+
|
102 |
+
|
103 |
+
def get_serialized_srt(dicts):
    """Serialize parse_srt()-style dicts back into SRT text."""
    parts = []
    for entry in dicts:
        parts.append(f'{entry["index"]}\n{entry["timestamp"]}\n{entry["sentence"]}\n\n')
    return "".join(parts)
|
110 |
+
|
111 |
+
|
112 |
+
def get_serialized_vtt(dicts):
    """Serialize parse_vtt()-style dicts back into WebVTT text (with header)."""
    parts = ["WebVTT\n\n"]
    for entry in dicts:
        parts.append(f'{entry["index"]}\n{entry["timestamp"]}\n{entry["sentence"]}\n\n')
    return "".join(parts)
|
119 |
+
|
120 |
+
|
121 |
+
def safe_filename(name):
    """Replace characters that are invalid in filenames with underscores.

    In colab mode (app-level `_args.colab`), additionally truncate the result
    to at most 20 characters while trying to keep the extension.
    """
    # NOTE(review): importing from `app` at call time avoids a circular import
    # at module load, but couples this helper to app's CLI args — consider
    # passing a flag instead.
    from app import _args
    INVALID_FILENAME_CHARS = r'[<>:"/\\|?*\x00-\x1f]'
    safe_name = re.sub(INVALID_FILENAME_CHARS, '_', name)
    if not _args.colab:
        return safe_name
    # Truncate the filename if it exceeds the max_length (20)
    if len(safe_name) > 20:
        # NOTE(review): split('.')[-1] returns the whole name when there is no
        # dot, so a dotless long name takes the `else` branch only by accident
        # of the length check — confirm intended behavior before changing.
        file_extension = safe_name.split('.')[-1]
        if len(file_extension) + 1 < 20:
            truncated_name = safe_name[:20 - len(file_extension) - 1]
            safe_name = truncated_name + '.' + file_extension
        else:
            safe_name = safe_name[:20]
    return safe_name
|
modules/whisper_Inference.py
ADDED
@@ -0,0 +1,424 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import whisper
|
2 |
+
import gradio as gr
|
3 |
+
import time
|
4 |
+
import os
|
5 |
+
from typing import BinaryIO, Union, Tuple
|
6 |
+
import numpy as np
|
7 |
+
from datetime import datetime
|
8 |
+
import torch
|
9 |
+
|
10 |
+
from .base_interface import BaseInterface
|
11 |
+
from modules.subtitle_manager import get_srt, get_vtt, get_txt, write_file, safe_filename
|
12 |
+
from modules.youtube_manager import get_ytdata, get_ytaudio
|
13 |
+
|
14 |
+
# Whisper checkpoint used by default; see whisper.available_models() for options.
DEFAULT_MODEL_SIZE = "large-v3"
|
15 |
+
|
16 |
+
|
17 |
+
class WhisperInference(BaseInterface):
|
18 |
+
    def __init__(self):
        # Bookkeeping only; the actual Whisper model is loaded lazily by
        # update_model_if_needed() on the first transcription request.
        super().__init__()
        self.current_model_size = None  # e.g. "large-v3"; None until first load
        self.model = None  # whisper model instance, loaded on demand
        self.available_models = whisper.available_models()
        # Human-readable language names for the UI dropdown.
        self.available_langs = sorted(list(whisper.tokenizer.LANGUAGES.values()))
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.available_compute_types = ["float16", "float32"]
        # fp16 only makes sense on GPU; fall back to fp32 on CPU.
        self.current_compute_type = "float16" if self.device == "cuda" else "float32"
        self.default_beam_size = 1
|
28 |
+
|
29 |
+
def transcribe_file(self,
|
30 |
+
fileobjs: list,
|
31 |
+
model_size: str,
|
32 |
+
lang: str,
|
33 |
+
file_format: str,
|
34 |
+
istranslate: bool,
|
35 |
+
add_timestamp: bool,
|
36 |
+
beam_size: int,
|
37 |
+
log_prob_threshold: float,
|
38 |
+
no_speech_threshold: float,
|
39 |
+
compute_type: str,
|
40 |
+
progress=gr.Progress()) -> list:
|
41 |
+
"""
|
42 |
+
Write subtitle file from Files
|
43 |
+
|
44 |
+
Parameters
|
45 |
+
----------
|
46 |
+
fileobjs: list
|
47 |
+
List of files to transcribe from gr.Files()
|
48 |
+
model_size: str
|
49 |
+
Whisper model size from gr.Dropdown()
|
50 |
+
lang: str
|
51 |
+
Source language of the file to transcribe from gr.Dropdown()
|
52 |
+
file_format: str
|
53 |
+
File format to write from gr.Dropdown(). Supported format: [SRT, WebVTT, txt]
|
54 |
+
istranslate: bool
|
55 |
+
Boolean value from gr.Checkbox() that determines whether to translate to English.
|
56 |
+
It's Whisper's feature to translate speech from another language directly into English end-to-end.
|
57 |
+
add_timestamp: bool
|
58 |
+
Boolean value from gr.Checkbox() that determines whether to add a timestamp at the end of the filename.
|
59 |
+
beam_size: int
|
60 |
+
Int value from gr.Number() that is used for decoding option.
|
61 |
+
log_prob_threshold: float
|
62 |
+
float value from gr.Number(). If the average log probability over sampled tokens is
|
63 |
+
below this value, treat as failed.
|
64 |
+
no_speech_threshold: float
|
65 |
+
float value from gr.Number(). If the no_speech probability is higher than this value AND
|
66 |
+
the average log probability over sampled tokens is below `log_prob_threshold`,
|
67 |
+
consider the segment as silent.
|
68 |
+
compute_type: str
|
69 |
+
compute type from gr.Dropdown().
|
70 |
+
progress: gr.Progress
|
71 |
+
Indicator to show progress directly in gradio.
|
72 |
+
I use a forked version of whisper for this. To see more info : https://github.com/jhj0517/jhj0517-whisper/tree/add-progress-callback
|
73 |
+
|
74 |
+
Returns
|
75 |
+
----------
|
76 |
+
A List of
|
77 |
+
String to return to gr.Textbox()
|
78 |
+
Files to return to gr.Files()
|
79 |
+
"""
|
80 |
+
try:
|
81 |
+
self.update_model_if_needed(model_size=model_size, compute_type=compute_type, progress=progress)
|
82 |
+
|
83 |
+
files_info = {}
|
84 |
+
for fileobj in fileobjs:
|
85 |
+
progress(0, desc="Loading Audio..")
|
86 |
+
audio = whisper.load_audio(fileobj.name)
|
87 |
+
|
88 |
+
result, elapsed_time = self.transcribe(audio=audio,
|
89 |
+
lang=lang,
|
90 |
+
istranslate=istranslate,
|
91 |
+
beam_size=beam_size,
|
92 |
+
log_prob_threshold=log_prob_threshold,
|
93 |
+
no_speech_threshold=no_speech_threshold,
|
94 |
+
compute_type=compute_type,
|
95 |
+
progress=progress
|
96 |
+
)
|
97 |
+
progress(1, desc="Completed!")
|
98 |
+
|
99 |
+
file_name, file_ext = os.path.splitext(os.path.basename(fileobj.name))
|
100 |
+
file_name = safe_filename(file_name)
|
101 |
+
subtitle, file_path = self.generate_and_write_file(
|
102 |
+
file_name=file_name,
|
103 |
+
transcribed_segments=result,
|
104 |
+
add_timestamp=add_timestamp,
|
105 |
+
file_format=file_format
|
106 |
+
)
|
107 |
+
files_info[file_name] = {"subtitle": subtitle, "elapsed_time": elapsed_time, "path": file_path}
|
108 |
+
|
109 |
+
total_result = ''
|
110 |
+
total_time = 0
|
111 |
+
for file_name, info in files_info.items():
|
112 |
+
total_result += '------------------------------------\n'
|
113 |
+
total_result += f'{file_name}\n\n'
|
114 |
+
total_result += f"{info['subtitle']}"
|
115 |
+
total_time += info["elapsed_time"]
|
116 |
+
|
117 |
+
gr_str = f"Done in {self.format_time(total_time)}! Subtitle is in the outputs folder.\n\n{total_result}"
|
118 |
+
gr_file_path = [info['path'] for info in files_info.values()]
|
119 |
+
|
120 |
+
return [gr_str, gr_file_path]
|
121 |
+
except Exception as e:
|
122 |
+
print(f"Error transcribing file: {str(e)}")
|
123 |
+
finally:
|
124 |
+
self.release_cuda_memory()
|
125 |
+
self.remove_input_files([fileobj.name for fileobj in fileobjs])
|
126 |
+
|
127 |
+
def transcribe_youtube(self,
|
128 |
+
youtubelink: str,
|
129 |
+
model_size: str,
|
130 |
+
lang: str,
|
131 |
+
file_format: str,
|
132 |
+
istranslate: bool,
|
133 |
+
add_timestamp: bool,
|
134 |
+
beam_size: int,
|
135 |
+
log_prob_threshold: float,
|
136 |
+
no_speech_threshold: float,
|
137 |
+
compute_type: str,
|
138 |
+
progress=gr.Progress()) -> list:
|
139 |
+
"""
|
140 |
+
Write subtitle file from Youtube
|
141 |
+
|
142 |
+
Parameters
|
143 |
+
----------
|
144 |
+
youtubelink: str
|
145 |
+
Link of Youtube to transcribe from gr.Textbox()
|
146 |
+
model_size: str
|
147 |
+
Whisper model size from gr.Dropdown()
|
148 |
+
lang: str
|
149 |
+
Source language of the file to transcribe from gr.Dropdown()
|
150 |
+
file_format: str
|
151 |
+
File format to write from gr.Dropdown(). Supported format: [SRT, WebVTT, txt]
|
152 |
+
istranslate: bool
|
153 |
+
Boolean value from gr.Checkbox() that determines whether to translate to English.
|
154 |
+
It's Whisper's feature to translate speech from another language directly into English end-to-end.
|
155 |
+
add_timestamp: bool
|
156 |
+
Boolean value from gr.Checkbox() that determines whether to add a timestamp at the end of the filename.
|
157 |
+
beam_size: int
|
158 |
+
Int value from gr.Number() that is used for decoding option.
|
159 |
+
log_prob_threshold: float
|
160 |
+
float value from gr.Number(). If the average log probability over sampled tokens is
|
161 |
+
below this value, treat as failed.
|
162 |
+
no_speech_threshold: float
|
163 |
+
float value from gr.Number(). If the no_speech probability is higher than this value AND
|
164 |
+
the average log probability over sampled tokens is below `log_prob_threshold`,
|
165 |
+
consider the segment as silent.
|
166 |
+
compute_type: str
|
167 |
+
compute type from gr.Dropdown().
|
168 |
+
progress: gr.Progress
|
169 |
+
Indicator to show progress directly in gradio.
|
170 |
+
I use a forked version of whisper for this. To see more info : https://github.com/jhj0517/jhj0517-whisper/tree/add-progress-callback
|
171 |
+
|
172 |
+
Returns
|
173 |
+
----------
|
174 |
+
A List of
|
175 |
+
String to return to gr.Textbox()
|
176 |
+
Files to return to gr.Files()
|
177 |
+
"""
|
178 |
+
try:
|
179 |
+
self.update_model_if_needed(model_size=model_size, compute_type=compute_type, progress=progress)
|
180 |
+
|
181 |
+
progress(0, desc="Loading Audio from Youtube..")
|
182 |
+
yt = get_ytdata(youtubelink)
|
183 |
+
audio = whisper.load_audio(get_ytaudio(yt))
|
184 |
+
|
185 |
+
result, elapsed_time = self.transcribe(audio=audio,
|
186 |
+
lang=lang,
|
187 |
+
istranslate=istranslate,
|
188 |
+
beam_size=beam_size,
|
189 |
+
log_prob_threshold=log_prob_threshold,
|
190 |
+
no_speech_threshold=no_speech_threshold,
|
191 |
+
compute_type=compute_type,
|
192 |
+
progress=progress)
|
193 |
+
progress(1, desc="Completed!")
|
194 |
+
|
195 |
+
file_name = safe_filename(yt.title)
|
196 |
+
subtitle, file_path = self.generate_and_write_file(
|
197 |
+
file_name=file_name,
|
198 |
+
transcribed_segments=result,
|
199 |
+
add_timestamp=add_timestamp,
|
200 |
+
file_format=file_format
|
201 |
+
)
|
202 |
+
|
203 |
+
gr_str = f"Done in {self.format_time(elapsed_time)}! Subtitle file is in the outputs folder.\n\n{subtitle}"
|
204 |
+
return [gr_str, file_path]
|
205 |
+
except Exception as e:
|
206 |
+
print(f"Error transcribing youtube video: {str(e)}")
|
207 |
+
finally:
|
208 |
+
try:
|
209 |
+
if 'yt' not in locals():
|
210 |
+
yt = get_ytdata(youtubelink)
|
211 |
+
file_path = get_ytaudio(yt)
|
212 |
+
else:
|
213 |
+
file_path = get_ytaudio(yt)
|
214 |
+
|
215 |
+
self.release_cuda_memory()
|
216 |
+
self.remove_input_files([file_path])
|
217 |
+
except Exception as cleanup_error:
|
218 |
+
pass
|
219 |
+
|
220 |
+
def transcribe_mic(self,
|
221 |
+
micaudio: str,
|
222 |
+
model_size: str,
|
223 |
+
lang: str,
|
224 |
+
file_format: str,
|
225 |
+
istranslate: bool,
|
226 |
+
beam_size: int,
|
227 |
+
log_prob_threshold: float,
|
228 |
+
no_speech_threshold: float,
|
229 |
+
compute_type: str,
|
230 |
+
progress=gr.Progress()) -> list:
|
231 |
+
"""
|
232 |
+
Write subtitle file from microphone
|
233 |
+
|
234 |
+
Parameters
|
235 |
+
----------
|
236 |
+
micaudio: str
|
237 |
+
Audio file path from gr.Microphone()
|
238 |
+
model_size: str
|
239 |
+
Whisper model size from gr.Dropdown()
|
240 |
+
lang: str
|
241 |
+
Source language of the file to transcribe from gr.Dropdown()
|
242 |
+
file_format: str
|
243 |
+
Subtitle format to write from gr.Dropdown(). Supported format: [SRT, WebVTT, txt]
|
244 |
+
istranslate: bool
|
245 |
+
Boolean value from gr.Checkbox() that determines whether to translate to English.
|
246 |
+
It's Whisper's feature to translate speech from another language directly into English end-to-end.
|
247 |
+
beam_size: int
|
248 |
+
Int value from gr.Number() that is used for decoding option.
|
249 |
+
log_prob_threshold: float
|
250 |
+
float value from gr.Number(). If the average log probability over sampled tokens is
|
251 |
+
below this value, treat as failed.
|
252 |
+
no_speech_threshold: float
|
253 |
+
float value from gr.Number(). If the no_speech probability is higher than this value AND
|
254 |
+
the average log probability over sampled tokens is below `log_prob_threshold`,
|
255 |
+
consider the segment as silent.
|
256 |
+
compute_type: str
|
257 |
+
compute type from gr.Dropdown().
|
258 |
+
progress: gr.Progress
|
259 |
+
Indicator to show progress directly in gradio.
|
260 |
+
I use a forked version of whisper for this. To see more info : https://github.com/jhj0517/jhj0517-whisper/tree/add-progress-callback
|
261 |
+
|
262 |
+
Returns
|
263 |
+
----------
|
264 |
+
A List of
|
265 |
+
String to return to gr.Textbox()
|
266 |
+
Files to return to gr.Files()
|
267 |
+
"""
|
268 |
+
try:
|
269 |
+
self.update_model_if_needed(model_size=model_size, compute_type=compute_type, progress=progress)
|
270 |
+
|
271 |
+
result, elapsed_time = self.transcribe(audio=micaudio,
|
272 |
+
lang=lang,
|
273 |
+
istranslate=istranslate,
|
274 |
+
beam_size=beam_size,
|
275 |
+
log_prob_threshold=log_prob_threshold,
|
276 |
+
no_speech_threshold=no_speech_threshold,
|
277 |
+
compute_type=compute_type,
|
278 |
+
progress=progress)
|
279 |
+
progress(1, desc="Completed!")
|
280 |
+
|
281 |
+
subtitle, file_path = self.generate_and_write_file(
|
282 |
+
file_name="Mic",
|
283 |
+
transcribed_segments=result,
|
284 |
+
add_timestamp=True,
|
285 |
+
file_format=file_format
|
286 |
+
)
|
287 |
+
|
288 |
+
gr_str = f"Done in {self.format_time(elapsed_time)}! Subtitle file is in the outputs folder.\n\n{subtitle}"
|
289 |
+
return [gr_str, file_path]
|
290 |
+
except Exception as e:
|
291 |
+
print(f"Error transcribing mic: {str(e)}")
|
292 |
+
finally:
|
293 |
+
self.release_cuda_memory()
|
294 |
+
self.remove_input_files([micaudio])
|
295 |
+
|
296 |
+
def transcribe(self,
|
297 |
+
audio: Union[str, np.ndarray, torch.Tensor],
|
298 |
+
lang: str,
|
299 |
+
istranslate: bool,
|
300 |
+
beam_size: int,
|
301 |
+
log_prob_threshold: float,
|
302 |
+
no_speech_threshold: float,
|
303 |
+
compute_type: str,
|
304 |
+
progress: gr.Progress
|
305 |
+
) -> Tuple[list[dict], float]:
|
306 |
+
"""
|
307 |
+
transcribe method for OpenAI's Whisper implementation.
|
308 |
+
|
309 |
+
Parameters
|
310 |
+
----------
|
311 |
+
audio: Union[str, BinaryIO, torch.Tensor]
|
312 |
+
Audio path or file binary or Audio numpy array
|
313 |
+
lang: str
|
314 |
+
Source language of the file to transcribe from gr.Dropdown()
|
315 |
+
istranslate: bool
|
316 |
+
Boolean value from gr.Checkbox() that determines whether to translate to English.
|
317 |
+
It's Whisper's feature to translate speech from another language directly into English end-to-end.
|
318 |
+
beam_size: int
|
319 |
+
Int value from gr.Number() that is used for decoding option.
|
320 |
+
log_prob_threshold: float
|
321 |
+
float value from gr.Number(). If the average log probability over sampled tokens is
|
322 |
+
below this value, treat as failed.
|
323 |
+
no_speech_threshold: float
|
324 |
+
float value from gr.Number(). If the no_speech probability is higher than this value AND
|
325 |
+
the average log probability over sampled tokens is below `log_prob_threshold`,
|
326 |
+
consider the segment as silent.
|
327 |
+
compute_type: str
|
328 |
+
compute type from gr.Dropdown().
|
329 |
+
progress: gr.Progress
|
330 |
+
Indicator to show progress directly in gradio.
|
331 |
+
|
332 |
+
Returns
|
333 |
+
----------
|
334 |
+
segments_result: list[dict]
|
335 |
+
list of dicts that includes start, end timestamps and transcribed text
|
336 |
+
elapsed_time: float
|
337 |
+
elapsed time for transcription
|
338 |
+
"""
|
339 |
+
start_time = time.time()
|
340 |
+
|
341 |
+
def progress_callback(progress_value):
|
342 |
+
progress(progress_value, desc="Transcribing..")
|
343 |
+
|
344 |
+
if lang == "Automatic Detection":
|
345 |
+
lang = None
|
346 |
+
|
347 |
+
translatable_model = ["large", "large-v1", "large-v2", "large-v3"]
|
348 |
+
segments_result = self.model.transcribe(audio=audio,
|
349 |
+
language=lang,
|
350 |
+
verbose=False,
|
351 |
+
beam_size=beam_size,
|
352 |
+
logprob_threshold=log_prob_threshold,
|
353 |
+
no_speech_threshold=no_speech_threshold,
|
354 |
+
task="translate" if istranslate and self.current_model_size in translatable_model else "transcribe",
|
355 |
+
fp16=True if compute_type == "float16" else False,
|
356 |
+
progress_callback=progress_callback)["segments"]
|
357 |
+
elapsed_time = time.time() - start_time
|
358 |
+
|
359 |
+
return segments_result, elapsed_time
|
360 |
+
|
361 |
+
def update_model_if_needed(self,
|
362 |
+
model_size: str,
|
363 |
+
compute_type: str,
|
364 |
+
progress: gr.Progress,
|
365 |
+
):
|
366 |
+
"""
|
367 |
+
Initialize model if it doesn't match with current model setting
|
368 |
+
"""
|
369 |
+
if compute_type != self.current_compute_type:
|
370 |
+
self.current_compute_type = compute_type
|
371 |
+
if model_size != self.current_model_size or self.model is None:
|
372 |
+
progress(0, desc="Initializing Model..")
|
373 |
+
self.current_model_size = model_size
|
374 |
+
self.model = whisper.load_model(
|
375 |
+
name=model_size,
|
376 |
+
device=self.device,
|
377 |
+
download_root=os.path.join("models", "Whisper")
|
378 |
+
)
|
379 |
+
|
380 |
+
@staticmethod
|
381 |
+
def generate_and_write_file(file_name: str,
|
382 |
+
transcribed_segments: list,
|
383 |
+
add_timestamp: bool,
|
384 |
+
file_format: str,
|
385 |
+
) -> str:
|
386 |
+
"""
|
387 |
+
This method writes subtitle file and returns str to gr.Textbox
|
388 |
+
"""
|
389 |
+
timestamp = datetime.now().strftime("%m%d%H%M%S")
|
390 |
+
if add_timestamp:
|
391 |
+
output_path = os.path.join("outputs", f"{file_name}-{timestamp}")
|
392 |
+
else:
|
393 |
+
output_path = os.path.join("outputs", f"{file_name}")
|
394 |
+
|
395 |
+
if file_format == "SRT":
|
396 |
+
content = get_srt(transcribed_segments)
|
397 |
+
output_path += '.srt'
|
398 |
+
write_file(content, output_path)
|
399 |
+
|
400 |
+
elif file_format == "WebVTT":
|
401 |
+
content = get_vtt(transcribed_segments)
|
402 |
+
output_path += '.vtt'
|
403 |
+
write_file(content, output_path)
|
404 |
+
|
405 |
+
elif file_format == "txt":
|
406 |
+
content = get_txt(transcribed_segments)
|
407 |
+
output_path += '.txt'
|
408 |
+
write_file(content, output_path)
|
409 |
+
return content, output_path
|
410 |
+
|
411 |
+
@staticmethod
|
412 |
+
def format_time(elapsed_time: float) -> str:
|
413 |
+
hours, rem = divmod(elapsed_time, 3600)
|
414 |
+
minutes, seconds = divmod(rem, 60)
|
415 |
+
|
416 |
+
time_str = ""
|
417 |
+
if hours:
|
418 |
+
time_str += f"{hours} hours "
|
419 |
+
if minutes:
|
420 |
+
time_str += f"{minutes} minutes "
|
421 |
+
seconds = round(seconds)
|
422 |
+
time_str += f"{seconds} seconds"
|
423 |
+
|
424 |
+
return time_str.strip()
|
modules/youtube_manager.py
ADDED
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from pytube import YouTube
import os


def get_ytdata(link):
    """Build a pytube YouTube object from a video URL."""
    return YouTube(link)


def get_ytmetas(link):
    """Return (thumbnail_url, title, description) for a video URL."""
    video = YouTube(link)
    return video.thumbnail_url, video.title, video.description


def get_ytaudio(ytdata: YouTube):
    """Download the audio-only stream to modules/yt_tmp.wav and return its path."""
    return ytdata.streams.get_audio_only().download(filename=os.path.join("modules", "yt_tmp.wav"))
|
notebook/whisper-webui.ipynb
ADDED
@@ -0,0 +1,125 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"cells": [
|
3 |
+
{
|
4 |
+
"cell_type": "markdown",
|
5 |
+
"source": [
|
6 |
+
"---\n",
|
7 |
+
"\n",
|
8 |
+
"📌 **This notebook has been updated [here](https://github.com/jhj0517/Whisper-WebUI.git)!**\n",
|
9 |
+
"\n",
|
10 |
+
"🖋 **Author**: [jhj0517](https://github.com/jhj0517/Whisper-WebUI/blob/master/notebook/whisper-webui.ipynb)\n",
|
11 |
+
"\n",
|
12 |
+
"😎 **Support the Project**:\n",
|
13 |
+
"\n",
|
14 |
+
"If you find this project useful, please consider supporting it:\n",
|
15 |
+
"\n",
|
16 |
+
"<a href=\"https://www.buymeacoffee.com/jhj0517\" target=\"_blank\" style=\"margin-right: 10px;\">\n",
|
17 |
+
" <img src=\"https://cdn.buymeacoffee.com/buttons/v2/default-yellow.png\" alt=\"Buy Me A Coffee\" width=\"158\" height=\"36\">\n",
|
18 |
+
"</a>\n",
|
19 |
+
"<a href=\"https://ko-fi.com/A0A7JSQRJ\" target=\"_blank\">\n",
|
20 |
+
" <img src=\"https://storage.ko-fi.com/cdn/kofi2.png?v=3\" alt=\"Buy Me a Coffee at ko-fi.com\" height=\"36\">\n",
|
21 |
+
"</a>\n",
|
22 |
+
"\n",
|
23 |
+
"---"
|
24 |
+
],
|
25 |
+
"metadata": {
|
26 |
+
"id": "doKhBBXIfS21"
|
27 |
+
}
|
28 |
+
},
|
29 |
+
{
|
30 |
+
"cell_type": "code",
|
31 |
+
"source": [
|
32 |
+
"#@title #(Optional) Check GPU\n",
|
33 |
+
"#@markdown Some models may not function correctly on a CPU runtime.\n",
|
34 |
+
"\n",
|
35 |
+
"#@markdown so you should check your GPU setup before run.\n",
|
36 |
+
"!nvidia-smi"
|
37 |
+
],
|
38 |
+
"metadata": {
|
39 |
+
"id": "23yZvUlagEsx"
|
40 |
+
},
|
41 |
+
"execution_count": null,
|
42 |
+
"outputs": []
|
43 |
+
},
|
44 |
+
{
|
45 |
+
"cell_type": "code",
|
46 |
+
"execution_count": null,
|
47 |
+
"metadata": {
|
48 |
+
"id": "kNbSbsctxahq",
|
49 |
+
"cellView": "form"
|
50 |
+
},
|
51 |
+
"outputs": [],
|
52 |
+
"source": [
|
53 |
+
"#@title #Installation\n",
|
54 |
+
"#@markdown This cell will install dependencies for Whisper-WebUI!\n",
|
55 |
+
"!git clone https://github.com/jhj0517/Whisper-WebUI.git\n",
|
56 |
+
"%cd Whisper-WebUI\n",
|
57 |
+
"!pip install -r requirements.txt"
|
58 |
+
]
|
59 |
+
},
|
60 |
+
{
|
61 |
+
"cell_type": "code",
|
62 |
+
"source": [
|
63 |
+
"#@title # (Optional) Configure arguments\n",
|
64 |
+
"#@markdown This section is used to configure some command line arguments.\n",
|
65 |
+
"\n",
|
66 |
+
"#@markdown You can simply ignore this section and the default values will be used.\n",
|
67 |
+
"\n",
|
68 |
+
"USERNAME = '' #@param {type: \"string\"}\n",
|
69 |
+
"PASSWORD = '' #@param {type: \"string\"}\n",
|
70 |
+
"DISABLE_FASTER_WHISPER = False #@param {type: \"boolean\"}\n",
|
71 |
+
"THEME = '' #@param {type: \"string\"}\n",
|
72 |
+
"\n",
|
73 |
+
"arguments = \"\"\n",
|
74 |
+
"if USERNAME:\n",
|
75 |
+
" arguments += f\" --username {USERNAME}\"\n",
|
76 |
+
"if PASSWORD:\n",
|
77 |
+
" arguments += f\" --password {PASSWORD}\"\n",
|
78 |
+
"if THEME:\n",
|
79 |
+
" arguments += f\" --theme {THEME}\"\n",
|
80 |
+
"if DISABLE_FASTER_WHISPER:\n",
|
81 |
+
" arguments += f\" --disable_faster_whisper\"\n",
|
82 |
+
"\n",
|
83 |
+
"\n",
|
84 |
+
"#@markdown If you wonder how these arguments are used, you can see the [Wiki](https://github.com/jhj0517/Whisper-WebUI/wiki/Command-Line-Arguments)."
|
85 |
+
],
|
86 |
+
"metadata": {
|
87 |
+
"id": "Qosz9BFlGui3"
|
88 |
+
},
|
89 |
+
"execution_count": null,
|
90 |
+
"outputs": []
|
91 |
+
},
|
92 |
+
{
|
93 |
+
"cell_type": "code",
|
94 |
+
"execution_count": null,
|
95 |
+
"metadata": {
|
96 |
+
"id": "PQroYRRZzQiN"
|
97 |
+
},
|
98 |
+
"outputs": [],
|
99 |
+
"source": [
|
100 |
+
"#@title #Run\n",
|
101 |
+
"#@markdown Once the installation is complete, you can use public URL that is displayed.\n",
|
102 |
+
"if 'arguments' in locals():\n",
|
103 |
+
" !python app.py --share --colab{arguments}\n",
|
104 |
+
"else:\n",
|
105 |
+
" !python app.py --share --colab"
|
106 |
+
]
|
107 |
+
}
|
108 |
+
],
|
109 |
+
"metadata": {
|
110 |
+
"colab": {
|
111 |
+
"provenance": [],
|
112 |
+
"gpuType": "T4"
|
113 |
+
},
|
114 |
+
"kernelspec": {
|
115 |
+
"display_name": "Python 3",
|
116 |
+
"name": "python3"
|
117 |
+
},
|
118 |
+
"language_info": {
|
119 |
+
"name": "python"
|
120 |
+
},
|
121 |
+
"accelerator": "GPU"
|
122 |
+
},
|
123 |
+
"nbformat": 4,
|
124 |
+
"nbformat_minor": 0
|
125 |
+
}
|
requirements.txt
ADDED
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
--extra-index-url https://download.pytorch.org/whl/cu118
|
2 |
+
torch
|
3 |
+
git+https://github.com/jhj0517/jhj0517-whisper.git
|
4 |
+
faster-whisper
|
5 |
+
transformers
|
6 |
+
gradio==4.14.0
|
7 |
+
pytube
|
screenshot.png
ADDED
start-webui.bat
ADDED
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
@echo off
:: Launch Whisper-WebUI with the Python interpreter bundled in the local venv.
:: Any command line arguments are forwarded to app.py.
::
:: The previous goto-based flow fell through past `pause` back into the
:: :activate_venv section and relaunched the app in an endless loop; the
:: :endofscript section was unreachable dead code. Straight-line flow fixes both.

set PYTHON="%~dp0\venv\Scripts\Python.exe"
echo venv %PYTHON%

%PYTHON% app.py %*
pause
|
start-webui.sh
ADDED
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/bin/bash
# Launch Whisper-WebUI with the venv interpreter, forwarding CLI arguments.

source venv/bin/activate

PYTHON="venv/bin/python"
echo "venv ${PYTHON}"
echo ""

# Bug fix: the script previously defined PYTHON but launched with the bare
# `python` found on PATH, and passed unquoted $* which splits arguments that
# contain spaces. Use the venv interpreter explicitly and "$@".
"$PYTHON" app.py "$@"

deactivate
|
12 |
+
|
ui/__init__.py
ADDED
File without changes
|
ui/htmls.py
ADDED
@@ -0,0 +1,97 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Static CSS / HTML snippets injected into the gradio UI.

CSS = """
.bmc-button {
    padding: 2px 5px;
    border-radius: 5px;
    background-color: #FF813F;
    color: white;
    box-shadow: 0px 1px 2px rgba(0, 0, 0, 0.3);
    text-decoration: none;
    display: inline-block;
    font-size: 20px;
    margin: 2px;
    cursor: pointer;
    -webkit-transition: background-color 0.3s ease;
    -ms-transition: background-color 0.3s ease;
    transition: background-color 0.3s ease;
}
.bmc-button:hover,
.bmc-button:active,
.bmc-button:focus {
    background-color: #FF5633;
}
.markdown {
    margin-bottom: 0;
    padding-bottom: 0;
}
.tabs {
    margin-top: 0;
    padding-top: 0;
}

#md_project a {
    color: black;
    text-decoration: none;
}
#md_project a:hover {
    text-decoration: underline;
}
"""

# Bug fix: the project link previously pointed to the misspelled
# "Whsiper-WebUI" repository URL, which is a dead link.
MARKDOWN = """
### [Whisper Web-UI](https://github.com/jhj0517/Whisper-WebUI)
"""


# Collapsible table shown next to the NLLB model dropdown.
NLLB_VRAM_TABLE = """
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <style>
    table {
        border-collapse: collapse;
        width: 100%;
    }
    th, td {
        border: 1px solid #dddddd;
        text-align: left;
        padding: 8px;
    }
    th {
        background-color: #f2f2f2;
    }
    </style>
</head>
<body>

<details>
    <summary>VRAM usage for each model</summary>
    <table>
        <thead>
        <tr>
            <th>Model name</th>
            <th>Required VRAM</th>
        </tr>
        </thead>
        <tbody>
        <tr>
            <td>nllb-200-3.3B</td>
            <td>~16GB</td>
        </tr>
        <tr>
            <td>nllb-200-1.3B</td>
            <td>~8GB</td>
        </tr>
        <tr>
            <td>nllb-200-distilled-600M</td>
            <td>~4GB</td>
        </tr>
        </tbody>
    </table>
    <p><strong>Note:</strong> Be mindful of your VRAM! The table above provides an approximate VRAM usage for each model.</p>
</details>

</body>
</html>
"""
|
user-start-webui.bat
ADDED
@@ -0,0 +1,41 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
@echo off
setlocal
:: This batch file is for launching with command line args
:: See the wiki for a guide to command line arguments: https://github.com/jhj0517/Whisper-WebUI/wiki/Command-Line-Arguments
:: Set the values here to whatever you want. See the wiki above for how to set this.
set SERVER_NAME=
set SERVER_PORT=
set USERNAME=
set PASSWORD=
set SHARE=
set THEME=
set DISABLE_FASTER_WHISPER=


:: Set args accordingly
if not "%SERVER_NAME%"=="" (
    set SERVER_NAME_ARG=--server_name %SERVER_NAME%
)
if not "%SERVER_PORT%"=="" (
    set SERVER_PORT_ARG=--server_port %SERVER_PORT%
)
if not "%USERNAME%"=="" (
    set USERNAME_ARG=--username %USERNAME%
)
if not "%PASSWORD%"=="" (
    set PASSWORD_ARG=--password %PASSWORD%
)
if /I "%SHARE%"=="true" (
    set SHARE_ARG=--share
)
if not "%THEME%"=="" (
    set THEME_ARG=--theme %THEME%
)
if /I "%DISABLE_FASTER_WHISPER%"=="true" (
    set DISABLE_FASTER_WHISPER_ARG=--disable_faster_whisper
)

:: Call the original .bat script with optional arguments.
:: Bug fix: without `call`, control transfers to start-webui.bat and never
:: returns, so the trailing `pause` below was dead code. `setlocal` above
:: also keeps these temporary variables out of the parent shell.
call start-webui.bat %SERVER_NAME_ARG% %SERVER_PORT_ARG% %USERNAME_ARG% %PASSWORD_ARG% %SHARE_ARG% %THEME_ARG% %DISABLE_FASTER_WHISPER_ARG%
pause
|