Update app.py
Browse files
app.py
CHANGED
@@ -21,6 +21,9 @@ import pandas as pd
|
|
21 |
import re
|
22 |
import time
|
23 |
|
|
|
|
|
|
|
24 |
from pytube import YouTube
|
25 |
import torch
|
26 |
|
@@ -28,11 +31,13 @@ INTRO_MSG = '''
|
|
28 |
#### <p>There are many not very widely spoken languages for which it is quite hard to find learning materials,
|
29 |
especially well dubbed videos (target language video with target language subs).
|
30 |
This tool will hopefully transcribe and add subs to your videos.
|
31 |
-
At least for me this is a nice tool to practice both listening and reading skills.
|
|
|
32 |
<p>Speech Recognition is based on models from OpenAI Whisper - https://github.com/openai/whisper
|
33 |
<p> This space is using the c++ implementation by https://github.com/ggerganov/whisper.cpp
|
34 |
'''
|
35 |
|
|
|
36 |
whisper_models = MODELS_TO_DOWNLOAD #["medium"]#["base", "small", "medium", "large", "base.en"]
|
37 |
|
38 |
custom_models = []
|
@@ -42,6 +47,104 @@ combined_models.extend(custom_models)
|
|
42 |
|
43 |
LANGUAGES = {
|
44 |
"bg": "Bulgarian",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
45 |
}
|
46 |
|
47 |
# language code lookup by name, with a few language aliases
|
@@ -60,7 +163,27 @@ def get_youtube(video_url):
|
|
60 |
print(f"Download complete - {abs_video_path}")
|
61 |
return abs_video_path
|
62 |
|
63 |
-
def
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
64 |
"""
|
65 |
Speech Recognition is based on models from OpenAI Whisper https://github.com/openai/whisper
|
66 |
This space is using c++ implementation by https://github.com/ggerganov/whisper.cpp
|
@@ -90,8 +213,9 @@ def speech_to_text(video_file_path, selected_source_lang, whisper_model):
|
|
90 |
try:
|
91 |
print("starting whisper c++")
|
92 |
os.system(f'rm -f {srt_path}')
|
93 |
-
|
94 |
-
|
|
|
95 |
print("whisper c++ finished")
|
96 |
except Exception as e:
|
97 |
raise RuntimeError("Error running Whisper cpp model")
|
@@ -149,6 +273,7 @@ subtitle_files = gr.File(
|
|
149 |
video_player = gr.HTML('<p>video will be played here')
|
150 |
eventslider = gr.Slider(visible=False)
|
151 |
status_msg = gr.Markdown('Status')
|
|
|
152 |
|
153 |
demo = gr.Blocks()
|
154 |
demo.encrypt = False
|
@@ -157,24 +282,26 @@ def set_app_msg(app_state, msg):
|
|
157 |
app_state['status_msg'] = msg
|
158 |
|
159 |
def transcribe(app_state, youtube_url_in, selected_source_lang, selected_whisper_model):
|
|
|
160 |
set_app_msg(app_state, 'Downloading the movie ...')
|
161 |
video_file_path = get_youtube(youtube_url_in)
|
162 |
set_app_msg(app_state, f'Running the speech to text model {selected_source_lang}/{selected_whisper_model}. This can take some time.')
|
163 |
-
subtitle_files = speech_to_text(video_file_path, selected_source_lang, selected_whisper_model)
|
164 |
set_app_msg(app_state, f'Creating the video player ...')
|
165 |
video_player = create_video_player(subtitle_files, video_file_path)
|
166 |
-
set_app_msg(app_state, f'Transcribing done, generating video player
|
167 |
return subtitle_files, video_player
|
168 |
|
169 |
|
170 |
def on_change_event(app_state):
|
171 |
-
print('Running!')
|
172 |
-
return app_state['status_msg']
|
173 |
|
174 |
with demo:
|
175 |
app_state = gr.State({
|
176 |
-
'running':False,
|
177 |
-
'status_msg': ''
|
|
|
178 |
})
|
179 |
|
180 |
with gr.Row():
|
@@ -196,12 +323,13 @@ with demo:
|
|
196 |
|
197 |
eventslider.render()
|
198 |
status_msg.render()
|
|
|
199 |
subtitle_files.render()
|
200 |
video_player.render()
|
201 |
with gr.Row():
|
202 |
gr.Markdown('This app is based on [this code](https://huggingface.co/spaces/RASMUS/Whisper-youtube-crosslingual-subtitles/tree/main) by RASMUS.')
|
203 |
|
204 |
-
dep = demo.load(on_change_event, inputs=[app_state], outputs=[status_msg], every=10)
|
205 |
|
206 |
|
207 |
#### RUN ###
|
|
|
21 |
import re
|
22 |
import time
|
23 |
|
24 |
+
import subprocess
|
25 |
+
import shlex
|
26 |
+
|
27 |
from pytube import YouTube
|
28 |
import torch
|
29 |
|
|
|
31 |
#### <p>There are many not very widely spoken languages for which it is quite hard to find learning materials,
|
32 |
especially well dubbed videos (target language video with target language subs).
|
33 |
This tool will hopefully transcribe and add subs to your videos.
|
34 |
+
At least for me this is a nice tool to practice both listening and reading skills.
|
35 |
+
This is a 'one-click' variant of similar spaces found here on the HF hub.
|
36 |
<p>Speech Recognition is based on models from OpenAI Whisper - https://github.com/openai/whisper
|
37 |
<p> This space is using the c++ implementation by https://github.com/ggerganov/whisper.cpp
|
38 |
'''
|
39 |
|
40 |
+
|
41 |
whisper_models = MODELS_TO_DOWNLOAD #["medium"]#["base", "small", "medium", "large", "base.en"]
|
42 |
|
43 |
custom_models = []
|
|
|
47 |
|
48 |
LANGUAGES = {
|
49 |
"bg": "Bulgarian",
|
50 |
+
"en": "English",
|
51 |
+
"zh": "Chinese",
|
52 |
+
"de": "German",
|
53 |
+
"es": "Spanish",
|
54 |
+
"ru": "Russian",
|
55 |
+
"ko": "Korean",
|
56 |
+
"fr": "French",
|
57 |
+
"ja": "Japanese",
|
58 |
+
"pt": "Portuguese",
|
59 |
+
"tr": "Turkish",
|
60 |
+
"pl": "Polish",
|
61 |
+
"ca": "Catalan",
|
62 |
+
"nl": "Dutch",
|
63 |
+
"ar": "Arabic",
|
64 |
+
"sv": "Swedish",
|
65 |
+
"it": "Italian",
|
66 |
+
"id": "Indonesian",
|
67 |
+
"hi": "Hindi",
|
68 |
+
"fi": "Finnish",
|
69 |
+
"vi": "Vietnamese",
|
70 |
+
"he": "Hebrew",
|
71 |
+
"uk": "Ukrainian",
|
72 |
+
"el": "Greek",
|
73 |
+
"ms": "Malay",
|
74 |
+
"cs": "Czech",
|
75 |
+
"ro": "Romanian",
|
76 |
+
"da": "Danish",
|
77 |
+
"hu": "Hungarian",
|
78 |
+
"ta": "Tamil",
|
79 |
+
"no": "Norwegian",
|
80 |
+
"th": "Thai",
|
81 |
+
"ur": "Urdu",
|
82 |
+
"hr": "Croatian",
|
83 |
+
"lt": "Lithuanian",
|
84 |
+
"la": "Latin",
|
85 |
+
"mi": "Maori",
|
86 |
+
"ml": "Malayalam",
|
87 |
+
"cy": "Welsh",
|
88 |
+
"sk": "Slovak",
|
89 |
+
"te": "Telugu",
|
90 |
+
"fa": "Persian",
|
91 |
+
"lv": "Latvian",
|
92 |
+
"bn": "Bengali",
|
93 |
+
"sr": "Serbian",
|
94 |
+
"az": "Azerbaijani",
|
95 |
+
"sl": "Slovenian",
|
96 |
+
"kn": "Kannada",
|
97 |
+
"et": "Estonian",
|
98 |
+
"mk": "Macedonian",
|
99 |
+
"br": "Breton",
|
100 |
+
"eu": "Basque",
|
101 |
+
"is": "Icelandic",
|
102 |
+
"hy": "Armenian",
|
103 |
+
"ne": "Nepali",
|
104 |
+
"mn": "Mongolian",
|
105 |
+
"bs": "Bosnian",
|
106 |
+
"kk": "Kazakh",
|
107 |
+
"sq": "Albanian",
|
108 |
+
"sw": "Swahili",
|
109 |
+
"gl": "Galician",
|
110 |
+
"mr": "Marathi",
|
111 |
+
"pa": "Punjabi",
|
112 |
+
"si": "Sinhala",
|
113 |
+
"km": "Khmer",
|
114 |
+
"sn": "Shona",
|
115 |
+
"yo": "Yoruba",
|
116 |
+
"so": "Somali",
|
117 |
+
"af": "Afrikaans",
|
118 |
+
"oc": "Occitan",
|
119 |
+
"ka": "Georgian",
|
120 |
+
"be": "Belarusian",
|
121 |
+
"tg": "Tajik",
|
122 |
+
"sd": "Sindhi",
|
123 |
+
"gu": "Gujarati",
|
124 |
+
"am": "Amharic",
|
125 |
+
"yi": "Yiddish",
|
126 |
+
"lo": "Lao",
|
127 |
+
"uz": "Uzbek",
|
128 |
+
"fo": "Faroese",
|
129 |
+
"ht": "Haitian creole",
|
130 |
+
"ps": "Pashto",
|
131 |
+
"tk": "Turkmen",
|
132 |
+
"nn": "Nynorsk",
|
133 |
+
"mt": "Maltese",
|
134 |
+
"sa": "Sanskrit",
|
135 |
+
"lb": "Luxembourgish",
|
136 |
+
"my": "Myanmar",
|
137 |
+
"bo": "Tibetan",
|
138 |
+
"tl": "Tagalog",
|
139 |
+
"mg": "Malagasy",
|
140 |
+
"as": "Assamese",
|
141 |
+
"tt": "Tatar",
|
142 |
+
"haw": "Hawaiian",
|
143 |
+
"ln": "Lingala",
|
144 |
+
"ha": "Hausa",
|
145 |
+
"ba": "Bashkir",
|
146 |
+
"jw": "Javanese",
|
147 |
+
"su": "Sundanese",
|
148 |
}
|
149 |
|
150 |
# language code lookup by name, with a few language aliases
|
|
|
163 |
print(f"Download complete - {abs_video_path}")
|
164 |
return abs_video_path
|
165 |
|
166 |
+
def run_command(command, app_state):
    """Run *command* through the shell, streaming its stdout into the UI state.

    Each decoded stdout line is printed and appended to ``app_state['output']``
    so the Gradio poller (``on_change_event``) can surface whisper.cpp progress.

    Args:
        command: shell command line to execute.
        app_state: mutable dict; decoded stdout accumulates under 'output'.

    Returns:
        The process's integer exit code.
    """
    print(command)
    # NOTE(review): shell=True executes the interpolated string via the shell;
    # only internally-built commands should ever be passed here.
    process = subprocess.Popen(command, shell=True, stdout=subprocess.PIPE)

    # Iterating the pipe blocks per line and ends at EOF, so no sleep-based
    # polling is needed and no trailing output is lost. (The original slept
    # 5s per line, compared the bytes line against the str '' — an always-
    # false check — and dropped anything buffered after the process exited.)
    for raw_line in process.stdout:
        decoded = raw_line.decode()
        print(decoded)
        app_state['output'] += decoded

    rc = process.wait()
    # Bug fix: the original printed the undefined name `cmd` (NameError);
    # the parameter is `command`.
    print(f'{command} ret code is {rc}')
    return rc
|
182 |
+
|
183 |
+
def speech_to_text(video_file_path,
|
184 |
+
selected_source_lang,
|
185 |
+
whisper_model,
|
186 |
+
app_state):
|
187 |
"""
|
188 |
Speech Recognition is based on models from OpenAI Whisper https://github.com/openai/whisper
|
189 |
This space is using c++ implementation by https://github.com/ggerganov/whisper.cpp
|
|
|
213 |
try:
|
214 |
print("starting whisper c++")
|
215 |
os.system(f'rm -f {srt_path}')
|
216 |
+
run_command(f'./whisper.cpp/main "{input_wav_file}" -t {os.cpu_count()} -l {source_languages.get(selected_source_lang)} -m ./whisper.cpp/models/ggml-{whisper_model}.bin -osrt -ovtt',
|
217 |
+
app_state)
|
218 |
+
# os.system(f'./whisper.cpp/main "{input_wav_file}" -t {os.cpu_count()} -l {source_languages.get(selected_source_lang)} -m ./whisper.cpp/models/ggml-{whisper_model}.bin -osrt -ovtt')
|
219 |
print("whisper c++ finished")
|
220 |
except Exception as e:
|
221 |
raise RuntimeError("Error running Whisper cpp model")
|
|
|
273 |
video_player = gr.HTML('<p>video will be played here')
|
274 |
eventslider = gr.Slider(visible=False)
|
275 |
status_msg = gr.Markdown('Status')
|
276 |
+
output_label = gr.Textbox('', interactive=False, show_label=False)
|
277 |
|
278 |
demo = gr.Blocks()
|
279 |
demo.encrypt = False
|
|
|
282 |
app_state['status_msg'] = msg
|
283 |
|
284 |
def transcribe(app_state, youtube_url_in, selected_source_lang, selected_whisper_model):
    """Full pipeline: download the YouTube video, run speech-to-text, build the player.

    Progress is reported via set_app_msg (polled by the UI). Returns the tuple
    (subtitle files, video player HTML) wired to the Gradio outputs.
    """
    # Clear whisper.cpp output captured from any previous run.
    app_state['output'] = ''

    set_app_msg(app_state, 'Downloading the movie ...')
    downloaded_path = get_youtube(youtube_url_in)

    set_app_msg(app_state, f'Running the speech to text model {selected_source_lang}/{selected_whisper_model}. This can take some time.')
    generated_subs = speech_to_text(downloaded_path, selected_source_lang, selected_whisper_model, app_state)

    set_app_msg(app_state, f'Creating the video player ...')
    player_html = create_video_player(generated_subs, downloaded_path)

    set_app_msg(app_state, f'Transcribing done, generating video player')
    return generated_subs, player_html
|
294 |
|
295 |
|
296 |
def on_change_event(app_state):
    """Periodic poll callback: expose status message and captured output to the UI."""
    print(f'Running! {app_state}')
    status = app_state['status_msg']
    captured = app_state['output']
    return status, captured
|
299 |
|
300 |
with demo:
|
301 |
app_state = gr.State({
|
302 |
+
'running': False,
|
303 |
+
'status_msg': '',
|
304 |
+
'output': ''
|
305 |
})
|
306 |
|
307 |
with gr.Row():
|
|
|
323 |
|
324 |
eventslider.render()
|
325 |
status_msg.render()
|
326 |
+
output_label.render()
|
327 |
subtitle_files.render()
|
328 |
video_player.render()
|
329 |
with gr.Row():
|
330 |
gr.Markdown('This app is based on [this code](https://huggingface.co/spaces/RASMUS/Whisper-youtube-crosslingual-subtitles/tree/main) by RASMUS.')
|
331 |
|
332 |
+
dep = demo.load(on_change_event, inputs=[app_state], outputs=[status_msg, output_label], every=10)
|
333 |
|
334 |
|
335 |
#### RUN ###
|