Alex Volkov committed 7db5fdc (parent: 09cee30)

Added a captions API that receives a URL and both transcribes AND translates it.

Files changed:
- app.py +1 -1
- download.py +68 -15
- requirements.txt +2 -1
- static/css/main.css +1 -1
- utils/apis.py +32 -5
- utils/subs.py +33 -25
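In practice, this commit lets an Anvil client request captions for a remote video in a single call. A rough sketch of that call path, assuming an Anvil app already connected to this Space's uplink (the URL and model size are illustrative, and what call_gradio_api returns is not shown in this diff):

import anvil.server

# call_gradio_api is the @anvil.server.callable bridge defined in utils/apis.py;
# here it forwards to the new 'caption' gradio endpoint registered in this commit.
result_json = anvil.server.call(
    "call_gradio_api",
    api_name="caption",
    data=("https://twitter.com/i/status/1234567890", "Autodetect", "base"),
)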
app.py
CHANGED
@@ -137,7 +137,7 @@ with gr.Blocks(css='@import "file=static/css/main.css";', theme='darkpeach', tit
     init_video.change(fn=init_video_manual_upload, inputs=[url_input, init_video], outputs=[])
 
     # Render imported buttons for API bindings
-    render_api_elements(url_input,download_status, output_text, sub_video)
+    render_api_elements(url_input,download_status, output_text, sub_video, output_file)
 
     queue_placeholder = demo.queue()
 
download.py
CHANGED
@@ -13,7 +13,7 @@ import argparse
 import whisper
 from whisper.tokenizer import LANGUAGES, TO_LANGUAGE_CODE
 import ffmpeg
-from utils.subs import bake_subs
+from utils.subs import bake_subs, get_srt
 from utils.utils import get_args
 
 original_dir = os.getcwd()
@@ -106,6 +106,54 @@ def download_generator(url, translate_action=True, source_language='Autodetect',
         yield {"message": f"{e}"}
 
 
+def caption_generator(tweet_url, language="Autodetect", model_size=model_size):
+    # Download the file
+
+    try:
+        print(f"Downloading {tweet_url} ")
+        meta = check_download(tweet_url)
+        tempdir = output_dir / f"{meta['id']}"
+        print(f"Downloaded {meta['id']}.mp3 from {meta['uploader_id']} and url {meta['webpage_url']}")
+    except Exception as e:
+        print(f"Could not download file: {e}")
+        raise
+
+    try:
+        print(f"Starting audio only download with URL {tweet_url}, this may take a while")
+        meta, video, audio = download(tweet_url, tempdir, keepVideo=False)
+        print(f"Downloaded video and extracted audio")
+    except Exception as e:
+        print(f"Could not download file: {e}")
+        raise
+
+    # Run whisper on the audio with language unless auto
+    try:
+        print(f"Starting whisper transcribe with {meta['id']}.mp3")
+        transcribe_whisper_result = transcribe(audio, translate_action=False, language=language, override_model_size=model_size)
+        translate_whisper_result = transcribe(audio, translate_action=True, language=language, override_model_size=model_size)
+        srt = get_srt(transcribe_whisper_result["segments"])
+        en_srt = get_srt(translate_whisper_result["segments"])
+
+        print(f"Transcribe successful!")
+    except Exception as e:
+        print(f"Could not transcribe file: {e}")
+        return
+
+    return_dict = {
+        "detected_language": LANGUAGES[transcribe_whisper_result["language"]],
+        "requested_language": language,
+        "text": transcribe_whisper_result["text"],
+        "en_text": translate_whisper_result["text"],
+        "srt": srt,
+        "en_srt": en_srt,
+        "meta": meta,
+    }
+    return return_dict
+
+
+# Run whisper with translation task enabled (and save to different srt file)
+# Call anvil background task with both files, and both the plain texts
+
 def progress_hook(d):
     if d['status'] == 'downloading':
         print("downloading " + str(round(float(d['downloaded_bytes']) / float(d['total_bytes']) * 100, 1)) + "%")
@@ -115,11 +163,11 @@ def progress_hook(d):
     print(filename)
     yield f"Downloaded {filename}"
 
-def download(url, tempdir):
+def download(url, tempdir, format="bestvideo[ext=mp4]+bestaudio/best", verbose=False, keepVideo=True):
     try:
         ydl_opts = {
-            "format":
-            "keepvideo":
+            "format": format,
+            "keepvideo": keepVideo,
             'postprocessors': [{
                 'key': 'FFmpegExtractAudio',
                 'preferredcodec': 'mp3',
@@ -128,7 +176,7 @@ def download(url, tempdir):
             "skip_download": False,
             "outtmpl": f"{tempdir}/%(id)s.%(ext)s",
             "noplaylist": True,
-            "verbose":
+            "verbose": verbose,
             "quiet": True,
             "progress_hooks": [progress_hook],
 
@@ -141,10 +189,13 @@ def download(url, tempdir):
     except DownloadError as e:
         raise e
     else:
-        video = tempdir / f"{meta['id']}.{meta['ext']}"
         audio = tempdir / f"{meta['id']}.mp3"
-
-
+        if (keepVideo):
+            video = tempdir / f"{meta['id']}.{meta['ext']}"
+            return meta, str(video.resolve()), str(audio.resolve())
+        else:
+            return meta, None, str(audio.resolve())
+
 
 def check_download(url):
     ydl_opts = {
@@ -164,22 +215,24 @@ def check_download(url):
     else:
         return meta
 
-def transcribe(audio, translate_action=True, language='Autodetect'):
+def transcribe(audio, translate_action=True, language='Autodetect', override_model_size=''):
     task = "translate" if translate_action else "transcribe"
-
+    model_size_to_load = override_model_size if override_model_size else model_size
+    print(f'Starting {task} with whisper size {model_size_to_load} on {audio}')
     global model
-    if not preload_model:
-        model = whisper.load_model(
+    if not preload_model or model_size != override_model_size:
+        model = whisper.load_model(model_size_to_load)
+
     props = {
         "task": task,
     }
+
     if language != 'Autodetect':
         props["language"] = TO_LANGUAGE_CODE[language.lower()]
 
-    output = model.transcribe(audio,
+    output = model.transcribe(audio, verbose=True, **props)
 
-    output[
-    output['segments'] = [{"id": 0, "seek": 0, "start": 0.0, "end": 3, "text": " [AI transcription]"}] + output['segments']
+    output['segments'] = output['segments']
     print(f'Finished transcribe from {output["language"]}', output["text"])
     return output
 
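The new caption_generator is the core of the feature: it downloads the audio, runs whisper twice (transcribe and translate), and returns both plain texts plus SRT strings. A minimal consumption sketch, with the tweet URL and output file names as illustrative assumptions rather than part of the commit:

# Hypothetical direct use of caption_generator from download.py.
from download import caption_generator

result = caption_generator("https://twitter.com/i/status/1234567890", language="Autodetect", model_size="base")
print(result["detected_language"])   # e.g. "english" (mapped through whisper's LANGUAGES table)
print(result["text"])                # transcript in the detected source language
print(result["en_text"])             # English translation
with open("captions.srt", "w") as f:        # original-language captions
    f.write(result["srt"])
with open("captions.en.srt", "w") as f:     # English captions
    f.write(result["en_srt"])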
requirements.txt
CHANGED
@@ -4,4 +4,5 @@ anvil-uplink==0.4.0
 gradio==3.4.0
 python-dotenv==0.21.0
 aiohttp==3.8.3
-aiohttp-requests==0.1.3
+aiohttp-requests==0.1.3
+fsspec==2022.8.2
static/css/main.css
CHANGED
@@ -93,5 +93,5 @@ background: transparent
 }
 
 footer{
-    display: none !important
+    /*display: none !important;*/
 }
utils/apis.py
CHANGED
@@ -11,10 +11,11 @@ import anvil.media
 import dotenv
 import gradio as gr
 import requests
-from download import download_generator
-
+from download import download_generator, caption_generator
 
 dotenv.load_dotenv()
+
+
 @anvil.server.callable
 def call_gradio_api(api_name='test_api', data=()):
     port = os.environ.get('SERVER_PORT', 8111)
@@ -62,7 +63,19 @@ def test_api(url=''):
     # TODO: add an anvil server pingback to show we completed the queue operation
     return f"I've slept for 15 seconds and now I'm done. "
 
-def render_api_elements(url_input, download_status, output_text, sub_video):
+#TODO: add telegram error handler here
+def caption(tweet_url="", language="Autodetect", override_model_size=""):
+    """
+    :param media_id: The twitter media ID object
+    :param user_id_str: The twitter user ID string
+    :param tweet_url: tweet URL can potentially not exist in the future, so we can upload on behalf of the user
+    :return:
+    """
+    response = caption_generator(tweet_url, language, override_model_size)
+    return json.dumps(response)
+
+
+def render_api_elements(url_input, download_status, output_text, sub_video, output_file):
     with gr.Group(elem_id='fake_ass_group') as api_buttons:
         # This is a hack to get APIs registered with the blocks interface
         translate_result = gr.Textbox(visible=False)
@@ -75,6 +88,21 @@ def render_api_elements(url_input, download_status, output_text, sub_video):
 
         gr.Button("remote_download", visible=False)\
             .click(api_name='remote_download', queue=True, fn=remote_download, inputs=[url_input], outputs=[download_status, output_text, translate_result, translate_language])
+
+        # creating fake elements just make gradio, cause I can't define an API signature like a sane person
+
+        gr.Button("caption", visible=False)\
+            .click(api_name='caption',
+                   queue=True,
+                   fn=caption,
+                   inputs=[
+                       gr.Text(label='tweet_url'),
+                       gr.Text(label='language (optional)'),
+                       gr.Dropdown(label='Model Size', choices=['base', 'tiny', 'small', 'medium', 'large']),
+                   ],
+                   outputs=[
+                       gr.Text(label='response_json')
+                   ])
     return api_buttons
 
 
@@ -87,5 +115,4 @@
         if path.is_file():
             path.unlink()
         elif path.is_dir():
-            rmtree(path)
-
+            rmtree(path)
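Because the caption API is registered through a hidden gr.Button with api_name='caption', it can also be reached over plain HTTP once the Space is running. A hedged sketch of that path (the localhost host and port come from the SERVER_PORT default above; the /api/caption/ route follows the gradio 3.x convention for named endpoints, which this diff does not spell out):

import json
import requests

resp = requests.post(
    "http://localhost:8111/api/caption/",
    json={"data": ["https://twitter.com/i/status/1234567890", "Autodetect", "base"]},
)
payload = json.loads(resp.json()["data"][0])  # caption() returns a JSON string as its single output
print(payload["detected_language"], payload["en_text"])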
utils/subs.py
CHANGED
@@ -6,7 +6,6 @@ import os
 from typing import Iterator, TextIO
 
 
-
 def bake_subs(input_file, output_file, subs_file, fontsdir, translate_action):
     print(f"Baking {subs_file} into video... {input_file} -> {output_file}")
 
@@ -30,39 +29,39 @@ def bake_subs(input_file, output_file, subs_file, fontsdir, translate_action):
     fontstyle = f'Fontsize={sub_size},OutlineColour=&H40000000,BorderStyle=3,FontName={fontname},Bold=1'
     (
         ffmpeg.concat(
-
-
-
-
-
-
+            video.filter('subtitles', subs_file, fontsdir=fontfile, force_style=fontstyle),
+            audio, v=1, a=1
+        )
+        .overlay(watermark.filter('scale', iw / 3, -1), x='10', y='10')
+        .output(filename=output_file)
+        .run(quiet=True, overwrite_output=True)
     )
 
 
 def str2bool(string):
-
-
-
-
-
-
+    str2val = {"True": True, "False": False}
+    if string in str2val:
+        return str2val[string]
+    else:
+        raise ValueError(
+            f"Expected one of {set(str2val.keys())}, got {string}")
 
 
 def format_timestamp(seconds: float, always_include_hours: bool = False):
-
-
+    assert seconds >= 0, "non-negative timestamp expected"
+    milliseconds = round(seconds * 1000.0)
 
-
-
+    hours = milliseconds // 3_600_000
+    milliseconds -= hours * 3_600_000
 
-
-
+    minutes = milliseconds // 60_000
+    milliseconds -= minutes * 60_000
 
-
-
+    seconds = milliseconds // 1_000
+    milliseconds -= seconds * 1_000
 
-
-
+    hours_marker = f"{hours}:" if always_include_hours or hours > 0 else ""
+    return f"{hours_marker}{minutes:02d}:{seconds:02d}.{milliseconds:03d}"
 
 
 def write_srt(transcript: Iterator[dict], file: TextIO):
@@ -77,8 +76,17 @@ def write_srt(transcript: Iterator[dict], file: TextIO):
         )
 
 
+def get_srt(transcript: Iterator[dict]):
+    srt = ''
+    for i, segment in enumerate(transcript, start=1):
+        srt += f"{i}\n" \
+               f"{format_timestamp(segment['start'], always_include_hours=True)} --> " \
+               f"{format_timestamp(segment['end'], always_include_hours=True)}\n" \
+               f"{segment['text'].strip().replace('-->', '->')}\n"
+    return srt
+
 def filename(path):
-
+    return os.path.splitext(os.path.basename(path))[0]
 
 
 
@@ -94,4 +102,4 @@
 # os.chdir(tempdirname)
 # bake_subs(video_file_path, out_path, srt_path)
 # anvil_media = anvil.media.from_file(out_path, 'video/mp4')
-# print(anvil_media)
+# print(anvil_media)
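For reference, a small worked example of what the new get_srt helper produces for a two-segment transcript (the segment values are made up). Note that format_timestamp here uses a period before the milliseconds and entries are concatenated without blank separator lines:

from utils.subs import get_srt

segments = [
    {"start": 0.0, "end": 2.5, "text": " Hello there"},
    {"start": 2.5, "end": 5.0, "text": " and welcome"},
]
print(get_srt(segments))
# 1
# 0:00:00.000 --> 0:00:02.500
# Hello there
# 2
# 0:00:02.500 --> 0:00:05.000
# and welcome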