Spaces:
Running
Running
Initial Commit
Browse files- app.py +392 -0
- languages.py +147 -0
- packages.txt +42 -0
- requirements.txt +5 -0
- subtitle.py +101 -0
app.py
ADDED
@@ -0,0 +1,392 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import torch
|
2 |
+
|
3 |
+
import gradio as gr
|
4 |
+
import yt_dlp as youtube_dl
|
5 |
+
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, WhisperTokenizer, pipeline
|
6 |
+
from transformers.pipelines.audio_utils import ffmpeg_read
|
7 |
+
|
8 |
+
import tempfile
|
9 |
+
import os
|
10 |
+
import time
|
11 |
+
import requests
|
12 |
+
from playwright.sync_api import sync_playwright
|
13 |
+
|
14 |
+
from languages import get_language_names
|
15 |
+
from subtitle import text_output, subtitle_output
|
16 |
+
|
17 |
+
import subprocess
|
18 |
+
|
19 |
+
# The `spaces` package is optional: when it cannot be imported the app runs
# without the GPU decorator (see USING_SPACES).
try:
    import spaces
    USING_SPACES = True
except ImportError:
    USING_SPACES = False

# Install flash-attn at start-up. The extra variable is merged into the
# current environment instead of replacing it, so the child shell keeps PATH,
# HOME and the rest of the interpreter's environment and resolves the same
# `pip` the app itself runs under.
subprocess.run(
    "pip install flash-attn --no-build-isolation",
    env={**os.environ, "FLASH_ATTENTION_SKIP_CUDA_BUILD": "TRUE"},
    shell=True,
)

# Download the browsers Playwright drives.
os.system("playwright install")

# Longest accepted YouTube video, in seconds.
YT_LENGTH_LIMIT_S = 360
# Value passed as `duration` to the GPU decorator.
SPACES_GPU_DURATION = 90

# CUDA device index 0 when CUDA is available, otherwise the CPU.
device = 0 if torch.cuda.is_available() else "cpu"
|
37 |
+
|
38 |
+
def gpu_decorator(duration=60):
    """Return a decorator that applies ``spaces.GPU`` when it is available.

    Args:
        duration: Value forwarded to ``spaces.GPU(duration=...)``.

    Returns:
        A decorator. When ``USING_SPACES`` is false the decorated function is
        returned unchanged.
    """
    def actual_decorator(func):
        # Without the `spaces` package there is nothing to wrap with.
        if not USING_SPACES:
            return func
        gpu_wrapper = spaces.GPU(duration=duration)
        return gpu_wrapper(func)

    return actual_decorator
|
44 |
+
|
45 |
+
def device_info():
    """Print disk, block-device, memory, CPU and GPU information.

    Runs ``df -h``, ``lsblk``, ``free -h``, ``lscpu`` and ``nvidia-smi`` in
    that order. Each command is attempted independently: a non-zero exit
    status or a missing executable is reported with ``print`` and the
    remaining commands still run.
    """
    commands = (
        ["df", "-h"],
        ["lsblk"],
        ["free", "-h"],
        ["lscpu"],
        ["nvidia-smi"],
    )
    for command in commands:
        try:
            subprocess.run(command, check=True)
        except (subprocess.CalledProcessError, OSError) as e:
            # OSError covers executables that are not installed
            # (FileNotFoundError), e.g. nvidia-smi on a CPU-only host.
            print(f"Command failed: {e}")
|
54 |
+
|
55 |
+
@gpu_decorator(duration=SPACES_GPU_DURATION)
def transcribe(inputs, model, language, batch_size, chunk_length_s, stride_length_s, task, timestamp_mode, progress=gr.Progress(track_tqdm=True)):
    """Transcribe (or translate) an uploaded audio/video file with a Whisper model.

    Args:
        inputs: Value handed to the ASR pipeline; ``None`` means nothing was
            submitted. The result helpers call ``inputs.split('/')`` on it, so
            it is expected to be a path string.
        model: Model identifier passed to ``from_pretrained``. Identifiers
            ending in ``.en`` get no ``language``/``task`` generation kwargs.
        language: Language name, or ``"Automatic Detection"`` to leave the
            language unset.
        batch_size: Batch size forwarded to the pipeline call.
        chunk_length_s: Chunk length forwarded to ``pipeline``.
        stride_length_s: Stride length forwarded to ``pipeline``.
        task: Generation task forwarded as ``generate_kwargs["task"]``.
        timestamp_mode: Forwarded as ``return_timestamps``; a falsy value
            selects plain-text output, anything else subtitle output.
        progress: Not used in the body; presumably lets Gradio track tqdm
            progress -- TODO confirm.

    Returns:
        The result of ``text_output`` (no timestamps) or ``subtitle_output``
        (timestamps requested).

    Raises:
        gr.Error: If no input was submitted or any step fails.
    """
    try:
        if inputs is None:
            raise gr.Error("No audio file submitted! Please upload or record an audio file before submitting your request.")

        torch_dtype = torch.float16

        model_gen = AutoModelForSpeechSeq2Seq.from_pretrained(
            model, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True
        )
        model_gen.to(device)

        processor = AutoProcessor.from_pretrained(model)
        tokenizer = WhisperTokenizer.from_pretrained(model)

        # NOTE(review): an already-instantiated model is passed here, so
        # `model_kwargs` may have no effect on the attention implementation --
        # confirm against the transformers pipeline documentation.
        pipe = pipeline(
            task="automatic-speech-recognition",
            model=model_gen,
            chunk_length_s=chunk_length_s,
            stride_length_s=stride_length_s,
            tokenizer=tokenizer,
            feature_extractor=processor.feature_extractor,
            torch_dtype=torch_dtype,
            model_kwargs={"attn_implementation": "flash_attention_2"},
            device=device,
        )

        # Language/task kwargs are only set for models whose id does not end in ".en".
        generate_kwargs = {}
        if not model.endswith(".en"):
            if language != "Automatic Detection":
                generate_kwargs["language"] = language
            generate_kwargs["task"] = task

        output = pipe(inputs, batch_size=batch_size, generate_kwargs=generate_kwargs, return_timestamps=timestamp_mode)

        print(output)
        print({"inputs": inputs, "model": model, "language": language, "batch_size": batch_size, "chunk_length_s": chunk_length_s, "stride_length_s": stride_length_s, "task": task, "timestamp_mode": timestamp_mode})

        if not timestamp_mode:
            return text_output(inputs, output['text'])
        return subtitle_output(inputs, output['chunks'])

    except gr.Error:
        # Already a user-facing error (e.g. the missing-input check above):
        # let it through unchanged instead of re-wrapping its string form.
        raise
    except Exception as e:
        raise gr.Error(str(e), duration=10) from e
|
104 |
+
|
105 |
+
def _download_yt_audio(yt_url, filename):
    """Download the audio of a YouTube video to ``filename``.

    The video's ``duration_string`` is read first (without downloading) and
    converted to seconds; videos longer than ``YT_LENGTH_LIMIT_S`` are
    rejected.

    Args:
        yt_url: URL handed to yt-dlp.
        filename: Output template (``outtmpl``) for the downloaded file.

    Raises:
        gr.Error: If metadata extraction fails, the duration is unavailable,
            the video is too long, or an ``ExtractorError`` occurs while
            downloading.
    """
    info_loader = youtube_dl.YoutubeDL()

    try:
        info = info_loader.extract_info(yt_url, download=False)
    except youtube_dl.utils.DownloadError as err:
        raise gr.Error(str(err))

    duration_text = info.get("duration_string")
    if not duration_text:
        raise gr.Error("Video duration is unavailable.")

    # "SS", "MM:SS" or "HH:MM:SS" -> left-pad with zeros to three fields.
    fields = [int(piece) for piece in duration_text.split(":")]
    fields = [0] * (3 - len(fields)) + fields
    hours, minutes, seconds = fields[0], fields[1], fields[2]
    file_length_s = hours * 3600 + minutes * 60 + seconds

    if file_length_s > YT_LENGTH_LIMIT_S:
        hms_pattern = "%HH:%MM:%SS"
        yt_length_limit_hms = time.strftime(hms_pattern, time.gmtime(YT_LENGTH_LIMIT_S))
        file_length_hms = time.strftime(hms_pattern, time.gmtime(file_length_s))
        raise gr.Error(f"Maximum YouTube length is {yt_length_limit_hms}, got {file_length_hms} YouTube video.", duration=10)

    download_options = {"outtmpl": filename, "format": "bestaudio[ext=m4a]/best"}
    try:
        with youtube_dl.YoutubeDL(download_options) as ydl:
            ydl.download([yt_url])
    except youtube_dl.utils.ExtractorError as err:
        # Report which formats exist so the failure is actionable.
        available_formats = info_loader.extract_info(yt_url, download=False)['formats']
        raise gr.Error(f"Requested format not available. Available formats: {available_formats}", duration=10)
|
142 |
+
|
143 |
+
def _return_yt_video_id(yt_url):
|
144 |
+
if "https://www.youtube.com/watch?v=" in yt_url:
|
145 |
+
video_id = yt_url.split("?v=")[-1]
|
146 |
+
elif "https://youtu.be/" in yt_url:
|
147 |
+
video_id = yt_url.split("be/")[1]
|
148 |
+
return video_id
|
149 |
+
|
150 |
+
def _return_yt_html_embed(yt_url):
    """Return a centred ``<iframe>`` snippet embedding the video at ``yt_url``."""
    video_id = _return_yt_video_id(yt_url)
    iframe = f'<center> <iframe width="500" height="320" src="https://www.youtube.com/embed/{video_id}"> </iframe>'
    return iframe + " </center>"
|
157 |
+
|
158 |
+
def _return_yt_thumbnail(yt_url):
    """Download the ``maxresdefault`` thumbnail of a YouTube video.

    Args:
        yt_url: YouTube URL from which the video id is extracted.

    Returns:
        Path of a ``.jpg`` temporary file holding the thumbnail, or ``None``
        if the download fails (the error is printed).

    Raises:
        ValueError: If no video id can be extracted from ``yt_url``.
    """
    video_id = _return_yt_video_id(yt_url)
    if not video_id:
        raise ValueError("Invalid YouTube URL: Unable to extract video ID.")

    thumbnail_url = f"https://img.youtube.com/vi/{video_id}/maxresdefault.jpg"
    try:
        # Fetch first, so no temporary file is left behind when the request
        # fails; the timeout keeps a stalled connection from hanging the call.
        response = requests.get(thumbnail_url, timeout=10)
        if response.status_code != 200:
            raise RuntimeError(f"Failed to retrieve thumbnail. Status code: {response.status_code}")

        # delete=False: the file must outlive this function so it can be displayed.
        with tempfile.NamedTemporaryFile(delete=False, suffix='.jpg') as temp_file:
            temp_file.write(response.content)
            thumbnail_path = temp_file.name
    except Exception as e:
        print(f"Error occurred: {e}")
        return None
    return thumbnail_path
|
176 |
+
|
177 |
+
def _return_yt_info(yt_url):
    """Scrape title, description and keywords of a page with headless Chromium.

    Args:
        yt_url: URL opened in the browser.

    Returns:
        Three ``gr.Textbox`` components (title, description, keywords). A box
        is hidden when its value could not be read; all three are hidden if
        the page cannot be loaded at all (the error is printed).
    """
    def _meta_content(page, name):
        # A missing <meta> tag yields None instead of aborting the whole scrape.
        element = page.query_selector(f"meta[name='{name}']")
        return element.get_attribute("content") if element is not None else None

    try:
        with sync_playwright() as p:
            browser = p.chromium.launch(headless=True)
            try:
                page = browser.new_page()
                page.goto(yt_url)
                page.wait_for_load_state("networkidle")

                title = page.title()
                description = _meta_content(page, "description")
                keywords = _meta_content(page, "keywords")
            finally:
                # Always release the browser, including when navigation fails.
                browser.close()

        gr_title = gr.Textbox(label="YouTube Title", visible=bool(title), value=title)
        gr_description = gr.Textbox(label="YouTube Description", visible=bool(description), value=description)
        gr_keywords = gr.Textbox(label="YouTube Keywords", visible=bool(keywords), value=keywords)
        return gr_title, gr_description, gr_keywords
    except Exception as e:
        # Best effort: metadata is optional, so failures only hide the boxes.
        print(e)
        return gr.Textbox(visible=False), gr.Textbox(visible=False), gr.Textbox(visible=False)
|
201 |
+
|
202 |
+
|
203 |
+
def return_youtube(yt_url):
    """Build the preview components for a YouTube URL.

    Returns:
        A 5-tuple ``(html, thumbnail, title, description, keywords)`` of
        Gradio components: the embed markup, the thumbnail image and the
        three text boxes produced by ``_return_yt_info``.
    """
    embed_markup = _return_yt_html_embed(yt_url)
    thumbnail_file = _return_yt_thumbnail(yt_url)

    html_component = gr.HTML(label="Youtube Video", visible=True, value=embed_markup)
    thumbnail_component = gr.Image(label="Youtube Thumbnail", visible=True, value=thumbnail_file)

    title_box, description_box, keywords_box = _return_yt_info(yt_url)
    return html_component, thumbnail_component, title_box, description_box, keywords_box
|
210 |
+
|
211 |
+
@gpu_decorator(duration=SPACES_GPU_DURATION)
def yt_transcribe(yt_url, model, language, batch_size, chunk_length_s, stride_length_s, task, timestamp_mode):
    """Download a YouTube video's audio and transcribe it with a Whisper model.

    Args:
        yt_url: YouTube URL to download and transcribe.
        model: Model identifier passed to ``from_pretrained``. Identifiers
            ending in ``.en`` get no ``language``/``task`` generation kwargs.
        language: Language name, or ``"Automatic Detection"`` to leave the
            language unset.
        batch_size: Batch size forwarded to the pipeline call.
        chunk_length_s: Chunk length forwarded to ``pipeline``.
        stride_length_s: Stride length forwarded to ``pipeline``.
        task: Generation task forwarded as ``generate_kwargs["task"]``.
        timestamp_mode: Forwarded as ``return_timestamps``; a falsy value
            selects plain-text output, anything else subtitle output.

    Returns:
        A 7-tuple ``(subtitle, files, title, html, thumbnail, description,
        keywords)``. If transcription fails a warning is shown and the first
        two items are hidden text boxes.
    """
    gr_html, gr_thumbnail, gr_title, gr_description, gr_keywords = return_youtube(yt_url)
    try:
        # The processor is loaded first because its feature extractor supplies
        # the sampling rate needed to decode the audio; the pipeline itself
        # does not exist yet at that point.
        processor = AutoProcessor.from_pretrained(model)
        tokenizer = WhisperTokenizer.from_pretrained(model)
        sampling_rate = processor.feature_extractor.sampling_rate

        with tempfile.TemporaryDirectory() as tmpdirname:
            filepath = os.path.join(tmpdirname, "video.mp4")
            _download_yt_audio(yt_url, filepath)
            with open(filepath, "rb") as f:
                audio_bytes = f.read()

        audio = ffmpeg_read(audio_bytes, sampling_rate)
        inputs = {"array": audio, "sampling_rate": sampling_rate}

        torch_dtype = torch.float16

        model_gen = AutoModelForSpeechSeq2Seq.from_pretrained(
            model, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True
        )
        model_gen.to(device)

        # NOTE(review): an already-instantiated model is passed here, so
        # `model_kwargs` may have no effect on the attention implementation --
        # confirm against the transformers pipeline documentation.
        pipe = pipeline(
            task="automatic-speech-recognition",
            model=model_gen,
            chunk_length_s=chunk_length_s,
            stride_length_s=stride_length_s,
            tokenizer=tokenizer,
            feature_extractor=processor.feature_extractor,
            torch_dtype=torch_dtype,
            model_kwargs={"attn_implementation": "flash_attention_2"},
            device=device,
        )

        # Language/task kwargs are only set for models whose id does not end in ".en".
        generate_kwargs = {}
        if not model.endswith(".en"):
            if language != "Automatic Detection":
                generate_kwargs["language"] = language
            generate_kwargs["task"] = task

        output = pipe(inputs, batch_size=batch_size, generate_kwargs=generate_kwargs, return_timestamps=timestamp_mode)

        print(output)
        print({"inputs": yt_url, "model": model, "language": language, "batch_size": batch_size, "chunk_length_s": chunk_length_s, "stride_length_s": stride_length_s, "task": task, "timestamp_mode": timestamp_mode})

        # The output helpers derive file names from a path-like string, so
        # they are given a name based on the video id rather than the decoded
        # audio dict.
        output_name = _return_yt_video_id(yt_url) or "video"

        if not timestamp_mode:
            subtitle, files = text_output(output_name, output['text'])
        else:
            subtitle, files = subtitle_output(output_name, output['chunks'])
        return subtitle, files, gr_title, gr_html, gr_thumbnail, gr_description, gr_keywords

    except Exception as e:
        # Keep the video preview visible and surface the failure as a warning.
        error_message = str(e)
        gr.Warning(error_message, duration=10)
        return gr.Textbox(visible=False), gr.Textbox(visible=False), gr_title, gr_html, gr_thumbnail, gr_description, gr_keywords
|
269 |
+
|
270 |
+
demo = gr.Blocks()

# Model identifiers offered in the "Model Name" dropdown of every tab.
_MODEL_CHOICES = [
    "openai/whisper-tiny",
    "openai/whisper-base",
    "openai/whisper-small",
    "openai/whisper-medium",
    "openai/whisper-large",
    "openai/whisper-large-v1",
    "openai/whisper-large-v2", "distil-whisper/distil-large-v2",
    "openai/whisper-large-v3", "openai/whisper-large-v3-turbo", "distil-whisper/distil-large-v3", "xaviviro/whisper-large-v3-catalan-finetuned-v2",
]


def _transcription_option_inputs():
    """Create a fresh set of the option widgets shared by all three tabs.

    Returns the model, language, batch size, chunk length, stride length,
    task and timestamp-mode components, in the positional order expected by
    ``transcribe`` and ``yt_transcribe`` after their first argument. New
    component instances are built on every call so each interface gets its own.
    """
    return [
        gr.Dropdown(
            choices=list(_MODEL_CHOICES),
            value="openai/whisper-large-v3-turbo",
            label="Model Name",
            allow_custom_value=True,
        ),
        gr.Dropdown(choices=["Automatic Detection"] + sorted(get_language_names()), value="Automatic Detection", label="Language", interactive=True),
        gr.Slider(label="Batch Size", minimum=1, maximum=32, value=16, step=1),
        gr.Slider(label="Chunk Length (s)", minimum=1, maximum=60, value=17.5, step=0.1),
        gr.Slider(label="Stride Length (s)", minimum=1, maximum=30, value=1, step=0.1),
        gr.Radio(["transcribe", "translate"], label="Task", value="transcribe"),
        gr.Dropdown(
            choices=[True, False, "word"],
            value=True,
            label="Timestamp Mode"
        ),
    ]


# Tab 1: uploaded or recorded audio.
file_transcribe = gr.Interface(
    fn=transcribe,
    inputs=[
        gr.Audio(sources=['upload', 'microphone'], type="filepath", label="Audio file"),
        *_transcription_option_inputs(),
    ],
    outputs=[gr.Textbox(label="Output"), gr.File(label="Download Files")],
    title="Whisper: Transcribe Audio",
    flagging_mode="auto",
)

# Tab 2: uploaded or webcam video, handled by the same `transcribe` function.
video_transcribe = gr.Interface(
    fn=transcribe,
    inputs=[
        gr.Video(sources=["upload", "webcam"], label="Video file", show_label=False, show_download_button=False, show_share_button=False, streaming=True),
        *_transcription_option_inputs(),
    ],
    outputs=[gr.Textbox(label="Output"), gr.File(label="Download Files")],
    title="Whisper: Transcribe Video",
    flagging_mode="auto",
)

# Tab 3: YouTube URL. The right-hand side is evaluated before the name is
# rebound, so `fn` still receives the `yt_transcribe` function defined earlier.
yt_transcribe = gr.Interface(
    fn=yt_transcribe,
    inputs=[
        gr.Textbox(lines=1, placeholder="Paste the URL to a YouTube video here", label="YouTube URL"),
        *_transcription_option_inputs(),
    ],
    outputs=[
        gr.Textbox(label="Output"),
        gr.File(label="Download Files"),
        gr.Textbox(label="Youtube Title"),
        gr.HTML(label="Youtube Video"),
        gr.Image(label="Youtube Thumbnail"),
        gr.Textbox(label="Youtube Description"),
        gr.Textbox(label="Youtube Keywords"),
    ],
    title="Whisper: Transcribe YouTube",
    flagging_mode="auto",
)

with demo:
    gr.TabbedInterface(
        interface_list=[file_transcribe, video_transcribe, yt_transcribe],
        tab_names=["Audio", "Video", "YouTube"]
    )

if __name__ == "__main__":
    demo.queue().launch(ssr_mode=False)
|
languages.py
ADDED
@@ -0,0 +1,147 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
class Language():
    """A language described by its short code and its display name."""

    def __init__(self, code, name):
        # code: short identifier such as 'en'; name: display name such as 'English'.
        self.code, self.name = code, name

    def __str__(self):
        return f"Language(code={self.code}, name={self.name})"
|
8 |
+
|
9 |
+
# Every language offered by the app, as (code, display name) pairs.
LANGUAGES = [
    Language('en', 'English'),
    Language('zh', 'Chinese'),
    Language('de', 'German'),
    Language('es', 'Spanish'),
    Language('ru', 'Russian'),
    Language('ko', 'Korean'),
    Language('fr', 'French'),
    Language('ja', 'Japanese'),
    Language('pt', 'Portuguese'),
    Language('tr', 'Turkish'),
    Language('pl', 'Polish'),
    Language('ca', 'Catalan'),
    Language('nl', 'Dutch'),
    Language('ar', 'Arabic'),
    Language('sv', 'Swedish'),
    Language('it', 'Italian'),
    Language('id', 'Indonesian'),
    Language('hi', 'Hindi'),
    Language('fi', 'Finnish'),
    Language('vi', 'Vietnamese'),
    Language('he', 'Hebrew'),
    Language('uk', 'Ukrainian'),
    Language('el', 'Greek'),
    Language('ms', 'Malay'),
    Language('cs', 'Czech'),
    Language('ro', 'Romanian'),
    Language('da', 'Danish'),
    Language('hu', 'Hungarian'),
    Language('ta', 'Tamil'),
    Language('no', 'Norwegian'),
    Language('th', 'Thai'),
    Language('ur', 'Urdu'),
    Language('hr', 'Croatian'),
    Language('bg', 'Bulgarian'),
    Language('lt', 'Lithuanian'),
    Language('la', 'Latin'),
    Language('mi', 'Maori'),
    Language('ml', 'Malayalam'),
    Language('cy', 'Welsh'),
    Language('sk', 'Slovak'),
    Language('te', 'Telugu'),
    Language('fa', 'Persian'),
    Language('lv', 'Latvian'),
    Language('bn', 'Bengali'),
    Language('sr', 'Serbian'),
    Language('az', 'Azerbaijani'),
    Language('sl', 'Slovenian'),
    Language('kn', 'Kannada'),
    Language('et', 'Estonian'),
    Language('mk', 'Macedonian'),
    Language('br', 'Breton'),
    Language('eu', 'Basque'),
    Language('is', 'Icelandic'),
    Language('hy', 'Armenian'),
    Language('ne', 'Nepali'),
    Language('mn', 'Mongolian'),
    Language('bs', 'Bosnian'),
    Language('kk', 'Kazakh'),
    Language('sq', 'Albanian'),
    Language('sw', 'Swahili'),
    Language('gl', 'Galician'),
    Language('mr', 'Marathi'),
    Language('pa', 'Punjabi'),
    Language('si', 'Sinhala'),
    Language('km', 'Khmer'),
    Language('sn', 'Shona'),
    Language('yo', 'Yoruba'),
    Language('so', 'Somali'),
    Language('af', 'Afrikaans'),
    Language('oc', 'Occitan'),
    Language('ka', 'Georgian'),
    Language('be', 'Belarusian'),
    Language('tg', 'Tajik'),
    Language('sd', 'Sindhi'),
    Language('gu', 'Gujarati'),
    Language('am', 'Amharic'),
    Language('yi', 'Yiddish'),
    Language('lo', 'Lao'),
    Language('uz', 'Uzbek'),
    Language('fo', 'Faroese'),
    Language('ht', 'Haitian creole'),
    Language('ps', 'Pashto'),
    Language('tk', 'Turkmen'),
    Language('nn', 'Nynorsk'),
    Language('mt', 'Maltese'),
    Language('sa', 'Sanskrit'),
    Language('lb', 'Luxembourgish'),
    Language('my', 'Myanmar'),
    Language('bo', 'Tibetan'),
    Language('tl', 'Tagalog'),
    Language('mg', 'Malagasy'),
    Language('as', 'Assamese'),
    Language('tt', 'Tatar'),
    Language('haw', 'Hawaiian'),
    Language('ln', 'Lingala'),
    Language('ha', 'Hausa'),
    Language('ba', 'Bashkir'),
    Language('jw', 'Javanese'),
    Language('su', 'Sundanese')
]

# Code -> Language for every entry above.
_LANGUAGE_BY_CODE = {language.code: language for language in LANGUAGES}

# Alternative (lower-case) language names and the code each one stands for.
_LANGUAGE_ALIASES = {
    "burmese": "my",
    "valencian": "ca",
    "flemish": "nl",
    "haitian": "ht",
    "letzeburgesch": "lb",
    "pushto": "ps",
    "panjabi": "pa",
    "moldavian": "ro",
    "moldovan": "ro",
    "sinhalese": "si",
    "castilian": "es",
}

# Alias name -> Language, so that every lookup table below consistently maps
# to Language objects rather than mixing objects with bare code strings.
_ALIAS_TO_LANGUAGE = {
    alias: _LANGUAGE_BY_CODE[code] for alias, code in _LANGUAGE_ALIASES.items()
}

# Lookup used by get_language_from_code: language codes, plus the alias keys
# this table has always carried.
_TO_LANGUAGE_CODE = {
    **_LANGUAGE_BY_CODE,
    **_ALIAS_TO_LANGUAGE,
}

# Lookup used by get_language_from_name: lower-cased display names, plus the
# aliases. Display names are merged last so they win over any alias of the
# same spelling.
_FROM_LANGUAGE_NAME = {
    **_ALIAS_TO_LANGUAGE,
    **{language.name.lower(): language for language in LANGUAGES},
}
|
129 |
+
|
130 |
+
def get_language_from_code(language_code, default=None) -> Language:
    """Return the entry stored under ``language_code``, or ``default`` if absent."""
    if language_code in _TO_LANGUAGE_CODE:
        return _TO_LANGUAGE_CODE[language_code]
    return default
|
133 |
+
|
134 |
+
def get_language_from_name(language, default=None) -> Language:
    """Return the ``Language`` whose name matches ``language``, ignoring case.

    An empty or ``None`` name is looked up as ``None``; ``default`` is
    returned when nothing matches.
    """
    lookup_key = language.lower() if language else None
    return _FROM_LANGUAGE_NAME.get(lookup_key, default)
|
137 |
+
|
138 |
+
def get_language_names():
    """Return the display names of all entries in ``LANGUAGES``, in list order."""
    names = [entry.name for entry in LANGUAGES]
    return names
|
141 |
+
|
142 |
+
if __name__ == "__main__":
    # Manual smoke test, run only when this module is executed directly:
    # look up one language by code and one by name, then print every name.
    print(get_language_from_code('en'))
    print(get_language_from_name('English'))

    print(get_language_names())
|
packages.txt
ADDED
@@ -0,0 +1,42 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
ffmpeg
|
2 |
+
libnss3
|
3 |
+
libnspr4
|
4 |
+
libatk1.0-0
|
5 |
+
libatk-bridge2.0-0
|
6 |
+
libcups2
|
7 |
+
libxcomposite1
|
8 |
+
libxdamage1
|
9 |
+
libxrandr2
|
10 |
+
libgbm1
|
11 |
+
libpango-1.0-0
|
12 |
+
libpangocairo-1.0-0
|
13 |
+
libasound2
|
14 |
+
libxshmfence1
|
15 |
+
libx11-xcb1
|
16 |
+
libxext6
|
17 |
+
libxtst6
|
18 |
+
libxinerama1
|
19 |
+
libwayland-client0
|
20 |
+
libwayland-cursor0
|
21 |
+
libwayland-egl1
|
22 |
+
libdbus-1-3
|
23 |
+
libatspi2.0-0
|
24 |
+
libdrm2
|
25 |
+
libgtk-3-0
|
26 |
+
libgdk-pixbuf2.0-0
|
27 |
+
libgstreamer1.0-0
|
28 |
+
libwoff1
|
29 |
+
libgstreamer-plugins-base1.0-0
|
30 |
+
libgstreamer-gl1.0-0
|
31 |
+
libharfbuzz-icu0
|
32 |
+
libenchant-2-2
|
33 |
+
libsecret-1-0
|
34 |
+
libhyphen0
|
35 |
+
libmanette-0.2-0
|
36 |
+
libgles2
|
37 |
+
libgstreamer1.0-0
|
38 |
+
libgstreamer-plugins-base1.0-0
|
39 |
+
gstreamer1.0-plugins-good
|
40 |
+
gstreamer1.0-plugins-bad
|
41 |
+
gstreamer1.0-plugins-ugly
|
42 |
+
|
requirements.txt
ADDED
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
transformers
|
2 |
+
pydub
|
3 |
+
yt-dlp
|
4 |
+
accelerate
|
5 |
+
playwright
|
subtitle.py
ADDED
@@ -0,0 +1,101 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
class Subtitle:
    """Render timestamped transcript segments as SRT, WebVTT, plain text or LRC.

    A segment is a mapping with a ``'text'`` string and a ``'timestamp'``
    pair ``(start, end)`` in seconds; ``end`` may be ``None``.
    """

    def __init__(self, ext="srt"):
        """Select the output format.

        Args:
            ext: One of ``"srt"``, ``"vtt"``, ``"txt"`` or ``"lrc"``.

        Raises:
            KeyError: If ``ext`` is not a supported format.
        """
        sub_dict = {
            "srt": {
                "coma": ",",
                "header": "",
                "format": self._srt_format,
            },
            "vtt": {
                "coma": ".",
                # A WebVTT file must begin with the exact signature "WEBVTT".
                "header": "WEBVTT\n\n",
                "format": self._vtt_format,
            },
            "txt": {
                "coma": "",
                "header": "",
                "format": self._txt_format,
            },
            "lrc": {
                "coma": "",
                "header": "",
                "format": self._lrc_format,
            },
        }

        self.ext = ext
        # Separator placed between seconds and milliseconds.
        self.coma = sub_dict[ext]["coma"]
        self.header = sub_dict[ext]["header"]
        self.format_fn = sub_dict[ext]["format"]

    def timeformat(self, time):
        """Format ``time`` (seconds) as ``HH:MM:SS<sep>mmm``.

        The value is rounded to whole milliseconds before being split into
        fields, so binary floating-point error cannot drop a millisecond and
        a carry propagates into the seconds/minutes/hours fields.
        """
        total_ms = int(round(time * 1000))
        hours, remainder = divmod(total_ms, 3_600_000)
        minutes, remainder = divmod(remainder, 60_000)
        seconds, milliseconds = divmod(remainder, 1000)
        return f"{hours:02d}:{minutes:02d}:{seconds:02d}{self.coma}{milliseconds:03d}"

    def seconds_to_lrc_timestamp(self, time):
        """Format ``time`` (seconds) as an LRC tag ``[MM:SS.mmm]``.

        Rounding happens on whole milliseconds first, so the seconds field
        never shows ``60.000``.
        """
        total_ms = int(round(time * 1000))
        minutes, remainder = divmod(total_ms, 60_000)
        secs, milliseconds = divmod(remainder, 1000)
        return f"[{minutes:02d}:{secs:02d}.{milliseconds:03d}]"

    def _cue_times(self, segment):
        """Return the formatted (start, end) times; a missing end reuses the start."""
        start, end = segment['timestamp'][0], segment['timestamp'][1]
        if end is None:
            end = start
        return self.timeformat(start), self.timeformat(end)

    def _srt_format(self, i, segment):
        """Render one numbered SRT cue (``i`` is zero-based)."""
        start_time, end_time = self._cue_times(segment)
        return f"{i + 1}\n{start_time} --> {end_time}\n{segment['text']}\n\n"

    def _vtt_format(self, i, segment):
        """Render one WebVTT cue."""
        start_time, end_time = self._cue_times(segment)
        return f"{start_time} --> {end_time}\n{segment['text']}\n\n"

    def _txt_format(self, i, segment):
        """Render one plain-text line."""
        return f"{segment['text']}\n"

    def _lrc_format(self, i, segment):
        """Render one LRC line tagged with the segment's start time."""
        start_time = self.seconds_to_lrc_timestamp(segment['timestamp'][0])
        return f"{start_time}{segment['text']}\n"

    def get_subtitle(self, segments):
        """Render all ``segments`` in the selected format.

        Leading whitespace is stripped from each segment's text in the
        output; the caller's segment objects are left unmodified. A segment
        that cannot be formatted is reported with ``print`` and skipped.

        Returns:
            The complete subtitle document as a string.
        """
        parts = [self.header]
        for i, segment in enumerate(segments):
            # Work on a shallow copy so the caller's data is not mutated.
            cleaned = {**segment, 'text': segment['text'].lstrip()}
            try:
                parts.append(self.format_fn(i, cleaned))
            except Exception as e:
                # Best effort: one malformed segment must not lose the rest.
                print(e, segment)
        return "".join(parts)

    def write_subtitle(self, segments, output_file):
        """Write the rendered subtitles to ``<output_file>.<ext>`` as UTF-8."""
        output_file_with_ext = f"{output_file}.{self.ext}"
        subtitle = self.get_subtitle(segments)

        with open(output_file_with_ext, 'w', encoding='utf-8') as f:
            f.write(subtitle)
|
75 |
+
|
76 |
+
def write_file(output_file,subtitle):
    """Write the string ``subtitle`` to ``output_file`` as UTF-8, overwriting it."""
    with open(output_file, mode='w', encoding='utf-8') as handle:
        handle.write(subtitle)
|
79 |
+
|
80 |
+
def subtitle_output(inputs, chunks):
    """Render ``chunks`` as LRC, SRT, VTT and TXT files.

    The files are named after the last ``/``-separated component of
    ``inputs``, cut at its first dot, and are written with relative paths.

    Returns:
        ``(lrc_text, files)`` where ``files`` lists the four written file
        names in the order lrc, srt, vtt, txt.
    """
    base_name = inputs.split('/')[-1].split('.')[0]
    extensions = ("lrc", "srt", "vtt", "txt")

    # Build all renderers, then render every format, before anything is written.
    renderers = {ext: Subtitle(ext) for ext in extensions}
    rendered = {ext: renderers[ext].get_subtitle(chunks) for ext in extensions}

    files_out = []
    for ext in extensions:
        target = base_name + "." + ext
        write_file(target, rendered[ext])
        files_out.append(target)
    return rendered["lrc"], files_out
|
96 |
+
|
97 |
+
def text_output(inputs, text):
    """Write ``text`` to a ``.txt`` file named after ``inputs``.

    The file name is the last ``/``-separated component of ``inputs``, cut at
    its first dot, with ``.txt`` appended.

    Returns:
        ``(text, [txt_file_name])``.
    """
    stem = inputs.split('/')[-1].split('.')[0]
    txt_path = stem + ".txt"
    write_file(txt_path, text)
    return text, [txt_path]
|