TaiYouWeb committed on
Commit
83922b2
1 Parent(s): 70a9f6b

Initial Commit

Browse files
Files changed (5) hide show
  1. app.py +392 -0
  2. languages.py +147 -0
  3. packages.txt +42 -0
  4. requirements.txt +5 -0
  5. subtitle.py +101 -0
app.py ADDED
@@ -0,0 +1,392 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
import torch

import gradio as gr
import yt_dlp as youtube_dl
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, WhisperTokenizer, pipeline
from transformers.pipelines.audio_utils import ffmpeg_read

import tempfile
import os
import time
import requests
from playwright.sync_api import sync_playwright

from languages import get_language_names
from subtitle import text_output, subtitle_output

import subprocess

# `spaces` is only importable when running on Hugging Face Spaces;
# fall back gracefully so the app also runs locally.
try:
    import spaces
    USING_SPACES = True
except ImportError:
    USING_SPACES = False

# Install flash-attn at startup; the env flag skips the CUDA build step
# (--no-build-isolation installs against the already-present torch).
subprocess.run(
    "pip install flash-attn --no-build-isolation",
    env={"FLASH_ATTENTION_SKIP_CUDA_BUILD": "TRUE"},
    shell=True,
)

# Download the Playwright browser binaries used for YouTube metadata scraping.
os.system("playwright install")

# Reject YouTube videos longer than this many seconds.
YT_LENGTH_LIMIT_S = 360
# GPU time (seconds) requested per call when running on Spaces.
SPACES_GPU_DURATION = 90

# transformers pipelines accept a CUDA device index (0) or the string "cpu".
device = 0 if torch.cuda.is_available() else "cpu"
def gpu_decorator(duration=60):
    """Decorator factory: request a Spaces GPU for *duration* seconds.

    On Hugging Face Spaces the wrapped function is handed to `spaces.GPU`;
    anywhere else the function is returned untouched.
    """
    def actual_decorator(func):
        # Nothing to request outside Spaces — return the function as-is.
        if not USING_SPACES:
            return func
        return spaces.GPU(duration=duration)(func)
    return actual_decorator
def device_info():
    """Print host diagnostics (disk, block devices, memory, CPU, GPU) to stdout.

    A single failing command aborts the remaining ones, matching the
    fail-fast behavior of running the commands back to back.
    """
    diagnostics = (
        ["df", "-h"],
        ["lsblk"],
        ["free", "-h"],
        ["lscpu"],
        ["nvidia-smi"],
    )
    try:
        for cmd in diagnostics:
            subprocess.run(cmd, check=True)
    except subprocess.CalledProcessError as e:
        print(f"Command failed: {e}")
@gpu_decorator(duration=SPACES_GPU_DURATION)
def transcribe(inputs, model, language, batch_size, chunk_length_s, stride_length_s, task, timestamp_mode, progress=gr.Progress(track_tqdm=True)):
    """Transcribe (or translate) a local media file with a Whisper checkpoint.

    Parameters
    ----------
    inputs : str
        Path to the uploaded/recorded media file.
    model : str
        Hugging Face model id, e.g. "openai/whisper-large-v3-turbo".
    language : str
        Source language name, or "Automatic Detection".
    batch_size, chunk_length_s, stride_length_s
        Chunked-inference parameters forwarded to the ASR pipeline.
    task : str
        "transcribe" or "translate".
    timestamp_mode : bool | str
        False for plain text, True for segment timestamps, "word" for word level.

    Returns
    -------
    (str, list[str])
        Rendered transcript text and the list of files written for download.

    Raises
    ------
    gr.Error
        On missing input or any pipeline failure (surfaced to the UI).
    """
    try:
        if inputs is None:
            raise gr.Error("No audio file submitted! Please upload or record an audio file before submitting your request.")

        torch_dtype = torch.float16

        model_gen = AutoModelForSpeechSeq2Seq.from_pretrained(
            model, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True
        )
        model_gen.to(device)

        processor = AutoProcessor.from_pretrained(model)
        tokenizer = WhisperTokenizer.from_pretrained(model)

        pipe = pipeline(
            task="automatic-speech-recognition",
            model=model_gen,
            chunk_length_s=chunk_length_s,
            stride_length_s=stride_length_s,
            tokenizer=tokenizer,
            feature_extractor=processor.feature_extractor,
            torch_dtype=torch_dtype,
            model_kwargs={"attn_implementation": "flash_attention_2"},
            device=device,
        )

        generate_kwargs = {}
        # English-only checkpoints (".en") accept neither a language nor a task.
        is_multilingual = not model.endswith(".en")
        if is_multilingual and language != "Automatic Detection":
            generate_kwargs["language"] = language
        if is_multilingual:
            generate_kwargs["task"] = task

        output = pipe(inputs, batch_size=batch_size, generate_kwargs=generate_kwargs, return_timestamps=timestamp_mode)

        print(output)
        print({"inputs": inputs, "model": model, "language": language, "batch_size": batch_size, "chunk_length_s": chunk_length_s, "stride_length_s": stride_length_s, "task": task, "timestamp_mode": timestamp_mode})

        if not timestamp_mode:
            return text_output(inputs, output['text'])
        return subtitle_output(inputs, output['chunks'])

    except Exception as e:
        # Re-raise every failure as a Gradio error so the UI displays it.
        raise gr.Error(str(e), duration=10)
def _download_yt_audio(yt_url, filename):
    """Download the audio track of a YouTube video to *filename*.

    Raises gr.Error when the video cannot be inspected, its duration is
    unavailable, or it exceeds YT_LENGTH_LIMIT_S seconds.
    """
    info_loader = youtube_dl.YoutubeDL()

    try:
        info = info_loader.extract_info(yt_url, download=False)
    except youtube_dl.utils.DownloadError as err:
        raise gr.Error(str(err))

    file_length = info.get("duration_string")
    if not file_length:
        raise gr.Error("Video duration is unavailable.")

    # duration_string is "S", "M:S" or "H:M:S" — left-pad to three fields.
    file_h_m_s = [int(part) for part in file_length.split(":")]
    while len(file_h_m_s) < 3:
        file_h_m_s.insert(0, 0)

    file_length_s = file_h_m_s[0] * 3600 + file_h_m_s[1] * 60 + file_h_m_s[2]

    if file_length_s > YT_LENGTH_LIMIT_S:
        # BUGFIX: "%HH:%MM:%SS" rendered literal H/M/S characters
        # ("00H:06M:00S"); "%H:%M:%S" is the intended HH:MM:SS format.
        yt_length_limit_hms = time.strftime("%H:%M:%S", time.gmtime(YT_LENGTH_LIMIT_S))
        file_length_hms = time.strftime("%H:%M:%S", time.gmtime(file_length_s))
        raise gr.Error(f"Maximum YouTube length is {yt_length_limit_hms}, got {file_length_hms} YouTube video.", duration=10)

    try:
        ydl_opts = {
            "outtmpl": filename,
            "format": "bestaudio[ext=m4a]/best",
        }
        with youtube_dl.YoutubeDL(ydl_opts) as ydl:
            ydl.download([yt_url])
    except youtube_dl.utils.ExtractorError:
        # Requested format missing: report what the extractor does offer.
        available_formats = info_loader.extract_info(yt_url, download=False)['formats']
        raise gr.Error(f"Requested format not available. Available formats: {available_formats}", duration=10)
143
+ def _return_yt_video_id(yt_url):
144
+ if "https://www.youtube.com/watch?v=" in yt_url:
145
+ video_id = yt_url.split("?v=")[-1]
146
+ elif "https://youtu.be/" in yt_url:
147
+ video_id = yt_url.split("be/")[1]
148
+ return video_id
149
+
def _return_yt_html_embed(yt_url):
    """Return centered <iframe> HTML embedding the given YouTube video."""
    video_id = _return_yt_video_id(yt_url)
    return (
        f'<center> <iframe width="500" height="320" src="https://www.youtube.com/embed/{video_id}"> </iframe>'
        " </center>"
    )
def _return_yt_thumbnail(yt_url):
    """Download the maxres thumbnail of a YouTube video to a temp file.

    Returns the local .jpg path, or None when the download fails.
    Raises ValueError when no video id can be extracted from *yt_url*.
    """
    video_id = _return_yt_video_id(yt_url)
    if not video_id:
        raise ValueError("Invalid YouTube URL: Unable to extract video ID.")
    thumbnail_url = f"https://img.youtube.com/vi/{video_id}/maxresdefault.jpg"
    try:
        # BUGFIX: fetch first (with a timeout), and only create the temp file
        # once we know the request succeeded — the original left an empty
        # stray delete=False temp file behind on every failed download.
        response = requests.get(thumbnail_url, timeout=30)
        if response.status_code != 200:
            raise Exception(f"Failed to retrieve thumbnail. Status code: {response.status_code}")
        with tempfile.NamedTemporaryFile(delete=False, suffix='.jpg') as temp_file:
            temp_file.write(response.content)
            return temp_file.name
    except Exception as e:
        print(f"Error occurred: {e}")
        return None
def _return_yt_info(yt_url):
    """Scrape title / description / keywords from a YouTube page via Playwright.

    Returns three visible gr.Textbox components; on any failure three hidden
    textboxes are returned so the UI simply omits the metadata.
    """
    # BUGFIX: the original computed an unused video_id *outside* the try
    # block, so a non-YouTube URL crashed before the exception guard could
    # produce the hidden-textbox fallback. The unused lookup is removed.
    try:
        with sync_playwright() as p:
            browser = p.chromium.launch(headless=True)
            page = browser.new_page()

            page.goto(yt_url)
            # Wait for network activity to settle so the meta tags exist.
            page.wait_for_load_state("networkidle")

            title = page.title()
            description = page.query_selector("meta[name='description']").get_attribute("content")
            keywords = page.query_selector("meta[name='keywords']").get_attribute("content")

            gr_title = gr.Textbox(label="YouTube Title", visible=True, value=title)
            gr_description = gr.Textbox(label="YouTube Description", visible=True, value=description)
            gr_keywords = gr.Textbox(label="YouTube Keywords", visible=True, value=keywords)

            browser.close()
            return gr_title, gr_description, gr_keywords
    except Exception as e:
        print(e)
        return gr.Textbox(visible=False), gr.Textbox(visible=False), gr.Textbox(visible=False)
def return_youtube(yt_url):
    """Build every YouTube-preview component (embed, thumbnail, metadata)."""
    embed_html = _return_yt_html_embed(yt_url)
    thumb_path = _return_yt_thumbnail(yt_url)
    gr_html = gr.HTML(label="Youtube Video", visible=True, value=embed_html)
    gr_thumbnail = gr.Image(label="Youtube Thumbnail", visible=True, value=thumb_path)
    title_box, description_box, keywords_box = _return_yt_info(yt_url)
    return gr_html, gr_thumbnail, title_box, description_box, keywords_box
@gpu_decorator(duration=SPACES_GPU_DURATION)
def yt_transcribe(yt_url, model, language, batch_size, chunk_length_s, stride_length_s, task, timestamp_mode):
    """Transcribe (or translate) the audio of a YouTube video.

    Returns (subtitle_text, files, title, html_embed, thumbnail, description,
    keywords). On failure the first two outputs become hidden textboxes and a
    gr.Warning is shown instead.
    """
    gr_html, gr_thumbnail, gr_title, gr_description, gr_keywords = return_youtube(yt_url)
    try:
        torch_dtype = torch.float16

        # BUGFIX: build the model/pipeline *before* decoding the audio — the
        # original read `pipe.feature_extractor.sampling_rate` many lines
        # before `pipe` was assigned, a guaranteed NameError.
        model_gen = AutoModelForSpeechSeq2Seq.from_pretrained(
            model, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True
        )
        model_gen.to(device)

        processor = AutoProcessor.from_pretrained(model)
        tokenizer = WhisperTokenizer.from_pretrained(model)

        pipe = pipeline(
            task="automatic-speech-recognition",
            model=model_gen,
            chunk_length_s=chunk_length_s,
            stride_length_s=stride_length_s,
            tokenizer=tokenizer,
            feature_extractor=processor.feature_extractor,
            torch_dtype=torch_dtype,
            model_kwargs={"attn_implementation": "flash_attention_2"},
            device=device,
        )

        with tempfile.TemporaryDirectory() as tmpdirname:
            filepath = os.path.join(tmpdirname, "video.mp4")
            _download_yt_audio(yt_url, filepath)
            with open(filepath, "rb") as f:
                raw_bytes = f.read()

            sampling_rate = pipe.feature_extractor.sampling_rate
            audio = ffmpeg_read(raw_bytes, sampling_rate)
            inputs = {"array": audio, "sampling_rate": sampling_rate}

            generate_kwargs = {}
            # English-only checkpoints (".en") accept neither language nor task.
            is_multilingual = not model.endswith(".en")
            if is_multilingual and language != "Automatic Detection":
                generate_kwargs["language"] = language
            if is_multilingual:
                generate_kwargs["task"] = task

            output = pipe(inputs, batch_size=batch_size, generate_kwargs=generate_kwargs, return_timestamps=timestamp_mode)

            print(output)
            print({"inputs": yt_url, "model": model, "language": language, "batch_size": batch_size, "chunk_length_s": chunk_length_s, "stride_length_s": stride_length_s, "task": task, "timestamp_mode": timestamp_mode})

            # BUGFIX: pass the media *path* (not the feature dict) — the
            # output helpers derive a file name via string splitting and the
            # original crashed on dict.split.
            if not timestamp_mode:
                subtitle, files = text_output(filepath, output['text'])
            else:
                subtitle, files = subtitle_output(filepath, output['chunks'])
            return subtitle, files, gr_title, gr_html, gr_thumbnail, gr_description, gr_keywords

    except Exception as e:
        gr.Warning(str(e), duration=10)
        return gr.Textbox(visible=False), gr.Textbox(visible=False), gr_title, gr_html, gr_thumbnail, gr_description, gr_keywords
demo = gr.Blocks()

# Tab 1: transcribe an uploaded or microphone-recorded audio file.
file_transcribe = gr.Interface(
    fn=transcribe,
    inputs=[
        gr.Audio(sources=['upload', 'microphone'], type="filepath", label="Audio file"),
        # Whisper checkpoint to load; custom model ids may also be typed in.
        gr.Dropdown(
            choices=[
                "openai/whisper-tiny",
                "openai/whisper-base",
                "openai/whisper-small",
                "openai/whisper-medium",
                "openai/whisper-large",
                "openai/whisper-large-v1",
                "openai/whisper-large-v2", "distil-whisper/distil-large-v2",
                "openai/whisper-large-v3", "openai/whisper-large-v3-turbo", "distil-whisper/distil-large-v3", "xaviviro/whisper-large-v3-catalan-finetuned-v2",
            ],
            value="openai/whisper-large-v3-turbo",
            label="Model Name",
            allow_custom_value=True,
        ),
        gr.Dropdown(choices=["Automatic Detection"] + sorted(get_language_names()), value="Automatic Detection", label="Language", interactive = True,),
        gr.Slider(label="Batch Size", minimum=1, maximum=32, value=16, step=1),
        gr.Slider(label="Chunk Length (s)", minimum=1, maximum=60, value=17.5, step=0.1),
        gr.Slider(label="Stride Length (s)", minimum=1, maximum=30, value=1, step=0.1),
        gr.Radio(["transcribe", "translate"], label="Task", value="transcribe"),
        # True = segment timestamps, False = plain text, "word" = word-level.
        gr.Dropdown(
            choices=[True, False, "word"],
            value=True,
            label="Timestamp Mode"
        ),
    ],
    outputs=[gr.Textbox(label="Output"), gr.File(label="Download Files")],
    title="Whisper: Transcribe Audio",
    flagging_mode="auto",
)
# Tab 2: transcribe an uploaded or webcam-recorded video file.
# Shares `transcribe` with the audio tab; the pipeline decodes the
# video's audio track via ffmpeg.
video_transcribe = gr.Interface(
    fn=transcribe,
    inputs=[
        gr.Video(sources=["upload", "webcam"], label="Video file", show_label=False, show_download_button=False, show_share_button=False, streaming=True),
        # Whisper checkpoint to load; custom model ids may also be typed in.
        gr.Dropdown(
            choices=[
                "openai/whisper-tiny",
                "openai/whisper-base",
                "openai/whisper-small",
                "openai/whisper-medium",
                "openai/whisper-large",
                "openai/whisper-large-v1",
                "openai/whisper-large-v2", "distil-whisper/distil-large-v2",
                "openai/whisper-large-v3", "openai/whisper-large-v3-turbo", "distil-whisper/distil-large-v3", "xaviviro/whisper-large-v3-catalan-finetuned-v2",
            ],
            value="openai/whisper-large-v3-turbo",
            label="Model Name",
            allow_custom_value=True,
        ),
        gr.Dropdown(choices=["Automatic Detection"] + sorted(get_language_names()), value="Automatic Detection", label="Language", interactive = True,),
        gr.Slider(label="Batch Size", minimum=1, maximum=32, value=16, step=1),
        gr.Slider(label="Chunk Length (s)", minimum=1, maximum=60, value=17.5, step=0.1),
        gr.Slider(label="Stride Length (s)", minimum=1, maximum=30, value=1, step=0.1),
        gr.Radio(["transcribe", "translate"], label="Task", value="transcribe"),
        # True = segment timestamps, False = plain text, "word" = word-level.
        gr.Dropdown(
            choices=[True, False, "word"],
            value=True,
            label="Timestamp Mode"
        ),
    ],
    outputs=[gr.Textbox(label="Output"), gr.File(label="Download Files")],
    title="Whisper: Transcribe Video",
    flagging_mode="auto",
)
# Tab 3: transcribe a YouTube video by URL.
# NOTE: this rebinds the module name `yt_transcribe` from the function above
# to the Interface. gr.Interface captured the function reference at
# construction time, so behavior is unaffected — but the function is no
# longer reachable by name after this line.
yt_transcribe = gr.Interface(
    fn=yt_transcribe,
    inputs=[
        gr.Textbox(lines=1, placeholder="Paste the URL to a YouTube video here", label="YouTube URL"),
        # Whisper checkpoint to load; custom model ids may also be typed in.
        gr.Dropdown(
            choices=[
                "openai/whisper-tiny",
                "openai/whisper-base",
                "openai/whisper-small",
                "openai/whisper-medium",
                "openai/whisper-large",
                "openai/whisper-large-v1",
                "openai/whisper-large-v2", "distil-whisper/distil-large-v2",
                "openai/whisper-large-v3", "openai/whisper-large-v3-turbo", "distil-whisper/distil-large-v3", "xaviviro/whisper-large-v3-catalan-finetuned-v2",
            ],
            value="openai/whisper-large-v3-turbo",
            label="Model Name",
            allow_custom_value=True,
        ),
        gr.Dropdown(choices=["Automatic Detection"] + sorted(get_language_names()), value="Automatic Detection", label="Language", interactive = True,),
        gr.Slider(label="Batch Size", minimum=1, maximum=32, value=16, step=1),
        gr.Slider(label="Chunk Length (s)", minimum=1, maximum=60, value=17.5, step=0.1),
        gr.Slider(label="Stride Length (s)", minimum=1, maximum=30, value=1, step=0.1),
        gr.Radio(["transcribe", "translate"], label="Task", value="transcribe"),
        # True = segment timestamps, False = plain text, "word" = word-level.
        gr.Dropdown(
            choices=[True, False, "word"],
            value=True,
            label="Timestamp Mode"
        ),
    ],
    outputs=[
        gr.Textbox(label="Output"),
        gr.File(label="Download Files"),
        gr.Textbox(label="Youtube Title"),
        gr.HTML(label="Youtube Video"),
        gr.Image(label="Youtube Thumbnail"),
        gr.Textbox(label="Youtube Description"),
        gr.Textbox(label="Youtube Keywords"),
    ],
    title="Whisper: Transcribe YouTube",
    flagging_mode="auto",
)
# Assemble the three interfaces as tabs inside the shared Blocks app.
with demo:
    gr.TabbedInterface(
        interface_list=[file_transcribe, video_transcribe, yt_transcribe],
        tab_names=["Audio", "Video", "YouTube"]
    )

if __name__ == "__main__":
    # queue() enables request queuing for the GPU-bound handlers.
    demo.queue().launch(ssr_mode=False)
languages.py ADDED
@@ -0,0 +1,147 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
class Language():
    """A spoken language: an ISO-style code paired with its English name."""

    def __init__(self, code, name):
        # Short language code, e.g. "en".
        self.code = code
        # Human-readable English name, e.g. "English".
        self.name = name

    def __str__(self):
        return f"Language(code={self.code}, name={self.name})"
# The full set of languages supported by the app's language dropdowns.
# Each entry pairs a short code with its English display name.
LANGUAGES = [
    Language('en', 'English'),
    Language('zh', 'Chinese'),
    Language('de', 'German'),
    Language('es', 'Spanish'),
    Language('ru', 'Russian'),
    Language('ko', 'Korean'),
    Language('fr', 'French'),
    Language('ja', 'Japanese'),
    Language('pt', 'Portuguese'),
    Language('tr', 'Turkish'),
    Language('pl', 'Polish'),
    Language('ca', 'Catalan'),
    Language('nl', 'Dutch'),
    Language('ar', 'Arabic'),
    Language('sv', 'Swedish'),
    Language('it', 'Italian'),
    Language('id', 'Indonesian'),
    Language('hi', 'Hindi'),
    Language('fi', 'Finnish'),
    Language('vi', 'Vietnamese'),
    Language('he', 'Hebrew'),
    Language('uk', 'Ukrainian'),
    Language('el', 'Greek'),
    Language('ms', 'Malay'),
    Language('cs', 'Czech'),
    Language('ro', 'Romanian'),
    Language('da', 'Danish'),
    Language('hu', 'Hungarian'),
    Language('ta', 'Tamil'),
    Language('no', 'Norwegian'),
    Language('th', 'Thai'),
    Language('ur', 'Urdu'),
    Language('hr', 'Croatian'),
    Language('bg', 'Bulgarian'),
    Language('lt', 'Lithuanian'),
    Language('la', 'Latin'),
    Language('mi', 'Maori'),
    Language('ml', 'Malayalam'),
    Language('cy', 'Welsh'),
    Language('sk', 'Slovak'),
    Language('te', 'Telugu'),
    Language('fa', 'Persian'),
    Language('lv', 'Latvian'),
    Language('bn', 'Bengali'),
    Language('sr', 'Serbian'),
    Language('az', 'Azerbaijani'),
    Language('sl', 'Slovenian'),
    Language('kn', 'Kannada'),
    Language('et', 'Estonian'),
    Language('mk', 'Macedonian'),
    Language('br', 'Breton'),
    Language('eu', 'Basque'),
    Language('is', 'Icelandic'),
    Language('hy', 'Armenian'),
    Language('ne', 'Nepali'),
    Language('mn', 'Mongolian'),
    Language('bs', 'Bosnian'),
    Language('kk', 'Kazakh'),
    Language('sq', 'Albanian'),
    Language('sw', 'Swahili'),
    Language('gl', 'Galician'),
    Language('mr', 'Marathi'),
    Language('pa', 'Punjabi'),
    Language('si', 'Sinhala'),
    Language('km', 'Khmer'),
    Language('sn', 'Shona'),
    Language('yo', 'Yoruba'),
    Language('so', 'Somali'),
    Language('af', 'Afrikaans'),
    Language('oc', 'Occitan'),
    Language('ka', 'Georgian'),
    Language('be', 'Belarusian'),
    Language('tg', 'Tajik'),
    Language('sd', 'Sindhi'),
    Language('gu', 'Gujarati'),
    Language('am', 'Amharic'),
    Language('yi', 'Yiddish'),
    Language('lo', 'Lao'),
    Language('uz', 'Uzbek'),
    Language('fo', 'Faroese'),
    Language('ht', 'Haitian creole'),
    Language('ps', 'Pashto'),
    Language('tk', 'Turkmen'),
    Language('nn', 'Nynorsk'),
    Language('mt', 'Maltese'),
    Language('sa', 'Sanskrit'),
    Language('lb', 'Luxembourgish'),
    Language('my', 'Myanmar'),
    Language('bo', 'Tibetan'),
    Language('tl', 'Tagalog'),
    Language('mg', 'Malagasy'),
    Language('as', 'Assamese'),
    Language('tt', 'Tatar'),
    Language('haw', 'Hawaiian'),
    Language('ln', 'Lingala'),
    Language('ha', 'Hausa'),
    Language('ba', 'Bashkir'),
    Language('jw', 'Javanese'),
    Language('su', 'Sundanese')
]
# Lookup table: language code (or known alias name) -> Language.
# BUGFIX: the alias keys previously mapped to bare code strings while the
# primary keys mapped to Language objects, so get_language_from_code had an
# inconsistent return type. Aliases now resolve to the same Language objects.
_TO_LANGUAGE_CODE = {
    **{language.code: language for language in LANGUAGES},
}
_TO_LANGUAGE_CODE.update({
    alias: _TO_LANGUAGE_CODE[code]
    for alias, code in {
        "burmese": "my",
        "valencian": "ca",
        "flemish": "nl",
        "haitian": "ht",
        "letzeburgesch": "lb",
        "pushto": "ps",
        "panjabi": "pa",
        "moldavian": "ro",
        "moldovan": "ro",
        "sinhalese": "si",
        "castilian": "es",
    }.items()
})
# Lookup table: lower-cased English name -> Language.
_FROM_LANGUAGE_NAME = {language.name.lower(): language for language in LANGUAGES}
def get_language_from_code(language_code, default=None) -> Language:
    """Return the Language registered under *language_code*, else *default*.

    NOTE(review): _TO_LANGUAGE_CODE also holds alias-name keys (e.g.
    "burmese") that may map to bare code strings rather than Language
    objects — callers relying on the declared return type should verify.
    """
    return _TO_LANGUAGE_CODE.get(language_code, default)
def get_language_from_name(language, default=None) -> Language:
    """Return the Language whose English name matches *language* (case-insensitive).

    Returns *default* when the name is unknown or *language* is falsy.
    """
    return _FROM_LANGUAGE_NAME.get(language.lower() if language else None, default)
def get_language_names():
    """Return the English display names of all supported languages, in table order."""
    return [entry.name for entry in LANGUAGES]
if __name__ == "__main__":
    # Smoke test: exercise both lookup directions and the name listing.
    print(get_language_from_code('en'))
    print(get_language_from_name('English'))

    print(get_language_names())
packages.txt ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ffmpeg
2
+ libnss3
3
+ libnspr4
4
+ libatk1.0-0
5
+ libatk-bridge2.0-0
6
+ libcups2
7
+ libxcomposite1
8
+ libxdamage1
9
+ libxrandr2
10
+ libgbm1
11
+ libpango-1.0-0
12
+ libpangocairo-1.0-0
13
+ libasound2
14
+ libxshmfence1
15
+ libx11-xcb1
16
+ libxext6
17
+ libxtst6
18
+ libxinerama1
19
+ libwayland-client0
20
+ libwayland-cursor0
21
+ libwayland-egl1
22
+ libdbus-1-3
23
+ libatspi2.0-0
24
+ libdrm2
25
+ libgtk-3-0
26
+ libgdk-pixbuf2.0-0
27
+ libgstreamer1.0-0
28
+ libwoff1
29
+ libgstreamer-plugins-base1.0-0
30
+ libgstreamer-gl1.0-0
31
+ libharfbuzz-icu0
32
+ libenchant-2-2
33
+ libsecret-1-0
34
+ libhyphen0
35
+ libmanette-0.2-0
36
+ libgles2
37
+ libgstreamer1.0-0
38
+ libgstreamer-plugins-base1.0-0
39
+ gstreamer1.0-plugins-good
40
+ gstreamer1.0-plugins-bad
41
+ gstreamer1.0-plugins-ugly
42
+
requirements.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ transformers
2
+ pydub
3
+ yt-dlp
4
+ accelerate
5
+ playwright
subtitle.py ADDED
@@ -0,0 +1,101 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
class Subtitle:
    """Render timestamped ASR segments into srt / vtt / txt / lrc text.

    Each segment is a dict with a 'text' string and a 'timestamp'
    (start, end) pair of seconds; end may be None for the final chunk.
    """

    def __init__(self, ext="srt"):
        sub_dict = {
            "srt": {
                "coma": ",",
                "header": "",
                "format": self._srt_format,
            },
            "vtt": {
                "coma": ".",
                # BUGFIX: the WebVTT spec requires files to begin with the
                # exact signature "WEBVTT" (was "WebVTT", which players reject).
                "header": "WEBVTT\n\n",
                "format": self._vtt_format,
            },
            "txt": {
                "coma": "",
                "header": "",
                "format": self._txt_format,
            },
            "lrc": {
                "coma": "",
                "header": "",
                "format": self._lrc_format,
            },
        }

        self.ext = ext  # output file extension
        self.coma = sub_dict[ext]["coma"]  # millisecond separator for this format
        self.header = sub_dict[ext]["header"]  # file header, if the format has one
        self.format_fn = sub_dict[ext]["format"]  # per-segment renderer

    def timeformat(self, time):
        """Format seconds as HH:MM:SS<sep>mmm using this format's separator."""
        hours, remainder = divmod(time, 3600)
        minutes, seconds = divmod(remainder, 60)
        milliseconds = (time - int(time)) * 1000
        return f"{int(hours):02d}:{int(minutes):02d}:{int(seconds):02d}{self.coma}{int(milliseconds):03d}"

    def seconds_to_lrc_timestamp(self, time):
        """Format seconds as an LRC [MM:SS.mmm] tag."""
        minutes = int(time // 60)
        secs = time % 60
        return f"[{minutes:02}:{secs:06.3f}]"

    def _end_timestamp(self, segment):
        """Return the segment end time, falling back to its start when absent.

        BUGFIX: the original used truthiness, so a legitimate 0.0 end time was
        also replaced; only None (missing) now triggers the fallback.
        """
        start, end = segment['timestamp']
        return end if end is not None else start

    def _srt_format(self, i, segment):
        start_time = self.timeformat(segment['timestamp'][0])
        end_time = self.timeformat(self._end_timestamp(segment))
        return f"{i + 1}\n{start_time} --> {end_time}\n{segment['text']}\n\n"

    def _vtt_format(self, i, segment):
        start_time = self.timeformat(segment['timestamp'][0])
        end_time = self.timeformat(self._end_timestamp(segment))
        return f"{start_time} --> {end_time}\n{segment['text']}\n\n"

    def _txt_format(self, i, segment):
        return f"{segment['text']}\n"

    def _lrc_format(self, i, segment):
        start_time = self.seconds_to_lrc_timestamp(segment['timestamp'][0])
        return f"{start_time}{segment['text']}\n"

    def get_subtitle(self, segments):
        """Render *segments* into one subtitle string (header included)."""
        parts = [self.header]
        for i, segment in enumerate(segments):
            # BUGFIX: strip on a shallow copy so the caller's segment dicts
            # are no longer mutated as a side effect of rendering.
            entry = {**segment, 'text': segment['text'].lstrip()}
            try:
                parts.append(self.format_fn(i, entry))
            except Exception as e:
                # Best effort: skip malformed segments but report them.
                print(e, entry)
        return "".join(parts)

    def write_subtitle(self, segments, output_file):
        """Render *segments* and write them to "<output_file>.<ext>" (UTF-8)."""
        output_file_with_ext = f"{output_file}.{self.ext}"
        subtitle = self.get_subtitle(segments)

        with open(output_file_with_ext, 'w', encoding='utf-8') as f:
            f.write(subtitle)
def write_file(output_file, subtitle):
    """Write *subtitle* text to *output_file* using UTF-8 encoding."""
    with open(output_file, mode='w', encoding='utf-8') as out_fh:
        out_fh.write(subtitle)
def subtitle_output(inputs, chunks):
    """Render *chunks* as LRC/SRT/VTT/TXT files named after the input media.

    Parameters
    ----------
    inputs : str
        Path to the source media; only its base name (final extension
        stripped) names the output files, written to the working directory.
    chunks : list[dict]
        Timestamped segments as produced by the ASR pipeline.

    Returns
    -------
    (str, list[str])
        The LRC text (shown in the UI) and the four written file paths.
    """
    import os

    # BUGFIX: split('.')[0] truncated dotted names ("my.video.mp4" -> "my")
    # and '/' splitting broke on OS-specific separators; splitext/basename
    # strip only the final extension, portably.
    file_name = os.path.splitext(os.path.basename(inputs))[0]

    rendered = {}
    files_out = []
    for ext in ("lrc", "srt", "vtt", "txt"):
        rendered[ext] = Subtitle(ext).get_subtitle(chunks)
        out_path = f"{file_name}.{ext}"
        write_file(out_path, rendered[ext])
        files_out.append(out_path)
    # LRC goes to the textbox, same as before the refactor.
    return rendered["lrc"], files_out
def text_output(inputs, text):
    """Write *text* to "<input base name>.txt" and return (text, [path]).

    *inputs* is the source media path; only its base name (final extension
    stripped) is used, and the .txt file is written to the working directory.
    """
    import os

    # BUGFIX: splitext keeps dotted base names intact, unlike split('.')[0].
    file_name = os.path.splitext(os.path.basename(inputs))[0]
    txt_path = file_name + ".txt"
    write_file(txt_path, text)
    return text, [txt_path]