BatuhanYilmaz committed on
Commit
5ca0a1c
1 Parent(s): 9c50460

Upload 3 files

Browse files
Files changed (3) hide show
  1. app.py +259 -0
  2. languages.py +101 -0
  3. utils.py +96 -0
app.py ADDED
@@ -0,0 +1,259 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import whisper
2
+ from pytube import YouTube
3
+ import requests
4
+ import time
5
+ import streamlit as st
6
+ from streamlit_lottie import st_lottie
7
+ import numpy as np
8
+ import os
9
+ from typing import Iterator
10
+ from io import StringIO
11
+ from utils import write_vtt, write_srt
12
+ import ffmpeg
13
+ from languages import LANGUAGES
14
+
15
# Page title, tab icon and wide layout for the app.
st.set_page_config(
    page_title="Auto Subtitled Video Generator",
    page_icon=":movie_camera:",
    layout="wide",
)
16
+
17
# Define a function that we can use to load lottie files from a link.
@st.cache(allow_output_mutation=True)
def load_lottieurl(url: str):
    """Fetch a Lottie animation description (JSON) from ``url``.

    Args:
        url: Address of the Lottie JSON file.

    Returns:
        The decoded JSON payload, or ``None`` when the request fails or times
        out, the server does not answer with HTTP 200, or the body is not
        valid JSON.
    """
    try:
        # Without a timeout a stalled server would block the page forever.
        r = requests.get(url, timeout=10)
    except requests.RequestException:
        return None
    if r.status_code != 200:
        return None
    try:
        return r.json()
    except ValueError:
        # requests signals an undecodable body with a ValueError subclass.
        return None
24
+
25
# Header row: a narrow column for the animation and a wide one (1:3) for the
# introduction text.
col1, col2 = st.columns([1, 3])

with col1:
    lottie = load_lottieurl(
        "https://assets8.lottiefiles.com/packages/lf20_jh9gfdye.json"
    )
    st_lottie(
        lottie,
        speed=1,
        height=250,
        width=250,
    )

with col2:
    st.write("""
    ## Auto Subtitled Video Generator
    ##### Input a YouTube video link and get a video with subtitles.
    ###### ➠ If you want to transcribe the video in its original language, select the task as "Transcribe"
    ###### ➠ If you want to translate the subtitles to English, select the task as "Translate"
    ###### I recommend starting with the base model and then experimenting with the larger models, the small and medium models often work well. """)
37
+
38
+
39
@st.cache(allow_output_mutation=True)
def populate_metadata(link):
    """Collect basic metadata for the YouTube video at ``link``.

    Returns:
        A tuple ``(author, title, description, thumbnail, length, views)``
        read from the matching attributes of a pytube ``YouTube`` object
        (``thumbnail`` comes from ``thumbnail_url``).
    """
    yt = YouTube(link)
    return (
        yt.author,
        yt.title,
        yt.description,
        yt.thumbnail_url,
        yt.length,
        yt.views,
    )
49
+
50
+
51
@st.cache(allow_output_mutation=True)
def download_video(link):
    """Download the highest-resolution progressive MP4 stream of ``link``.

    Args:
        link: URL of the YouTube video.

    Returns:
        Whatever pytube's ``Stream.download()`` returns for the chosen stream
        (the caller passes it on to ``st.video`` and ffmpeg).

    Raises:
        ValueError: If the video offers no progressive MP4 stream.
    """
    yt = YouTube(link)
    stream = (
        yt.streams
        .filter(progressive=True, file_extension='mp4')
        .order_by('resolution')
        .desc()
        .first()
    )
    if stream is None:
        # first() yields None for an empty query; fail with a clear message
        # instead of an AttributeError on None.download().
        raise ValueError("No progressive MP4 stream is available for this video.")
    return stream.download()
56
+
57
+
58
def convert(seconds):
    """Format a number of seconds as an ``HH:MM:SS`` clock string.

    The value is interpreted through ``time.gmtime``, so only the
    time-of-day part is kept (24 hours wrap around to ``00:00:00``).
    """
    broken_down = time.gmtime(seconds)
    return time.strftime("%H:%M:%S", broken_down)
60
+
61
+
62
# Placeholder "current" size handed to change_model(); the literal string
# "None" matches none of the sizes offered in main().
current_size = "None"
# Whisper "base" model loaded when the script runs.
loaded_model = whisper.load_model("base")
64
+
65
+
66
@st.cache(allow_output_mutation=True)
def change_model(current_size, size):
    """Load the Whisper model called ``size`` unless it is already current.

    Args:
        current_size: Name of the model size considered loaded.
        size: Name of the model size to load.

    Returns:
        The model returned by ``whisper.load_model(size)``.

    Raises:
        Exception: If ``size`` equals ``current_size``.
    """
    if current_size == size:
        raise Exception("Model size is the same as the current size.")
    return whisper.load_model(size)
73
+
74
+
75
@st.cache(allow_output_mutation=True)
def inference(link, loaded_model, task):
    """Download the audio of ``link`` and run Whisper on it.

    The first audio-only stream is saved as ``audio.mp4`` before the task is
    checked, then ``loaded_model.transcribe`` is called with ``best_of=5``.

    Args:
        link: URL of the YouTube video.
        loaded_model: Whisper model used for the transcription.
        task: Either ``"Transcribe"`` or ``"Translate"``.

    Returns:
        A tuple ``(text, vtt, srt, language)``: the full transcript, the
        subtitles rendered as WebVTT and as SRT (lines wrapped at 80
        characters), and the language reported by the model.

    Raises:
        ValueError: If ``task`` is neither of the two supported values.
    """
    yt = YouTube(link)
    path = yt.streams.filter(only_audio=True)[0].download(filename="audio.mp4")

    # Both tasks share the same pipeline; only Whisper's task name differs.
    if task == "Transcribe":
        whisper_task = "transcribe"
    elif task == "Translate":
        whisper_task = "translate"
    else:
        raise ValueError("Task not supported")

    options = dict(task=whisper_task, best_of=5)
    results = loaded_model.transcribe(path, **options)
    vtt = getSubs(results["segments"], "vtt", 80)
    srt = getSubs(results["segments"], "srt", 80)
    lang = results["language"]
    return results["text"], vtt, srt, lang
95
+
96
+
97
@st.cache(allow_output_mutation=True)
def getSubs(segments: Iterator[dict], format: str, maxLineWidth: int) -> str:
    """Render transcript ``segments`` as subtitle text.

    Args:
        segments: Transcript segments handed to the subtitle writer.
        format: ``'vtt'`` or ``'srt'``.
        maxLineWidth: Line width forwarded to the writer for wrapping.

    Returns:
        The complete subtitle document as one string.

    Raises:
        Exception: If ``format`` is neither ``'vtt'`` nor ``'srt'``.
    """
    buffer = StringIO()

    if format == 'vtt':
        write_vtt(segments, file=buffer, maxLineWidth=maxLineWidth)
    elif format == 'srt':
        write_srt(segments, file=buffer, maxLineWidth=maxLineWidth)
    else:
        raise Exception("Unknown format " + format)

    # Everything written so far, regardless of the current stream position.
    return buffer.getvalue()
110
+
111
+
112
def get_language_code(language):
    """Translate a language key into its entry of the ``LANGUAGES`` table.

    Raises:
        ValueError: If ``language`` is not a key of ``LANGUAGES``.
    """
    if language not in LANGUAGES:
        raise ValueError("Language not supported")
    return LANGUAGES[language]
118
+
119
+
120
def generate_subtitled_video(video, audio, transcript):
    """Burn ``transcript`` into ``video`` and combine it with ``audio``.

    The result is written to ``final.mp4`` (overwriting an existing file) by
    running ffmpeg quietly with the ``subtitles`` filter on the video input.

    Args:
        video: Video input handed to ``ffmpeg.input``.
        audio: Audio input handed to ``ffmpeg.input``.
        transcript: Subtitle file passed to the ``subtitles`` filter.

    Returns:
        ``final.mp4`` opened for binary reading. The handle is left open, so
        the caller is responsible for closing it.
    """
    video_input = ffmpeg.input(video)
    audio_input = ffmpeg.input(audio)
    subtitled = video_input.filter("subtitles", transcript)
    merged = ffmpeg.concat(subtitled, audio_input, v=1, a=1)
    merged.output("final.mp4").run(quiet=True, overwrite_output=True)
    return open("final.mp4", "rb")
126
+
127
+
128
def _write_and_read(filename, text):
    """Write ``text`` to ``filename`` as UTF-8 and return the file's raw bytes."""
    with open(filename, "w", encoding='utf8') as f:
        f.write(text)
    with open(filename, "rb") as f:
        return f.read()


def _render_results(link, loaded_model, task):
    """Process ``link`` with ``task`` and render the players and download buttons."""
    # Only the title is needed; it names the downloadable video file.
    title = populate_metadata(link)[1]
    results = inference(link, loaded_model, task)
    video = download_video(link)
    # Raises ValueError when the detected language is not in LANGUAGES.
    get_language_code(results[3])

    col3, col4 = st.columns(2)
    col5, col6, col7, col8 = st.columns(4)
    col9, col10 = st.columns(2)
    with col3:
        st.video(video)

    # The files stay in the working directory; transcript.srt is read again
    # below when the subtitles are burned into the video.
    datatxt = _write_and_read("transcript.txt", results[0])
    datavtt = _write_and_read("transcript.vtt", results[1])
    datasrt = _write_and_read("transcript.srt", results[2])

    with col5:
        st.download_button(label="Download Transcript (.txt)",
                           data=datatxt,
                           file_name="transcript.txt")
    with col6:
        st.download_button(label="Download Transcript (.vtt)",
                           data=datavtt,
                           file_name="transcript.vtt")
    with col7:
        st.download_button(label="Download Transcript (.srt)",
                           data=datasrt,
                           file_name="transcript.srt")
    with col9:
        st.success("You can download the transcript in .srt format, edit it (if you need to) and upload it to YouTube to create subtitles for your video.")
    with col10:
        st.info("Streamlit refreshes after the download button is clicked. The data is cached so you can download the transcript again without having to transcribe the video again.")

    with col4:
        with st.spinner("Generating Subtitled Video"):
            # Read the bytes once and close the handle right away so no open
            # file is left behind on every run.
            with generate_subtitled_video(video, "audio.mp4", "transcript.srt") as f:
                video_with_subs = f.read()
        st.video(video_with_subs)
        st.balloons()
    with col8:
        st.download_button(label="Download Subtitled Video",
                           data=video_with_subs,
                           file_name=f"{title} with subtitles.mp4")


def main():
    """Render the controls and run the selected task when its button is pressed.

    The user picks a Whisper model size, enters a YouTube link and chooses
    between transcribing and translating to English. Pressing the task button
    shows the original video, the subtitled video and download buttons for the
    transcript (.txt, .vtt, .srt) and the subtitled video.
    """
    size = st.selectbox("Select Model Size (The larger the model, the more accurate the transcription will be, but it will take longer)", ["tiny", "base", "small", "medium", "large"], index=1)
    loaded_model = change_model(current_size, size)
    st.write(f"Model is {'multilingual' if loaded_model.is_multilingual else 'English-only'} "
             f"and has {sum(np.prod(p.shape) for p in loaded_model.parameters()):,} parameters.")
    link = st.text_input("YouTube Link (The longer the video, the longer the processing time)")
    task = st.selectbox("Select Task", ["Transcribe", "Translate"], index=0)

    # Both tasks run the same pipeline; only the button caption differs.
    if task == "Transcribe":
        button_label = "Transcribe"
    elif task == "Translate":
        button_label = "Translate to English"
    else:
        st.error("Please select a task.")
        return

    if st.button(button_label):
        if not link.strip():
            # An empty link would only produce a pytube parsing error.
            st.warning("Please enter a YouTube link.")
            return
        _render_results(link, loaded_model, task)
255
+
256
+
257
if __name__ == "__main__":
    # Run the app, then render the author credit underneath it.
    main()
    footer = "###### Made with :heart: by [@BatuhanYılmaz](https://twitter.com/batuhan3326) [![this is an image link](https://i.imgur.com/thJhzOO.png)](https://www.buymeacoffee.com/batuhanylmz)"
    st.markdown(footer)
languages.py ADDED
@@ -0,0 +1,101 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ LANGUAGES = {
2
+ "en": "eng",
3
+ "zh": "zho",
4
+ "de": "deu",
5
+ "es": "spa",
6
+ "ru": "rus",
7
+ "ko": "kor",
8
+ "fr": "fra",
9
+ "ja": "jpn",
10
+ "pt": "por",
11
+ "tr": "tur",
12
+ "pl": "pol",
13
+ "ca": "cat",
14
+ "nl": "nld",
15
+ "ar": "ara",
16
+ "sv": "swe",
17
+ "it": "ita",
18
+ "id": "ind",
19
+ "hi": "hin",
20
+ "fi": "fin",
21
+ "vi": "vie",
22
+ "iw": "heb",
23
+ "uk": "ukr",
24
+ "el": "ell",
25
+ "ms": "msa",
26
+ "cs": "ces",
27
+ "ro": "ron",
28
+ "da": "dan",
29
+ "hu": "hun",
30
+ "ta": "tam",
31
+ "no": "nor",
32
+ "th": "tha",
33
+ "ur": "urd",
34
+ "hr": "hrv",
35
+ "bg": "bul",
36
+ "lt": "lit",
37
+ "la": "lat",
38
+ "mi": "mri",
39
+ "ml": "mal",
40
+ "cy": "cym",
41
+ "sk": "slk",
42
+ "te": "tel",
43
+ "fa": "fas",
44
+ "lv": "lav",
45
+ "bn": "ben",
46
+ "sr": "srp",
47
+ "az": "aze",
48
+ "sl": "slv",
49
+ "kn": "kan",
50
+ "et": "est",
51
+ "mk": "mkd",
52
+ "br": "bre",
53
+ "eu": "eus",
54
+ "is": "isl",
55
+ "hy": "hye",
56
+ "ne": "nep",
57
+ "mn": "mon",
58
+ "bs": "bos",
59
+ "kk": "kaz",
60
+ "sq": "sqi",
61
+ "sw": "swa",
62
+ "gl": "glg",
63
+ "mr": "mar",
64
+ "pa": "pan",
65
+ "si": "sin",
66
+ "km": "khm",
67
+ "sn": "sna",
68
+ "yo": "yor",
69
+ "so": "som",
70
+ "af": "afr",
71
+ "oc": "oci",
72
+ "ka": "kat",
73
+ "be": "bel",
74
+ "tg": "tgk",
75
+ "sd": "snd",
76
+ "gu": "guj",
77
+ "am": "amh",
78
+ "yi": "yid",
79
+ "lo": "lao",
80
+ "uz": "uzb",
81
+ "fo": "fao",
82
+ "ht": "hat",
83
+ "ps": "pus",
84
+ "tk": "tuk",
85
+ "nn": "nno",
86
+ "mt": "mlt",
87
+ "sa": "san",
88
+ "lb": "ltz",
89
+ "my": "mya",
90
+ "bo": "bod",
91
+ "tl": "tgl",
92
+ "mg": "mlg",
93
+ "as": "asm",
94
+ "tt": "tat",
95
+ "haw": "haw",
96
+ "ln": "lin",
97
+ "ha": "hau",
98
+ "ba": "bak",
99
+ "jw": "jav",
100
+ "su": "sun",
101
+ }
utils.py ADDED
@@ -0,0 +1,96 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import textwrap
2
+ import zlib
3
+ from typing import Iterator, TextIO
4
+
5
+
6
def exact_div(x, y):
    """Return ``x // y``, asserting that ``y`` divides ``x`` without remainder."""
    quotient, remainder = divmod(x, y)
    assert remainder == 0
    return quotient
9
+
10
+
11
def str2bool(string):
    """Convert the exact strings ``"True"`` / ``"False"`` to booleans.

    Raises:
        ValueError: For any other input.
    """
    str2val = {"True": True, "False": False}
    if string not in str2val:
        raise ValueError(f"Expected one of {set(str2val.keys())}, got {string}")
    return str2val[string]
17
+
18
+
19
def optional_int(string):
    """Parse ``string`` as an int; the literal ``"None"`` yields ``None``."""
    if string == "None":
        return None
    return int(string)
21
+
22
+
23
def optional_float(string):
    """Parse ``string`` as a float; the literal ``"None"`` yields ``None``."""
    if string == "None":
        return None
    return float(string)
25
+
26
+
27
def compression_ratio(text) -> float:
    """Return how well ``text`` compresses with zlib.

    The ratio is the size of the UTF-8 encoded text divided by the size of
    its zlib-compressed form; both sizes are measured in bytes so that
    non-ASCII text is not under-counted.
    """
    text_bytes = text.encode("utf-8")
    return len(text_bytes) / len(zlib.compress(text_bytes))
29
+
30
+
31
def format_timestamp(seconds: float, always_include_hours: bool = False, fractionalSeperator: str = '.'):
    """Format ``seconds`` as ``[HH:]MM:SS<sep>mmm``.

    Args:
        seconds: Non-negative time offset; rounded to whole milliseconds.
        always_include_hours: Emit the hours field even when it is zero.
        fractionalSeperator: Text placed between seconds and milliseconds.

    Returns:
        The formatted timestamp; the hours field is omitted when it is zero
        unless ``always_include_hours`` is set.
    """
    assert seconds >= 0, "non-negative timestamp expected"
    total_ms = round(seconds * 1000.0)

    # Peel off hours, minutes and seconds; what is left are milliseconds.
    hours, remainder = divmod(total_ms, 3_600_000)
    minutes, remainder = divmod(remainder, 60_000)
    secs, millis = divmod(remainder, 1_000)

    if always_include_hours or hours > 0:
        hours_marker = f"{hours:02d}:"
    else:
        hours_marker = ""
    return f"{hours_marker}{minutes:02d}:{secs:02d}{fractionalSeperator}{millis:03d}"
46
+
47
+
48
def write_txt(transcript: Iterator[dict], file: TextIO):
    """Write the stripped text of every segment to ``file``, one per line.

    The file is flushed after each segment.
    """
    for entry in transcript:
        line = entry['text'].strip()
        print(line, file=file, flush=True)
51
+
52
+
53
def write_vtt(transcript: Iterator[dict], file: TextIO, maxLineWidth=None):
    """Write ``transcript`` to ``file`` in WebVTT format.

    Args:
        transcript: Segments, each with ``'start'``, ``'end'`` and ``'text'``.
        file: Text stream that receives the output; flushed after every cue.
        maxLineWidth: Width handed to ``processText`` for wrapping cue text.
    """
    print("WEBVTT\n", file=file)
    for segment in transcript:
        # Strip surrounding whitespace, as write_srt does, so cues do not
        # start with a stray space. A literal "-->" inside the text is
        # rewritten because it is the cue timing separator.
        text = processText(segment['text'].strip(), maxLineWidth).replace('-->', '->')

        print(
            f"{format_timestamp(segment['start'])} --> {format_timestamp(segment['end'])}\n"
            f"{text}\n",
            file=file,
            flush=True,
        )
64
+
65
+
66
def write_srt(transcript: Iterator[dict], file: TextIO, maxLineWidth=None):
    """Write ``transcript`` to ``file`` in SRT format.

    Cues are numbered from 1. Timestamps always include the hours field and
    use a comma before the milliseconds.

    Args:
        transcript: Segments, each with ``'start'``, ``'end'`` and ``'text'``.
        file: Text stream that receives the output; flushed after every cue.
        maxLineWidth: Width handed to ``processText`` for wrapping cue text.

    Example:
        with open("audio.srt", "w", encoding="utf-8") as srt:
            write_srt(result["segments"], file=srt)
    """
    for index, cue in enumerate(transcript, start=1):
        # A literal "-->" inside the text would clash with the timing line.
        body = processText(cue['text'].strip(), maxLineWidth).replace('-->', '->')
        begin = format_timestamp(cue['start'], always_include_hours=True, fractionalSeperator=',')
        finish = format_timestamp(cue['end'], always_include_hours=True, fractionalSeperator=',')

        print(f"{index}\n{begin} --> {finish}\n{body}\n", file=file, flush=True)
90
+
91
def processText(text: str, maxLineWidth=None):
    """Wrap ``text`` so that no line is longer than ``maxLineWidth``.

    Args:
        text: Text to wrap.
        maxLineWidth: Maximum line width. ``None``, zero or a negative value
            disables wrapping.

    Returns:
        ``text`` unchanged when wrapping is disabled, otherwise the wrapped
        lines joined with newlines.
    """
    # textwrap.wrap rejects a width of 0, so treat it like "no limit" too.
    if maxLineWidth is None or maxLineWidth <= 0:
        return text

    lines = textwrap.wrap(text, width=maxLineWidth, tabsize=4)
    return '\n'.join(lines)