tensorops committed on
Commit
0c38b7e
1 Parent(s): ac3a02e

Add application file

Files changed (2)
  1. README.md +2 -0
  2. app.py +307 -0
README.md CHANGED
@@ -10,4 +10,6 @@ pinned: false
license: mit
---

+ Adapted from and credits to https://huggingface.co/spaces/RASMUS/Whisper-youtube-crosslingual-subtitles
+

Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,307 @@
+ import torch
+ import psutil
+ from pytube import YouTube
+ import time
+ import re
+ import pandas as pd
+ import pysrt
+ from pathlib import Path
+ import gradio as gr
+ import os
+ import requests
+ import json
+ import base64
+
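+ # Build the whisper.cpp binary and fetch the Thai fine-tuned GGML model at
+ # startup; the Space ships no prebuilt binary, so this runs on every boot.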
+ os.system('git clone https://github.com/ggerganov/whisper.cpp.git')
+ os.system('make -C ./whisper.cpp')
+ os.system('wget https://huggingface.co/datasets/tensorops/ggml-whisper-medium-th-combined/resolve/main/ggml-whisper-medium-th-combined.bin')
+
+
+ num_cores = psutil.cpu_count()
+ os.environ["OMP_NUM_THREADS"] = f"{num_cores}"
+
+
+ transcribe_options = dict(beam_size=3, best_of=3, without_timestamps=False)
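+ # Note: transcribe_options is not consumed by the whisper.cpp CLI invocation
+ # below, which hardcodes its decoding flags.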
+
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+ print(f"DEVICE IS: {device}")
+
+ videos_out_path = Path("./videos_out")
+ videos_out_path.mkdir(parents=True, exist_ok=True)
+
+
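+ # Download the highest-resolution progressive MP4 stream with pytube and
+ # return its absolute path on disk.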
+ def get_youtube(video_url):
+     yt = YouTube(video_url)
+     abs_video_path = yt.streams.filter(progressive=True, file_extension='mp4').order_by(
+         'resolution').desc().first().download()
+     return abs_video_path
+
+
+ def speech_to_text(video_file_path):
+     """
+     YouTube videos with subtitles using OpenAI Whisper models.
+     Currently supports only Thai audio.
+
+     This space allows you to:
+     1. Download a youtube video from a given url
+     2. Watch it in the first video component
+     3. Run automatic speech recognition on the video using fast Whisper models
+     4. Burn the transcriptions into the original video and watch the video in the 2nd video component
+
+     Speech recognition is based on models from OpenAI Whisper: https://github.com/openai/whisper
+     This space uses the C++ implementation from https://github.com/ggerganov/whisper.cpp
+     """
+
+     if video_file_path is None:
+         raise ValueError("Error: no video input")
+     print(video_file_path)
+     try:
+         _, file_ending = os.path.splitext(f'{video_file_path}')
+         print(f'file ending is {file_ending}')
+         print("starting conversion to wav")
+         os.system(
+             f'ffmpeg -i "{video_file_path}" -ar 16000 -ac 1 -c:a pcm_s16le "{video_file_path.replace(file_ending, ".wav")}"')
+         print("conversion to wav ready")
+
+         print("starting whisper c++")
+         srt_path = str(video_file_path.replace(file_ending, ".wav")) + ".srt"
+         os.system(f'rm -f {srt_path}')
+         os.system(
+             f'./whisper.cpp/main "{video_file_path.replace(file_ending, ".wav")}" -t 4 -l "th" -m ./ggml-whisper-medium-th-combined.bin -osrt')
+         print("done with whisper")
+     except Exception as e:
+         raise RuntimeError("Error converting video to audio") from e
+
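+     # whisper.cpp's -osrt flag wrote the transcript to <input>.wav.srt;
+     # parse it with pysrt and collect the cues into a DataFrame.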
+     try:
+
+         df = pd.DataFrame(columns=['start', 'end', 'text'])
+         srt_path = str(video_file_path.replace(file_ending, ".wav")) + ".srt"
+         subs = pysrt.open(srt_path)
+
+         objects = []
+         for sub in subs:
+             # Zero-pad every time component with :02d/:03d; this also fixes
+             # the original string-padding logic, which mis-padded one- and
+             # two-digit millisecond values (e.g. 45 ms became "450").
+             start = f'{sub.start.hours:02d}:{sub.start.minutes:02d}:{sub.start.seconds:02d}.{sub.start.milliseconds:03d}'
+             end = f'{sub.end.hours:02d}:{sub.end.minutes:02d}:{sub.end.seconds:02d}.{sub.end.milliseconds:03d}'
+             objects.append([sub.text, start, end])
+
+         for obj in objects:
+             srt_to_df = {
+                 'start': [obj[1]],
+                 'end': [obj[2]],
+                 'text': [obj[0]]
+             }
+
+             df = pd.concat([df, pd.DataFrame(srt_to_df)])
+
+         df.to_csv('subtitles.csv', index=False)
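+         # Each cue in the files written below follows this shape
+         # (times are HH:MM:SS.mmm):
+         #   1
+         #   00:00:01.000 --> 00:00:04.000
+         #   <transcribed text>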
+
+         print("Starting subtitle file creation")
+         df.reset_index(inplace=True)
+         with open('subtitles.vtt', 'w', encoding="utf-8") as file:
+             print("Starting WEBVTT-file creation")
+
+             for i in range(len(df)):
+                 if i == 0:
+                     file.write('WEBVTT')
+                     # a blank line is required after the WEBVTT header
+                     file.write('\n\n')
+                 else:
+                     file.write(str(i+1))
+                     file.write('\n')
+                 start = df.iloc[i]['start']
+                 file.write(f"{start.strip()}")
+                 stop = df.iloc[i]['end']
+                 file.write(' --> ')
+                 file.write(f"{stop}")
+                 file.write('\n')
+                 file.write(df.iloc[i]['text'])
+                 if i != len(df) - 1:
+                     file.write('\n\n')
+
+         print("WEBVTT DONE")
+
+         with open('subtitles.srt', 'w', encoding="utf-8") as file:
+             print("Starting SRT-file creation")
+
+             for i in range(len(df)):
+                 file.write(str(i+1))
+                 file.write('\n')
+                 # SRT uses a comma as the millisecond separator (WEBVTT uses a dot)
+                 start = df.iloc[i]['start'].replace('.', ',')
+                 file.write(f"{start.strip()}")
+                 stop = df.iloc[i]['end'].replace('.', ',')
+                 file.write(' --> ')
+                 file.write(f"{stop}")
+                 file.write('\n')
+                 file.write(df.iloc[i]['text'])
+                 if i != len(df) - 1:
+                     file.write('\n\n')
+
+         print("SRT DONE")
+         subtitle_files = ['subtitles.vtt', 'subtitles.srt', 'subtitles.csv']
+
+         return df, subtitle_files
+
+     except Exception as e:
+         raise RuntimeError("Error running inference with local model") from e
+
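+ # Hard-burn ./subtitles.srt into the video with ffmpeg's subtitles filter.
+ # Note: the UI below embeds soft subtitles via create_video_player instead,
+ # so this helper is currently unused.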
+
+ def burn_srt_to_video(srt_file, video_in):
+
+     print("Starting creation of video with srt")
+
+     try:
+         video_out = video_in.replace('.mp4', '_out.mp4')
+         print(os.system('ls -lrth'))
+         print(video_in)
+         print(video_out)
+         command = 'ffmpeg -i "{}" -y -vf subtitles=./subtitles.srt "{}"'.format(
+             video_in, video_out)
+         os.system(command)
+
+         return video_out
+
+     except Exception as e:
+         print(e)
+         return video_out
+
+
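+ # Embed the MP4 and the WEBVTT track as base64 data URIs so the browser's
+ # native <track> subtitle rendering can be used inside a gr.HTML component.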
+ def create_video_player(subtitle_files, video_in):
+
+     with open(video_in, "rb") as file:
+         video_base64 = base64.b64encode(file.read()).decode('ascii')
+     with open('./subtitles.vtt', "rb") as file:
+         subtitle_base64 = base64.b64encode(file.read()).decode('ascii')
+
+     video_player = f'''<video id="video" controls preload="metadata">
+         <source src="data:video/mp4;base64,{video_base64}" type="video/mp4" />
+         <track
+             label="Thai"
+             kind="subtitles"
+             srclang="th"
+             src="data:text/vtt;base64,{subtitle_base64}"
+             default />
+     </video>
+ '''
+     return video_player
+
+
+ # ---- Gradio Layout -----
+ video_in = gr.Video(label="Video file", mirror_webcam=False)
+ youtube_url_in = gr.Textbox(label="Youtube url", lines=1, interactive=True)
+ video_out = gr.Video(label="Video Out", mirror_webcam=False)
+
+
+ df_init = pd.DataFrame(columns=['start', 'end', 'text', 'translation'])
+
+ transcription_df = gr.DataFrame(value=df_init, label="Transcription dataframe", row_count=(
+     0, "dynamic"), max_rows=10, wrap=True, overflow_row_behaviour='paginate')
+ transcription_and_translation_df = gr.DataFrame(
+     value=df_init, label="Transcription and translation dataframe", max_rows=10, wrap=True, overflow_row_behaviour='paginate')
+
+ subtitle_files = gr.File(
+     label="Download srt-file",
+     file_count="multiple",
+     type="file",
+     interactive=False,
+ )
+
+ video_player = gr.HTML(
+     '<p>video will be played here after you press the button at step 3</p>')
+
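+ # The components above are created detached and attached to the page inside
+ # the Blocks layout below via their .render() calls.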
+ demo = gr.Blocks(css='''
+ #cut_btn, #reset_btn { align-self:stretch; }
+ #\\31 3 { max-width: 540px; }
+ .output-markdown {max-width: 65ch !important;}
+ ''')
+ demo.encrypt = False
+ with demo:
+     transcription_var = gr.Variable()
+
+     with gr.Row():
+         with gr.Column():
+             gr.Markdown('''
+             ### This space allows you to:
+             ##### 1. Download a youtube video from a given URL
+             ##### 2. Watch it in the first video component
+             ##### 3. Run automatic Thai speech recognition on the video using Whisper
+             ##### 4. Burn the transcriptions into the original video and watch the video in the 2nd video component
+             ''')
+
+         with gr.Column():
+             gr.Markdown('''
+             ### 1. Insert Youtube URL below. Some test videos below:
+             ##### 1. https://www.youtube.com/watch?v=UIHPIESyIXM
+             ##### 2. https://www.youtube.com/watch?v=YlfaFK7OFUo
+             ''')
+
+     with gr.Row():
+         with gr.Column():
+             youtube_url_in.render()
+             download_youtube_btn = gr.Button("Step 1. Download Youtube video")
+             download_youtube_btn.click(get_youtube, [youtube_url_in], [
+                 video_in])
+             print(video_in)
+
+     with gr.Row():
+         with gr.Column():
+             video_in.render()
+         with gr.Column():
+             gr.Markdown('''
+             ##### Here you can start the transcription process.
+             ##### Be aware that processing will take some time.
+             ''')
+             transcribe_btn = gr.Button("Step 2. Transcribe audio")
+             transcribe_btn.click(speech_to_text, [
+                 video_in], [transcription_df, subtitle_files])
+
+     with gr.Row():
+         gr.Markdown('''
+         ##### Here you will get the transcription output
+         ##### ''')
+
+     with gr.Row():
+         with gr.Column():
+             transcription_df.render()
+
+     with gr.Row():
+         with gr.Column():
+             gr.Markdown(
+                 '''##### From here, you can download the transcription output in different formats. ''')
+             subtitle_files.render()
+
+     with gr.Row():
+         with gr.Column():
+             gr.Markdown('''
+             ##### Now press the Step 3 button to create the output video with transcribed subtitles
+             ##### ''')
+             create_video_button = gr.Button(
+                 "Step 3. Create and add subtitles to video")
+             print(video_in)
+             create_video_button.click(create_video_player, [subtitle_files, video_in], [
+                 video_player])
+             video_player.render()
+
+ demo.launch()