Maximofn committed on
Commit
098e9c6
1 Parent(s): f87e3ab

Create script to add subtitles to video

Browse files
Files changed (1) hide show
  1. add_subtitles_to_video.py +286 -0
add_subtitles_to_video.py ADDED
@@ -0,0 +1,286 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ import cv2
3
+ import re
4
+ from tqdm import tqdm
5
+ import os
6
+ # https://github.com/Zulko/moviepy/issues/401#issuecomment-278679961
7
+
8
# When True, frames are previewed in a window and no output video is written.
DEBUG = False

# Colors are (blue, green, red) tuples, the channel order used by OpenCV.
COLOR_BLUE = (255, 0, 0)
COLOR_GREEN = (0, 255, 0)
COLOR_RED = (0, 0, 255)
COLOR_YELLOW = (0, 255, 255)
COLOR_WHITE = (255, 255, 255)
COLOR_BLACK = (0, 0, 0)
# Previously identical to COLOR_YELLOW, which made two speakers
# indistinguishable; this is RGB (165, 42, 42) expressed in BGR order.
COLOR_BROWN = (42, 42, 165)
COLOR_MAGENTA = (255, 0, 255)
COLOR_ORANGE = (0, 165, 255)
COLOR_PURPLE = (128, 0, 128)
COLOR_GRAY = (128, 128, 128)
21
+
22
# Translation table applied in a single pass: accented Spanish letters are
# mapped to their unaccented ASCII form, inverted punctuation to the regular
# mark, and newlines are removed (mapped to None).
_OPENCV_SAFE_TRANSLATION = str.maketrans({
    "á": "a", "é": "e", "í": "i", "ó": "o", "ú": "u", "ü": "u", "ñ": "n",
    "Á": "A", "É": "E", "Í": "I", "Ó": "O", "Ú": "U", "Ü": "U", "Ñ": "N",
    "\n": None,
    "¿": "?",
    "¡": "!",
})


def replace_characters_that_opencv_cant_show(text):
    """Return *text* with Spanish-specific characters replaced by ASCII.

    Accented vowels, the dieresis u and the letter n-tilde (both cases) lose
    their diacritic, the inverted question/exclamation marks become the
    regular ones, and newline characters are dropped.  Every other character
    is returned unchanged.

    Args:
        text: The string to sanitize.

    Returns:
        The sanitized string.
    """
    return text.translate(_OPENCV_SAFE_TRANSLATION)
39
+
40
def remove_speaker_text(text):
    """Strip a leading ``[SPEAKER_<number>]: `` tag from *text*.

    The number is taken from a regex capture group, so tags with any number
    of digits (``[SPEAKER_5]``, ``[SPEAKER_05]``, ``[SPEAKER_123]``) are
    parsed correctly.

    Args:
        text: A subtitle line that may start with a speaker tag.

    Returns:
        A ``(text, speaker)`` tuple.  When the tag is present it is removed
        from the text and ``speaker`` is its number as an ``int``; otherwise
        the text is returned unchanged and ``speaker`` is ``None``.
    """
    match = re.match(r"^\[SPEAKER_(\d+)\]:\s", text)
    if match is None:
        return text, None
    # match.end() is the length of the tag, including the whitespace after ":"
    return text[match.end():], int(match.group(1))
49
+
50
def get_filter_text_and_speaker(text, color):
    """Remove the speaker tag from *text* and pick that speaker's color.

    Args:
        text: A subtitle line, optionally starting with a speaker tag.
        color: The color to keep when the line has no speaker tag or the
            speaker number is not one of 0-9.

    Returns:
        A ``(text, color)`` tuple: the text as returned by
        ``remove_speaker_text`` and the color to draw it with.
    """
    # The index in this tuple is the speaker number.
    speaker_palette = (
        COLOR_GREEN,
        COLOR_BLUE,
        COLOR_RED,
        COLOR_YELLOW,
        COLOR_WHITE,
        COLOR_BLACK,
        COLOR_BROWN,
        COLOR_MAGENTA,
        COLOR_ORANGE,
        COLOR_PURPLE,
    )
    clean_text, speaker_id = remove_speaker_text(text)
    if speaker_id is not None and 0 <= speaker_id < len(speaker_palette):
        color = speaker_palette[speaker_id]
    return clean_text, color
74
+
75
def create_dict_of_transcription(transcription_file):
    """Parse an SRT-style subtitle file into a ``{start_time: text}`` dict.

    Lines matching ``dd:dd:dd,ddd --> dd:dd:dd,ddd`` open a new entry keyed by
    the start time string.  Lines made only of digits (cue numbers) and empty
    lines are skipped.  Every other line is sanitized with
    ``replace_characters_that_opencv_cant_show`` and appended, followed by a
    newline, to the entry opened by the most recent timestamp line.

    Args:
        transcription_file: Path of the subtitle file.  It is decoded as
            UTF-8; a leading byte-order mark is ignored.

    Returns:
        A dict mapping each start time string to its accumulated text, in
        file order.  The end times are not stored.
    """
    timestamp_pattern = re.compile(r"\d\d:\d\d:\d\d,\d+ --> \d\d:\d\d:\d\d,\d+")
    cue_number_pattern = re.compile(r"\d+$")

    # "utf-8-sig" makes decoding independent of the platform default and
    # drops a BOM that would otherwise stop the first cue number matching.
    with open(transcription_file, "r", encoding="utf-8-sig") as f:
        transcriptions = f.read().splitlines()

    transcription_dict = {}
    start = None
    for line in transcriptions:
        if timestamp_pattern.match(line):
            # Only the start time is used as key
            start = line.split(" --> ")[0]
            transcription_dict[start] = ""
        elif not line or cue_number_pattern.match(line):
            continue
        elif start is None:
            # Text before the first timestamp has no entry to belong to
            continue
        else:
            line = replace_characters_that_opencv_cant_show(line)
            transcription_dict[start] += f"{line}\n"

    return transcription_dict
105
+
106
def hour_minute_seconds_miliseconds_to_seconds(time):
    """Convert an ``hh:mm:ss,mmm`` time string to a number of seconds.

    Args:
        time: Time string with ``:`` between hours, minutes and seconds and
            ``,`` before the milliseconds.

    Returns:
        The time in seconds as a float.

    Raises:
        ValueError: If the string does not have that structure or a field is
            not an integer.
    """
    hh, mm, seconds_and_millis = time.split(":")
    ss, millis = seconds_and_millis.split(",")
    whole_seconds = int(hh) * 3600 + int(mm) * 60 + int(ss)
    return whole_seconds + int(millis) / 1000
111
+
112
def seconds_to_hour_minute_seconds_miliseconds(seconds):
    """Format a non-negative number of seconds as ``hh:mm:ss,mmm``.

    The value is first rounded to the nearest whole millisecond and then
    split with integer arithmetic.  This accepts ints as well as floats
    (including floats whose ``str()`` uses scientific notation) and avoids
    losing a millisecond to float truncation (0.57 s gives ``570``).

    Args:
        seconds: Time in seconds, as an int or float.

    Returns:
        The formatted time string; hours are zero-padded to at least two
        digits.
    """
    total_miliseconds = int(round(seconds * 1000))
    total_seconds, miliseconds = divmod(total_miliseconds, 1000)
    total_minutes, secs = divmod(total_seconds, 60)
    hours, minutes = divmod(total_minutes, 60)
    return f"{hours:02d}:{minutes:02d}:{secs:02d},{miliseconds:03d}"
122
+
123
def search_transcription_in_dict_of_transcription(transcription_dict, seconds):
    """Return the subtitle text that is active at *seconds*.

    An entry is active from its own start time until the start time of the
    entry that follows it in the dict.  The last entry has no successor, so
    it stays active from its start time onwards (previously it could never
    be returned).

    Args:
        transcription_dict: Dict mapping ``hh:mm:ss,mmm`` start times to
            text, in chronological order.
        seconds: Playback position in seconds.

    Returns:
        The text of the active entry, or ``None`` when the dict is empty or
        *seconds* is before the first start time.
    """
    keys = list(transcription_dict)
    start_times = [hour_minute_seconds_miliseconds_to_seconds(key) for key in keys]

    for index, key_seconds in enumerate(start_times):
        is_last = index + 1 == len(start_times)
        # The last entry is open-ended because the dict stores no end times
        next_key_seconds = float("inf") if is_last else start_times[index + 1]
        if key_seconds <= seconds < next_key_seconds:
            return transcription_dict[keys[index]]
    return None
137
+
138
def get_length_of_cv2_text(text, fontFace, fontScale, thickness):
    """Return the first component of the size ``cv2.getTextSize`` reports.

    Args:
        text: The string to measure.
        fontFace: OpenCV font identifier.
        fontScale: Font scale factor.
        thickness: Stroke thickness.

    Returns:
        The first element of the size pair returned by ``cv2.getTextSize``
        (the rendered width of the text).
    """
    measured = cv2.getTextSize(text, fontFace, fontScale, thickness)
    size = measured[0]  # measured[1] (the baseline) is not needed
    return size[0]
141
+
142
def _split_words_into_rows(words, words_per_row):
    """Group *words* into space-joined rows of at most *words_per_row* words."""
    return [
        " ".join(words[first:first + words_per_row])
        for first in range(0, len(words), words_per_row)
    ]


def _draw_subtitle_row(frame, text, baseline_y, fontFace, fontScale, thickness, color, lineType):
    """Draw one subtitle row on *frame*: a gray backing box, then the text."""
    length_of_text = get_length_of_cv2_text(text, fontFace, fontScale, thickness)
    # The box is skipped for rows 10 pixels wide or narrower
    if length_of_text > 10:
        rectangle_point1 = (40, baseline_y - 30)
        rectangle_point2 = (length_of_text + 50, baseline_y + 10)
        cv2.rectangle(frame, rectangle_point1, rectangle_point2, COLOR_GRAY, -1, cv2.LINE_AA, 0)
    cv2.putText(frame, text, (50, baseline_y), fontFace, fontScale, color, thickness, lineType, False)


def add_subtitles_to_video(transcription_dict, input_video_file, input_audio_file=None):
    """Burn the subtitles of *transcription_dict* into a copy of a video.

    Every frame of the input video is read, the subtitle active at the
    frame's timestamp is drawn near the bottom (wrapped over several rows
    when it is wider than the frame) and the frame is written to
    ``<input name>_with_subtitles<input extension>`` next to the input file.
    Afterwards ffmpeg is run to add the audio track to that file.

    When the module-level ``DEBUG`` flag is true, frames are shown in a
    preview window instead (press ``q`` to stop) and nothing is written.

    Args:
        transcription_dict: Dict mapping ``hh:mm:ss,mmm`` start times to
            subtitle text.
        input_video_file: Path of the video to subtitle.
        input_audio_file: Path of the audio to add to the output.  When
            omitted, the module-level ``input_audio_file`` global set by the
            command-line entry point is used; if that does not exist either,
            the output is left without audio.

    Raises:
        IOError: If the input video cannot be opened.
        subprocess.CalledProcessError: If ffmpeg exits with an error.  The
            subtitled video without audio is kept in that case.
    """
    import subprocess

    if input_audio_file is None:
        # Backward compatibility with callers that only set the global
        input_audio_file = globals().get("input_audio_file")

    # os.path.splitext copes with dots and any number of folders in the path
    video_root, video_extension = os.path.splitext(input_video_file)
    output_video_file = f"{video_root}_with_subtitles{video_extension}"
    output_with_audio_file = f"{video_root}_with_subtitles_with_audio{video_extension}"

    # Open the input video file
    captured_video = cv2.VideoCapture(input_video_file)
    if not captured_video.isOpened():
        captured_video.release()
        raise IOError(f"Could not open video file: {input_video_file}")
    captured_video_fps = captured_video.get(cv2.CAP_PROP_FPS)
    captured_video_width = int(captured_video.get(cv2.CAP_PROP_FRAME_WIDTH))
    captured_video_height = int(captured_video.get(cv2.CAP_PROP_FRAME_HEIGHT))
    num_frames = int(captured_video.get(cv2.CAP_PROP_FRAME_COUNT))

    # Width available for a wrapped row (50 px margin on each side); never
    # below 1 so that it can be used as a divisor
    usable_width = max(1, captured_video_width - 100)

    # Set font properties
    fontFace = cv2.FONT_HERSHEY_DUPLEX
    fontScale = 1
    thickness = 2
    color = (0, 255, 0)
    lineType = cv2.LINE_AA

    video = None
    progress_bar = tqdm(total=num_frames, desc="Add subtitles to video progress")
    try:
        # Video writer
        if not DEBUG:
            fourcc = cv2.VideoWriter_fourcc(*'mp4v')
            video = cv2.VideoWriter(
                output_video_file,
                fourcc,
                captured_video_fps,
                (captured_video_width, captured_video_height),
            )

        while captured_video.isOpened():
            # Read the next frame
            ret, frame = captured_video.read()
            if not ret:
                break

            # Look up the subtitle for the current position
            current_time = captured_video.get(cv2.CAP_PROP_POS_MSEC) / 1000
            text = search_transcription_in_dict_of_transcription(transcription_dict, current_time)
            if text:
                # The color is kept across frames, so text without a speaker
                # tag is drawn with the most recently selected color
                text, color = get_filter_text_and_speaker(text, color)
                # Splitting on any whitespace also turns the newlines between
                # the lines of a cue into single spaces
                words = text.split()
                if words:
                    text = " ".join(words)
                    text_length = get_length_of_cv2_text(text, fontFace, fontScale, thickness)
                    if text_length > captured_video_width:
                        necesary_rows = int(text_length // usable_width + 1)
                        # At least one word per row, even with fewer words than rows
                        words_per_row = max(1, len(words) // necesary_rows)
                        rows = _split_words_into_rows(words, words_per_row)
                    else:
                        rows = [text]

                    # Rows are 50 px apart and the last one sits 50 px above
                    # the bottom edge, whatever the number of rows
                    baseline_y = captured_video_height - 50 * len(rows)
                    for row in rows:
                        _draw_subtitle_row(
                            frame, row, baseline_y, fontFace, fontScale, thickness, color, lineType
                        )
                        baseline_y += 50

            # Update the progress bar
            progress_bar.update(1)

            # Show the frame
            if DEBUG:
                cv2.imshow('frame', frame)
                # Set window 520x293
                cv2.namedWindow("frame", cv2.WINDOW_NORMAL)
                cv2.resizeWindow("frame", 520, 293)
                if cv2.waitKey(1) & 0xFF == ord('q'):
                    break

            # Write the frame
            if video is not None:
                video.write(frame)
    finally:
        # Release everything even if processing a frame failed
        captured_video.release()
        cv2.destroyAllWindows()
        if video is not None:
            video.release()
        progress_bar.close()

    # Add audio to the video
    if not DEBUG and input_audio_file is not None:
        progress_bar = tqdm(total=3, desc="Add audio to video progress")
        try:
            # Arguments are passed as a list, so paths containing spaces or
            # shell metacharacters are handled safely
            command = [
                "ffmpeg", "-y",
                "-i", output_video_file,
                "-i", input_audio_file,
                "-c:v", "copy",
                "-c:a", "aac",
                "-strict", "experimental",
                "-loglevel", "warning",
                output_with_audio_file,
            ]
            subprocess.run(command, check=True)
            progress_bar.update(1)
            # Replace the silent video only after ffmpeg succeeded
            os.replace(output_with_audio_file, output_video_file)
            progress_bar.update(2)
        finally:
            progress_bar.close()
269
+
270
if __name__ == "__main__":
    # Command-line entry point: parse the three positional paths, build the
    # transcription dictionary and render the subtitled video.
    parser = argparse.ArgumentParser()
    for argument_name, argument_help in (
        ("transcription_file", "Transcribed text"),
        ("input_video_file", "Input video file"),
        ("input_audio_file", "Input audio file"),
    ):
        parser.add_argument(argument_name, help=argument_help)
    args = parser.parse_args()

    transcription_file = args.transcription_file
    input_video_file = args.input_video_file
    # Must stay a module-level name: it is not passed in the call below, and
    # add_subtitles_to_video looks it up as a global.
    input_audio_file = args.input_audio_file

    transcription_dict = create_dict_of_transcription(transcription_file)
    add_subtitles_to_video(transcription_dict, input_video_file)