Create script to add subtitles to video
Browse files- add_subtitles_to_video.py +286 -0
add_subtitles_to_video.py
ADDED
@@ -0,0 +1,286 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import argparse
|
2 |
+
import cv2
|
3 |
+
import re
|
4 |
+
from tqdm import tqdm
|
5 |
+
import os
|
6 |
+
# https://github.com/Zulko/moviepy/issues/401#issuecomment-278679961
|
7 |
+
|
8 |
+
# When True, frames are previewed in a window instead of being written to the
# output video (and no audio is muxed afterwards).
DEBUG = False

# Colors are (blue, green, red) tuples, the channel order used by the OpenCV
# drawing calls in this script.
COLOR_BLUE = (255, 0, 0)
COLOR_GREEN = (0, 255, 0)
COLOR_RED = (0, 0, 255)
COLOR_YELLOW = (0, 255, 255)
COLOR_WHITE = (255, 255, 255)
COLOR_BLACK = (0, 0, 0)
# Previously identical to COLOR_YELLOW, which made two speakers share a color.
COLOR_BROWN = (42, 42, 165)
COLOR_MAGENTA = (255, 0, 255)
COLOR_ORANGE = (0, 165, 255)
COLOR_PURPLE = (128, 0, 128)
COLOR_GRAY = (128, 128, 128)
|
21 |
+
|
22 |
+
# Characters that have no diacritic decomposition but still need an ASCII
# stand-in (or, for the newline, must be dropped entirely).
_OPENCV_PUNCTUATION_MAP = str.maketrans({"\n": None, "¿": "?", "¡": "!"})


def replace_characters_that_opencv_cant_show(text):
    """Return *text* with characters OpenCV's built-in fonts cannot draw replaced.

    Newlines are removed, the inverted Spanish punctuation marks are mapped to
    ``?`` and ``!``, and every accented letter is reduced to its base letter
    (``á`` -> ``a``, ``ñ`` -> ``n``, ``ü`` -> ``u``, ...).

    Args:
        text: The string to sanitize.

    Returns:
        The sanitized string.
    """
    import unicodedata

    text = text.translate(_OPENCV_PUNCTUATION_MAP)
    # Canonical decomposition splits an accented letter into base letter plus
    # combining marks; dropping the marks handles any accent, not only the
    # Spanish ones that used to be listed by hand.
    decomposed = unicodedata.normalize("NFD", text)
    stripped = "".join(ch for ch in decomposed if not unicodedata.combining(ch))
    # Recompose so scripts that NFD splits without combining marks are restored.
    return unicodedata.normalize("NFC", stripped)
|
39 |
+
|
40 |
+
def remove_speaker_text(text):
    """Strip a leading ``[SPEAKER_<n>]: `` tag from *text*.

    Args:
        text: A subtitle line, optionally starting with a speaker tag.

    Returns:
        A ``(text, speaker)`` tuple. ``text`` is the input without the tag and
        ``speaker`` is the tag's number as an ``int``, or ``None`` when the
        input does not start with a tag (in which case ``text`` is unchanged).
    """
    # Capture the digits instead of slicing fixed positions, so ids with one
    # digit or with three or more digits are parsed correctly.
    match = re.match(r"^\[SPEAKER_(\d+)\]:\s", text)
    if match is None:
        return text, None
    return text[match.end():], int(match.group(1))
|
49 |
+
|
50 |
+
def get_filter_text_and_speaker(text, color):
    """Remove the speaker tag from *text* and pick the color for that speaker.

    Args:
        text: A subtitle line, optionally starting with a speaker tag.
        color: The color to keep when the line has no tag or the speaker
            number has no assigned color.

    Returns:
        A ``(text, color)`` tuple with the tag removed from ``text``.
    """
    # Speaker number -> drawing color; speakers outside 0-9 keep `color`.
    speaker_palette = {
        0: COLOR_GREEN,
        1: COLOR_BLUE,
        2: COLOR_RED,
        3: COLOR_YELLOW,
        4: COLOR_WHITE,
        5: COLOR_BLACK,
        6: COLOR_BROWN,
        7: COLOR_MAGENTA,
        8: COLOR_ORANGE,
        9: COLOR_PURPLE,
    }
    untagged_text, speaker_id = remove_speaker_text(text)
    return untagged_text, speaker_palette.get(speaker_id, color)
|
74 |
+
|
75 |
+
def create_dict_of_transcription(transcription_file):
    """Parse an SRT-style transcription file into a ``{start_time: text}`` dict.

    Lines of the form ``hh:mm:ss,mmm --> hh:mm:ss,mmm`` open a new entry keyed
    by the start time. Purely numeric lines and empty lines are ignored. Every
    other line is sanitized for OpenCV and appended, followed by a newline, to
    the entry opened most recently.

    Args:
        transcription_file: Path of the transcription file.

    Returns:
        A dict mapping each start time string to the text of that cue, in the
        order the cues appear in the file.
    """
    timestamp_line = re.compile(r"\d\d:\d\d:\d\d,\d+ --> \d\d:\d\d:\d\d,\d+")
    index_line = re.compile(r"\d+$")

    transcription_dict = {}
    start = None  # start time of the cue currently being filled

    # utf-8-sig decodes the accented text the same way on every platform and
    # drops a leading BOM, which would otherwise hide the first index line.
    with open(transcription_file, "r", encoding="utf-8-sig") as f:
        transcriptions = f.read().splitlines()

    for line in transcriptions:
        if timestamp_line.match(line):
            # Only the start time is kept; it becomes the dictionary key.
            start = line.split(" --> ")[0]
            transcription_dict[start] = ""
        elif index_line.match(line) or line == "":
            continue
        elif start is None:
            # Text before the first timestamp has no cue to belong to.
            continue
        else:
            line = replace_characters_that_opencv_cant_show(line)
            transcription_dict[start] += f"{line}\n"

    return transcription_dict
|
105 |
+
|
106 |
+
def hour_minute_seconds_miliseconds_to_seconds(time):
    """Convert an ``hh:mm:ss,mmm`` timestamp into a number of seconds.

    Args:
        time: Timestamp string with ``:`` between hours, minutes and seconds
            and ``,`` before the milliseconds.

    Returns:
        The timestamp as a float number of seconds.
    """
    hh, mm, seconds_part = time.split(":")
    ss, ms = seconds_part.split(",")
    whole_seconds = int(hh) * 3600 + int(mm) * 60 + int(ss)
    return whole_seconds + int(ms) / 1000
|
111 |
+
|
112 |
+
def seconds_to_hour_minute_seconds_miliseconds(seconds):
    """Format a number of seconds as an ``hh:mm:ss,mmm`` timestamp.

    Fractions of a millisecond are truncated, not rounded.

    Args:
        seconds: Non-negative number of seconds (``int`` or ``float``).

    Returns:
        The timestamp string, e.g. ``"01:01:01,500"`` for ``3661.5``.

    Raises:
        ValueError: If *seconds* is negative.
    """
    from decimal import Decimal

    # Going through the decimal repr keeps the digits the caller sees (1.005
    # gives 5 ms, not 4) and also accepts ints and floats whose repr uses
    # scientific notation, which the old split on "." could not handle.
    total_miliseconds = int(Decimal(str(seconds)) * 1000)
    if total_miliseconds < 0:
        raise ValueError(f"seconds must be non-negative, got {seconds!r}")

    total_seconds, miliseconds = divmod(total_miliseconds, 1000)
    hours, remainder = divmod(total_seconds, 3600)
    minutes, whole_seconds = divmod(remainder, 60)
    return f"{hours:02d}:{minutes:02d}:{whole_seconds:02d},{miliseconds:03d}"
|
122 |
+
|
123 |
+
def search_transcription_in_dict_of_transcription(transcription_dict, seconds):
    """Return the cue text that is active at *seconds*.

    A cue is active from its own start time until the start time of the next
    cue in the dictionary; the last cue is active from its start time onwards.
    Keys are expected to be in ascending time order.

    Args:
        transcription_dict: Mapping of ``hh:mm:ss,mmm`` start times to text.
        seconds: Position in the video, in seconds.

    Returns:
        The text of the active cue, or ``None`` when *seconds* is before the
        first cue or the dictionary is empty.
    """
    keys = list(transcription_dict)
    last_index = len(keys) - 1

    for index, key_hmsms in enumerate(keys):
        if hour_minute_seconds_miliseconds_to_seconds(key_hmsms) > seconds:
            continue
        # The last cue has no successor to bound it; previously it was never
        # returned because the loop stopped one key early.
        if index == last_index:
            return transcription_dict[key_hmsms]
        next_key_seconds = hour_minute_seconds_miliseconds_to_seconds(keys[index + 1])
        if seconds < next_key_seconds:
            return transcription_dict[key_hmsms]

    return None
|
137 |
+
|
138 |
+
def get_length_of_cv2_text(text, fontFace, fontScale, thickness):
    """Return the first component of the size ``cv2.getTextSize`` reports for *text*.

    Args:
        text: The string to measure.
        fontFace: OpenCV font identifier.
        fontScale: Font scale factor.
        thickness: Stroke thickness.
    """
    (text_width, _text_height), _baseline = cv2.getTextSize(text, fontFace, fontScale, thickness)
    return text_width
|
141 |
+
|
142 |
+
def _draw_subtitle_row(frame, row_text, position, color, font_face, font_scale, thickness, line_type):
    """Draw one subtitle row (gray backing box plus text) and return the color used."""
    row_text, color = get_filter_text_and_speaker(row_text, color)
    row_width = get_length_of_cv2_text(row_text, font_face, font_scale, thickness)
    # Rows that measure 10 px or less get no backing box.
    if row_width > 10:
        top_left = (40, position[1] - 30)
        bottom_right = (row_width + 50, position[1] + 10)
        cv2.rectangle(frame, top_left, bottom_right, COLOR_GRAY, -1, cv2.LINE_AA, 0)
    cv2.putText(frame, row_text, position, font_face, font_scale, color, thickness, line_type, False)
    return color


def _draw_subtitle(frame, text, color, frame_width, frame_height, font_face, font_scale, thickness, line_type):
    """Draw *text* at the bottom of *frame*, wrapping it into rows when it is too wide.

    Returns the color in effect after drawing, so a speaker color carries over
    to following rows and frames.
    """
    text_width = get_length_of_cv2_text(text, font_face, font_scale, thickness)
    if text_width <= frame_width:
        position = (50, frame_height - 50)
        return _draw_subtitle_row(frame, text, position, color, font_face, font_scale, thickness, line_type)

    # Leave a 50 px margin on each side; guard against very narrow frames.
    usable_width = max(1, frame_width - 100)
    needed_rows = int(text_width // usable_width + 1)
    words = text.split(" ")
    # At least one word per row, otherwise the chunking below cannot advance.
    words_per_row = max(1, len(words) // needed_rows)
    rows = [words[i:i + words_per_row] for i in range(0, len(words), words_per_row)]

    # Integer division can yield more rows than estimated; start high enough
    # that the last row still lands inside the frame.
    y = frame_height - 50 * max(needed_rows + 1, len(rows))
    for row_words in rows:
        row_text = " ".join(row_words) + " "
        color = _draw_subtitle_row(frame, row_text, (50, y), color, font_face, font_scale, thickness, line_type)
        y += 50
    return color


def _mux_audio_into_video(video_file, audio_file, extension):
    """Replace *video_file* with a copy that also carries the audio of *audio_file*."""
    import subprocess

    muxed_file = f"{video_file}_with_audio{extension}"
    # An argument list (no shell) keeps paths with spaces or shell
    # metacharacters from being misinterpreted.
    command = [
        "ffmpeg", "-i", video_file, "-i", audio_file,
        "-c:v", "copy", "-c:a", "aac", "-strict", "experimental",
        "-loglevel", "warning", muxed_file,
    ]
    # check=True: if ffmpeg fails, the video written so far is left untouched.
    subprocess.run(command, check=True)
    os.replace(muxed_file, video_file)


def add_subtitles_to_video(transcription_dict, input_video_file, input_audio_file=None):
    """Burn the subtitles of *transcription_dict* into a copy of a video.

    The result is written next to the input as ``<name>_with_subtitles<ext>``.
    Unless ``DEBUG`` is set, the audio of *input_audio_file* is then muxed into
    that file with ffmpeg. With ``DEBUG`` set, frames are only previewed in a
    window (press ``q`` to stop) and nothing is written.

    Args:
        transcription_dict: Mapping of ``hh:mm:ss,mmm`` start times to cue text.
        input_video_file: Path of the video to read.
        input_audio_file: Path of the audio to add. When omitted, a module-level
            ``input_audio_file`` is used if one exists (this is how the script
            entry point has always supplied it); otherwise no audio is added.

    Raises:
        OSError: If the input video cannot be opened.
        subprocess.CalledProcessError: If ffmpeg exits with an error.
    """
    if input_audio_file is None:
        # Backward compatibility with callers that only set the module global.
        input_audio_file = globals().get("input_audio_file")

    # splitext copes with missing folders, nested folders and dots in names.
    output_root, output_video_extension = os.path.splitext(input_video_file)
    output_video_file = f"{output_root}_with_subtitles{output_video_extension}"

    captured_video = cv2.VideoCapture(input_video_file)
    if not captured_video.isOpened():
        captured_video.release()
        raise OSError(f"Could not open video file: {input_video_file}")

    video = None
    progress_bar = None
    try:
        captured_video_fps = captured_video.get(cv2.CAP_PROP_FPS)
        frame_width = int(captured_video.get(cv2.CAP_PROP_FRAME_WIDTH))
        frame_height = int(captured_video.get(cv2.CAP_PROP_FRAME_HEIGHT))
        num_frames = int(captured_video.get(cv2.CAP_PROP_FRAME_COUNT))

        progress_bar = tqdm(total=num_frames, desc="Add subtitles to video progress")

        if not DEBUG:
            fourcc = cv2.VideoWriter_fourcc(*'mp4v')
            video = cv2.VideoWriter(output_video_file, fourcc, captured_video_fps, (frame_width, frame_height))

        # Font properties
        font_face = cv2.FONT_HERSHEY_DUPLEX
        font_scale = 1
        thickness = 2
        line_type = cv2.LINE_AA
        # Color used until a cue carries a speaker tag; it then follows the
        # most recently seen speaker.
        color = (0, 255, 0)

        while captured_video.isOpened():
            ret, frame = captured_video.read()
            if not ret:
                break

            current_time = captured_video.get(cv2.CAP_PROP_POS_MSEC) / 1000
            text = search_transcription_in_dict_of_transcription(transcription_dict, current_time)
            if text is not None:
                # Multi-line cues are stored with "\n" separators, which
                # putText cannot draw; join them with spaces and drop trailing
                # whitespace. An empty cue is simply not drawn.
                text = text.replace("\n", " ").rstrip()
                if text:
                    color = _draw_subtitle(
                        frame, text, color, frame_width, frame_height,
                        font_face, font_scale, thickness, line_type,
                    )

            progress_bar.update(1)

            if DEBUG:
                cv2.imshow('frame', frame)
                # Set window 520x293
                cv2.namedWindow("frame", cv2.WINDOW_NORMAL)
                cv2.resizeWindow("frame", 520, 293)
                if cv2.waitKey(1) & 0xFF == ord('q'):
                    break

            if video is not None:
                video.write(frame)
    finally:
        # Release everything even if reading or drawing failed midway.
        captured_video.release()
        if video is not None:
            video.release()
        if progress_bar is not None:
            progress_bar.close()
        if DEBUG:
            cv2.destroyAllWindows()

    # Add audio to the video
    if not DEBUG and input_audio_file is not None:
        progress_bar = tqdm(total=2, desc="Add audio to video progress")
        try:
            progress_bar.update(1)
            _mux_audio_into_video(output_video_file, input_audio_file, output_video_extension)
            progress_bar.update(1)
        finally:
            progress_bar.close()
|
269 |
+
|
270 |
+
if __name__ == "__main__":
    # Command-line entry point: three positional paths, in this order.
    cli = argparse.ArgumentParser()
    for argument_name, argument_help in (
        ("transcription_file", "Transcribed text"),
        ("input_video_file", "Input video file"),
        ("input_audio_file", "Input audio file"),
    ):
        cli.add_argument(argument_name, help=argument_help)
    args = cli.parse_args()

    # These stay module-level names: add_subtitles_to_video is called without
    # the audio path and looks up `input_audio_file` at module scope.
    transcription_file, input_video_file, input_audio_file = (
        args.transcription_file,
        args.input_video_file,
        args.input_audio_file,
    )

    transcription_dict = create_dict_of_transcription(transcription_file)
    add_subtitles_to_video(transcription_dict, input_video_file)
|