import argparse
import os
import re

import cv2
from tqdm import tqdm

DEBUG = False
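
# Typical invocation (the script filename below is an illustrative assumption;
# the three positional arguments match the argparse definitions at the bottom):
#   python add_subtitles.py transcription.srt input_video.mp4 input_audio.aac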

# Colors are BGR tuples (OpenCV convention). Each text color is paired with a
# contrasting background color for the subtitle box.
COLOR_BLUE = (255, 0, 0)
BACKGROUND_FOR_COLOR_BLUE = (255, 255, 0)
COLOR_GREEN = (0, 255, 0)
BACKGROUND_FOR_COLOR_GREEN = (255, 0, 255)
COLOR_RED = (0, 0, 255)
BACKGROUND_FOR_COLOR_RED = (255, 255, 0)
COLOR_YELLOW = (0, 255, 255)
BACKGROUND_FOR_COLOR_YELLOW = (255, 0, 0)
COLOR_WHITE = (255, 255, 255)
BACKGROUND_FOR_COLOR_WHITE = (128, 128, 128)
COLOR_BLACK = (0, 0, 0)
BACKGROUND_FOR_COLOR_BLACK = (128, 128, 128)
COLOR_BROWN = (202, 221, 234)
BACKGROUND_FOR_COLOR_BROWN = (234, 215, 202)
COLOR_MAGENTA = (255, 0, 255)
BACKGROUND_FOR_COLOR_MAGENTA = (0, 255, 0)
COLOR_ORANGE = (0, 165, 255)
BACKGROUND_FOR_COLOR_ORANGE = (255, 90, 0)
COLOR_PURPLE = (128, 0, 128)
BACKGROUND_FOR_COLOR_PURPLE = (127, 255, 127)
COLOR_GRAY = (128, 128, 128)


def replace_characters_that_opencv_cant_show(text):
    # cv2.putText uses Hershey fonts, which cover only a limited ASCII set,
    # so transliterate Spanish characters and drop newlines.
    replacements = {
        "á": "a", "é": "e", "í": "i", "ó": "o", "ú": "u", "ñ": "nh",
        "Á": "A", "É": "E", "Í": "I", "Ó": "O", "Ú": "U", "Ñ": "NH",
        "\n": "", "¿": "?", "¡": "!",
    }
    for old, new in replacements.items():
        text = text.replace(old, new)
    return text


def remove_speaker_text(text):
    # Strip a leading "[SPEAKER_NN]: " tag (as produced by speaker
    # diarization) and return the remaining text plus the speaker number.
    # The capture group handles speaker IDs of any digit count.
    match = re.match(r"^\[SPEAKER_(\d+)\]:\s", text)
    speaker = None
    if match:
        speaker = int(match.group(1))
        text = text[len(match.group(0)):]
    return text, speaker
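
# e.g. remove_speaker_text("[SPEAKER_00]: Hola") -> ("Hola", 0); lines without
# a speaker tag come back unchanged with speaker=None.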


# Text/background color pairs indexed by speaker number.
SPEAKER_COLORS = [
    (COLOR_GREEN, BACKGROUND_FOR_COLOR_GREEN),
    (COLOR_BLUE, BACKGROUND_FOR_COLOR_BLUE),
    (COLOR_RED, BACKGROUND_FOR_COLOR_RED),
    (COLOR_YELLOW, BACKGROUND_FOR_COLOR_YELLOW),
    (COLOR_WHITE, BACKGROUND_FOR_COLOR_WHITE),
    (COLOR_BLACK, BACKGROUND_FOR_COLOR_BLACK),
    (COLOR_BROWN, BACKGROUND_FOR_COLOR_BROWN),
    (COLOR_MAGENTA, BACKGROUND_FOR_COLOR_MAGENTA),
    (COLOR_ORANGE, BACKGROUND_FOR_COLOR_ORANGE),
    (COLOR_PURPLE, BACKGROUND_FOR_COLOR_PURPLE),
]


def get_filter_text_and_speaker(text, color, background):
    # Remove the speaker tag and pick that speaker's color pair; speakers
    # beyond the table (or untagged lines) keep the colors passed in.
    text, speaker = remove_speaker_text(text)
    if speaker is not None and 0 <= speaker < len(SPEAKER_COLORS):
        color, background = SPEAKER_COLORS[speaker]
    return text, color, background
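
# The transcription file is expected in SRT layout; an illustrative cue:
#
#   1
#   00:00:01,000 --> 00:00:03,500
#   [SPEAKER_00]: Hola
#
# create_dict_of_transcription() below turns this into
#   {"00:00:01,000": {"transcription": "[SPEAKER_00]: Hola\n", "end": "00:00:03,500"}}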


def create_dict_of_transcription(transcription_file):
    # Parse an SRT transcription file into
    # {start: {"transcription": text, "end": end}}.
    transcription_dict = {}
    start = None

    with open(transcription_file, "r", encoding="utf-8") as f:
        transcriptions = f.read().splitlines()

    for line in transcriptions:
        # A timestamp line "HH:MM:SS,mmm --> HH:MM:SS,mmm" opens a new cue.
        if re.match(r"\d\d:\d\d:\d\d,\d+ --> \d\d:\d\d:\d\d,\d+", line):
            start, end = line.split(" --> ")
            transcription_dict[start] = {"transcription": "", "end": end}
        # Cue index lines and blank separator lines are skipped.
        elif re.match(r"\d+$", line) or re.match(r"^$", line):
            continue
        # Anything else is subtitle text belonging to the current cue.
        elif start is not None:
            line = replace_characters_that_opencv_cant_show(line)
            transcription_dict[start]["transcription"] += f"{line}\n"

    return transcription_dict


def hour_minute_seconds_milliseconds_to_seconds(time):
    # Convert an SRT timestamp "HH:MM:SS,mmm" to seconds as a float.
    if time is None:
        return None
    hours, minutes, seconds_milliseconds = time.split(":")
    seconds, milliseconds = seconds_milliseconds.split(",")
    return int(hours) * 3600 + int(minutes) * 60 + int(seconds) + int(milliseconds) / 1000
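
# e.g. hour_minute_seconds_milliseconds_to_seconds("01:02:03,500")
#      -> 1*3600 + 2*60 + 3 + 0.5 = 3723.5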


def seconds_to_hour_minute_seconds_milliseconds(seconds):
    # Convert seconds (float) to an SRT timestamp "HH:MM:SS,mmm".
    # Using arithmetic instead of splitting str(seconds) on "." also works
    # for whole-number inputs, which have no decimal point in their repr.
    milliseconds = int((seconds % 1) * 1000)
    whole_seconds = int(seconds)
    hours = whole_seconds // 3600
    minutes = (whole_seconds % 3600) // 60
    secs = whole_seconds % 60
    return f"{hours:02d}:{minutes:02d}:{secs:02d},{milliseconds:03d}"
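
# e.g. seconds_to_hour_minute_seconds_milliseconds(3723.5) -> "01:02:03,500"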


def search_transcription_in_dict_of_transcription(transcription_dict, seconds):
    # Find the cue whose [start, next start) interval contains `seconds`.
    keys = list(transcription_dict.keys())
    for i in range(len(keys) - 1):
        key_seconds = hour_minute_seconds_milliseconds_to_seconds(keys[i])
        next_key_seconds = hour_minute_seconds_milliseconds_to_seconds(keys[i + 1])
        if key_seconds <= seconds < next_key_seconds:
            entry = transcription_dict[keys[i]]
            return entry["transcription"], entry["end"]
    # The last cue has no successor, so check it against its own end time.
    if keys:
        last_entry = transcription_dict[keys[-1]]
        last_start = hour_minute_seconds_milliseconds_to_seconds(keys[-1])
        last_end = hour_minute_seconds_milliseconds_to_seconds(last_entry["end"])
        if last_start <= seconds <= last_end:
            return last_entry["transcription"], last_entry["end"]
    return None, None


def get_length_of_cv2_text(text, fontFace, fontScale, thickness):
    # Width in pixels that cv2.putText would need for this text.
    text_size, _ = cv2.getTextSize(text, fontFace, fontScale, thickness)
    return text_size[0]


def add_subtitles_to_video(transcription_dict, input_video_file, input_audio_file):
    # Build the output path next to the input, e.g.
    # "videos/clip.mp4" -> "videos/clip_with_subtitles.mp4".
    # os.path handles nested folders and extra dots more robustly than
    # splitting on "." and "/" by hand.
    input_video_root, input_video_extension = os.path.splitext(input_video_file)
    output_video_file = f"{input_video_root}_with_subtitles{input_video_extension}"

    captured_video = cv2.VideoCapture(input_video_file)
    captured_video_fps = captured_video.get(cv2.CAP_PROP_FPS)
    captured_video_width = int(captured_video.get(cv2.CAP_PROP_FRAME_WIDTH))
    captured_video_height = int(captured_video.get(cv2.CAP_PROP_FRAME_HEIGHT))
    num_frames = int(captured_video.get(cv2.CAP_PROP_FRAME_COUNT))

    progress_bar = tqdm(total=num_frames, desc="Add subtitles to video progress")

    if not DEBUG:
        # Write the subtitled (still silent) video; audio is muxed back in
        # with ffmpeg at the end.
        fourcc = cv2.VideoWriter_fourcc(*"mp4v")
        video = cv2.VideoWriter(output_video_file, fourcc, captured_video_fps,
                                (captured_video_width, captured_video_height))

    # cv2.putText parameters shared by every subtitle draw.
    fontFace = cv2.FONT_HERSHEY_DUPLEX
    fontScale = 1
    thickness = 2
    color = COLOR_WHITE
    background = COLOR_GRAY
    lineType = cv2.LINE_AA
    bottomLeftOrigin = False

    old_text = ""
    while captured_video.isOpened():
        ret, frame = captured_video.read()
        if not ret:
            break

        # Look up the subtitle cue for the current playback position.
        current_time = captured_video.get(cv2.CAP_PROP_POS_MSEC) / 1000
        text, end_time = search_transcription_in_dict_of_transcription(transcription_dict, current_time)
        if text is not None:
            # Drop the trailing newline/space left over from parsing.
            text = text.rstrip("\n ")
            if old_text != text:
                old_text = text
            text_length = get_length_of_cv2_text(text, fontFace, fontScale, thickness)
            end_time_seconds = hour_minute_seconds_milliseconds_to_seconds(end_time)
            if end_time_seconds is not None and current_time <= end_time_seconds:
                if text_length > captured_video_width:
                    # The text is wider than the frame: wrap it over several
                    # rows, stacked upward from the bottom of the frame.
                    necessary_rows = int(text_length // (captured_video_width - 300) + 1)
                    words = text.split(" ")
                    number_of_words = len(words)
                    # max(1, ...) guards against division by zero when there
                    # are more rows than words.
                    words_per_row = max(1, int(number_of_words // necessary_rows))
                    text = ""
                    text_position = (50, captured_video_height - 50 * (necessary_rows + 1))
                    rectangle_point1 = (40, text_position[1] - 30)
                    for i in range(number_of_words):
                        if i % words_per_row == 0 and i != 0:
                            # The row is full: color it by speaker and draw it.
                            text, color, background = get_filter_text_and_speaker(text, color, background)
                            length_of_text = get_length_of_cv2_text(text, fontFace, fontScale, thickness)
                            if length_of_text > 10:
                                rectangle_point2 = (length_of_text + 50, text_position[1] + 10)
                                cv2.rectangle(frame, rectangle_point1, rectangle_point2, background, -1, cv2.LINE_AA, 0)
                                cv2.putText(frame, text, text_position, fontFace, fontScale, color, thickness, lineType, bottomLeftOrigin)
                            text = ""
                            text_position = (50, text_position[1] + 50)
                            rectangle_point1 = (40, text_position[1] - 30)
                        text += words[i] + " "

                    # Draw whatever remains as the last row.
                    text, color, background = get_filter_text_and_speaker(text, color, background)
                    length_of_text = get_length_of_cv2_text(text, fontFace, fontScale, thickness)
                    if length_of_text > 10:
                        rectangle_point2 = (length_of_text + 50, text_position[1] + 10)
                        cv2.rectangle(frame, rectangle_point1, rectangle_point2, background, -1, cv2.LINE_AA, 0)
                        cv2.putText(frame, text, text_position, fontFace, fontScale, color, thickness, lineType, bottomLeftOrigin)
                else:
                    # The text fits on one row near the bottom of the frame.
                    text_position = (50, captured_video_height - 50)
                    rectangle_point1 = (40, text_position[1] - 30)
                    text, color, background = get_filter_text_and_speaker(text, color, background)
                    length_of_text = get_length_of_cv2_text(text, fontFace, fontScale, thickness)
                    if length_of_text > 10:
                        rectangle_point2 = (length_of_text + 50, text_position[1] + 10)
                        cv2.rectangle(frame, rectangle_point1, rectangle_point2, background, -1, cv2.LINE_AA, 0)
                        cv2.putText(frame, text, text_position, fontFace, fontScale, color, thickness, lineType, bottomLeftOrigin)

        progress_bar.update(1)

        if DEBUG:
            # Preview frames in a resizable window instead of writing a file;
            # press "q" to stop early. The window is created before imshow.
            cv2.namedWindow("frame", cv2.WINDOW_NORMAL)
            cv2.resizeWindow("frame", 520, 293)
            cv2.imshow("frame", frame)
            if cv2.waitKey(1) & 0xFF == ord("q"):
                break

        if not DEBUG:
            video.write(frame)

    captured_video.release()
    cv2.destroyAllWindows()
    if not DEBUG:
        video.release()
    progress_bar.close()

    if not DEBUG:
        # Mux the original audio into the subtitled video with ffmpeg, then
        # swap the muxed file into place of the silent one. Paths are quoted
        # so that spaces in filenames survive the shell.
        progress_bar = tqdm(total=3, desc="Add audio to video progress")
        muxed_video_file = f"{input_video_root}_with_subtitles_with_audio{input_video_extension}"
        command = f'ffmpeg -i "{output_video_file}" -i "{input_audio_file}" -c:v copy -c:a aac -strict experimental -loglevel warning "{muxed_video_file}"'
        os.system(command)
        progress_bar.update(1)
        os.system(f'rm "{output_video_file}"')
        progress_bar.update(1)
        os.system(f'mv "{muxed_video_file}" "{output_video_file}"')
        progress_bar.update(1)
        progress_bar.close()


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("transcription_file", help="Transcribed text (SRT file)")
    parser.add_argument("input_video_file", help="Input video file")
    parser.add_argument("input_audio_file", help="Input audio file")
    args = parser.parse_args()

    transcription_file = args.transcription_file
    input_video_file = args.input_video_file
    input_audio_file = args.input_audio_file

    transcription_dict = create_dict_of_transcription(transcription_file)
    add_subtitles_to_video(transcription_dict, input_video_file, input_audio_file)