import argparse
import cv2
import re
from tqdm import tqdm
import os

# https://github.com/Zulko/moviepy/issues/401#issuecomment-278679961
DEBUG = False

# Text/background color pairs in BGR (OpenCV's channel order)
COLOR_BLUE = (255, 0, 0)
BACKGROUND_FOR_COLOR_BLUE = (255, 255, 0)
COLOR_GREEN = (0, 255, 0)
BACKGROUND_FOR_COLOR_GREEN = (255, 0, 255)
COLOR_RED = (0, 0, 255)
BACKGROUND_FOR_COLOR_RED = (255, 255, 0)
COLOR_YELLOW = (0, 255, 255)
BACKGROUND_FOR_COLOR_YELLOW = (255, 0, 0)
COLOR_WHITE = (255, 255, 255)
BACKGROUND_FOR_COLOR_WHITE = (128, 128, 128)
COLOR_BLACK = (0, 0, 0)
BACKGROUND_FOR_COLOR_BLACK = (128, 128, 128)
COLOR_BROWN = (202, 221, 234)
BACKGROUND_FOR_COLOR_BROWN = (234, 215, 202)
COLOR_MAGENTA = (255, 0, 255)
BACKGROUND_FOR_COLOR_MAGENTA = (0, 255, 0)
COLOR_ORANGE = (0, 165, 255)
BACKGROUND_FOR_COLOR_ORANGE = (255, 90, 0)
COLOR_PURPLE = (128, 0, 128)
BACKGROUND_FOR_COLOR_PURPLE = (127, 255, 127)
COLOR_GRAY = (128, 128, 128)

# Color pair assigned to each diarized speaker id
SPEAKER_COLORS = {
    0: (COLOR_GREEN, BACKGROUND_FOR_COLOR_GREEN),
    1: (COLOR_BLUE, BACKGROUND_FOR_COLOR_BLUE),
    2: (COLOR_RED, BACKGROUND_FOR_COLOR_RED),
    3: (COLOR_YELLOW, BACKGROUND_FOR_COLOR_YELLOW),
    4: (COLOR_WHITE, BACKGROUND_FOR_COLOR_WHITE),
    5: (COLOR_BLACK, BACKGROUND_FOR_COLOR_BLACK),
    6: (COLOR_BROWN, BACKGROUND_FOR_COLOR_BROWN),
    7: (COLOR_MAGENTA, BACKGROUND_FOR_COLOR_MAGENTA),
    8: (COLOR_ORANGE, BACKGROUND_FOR_COLOR_ORANGE),
    9: (COLOR_PURPLE, BACKGROUND_FOR_COLOR_PURPLE),
}


def replace_characters_that_opencv_cant_show(text):
    # OpenCV's Hershey fonts only render ASCII, so transliterate Spanish characters
    text = text.replace("á", "a")
    text = text.replace("é", "e")
    text = text.replace("í", "i")
    text = text.replace("ó", "o")
    text = text.replace("ú", "u")
    text = text.replace("ñ", "nh")
    text = text.replace("Á", "A")
    text = text.replace("É", "E")
    text = text.replace("Í", "I")
    text = text.replace("Ó", "O")
    text = text.replace("Ú", "U")
    text = text.replace("Ñ", "NH")
    text = text.replace("\n", "")
    text = text.replace("¿", "?")
    text = text.replace("¡", "!")
    return text


def remove_speaker_text(text):
    # If the text starts with "[SPEAKER_XX]: ", strip the prefix and return the speaker id.
    # Capturing the digits with a group works for any number of digits, unlike
    # slicing fixed positions out of the match.
    match = re.match(r"^\[SPEAKER_(\d+)\]:\s", text)
    speaker = None
    if match:
        speaker = int(match.group(1))
        text = text[match.end():]  # Drop the matched prefix
    return text, speaker


def get_filter_text_and_speaker(text, color, background):
    # Strip the speaker prefix and, if a known speaker was found, use its color pair;
    # unknown speakers keep whatever colors are currently active.
    text, speaker = remove_speaker_text(text)
    if speaker is not None:
        color, background = SPEAKER_COLORS.get(speaker, (color, background))
    return text, color, background
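
# Illustrative example of the speaker handling above (the line is made up, not
# taken from a real transcript): remove_speaker_text("[SPEAKER_01]: Hola, buenos dias")
# returns ("Hola, buenos dias", 1), and get_filter_text_and_speaker() would then
# draw that text in COLOR_BLUE over BACKGROUND_FOR_COLOR_BLUE.
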

def create_dict_of_transcription(transcription_file):
    # Parse an SRT file into {start: {"transcription": text, "end": end}}
    transcription_dict = {}
    start = None
    with open(transcription_file, "r") as f:
        transcriptions = f.read().splitlines()
    for line in transcriptions:
        # A "dd:dd:dd,ddd --> dd:dd:dd,ddd" line opens a new subtitle block
        if re.match(r"\d\d:\d\d:\d\d,\d+ --> \d\d:\d\d:\d\d,\d+", line):
            start, end = line.split(" --> ")
            transcription_dict[start] = {"transcription": "", "end": end}
        # Skip subtitle index lines
        elif re.match(r"\d+$", line):
            continue
        # Skip blank lines
        elif re.match(r"^$", line):
            continue
        # Any other line is subtitle text belonging to the current block
        # (guarding on `start` avoids a crash on text before the first timestamp)
        elif start is not None:
            # Remove characters that OpenCV can't show
            line = replace_characters_that_opencv_cant_show(line)
            transcription_dict[start]["transcription"] += f"{line}\n"
    return transcription_dict


def hour_minute_seconds_miliseconds_to_seconds(time):
    # "hh:mm:ss,mmm" -> seconds as a float
    if time is None:
        return None
    hours, minutes, seconds_miliseconds = time.split(":")
    seconds, miliseconds = seconds_miliseconds.split(",")
    return int(hours) * 3600 + int(minutes) * 60 + int(seconds) + int(miliseconds) / 1000


def seconds_to_hour_minute_seconds_miliseconds(seconds):
    # Seconds as a float -> "hh:mm:ss,mmm". Using the modulo of 1 instead of
    # splitting str(seconds) on "." also works when `seconds` is an integer.
    miliseconds = int(round((seconds % 1) * 1000))
    hours = int(seconds) // 3600
    minutes = (int(seconds) % 3600) // 60
    secs = int(seconds) % 60
    return f"{hours:02d}:{minutes:02d}:{secs:02d},{miliseconds:03d}"


def search_transcription_in_dict_of_transcription(transcription_dict, seconds):
    # Find the subtitle whose [start, next_start) interval contains `seconds`
    keys = list(transcription_dict.keys())
    for i in range(len(keys) - 1):
        key_seconds = hour_minute_seconds_miliseconds_to_seconds(keys[i])
        next_key_seconds = hour_minute_seconds_miliseconds_to_seconds(keys[i + 1])
        if key_seconds <= seconds < next_key_seconds:
            entry = transcription_dict[keys[i]]
            return entry["transcription"], entry["end"]
    # The loop above never tests the last block, so check it against its own end
    # time; otherwise the final subtitle would never be shown.
    if keys:
        entry = transcription_dict[keys[-1]]
        last_start = hour_minute_seconds_miliseconds_to_seconds(keys[-1])
        last_end = hour_minute_seconds_miliseconds_to_seconds(entry["end"])
        if last_start <= seconds <= last_end:
            return entry["transcription"], entry["end"]
    return None, None


def get_length_of_cv2_text(text, fontFace, fontScale, thickness):
    # Pixel width that cv2.putText would need to render `text`
    text_size, _ = cv2.getTextSize(text, fontFace, fontScale, thickness)
    return text_size[0]
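
# Illustrative input/output for create_dict_of_transcription() above (times and
# text are made up). Given an SRT block like:
#
#   1
#   00:00:01,000 --> 00:00:03,500
#   [SPEAKER_00]: Hola a todos
#
# the resulting dictionary entry is:
#
#   {"00:00:01,000": {"transcription": "[SPEAKER_00]: Hola a todos\n",
#                     "end": "00:00:03,500"}}
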

def add_subtitles_to_video(transcription_dict, input_video_file, input_audio_file):
    # Build the output path next to the input video. os.path handles nested
    # folders and extra dots, which naive split("/")/split(".") did not.
    input_video_folder, input_video_filename = os.path.split(input_video_file)
    input_video_name, output_video_extension = os.path.splitext(input_video_filename)
    output_video_file = os.path.join(input_video_folder, f"{input_video_name}_with_subtitles{output_video_extension}")

    # Open the input video file
    captured_video = cv2.VideoCapture(input_video_file)
    captured_video_fps = captured_video.get(cv2.CAP_PROP_FPS)
    captured_video_width = captured_video.get(cv2.CAP_PROP_FRAME_WIDTH)
    captured_video_height = captured_video.get(cv2.CAP_PROP_FRAME_HEIGHT)
    num_frames = int(captured_video.get(cv2.CAP_PROP_FRAME_COUNT))

    # Progress bar
    progress_bar = tqdm(total=num_frames, desc="Add subtitles to video progress")

    # Video writer
    if not DEBUG:
        fourcc = cv2.VideoWriter_fourcc(*'mp4v')
        video = cv2.VideoWriter(output_video_file, fourcc, captured_video_fps,
                                (int(captured_video_width), int(captured_video_height)))

    # Set font properties
    fontFace = cv2.FONT_HERSHEY_DUPLEX
    fontScale = 1
    thickness = 2
    color = COLOR_WHITE
    background = COLOR_GRAY
    lineType = cv2.LINE_AA
    bottomLeftOrigin = False

    old_text = ""
    while captured_video.isOpened():
        # Read the next frame
        ret, frame = captured_video.read()
        if not ret:
            break

        # Look up the subtitle for the current timestamp
        current_time = captured_video.get(cv2.CAP_PROP_POS_MSEC) / 1000
        text, end_time = search_transcription_in_dict_of_transcription(transcription_dict, current_time)

        if text is not None and len(text) > 0:
            # Drop the trailing newline/space left over from the SRT parsing
            text = text.rstrip("\n ")
            if old_text != text:
                old_text = text
            text_length = get_length_of_cv2_text(text, fontFace, fontScale, thickness)
            end_time_seconds = hour_minute_seconds_miliseconds_to_seconds(end_time)
            if end_time_seconds is not None and current_time <= end_time_seconds:
                if text_length > captured_video_width:
                    # Text is wider than the frame: split it into several rows
                    necesary_rows = int(text_length // (captured_video_width - 300) + 1)
                    words = text.split(" ")
                    number_of_words = len(words)
                    # max(1, ...) guards against division by zero for very long words
                    words_per_row = max(1, number_of_words // necesary_rows)
                    text = ""
                    text_position = (50, int(captured_video_height) - 50 * (necesary_rows + 1))
                    rectangle_point1 = (40, text_position[1] - 30)
                    for i in range(number_of_words):
                        if i % words_per_row == 0 and i != 0:
                            # Draw the row accumulated so far and move down one row
                            text, color, background = get_filter_text_and_speaker(text, color, background)
                            length_of_text = get_length_of_cv2_text(text, fontFace, fontScale, thickness)
                            if length_of_text > 10:
                                rectangle_point2 = (length_of_text + 50, text_position[1] + 10)
                                cv2.rectangle(frame, rectangle_point1, rectangle_point2, background, -1, cv2.LINE_AA, 0)
                                cv2.putText(frame, text, text_position, fontFace, fontScale, color, thickness, lineType, bottomLeftOrigin)
                            text = ""
                            text_position = (50, text_position[1] + 50)
                            rectangle_point1 = (40, text_position[1] - 30)
                        text += words[i] + " "
                    # Draw the last row
                    text, color, background = get_filter_text_and_speaker(text, color, background)
                    length_of_text = get_length_of_cv2_text(text, fontFace, fontScale, thickness)
                    if length_of_text > 10:
                        rectangle_point2 = (length_of_text + 50, text_position[1] + 10)
                        cv2.rectangle(frame, rectangle_point1, rectangle_point2, background, -1, cv2.LINE_AA, 0)
                        cv2.putText(frame, text, text_position, fontFace, fontScale, color, thickness, lineType, bottomLeftOrigin)
                else:
                    # Text fits on a single row near the bottom of the frame
                    text_position = (50, int(captured_video_height) - 50)
                    rectangle_point1 = (40, text_position[1] - 30)
                    text, color, background = get_filter_text_and_speaker(text, color, background)
                    length_of_text = get_length_of_cv2_text(text, fontFace, fontScale, thickness)
                    if length_of_text > 10:
                        rectangle_point2 = (length_of_text + 50, text_position[1] + 10)
                        cv2.rectangle(frame, rectangle_point1, rectangle_point2, background, -1, cv2.LINE_AA, 0)
                        cv2.putText(frame, text, text_position, fontFace, fontScale, color, thickness, lineType, bottomLeftOrigin)

        # Update the progress bar
        progress_bar.update(1)

        # Show the frame
        if DEBUG:
            cv2.imshow('frame', frame)
            # Set window 520x293
            cv2.namedWindow("frame", cv2.WINDOW_NORMAL)
            cv2.resizeWindow("frame", 520, 293)
            if cv2.waitKey(1) & 0xFF == ord('q'):
                break

        # Write the frame
        if not DEBUG:
            video.write(frame)

    # Release the video capture object
    captured_video.release()

    # Close all the frames
    cv2.destroyAllWindows()

    # Release the video writer object
    if not DEBUG:
        video.release()

    # Mux the audio back in with ffmpeg: the OpenCV writer produces a silent video.
    # Paths are quoted so files with spaces survive the shell.
    if not DEBUG:
        progress_bar = tqdm(total=3, desc="Add audio to video progress")
        muxed_video_file = f"{output_video_file}_with_audio{output_video_extension}"
        command = (f'ffmpeg -i "{output_video_file}" -i "{input_audio_file}" '
                   f'-c:v copy -c:a aac -strict experimental -loglevel warning "{muxed_video_file}"')
        os.system(command)
        progress_bar.update(1)
        os.remove(output_video_file)
        progress_bar.update(1)
        os.replace(muxed_video_file, output_video_file)
        progress_bar.update(1)
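
# Worked example of the row-splitting arithmetic above (numbers assumed for
# illustration): with a 1920 px wide frame and a rendered text width of 3000 px,
# necesary_rows = int(3000 // (1920 - 300) + 1) = 2, so a 20-word subtitle is
# drawn as two rows of 10 words each, stacked 50 px apart above the bottom of
# the frame.
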

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("transcription_file", help="Transcribed text")
    parser.add_argument("input_video_file", help="Input video file")
    parser.add_argument("input_audio_file", help="Input audio file")
    args = parser.parse_args()
    transcription_file = args.transcription_file
    input_video_file = args.input_video_file
    input_audio_file = args.input_audio_file

    transcription_dict = create_dict_of_transcription(transcription_file)
    # for key in transcription_dict.keys():
    #     print(key)
    #     print(transcription_dict[key])
    #     print("\n\n")
    add_subtitles_to_video(transcription_dict, input_video_file, input_audio_file)
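
# Example invocation (the script and file names here are hypothetical):
#
#   python add_subtitles_to_video.py transcription.srt videos/input.mp4 audio.aac
#
# This writes videos/input_with_subtitles.mp4 and then muxes the audio track
# back in with ffmpeg, since the OpenCV writer alone produces a silent video.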