import json
from os import path as osp

import cv2
import webvtt

from utils import maintain_aspect_ratio_resize, str2time


def extract_and_save_frames_and_metadata(
        path_to_video,
        path_to_transcript,
        path_to_save_extracted_frames,
        path_to_save_metadatas):
    """Extract one frame per transcript segment and save frames + metadata.

    For every segment in the WebVTT transcript, the frame at the midpoint
    between the segment's start and end time is read from the video,
    resized with ``maintain_aspect_ratio_resize(frame, height=350)`` and
    written as ``frame_<idx>.jpg`` into ``path_to_save_extracted_frames``.
    The metadata of all successfully extracted frames is written to
    ``metadatas.json`` inside ``path_to_save_metadatas``.

    Neither output directory is created here; both paths are only joined
    with file names and handed to ``cv2.imwrite`` / ``open``.

    Args:
        path_to_video: Path of the video file, passed to
            ``cv2.VideoCapture``.
        path_to_transcript: Path of the transcript file, passed to
            ``webvtt.read``.
        path_to_save_extracted_frames: Directory that receives the JPEG
            frames.
        path_to_save_metadatas: Directory that receives ``metadatas.json``.

    Returns:
        A list of dicts, one per segment whose frame could be read, with
        the keys ``extracted_frame_path``, ``transcript``,
        ``video_segment_id``, ``video_path`` and ``mid_time_ms``.
        Segments whose frame could not be read are reported with ``print``
        and skipped.
    """
    # metadatas will store the metadata of all extracted frames
    metadatas = []

    # load video using cv2
    video = cv2.VideoCapture(path_to_video)
    try:
        # load transcript using webvtt
        trans = webvtt.read(path_to_transcript)

        # iterate over every video segment specified in the transcript file
        for idx, transcript in enumerate(trans):
            # NOTE(review): str2time is defined in utils and not visible
            # here; its result is used as a position in milliseconds
            # (CAP_PROP_POS_MSEC below) -- confirm it returns ms.
            start_time_ms = str2time(transcript.start)
            end_time_ms = str2time(transcript.end)
            # time exactly in the middle of start time and end time
            mid_time_ms = (end_time_ms + start_time_ms) / 2
            # get the transcript, replace line breaks with spaces
            text = transcript.text.replace("\n", ' ')

            # seek to the middle time and grab the frame there
            video.set(cv2.CAP_PROP_POS_MSEC, mid_time_ms)
            success, frame = video.read()
            if not success:
                # best effort: report the segment and keep going
                print(f"ERROR! Cannot extract frame: idx = {idx}")
                continue

            # the frame was extracted successfully, resize it
            image = maintain_aspect_ratio_resize(frame, height=350)
            # save frame as JPEG file
            img_fname = f'frame_{idx}.jpg'
            img_fpath = osp.join(
                path_to_save_extracted_frames, img_fname
            )
            # the return value of cv2.imwrite is not checked
            cv2.imwrite(img_fpath, image)

            # prepare the metadata
            metadata = {
                'extracted_frame_path': img_fpath,
                'transcript': text,
                'video_segment_id': idx,
                'video_path': path_to_video,
                'mid_time_ms': mid_time_ms,
            }
            metadatas.append(metadata)
    finally:
        # always release the capture handle, even if reading the
        # transcript or processing a frame raises
        video.release()

    # save metadata of all extracted frames
    fn = osp.join(path_to_save_metadatas, 'metadatas.json')
    with open(fn, 'w') as outfile:
        json.dump(metadatas, outfile)
    return metadatas