import argparse import csv import os import requests import tqdm from .utils import extract_frames, prompts, read_video_list def get_caption(frame, prompt, api_key): headers = {"Content-Type": "application/json", "Authorization": f"Bearer {api_key}"} payload = { "model": "gpt-4-vision-preview", "messages": [ { "role": "user", "content": [ { "type": "text", "text": prompt, }, {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{frame[0]}"}}, {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{frame[1]}"}}, {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{frame[2]}"}}, ], } ], "max_tokens": 300, } response = requests.post("https://api.openai.com/v1/chat/completions", headers=headers, json=payload, timeout=60) caption = response.json()["choices"][0]["message"]["content"] caption = caption.replace("\n", " ") return caption def main(args): # ====================================================== # 1. read video list # ====================================================== videos = read_video_list(args.video_folder, args.output_file) f = open(args.output_file, "a") writer = csv.writer(f) # ====================================================== # 2. generate captions # ====================================================== for video in tqdm.tqdm(videos): video_path = os.path.join(args.video_folder, video) frame, length = extract_frames(video_path, base_64=True) if len(frame) < 3: continue prompt = prompts[args.prompt] caption = get_caption(frame, prompt, args.key) writer.writerow((video, caption, length)) f.close() if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument("video_folder", type=str) parser.add_argument("output_file", type=str) parser.add_argument("--prompt", type=str, default="three_frames") parser.add_argument("--key", type=str) args = parser.parse_args() main(args)