| | |
| |
|
| |
|
| | import gradio as gr |
| | import cv2 |
| | import base64 |
| | import time |
| | import os |
| | import json |
| | import sys |
| | from openai import OpenAI |
| | from dotenv import load_dotenv |
| | from gtts import gTTS |
| | import tempfile |
| |
|
# Load environment variables (e.g. OPENAI_API_KEY) from a local .env file, if present.
load_dotenv()
|
def _extract_frames(video_file_path, max_frames=25):
    """Sample up to roughly *max_frames* frames from a video as base64 JPEG strings.

    Returns a ``(frames, error_message)`` pair; ``error_message`` is ``None``
    on success, and ``frames`` is empty when an error occurred.
    """
    video = cv2.VideoCapture(video_file_path)
    if not video.isOpened():
        return [], f"Error: Failed to open video file: {video_file_path}"

    total_frames = int(video.get(cv2.CAP_PROP_FRAME_COUNT))
    # Aim for ~max_frames evenly spaced samples; fall back to a fixed stride
    # when the container does not report a frame count (total_frames <= 0).
    sampling_rate = max(1, total_frames // max_frames) if total_frames > 0 else 50

    frames = []
    frame_index = 0
    while True:
        success, frame = video.read()
        if not success:
            break
        if frame_index % sampling_rate == 0:
            ok, buffer = cv2.imencode(".jpg", frame)
            # Skip frames that fail to encode instead of sending garbage.
            if ok:
                frames.append(base64.b64encode(buffer).decode("utf-8"))
        frame_index += 1

    video.release()
    print(f"Processed {len(frames)} frames from {total_frames} total frames.")
    return frames, None


def generate_explanation(video_file_path, prompt_text, openai_api_key_input):
    """
    Process a video, generate an explanation using OpenAI, and convert it to audio.
    This function is designed to be called by Gradio.

    Args:
        video_file_path: Filesystem path of the uploaded video (from gr.File).
        prompt_text: User instruction describing what to explain.
        openai_api_key_input: API key from the UI field; used only when the
            OPENAI_API_KEY environment variable is not set.

    Returns:
        ``(explanation_text, audio_file_path)`` on success, or
        ``(error_message, None)`` on any failure.
    """
    # Validate cheap user inputs first, before any API-key handling or
    # client construction, so input mistakes are reported immediately.
    if not video_file_path:
        return "Error: Please upload a video file.", None
    if not prompt_text:
        return "Error: Please provide an explanation prompt.", None

    # Environment variable takes precedence over the UI field (original behavior).
    api_key = os.getenv("OPENAI_API_KEY") or openai_api_key_input
    if not api_key or api_key == "<your OpenAI API key if not set as env var>":
        return "Error: OpenAI API key is missing. Please provide it in the input field or set it as an environment variable (OPENAI_API_KEY).", None

    print(f"Video file path: {video_file_path}")
    client = OpenAI(api_key=api_key)

    base64Frames, frame_error = _extract_frames(video_file_path)
    if frame_error:
        return frame_error, None
    if not base64Frames:
        # Video opened but produced no decodable frames; do not call the API
        # with an image-free request.
        return f"Error: No frames could be read from video file: {video_file_path}", None

    PROMPT_MESSAGES = [
        {
            "role": "user",
            "content": [
                {
                    "type": "text",
                    "text": f"Create what is asked for in {prompt_text} for the images provided to you. Do not ask any questions. Just do what the user asks for in {prompt_text} for the images provided to you"
                },
                *[
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": f"data:image/jpeg;base64,{frame}",
                            "detail": "low"
                        }
                    } for frame in base64Frames
                ]
            ],
        },
    ]

    try:
        result = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=PROMPT_MESSAGES,
            max_tokens=500,
        )
        explanation = result.choices[0].message.content
        print("Generated explanation based on provided prompt.")
    except Exception as e:
        return f"Error generating explanation: {str(e)}", None

    try:
        # Create the path and close our handle BEFORE gTTS writes to it:
        # keeping a NamedTemporaryFile open while gTTS re-opens the same path
        # fails on Windows due to mandatory file locking. delete is implicit
        # (mkstemp never auto-deletes) because Gradio must read the file later.
        fd, audio_path = tempfile.mkstemp(suffix=".mp3")
        os.close(fd)
        gTTS(text=explanation, lang='en').save(audio_path)
        print("Generated audio file.")
    except Exception as e:
        return f"Error generating audio: {str(e)}", None

    return explanation, audio_path
| |
|
| | |
# Gradio UI wiring: three inputs feed generate_explanation, which returns the
# explanation text plus a filepath to its spoken-audio rendering.
_video_input = gr.File(
    label="Upload Video File",
    type="filepath",
    file_count="single",
    file_types=[".mp4", ".avi", ".mov", ".webm"],
)
_prompt_input = gr.Textbox(
    label="Explanation Prompt",
    placeholder="e.g., 'What is happening in this video? Describe the main actions and objects.'",
    lines=5,
)
_api_key_input = gr.Textbox(
    label="OpenAI API Key",
    type="password",
    placeholder="sk-xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx",
)
_explanation_output = gr.Textbox(label="Generated Explanation", lines=10)
_audio_output = gr.Audio(label="Explanation Audio", type="filepath")

iface = gr.Interface(
    fn=generate_explanation,
    inputs=[_video_input, _prompt_input, _api_key_input],
    outputs=[_explanation_output, _audio_output],
    title="Video Explanation Agent ",
    description="Upload a video and provide a prompt to get an AI-generated explanation and an audio version of the explanation.",
)
| |
|
# Launch the Gradio app only when executed as a script (not when imported).
if __name__ == "__main__":
    iface.launch()
| |
|