| | |
| |
|
| |
|
| | import gradio as gr |
| | import cv2 |
| | import base64 |
| | import time |
| | import os |
| | import json |
| | import sys |
| | from openai import OpenAI |
| | from dotenv import load_dotenv |
| | from gtts import gTTS |
| | import tempfile |
| |
|
# Load environment variables (e.g. OPENAI_API_KEY) from a local .env file, if present.
load_dotenv()
|
def _extract_frames(video_file_path, max_frames=25):
    """Sample up to roughly *max_frames* frames from a video as base64 JPEG strings.

    Returns a ``(frames, error_message)`` pair; ``error_message`` is ``None``
    on success, and ``frames`` is empty when an error occurred.
    """
    video = cv2.VideoCapture(video_file_path)
    if not video.isOpened():
        return [], f"Error: Failed to open video file: {video_file_path}"

    total_frames = int(video.get(cv2.CAP_PROP_FRAME_COUNT))
    # Aim for ~max_frames evenly spaced samples; fall back to a fixed stride
    # when the container does not report a frame count (total_frames <= 0).
    sampling_rate = max(1, total_frames // max_frames) if total_frames > 0 else 50

    frames = []
    frame_index = 0
    while True:
        success, frame = video.read()
        if not success:
            break
        if frame_index % sampling_rate == 0:
            ok, buffer = cv2.imencode(".jpg", frame)
            # Skip frames that fail to encode instead of sending garbage.
            if ok:
                frames.append(base64.b64encode(buffer).decode("utf-8"))
        frame_index += 1

    video.release()
    print(f"Processed {len(frames)} frames from {total_frames} total frames.")
    return frames, None


def generate_explanation(video_file_path, prompt_text, openai_api_key_input):
    """
    Process a video, generate an explanation using OpenAI, and convert it to audio.
    This function is designed to be called by Gradio.

    Args:
        video_file_path: Filesystem path of the uploaded video (from gr.File).
        prompt_text: User instruction describing what to explain.
        openai_api_key_input: API key from the UI field; used only when the
            OPENAI_API_KEY environment variable is not set.

    Returns:
        ``(explanation_text, audio_file_path)`` on success, or
        ``(error_message, None)`` on any failure.
    """
    # Validate cheap user inputs first, before any API-key handling or
    # client construction, so input mistakes are reported immediately.
    if not video_file_path:
        return "Error: Please upload a video file.", None
    if not prompt_text:
        return "Error: Please provide an explanation prompt.", None

    # Environment variable takes precedence over the UI field (original behavior).
    api_key = os.getenv("OPENAI_API_KEY") or openai_api_key_input
    if not api_key or api_key == "<your OpenAI API key if not set as env var>":
        return "Error: OpenAI API key is missing. Please provide it in the input field or set it as an environment variable (OPENAI_API_KEY).", None

    print(f"Video file path: {video_file_path}")
    client = OpenAI(api_key=api_key)

    base64Frames, frame_error = _extract_frames(video_file_path)
    if frame_error:
        return frame_error, None
    if not base64Frames:
        # Video opened but produced no decodable frames; do not call the API
        # with an image-free request.
        return f"Error: No frames could be read from video file: {video_file_path}", None

    PROMPT_MESSAGES = [
        {
            "role": "user",
            "content": [
                {
                    "type": "text",
                    "text": f"Create what is asked for in {prompt_text} for the images provided to you. Do not ask any questions. Just do what the user asks for in {prompt_text} for the images provided to you"
                },
                *[
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": f"data:image/jpeg;base64,{frame}",
                            "detail": "low"
                        }
                    } for frame in base64Frames
                ]
            ],
        },
    ]

    try:
        result = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=PROMPT_MESSAGES,
            max_tokens=500,
        )
        explanation = result.choices[0].message.content
        print("Generated explanation based on provided prompt.")
    except Exception as e:
        return f"Error generating explanation: {str(e)}", None

    try:
        # Create the path and close our handle BEFORE gTTS writes to it:
        # keeping a NamedTemporaryFile open while gTTS re-opens the same path
        # fails on Windows due to mandatory file locking. delete is implicit
        # (mkstemp never auto-deletes) because Gradio must read the file later.
        fd, audio_path = tempfile.mkstemp(suffix=".mp3")
        os.close(fd)
        gTTS(text=explanation, lang='en').save(audio_path)
        print("Generated audio file.")
    except Exception as e:
        return f"Error generating audio: {str(e)}", None

    return explanation, audio_path
| |
|
| | |
# Gradio UI wiring: three inputs feed generate_explanation, which returns the
# explanation text plus a filepath to its spoken-audio rendering.
_video_input = gr.File(
    label="Upload Video File",
    type="filepath",
    file_count="single",
    file_types=[".mp4", ".avi", ".mov", ".webm"],
)
_prompt_input = gr.Textbox(
    label="Explanation Prompt",
    placeholder="e.g., 'What is happening in this video? Describe the main actions and objects.'",
    lines=5,
)
_api_key_input = gr.Textbox(
    label="OpenAI API Key",
    type="password",
    placeholder="sk-xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx",
)
_explanation_output = gr.Textbox(label="Generated Explanation", lines=10)
_audio_output = gr.Audio(label="Explanation Audio", type="filepath")

iface = gr.Interface(
    fn=generate_explanation,
    inputs=[_video_input, _prompt_input, _api_key_input],
    outputs=[_explanation_output, _audio_output],
    title="Video Explanation Agent ",
    description="Upload a video and provide a prompt to get an AI-generated explanation and an audio version of the explanation.",
)
| |
|
# Launch the Gradio app only when executed as a script (not when imported).
if __name__ == "__main__":
    iface.launch()
| |
|