import cv2
import streamlit as st
import tempfile
import base64
import os
from dotenv import load_dotenv
from openai import OpenAI
import assemblyai as aai

# Load environment variables and configure the API clients
load_dotenv()
aai.settings.api_key = os.getenv("ASSEMBLYAI_API_KEY")
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))


def main():
    st.title('Insightly Video Content Moderation')

    # Video upload section
    uploaded_video = st.file_uploader('Upload a video', type=["mp4", "avi", "mov"])

    if uploaded_video is not None:
        # Save the upload to a temp file so OpenCV and AssemblyAI can read it from disk
        tfile = tempfile.NamedTemporaryFile(delete=False)
        tfile.write(uploaded_video.read())
        video_file_path = tfile.name
        tfile.close()

        # Process the video and display frames in a grid layout
        base64_frames = video_to_base64_frames(video_file_path)
        display_frame_grid(base64_frames[::30])  # Display every 30th frame in a 3-column grid

        st.write("Actions:")  # Header for the actions/buttons section

        # Create four columns to align the buttons
        col1, col2, col3, col4 = st.columns(4)

        # Each button computes its result once and caches it in session state,
        # so Streamlit reruns don't repeat the expensive API calls
        with col1:
            if st.button("Description") and 'description' not in st.session_state:
                st.session_state['description'] = generate_description(base64_frames)
        with col2:
            if st.button("Frame Description") and 'frame_description' not in st.session_state:
                st.session_state['frame_description'] = generate_frame_description(base64_frames)
        with col3:
            if st.button("Generate Transcript") and 'transcript' not in st.session_state:
                # Transcribe only when requested, then cache the text
                transcriber = aai.Transcriber()
                st.session_state['transcript'] = transcriber.transcribe(video_file_path).text
        with col4:
            if st.button("Category of Video") and 'category' not in st.session_state:
                st.session_state['category'] = generate_category(base64_frames)

        # Display whichever results already exist in session state
        if st.session_state.get('description'):
            st.subheader("Video Description")
            st.write(st.session_state['description'])
        if st.session_state.get('frame_description'):
            st.subheader("Frame Description")
            st.write(st.session_state['frame_description'])
        if st.session_state.get('transcript'):
            st.subheader("Video Transcript")
            st.write(st.session_state['transcript'])
        if st.session_state.get('category'):
            st.subheader("Video Category")
            st.write(st.session_state['category'])


def video_to_base64_frames(video_file_path):
    # Extract every frame from the video and encode it as a base64 JPEG string
    video = cv2.VideoCapture(video_file_path)
    base64_frames = []

    while video.isOpened():
        success, frame = video.read()
        if not success:
            break
        _, buffer = cv2.imencode('.jpg', frame)
        base64_frames.append(base64.b64encode(buffer).decode('utf-8'))

    video.release()
    return base64_frames


#########################################
def ask_gpt_vision(prompt, base64_frames):
    # Shared helper: send a text prompt plus every 30th frame to GPT-4 Vision
    prompt_messages = [
        {
            "role": "user",
            "content": [
                prompt,
                *map(lambda x: {"image": x, "resize": 428}, base64_frames[::30]),
            ],
        },
    ]
    response = client.chat.completions.create(
        model="gpt-4-vision-preview",
        messages=prompt_messages,
        max_tokens=3000,
    )
    return response.choices[0].message.content


# Generate video description
def generate_description(base64_frames):
    return ask_gpt_vision(
        "Generate a description for this sequence of video frames in about 90 words. "
        "Return the following: 1. A list of objects in the video. "
        "2. Any restrictive or sensitive content, and if so, in which frame.",
        base64_frames,
    )


# Generate frame-by-frame description
def generate_frame_description(base64_frames):
    return ask_gpt_vision("Describe what is happening in each frame.", base64_frames)


# Generate category of video
def generate_category(base64_frames):
    return ask_gpt_vision("What category can this video be tagged to?", base64_frames)


########################
def display_frame_grid(base64_frames):
    # Lay the sampled frames out in rows of three columns
    cols_per_row = 3
    n_frames = len(base64_frames)
    for idx in range(0, n_frames, cols_per_row):
        cols = st.columns(cols_per_row)
        for col_index in range(cols_per_row):
            frame_idx = idx + col_index
            if frame_idx < n_frames:
                with cols[col_index]:
                    frame = base64_frames[frame_idx]
                    st.image(base64.b64decode(frame),
                             caption=f'Frame {frame_idx * 30 + 1}', width=200)


if __name__ == '__main__':
    main()
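
# ---------------------------------------------------------------
# Usage note (a minimal sketch; the filename app.py and the exact
# package list below are assumptions, not part of the original source):
#
#   1. Create a .env file next to this script with your keys:
#        ASSEMBLYAI_API_KEY=<your AssemblyAI key>
#        OPENAI_API_KEY=<your OpenAI key>
#   2. Install the dependencies, e.g.:
#        pip install streamlit opencv-python python-dotenv openai assemblyai
#   3. Launch the app with:
#        streamlit run app.py
# ---------------------------------------------------------------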