"""Streamlit demo: retrieve movie scenes from natural-language descriptions.

A query is embedded with a sentence-transformer and compared (cosine
similarity) against per-clip feature matrices. This demo ships with random
"dummy" features, so rankings are simulated rather than semantically real.
"""

import streamlit as st
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import torch
import json
import os
from pathlib import Path


class VideoRetrieval:
    """Rank movie clips against a free-text query.

    Holds a dict of per-clip feature matrices (one (n_clips, 384) array per
    feature type) and a DataFrame of clip metadata. Queries are encoded with
    'all-MiniLM-L6-v2' (384-dim), so dummy features use the same width.
    """

    def __init__(self, use_dummy_data=True):
        # Sentence encoder; its 384-dim output must match the feature arrays.
        self.text_model = SentenceTransformer('all-MiniLM-L6-v2')
        if use_dummy_data:
            self.create_dummy_data()
        else:
            self.load_data()

    def create_dummy_data(self):
        """Create dummy features and metadata for demonstration."""
        n_clips = 20
        feature_dim = 384  # matching the dimension of all-MiniLM-L6-v2

        # Random Gaussian features stand in for real pre-computed embeddings.
        self.features = {
            'visual_features': np.random.randn(n_clips, feature_dim),
            'scene_features': np.random.randn(n_clips, feature_dim),
            'object_features': np.random.randn(n_clips, feature_dim),
        }

        movie_titles = [
            "The Matrix", "Inception", "The Dark Knight", "Pulp Fiction",
            "The Shawshank Redemption", "Forrest Gump", "The Godfather",
            "Fight Club", "Interstellar", "The Silence of the Lambs",
        ]

        descriptions = [
            "A dramatic confrontation in a dark room where the truth is revealed",
            "A high-stakes chase through a crowded city street",
            "An emotional reunion between long-lost friends",
            "A tense negotiation that determines the fate of many",
            "A quiet moment of reflection before a life-changing decision",
        ]

        # Sample YouTube clips (famous movie scenes)
        youtube_clips = [
            "https://www.youtube.com/watch?v=kcsNbQRU5TI",  # Matrix - Red Pill Blue Pill
            "https://www.youtube.com/watch?v=YoHD9XEInc0",  # Inception - Hallway Fight
            "https://www.youtube.com/watch?v=ZWCAf-xLV2k",  # Dark Knight - Interrogation
            "https://www.youtube.com/watch?v=Jomr9SAjcyw",  # Pulp Fiction - Restaurant
            "https://www.youtube.com/watch?v=SQ7_5MMbPYs",  # Shawshank - Hope Speech
        ]

        # Metadata rows cycle through the sample titles/descriptions/URLs.
        data = []
        for i in range(n_clips):
            data.append({
                'clip_id': f'clip_{i}',
                'movie_title': movie_titles[i % len(movie_titles)],
                'description': descriptions[i % len(descriptions)],
                'timestamp': f'{(i*5):02d}:00 - {(i*5+3):02d}:00',
                'duration': '3:00',
                'youtube_url': youtube_clips[i % len(youtube_clips)],
            })
        self.clips_df = pd.DataFrame(data)

    def load_data(self):
        """Load actual pre-computed features and metadata.

        Falls back to dummy data (with a UI error message) when any of the
        expected files is missing.
        """
        try:
            self.features = {
                'visual_features': np.load('path_to_visual_features.npy'),
                'scene_features': np.load('path_to_scene_features.npy'),
                'object_features': np.load('path_to_object_features.npy'),
            }
            self.clips_df = pd.read_csv('clips_metadata.csv')
        except FileNotFoundError as e:
            st.error(f"Error loading data: {e}. Falling back to dummy data.")
            self.create_dummy_data()

    def encode_query(self, query_text):
        """Encode the text query into a 1-D embedding vector."""
        return self.text_model.encode(query_text)

    def compute_similarity(self, query_embedding, feature_type='visual_features'):
        """Return cosine similarity of the query to every clip's features.

        Returns a 1-D array of length n_clips for the given feature type.
        """
        similarities = cosine_similarity(
            query_embedding.reshape(1, -1),
            self.features[feature_type]
        )
        return similarities[0]

    def retrieve_clips(self, query_text, top_k=3):
        """Retrieve the top-k most relevant clips for a text query.

        Combines per-feature-type cosine similarities with fixed weights and
        returns a list of metadata dicts sorted by descending score.
        """
        query_embedding = self.encode_query(query_text)

        # Weighted blend of the three feature channels (weights sum to 1.0).
        weights = {
            'visual_features': 0.4,
            'scene_features': 0.3,
            'object_features': 0.3,
        }
        similarities = {}
        for feat_type, weight in weights.items():
            similarities[feat_type] = self.compute_similarity(query_embedding, feat_type) * weight

        combined_similarities = sum(similarities.values())

        # Indices of the k largest scores, best first.
        top_indices = np.argsort(combined_similarities)[-top_k:][::-1]

        results = []
        for idx in top_indices:
            # Fetch the metadata row once instead of one .iloc per column.
            row = self.clips_df.iloc[idx]
            results.append({
                'clip_id': row['clip_id'],
                'movie_title': row['movie_title'],
                'description': row['description'],
                'timestamp': row['timestamp'],
                'youtube_url': row['youtube_url'],
                # Convert to float for JSON serialization
                'similarity_score': float(combined_similarities[idx]),
            })
        return results


def main():
    """Render the Streamlit search UI and display retrieval results."""
    st.set_page_config(
        page_title="Movie Scene Retrieval System",
        page_icon="đŸŽŦ",
        layout="wide"
    )

    st.title("đŸŽŦ Movie Scene Retrieval System")
    st.write("""
    Search for movie scenes using natural language descriptions.
    The system will retrieve the most relevant 2-3 minute clips based on your query.

    *Note: This is a demo version using simulated data.*
    """)

    # Cache the (expensive-to-build) retrieval system across reruns using the
    # documented membership-check idiom instead of try/except AttributeError.
    if "retrieval_system" not in st.session_state:
        st.session_state.retrieval_system = VideoRetrieval(use_dummy_data=True)
    retrieval_system = st.session_state.retrieval_system

    # Search interface
    col1, col2 = st.columns([3, 1])
    with col1:
        query = st.text_input(
            "Enter your scene description:",
            placeholder="e.g., A dramatic confrontation between two characters in a dark room"
        )
    with col2:
        num_results = st.slider("Number of results:", min_value=1, max_value=5, value=3)

    if st.button("🔍 Search", type="primary"):
        if not query:
            st.warning("Please enter a scene description.")
        else:
            with st.spinner("Searching for relevant clips..."):
                results = retrieval_system.retrieve_clips(query, top_k=num_results)

                for i, result in enumerate(results, 1):
                    with st.container():
                        st.subheader(f"{result['movie_title']}")
                        cols = st.columns([2, 1])
                        with cols[0]:
                            st.markdown(f"**Scene Description:**")
                            st.write(result['description'])
                            st.text(f"⏱ī¸ Timestamp: {result['timestamp']}")
                            # Add video player
                            if result['youtube_url']:
                                st.video(result['youtube_url'])
                        with cols[1]:
                            st.markdown("**Relevance Score:**")
                            # st.progress requires a value in [0, 1]; cosine
                            # scores can be negative, so clamp before display.
                            score = min(1.0, max(0.0, result['similarity_score']))
                            st.progress(score)
                            st.text(f"{score:.2%} match")
                            # Add direct YouTube link
                            st.markdown(f"[🔗 Watch on YouTube]({result['youtube_url']})")
                            st.text("Click to open in a new tab")
                        st.divider()

    # Sidebar with additional information
    with st.sidebar:
        st.header("ℹī¸ About")
        st.write("""
        This demo system simulates a video retrieval engine that uses:
        - đŸŽĨ Visual scene understanding
        - đŸ‘Ĩ Character interaction analysis
        - đŸŽ¯ Object detection
        - 🎭 Action recognition

        In a production system, these features would be pre-computed
        from actual movie clips using state-of-the-art AI models.
        """)

        st.header("⚙ī¸ Feature Weights")
        st.write("Current weights used for similarity computation:")
        st.write("- đŸŽŦ Visual Features: 40%")
        st.write("- 🏞ī¸ Scene Features: 30%")
        st.write("- đŸ“Ļ Object Features: 30%")


if __name__ == "__main__":
    main()