"""Streamlit demo app for natural-language movie scene retrieval."""

import streamlit as st
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
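
# To try the demo locally, launch it through Streamlit's CLI (the file name
# app.py is an assumption; use whatever this script is saved as):
#   streamlit run app.py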


class VideoRetrieval:
    """Retrieve movie clips whose precomputed features best match a text query."""

    def __init__(self, use_dummy_data=True):
        # Sentence-transformer model used to embed text queries (384-dim output).
        self.text_model = SentenceTransformer('all-MiniLM-L6-v2')
        if use_dummy_data:
            self.create_dummy_data()
        else:
            self.load_data()

    def create_dummy_data(self):
        """Create dummy features and metadata for demonstration."""
        n_clips = 20
        # 384 matches the embedding size of all-MiniLM-L6-v2, so the dummy
        # clip features live in the same space as the encoded queries.
        feature_dim = 384

        # Random vectors stand in for precomputed per-clip feature embeddings.
        self.features = {
            'visual_features': np.random.randn(n_clips, feature_dim),
            'scene_features': np.random.randn(n_clips, feature_dim),
            'object_features': np.random.randn(n_clips, feature_dim)
        }

        movie_titles = [
            "The Matrix", "Inception", "The Dark Knight", "Pulp Fiction",
            "The Shawshank Redemption", "Forrest Gump", "The Godfather",
            "Fight Club", "Interstellar", "The Silence of the Lambs"
        ]

        descriptions = [
            "A dramatic confrontation in a dark room where the truth is revealed",
            "A high-stakes chase through a crowded city street",
            "An emotional reunion between long-lost friends",
            "A tense negotiation that determines the fate of many",
            "A quiet moment of reflection before a life-changing decision"
        ]

        youtube_clips = [
            "https://www.youtube.com/watch?v=kcsNbQRU5TI",
            "https://www.youtube.com/watch?v=YoHD9XEInc0",
            "https://www.youtube.com/watch?v=ZWCAf-xLV2k",
            "https://www.youtube.com/watch?v=Jomr9SAjcyw",
            "https://www.youtube.com/watch?v=SQ7_5MMbPYs",
        ]

        # Cycle through the sample titles, descriptions and URLs so every
        # dummy clip gets plausible metadata.
        data = []
        for i in range(n_clips):
            data.append({
                'clip_id': f'clip_{i}',
                'movie_title': movie_titles[i % len(movie_titles)],
                'description': descriptions[i % len(descriptions)],
                'timestamp': f'{(i*5):02d}:00 - {(i*5+3):02d}:00',
                'duration': '3:00',
                'youtube_url': youtube_clips[i % len(youtube_clips)]
            })

        self.clips_df = pd.DataFrame(data)

    def load_data(self):
        """Load pre-computed features and metadata.

        Each feature file is expected to hold one row per clip, in the same
        row order as clips_metadata.csv and with the same width as the query
        embeddings (384 for all-MiniLM-L6-v2).
        """
        try:
            self.features = {
                'visual_features': np.load('path_to_visual_features.npy'),
                'scene_features': np.load('path_to_scene_features.npy'),
                'object_features': np.load('path_to_object_features.npy')
            }
            self.clips_df = pd.read_csv('clips_metadata.csv')
        except FileNotFoundError as e:
            st.error(f"Error loading data: {e}. Falling back to dummy data.")
            self.create_dummy_data()
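
    # An offline preprocessing step (not part of this app; variable names
    # below are hypothetical) would produce the files loaded above, e.g.:
    #   np.save('path_to_visual_features.npy', visual_matrix)  # shape (n_clips, 384)
    #   clips_df.to_csv('clips_metadata.csv', index=False)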

    def encode_query(self, query_text):
        """Encode the text query into an embedding vector."""
        return self.text_model.encode(query_text)

    def compute_similarity(self, query_embedding, feature_type='visual_features'):
        """Compute cosine similarity between the query and every clip's features."""
        similarities = cosine_similarity(
            query_embedding.reshape(1, -1),
            self.features[feature_type]
        )
        return similarities[0]

    def retrieve_clips(self, query_text, top_k=3):
        """Retrieve the top-k most relevant clips for a text query."""
        query_embedding = self.encode_query(query_text)

        # Relative importance of each feature space in the final ranking.
        similarities = {}
        weights = {
            'visual_features': 0.4,
            'scene_features': 0.3,
            'object_features': 0.3
        }
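
        # Late fusion: score each feature space against the query separately,
        # then sum the weighted per-clip scores. For example, per-space
        # similarities of 0.50, 0.20 and 0.10 combine to
        # 0.4*0.50 + 0.3*0.20 + 0.3*0.10 = 0.29.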
        for feat_type, weight in weights.items():
            similarities[feat_type] = self.compute_similarity(query_embedding, feat_type) * weight

        combined_similarities = sum(similarities.values())

        # np.argsort is ascending, so take the last top_k indices and reverse
        # them to list the best-scoring clips first.
        top_indices = np.argsort(combined_similarities)[-top_k:][::-1]

        results = []
        for idx in top_indices:
            results.append({
                'clip_id': self.clips_df.iloc[idx]['clip_id'],
                'movie_title': self.clips_df.iloc[idx]['movie_title'],
                'description': self.clips_df.iloc[idx]['description'],
                'timestamp': self.clips_df.iloc[idx]['timestamp'],
                'youtube_url': self.clips_df.iloc[idx]['youtube_url'],
                'similarity_score': float(combined_similarities[idx])
            })

        return results

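
# A minimal usage sketch outside Streamlit (illustrative query, dummy data):
#   retriever = VideoRetrieval(use_dummy_data=True)
#   for hit in retriever.retrieve_clips("a tense rooftop chase at night", top_k=3):
#       print(hit['movie_title'], round(hit['similarity_score'], 3))
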
def main():
    st.set_page_config(
        page_title="Movie Scene Retrieval System",
        page_icon="🎬",
        layout="wide"
    )

    st.title("🎬 Movie Scene Retrieval System")
    st.write("""
    Search for movie scenes using natural language descriptions.
    The system will retrieve the most relevant 2-3 minute clips based on your query.

    *Note: This is a demo version using simulated data.*
    """)

    # Build the retrieval system once per session and cache it, so the
    # embedding model is not reloaded on every Streamlit rerun.
    if 'retrieval_system' not in st.session_state:
        st.session_state.retrieval_system = VideoRetrieval(use_dummy_data=True)
    retrieval_system = st.session_state.retrieval_system

    col1, col2 = st.columns([3, 1])

    with col1:
        query = st.text_input(
            "Enter your scene description:",
            placeholder="e.g., A dramatic confrontation between two characters in a dark room"
        )

    with col2:
        num_results = st.slider("Number of results:", min_value=1, max_value=5, value=3)

    if st.button("🔍 Search", type="primary"):
        if not query:
            st.warning("Please enter a scene description.")
        else:
            with st.spinner("Searching for relevant clips..."):
                results = retrieval_system.retrieve_clips(query, top_k=num_results)

            for i, result in enumerate(results, 1):
                with st.container():
                    st.subheader(f"{i}. {result['movie_title']}")
                    cols = st.columns([2, 1])

                    with cols[0]:
                        st.markdown("**Scene Description:**")
                        st.write(result['description'])
                        st.text(f"⏱️ Timestamp: {result['timestamp']}")

                        if result['youtube_url']:
                            st.video(result['youtube_url'])

                    with cols[1]:
                        st.markdown("**Relevance Score:**")
                        # Clamp the fused score into [0, 1] for the progress bar.
                        score = min(1.0, max(0.0, result['similarity_score']))
                        st.progress(score)
                        st.text(f"{score:.2%} match")

                        st.markdown(f"[🔗 Watch on YouTube]({result['youtube_url']})")
                        st.text("Click to open in a new tab")

                st.divider()

    with st.sidebar:
        st.header("ℹ️ About")
        st.write("""
        This demo system simulates a video retrieval engine that uses:

        - 🎥 Visual scene understanding
        - 👥 Character interaction analysis
        - 🎯 Object detection
        - 🏃 Action recognition

        In a production system, these features would be pre-computed
        from actual movie clips using state-of-the-art AI models.
        """)

        st.header("⚖️ Feature Weights")
        st.write("Current weights used for similarity computation:")
        st.write("- 🎬 Visual Features: 40%")
        st.write("- 🏞️ Scene Features: 30%")
        st.write("- 📦 Object Features: 30%")


if __name__ == "__main__":
    main()