Commit: 0981063
Parent: ea2149c

initial commit for deployment

Files changed:
- Dockerfile +1 -1
- README.md +6 -8
- agent.py +78 -0
- app.py +100 -0
- data/processed/movie_embeddings.npz +3 -0
- data/processed/movies_enriched.csv +0 -0
- requirements.txt +10 -3
- scripts/download_movielens.py +17 -0
- scripts/enrich_movies_with_metadata.py +78 -0
- scripts/generate_embeddings.py +39 -0
- scripts/preprocess_movielens.py +23 -0
- scripts/recommender.py +51 -0
- src/streamlit_app.py +0 -40
Dockerfile CHANGED
@@ -18,4 +18,4 @@ EXPOSE 8501
 
 HEALTHCHECK CMD curl --fail http://localhost:8501/_stcore/health
 
-ENTRYPOINT ["streamlit", "run", "
+ENTRYPOINT ["streamlit", "run", "app.py", "--server.port=8501", "--server.address=0.0.0.0"]
README.md CHANGED
@@ -1,14 +1,12 @@
 ---
 title: StreamWiseAI
-emoji:
-colorFrom:
-colorTo:
-sdk:
-
-
-- streamlit
+emoji: 🎬
+colorFrom: "purple"
+colorTo: "green"
+sdk: streamlit
+sdk_version: "1.35.0"
+app_file: app.py
 pinned: false
-short_description: Streamlit template space
 license: mit
 ---
 
agent.py ADDED
@@ -0,0 +1,78 @@
+import os
+import requests
+from dotenv import load_dotenv
+from tenacity import retry, stop_after_attempt, wait_fixed, retry_if_exception_type
+
+load_dotenv()
+
+OPENROUTER_API_KEY = os.getenv("OPENROUTER_API_KEY")
+
+def generate_retention_tip(input_title, recommendations, user_history=None):
+    """
+    recommendations: List of dicts with keys - title, genres, overview
+    user_history: Optional list of past watched movies
+    """
+    if not OPENROUTER_API_KEY:
+        raise ValueError("Missing OpenRouter API key. Set OPENROUTER_API_KEY as an env variable.")
+
+    prompt = build_prompt(input_title, recommendations, user_history)
+
+    headers = {
+        "Authorization": f"Bearer {OPENROUTER_API_KEY}",
+        "Content-Type": "application/json",
+        "HTTP-Referer": os.getenv("HTTP_REFERER"),  # your repo or site
+        "X-Title": "StreamWiseAI Retention Coach"
+    }
+
+    payload = {
+        "model": "mistralai/mistral-7b-instruct:free",  # free, fast
+        "messages": [
+            {"role": "system", "content": "You are a Retention Coach AI who helps users stay engaged by suggesting patterns in what they enjoy."},
+            {"role": "user", "content": prompt}
+        ]
+    }
+
+    # Retried inner call: up to 3 attempts, 2s apart, on network errors only
+    @retry(
+        stop=stop_after_attempt(3),
+        wait=wait_fixed(2),
+        retry=retry_if_exception_type((requests.exceptions.RequestException,))
+    )
+    def call_openrouter():
+        response = requests.post(
+            "https://openrouter.ai/api/v1/chat/completions",
+            headers=headers,
+            json=payload,
+            timeout=15
+        )
+        response.raise_for_status()
+        return response.json()["choices"][0]["message"]["content"].strip()
+
+    try:
+        return call_openrouter()
+    except Exception as e:
+        print("Retry failed:", e)
+        return "⚠️ Unable to generate retention tip right now."
+
+
+def build_prompt(input_title, recommendations, user_history=None):
+    recs_text = ""
+    for rec in recommendations:
+        recs_text += f"- Title: {rec['title']}\n  Genres: {rec['genres']}\n  Overview: {rec['overview'][:200]}...\n"
+
+    history_text = ""
+    if user_history:
+        history_text = "Previously liked movies:\n" + "\n".join(f"- {title}" for title in user_history)
+
+    prompt = f"""
+The user searched for the movie: "{input_title}".
+
+Here are the top recommendations:
+{recs_text}
+
+{history_text}
+
+Based on this, suggest a 1–2 line insight about what the user might enjoy and a content retention tip.
+Only output the tip, no extra text.
+"""
+    return prompt.strip()
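To smoke-test the agent outside Streamlit, a minimal sketch follows; the movie records are made up for illustration, and it assumes OPENROUTER_API_KEY is present in the environment (or a local .env):

from agent import generate_retention_tip

# Made-up records matching the dict shape build_prompt() expects
sample_recs = [
    {"title": "Toy Story", "genres": "Animation, Comedy",
     "overview": "A cowboy doll feels threatened when a new spaceman figure becomes the top toy."},
    {"title": "A Bug's Life", "genres": "Animation, Adventure",
     "overview": "A misfit ant recruits a troupe of circus bugs to fight off greedy grasshoppers."},
]

tip = generate_retention_tip("Toy Story", sample_recs, user_history=["Finding Nemo"])
print(tip)  # a short tip, or the ⚠️ fallback string if the API call fails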
app.py ADDED
@@ -0,0 +1,100 @@
+import streamlit as st
+from scripts.recommender import load_data, recommend_movies
+from agent import generate_retention_tip
+
+st.set_page_config(page_title="StreamWiseAI", layout="wide")
+st.title("🎬 StreamWiseAI – Personalized Movie Recommender & Retention Coach")
+st.caption("🤖 Powered by AI Agents · 🎯 Smart Search · 🧠 AI Insights")
+
+
+# Load data
+movies, embeddings = load_data()
+
+# Initialize watch history
+if "watch_history" not in st.session_state:
+    st.session_state["watch_history"] = []
+
+# Search input
+movie_input = st.text_input("Enter a movie you liked", placeholder="e.g. Toy Story")
+show_tip = st.checkbox("💡 Show retention insight from AI coach?", value=True)
+
+if movie_input:
+    with st.spinner("Finding great recommendations..."):
+        recommendations = recommend_movies(movie_input, movies, embeddings)
+
+    if not recommendations:
+        st.error("❌ Movie not found. Please try another title.")
+    else:
+        st.subheader(f"📽️ Recommendations for **{recommendations['input_title']}**")
+
+        if recommendations["input_title"] not in st.session_state["watch_history"]:
+            st.session_state["watch_history"].append(recommendations["input_title"])
+
+        cols = st.columns(2)
+
+        for idx, rec in enumerate(recommendations["results"]):
+            with cols[idx % 2]:
+                with st.container():
+                    st.markdown(f"#### 🎬 {rec['title']} ({rec['release_year']})")
+
+                    # Fallback-safe image; poster_path is already a full URL from recommender.py
+                    if rec['poster_path']:
+                        st.image(rec['poster_path'], width=150)
+                    else:
+                        st.image("https://via.placeholder.com/150x225.png?text=No+Image", width=150)
+
+                    st.markdown(f"**🎭 Genre(s):** {rec['genres']}")
+                    st.markdown(f"**🧠 Similarity Score:** {rec['similarity']:.2f}")
+
+                    # Truncate overview if too long
+                    short_overview = rec['overview']
+                    if len(short_overview) > 250:
+                        short_overview = short_overview[:250] + "..."
+                    st.markdown(f"_{short_overview}_")
+
+                    st.markdown("---")
+
+        if show_tip:
+            with st.spinner("🤖 Retention Coach is analyzing your taste..."):
+                tip = generate_retention_tip(movie_input, recommendations["results"], st.session_state.get("watch_history", []))
+            if tip and not tip.startswith("⚠️"):
+                st.markdown("### 💡 Retention Coach Suggests:")
+                st.markdown(f"""
+                <div style="background-color:#f0f8ff; padding:15px; border-radius:10px; border-left:5px solid #1f77b4;">
+                    <span style="font-size:16px;">{tip}</span>
+                </div>
+                """, unsafe_allow_html=True)
+            else:
+                st.warning("Couldn't generate tip at the moment.")
+
+with st.sidebar:
+    st.markdown("## 🎬 **About StreamWiseAI**")
+
+    st.markdown("""
+    <span style='color:#6c63ff'><strong>StreamWiseAI</strong></span> is a personalized movie discovery engine designed for modern streaming platforms.
+
+    Built to mirror real-world production use cases, it features:
+
+    🔍 <span style='color:#FFA500'><strong>Semantic Search</strong></span> — understands meaning, not just keywords
+    🧠 <span style='color:#00BFFF'><strong>AI Retention Coach</strong></span> — LLM agent gives viewing tips
+    🗂️ <span style='color:#32CD32'><strong>Watch History Memory</strong></span> — tracks the user session dynamically
+    🚀 <span style='color:#FF69B4'><strong>Built for Showcase</strong></span> — fast, deployable & free
+
+    ---
+
+    <small><i>Tech stack: Sentence Transformers · Streamlit · OpenRouter LLM API · Fuzzy Matching · Vector Index</i></small>
+    """, unsafe_allow_html=True)
+
+
+    if st.session_state["watch_history"]:
+        st.divider()
+        with st.expander("📜 Recently Searched"):
+            st.markdown("👀 Here’s a list of your recent searches:")
+            st.markdown("\n".join(f"- {title}" for title in st.session_state["watch_history"]))
+
+
+    st.markdown("---")
+    st.markdown(
+        "<small>🚀 Built by Rajesh Marudhachalam</small>",
+        unsafe_allow_html=True
+    )
data/processed/movie_embeddings.npz ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:bd7e4f70dd34ad2a7663b138e5988cb9a28a090eda8ffb2b9ed725afc9673709
+size 6036450
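Because this file is stored as a Git LFS pointer, the real archive only materializes after running git lfs pull. A quick hedged sketch for inspecting what scripts/generate_embeddings.py saved into it:

import numpy as np

data = np.load("data/processed/movie_embeddings.npz")
print(data.files)                # expect ['embeddings', 'titles', 'movie_ids']
print(data["embeddings"].shape)  # (num_movies, 384) for all-MiniLM-L6-v2
print(data["titles"][:3])        # first few CleanTitle strings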
data/processed/movies_enriched.csv ADDED
The diff for this file is too large to render. See raw diff.
requirements.txt CHANGED
@@ -1,3 +1,10 @@
-
-
-
+pandas==2.2.2
+requests==2.31.0
+tqdm==4.66.4
+fuzzywuzzy==0.18.0
+python-Levenshtein==0.12.2
+sentence-transformers==2.7.0
+scikit-learn==1.4.2
+streamlit==1.35.0
+python-dotenv==1.0.1
+tenacity==8.2.3
scripts/download_movielens.py ADDED
@@ -0,0 +1,17 @@
+import os
+import zipfile
+import urllib.request
+
+DATA_DIR = "data/raw/movielens"
+URL = "https://files.grouplens.org/datasets/movielens/ml-1m.zip"
+ZIP_PATH = "data/raw/movielens/ml-1m.zip"
+
+os.makedirs(DATA_DIR, exist_ok=True)
+
+print("Downloading MovieLens 1M...")
+urllib.request.urlretrieve(URL, ZIP_PATH)
+
+with zipfile.ZipFile(ZIP_PATH, "r") as zip_ref:
+    zip_ref.extractall(DATA_DIR)
+
+print("✅ Download complete. Extracted to:", DATA_DIR)
scripts/enrich_movies_with_metadata.py ADDED
@@ -0,0 +1,78 @@
+import pandas as pd
+from fuzzywuzzy import fuzz
+from tqdm import tqdm
+import ast
+
+ml_path = "data/processed/movies.csv"
+tmdb_path = "data/raw/tmdb/movies_metadata.csv"
+out_path = "data/processed/movies_enriched.csv"
+
+tqdm.pandas()
+
+# Load MovieLens
+ml = pd.read_csv(ml_path)
+ml["CleanTitle"] = ml["Title"].str.extract(r"^(.*)\s\(\d{4}\)", expand=False).str.strip()
+ml.dropna(subset=['CleanTitle'], inplace=True)
+ml['Year'] = ml['Title'].str.extract(r"\((\d{4})\)", expand=False)
+ml['Genres'] = ml['Genres'].str.replace('|', ', ', regex=False)
+ml['CleanTitle'] = ml['CleanTitle'].str.replace(r"\s*\(.*?\)", "", regex=True).str.strip()
+
+# Load TMDb metadata
+tmdb = pd.read_csv(tmdb_path, low_memory=False)
+tmdb = tmdb.dropna(subset=["title", "overview"])
+tmdb["title_clean"] = tmdb["title"].str.lower().str.strip()
+tmdb['release_year'] = tmdb['release_date'].str[:4]
+tmdb['genres'] = tmdb['genres'].apply(
+    lambda x: ', '.join(d['name'] for d in ast.literal_eval(x) if 'name' in d)
+)
+
+# Find the best fuzzy match among TMDb titles from the same release year
+def get_best_match(title, year):
+    choices = tmdb["title_clean"][tmdb["release_year"].astype(str) == year].tolist()
+    if not choices:  # no TMDb titles for this year; avoid max() on an empty list
+        return (None, 0)
+    scores = [(choice, fuzz.token_sort_ratio(str(title).lower(), choice)) for choice in choices]
+    best = max(scores, key=lambda x: x[1])
+    return best if best[1] > 80 else (None, 0)  # similarity threshold
+
+
+# Apply fuzzy matching
+matches = ml[["CleanTitle", "Year"]].progress_apply(
+    lambda x: get_best_match(
+        x['CleanTitle'],
+        x['Year']
+    ),
+    axis=1)
+ml["matched_title"] = matches.apply(lambda x: x[0])
+ml["match_score"] = matches.apply(lambda x: x[1])
+ml.dropna(subset=['matched_title'], inplace=True)
+
+# Merge on matched title
+merged = ml.merge(tmdb, left_on="matched_title", right_on="title_clean", how="left")
+
+def merge_unique_genres(col1, col2):
+    # Split by comma and strip whitespace
+    list1 = [x.strip() for x in col1.split(',')] if pd.notna(col1) else []
+    list2 = [x.strip() for x in col2.split(',')] if pd.notna(col2) else []
+
+    # Combine while preserving order and removing duplicates
+    seen = set()
+    merged = []
+    for item in list1 + list2:
+        if item not in seen:
+            seen.add(item)
+            merged.append(item)
+    return ', '.join(merged)
+
+merged['genres'] = merged.apply(lambda row: merge_unique_genres(row['Genres'], row['genres']), axis=1)
+
+
+# Keep relevant columns
+keep_cols = [
+    "MovieID", "Title", "Genres", "CleanTitle",
+    "overview", "genres", "release_date", "release_year", "poster_path", "matched_title", "match_score"
+]
+final = merged[keep_cols]
+final.to_csv(out_path, index=False)
+
+print("✅ Enriched metadata saved to:", out_path)
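For intuition on the >80 threshold in get_best_match, a small hedged sketch (the title pairs are illustrative):

from fuzzywuzzy import fuzz

# token_sort_ratio tokenizes, sorts, and rejoins before comparing,
# so word order and punctuation differences don't hurt the score
print(fuzz.token_sort_ratio("the matrix", "matrix, the"))  # 100 - passes the cutoff
print(fuzz.token_sort_ratio("heat", "heaven"))             # 60 - rejected as a match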
scripts/generate_embeddings.py ADDED
@@ -0,0 +1,39 @@
+import pandas as pd
+from sentence_transformers import SentenceTransformer
+import numpy as np
+from tqdm import tqdm
+
+tqdm.pandas()
+
+DATA_PATH = "data/processed/movies_enriched.csv"
+OUTPUT_PATH = "data/processed/movie_embeddings.npz"
+
+print("🔍 Loading movie metadata...")
+df = pd.read_csv(DATA_PATH)
+df = df.dropna(subset=["overview"])
+# df["release_year"] = pd.to_datetime(df["release_date"], errors='coerce').dt.year
+print(f"✅ Loaded {len(df)} movies with valid overviews.")
+
+# Load Sentence-BERT model
+print("🧠 Loading Sentence-BERT model (all-MiniLM-L6-v2)...")
+model = SentenceTransformer("all-MiniLM-L6-v2")
+
+# Generate embeddings
+print("⚙️ Generating semantic embeddings...")
+def build_embedding_text(row):
+    return f"{row['CleanTitle']} ({row['release_year']}) — {row['genres']}. {row['overview']}"
+
+texts = df.apply(build_embedding_text, axis=1).tolist()
+
+# encode() batches internally and returns one array; its built-in progress bar
+# replaces a redundant tqdm wrapper around the already-finished call
+embeddings = np.array(model.encode(texts, batch_size=32, show_progress_bar=True))
+
+# Save embeddings and metadata
+print("💾 Saving embeddings and metadata...")
+np.savez_compressed(OUTPUT_PATH,
+                    embeddings=embeddings,
+                    titles=df["CleanTitle"].tolist(),
+                    movie_ids=df["MovieID"].tolist())
+
+print("✅ Done! Embeddings saved to:", OUTPUT_PATH)
scripts/preprocess_movielens.py ADDED
@@ -0,0 +1,23 @@
+import pandas as pd
+import os
+
+RAW_DIR = "data/raw/movielens/ml-1m"
+PROCESSED_DIR = "data/processed"
+os.makedirs(PROCESSED_DIR, exist_ok=True)
+
+# Read .dat files using the correct encoding and separator
+users = pd.read_csv(f"{RAW_DIR}/users.dat", sep="::", engine="python", encoding="latin-1",
+                    names=["UserID", "Gender", "Age", "Occupation", "Zip-code"])
+
+movies = pd.read_csv(f"{RAW_DIR}/movies.dat", sep="::", engine="python", encoding="latin-1",
+                     names=["MovieID", "Title", "Genres"])
+
+ratings = pd.read_csv(f"{RAW_DIR}/ratings.dat", sep="::", engine="python", encoding="latin-1",
+                      names=["UserID", "MovieID", "Rating", "Timestamp"])
+
+# Save cleaned CSVs
+users.to_csv(f"{PROCESSED_DIR}/users.csv", index=False)
+movies.to_csv(f"{PROCESSED_DIR}/movies.csv", index=False)
+ratings.to_csv(f"{PROCESSED_DIR}/ratings.csv", index=False)
+
+print("✅ Preprocessing complete. Cleaned CSVs saved to:", PROCESSED_DIR)
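A quick sanity check that the ::-separated .dat files parsed correctly (column names as defined above):

import pandas as pd

movies = pd.read_csv("data/processed/movies.csv")
print(movies.columns.tolist())  # ['MovieID', 'Title', 'Genres']
print(movies.head(3))           # titles like "Toy Story (1995)" with |-separated genres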
scripts/recommender.py ADDED
@@ -0,0 +1,51 @@
+import difflib
+import numpy as np
+import pandas as pd
+from sentence_transformers import SentenceTransformer, util
+
+# Load the encoder once at import time instead of on every query
+model = SentenceTransformer("all-MiniLM-L6-v2")
+
+def load_data():
+    movies = pd.read_csv("data/processed/movies_enriched.csv")
+    data = np.load("data/processed/movie_embeddings.npz")
+    embeddings = data["embeddings"]
+    return movies, embeddings
+
+def recommend_movies(movie_title, movies_df, embeddings, top_k=5):
+    movie_title = str(movie_title).strip().lower()  # normalize input (handles non-str too)
+
+    # Title matching
+    all_titles = movies_df["matched_title"].fillna("").astype(str).tolist()
+    match = difflib.get_close_matches(movie_title, all_titles, n=1, cutoff=0.6)
+
+    if not match:
+        return None
+
+    matched_title = match[0]
+    idx = movies_df[movies_df["matched_title"] == matched_title].index[0]
+
+    # Instead of comparing embeddings[idx] against the others (which may be weak),
+    # encode the *user input* itself and rank by cosine similarity
+    query_vec = model.encode(movie_title, convert_to_tensor=True)
+
+    scores = util.cos_sim(query_vec, embeddings)[0].cpu().numpy()
+    top_indices = scores.argsort()[::-1][:top_k]
+
+    results = []
+    for i in top_indices:
+        row = movies_df.iloc[i]
+        poster_url = f"https://image.tmdb.org/t/p/w500{row['poster_path']}" if pd.notna(row["poster_path"]) else None
+        results.append({
+            "title": row["CleanTitle"],
+            "genres": row['genres'],
+            "overview": row["overview"],
+            "poster_path": poster_url,
+            "release_year": row["release_date"][:4] if pd.notna(row["release_date"]) else "Unknown",
+            "similarity": float(scores[i])
+        })
+
+    return {
+        "input_title": movies_df.iloc[idx]["Title"],
+        "results": results
+    }
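A hedged usage sketch for the recommender API, run from the repo root so the relative data paths resolve:

from scripts.recommender import load_data, recommend_movies

movies, embeddings = load_data()
recs = recommend_movies("toy story", movies, embeddings, top_k=5)

if recs is None:
    print("No close title match found.")
else:
    print("Matched:", recs["input_title"])
    for r in recs["results"]:
        print(f'{r["similarity"]:.2f}  {r["title"]} ({r["release_year"]})')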
src/streamlit_app.py DELETED
@@ -1,40 +0,0 @@
-import altair as alt
-import numpy as np
-import pandas as pd
-import streamlit as st
-
-"""
-# Welcome to Streamlit!
-
-Edit `/streamlit_app.py` to customize this app to your heart's desire :heart:.
-If you have any questions, checkout our [documentation](https://docs.streamlit.io) and [community
-forums](https://discuss.streamlit.io).
-
-In the meantime, below is an example of what you can do with just a few lines of code:
-"""
-
-num_points = st.slider("Number of points in spiral", 1, 10000, 1100)
-num_turns = st.slider("Number of turns in spiral", 1, 300, 31)
-
-indices = np.linspace(0, 1, num_points)
-theta = 2 * np.pi * num_turns * indices
-radius = indices
-
-x = radius * np.cos(theta)
-y = radius * np.sin(theta)
-
-df = pd.DataFrame({
-    "x": x,
-    "y": y,
-    "idx": indices,
-    "rand": np.random.randn(num_points),
-})
-
-st.altair_chart(alt.Chart(df, height=700, width=700)
-    .mark_point(filled=True)
-    .encode(
-        x=alt.X("x", axis=None),
-        y=alt.Y("y", axis=None),
-        color=alt.Color("idx", legend=None, scale=alt.Scale()),
-        size=alt.Size("rand", legend=None, scale=alt.Scale(range=[1, 150])),
-    ))