Commit: 0981063
Parent: ea2149c

initial commit for deployment

Files changed:
- Dockerfile +1 -1
- README.md +6 -8
- agent.py +78 -0
- app.py +100 -0
- data/processed/movie_embeddings.npz +3 -0
- data/processed/movies_enriched.csv +0 -0
- requirements.txt +10 -3
- scripts/download_movielens.py +17 -0
- scripts/enrich_movies_with_metadata.py +78 -0
- scripts/generate_embeddings.py +39 -0
- scripts/preprocess_movielens.py +23 -0
- scripts/recommender.py +51 -0
- src/streamlit_app.py +0 -40
Dockerfile CHANGED
@@ -18,4 +18,4 @@ EXPOSE 8501
 
 HEALTHCHECK CMD curl --fail http://localhost:8501/_stcore/health
 
-ENTRYPOINT ["streamlit", "run", "
+ENTRYPOINT ["streamlit", "run", "app.py", "--server.port=8501", "--server.address=0.0.0.0"]
README.md CHANGED
@@ -1,14 +1,12 @@
 ---
 title: StreamWiseAI
-emoji:
-colorFrom:
-colorTo:
-sdk:
-
-
-- streamlit
+emoji: 🎬
+colorFrom: "purple"
+colorTo: "green"
+sdk: streamlit
+sdk_version: "1.35.0"
+app_file: app.py
 pinned: false
-short_description: Streamlit template space
 license: mit
 ---
 
agent.py ADDED
@@ -0,0 +1,78 @@
+import os
+import requests
+from dotenv import load_dotenv
+from tenacity import retry, stop_after_attempt, wait_fixed, retry_if_exception_type
+
+load_dotenv()
+
+OPENROUTER_API_KEY = os.getenv("OPENROUTER_API_KEY")
+
+def generate_retention_tip(input_title, recommendations, user_history=None):
+    """
+    recommendations: List of dicts with keys - title, genres, overview
+    user_history: Optional list of past watched movies
+    """
+    if not OPENROUTER_API_KEY:
+        raise ValueError("Missing OpenRouter API key. Set OPENROUTER_API_KEY as an env variable.")
+
+    prompt = build_prompt(input_title, recommendations, user_history)
+
+    headers = {
+        "Authorization": f"Bearer {OPENROUTER_API_KEY}",
+        "Content-Type": "application/json",
+        "HTTP-Referer": os.getenv("HTTP_REFERER"),  # your repo or site
+        "X-Title": "StreamWiseAI Retention Coach"
+    }
+
+    payload = {
+        "model": "mistralai/mistral-7b-instruct:free",  # free, fast
+        "messages": [
+            {"role": "system", "content": "You are a Retention Coach AI who helps users stay engaged by suggesting patterns in what they enjoy."},
+            {"role": "user", "content": prompt}
+        ]
+    }
+
+    # Retried inner call: up to 3 attempts, 2s apart, on network errors only
+    @retry(
+        stop=stop_after_attempt(3),
+        wait=wait_fixed(2),
+        retry=retry_if_exception_type((requests.exceptions.RequestException,))
+    )
+    def call_openrouter():
+        response = requests.post(
+            "https://openrouter.ai/api/v1/chat/completions",
+            headers=headers,
+            json=payload,
+            timeout=15
+        )
+        response.raise_for_status()
+        return response.json()["choices"][0]["message"]["content"].strip()
+
+    try:
+        return call_openrouter()
+    except Exception as e:
+        print("Retry failed:", e)
+        return "⚠️ Unable to generate retention tip right now."
+
+
+def build_prompt(input_title, recommendations, user_history=None):
+    recs_text = ""
+    for rec in recommendations:
+        recs_text += f"- Title: {rec['title']}\n  Genres: {rec['genres']}\n  Overview: {rec['overview'][:200]}...\n"
+
+    history_text = ""
+    if user_history:
+        history_text = "Previously liked movies:\n" + "\n".join(f"- {title}" for title in user_history)
+
+    prompt = f"""
+The user searched for the movie: "{input_title}".
+
+Here are the top recommendations:
+{recs_text}
+
+{history_text}
+
+Based on this, suggest a 1–2 line insight about what the user might enjoy and a content retention tip.
+Only output the tip, no extra text.
+"""
+    return prompt.strip()
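To smoke-test the agent outside Streamlit, a minimal sketch follows; the movie records are made up for illustration, and it assumes OPENROUTER_API_KEY is present in the environment (or a local .env):

from agent import generate_retention_tip

# Made-up records matching the dict shape build_prompt() expects
sample_recs = [
    {"title": "Toy Story", "genres": "Animation, Comedy",
     "overview": "A cowboy doll feels threatened when a new spaceman figure becomes the top toy."},
    {"title": "A Bug's Life", "genres": "Animation, Adventure",
     "overview": "A misfit ant recruits a troupe of circus bugs to fight off greedy grasshoppers."},
]

tip = generate_retention_tip("Toy Story", sample_recs, user_history=["Finding Nemo"])
print(tip)  # a short tip, or the ⚠️ fallback string if the API call fails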
app.py ADDED
@@ -0,0 +1,100 @@
+import streamlit as st
+from scripts.recommender import load_data, recommend_movies
+from agent import generate_retention_tip
+
+st.set_page_config(page_title="StreamWiseAI", layout="wide")
+st.title("🎬 StreamWiseAI – Personalized Movie Recommender & Retention Coach")
+st.caption("🤖 Powered by AI Agents · 🎯 Smart Search · 🧠 AI Insights")
+
+
+# Load data
+movies, embeddings = load_data()
+
+# Initialize watch history
+if "watch_history" not in st.session_state:
+    st.session_state["watch_history"] = []
+
+# Search input
+movie_input = st.text_input("Enter a movie you liked", placeholder="e.g. Toy Story")
+show_tip = st.checkbox("💡 Show retention insight from AI coach?", value=True)
+
+if movie_input:
+    with st.spinner("Finding great recommendations..."):
+        recommendations = recommend_movies(movie_input, movies, embeddings)
+
+    if not recommendations:
+        st.error("❌ Movie not found. Please try another title.")
+    else:
+        st.subheader(f"📽️ Recommendations for **{recommendations['input_title']}**")
+
+        if recommendations["input_title"] not in st.session_state["watch_history"]:
+            st.session_state["watch_history"].append(recommendations["input_title"])
+
+        cols = st.columns(2)
+
+        for idx, rec in enumerate(recommendations["results"]):
+            with cols[idx % 2]:
+                with st.container():
+                    st.markdown(f"#### 🎬 {rec['title']} ({rec['release_year']})")
+
+                    # Fallback-safe image; poster_path is already a full URL from recommender.py
+                    if rec['poster_path']:
+                        st.image(rec['poster_path'], width=150)
+                    else:
+                        st.image("https://via.placeholder.com/150x225.png?text=No+Image", width=150)
+
+                    st.markdown(f"**🎭 Genre(s):** {rec['genres']}")
+                    st.markdown(f"**🧠 Similarity Score:** {rec['similarity']:.2f}")
+
+                    # Truncate overview if too long
+                    short_overview = rec['overview']
+                    if len(short_overview) > 250:
+                        short_overview = short_overview[:250] + "..."
+                    st.markdown(f"_{short_overview}_")
+
+                    st.markdown("---")
+
+        if show_tip:
+            with st.spinner("🤖 Retention Coach is analyzing your taste..."):
+                tip = generate_retention_tip(movie_input, recommendations["results"], st.session_state.get("watch_history", []))
+            if tip and not tip.startswith("⚠️"):
+                st.markdown("### 💡 Retention Coach Suggests:")
+                st.markdown(f"""
+                <div style="background-color:#f0f8ff; padding:15px; border-radius:10px; border-left:5px solid #1f77b4;">
+                    <span style="font-size:16px;">{tip}</span>
+                </div>
+                """, unsafe_allow_html=True)
+            else:
+                st.warning("Couldn't generate tip at the moment.")
+
+with st.sidebar:
+    st.markdown("## 🎬 **About StreamWiseAI**")
+
+    st.markdown("""
+    <span style='color:#6c63ff'><strong>StreamWiseAI</strong></span> is a personalized movie discovery engine designed for modern streaming platforms.
+
+    Built to mirror real-world production use cases, it features:
+
+    🔍 <span style='color:#FFA500'><strong>Semantic Search</strong></span> — understands meaning, not just keywords
+    🧠 <span style='color:#00BFFF'><strong>AI Retention Coach</strong></span> — LLM agent gives viewing tips
+    🗂️ <span style='color:#32CD32'><strong>Watch History Memory</strong></span> — tracks the user session dynamically
+    🚀 <span style='color:#FF69B4'><strong>Built for Showcase</strong></span> — fast, deployable & free
+
+    ---
+
+    <small><i>Tech stack: Sentence Transformers · Streamlit · OpenRouter LLM API · Fuzzy Matching · Vector Index</i></small>
+    """, unsafe_allow_html=True)
+
+
+    if st.session_state["watch_history"]:
+        st.divider()
+        with st.expander("📜 Recently Searched"):
+            st.markdown("👀 Here’s a list of your recent searches:")
+            st.markdown("\n".join(f"- {title}" for title in st.session_state["watch_history"]))
+
+
+    st.markdown("---")
+    st.markdown(
+        "<small>🚀 Built by Rajesh Marudhachalam</small>",
+        unsafe_allow_html=True
+    )
data/processed/movie_embeddings.npz ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:bd7e4f70dd34ad2a7663b138e5988cb9a28a090eda8ffb2b9ed725afc9673709
+size 6036450
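Because this file is stored as a Git LFS pointer, the real archive only materializes after running git lfs pull. A quick hedged sketch for inspecting what scripts/generate_embeddings.py saved into it:

import numpy as np

data = np.load("data/processed/movie_embeddings.npz")
print(data.files)                # expect ['embeddings', 'titles', 'movie_ids']
print(data["embeddings"].shape)  # (num_movies, 384) for all-MiniLM-L6-v2
print(data["titles"][:3])        # first few CleanTitle strings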
data/processed/movies_enriched.csv ADDED
The diff for this file is too large to render. See raw diff.
requirements.txt CHANGED
@@ -1,3 +1,10 @@
-
-
-
+pandas==2.2.2
+requests==2.31.0
+tqdm==4.66.4
+fuzzywuzzy==0.18.0
+python-Levenshtein==0.12.2
+sentence-transformers==2.7.0
+scikit-learn==1.4.2
+streamlit==1.35.0
+python-dotenv==1.0.1
+tenacity==8.2.3
scripts/download_movielens.py ADDED
@@ -0,0 +1,17 @@
+import os
+import zipfile
+import urllib.request
+
+DATA_DIR = "data/raw/movielens"
+URL = "https://files.grouplens.org/datasets/movielens/ml-1m.zip"
+ZIP_PATH = "data/raw/movielens/ml-1m.zip"
+
+os.makedirs(DATA_DIR, exist_ok=True)
+
+print("Downloading MovieLens 1M...")
+urllib.request.urlretrieve(URL, ZIP_PATH)
+
+with zipfile.ZipFile(ZIP_PATH, "r") as zip_ref:
+    zip_ref.extractall(DATA_DIR)
+
+print("✅ Download complete. Extracted to:", DATA_DIR)
scripts/enrich_movies_with_metadata.py ADDED
@@ -0,0 +1,78 @@
+import pandas as pd
+from fuzzywuzzy import fuzz
+from tqdm import tqdm
+import ast
+
+ml_path = "data/processed/movies.csv"
+tmdb_path = "data/raw/tmdb/movies_metadata.csv"
+out_path = "data/processed/movies_enriched.csv"
+
+tqdm.pandas()
+
+# Load MovieLens
+ml = pd.read_csv(ml_path)
+ml["CleanTitle"] = ml["Title"].str.extract(r"^(.*)\s\(\d{4}\)", expand=False).str.strip()
+ml.dropna(subset=['CleanTitle'], inplace=True)
+ml['Year'] = ml['Title'].str.extract(r"\((\d{4})\)", expand=False)
+ml['Genres'] = ml['Genres'].str.replace('|', ', ', regex=False)
+ml['CleanTitle'] = ml['CleanTitle'].str.replace(r"\s*\(.*?\)", "", regex=True).str.strip()
+
+# Load TMDb metadata
+tmdb = pd.read_csv(tmdb_path, low_memory=False)
+tmdb = tmdb.dropna(subset=["title", "overview"])
+tmdb["title_clean"] = tmdb["title"].str.lower().str.strip()
+tmdb['release_year'] = tmdb['release_date'].str[:4]
+tmdb['genres'] = tmdb['genres'].apply(
+    lambda x: ', '.join(d['name'] for d in ast.literal_eval(x) if 'name' in d)
+)
+
+# Find the best fuzzy match among TMDb titles from the same release year
+def get_best_match(title, year):
+    choices = tmdb["title_clean"][tmdb["release_year"].astype(str) == year].tolist()
+    if not choices:  # no TMDb titles for this year; avoid max() on an empty list
+        return (None, 0)
+    scores = [(choice, fuzz.token_sort_ratio(str(title).lower(), choice)) for choice in choices]
+    best = max(scores, key=lambda x: x[1])
+    return best if best[1] > 80 else (None, 0)  # similarity threshold
+
+
+# Apply fuzzy matching
+matches = ml[["CleanTitle", "Year"]].progress_apply(
+    lambda x: get_best_match(
+        x['CleanTitle'],
+        x['Year']
+    ),
+    axis=1)
+ml["matched_title"] = matches.apply(lambda x: x[0])
+ml["match_score"] = matches.apply(lambda x: x[1])
+ml.dropna(subset=['matched_title'], inplace=True)
+
+# Merge on matched title
+merged = ml.merge(tmdb, left_on="matched_title", right_on="title_clean", how="left")
+
+def merge_unique_genres(col1, col2):
+    # Split by comma and strip whitespace
+    list1 = [x.strip() for x in col1.split(',')] if pd.notna(col1) else []
+    list2 = [x.strip() for x in col2.split(',')] if pd.notna(col2) else []
+
+    # Combine while preserving order and removing duplicates
+    seen = set()
+    merged = []
+    for item in list1 + list2:
+        if item not in seen:
+            seen.add(item)
+            merged.append(item)
+    return ', '.join(merged)
+
+merged['genres'] = merged.apply(lambda row: merge_unique_genres(row['Genres'], row['genres']), axis=1)
+
+
+# Keep relevant columns
+keep_cols = [
+    "MovieID", "Title", "Genres", "CleanTitle",
+    "overview", "genres", "release_date", "release_year", "poster_path", "matched_title", "match_score"
+]
+final = merged[keep_cols]
+final.to_csv(out_path, index=False)
+
+print("✅ Enriched metadata saved to:", out_path)
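For intuition on the >80 threshold in get_best_match, a small hedged sketch (the title pairs are illustrative):

from fuzzywuzzy import fuzz

# token_sort_ratio tokenizes, sorts, and rejoins before comparing,
# so word order and punctuation differences don't hurt the score
print(fuzz.token_sort_ratio("the matrix", "matrix, the"))  # 100 - passes the cutoff
print(fuzz.token_sort_ratio("heat", "heaven"))             # 60 - rejected as a match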
scripts/generate_embeddings.py ADDED
@@ -0,0 +1,39 @@
+import pandas as pd
+from sentence_transformers import SentenceTransformer
+import numpy as np
+from tqdm import tqdm
+
+tqdm.pandas()
+
+DATA_PATH = "data/processed/movies_enriched.csv"
+OUTPUT_PATH = "data/processed/movie_embeddings.npz"
+
+print("🔍 Loading movie metadata...")
+df = pd.read_csv(DATA_PATH)
+df = df.dropna(subset=["overview"])
+# df["release_year"] = pd.to_datetime(df["release_date"], errors='coerce').dt.year
+print(f"✅ Loaded {len(df)} movies with valid overviews.")
+
+# Load Sentence-BERT model
+print("🧠 Loading Sentence-BERT model (all-MiniLM-L6-v2)...")
+model = SentenceTransformer("all-MiniLM-L6-v2")
+
+# Generate embeddings
+print("⚙️ Generating semantic embeddings...")
+def build_embedding_text(row):
+    return f"{row['CleanTitle']} ({row['release_year']}) — {row['genres']}. {row['overview']}"
+
+texts = df.apply(build_embedding_text, axis=1).tolist()
+
+# encode() batches internally and returns one array; its built-in progress bar
+# replaces a redundant tqdm wrapper around the already-finished call
+embeddings = np.array(model.encode(texts, batch_size=32, show_progress_bar=True))
+
+# Save embeddings and metadata
+print("💾 Saving embeddings and metadata...")
+np.savez_compressed(OUTPUT_PATH,
+                    embeddings=embeddings,
+                    titles=df["CleanTitle"].tolist(),
+                    movie_ids=df["MovieID"].tolist())
+
+print("✅ Done! Embeddings saved to:", OUTPUT_PATH)
scripts/preprocess_movielens.py ADDED
@@ -0,0 +1,23 @@
+import pandas as pd
+import os
+
+RAW_DIR = "data/raw/movielens/ml-1m"
+PROCESSED_DIR = "data/processed"
+os.makedirs(PROCESSED_DIR, exist_ok=True)
+
+# Read .dat files using the correct encoding and separator
+users = pd.read_csv(f"{RAW_DIR}/users.dat", sep="::", engine="python", encoding="latin-1",
+                    names=["UserID", "Gender", "Age", "Occupation", "Zip-code"])
+
+movies = pd.read_csv(f"{RAW_DIR}/movies.dat", sep="::", engine="python", encoding="latin-1",
+                     names=["MovieID", "Title", "Genres"])
+
+ratings = pd.read_csv(f"{RAW_DIR}/ratings.dat", sep="::", engine="python", encoding="latin-1",
+                      names=["UserID", "MovieID", "Rating", "Timestamp"])
+
+# Save cleaned CSVs
+users.to_csv(f"{PROCESSED_DIR}/users.csv", index=False)
+movies.to_csv(f"{PROCESSED_DIR}/movies.csv", index=False)
+ratings.to_csv(f"{PROCESSED_DIR}/ratings.csv", index=False)
+
+print("✅ Preprocessing complete. Cleaned CSVs saved to:", PROCESSED_DIR)
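A quick sanity check that the ::-separated .dat files parsed correctly (column names as defined above):

import pandas as pd

movies = pd.read_csv("data/processed/movies.csv")
print(movies.columns.tolist())  # ['MovieID', 'Title', 'Genres']
print(movies.head(3))           # titles like "Toy Story (1995)" with |-separated genres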
scripts/recommender.py ADDED
@@ -0,0 +1,51 @@
+import difflib
+import numpy as np
+import pandas as pd
+from sentence_transformers import SentenceTransformer, util
+
+# Load the encoder once at import time instead of on every query
+model = SentenceTransformer("all-MiniLM-L6-v2")
+
+def load_data():
+    movies = pd.read_csv("data/processed/movies_enriched.csv")
+    data = np.load("data/processed/movie_embeddings.npz")
+    embeddings = data["embeddings"]
+    return movies, embeddings
+
+def recommend_movies(movie_title, movies_df, embeddings, top_k=5):
+    movie_title = str(movie_title).strip().lower()  # normalize input (handles non-str too)
+
+    # Title matching
+    all_titles = movies_df["matched_title"].fillna("").astype(str).tolist()
+    match = difflib.get_close_matches(movie_title, all_titles, n=1, cutoff=0.6)
+
+    if not match:
+        return None
+
+    matched_title = match[0]
+    idx = movies_df[movies_df["matched_title"] == matched_title].index[0]
+
+    # Instead of comparing embeddings[idx] against the others (which may be weak),
+    # encode the *user input* itself and rank by cosine similarity
+    query_vec = model.encode(movie_title, convert_to_tensor=True)
+
+    scores = util.cos_sim(query_vec, embeddings)[0].cpu().numpy()
+    top_indices = scores.argsort()[::-1][:top_k]
+
+    results = []
+    for i in top_indices:
+        row = movies_df.iloc[i]
+        poster_url = f"https://image.tmdb.org/t/p/w500{row['poster_path']}" if pd.notna(row["poster_path"]) else None
+        results.append({
+            "title": row["CleanTitle"],
+            "genres": row['genres'],
+            "overview": row["overview"],
+            "poster_path": poster_url,
+            "release_year": row["release_date"][:4] if pd.notna(row["release_date"]) else "Unknown",
+            "similarity": float(scores[i])
+        })
+
+    return {
+        "input_title": movies_df.iloc[idx]["Title"],
+        "results": results
+    }
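A hedged usage sketch for the recommender API, run from the repo root so the relative data paths resolve:

from scripts.recommender import load_data, recommend_movies

movies, embeddings = load_data()
recs = recommend_movies("toy story", movies, embeddings, top_k=5)

if recs is None:
    print("No close title match found.")
else:
    print("Matched:", recs["input_title"])
    for r in recs["results"]:
        print(f'{r["similarity"]:.2f}  {r["title"]} ({r["release_year"]})')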
src/streamlit_app.py DELETED
@@ -1,40 +0,0 @@
-import altair as alt
-import numpy as np
-import pandas as pd
-import streamlit as st
-
-"""
-# Welcome to Streamlit!
-
-Edit `/streamlit_app.py` to customize this app to your heart's desire :heart:.
-If you have any questions, checkout our [documentation](https://docs.streamlit.io) and [community
-forums](https://discuss.streamlit.io).
-
-In the meantime, below is an example of what you can do with just a few lines of code:
-"""
-
-num_points = st.slider("Number of points in spiral", 1, 10000, 1100)
-num_turns = st.slider("Number of turns in spiral", 1, 300, 31)
-
-indices = np.linspace(0, 1, num_points)
-theta = 2 * np.pi * num_turns * indices
-radius = indices
-
-x = radius * np.cos(theta)
-y = radius * np.sin(theta)
-
-df = pd.DataFrame({
-    "x": x,
-    "y": y,
-    "idx": indices,
-    "rand": np.random.randn(num_points),
-})
-
-st.altair_chart(alt.Chart(df, height=700, width=700)
-    .mark_point(filled=True)
-    .encode(
-        x=alt.X("x", axis=None),
-        y=alt.Y("y", axis=None),
-        color=alt.Color("idx", legend=None, scale=alt.Scale()),
-        size=alt.Size("rand", legend=None, scale=alt.Scale(range=[1, 150])),
-    ))