rajesh1804 committed
Commit 0981063 · 1 parent: ea2149c

initial commit for deployment

Dockerfile CHANGED
@@ -18,4 +18,4 @@ EXPOSE 8501
 
 HEALTHCHECK CMD curl --fail http://localhost:8501/_stcore/health
 
-ENTRYPOINT ["streamlit", "run", "src/streamlit_app.py", "--server.port=8501", "--server.address=0.0.0.0"]
+ENTRYPOINT ["streamlit", "run", "app.py", "--server.port=8501", "--server.address=0.0.0.0"]
README.md CHANGED
@@ -1,14 +1,12 @@
 ---
 title: StreamWiseAI
-emoji: 🚀
-colorFrom: red
-colorTo: red
-sdk: docker
-app_port: 8501
-tags:
-- streamlit
+emoji: 🎬
+colorFrom: "purple"
+colorTo: "green"
+sdk: streamlit
+sdk_version: "1.35.0"
+app_file: app.py
 pinned: false
-short_description: Streamlit template space
 license: mit
 ---
 
agent.py ADDED
@@ -0,0 +1,78 @@
+import os
+import requests
+from dotenv import load_dotenv
+from tenacity import retry, stop_after_attempt, wait_fixed, retry_if_exception_type
+
+load_dotenv()
+
+OPENROUTER_API_KEY = os.getenv("OPENROUTER_API_KEY")
+
+def generate_retention_tip(input_title, recommendations, user_history=None):
+    """
+    recommendations: list of dicts with keys title, genres, overview
+    user_history: optional list of previously watched movie titles
+    """
+    if not OPENROUTER_API_KEY:
+        raise ValueError("Missing OpenRouter API key. Set OPENROUTER_API_KEY as an env variable.")
+
+    prompt = build_prompt(input_title, recommendations, user_history)
+
+    headers = {
+        "Authorization": f"Bearer {OPENROUTER_API_KEY}",
+        "Content-Type": "application/json",
+        "HTTP-Referer": os.getenv("HTTP_REFERER"),  # your repo or site URL
+        "X-Title": "StreamWiseAI Retention Coach"
+    }
+
+
+    payload = {
+        "model": "mistralai/mistral-7b-instruct:free",  # free and fast
+        "messages": [
+            {"role": "system", "content": "You are a Retention Coach AI who helps users stay engaged by suggesting patterns in what they enjoy."},
+            {"role": "user", "content": prompt}
+        ]
+    }
+
+    @retry(
+        stop=stop_after_attempt(3),
+        wait=wait_fixed(2),
+        retry=retry_if_exception_type((requests.exceptions.RequestException,))
+    )
+    def call_openrouter():
+        response = requests.post(
+            "https://openrouter.ai/api/v1/chat/completions",
+            headers=headers,
+            json=payload,
+            timeout=15
+        )
+        response.raise_for_status()
+        return response.json()["choices"][0]["message"]["content"].strip()
+
+    try:
+        return call_openrouter()
+    except Exception as e:
+        print("Retry failed:", e)
+        return "⚠️ Unable to generate retention tip right now."
+
+
+def build_prompt(input_title, recommendations, user_history=None):
+    recs_text = ""
+    for rec in recommendations:
+        recs_text += f"- Title: {rec['title']}\n  Genres: {rec['genres']}\n  Overview: {rec['overview'][:200]}...\n"
+
+    history_text = ""
+    if user_history:
+        history_text = "Previously liked movies:\n" + "\n".join(f"- {title}" for title in user_history)
+
+    prompt = f"""
+The user searched for the movie: "{input_title}".
+
+Here are the top recommendations:
+{recs_text}
+
+{history_text}
+
+Based on this, suggest a 1–2 line insight about what the user might enjoy and a content retention tip.
+Only output the tip, no extra text.
+"""
+    return prompt.strip()
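
For a quick local check of the new agent module, a minimal smoke test might look like the sketch below — the recommendation dicts are invented examples, and it assumes OPENROUTER_API_KEY is already set (via the environment or a .env file):

    from agent import generate_retention_tip

    # Hypothetical sample data in the shape the function expects
    sample_recs = [
        {"title": "A Bug's Life", "genres": "Animation, Comedy",
         "overview": "An inventive ant recruits a troupe of circus bugs to fight off greedy grasshoppers."},
        {"title": "Monsters, Inc.", "genres": "Animation, Family",
         "overview": "Two monsters discover that a child's laugh is more powerful than any scream."},
    ]

    # Prints the model's tip, or the ⚠️ fallback string if the API call fails
    print(generate_retention_tip("Toy Story", sample_recs, user_history=["Shrek"]))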
app.py ADDED
@@ -0,0 +1,100 @@
+import streamlit as st
+from scripts.recommender import load_data, recommend_movies
+from agent import generate_retention_tip
+
+st.set_page_config(page_title="StreamWiseAI", layout="wide")
+st.title("🎬 StreamWiseAI – Personalized Movie Recommender & Retention Coach")
+st.caption("🤖 Powered by AI Agents · 🎯 Smart Search · 🧠 AI Insights")
+
+
+# Load data
+movies, embeddings = load_data()
+
+# Initialize watch history
+if "watch_history" not in st.session_state:
+    st.session_state["watch_history"] = []
+
+# Search input
+movie_input = st.text_input("Enter a movie you liked", placeholder="e.g. Toy Story")
+show_tip = st.checkbox("💡 Show retention insight from AI coach?", value=True)
+
+if movie_input:
+    with st.spinner("Finding great recommendations..."):
+        recommendations = recommend_movies(movie_input, movies, embeddings)
+
+    if not recommendations:
+        st.error("❌ Movie not found. Please try another title.")
+    else:
+        st.subheader(f"📽️ Recommendations for **{recommendations['input_title']}**")
+
+        if recommendations["input_title"] not in st.session_state["watch_history"]:
+            st.session_state["watch_history"].append(recommendations["input_title"])
+
+        cols = st.columns(2)
+
+        for idx, rec in enumerate(recommendations["results"]):
+            with cols[idx % 2]:
+                with st.container():
+                    st.markdown("#### 🎬 " + rec['title'] + f" ({rec['release_year']})")
+
+                    # poster_path is already a full TMDb URL from the recommender; fall back to a placeholder
+                    if rec['poster_path']:
+                        st.image(rec['poster_path'], width=150)
+                    else:
+                        st.image("https://via.placeholder.com/150x225.png?text=No+Image", width=150)
+
+                    st.markdown(f"**🎭 Genre(s):** {rec['genres']}")
+                    st.markdown(f"**🧠 Similarity Score:** {rec['similarity']:.2f}")
+
+                    # Truncate overview if too long
+                    short_overview = rec['overview']
+                    if len(short_overview) > 250:
+                        short_overview = short_overview[:250] + "..."
+                    st.markdown(f"_{short_overview}_")
+
+                    st.markdown("---")
+
+        if show_tip:
+            with st.spinner("🤖 Retention Coach is analyzing your taste..."):
+                tip = generate_retention_tip(movie_input, recommendations["results"], st.session_state.get("watch_history", []))
+                if tip and not tip.startswith("⚠️"):
+                    st.markdown("### 💡 Retention Coach Suggests:")
+                    st.markdown(f"""
+<div style="background-color:#f0f8ff; padding:15px; border-radius:10px; border-left:5px solid #1f77b4;">
+  <span style="font-size:16px;">{tip}</span>
+</div>
+""", unsafe_allow_html=True)
+                else:
+                    st.warning("Couldn't generate tip at the moment.")
+
+with st.sidebar:
+    st.markdown("## 🎬 **About StreamWiseAI**")
+
+    st.markdown("""
+<span style='color:#6c63ff'><strong>StreamWiseAI</strong></span> is a personalized movie discovery engine designed for modern streaming platforms.
+
+Built to impress recruiters and mimic real-world production use cases, it features:
+
+🔍 <span style='color:#FFA500'><strong>Semantic Search</strong></span> — understands meaning, not just keywords
+🧠 <span style='color:#00BFFF'><strong>AI Retention Coach</strong></span> — an LLM agent that offers viewing tips
+🗂️ <span style='color:#32CD32'><strong>Watch History Memory</strong></span> — tracks the user session dynamically
+🚀 <span style='color:#FF69B4'><strong>Built for Showcase</strong></span> — fast, deployable & free
+
+---
+
+<small><i>Tech stack: Sentence Transformers · Streamlit · OpenRouter LLM API · Fuzzy Matching · Vector Index</i></small>
+""", unsafe_allow_html=True)
+
+
+    if st.session_state["watch_history"]:
+        st.divider()
+        with st.expander("📜 Recently Searched"):
+            st.markdown("👀 Here’s a list of your recent searches:")
+            st.markdown("\n".join(f"- {title}" for title in st.session_state["watch_history"]))
+
+
+    st.markdown("---")
+    st.markdown(
+        "<small>🚀 Built by Rajesh Marudhachalam</small>",
+        unsafe_allow_html=True
+    )
data/processed/movie_embeddings.npz ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:bd7e4f70dd34ad2a7663b138e5988cb9a28a090eda8ffb2b9ed725afc9673709
+size 6036450
data/processed/movies_enriched.csv ADDED
The diff for this file is too large to render. See raw diff
 
requirements.txt CHANGED
@@ -1,3 +1,10 @@
-altair
-pandas
-streamlit
+pandas==2.2.2
+requests==2.31.0
+tqdm==4.66.4
+fuzzywuzzy==0.18.0
+python-Levenshtein==0.12.2
+sentence-transformers==2.7.0
+scikit-learn==1.4.2
+streamlit==1.35.0
+python-dotenv==1.0.1
+tenacity==8.2.3
scripts/download_movielens.py ADDED
@@ -0,0 +1,17 @@
+import os
+import zipfile
+import urllib.request
+
+DATA_DIR = "data/raw/movielens"
+URL = "https://files.grouplens.org/datasets/movielens/ml-1m.zip"
+ZIP_PATH = "data/raw/movielens/ml-1m.zip"
+
+os.makedirs(DATA_DIR, exist_ok=True)
+
+print("Downloading MovieLens 1M...")
+urllib.request.urlretrieve(URL, ZIP_PATH)
+
+with zipfile.ZipFile(ZIP_PATH, "r") as zip_ref:
+    zip_ref.extractall(DATA_DIR)
+
+print("✅ Download complete. Extracted to:", DATA_DIR)
scripts/enrich_movies_with_metadata.py ADDED
@@ -0,0 +1,78 @@
+import pandas as pd
+from fuzzywuzzy import fuzz
+from tqdm import tqdm
+import ast
+
+ml_path = "data/processed/movies.csv"
+tmdb_path = "data/raw/tmdb/movies_metadata.csv"
+out_path = "data/processed/movies_enriched.csv"
+
+tqdm.pandas()
+
+# Load MovieLens
+ml = pd.read_csv(ml_path)
+ml["CleanTitle"] = ml["Title"].str.extract(r"^(.*)\s\(\d{4}\)", expand=False).str.strip()
+ml.dropna(subset=['CleanTitle'], inplace=True)
+ml['Year'] = ml['Title'].str.extract(r"\((\d{4})\)", expand=False)
+ml['Genres'] = ml['Genres'].str.replace('|', ', ', regex=False)
+ml['CleanTitle'] = ml['CleanTitle'].str.replace(r"\s*\(.*?\)", "", regex=True).str.strip()
+
+# Load TMDb metadata
+tmdb = pd.read_csv(tmdb_path, low_memory=False)
+tmdb = tmdb.dropna(subset=["title", "overview"])
+tmdb["title_clean"] = tmdb["title"].str.lower().str.strip()
+tmdb['release_year'] = tmdb['release_date'].str[:4]
+tmdb['genres'] = tmdb['genres'].apply(
+    lambda x: ', '.join(d['name'] for d in ast.literal_eval(x) if 'name' in d)
+)
+
+# Find the best fuzzy match among TMDb titles from the same release year
+def get_best_match(title, year):
+    choices = tmdb["title_clean"][tmdb["release_year"].astype(str) == year].tolist()
+    if not choices:  # no TMDb titles for this year; avoid max() on an empty list
+        return (None, 0)
+    scores = [(choice, fuzz.token_sort_ratio(str(title).lower(), choice)) for choice in choices]
+    best = max(scores, key=lambda x: x[1])
+    return best if best[1] > 80 else (None, 0)  # similarity threshold
+
+
+# Apply fuzzy matching
+matches = ml[["CleanTitle", "Year"]].progress_apply(
+    lambda x: get_best_match(
+        x['CleanTitle'],
+        x['Year']
+    ),
+    axis=1)
+ml["matched_title"] = matches.apply(lambda x: x[0])
+ml["match_score"] = matches.apply(lambda x: x[1])
+ml.dropna(subset=['matched_title'], inplace=True)
+
+# Merge on matched title
+merged = ml.merge(tmdb, left_on="matched_title", right_on="title_clean", how="left")
+
+def merge_unique_genres(col1, col2):
+    # Split by comma and strip whitespace
+    list1 = [x.strip() for x in col1.split(',')] if pd.notna(col1) else []
+    list2 = [x.strip() for x in col2.split(',')] if pd.notna(col2) else []
+
+    # Combine while preserving order and removing duplicates
+    seen = set()
+    merged = []
+    for item in list1 + list2:
+        if item not in seen:
+            seen.add(item)
+            merged.append(item)
+    return ', '.join(merged)
+
+merged['genres'] = merged.apply(lambda row: merge_unique_genres(row['Genres'], row['genres']), axis=1)
+
+
+# Keep relevant columns
+keep_cols = [
+    "MovieID", "Title", "Genres", "CleanTitle",
+    "overview", "genres", "release_date", "release_year", "poster_path", "matched_title", "match_score"
+]
+final = merged[keep_cols]
+final.to_csv(out_path, index=False)
+
+print("✅ Enriched metadata saved to:", out_path)
scripts/generate_embeddings.py ADDED
@@ -0,0 +1,42 @@
+import pandas as pd
+from sentence_transformers import SentenceTransformer
+from sklearn.metrics.pairwise import cosine_similarity
+import numpy as np
+from tqdm import tqdm
+
+tqdm.pandas()
+
+DATA_PATH = "data/processed/movies_enriched.csv"
+OUTPUT_PATH = "data/processed/movie_embeddings.npz"
+
+print("🔍 Loading movie metadata...")
+df = pd.read_csv(DATA_PATH)
+df = df.dropna(subset=["overview"])
+# df["release_year"] = pd.to_datetime(df["release_date"], errors='coerce').dt.year
+print(f"✅ Loaded {len(df)} movies with valid overviews.")
+
+# Load Sentence-BERT model
+print("🧠 Loading Sentence-BERT model (all-MiniLM-L6-v2)...")
+model = SentenceTransformer("all-MiniLM-L6-v2")
+
+# Generate embeddings with progress bar
+print("⚙️ Generating semantic embeddings...")
+def build_embedding_text(row):
+    return f"{row['CleanTitle']} ({row['release_year']}) — {row['genres']}. {row['overview']}"
+
+texts = df.apply(build_embedding_text, axis=1).tolist()
+
+embeddings = []
+for emb in tqdm(model.encode(texts, batch_size=32, show_progress_bar=False), total=len(texts), desc="📈 Encoding"):
+    embeddings.append(emb)
+
+embeddings = np.array(embeddings)
+
+# Save embeddings and metadata
+print("💾 Saving embeddings and metadata...")
+np.savez_compressed(OUTPUT_PATH,
+                    embeddings=embeddings,
+                    titles=df["CleanTitle"].tolist(),
+                    movie_ids=df["MovieID"].tolist())
+
+print("✅ Done! Embeddings saved to:", OUTPUT_PATH)
scripts/preprocess_movielens.py ADDED
@@ -0,0 +1,23 @@
+import pandas as pd
+import os
+
+RAW_DIR = "data/raw/movielens/ml-1m"
+PROCESSED_DIR = "data/processed"
+os.makedirs(PROCESSED_DIR, exist_ok=True)
+
+# Read .dat files using the correct encoding and separator
+users = pd.read_csv(f"{RAW_DIR}/users.dat", sep="::", engine="python", encoding="latin-1",
+                    names=["UserID", "Gender", "Age", "Occupation", "Zip-code"])
+
+movies = pd.read_csv(f"{RAW_DIR}/movies.dat", sep="::", engine="python", encoding="latin-1",
+                     names=["MovieID", "Title", "Genres"])
+
+ratings = pd.read_csv(f"{RAW_DIR}/ratings.dat", sep="::", engine="python", encoding="latin-1",
+                      names=["UserID", "MovieID", "Rating", "Timestamp"])
+
+# Save cleaned CSVs
+users.to_csv(f"{PROCESSED_DIR}/users.csv", index=False)
+movies.to_csv(f"{PROCESSED_DIR}/movies.csv", index=False)
+ratings.to_csv(f"{PROCESSED_DIR}/ratings.csv", index=False)
+
+print("✅ Preprocessing complete. Cleaned CSVs saved to:", PROCESSED_DIR)
scripts/recommender.py ADDED
@@ -0,0 +1,49 @@
+import difflib
+import numpy as np
+import pandas as pd
+from sentence_transformers import SentenceTransformer, util
+
+def load_data():
+    movies = pd.read_csv("data/processed/movies_enriched.csv")
+    data = np.load("data/processed/movie_embeddings.npz")
+    embeddings = data["embeddings"]
+    return movies, embeddings
+
+def recommend_movies(movie_title, movies_df, embeddings, top_k=5):
+    movie_title = str(movie_title).strip().lower()
+
+    # Title matching
+    all_titles = movies_df["matched_title"].fillna("").astype(str).tolist()
+    match = difflib.get_close_matches(movie_title, all_titles, n=1, cutoff=0.6)
+
+    if not match:
+        return None
+
+    matched_title = match[0]
+    idx = movies_df[movies_df["matched_title"] == matched_title].index[0]
+
+    # Instead of comparing embeddings[idx] against the others (which may be weak),
+    # encode the user input itself and rank all movies by cosine similarity
+    model = SentenceTransformer("all-MiniLM-L6-v2")
+    query_vec = model.encode(movie_title, convert_to_tensor=True)
+
+    scores = util.cos_sim(query_vec, embeddings)[0].cpu().numpy()
+    top_indices = scores.argsort()[::-1][:top_k]
+
+    results = []
+    for i in top_indices:
+        row = movies_df.iloc[i]
+        poster_url = f"https://image.tmdb.org/t/p/w500{row['poster_path']}" if pd.notna(row["poster_path"]) else None
+        results.append({
+            "title": row["CleanTitle"],
+            "genres": row['genres'],
+            "overview": row["overview"],
+            "poster_path": poster_url,
+            "release_year": row["release_date"][:4] if pd.notna(row["release_date"]) else "Unknown",
+            "similarity": float(scores[i])
+        })
+
+    return {
+        "input_title": movies_df.iloc[idx]["Title"],
+        "results": results
+    }
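
Finally, an end-to-end call into the recommender — a sketch that assumes the pipeline above has produced the files under data/processed/ and that it runs from the repo root:

    from scripts.recommender import load_data, recommend_movies

    movies, embeddings = load_data()
    recs = recommend_movies("toy story", movies, embeddings, top_k=3)
    if recs:
        print("Matched:", recs["input_title"])
        for r in recs["results"]:
            print(f"{r['title']} ({r['release_year']}) – similarity {r['similarity']:.2f}")
    else:
        print("No close title match found.")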
src/streamlit_app.py DELETED
@@ -1,40 +0,0 @@
-import altair as alt
-import numpy as np
-import pandas as pd
-import streamlit as st
-
-"""
-# Welcome to Streamlit!
-
-Edit `/streamlit_app.py` to customize this app to your heart's desire :heart:.
-If you have any questions, checkout our [documentation](https://docs.streamlit.io) and [community
-forums](https://discuss.streamlit.io).
-
-In the meantime, below is an example of what you can do with just a few lines of code:
-"""
-
-num_points = st.slider("Number of points in spiral", 1, 10000, 1100)
-num_turns = st.slider("Number of turns in spiral", 1, 300, 31)
-
-indices = np.linspace(0, 1, num_points)
-theta = 2 * np.pi * num_turns * indices
-radius = indices
-
-x = radius * np.cos(theta)
-y = radius * np.sin(theta)
-
-df = pd.DataFrame({
-    "x": x,
-    "y": y,
-    "idx": indices,
-    "rand": np.random.randn(num_points),
-})
-
-st.altair_chart(alt.Chart(df, height=700, width=700)
-    .mark_point(filled=True)
-    .encode(
-        x=alt.X("x", axis=None),
-        y=alt.Y("y", axis=None),
-        color=alt.Color("idx", legend=None, scale=alt.Scale()),
-        size=alt.Size("rand", legend=None, scale=alt.Scale(range=[1, 150])),
-    ))