"""
Streamlit Dashboard for DLRM Book Recommendation System

Simple interface for DLRM-based book recommendations
"""

import os
import pickle
import sys
import warnings
from typing import Dict, List, Tuple, Optional

import numpy as np
import pandas as pd
import streamlit as st

warnings.filterwarnings('ignore')


# Make the working directory importable so the local ``dlrm_inference`` module
# resolves. Streamlit re-executes this script on every interaction, so the
# append is guarded to keep sys.path from growing by one entry per rerun.
if '.' not in sys.path:
    sys.path.append('.')

try:
    from dlrm_inference import DLRMBookRecommender, load_dlrm_recommender
    DLRM_AVAILABLE = True
    _DLRM_IMPORT_ERROR = None
except ImportError as e:
    DLRM_AVAILABLE = False
    # ``e`` is unbound once the except block ends, so keep a reference; the
    # message is reported below, after the page has been configured.
    _DLRM_IMPORT_ERROR = e

# Configure the page before emitting any other Streamlit element: Streamlit
# releases that require set_page_config to be the first Streamlit command
# would otherwise fail when the import error above was reported first.
st.set_page_config(
    page_title="DLRM Book Recommendations",
    page_icon="📚",  # NOTE(review): glyph restored from mis-decoded source; confirm
    layout="wide",
    initial_sidebar_state="expanded",
)

if _DLRM_IMPORT_ERROR is not None:
    st.error(f"DLRM components not available: {_DLRM_IMPORT_ERROR}")

# Page-wide CSS classes; ``main-header`` and ``dlrm-explanation`` are
# referenced by the HTML snippets that main() renders.
st.markdown("""
<style>
    .main-header {
        font-size: 3rem;
        color: #1f77b4;
        text-align: center;
        margin-bottom: 2rem;
    }
    .metric-card {
        background-color: #f0f2f6;
        padding: 1rem;
        border-radius: 0.5rem;
        border-left: 5px solid #1f77b4;
    }
    .dlrm-explanation {
        background-color: #e8f4fd;
        padding: 1rem;
        border-radius: 0.5rem;
        border-left: 4px solid #0066cc;
        margin: 1rem 0;
    }
    .book-card {
        background-color: #ffffff;
        padding: 1rem;
        border-radius: 0.5rem;
        border: 1px solid #e1e5eb;
        margin-bottom: 1rem;
    }
</style>
""", unsafe_allow_html=True)


@st.cache_data
def _read_book_tables():
    """Read Books.csv, Users.csv and Ratings.csv from the working directory.

    Returns:
        tuple: ``(books_df, users_df, ratings_df)`` with double quotes
        stripped from every column name.

    Raises:
        Exception: whatever ``pandas.read_csv`` raises. Errors are left to
        propagate on purpose so that a failed read is not memoised.
    """
    tables = []
    for filename in ('Books.csv', 'Users.csv', 'Ratings.csv'):
        frame = pd.read_csv(filename, encoding='latin-1', low_memory=False)
        # Header cells may carry literal quote characters; drop them so the
        # columns can be addressed by their plain names.
        frame.columns = frame.columns.str.replace('"', '')
        tables.append(frame)
    return tuple(tables)


def load_data():
    """Load and cache the book data.

    Only successful reads are cached (inside ``_read_book_tables``). The
    failure path runs outside the cache, so a missing or broken CSV is retried
    on the next rerun instead of returning a memoised ``(None, None, None)``.

    Returns:
        tuple: ``(books_df, users_df, ratings_df)`` on success, or
        ``(None, None, None)`` after reporting the error with ``st.error``.
    """
    try:
        return _read_book_tables()
    except Exception as e:  # UI boundary: report any failure, never crash
        st.error(f"Error loading data: {e}")
        return None, None, None


@st.cache_resource
def _build_dlrm_recommender():
    """Create the recommender via ``load_dlrm_recommender("file")``.

    Exceptions propagate so that a failed load is not stored in the resource
    cache; only a successfully returned object is reused across reruns.
    """
    # NOTE(review): the meaning of the "file" argument is defined by
    # dlrm_inference.load_dlrm_recommender, which is not visible here.
    return load_dlrm_recommender("file")


def load_dlrm_model():
    """Load and cache the DLRM model.

    Returns:
        The object returned by ``load_dlrm_recommender``, or ``None`` when the
        DLRM import failed at start-up or loading raised (the error is shown
        with ``st.error``). A failure is not cached, so the load is attempted
        again on the next rerun.
    """
    if not DLRM_AVAILABLE:
        return None

    try:
        return _build_dlrm_recommender()
    except Exception as e:  # UI boundary: report any failure, never crash
        st.error(f"Error loading DLRM model: {e}")
        return None


def display_book_info(book_isbn, books_df, show_rating=None):
    """Display book information with actual book cover.

    Renders a two-column card: the cover image (or a placeholder) on the left
    and title, author, year, publisher and ISBN on the right.

    NOTE(review): the emoji in the UI strings were mis-decoded in the source
    and have been restored from the surviving bytes and context; confirm them
    against the original file.

    Args:
        book_isbn: ISBN to look up in the ``ISBN`` column of ``books_df``.
        books_df: Book table. ``ISBN``, ``Book-Title`` and ``Book-Author`` are
            read directly; ``Image-URL-M``, ``Year-Of-Publication`` and
            ``Publisher`` are read with fallbacks.
        show_rating: Optional score; when not ``None`` it is rendered with
            four decimals as the "DLRM Score".

    Returns:
        None. If no row matches ``book_isbn`` a "not found" line is written
        and nothing else is rendered.
    """
    placeholder_url = "https://via.placeholder.com/150x200?text=📚&color=1f77b4&bg=f0f2f6"

    matches = books_df[books_df['ISBN'] == book_isbn]
    if matches.empty:
        st.write(f"Book with ISBN {book_isbn} not found")
        return

    book = matches.iloc[0]
    cover_col, details_col = st.columns([1, 3])

    with cover_col:
        raw_url = book.get('Image-URL-M', '')
        # Missing cells come back as NaN (or the text 'nan'); treat those and
        # blank strings as "no cover recorded".
        if pd.isna(raw_url) or str(raw_url).strip() in ('', 'nan'):
            st.image(placeholder_url, width=150)
            st.caption("📚 No cover")
        else:
            cover_url = str(raw_url).strip()
            try:
                # Require a real URL scheme rather than 'http' appearing
                # anywhere inside the value.
                if cover_url.startswith(('http://', 'https://')):
                    st.image(cover_url, width=150, caption="📖")
                else:
                    st.image(placeholder_url, width=150)
            except Exception:
                # Best effort: a cover that cannot be rendered must not take
                # the rest of the card down with it.
                st.image(placeholder_url, width=150)
                st.caption("⚠️ Cover unavailable")

    with details_col:
        st.markdown(f"**{book['Book-Title']}**")
        st.write(f"*by {book['Book-Author']}*")
        st.write(f"📅 Published: {book.get('Year-Of-Publication', 'Unknown')}")
        st.write(f"🏢 Publisher: {book.get('Publisher', 'Unknown')}")
        st.write(f"📖 ISBN: {book['ISBN']}")

        if show_rating is not None:
            st.markdown(f"**🎯 DLRM Score: {show_rating:.4f}**")


def main():
    """Render the dashboard: header, sidebar summary and the four tabs.

    Flow: stop early if the DLRM import failed, if ``load_data()`` returned no
    data, or if ``load_dlrm_model()`` returned no usable model; otherwise draw
    the sidebar and delegate each tab to its ``_render_*_tab`` helper.

    NOTE(review): the emoji in the UI strings were mis-decoded in the source
    (several literals were even split across lines). They have been restored
    from the surviving bytes and context; confirm against the original file.
    """
    st.markdown(
        '<h1 class="main-header">📚 DLRM Book Recommendation System</h1>',
        unsafe_allow_html=True,
    )
    st.markdown("### Deep Learning Recommendation Model for Personalized Book Suggestions")
    st.markdown("---")

    if not DLRM_AVAILABLE:
        st.error("DLRM components are not available. Please ensure TorchRec is properly installed.")
        st.info("To install TorchRec: `pip install torchrec`")
        return

    with st.spinner("Loading book data..."):
        books_df, users_df, ratings_df = load_data()

    if books_df is None:
        st.error("Failed to load data. Please check if CSV files are available.")
        return

    st.sidebar.title("📊 Dataset Information")
    st.sidebar.metric("📚 Books", f"{len(books_df):,}")
    st.sidebar.metric("👥 Users", f"{len(users_df):,}")
    st.sidebar.metric("⭐ Ratings", f"{len(ratings_df):,}")

    with st.spinner("Loading DLRM model..."):
        recommender = load_dlrm_model()

    if recommender is None or recommender.model is None:
        st.error("❌ DLRM model not available")
        st.info("Please run the training script first: `python train_dlrm_books.py`")

        st.markdown("### Available Options:")
        st.markdown("1. **Train DLRM Model**: Run `python train_dlrm_books.py`")
        st.markdown("2. **Prepare Data**: Run `python dlrm_book_recommender.py`")
        st.markdown("3. **Check Files**: Ensure preprocessing files exist")
        return

    st.success("✅ DLRM model loaded successfully!")

    st.sidebar.markdown("---")
    st.sidebar.subheader("🤖 DLRM Model Info")
    if recommender.preprocessing_info:
        st.sidebar.write(f"Dense features: {len(recommender.dense_cols)}")
        st.sidebar.write(f"Categorical features: {len(recommender.cat_cols)}")
        st.sidebar.write("Embedding dim: 64")

    # Both user pickers offer a prefix of the same sorted id list.
    user_ids = sorted(users_df['User-ID'].unique())

    rec_tab, test_tab, analysis_tab, gallery_tab = st.tabs(
        ["🎯 Get Recommendations", "🔍 Test Predictions", "📊 Model Analysis", "📸 Book Gallery"]
    )
    with rec_tab:
        _render_recommendations_tab(recommender, books_df, users_df, ratings_df, user_ids)
    with test_tab:
        _render_prediction_test_tab(recommender, books_df, ratings_df, user_ids)
    with analysis_tab:
        _render_analysis_tab(recommender, books_df, users_df)
    with gallery_tab:
        _render_gallery_tab(recommender, books_df, users_df, ratings_df)


# Stand-in image for gallery cards without a usable cover URL.
_GALLERY_PLACEHOLDER_URL = "https://via.placeholder.com/150x200?text=📚&color=1f77b4&bg=f0f2f6"

# File the analysis tab reads training metrics from, when it exists.
_TRAINING_RESULTS_PATH = 'dlrm_book_training_results.pkl'

# Static explainer shown at the bottom of the analysis tab. Kept at module
# level, unindented, so Markdown never sees leading spaces as a code block.
_DLRM_EXPLAINER_MD = """
## 🎓 How DLRM Works for Book Recommendations

**DLRM (Deep Learning Recommendation Model)** is specifically designed for recommendation systems and offers several advantages:

### 🏗️ Architecture Benefits:
- **Multi-feature Processing**: Handles both categorical (user ID, book ID, publisher) and numerical (age, ratings) features
- **Embedding Tables**: Learns rich representations for categorical features
- **Cross-feature Interactions**: Captures complex relationships between different features
- **Scalable Design**: Efficiently handles large-scale recommendation datasets

### 📊 Features Used:
**Categorical Features:**
- User ID, Book ID, Publisher, Country, Age Group, Publication Decade, Rating Level

**Dense Features:**
- Normalized Age, Publication Year, User Activity, Book Popularity, Average Ratings

### 🎯 Why DLRM vs LLM for Recommendations:
- **Purpose-built**: Specifically designed for recommendation systems
- **Feature Integration**: Better at combining diverse feature types
- **Scalability**: More efficient for large-scale recommendation tasks
- **Performance**: Higher accuracy for rating prediction tasks
- **Production Ready**: Optimized for real-time inference

### 💡 Best Use Cases:
- **Personalized Recommendations**: Based on user behavior and item characteristics
- **Rating Prediction**: Accurately predicts user preferences
- **Cold Start**: Handles new users and items through content features
- **Real-time Serving**: Fast inference for production systems
"""


def _lookup_book(books_df, isbn):
    """Return the first ``books_df`` row whose ISBN equals ``isbn``, or None."""
    matches = books_df[books_df['ISBN'] == isbn]
    return None if matches.empty else matches.iloc[0]


def _shorten(value, limit):
    """Return ``value`` as text, cut to ``limit`` characters ending in "..."."""
    if not isinstance(value, str):
        # Empty CSV cells arrive as float NaN, which has no len().
        value = "Unknown" if pd.isna(value) else str(value)
    return value if len(value) <= limit else value[:limit - 3] + "..."


def _usable_cover_url(raw_url):
    """Return ``raw_url`` stripped if it is an http(s) URL, otherwise None."""
    if raw_url is None or pd.isna(raw_url):
        return None
    url = str(raw_url).strip()
    return url if url.startswith(('http://', 'https://')) else None


def _candidate_books(ratings_df, rated_isbns, needed):
    """Pick popular ISBNs to score, preferring ones the user has not rated.

    Tries the 100 and then the 200 most-rated books with ``rated_isbns``
    removed; only if that still yields fewer than ``needed`` does it fall back
    to the unfiltered top 200.
    """
    popularity = ratings_df.groupby('ISBN').size().sort_values(ascending=False)
    for pool_size in (100, 200):
        unseen = [isbn for isbn in popularity.head(pool_size).index if isbn not in rated_isbns]
        if len(unseen) >= needed:
            return unseen
    return popularity.head(200).index.tolist()


def _render_recommendations_tab(recommender, books_df, users_df, ratings_df, user_ids):
    """Tab 1: pick a user, show their history, and list DLRM recommendations."""
    st.header("🎯 DLRM Book Recommendations")
    st.info("Get personalized book recommendations using the trained DLRM model")

    user_col, count_col = st.columns([2, 1])
    with user_col:
        selected_user_id = st.selectbox("Select a user", user_ids[:1000])
    with count_col:
        num_recommendations = st.slider("Number of recommendations", 5, 20, 10)

    user_info = users_df[users_df['User-ID'] == selected_user_id]
    if not user_info.empty:
        user = user_info.iloc[0]
        st.markdown(
            f"**User Info**: Age: {user.get('Age', 'Unknown')}, "
            f"Location: {user.get('Location', 'Unknown')}"
        )

    user_ratings = ratings_df[ratings_df['User-ID'] == selected_user_id]
    if not user_ratings.empty:
        with st.expander(f"📚 User's Reading History ({len(user_ratings)} books)", expanded=False):
            top_rated = user_ratings.sort_values('Book-Rating', ascending=False).head(10)
            for _, rating in top_rated.iterrows():
                book = _lookup_book(books_df, rating['ISBN'])
                if book is not None:
                    st.write(
                        f"• **{book['Book-Title']}** by {book['Book-Author']} "
                        f"- {rating['Book-Rating']}/10 ⭐"
                    )

    if not st.button("🚀 Get DLRM Recommendations", type="primary"):
        return

    with st.spinner("🤖 DLRM is analyzing user preferences..."):
        candidate_books = _candidate_books(
            ratings_df, set(user_ratings['ISBN']), num_recommendations
        )
        recommendations = recommender.get_user_recommendations(
            user_id=selected_user_id,
            candidate_books=candidate_books,
            k=num_recommendations,
        )

    if not recommendations:
        st.warning("No recommendations generated")
        return

    st.success(f"Generated {len(recommendations)} DLRM recommendations!")
    st.subheader("🎯 DLRM Recommendations")

    for rank, (book_isbn, score) in enumerate(recommendations, 1):
        if _lookup_book(books_df, book_isbn) is None:
            st.write(f"Book with ISBN {book_isbn} not found in database")
            continue

        with st.expander(f"{rank}. Recommendation (DLRM Score: {score:.4f})", expanded=(rank <= 3)):
            display_book_info(book_isbn, books_df, show_rating=score)

            book_ratings = ratings_df[ratings_df['ISBN'] == book_isbn]
            if book_ratings.empty:
                continue
            avg_rating = book_ratings['Book-Rating'].mean()
            # One markdown call: each st.markdown is its own element, so an
            # opening <div> in one call cannot wrap content from later calls.
            # Only numbers are interpolated into the HTML.
            st.markdown(
                '<div class="dlrm-explanation">'
                '<strong>📊 Book Statistics:</strong><br>'
                f'Average Rating: {avg_rating:.1f}/10 from {len(book_ratings)} readers<br>'
                f'DLRM Confidence: {score:.1%}'
                '</div>',
                unsafe_allow_html=True,
            )


def _render_prediction_test_tab(recommender, books_df, ratings_df, user_ids):
    """Tab 2: compare DLRM scores with a user's real ratings or random books."""
    st.header("🔍 Test DLRM Predictions")
    st.info("Test how well DLRM predicts actual user ratings")

    user_col, mode_col = st.columns(2)
    with user_col:
        test_user_id = st.selectbox("Select user for testing", user_ids[:500], key="test_user")
    with mode_col:
        test_mode = st.radio("Test mode", ["Random books", "User's actual books"])

    if not st.button("🧪 Test Predictions", type="secondary"):
        return

    with st.spinner("Testing DLRM predictions..."):
        if test_mode == "User's actual books":
            _show_actual_rating_test(recommender, books_df, ratings_df, test_user_id)
        else:
            _show_random_book_test(recommender, books_df, test_user_id)


def _show_actual_rating_test(recommender, books_df, ratings_df, test_user_id):
    """Score up to 10 of the test user's rated books and report accuracy."""
    own_ratings = ratings_df[ratings_df['User-ID'] == test_user_id]
    if own_ratings.empty:
        st.warning("No ratings found for this user")
        return

    # Size the sample from THIS user's ratings; sampling more rows than exist
    # raises ValueError.
    sampled = own_ratings.sample(min(10, len(own_ratings)))

    st.subheader("🎯 DLRM vs Actual Ratings")

    predictions = []
    actual_ratings = []
    for _, rating in sampled.iterrows():
        book_isbn = rating['ISBN']
        actual_rating = rating['Book-Rating']
        dlrm_score = recommender.predict_rating(test_user_id, book_isbn)

        predictions.append(dlrm_score)
        actual_ratings.append(actual_rating)

        book = _lookup_book(books_df, book_isbn)
        if book is None:
            continue

        title_col, actual_col, score_col = st.columns([2, 1, 1])
        with title_col:
            st.write(f"**{book['Book-Title']}**")
            st.write(f"*by {book['Book-Author']}*")
        with actual_col:
            st.metric("Actual Rating", f"{actual_rating}/10")
        with score_col:
            st.metric("DLRM Score", f"{dlrm_score:.3f}")

    # A rating of 6+ counts as "liked"; a score above 0.5 predicts "liked".
    liked = [r >= 6 for r in actual_ratings]
    predicted_liked = [p > 0.5 for p in predictions]
    accuracy = sum(p == a for p, a in zip(predicted_liked, liked)) / len(liked)

    st.markdown("---")
    st.success(f"🎯 DLRM Accuracy: {accuracy:.1%}")

    correlation = np.corrcoef(predictions, actual_ratings)[0, 1] if len(predictions) > 1 else 0
    st.info(f"📊 Correlation with actual ratings: {correlation:.3f}")


def _show_random_book_test(recommender, books_df, test_user_id):
    """Score up to 10 randomly drawn books for the test user."""
    random_books = books_df.sample(min(10, len(books_df)))

    st.subheader("🎲 Random Book Predictions")

    for _, book in random_books.iterrows():
        dlrm_score = recommender.predict_rating(test_user_id, book['ISBN'])

        title_col, score_col = st.columns([3, 1])
        with title_col:
            st.write(f"**{book['Book-Title']}** by *{book['Book-Author']}*")
        with score_col:
            st.metric("DLRM Score", f"{dlrm_score:.4f}")


def _load_training_results(path):
    """Return the unpickled object stored at ``path``, or None if unreadable."""
    try:
        with open(path, 'rb') as f:
            # NOTE(review): pickle.load can execute code embedded in the file.
            # Presumably this file is written locally by the training script;
            # never point this at a file from an untrusted source.
            return pickle.load(f)
    except (OSError, EOFError, pickle.UnpicklingError) as e:
        st.warning(f"Could not read training results: {e}")
        return None


def _render_analysis_tab(recommender, books_df, users_df):
    """Tab 3: model/feature summary, a quick score probe, training metrics."""
    st.header("📊 DLRM Model Analysis")
    st.info("Analysis of the DLRM model performance and characteristics")

    info = recommender.preprocessing_info
    if info:
        arch_col, data_col = st.columns(2)

        with arch_col:
            st.subheader("🏗️ Model Architecture")
            st.write(f"**Dense Features ({len(recommender.dense_cols)}):**")
            for col in recommender.dense_cols:
                st.write(f"• {col}")

            st.write(f"**Categorical Features ({len(recommender.cat_cols)}):**")
            for col, emb_count in zip(recommender.cat_cols, recommender.emb_counts):
                st.write(f"• {col}: {emb_count} embeddings")

        with data_col:
            st.subheader("📊 Dataset Statistics")
            st.metric("Total Samples", f"{info.get('total_samples', 0):,}")
            st.metric("Positive Rate", f"{info.get('positive_rate', 0):.1%}")
            st.metric("Train Samples", f"{info.get('train_samples', 0):,}")
            st.metric("Validation Samples", f"{info.get('val_samples', 0):,}")
            st.metric("Test Samples", f"{info.get('test_samples', 0):,}")

    st.subheader("🔍 Feature Analysis")

    if st.button("Analyze Feature Importance"):
        with st.spinner("Analyzing feature importance..."):
            # Only a 5 x 5 grid is scored, so draw exactly that many ids,
            # capped at the table size so small datasets do not raise.
            sample_users = users_df['User-ID'].sample(min(5, len(users_df))).tolist()
            sample_books = books_df['ISBN'].sample(min(5, len(books_df))).tolist()

            st.write("**Feature Impact Analysis:**")

            base_predictions = [
                recommender.predict_rating(user_id, book_isbn)
                for user_id in sample_users
                for book_isbn in sample_books
            ]
            if base_predictions:
                st.metric("Average Prediction Score", f"{np.mean(base_predictions):.4f}")

            st.success("✅ Feature analysis completed!")

    if os.path.exists(_TRAINING_RESULTS_PATH):
        training_results = _load_training_results(_TRAINING_RESULTS_PATH)
        if training_results is not None:
            st.subheader("📈 Training Results")

            metrics_col, chart_col = st.columns(2)
            with metrics_col:
                st.metric(
                    "Final Validation AUROC",
                    f"{training_results.get('final_val_auroc', 0):.4f}",
                )
                st.metric("Test AUROC", f"{training_results.get('test_auroc', 0):.4f}")
            with chart_col:
                val_history = training_results.get('val_aurocs_history', [])
                if val_history:
                    st.line_chart(pd.DataFrame({
                        'Epoch': range(len(val_history)),
                        'Validation AUROC': val_history,
                    }).set_index('Epoch'))

    st.markdown("---")
    st.markdown(_DLRM_EXPLAINER_MD)


def _select_gallery_books(gallery_mode, books_df, ratings_df, max_books):
    """Return the rows to show for ``gallery_mode`` (at most ``max_books``).

    "Search Results" mode also renders its own text input.
    """
    if gallery_mode == "Popular Books":
        book_popularity = ratings_df.groupby('ISBN').size().sort_values(ascending=False)
        return books_df[books_df['ISBN'].isin(book_popularity.head(max_books).index)]

    if gallery_mode == "Recent Publications":
        by_year = books_df.copy()
        by_year['Year-Of-Publication'] = pd.to_numeric(
            by_year['Year-Of-Publication'], errors='coerce'
        )
        by_year = by_year.sort_values('Year-Of-Publication', ascending=False, na_position='last')
        return by_year.head(max_books)

    if gallery_mode == "Random Selection":
        return books_df.sample(min(max_books, len(books_df)))

    search_query = st.text_input(
        "Search books for gallery", placeholder="Enter title, author, or publisher"
    )
    if not search_query:
        return books_df.head(max_books)

    # regex=False: the query is user-typed text, and input such as "C++" or
    # "(" is not a valid regular expression.
    mask = (
        books_df['Book-Title'].str.contains(search_query, case=False, na=False, regex=False)
        | books_df['Book-Author'].str.contains(search_query, case=False, na=False, regex=False)
        | books_df['Publisher'].str.contains(search_query, case=False, na=False, regex=False)
    )
    return books_df[mask].head(max_books)


def _render_gallery_card(book, position, rating_stats, recommender, users_df):
    """Draw one gallery cell: cover, title, author, year, rating, score button."""
    cover_url = _usable_cover_url(book.get('Image-URL-M', ''))
    try:
        st.image(cover_url or _GALLERY_PLACEHOLDER_URL, width='stretch')
    except Exception:
        # Best effort: an unrenderable cover must not break the whole grid.
        st.image(_GALLERY_PLACEHOLDER_URL, width='stretch')

    st.markdown(f"**{_shorten(book['Book-Title'], 40)}**")
    st.write(f"*{_shorten(book['Book-Author'], 25)}*")
    st.write(f"📅 {book.get('Year-Of-Publication', 'Unknown')}")

    isbn = book['ISBN']
    if isbn in rating_stats.index:
        stats = rating_stats.loc[isbn]
        st.write(f"⭐ {stats['mean']:.1f}/10 ({int(stats['size'])} ratings)")
    else:
        st.write("⭐ No ratings")

    # The position is part of the key so two rows that share an ISBN do not
    # produce duplicate widget keys.
    if st.button("🎯 DLRM Score", key=f"dlrm_{position}_{isbn}"):
        with st.spinner("Calculating..."):
            # The score is computed for the first user in the table.
            sample_user = users_df['User-ID'].iloc[0]
            dlrm_score = recommender.predict_rating(sample_user, isbn)
            st.success(f"DLRM Score: {dlrm_score:.3f}")


def _render_gallery_tab(recommender, books_df, users_df, ratings_df):
    """Tab 4: grid of book covers plus summary statistics for the selection."""
    st.header("📸 Book Gallery")
    st.info("Browse book covers and discover new titles")

    mode_col, layout_col = st.columns([2, 1])
    with mode_col:
        gallery_mode = st.selectbox(
            "Choose gallery mode",
            ["Popular Books", "Recent Publications", "Random Selection", "Search Results"],
        )
    with layout_col:
        books_per_row = st.slider("Books per row", 2, 6, 4)
        max_books = st.slider("Maximum books", 10, 50, 20)

    gallery_books = _select_gallery_books(gallery_mode, books_df, ratings_df, max_books)

    if gallery_books.empty:
        st.info("No books found for the selected criteria")
    else:
        st.markdown(f"**📚 Showing {len(gallery_books)} books**")

        # Aggregate ratings once for the shown ISBNs instead of filtering the
        # whole ratings table again for every card.
        rating_stats = (
            ratings_df[ratings_df['ISBN'].isin(gallery_books['ISBN'])]
            .groupby('ISBN')['Book-Rating']
            .agg(['mean', 'size'])
        )

        books_list = gallery_books.to_dict('records')
        for row_start in range(0, len(books_list), books_per_row):
            cols = st.columns(books_per_row)
            row_books = books_list[row_start:row_start + books_per_row]
            for offset, (col, book) in enumerate(zip(cols, row_books)):
                with col:
                    _render_gallery_card(
                        book, row_start + offset, rating_stats, recommender, users_df
                    )

    st.markdown("---")
    st.subheader("📊 Gallery Statistics")

    covers_col, year_col, authors_col, publishers_col = st.columns(4)

    with covers_col:
        books_with_covers = sum(
            1 for url in gallery_books.get('Image-URL-M', []) if url and pd.notna(url)
        )
        st.metric("Books with Covers", f"{books_with_covers}/{len(gallery_books)}")

    with year_col:
        years = pd.to_numeric(gallery_books['Year-Of-Publication'], errors='coerce')
        avg_year = years.mean()
        st.metric(
            "Average Publication Year",
            f"{avg_year:.0f}" if not pd.isna(avg_year) else "Unknown",
        )

    with authors_col:
        st.metric("Unique Authors", gallery_books['Book-Author'].nunique())

    with publishers_col:
        st.metric("Unique Publishers", gallery_books['Publisher'].nunique())


# Script entry point: build the dashboard only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()