Spaces:
Sleeping
Sleeping
import streamlit as st | |
import pickle | |
import polars as pl | |
import re | |
import requests | |
from io import BytesIO | |
from sklearn.feature_extraction.text import TfidfVectorizer | |
from sklearn.neighbors import NearestNeighbors | |
import matplotlib.pyplot as plt | |
import seaborn as sns | |
import os | |
import time | |
# Set page configuration.
# NOTE(review): the icon was previously the single character "π", which looks
# like a mis-decoded emoji rather than an intentional choice; it is restored
# here as the books emoji (U+1F4DA) - confirm this is the intended icon.
# A Unicode escape is used so the literal survives any future re-encoding.
st.set_page_config(
    page_title="Book Recommendation System",
    page_icon="\U0001F4DA",
    layout="wide",
)
# Remote locations of the dataset and of the two pickled model artifacts.
GITHUB_CSV_URL = "https://media.githubusercontent.com/media/Manithj/bookRecEngine/refs/heads/main/goodreadsV2.csv"
GITHUB_KNN_URL = "https://media.githubusercontent.com/media/Manithj/bookRecEngine/refs/heads/main/knn_model.pkl"
GITHUB_TFIDF_URL = "https://raw.githubusercontent.com/Manithj/bookRecEngine/main/tfidf_vectorizer.pkl"

# Where the downloaded copies are kept on local disk: model pickles go under
# MODEL_DIR, the CSV goes under DATA_DIR.
MODEL_DIR, DATA_DIR = "models", "data"
KNN_PATH = os.path.join(MODEL_DIR, "knn_model.pkl")
TFIDF_PATH = os.path.join(MODEL_DIR, "tfidf_vectorizer.pkl")
CSV_PATH = os.path.join(DATA_DIR, "goodreadsV2.csv")

# Both directories must exist before anything is written into them;
# exist_ok makes this a no-op when they are already there.
for _directory in (MODEL_DIR, DATA_DIR):
    os.makedirs(_directory, exist_ok=True)
# Text normalisation applied to both the dataset features and user queries.
# Matches every character that is NOT an ASCII letter, a digit or whitespace.
_DISALLOWED_CHARS = re.compile(r'[^a-zA-Z0-9\s]')


def preprocess_text(text):
    """Return ``text`` lowercased with all disallowed characters removed.

    Only ASCII letters, digits and whitespace survive; punctuation, symbols
    and non-ASCII letters are dropped (not replaced by a space).
    """
    lowered = text.lower()
    return _DISALLOWED_CHARS.sub('', lowered)
# Download and save files if they don't exist locally
def download_and_save_file(url, save_path, is_binary=True, timeout=60):
    """Make sure ``save_path`` exists locally, downloading it from ``url`` if not.

    Progress and outcome are reported through Streamlit (spinner, success and
    error messages).

    Args:
        url: HTTP(S) location fetched when the file is missing.
        save_path: Destination path on local disk.
        is_binary: When true the raw response bytes are written; otherwise
            the decoded response text is written as UTF-8.
        timeout: Seconds handed to ``requests.get`` so a stalled server
            cannot block the app indefinitely.

    Returns:
        True if the file was already present or was downloaded successfully;
        False if the request failed or returned a non-200 status code.

    Raises:
        OSError: If the file cannot be written to disk.
    """
    if os.path.exists(save_path):
        return True

    file_name = os.path.basename(save_path)
    # Download into a sibling temp file and rename it only once it is
    # complete, so an interrupted write never leaves a truncated file that
    # the os.path.exists() check above would accept on the next run.
    partial_path = f"{save_path}.part"

    with st.spinner(f"Downloading {file_name}..."):
        try:
            response = requests.get(url, timeout=timeout)
        except requests.RequestException as exc:
            st.error(f"Failed to download from {url}: {exc}")
            return False

        if response.status_code != 200:
            st.error(f"Failed to download from {url}, status code: {response.status_code}")
            return False

        try:
            if is_binary:
                with open(partial_path, "wb") as f:
                    f.write(response.content)
            else:
                # response.content is bytes and cannot be written to a
                # text-mode file; use the decoded text instead.
                with open(partial_path, "w", encoding="utf-8", newline="") as f:
                    f.write(response.text)
            os.replace(partial_path, save_path)
        finally:
            # Only still present if writing or renaming failed.
            if os.path.exists(partial_path):
                os.remove(partial_path)

        st.success(f"Downloaded {file_name}")
    return True
# Load models from local storage or download if needed
@st.cache_resource(show_spinner=False)
def _unpickle_models(tfidf_path, knn_path):
    """Unpickle the TF-IDF vectorizer and the KNN model from local files.

    Cached with ``st.cache_resource`` so the models are deserialized once
    instead of on every Streamlit rerun. Exceptions propagate to the caller
    and are not cached, so a failed attempt is retried on the next run.

    NOTE(security): ``pickle.load`` can execute arbitrary code embedded in
    the file. These files are downloaded from GITHUB_TFIDF_URL and
    GITHUB_KNN_URL, so those URLs must only ever point at a trusted source.
    """
    with open(tfidf_path, 'rb') as f:
        tfidf = pickle.load(f)
    with open(knn_path, 'rb') as f:
        knn_model = pickle.load(f)
    return tfidf, knn_model


def load_models():
    """Return ``(tfidf, knn_model)``, downloading the pickles first if needed.

    Returns:
        The unpickled TF-IDF vectorizer and KNN model, or ``(None, None)``
        if a download failed or the files could not be loaded. Load errors
        are reported to the user with ``st.error``.
    """
    try:
        # Download models if they don't exist locally. This stays outside the
        # cached helper so a failed download is never remembered.
        tfidf_downloaded = download_and_save_file(GITHUB_TFIDF_URL, TFIDF_PATH)
        knn_downloaded = download_and_save_file(GITHUB_KNN_URL, KNN_PATH)
        if not (tfidf_downloaded and knn_downloaded):
            return None, None
        return _unpickle_models(TFIDF_PATH, KNN_PATH)
    except Exception as e:
        st.error(f"Error loading models: {e}")
        return None, None
# Load the dataset from local storage or download if needed
@st.cache_data(show_spinner=False)
def _read_and_prepare_dataset(csv_path):
    """Read the books CSV and derive the text feature columns.

    Cached with ``st.cache_data`` so the CSV parse and the per-row Python
    preprocessing run once instead of on every Streamlit rerun. Exceptions
    propagate to the caller and are not cached.

    Rows with a null ``name``, ``summary`` or ``genres`` are dropped. Two
    columns are added: ``combined_features`` (summary + space + genres) and
    ``processed_features`` (``preprocess_text`` applied to the former).

    NOTE(review): row positions in the returned frame are later used as KNN
    neighbour indices; presumably the model was fitted on a frame filtered
    the same way - confirm against the training code.
    """
    books = pl.read_csv(csv_path)
    books = books.drop_nulls(subset=['name', 'summary', 'genres'])
    books = books.with_columns([
        (pl.col('summary') + ' ' + pl.col('genres')).alias('combined_features')
    ])
    return books.with_columns([
        pl.col('combined_features')
        .map_elements(preprocess_text, return_dtype=pl.Utf8)
        .alias('processed_features')
    ])


def load_data():
    """Return the prepared books DataFrame, downloading the CSV if needed.

    Returns:
        A polars DataFrame, or ``None`` if the download failed or the CSV
        could not be read. Read errors are reported with ``st.error``.
    """
    try:
        # Download dataset if it doesn't exist locally. This stays outside
        # the cached helper so a failed download is never remembered.
        csv_downloaded = download_and_save_file(GITHUB_CSV_URL, CSV_PATH, is_binary=True)
        if not csv_downloaded:
            return None
        return _read_and_prepare_dataset(CSV_PATH)
    except Exception as e:
        st.error(f"Error loading dataset: {e}")
        return None
# Load models and data at startup.
with st.spinner("Loading models and data (this will only happen once)..."):
    tfidf, knn_model = load_models()
    df_cleaned = load_data()

# Recommendations need all three artifacts; each loader returns None on failure.
models_loaded = (
    tfidf is not None and knn_model is not None and df_cleaned is not None
)

# App title and description.
# NOTE(review): the title previously began with a stray "π", which looks like
# a mis-decoded emoji; restored as the books emoji (U+1F4DA) - confirm.
st.title("\U0001F4DA Book Recommendation System")
st.markdown("Enter a book summary and genres to get personalized book recommendations!")

if models_loaded:
    st.success("Models and data loaded successfully!")
else:
    st.error("Failed to load models or data. Please check the file paths and URLs.")
# Recommendation function for out-of-dataset books
def recommend_books_knn_out_of_dataset(input_summary, input_genres, top_n=5):
    """Recommend the ``top_n`` dataset books nearest to a free-text description.

    The summary and genres are joined with a space, normalised with
    ``preprocess_text``, vectorised with the module-level ``tfidf`` and looked
    up in the module-level ``knn_model``. Neighbour indices are used as row
    positions into the module-level ``df_cleaned``.

    Args:
        input_summary: Free-text summary of the book to match.
        input_genres: Free-text genres of the book to match.
        top_n: Number of neighbours to request.

    Returns:
        A list of dicts with keys ``title``, ``summary``, ``genres`` and
        ``similarity_score``, in the order the KNN model returned them.
        ``similarity_score`` is ``1 - distance``.
        NOTE(review): that is only a similarity in [0, 1] if the model uses a
        metric such as cosine distance - confirm against the fitted model.
    """
    query_text = preprocess_text(f"{input_summary} {input_genres}")
    query_vector = tfidf.transform([query_text])
    distances, indices = knn_model.kneighbors(query_vector, n_neighbors=top_n)

    titles = df_cleaned['name']
    summaries = df_cleaned['summary']
    genre_column = df_cleaned['genres']
    flat_distances = distances.flatten()

    return [
        {
            "title": titles[row],
            "summary": summaries[row],
            "genres": genre_column[row],
            "similarity_score": 1 - flat_distances[rank],  # Convert distance to similarity
        }
        for rank, row in enumerate(indices.flatten())
    ]
# Sidebar for inputs
st.sidebar.header("Input Parameters")

# Session-state keys of the two text inputs. The example buttons further down
# write to these keys so that a click fills the real inputs, rather than
# rendering throw-away duplicate widgets the recommender never reads.
_SUMMARY_KEY = "input_summary"
_GENRES_KEY = "input_genres"

# Canned inputs for quick testing: (button label, summary, genres).
_EXAMPLES = (
    (
        "Example 1: Fantasy Adventure",
        "A young wizard discovers his magical powers and embarks on a journey to defeat a dark lord threatening the world.",
        "fantasy, adventure, magic",
    ),
    (
        "Example 2: Mystery Thriller",
        "A detective investigates a series of murders that seem to be connected to an unsolved case from decades ago.",
        "mystery, thriller, crime",
    ),
)


def _apply_example(summary, genres):
    """Button callback: copy an example's text into the two input widgets.

    Streamlit runs callbacks before the script reruns, so the widgets pick
    these values up when they are created on that rerun.
    """
    st.session_state[_SUMMARY_KEY] = summary
    st.session_state[_GENRES_KEY] = genres


# Input fields
input_summary = st.sidebar.text_area(
    "Book Summary",
    placeholder="Enter a brief summary of the book...",
    height=150,
    key=_SUMMARY_KEY,
)
input_genres = st.sidebar.text_input(
    "Genres",
    placeholder="E.g., fantasy, adventure, mystery",
    key=_GENRES_KEY,
)

# Number of recommendations slider
num_recommendations = st.sidebar.slider(
    "Number of Recommendations",
    min_value=1,
    max_value=10,
    value=5,
)

# Get recommendations button
if st.sidebar.button("Get Recommendations") and models_loaded:
    # strip() so whitespace-only input counts as missing.
    if input_summary.strip() and input_genres.strip():
        with st.spinner("Finding the perfect books for you..."):
            recommendations = recommend_books_knn_out_of_dataset(
                input_summary,
                input_genres,
                top_n=num_recommendations,
            )

        # Display recommendations as cards laid out over at most three columns.
        st.header("Recommended Books")
        cols = st.columns(min(3, num_recommendations))
        for i, book in enumerate(recommendations):
            # Wrap by the real column count so the index is always valid.
            with cols[i % len(cols)]:
                st.subheader(book["title"])
                st.markdown(f"**Genres:** {book['genres']}")
                st.markdown(f"**Similarity Score:** {book['similarity_score']:.2f}")
                with st.expander("Summary"):
                    st.write(book["summary"])
                st.divider()

        # Visualization of similarity scores
        st.header("Similarity Scores")
        fig, ax = plt.subplots(figsize=(10, 5))
        book_titles = [book["title"] for book in recommendations]
        similarity_scores = [book["similarity_score"] for book in recommendations]
        # Create horizontal bar chart
        sns.barplot(x=similarity_scores, y=book_titles, palette="viridis", ax=ax)
        ax.set_xlabel("Similarity Score")
        ax.set_ylabel("Book Title")
        ax.set_title("Book Recommendation Similarity Scores")
        st.pyplot(fig)
        # pyplot keeps every figure alive until it is closed; without this
        # each click would leak one figure for the life of the process.
        plt.close(fig)
    else:
        st.warning("Please enter both a summary and genres to get recommendations.")

# Add some information about the app
st.sidebar.markdown("---")
st.sidebar.header("About")
st.sidebar.info(
    """
    This app uses TF-IDF vectorization and K-Nearest Neighbors to recommend books
    based on your input summary and genres.
    The recommendations are based on textual similarity between your input and
    our database of books from Goodreads.
    Models and data are stored locally on the server after initial download.
    """
)

# Add example inputs for quick testing: each button fills the inputs above.
st.sidebar.markdown("---")
st.sidebar.header("Try these examples")
for _label, _summary, _genres in _EXAMPLES:
    st.sidebar.button(_label, on_click=_apply_example, args=(_summary, _genres))

# Add a footer.
# NOTE(review): the footer previously began with a stray "π", which looks like
# a mis-decoded emoji; restored as the books emoji (U+1F4DA) - confirm.
st.markdown("---")
st.markdown("\U0001F4DA Book Recommendation System | Created with Streamlit")