Spaces:
Sleeping
Sleeping
| import streamlit as st | |
| import pickle | |
| import polars as pl | |
| import re | |
| import requests | |
| from io import BytesIO | |
| from sklearn.feature_extraction.text import TfidfVectorizer | |
| from sklearn.neighbors import NearestNeighbors | |
| import matplotlib.pyplot as plt | |
| import seaborn as sns | |
| import os | |
| import time | |
# Set page configuration
st.set_page_config(
    page_title="Book Recommendation System",
    # NOTE(review): the icon arrived as "π", which is what the first byte of a
    # 4-byte UTF-8 emoji looks like when decoded as Greek. Restored as the
    # books emoji to match the app's theme - confirm against the original.
    page_icon="📚",
    layout="wide",
)
# Remote locations of the dataset and the two pickled model artifacts
GITHUB_CSV_URL = "https://media.githubusercontent.com/media/Manithj/bookRecEngine/refs/heads/main/goodreadsV2.csv"
GITHUB_KNN_URL = "https://media.githubusercontent.com/media/Manithj/bookRecEngine/refs/heads/main/knn_model.pkl"
GITHUB_TFIDF_URL = "https://raw.githubusercontent.com/Manithj/bookRecEngine/main/tfidf_vectorizer.pkl"

# Where the downloaded copies are kept on local disk
MODEL_DIR = "models"
DATA_DIR = "data"
KNN_PATH = os.path.join(MODEL_DIR, "knn_model.pkl")
TFIDF_PATH = os.path.join(MODEL_DIR, "tfidf_vectorizer.pkl")
CSV_PATH = os.path.join(DATA_DIR, "goodreadsV2.csv")

# Make sure both target directories exist before anything is written to them
for _directory in (MODEL_DIR, DATA_DIR):
    os.makedirs(_directory, exist_ok=True)
# Text normalisation shared by the dataset preparation and the user query
_NON_ALPHANUMERIC = re.compile(r'[^a-zA-Z0-9\s]')


def preprocess_text(text):
    """Lower-case ``text`` and drop everything except ASCII letters, digits and whitespace."""
    lowered = text.lower()
    return _NON_ALPHANUMERIC.sub('', lowered)
# Download and save files if they don't exist locally
def download_and_save_file(url, save_path, is_binary=True, timeout=30):
    """Download ``url`` to ``save_path`` unless the file is already there.

    Args:
        url: Address to fetch.
        save_path: Destination path. An existing file is never re-downloaded.
        is_binary: Kept for backward compatibility. The payload is always
            written byte-for-byte; the former text mode tried to write
            ``bytes`` to a text handle and raised ``TypeError``.
        timeout: Seconds ``requests`` waits for the connection and for each
            read before giving up, so a stalled server cannot hang the app.

    Returns:
        True when the file is available locally, False when the download
        failed (an error message is shown in the UI in that case).
    """
    if os.path.exists(save_path):
        return True

    file_name = os.path.basename(save_path)
    # Stream into a sibling ".part" file and rename it at the very end, so an
    # interrupted download never leaves a truncated file that later runs
    # would mistake for a complete one.
    partial_path = f"{save_path}.part"
    with st.spinner(f"Downloading {file_name}..."):
        try:
            with requests.get(url, stream=True, timeout=timeout) as response:
                if response.status_code != 200:
                    st.error(f"Failed to download from {url}, status code: {response.status_code}")
                    return False
                with open(partial_path, "wb") as f:
                    # Chunked copy keeps memory flat for large model files.
                    for chunk in response.iter_content(chunk_size=1024 * 1024):
                        f.write(chunk)
            os.replace(partial_path, save_path)
        except (requests.RequestException, OSError) as exc:
            if os.path.exists(partial_path):
                os.remove(partial_path)
            st.error(f"Failed to download from {url}: {exc}")
            return False
        st.success(f"Downloaded {file_name}")
    return True
# Load models from local storage or download if needed
@st.cache_resource(show_spinner=False)
def _load_pickle_cached(path):
    """Unpickle ``path`` once and hand the same object back on later reruns.

    Streamlit re-executes the whole script on every interaction; without this
    cache both model files would be deserialised again each time.
    """
    # SECURITY: pickle.load can execute arbitrary code contained in the file.
    # Only point this at artifacts from a source you control and trust.
    with open(path, 'rb') as f:
        return pickle.load(f)


def load_models():
    """Return ``(tfidf, knn_model)``, downloading the pickles first if needed.

    Returns:
        The unpickled TF-IDF vectorizer and KNN model, or ``(None, None)``
        when a download fails or an error occurs while loading (the error is
        reported in the UI). Failures are not cached, so a later rerun tries
        again.
    """
    try:
        # Both downloads are attempted even if the first one fails, so the
        # user sees every download problem in a single run.
        tfidf_downloaded = download_and_save_file(GITHUB_TFIDF_URL, TFIDF_PATH)
        knn_downloaded = download_and_save_file(GITHUB_KNN_URL, KNN_PATH)
        if not (tfidf_downloaded and knn_downloaded):
            return None, None
        return _load_pickle_cached(TFIDF_PATH), _load_pickle_cached(KNN_PATH)
    except Exception as e:
        st.error(f"Error loading models: {e}")
        return None, None
# Load the dataset from local storage or download if needed
@st.cache_resource(show_spinner=False)
def _read_prepared_dataset(csv_path):
    """Read the CSV at ``csv_path`` and add the derived text columns, once.

    Cached as a shared resource so reruns reuse one frame instead of
    re-reading the file and re-running the row-wise preprocessing. Callers
    must treat the returned frame as read-only.
    """
    df = pl.read_csv(csv_path)
    # Rows missing any of the fields used for display or features are dropped.
    df = df.drop_nulls(subset=['name', 'summary', 'genres'])
    df = df.with_columns([
        (pl.col('summary') + ' ' + pl.col('genres')).alias('combined_features')
    ])
    # Same normalisation that is applied to the user's query text.
    df = df.with_columns([
        pl.col('combined_features')
        .map_elements(preprocess_text, return_dtype=pl.Utf8)
        .alias('processed_features')
    ])
    return df


def load_data():
    """Return the prepared book dataset, downloading the CSV first if needed.

    Returns:
        A polars DataFrame with the original columns plus
        ``combined_features`` and ``processed_features``, or ``None`` when
        the download or the loading fails (the error is reported in the UI).
        Failures are not cached, so a later rerun tries again.
    """
    try:
        if not download_and_save_file(GITHUB_CSV_URL, CSV_PATH, is_binary=True):
            return None
        return _read_prepared_dataset(CSV_PATH)
    except Exception as e:
        st.error(f"Error loading dataset: {e}")
        return None
# Load the models and the dataset when the script starts
with st.spinner("Loading models and data (this will only happen once)..."):
    tfidf, knn_model = load_models()
    df_cleaned = load_data()

# The app is usable only if none of the three artifacts is missing.
models_loaded = not (tfidf is None or knn_model is None or df_cleaned is None)
# App title and description
# NOTE(review): the title icon arrived as "π" (a mis-decoded 4-byte emoji);
# restored as the books emoji to match the app's theme - confirm.
st.title("📚 Book Recommendation System")
st.markdown("Enter a book summary and genres to get personalized book recommendations!")

# Tell the user up front whether recommendations can be served at all.
if models_loaded:
    st.success("Models and data loaded successfully!")
else:
    st.error("Failed to load models or data. Please check the file paths and URLs.")
# Recommendation function for out-of-dataset books
def recommend_books_knn_out_of_dataset(input_summary, input_genres, top_n=5):
    """Recommend the ``top_n`` dataset books closest to a free-text description.

    The summary and genres are joined, normalised with ``preprocess_text``,
    vectorised with the loaded TF-IDF vectorizer and matched against the
    loaded KNN model.

    Returns:
        A list of dicts with the keys ``title``, ``summary``, ``genres`` and
        ``similarity_score``, in the order the KNN model returned the
        neighbours. ``similarity_score`` is ``1 - distance``; presumably the
        model uses cosine distance - confirm against the training code.
    """
    query_text = preprocess_text(f"{input_summary} {input_genres}")
    query_vector = tfidf.transform([query_text])

    # kneighbors returns 2-D arrays (one row per query); flatten the single row.
    distances, indices = knn_model.kneighbors(query_vector, n_neighbors=top_n)

    return [
        {
            "title": df_cleaned['name'][row],
            "summary": df_cleaned['summary'][row],
            "genres": df_cleaned['genres'][row],
            "similarity_score": 1 - distance,  # Convert distance to similarity
        }
        for row, distance in zip(indices.flatten(), distances.flatten())
    ]
# Session-state keys that tie the example buttons to the main input widgets
SUMMARY_INPUT_KEY = "input_summary_text"
GENRES_INPUT_KEY = "input_genres_text"


def _load_example(summary, genres):
    """Button callback: copy an example into the main summary/genres inputs.

    Callbacks run before the script reruns, so the keyed widgets below pick
    the new values up when they are rendered.
    """
    st.session_state[SUMMARY_INPUT_KEY] = summary
    st.session_state[GENRES_INPUT_KEY] = genres


# Sidebar for inputs
st.sidebar.header("Input Parameters")

# Input fields (keyed so the example buttons can fill them in)
input_summary = st.sidebar.text_area(
    "Book Summary",
    placeholder="Enter a brief summary of the book...",
    height=150,
    key=SUMMARY_INPUT_KEY,
)
input_genres = st.sidebar.text_input(
    "Genres",
    placeholder="E.g., fantasy, adventure, mystery",
    key=GENRES_INPUT_KEY,
)

# Number of recommendations slider
num_recommendations = st.sidebar.slider(
    "Number of Recommendations",
    min_value=1,
    max_value=10,
    value=5,
)

# Get recommendations button
if st.sidebar.button("Get Recommendations") and models_loaded:
    if input_summary and input_genres:
        with st.spinner("Finding the perfect books for you..."):
            recommendations = recommend_books_knn_out_of_dataset(
                input_summary,
                input_genres,
                top_n=num_recommendations,
            )

        # Display recommendations as cards laid out in up to three columns
        st.header("Recommended Books")
        cols = st.columns(min(3, num_recommendations))
        for i, book in enumerate(recommendations):
            with cols[i % len(cols)]:
                st.subheader(book["title"])
                st.markdown(f"**Genres:** {book['genres']}")
                st.markdown(f"**Similarity Score:** {book['similarity_score']:.2f}")
                with st.expander("Summary"):
                    st.write(book["summary"])
                st.divider()

        # Visualization of similarity scores
        st.header("Similarity Scores")
        book_titles = [book["title"] for book in recommendations]
        similarity_scores = [book["similarity_score"] for book in recommendations]
        fig, ax = plt.subplots(figsize=(10, 5))
        try:
            # Horizontal bar chart: one bar per recommended title
            sns.barplot(x=similarity_scores, y=book_titles, palette="viridis", ax=ax)
            ax.set_xlabel("Similarity Score")
            ax.set_ylabel("Book Title")
            ax.set_title("Book Recommendation Similarity Scores")
            st.pyplot(fig)
        finally:
            # pyplot keeps every figure alive until it is closed; without this
            # each request would leak one figure in the server process.
            plt.close(fig)
    else:
        st.warning("Please enter both a summary and genres to get recommendations.")

# Add some information about the app
st.sidebar.markdown("---")
st.sidebar.header("About")
st.sidebar.info(
    """
    This app uses TF-IDF vectorization and K-Nearest Neighbors to recommend books
    based on your input summary and genres.
    The recommendations are based on textual similarity between your input and
    our database of books from Goodreads.
    Models and data are stored locally on the server after initial download.
    """
)

# Add example inputs for quick testing. Each button fills the main input
# fields; the user then presses "Get Recommendations" as usual.
st.sidebar.markdown("---")
st.sidebar.header("Try these examples")
st.sidebar.button(
    "Example 1: Fantasy Adventure",
    on_click=_load_example,
    args=(
        "A young wizard discovers his magical powers and embarks on a journey to defeat a dark lord threatening the world.",
        "fantasy, adventure, magic",
    ),
)
st.sidebar.button(
    "Example 2: Mystery Thriller",
    on_click=_load_example,
    args=(
        "A detective investigates a series of murders that seem to be connected to an unsolved case from decades ago.",
        "mystery, thriller, crime",
    ),
)
# Add a footer
# NOTE(review): the footer icon arrived as "π" (a mis-decoded 4-byte emoji);
# restored as the books emoji to match the app's theme - confirm.
st.markdown("---")
st.markdown("📚 Book Recommendation System | Created with Streamlit")