import streamlit as st
import pickle
import polars as pl
import re
import requests
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors
import matplotlib.pyplot as plt
import seaborn as sns
import os
import time

# Set page configuration
st.set_page_config(
    page_title="Book Recommendation System",
    page_icon="πŸ“š",
    layout="wide"
)

# GitHub URLs for model files and dataset
GITHUB_CSV_URL = "https://media.githubusercontent.com/media/Manithj/bookRecEngine/refs/heads/main/goodreadsV2.csv"
GITHUB_KNN_URL = "https://media.githubusercontent.com/media/Manithj/bookRecEngine/refs/heads/main/knn_model.pkl"
GITHUB_TFIDF_URL = "https://raw.githubusercontent.com/Manithj/bookRecEngine/main/tfidf_vectorizer.pkl"

# Local file paths for saved models and dataset
MODEL_DIR = "models"
DATA_DIR = "data"
KNN_PATH = os.path.join(MODEL_DIR, "knn_model.pkl")
TFIDF_PATH = os.path.join(MODEL_DIR, "tfidf_vectorizer.pkl")
CSV_PATH = os.path.join(DATA_DIR, "goodreadsV2.csv")

# Create directories if they don't exist
os.makedirs(MODEL_DIR, exist_ok=True)
os.makedirs(DATA_DIR, exist_ok=True)

# Define the preprocessing function
def preprocess_text(text):
    return re.sub(r'[^a-zA-Z0-9\s]', '', text.lower())
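# Example (illustrative): preprocess_text("Mystery, Thriller!") -> "mystery thriller"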

# Download and save files if they don't exist locally
def download_and_save_file(url, save_path, is_binary=True):
    if not os.path.exists(save_path):
        with st.spinner(f"Downloading {os.path.basename(save_path)}..."):
            response = requests.get(url)
            if response.status_code == 200:
                if is_binary:
                    with open(save_path, "wb") as f:
                        f.write(response.content)
                else:
                    with open(save_path, "w", encoding="utf-8") as f:
                        f.write(response.text)
                st.success(f"Downloaded {os.path.basename(save_path)}")
                # Brief pause so the success message is visible before the next step runs
                time.sleep(1)
            else:
                st.error(f"Failed to download from {url}, status code: {response.status_code}")
                return False
    return True

# Load models from local storage or download if needed
@st.cache_resource
def load_models():
    try:
        # Download models if they don't exist locally
        tfidf_downloaded = download_and_save_file(GITHUB_TFIDF_URL, TFIDF_PATH)
        knn_downloaded = download_and_save_file(GITHUB_KNN_URL, KNN_PATH)
        
        if not (tfidf_downloaded and knn_downloaded):
            return None, None
        
        # Load models from local storage
        with open(TFIDF_PATH, 'rb') as f:
            tfidf = pickle.load(f)
        
        with open(KNN_PATH, 'rb') as f:
            knn_model = pickle.load(f)
        
        return tfidf, knn_model
    except Exception as e:
        st.error(f"Error loading models: {e}")
        return None, None
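
# The two pickles above are produced offline. The sketch below shows how they were
# presumably trained, an assumption inferred from how they are used in this app
# (TF-IDF over the processed features, cosine-distance nearest neighbors), not code
# taken from the original repository. The function is defined for reference and is
# never called here.
def _train_and_save_models_sketch(df):
    """Illustrative only: fit and pickle the TF-IDF vectorizer and KNN index."""
    # Assumes df carries the 'processed_features' column built in load_data()
    tfidf_vec = TfidfVectorizer(stop_words="english")  # parameters are an assumption
    feature_matrix = tfidf_vec.fit_transform(df["processed_features"].to_list())

    # Cosine metric assumed because the app treats (1 - distance) as a similarity score
    knn = NearestNeighbors(metric="cosine", algorithm="brute")
    knn.fit(feature_matrix)

    with open(TFIDF_PATH, "wb") as f:
        pickle.dump(tfidf_vec, f)
    with open(KNN_PATH, "wb") as f:
        pickle.dump(knn, f)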

# Load the dataset from local storage or download if needed
@st.cache_data
def load_data():
    try:
        # Download dataset if it doesn't exist locally
        csv_downloaded = download_and_save_file(GITHUB_CSV_URL, CSV_PATH, is_binary=True)
        
        if not csv_downloaded:
            return None
        
        # Load CSV from local storage
        df_cleaned = pl.read_csv(CSV_PATH)
        
        # Clean and prepare the data
        df_cleaned = df_cleaned.drop_nulls(subset=['name', 'summary', 'genres'])
        df_cleaned = df_cleaned.with_columns([
            (pl.col('summary') + ' ' + pl.col('genres')).alias('combined_features')
        ])
        
        # Apply preprocessing
        df_cleaned = df_cleaned.with_columns([
            pl.col('combined_features')
            .map_elements(preprocess_text, return_dtype=pl.Utf8)
            .alias('processed_features')
        ])
        
        return df_cleaned
    except Exception as e:
        st.error(f"Error loading dataset: {e}")
        return None

# Load models and data at startup - this happens only once due to caching
with st.spinner("Loading models and data (this will only happen once)..."):
    tfidf, knn_model = load_models()
    df_cleaned = load_data()
    
    if tfidf is not None and knn_model is not None and df_cleaned is not None:
        models_loaded = True
    else:
        models_loaded = False

# App title and description
st.title("πŸ“š Book Recommendation System")
st.markdown("Enter a book summary and genres to get personalized book recommendations!")

if not models_loaded:
    st.error("Failed to load models or data. Please check the file paths and URLs.")
else:
    st.success("Models and data loaded successfully!")

# Recommendation function for out-of-dataset books
def recommend_books_knn_out_of_dataset(input_summary, input_genres, top_n=5):
    # Combine and preprocess the input book's features
    combined_input = f"{input_summary} {input_genres}"
    processed_input = preprocess_text(combined_input)

    # Transform the input book's features using the loaded TF-IDF vectorizer
    input_vector = tfidf.transform([processed_input])

    # Find the nearest neighbors using the loaded KNN model
    distances, indices = knn_model.kneighbors(input_vector, n_neighbors=top_n)

    # Retrieve the recommended book titles and additional information
    recommendations = []
    for i, idx in enumerate(indices.flatten()):
        book_info = {
            "title": df_cleaned['name'][idx],
            "summary": df_cleaned['summary'][idx],
            "genres": df_cleaned['genres'][idx],
            "similarity_score": 1 - distances.flatten()[i]  # Convert distance to similarity
        }
        recommendations.append(book_info)

    return recommendations
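# Illustrative usage (assumes the models and dataset loaded successfully):
#   recs = recommend_books_knn_out_of_dataset("A detective hunts a killer", "mystery, crime", top_n=3)
#   Each entry is a dict with 'title', 'summary', 'genres', and 'similarity_score'.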

# Sidebar for inputs
st.sidebar.header("Input Parameters")

# Input fields
input_summary = st.sidebar.text_area("Book Summary", 
                                    placeholder="Enter a brief summary of the book...",
                                    height=150)

input_genres = st.sidebar.text_input("Genres", 
                                    placeholder="E.g., fantasy, adventure, mystery")

# Number of recommendations slider
num_recommendations = st.sidebar.slider("Number of Recommendations", 
                                        min_value=1, 
                                        max_value=10, 
                                        value=5)

# Get recommendations button
if st.sidebar.button("Get Recommendations") and models_loaded:
    if input_summary and input_genres:
        with st.spinner("Finding the perfect books for you..."):
            # Get recommendations
            recommendations = recommend_books_knn_out_of_dataset(
                input_summary, 
                input_genres, 
                top_n=num_recommendations
            )
            
            # Display recommendations
            st.header("Recommended Books")
            
            # Create columns for book cards
            cols = st.columns(min(3, num_recommendations))
            
            for i, book in enumerate(recommendations):
                col_idx = i % 3
                with cols[col_idx]:
                    st.subheader(book["title"])
                    st.markdown(f"**Genres:** {book['genres']}")
                    st.markdown(f"**Similarity Score:** {book['similarity_score']:.2f}")
                    with st.expander("Summary"):
                        st.write(book["summary"])
                    st.divider()
            
            # Visualization of similarity scores
            st.header("Similarity Scores")
            fig, ax = plt.subplots(figsize=(10, 5))
            
            book_titles = [book["title"] for book in recommendations]
            similarity_scores = [book["similarity_score"] for book in recommendations]
            
            # Create horizontal bar chart
            sns.barplot(x=similarity_scores, y=book_titles, palette="viridis", ax=ax)
            ax.set_xlabel("Similarity Score")
            ax.set_ylabel("Book Title")
            ax.set_title("Book Recommendation Similarity Scores")
            
            st.pyplot(fig)
            
    else:
        st.warning("Please enter both a summary and genres to get recommendations.")

# Add some information about the app
st.sidebar.markdown("---")
st.sidebar.header("About")
st.sidebar.info(
    """
    This app uses TF-IDF vectorization and K-Nearest Neighbors to recommend books 
    based on your input summary and genres.
    
    The recommendations are based on textual similarity between your input and 
    our database of books from Goodreads.
    
    Models and data are stored locally on the server after initial download.
    """
)

# Add example inputs for quick testing
st.sidebar.markdown("---")
st.sidebar.header("Try these examples")

if st.sidebar.button("Example 1: Fantasy Adventure"):
    st.sidebar.text_area("Book Summary", 
                        value="A young wizard discovers his magical powers and embarks on a journey to defeat a dark lord threatening the world.",
                        height=150, key="example1_summary")
    st.sidebar.text_input("Genres", value="fantasy, adventure, magic", key="example1_genres")

if st.sidebar.button("Example 2: Mystery Thriller"):
    st.sidebar.text_area("Book Summary", 
                        value="A detective investigates a series of murders that seem to be connected to an unsolved case from decades ago.",
                        height=150, key="example2_summary")
    st.sidebar.text_input("Genres", value="mystery, thriller, crime", key="example2_genres")

# Add a footer
st.markdown("---")
st.markdown("πŸ“š Book Recommendation System | Created with Streamlit")