from flask import Flask, render_template, request, jsonify, current_app
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics.pairwise import cosine_similarity
import os
import logging
# --- Logging Configuration ---
# Ensure logging is configured before any loggers are potentially used by imported modules
logging.basicConfig(level=logging.INFO, format='%(asctime)s %(levelname)s: %(message)s [in %(pathname)s:%(lineno)d]')
logger = logging.getLogger(__name__)
app = Flask(__name__)
# --- Global Variables ---
DF = None
ALL_TOPPINGS = []
FEATURE_DF = None
SCALER = None # Will be initialized in preprocess_data
NUMERICAL_COLS = ['Price', 'Slices', 'Rating', 'Spice_Level', 'Preparation_Time', 'Calories']
CATEGORICAL_FEATURES = [
'Serving_Size', 'Popular_Group', 'Dietary_Category',
'Sauce_Type', 'Cheese_Amount', 'Restaurant_Chain',
'Seasonal_Availability', 'Bread_Type'
]
CRUST_TYPE_COL = None
DEFAULT_IMAGE_URL = 'https://images.dominos.co.in/new_margherita_2502.jpg'
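# Expected pizza.csv schema, inferred from the column usage below (the actual
# dataset may contain more columns; the sample values are purely illustrative):
#   Text:      Pizza_Name, Toppings (one ';'-separated string per row), Description,
#              Allergens, Image_Url
#   Numeric:   Price_Rs, Slices, Rating, Rating_Count, Preparation_Time_min,
#              Calories_per_Slice
#   Category:  Serving_Size, Popular_Group, Dietary_Category, Spice_Level
#              (Mild/Medium/Hot), Sauce_Type, Cheese_Amount, Restaurant_Chain,
#              Seasonal_Availability, Bread_Type, and Crust_Type or Cr_Type
# e.g. a hypothetical row: "Margherita", "Tomato; Basil; Mozzarella", ..., 299, 6, 4.3, ...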
def preprocess_data(df_path='pizza.csv'):
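    """Load the pizza dataset into the global DF, build FEATURE_DF (scaled
    numerical features, one-hot categorical features, and per-topping indicator
    columns), and fit the global MinMaxScaler. Intended to run once at module load."""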
global DF, ALL_TOPPINGS, FEATURE_DF, SCALER, CATEGORICAL_FEATURES, CRUST_TYPE_COL
logger.info(f"Attempting to preprocess data from relative path: {df_path}")
# Construct absolute path for the CSV file
# This is crucial for environments like Docker where working directory might differ
base_dir = os.path.dirname(os.path.abspath(__file__)) # Directory of the current script (app.py)
absolute_df_path = os.path.join(base_dir, df_path)
logger.info(f"Absolute path for CSV: {absolute_df_path}")
if not os.path.exists(absolute_df_path):
logger.error(f"Dataset file '{absolute_df_path}' not found.")
raise FileNotFoundError(f"Dataset file '{absolute_df_path}' not found. Ensure it's in the same directory as app.py.")
DF = pd.read_csv(absolute_df_path)
logger.info(f"Successfully loaded '{absolute_df_path}'. Original DataFrame shape: {DF.shape}")
logger.info(f"Original DataFrame columns: {DF.columns.tolist()}")
# Determine Crust Type Column
potential_crust_cols = ['Crust_Type', 'Cr_Type']
valid_crust_cols = [col for col in potential_crust_cols if col in DF.columns]
if valid_crust_cols:
valid_crust_cols.sort(key=lambda col: DF[col].isnull().sum()) # Prefer column with fewer NaNs
CRUST_TYPE_COL = valid_crust_cols[0]
logger.info(f"Using '{CRUST_TYPE_COL}' for crust type.")
if CRUST_TYPE_COL not in CATEGORICAL_FEATURES:
CATEGORICAL_FEATURES.append(CRUST_TYPE_COL)
# Remove other potential crust columns if they were in CATEGORICAL_FEATURES
for col in potential_crust_cols:
if col != CRUST_TYPE_COL and col in CATEGORICAL_FEATURES:
CATEGORICAL_FEATURES.remove(col)
else:
logger.warning("Crust type column (Crust_Type or Cr_Type) not found. Crust type will not be used.")
CRUST_TYPE_COL = None
# Fill NaN for text-based categorical columns and other text fields
text_cols_to_fill = list(set(CATEGORICAL_FEATURES + ['Toppings', 'Description', 'Allergens', 'Image_Url', 'Pizza_Name']))
for col in text_cols_to_fill:
if col and col in DF.columns: # Ensure col is not None (e.g. if CRUST_TYPE_COL is None)
DF[col] = DF[col].fillna('')
logger.info("Filled NaNs in text-based categorical columns with empty strings.")
# Fill NaN for numerical columns from the CSV
numerical_cols_in_df = ['Price_Rs', 'Slices', 'Rating', 'Rating_Count', 'Preparation_Time_min', 'Calories_per_Slice']
for col in numerical_cols_in_df:
if col in DF.columns:
if pd.api.types.is_numeric_dtype(DF[col]):
median_val = DF[col].median()
DF[col] = DF[col].fillna(median_val)
logger.info(f"Filled NaNs in numerical column '{col}' with its median ({median_val}).")
else:
# Attempt to convert to numeric, then fill with median or 0
numeric_series = pd.to_numeric(DF[col], errors='coerce')
median_val = 0
if not numeric_series.isnull().all():
median_val = numeric_series.median()
DF[col] = numeric_series.fillna(median_val)
logger.warning(f"Column '{col}' was not purely numeric. Converted to numeric, filled NaNs with median/0 ({median_val}).")
else:
logger.warning(f"Expected numerical column '{col}' not found in DataFrame. It will be missing from features if not handled.")
if 'Rating_Count' in DF.columns:
DF['Rating_Count'] = DF['Rating_Count'].fillna(0).astype(int)
# Process Toppings
if 'Toppings' in DF.columns:
        DF['Toppings_list_internal'] = DF['Toppings'].astype(str).str.split(r';\s*', regex=True)  # Split on ';' plus optional whitespace (regex=True requires pandas >= 1.4)
        DF['Toppings_list_internal'] = DF['Toppings_list_internal'].apply(
            lambda x: [t.strip() for t in x if isinstance(t, str) and t.strip()])  # Drop empty fragments left by the split
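        # e.g. a raw cell like "Onion; Capsicum; Paneer" (hypothetical values)
        # becomes ['Onion', 'Capsicum', 'Paneer'] after the split-and-strip above.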
current_all_toppings = set()
for toppings_list in DF['Toppings_list_internal'].dropna():
current_all_toppings.update(t for t in toppings_list if t) # Ensure t is not empty
ALL_TOPPINGS = sorted(list(current_all_toppings))
logger.info(f"Found {len(ALL_TOPPINGS)} unique toppings. Example: {ALL_TOPPINGS[:5] if ALL_TOPPINGS else 'None'}")
else:
logger.warning("'Toppings' column not found. Topping features will be empty.")
        DF['Toppings_list_internal'] = pd.Series([[] for _ in range(len(DF))], index=DF.index)  # Empty list for every row, aligned to DF's index
ALL_TOPPINGS = []
# --- Feature Engineering ---
feature_data = {}
num_feature_map = {
'Price': 'Price_Rs', 'Slices': 'Slices', 'Rating': 'Rating',
'Preparation_Time': 'Preparation_Time_min', 'Calories': 'Calories_per_Slice'
}
for feature_col, df_col in num_feature_map.items():
if df_col in DF.columns:
feature_data[feature_col] = DF[df_col].copy()
else:
logger.warning(f"Numerical source column '{df_col}' for feature '{feature_col}' not found. Filling with zeros.")
            feature_data[feature_col] = pd.Series(0.0, index=DF.index)  # Float zeros, aligned to DF's index
# Spice Level Feature (Numerical)
if 'Spice_Level' in DF.columns:
DF['Spice_Level'] = DF['Spice_Level'].fillna('Mild') # Default for NaNs
spice_map = {'Mild': 1, 'Medium': 2, 'Hot': 3}
feature_data['Spice_Level'] = DF['Spice_Level'].map(spice_map).fillna(1.0) # Ensure float
else:
logger.warning("'Spice_Level' column not found. Filling 'Spice_Level' feature with default (1.0).")
        feature_data['Spice_Level'] = pd.Series(1.0, index=DF.index)  # Default when the column is missing
# One-Hot Encode Categorical Features
for feature_cat_col in CATEGORICAL_FEATURES:
if feature_cat_col and feature_cat_col in DF.columns: # Check if col_name is not None and exists
# Ensure the column is treated as string to avoid issues with mixed types in unique()
DF[feature_cat_col] = DF[feature_cat_col].astype(str)
for value in DF[feature_cat_col].unique():
if pd.notnull(value) and value.strip() != '': # Check for non-null and non-empty string values
feature_data[f"{feature_cat_col}_{value}"] = (DF[feature_cat_col] == value).astype(int)
elif feature_cat_col: # Log warning only if feature_cat_col was defined
logger.warning(f"Categorical source column '{feature_cat_col}' for one-hot encoding not found in DataFrame.")
# Topping Features (One-Hot Encoded)
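    # Each topping becomes a 0/1 indicator column: a (hypothetical) topping 'Onion'
    # yields a 'Topping_Onion' column equal to 1 for every pizza that contains it.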
for topping in ALL_TOPPINGS:
if topping: # Ensure topping string is not empty
feature_data[f"Topping_{topping}"] = DF['Toppings_list_internal'].apply(
lambda x: 1 if topping in x else 0
)
FEATURE_DF = pd.DataFrame(feature_data)
logger.info(f"FEATURE_DF created. Shape: {FEATURE_DF.shape}. Columns: {FEATURE_DF.columns.tolist()[:10]}...") # Log first 10 cols
# Ensure all NUMERICAL_COLS exist in FEATURE_DF and fill NaNs
for col in NUMERICAL_COLS:
if col not in FEATURE_DF.columns:
logger.warning(f"Numerical column '{col}' is missing from FEATURE_DF after construction. Adding as zeros.")
FEATURE_DF[col] = 0.0 # Ensure float
if FEATURE_DF[col].isnull().any():
mean_val = FEATURE_DF[col].mean()
fill_val = mean_val if pd.notna(mean_val) else 0.0
logger.info(f"Filling NaNs in numerical feature column '{col}' with {fill_val}.")
FEATURE_DF[col] = FEATURE_DF[col].fillna(fill_val)
# Scale Numerical Features
SCALER = MinMaxScaler() # Initialize scaler
if not FEATURE_DF.empty and all(col in FEATURE_DF.columns for col in NUMERICAL_COLS):
try:
FEATURE_DF[NUMERICAL_COLS] = SCALER.fit_transform(FEATURE_DF[NUMERICAL_COLS])
logger.info(f"Numerical columns ({NUMERICAL_COLS}) scaled. FEATURE_DF shape: {FEATURE_DF.shape}")
except Exception as e:
logger.error(f"Error during scaling of numerical columns: {e}. FEATURE_DF might be problematic.")
# Fallback: Keep numerical columns unscaled if scaling fails, or handle as needed
elif FEATURE_DF.empty:
logger.error("FEATURE_DF is empty before scaling. Scaling skipped. This will likely cause issues.")
else:
missing_cols = [col for col in NUMERICAL_COLS if col not in FEATURE_DF.columns]
logger.error(f"Not all numerical columns ({NUMERICAL_COLS}) found in FEATURE_DF for scaling. Missing: {missing_cols}. Scaling skipped.")
logger.info(f"Preprocessing done. DF is None: {DF is None}, FEATURE_DF is None: {FEATURE_DF is None}, SCALER is None: {SCALER is None}")
if FEATURE_DF is not None:
logger.info(f"Final FEATURE_DF shape: {FEATURE_DF.shape}")
if DF is not None:
logger.info(f"Final DF shape: {DF.shape}")
@app.route('/')
def index_route():
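    """Render the landing page with the topping list, filter options, and
    default (top-rated) recommendations."""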
global DF, ALL_TOPPINGS, CATEGORICAL_FEATURES, CRUST_TYPE_COL, FEATURE_DF, DEFAULT_IMAGE_URL
# Critical check at the beginning of the route
if DF is None:
current_app.logger.error("DF is None when trying to serve '/'. Data preprocessing might have failed or not run.")
return "Error: Pizza data (DF) not loaded. Please check server logs.", 500
if FEATURE_DF is None: # Also check FEATURE_DF as it's derived
current_app.logger.error("FEATURE_DF is None when trying to serve '/'. Data preprocessing might have failed.")
return "Error: Pizza feature data (FEATURE_DF) not loaded. Please check server logs.", 500
filter_options = {}
# Ensure 'Spice_Level' is included for filter options if it exists in DF
cols_for_filters_set = set(cat_col for cat_col in CATEGORICAL_FEATURES if cat_col and cat_col in DF.columns) # Filter out None or non-existent
if 'Spice_Level' in DF.columns:
cols_for_filters_set.add('Spice_Level')
# CRUST_TYPE_COL is already in CATEGORICAL_FEATURES if found
    for col_name in sorted(cols_for_filters_set):  # Sort for a deterministic filter order
        # The key used by the frontend JS is the lowercase column name without underscores
        key_name = col_name.lower().replace('_', '')
        # 'spicelevel' and 'crusttype' need no special handling; the line above covers them too.
        unique_values = sorted([v for v in DF[col_name].astype(str).dropna().unique() if v.strip() != ''])
if unique_values: # Only add if there are actual values
filter_options[key_name] = unique_values
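    # filter_options is keyed by the lowercased, underscore-free column name,
    # e.g. (hypothetically) {'spicelevel': ['Hot', 'Medium', 'Mild'], 'breadtype': [...]}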
# Prepare default recommendations (e.g., top-rated)
# Make sure 'Rating' column exists
if 'Rating' in DF.columns:
default_recommendations_df = DF.sort_values('Rating', ascending=False).copy()
    else:
        current_app.logger.warning("'Rating' column not found in DF. Cannot sort for default recommendations; using unsorted DF.")
        default_recommendations_df = DF.copy()  # Fallback to unsorted order
default_recs_list = []
frontend_keys = [
'id', 'name', 'toppings', 'price', 'slices', 'serving_size', 'rating', 'rating_count',
'description', 'popular_group', 'dietary_category', 'spice_level', 'sauce_type',
'cheese_amount', 'calories', 'allergens', 'prep_time', 'restaurant', 'seasonal',
'bread_type', 'image_url', 'crust_type'
]
df_to_frontend_map = {
'id': None, 'name': 'Pizza_Name', 'toppings': 'Toppings', 'price': 'Price_Rs', 'slices': 'Slices',
'serving_size': 'Serving_Size', 'rating': 'Rating', 'rating_count': 'Rating_Count',
'description': 'Description', 'popular_group': 'Popular_Group',
'dietary_category': 'Dietary_Category', 'spice_level': 'Spice_Level',
'sauce_type': 'Sauce_Type', 'cheese_amount': 'Cheese_Amount',
'calories': 'Calories_per_Slice', 'allergens': 'Allergens',
'prep_time': 'Preparation_Time_min', 'restaurant': 'Restaurant_Chain',
'seasonal': 'Seasonal_Availability', 'bread_type': 'Bread_Type',
'image_url': 'Image_Url', 'crust_type': CRUST_TYPE_COL # Uses the determined CRUST_TYPE_COL
}
for original_idx, pizza_row in default_recommendations_df.iterrows():
rec_item = {}
for key in frontend_keys:
df_col = df_to_frontend_map.get(key)
if key == 'id':
rec_item[key] = int(original_idx) # Pizza ID is its original index in DF
elif df_col and df_col in pizza_row: # df_col can be None for 'id' or if CRUST_TYPE_COL is None
value = pizza_row[df_col]
# Type conversions for JSON serializability
if isinstance(value, np.integer): value = int(value)
elif isinstance(value, np.floating): value = float(value)
elif isinstance(value, np.ndarray): value = value.tolist()
rec_item[key] = "" if pd.isna(value) else value
elif key == 'crust_type' and not CRUST_TYPE_COL : # If CRUST_TYPE_COL was not found
rec_item[key] = "N/A"
else:
rec_item[key] = "" # Default for missing fields
rec_item['rating_count'] = int(rec_item.get('rating_count', 0) or 0) # Ensure int
rec_item['image_url'] = rec_item.get('image_url') if rec_item.get('image_url') else DEFAULT_IMAGE_URL
# Final pass to convert any remaining numpy generic types
for k_final, v_final in rec_item.items():
if isinstance(v_final, np.generic): rec_item[k_final] = v_final.item()
default_recs_list.append(rec_item)
current_app.logger.info(f"Serving {len(default_recs_list)} pizzas for initial display.")
current_app.logger.info(f"Filter options for template: {filter_options}")
current_app.logger.info(f"ALL_TOPPINGS for template: {ALL_TOPPINGS[:5] if ALL_TOPPINGS else 'None'}")
return render_template('index.html',
toppings=ALL_TOPPINGS,
filter_options=filter_options,
default_recommendations=default_recs_list,
default_image_url=DEFAULT_IMAGE_URL)
def get_recommendations(preferences):
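    """Apply hard filters from `preferences`, then rank the survivors by cosine
    similarity to a user-preference vector built in FEATURE_DF's feature space.

    `preferences` is the dict built in the /recommend route; a hypothetical example:
        {'toppings': ['Onion'], 'price_range': [100.0, 500.0], 'slices': 4,
         'rating': 4.0, 'prep_time': 20, 'spicelevel': ['Medium']}

    Returns a list of frontend-ready dicts, best match first.
    """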
global DF, FEATURE_DF, SCALER, CRUST_TYPE_COL, DEFAULT_IMAGE_URL
if DF is None or FEATURE_DF is None or SCALER is None:
current_app.logger.error("Data not fully initialized (DF, FEATURE_DF, or SCALER is None) for get_recommendations.")
return []
current_indices = DF.index.to_list()
current_app.logger.info(f"Starting with {len(current_indices)} pizzas before filtering. Preferences: {preferences}")
# --- Hard Filters ---
# 1. Toppings
if 'toppings' in preferences and preferences['toppings'] and 'Toppings_list_internal' in DF.columns:
selected_toppings = set(preferences['toppings'])
if selected_toppings: # Ensure not an empty list that would select nothing
topping_mask = DF.loc[current_indices, 'Toppings_list_internal'].apply(
lambda x_toppings: isinstance(x_toppings, list) and any(t in selected_toppings for t in x_toppings)
)
current_indices = DF.loc[current_indices][topping_mask].index.to_list()
current_app.logger.info(f"After toppings filter: {len(current_indices)} pizzas remaining")
if not current_indices: return []
# 2. Max Price
if 'price_range' in preferences and preferences['price_range'] and 'Price_Rs' in DF.columns:
try:
min_price = float(preferences['price_range'][0])
max_price = float(preferences['price_range'][1])
price_mask = (DF.loc[current_indices, 'Price_Rs'] >= min_price) & \
(DF.loc[current_indices, 'Price_Rs'] <= max_price)
current_indices = DF.loc[current_indices][price_mask].index.to_list()
current_app.logger.info(f"After price filter ({min_price}-{max_price}): {len(current_indices)} pizzas")
if not current_indices: return []
except (TypeError, ValueError, IndexError) as e:
current_app.logger.warning(f"Invalid price_range preference: {preferences['price_range']}. Error: {e}")
# 3. Number of Slices (Min Slices)
if 'slices' in preferences and preferences['slices'] is not None and 'Slices' in DF.columns:
try:
min_slices = int(preferences['slices'])
slices_mask = DF.loc[current_indices, 'Slices'] >= min_slices
current_indices = DF.loc[current_indices][slices_mask].index.to_list()
current_app.logger.info(f"After slices filter (>= {min_slices}): {len(current_indices)} pizzas")
if not current_indices: return []
except ValueError:
current_app.logger.warning(f"Invalid value for slices: {preferences['slices']}")
# 4. Minimum Rating
if 'rating' in preferences and preferences['rating'] is not None and 'Rating' in DF.columns:
try:
min_rating = float(preferences['rating'])
rating_mask = DF.loc[current_indices, 'Rating'] >= min_rating
current_indices = DF.loc[current_indices][rating_mask].index.to_list()
current_app.logger.info(f"After rating filter (>= {min_rating}): {len(current_indices)} pizzas")
if not current_indices: return []
except ValueError:
current_app.logger.warning(f"Invalid value for rating: {preferences['rating']}")
# 5. Max Preparation Time
if 'prep_time' in preferences and preferences['prep_time'] is not None and 'Preparation_Time_min' in DF.columns:
try:
max_prep_time = int(str(preferences['prep_time']).lower().replace("min", "").strip())
prep_mask = DF.loc[current_indices, 'Preparation_Time_min'] <= max_prep_time
current_indices = DF.loc[current_indices][prep_mask].index.to_list()
current_app.logger.info(f"After prep time filter (<= {max_prep_time}): {len(current_indices)} pizzas")
if not current_indices: return []
except ValueError:
current_app.logger.warning(f"Could not parse prep_time value: {preferences['prep_time']}")
# 6. Categorical Filters (Multi-select OR logic)
# JS keys: servingsize, populargroup, dietarycategory, spicelevel, saucetype, etc.
categorical_pref_map = {
"servingsize": "Serving_Size", "populargroup": "Popular_Group",
"dietarycategory": "Dietary_Category", "spicelevel": "Spice_Level",
"saucetype": "Sauce_Type", "cheeseamount": "Cheese_Amount",
"restaurantchain": "Restaurant_Chain", "seasonalavailability": "Seasonal_Availability",
"breadtype": "Bread_Type", "crusttype": CRUST_TYPE_COL
}
for pref_key, df_col_name in categorical_pref_map.items():
if df_col_name and pref_key in preferences and preferences[pref_key]: # Ensure df_col_name is not None
pref_value_list = preferences[pref_key] # Expected to be a list from JS
if isinstance(pref_value_list, list) and pref_value_list: # If list is not empty
if df_col_name in DF.columns:
cat_mask = DF.loc[current_indices, df_col_name].isin(pref_value_list)
current_indices = DF.loc[current_indices][cat_mask].index.to_list()
current_app.logger.info(f"After {pref_key} filter (isin {pref_value_list}): {len(current_indices)} pizzas")
if not current_indices: return []
else:
current_app.logger.warning(f"Column '{df_col_name}' for preference '{pref_key}' not found in DF. Filter skipped.")
# If pref_value_list is empty, it means "Any" for this category, so no filtering.
if not current_indices:
current_app.logger.info("No pizzas match all hard filter criteria.")
return []
# --- Similarity Scoring Part ---
# Filter FEATURE_DF to only include pizzas remaining after hard filters
valid_indices_for_feature_df = FEATURE_DF.index.intersection(current_indices)
if valid_indices_for_feature_df.empty:
current_app.logger.info("No valid indices remain for FEATURE_DF after hard filters.")
return []
filtered_feature_df = FEATURE_DF.loc[valid_indices_for_feature_df]
if filtered_feature_df.empty: # Should not happen if valid_indices_for_feature_df is not empty
current_app.logger.warning("Filtered FEATURE_DF is empty. This is unexpected.")
return []
# Create User Preference Vector (aligned with FEATURE_DF columns)
user_vector = pd.Series(0.0, index=FEATURE_DF.columns) # Initialize with 0.0 for float consistency
# 1. Toppings in User Vector
if 'toppings' in preferences and preferences['toppings']:
for topping in preferences['toppings']:
col_name = f"Topping_{topping}"
if col_name in user_vector.index:
user_vector[col_name] = 1.0
# 2. Categorical Preferences (One-Hot) in User Vector
    # Reuse categorical_pref_map: its DF column names double as the one-hot column prefixes.
for pref_key, df_col_prefix in categorical_pref_map.items():
if df_col_prefix and pref_key in preferences and preferences[pref_key]: # df_col_prefix can be None for CRUST_TYPE_COL
selected_values = preferences[pref_key] # This is a list
for val_item in selected_values:
# Construct the one-hot encoded column name (e.g., "Spice_Level_Mild")
one_hot_col_name = f"{df_col_prefix}_{val_item}"
if one_hot_col_name in user_vector.index:
user_vector[one_hot_col_name] = 1.0
# 3. Numerical Preferences in User Vector
raw_user_num_prefs_dict = {}
spice_map_for_num_pref = {'Mild': 1.0, 'Medium': 2.0, 'Hot': 3.0} # Use floats
    if 'price_range' in preferences and preferences['price_range']:
        try:  # Use the average of min/max price as the price preference
            raw_user_num_prefs_dict['Price'] = (float(preferences['price_range'][0]) + float(preferences['price_range'][1])) / 2
        except (TypeError, ValueError, IndexError):
            pass  # Ignore a malformed price_range
    if 'slices' in preferences and preferences['slices'] is not None:
        try: raw_user_num_prefs_dict['Slices'] = float(preferences['slices'])
        except (TypeError, ValueError): pass
    if 'rating' in preferences and preferences['rating'] is not None:
        try: raw_user_num_prefs_dict['Rating'] = float(preferences['rating'])
        except (TypeError, ValueError): pass
    if 'prep_time' in preferences and preferences['prep_time'] is not None:
        try: raw_user_num_prefs_dict['Preparation_Time'] = float(str(preferences['prep_time']).lower().replace("min", "").strip())
        except (TypeError, ValueError): pass
# Numerical Spice_Level: Only if *one* spice level is selected, use its mapped value.
# Otherwise, rely on the one-hot encoded spice level features.
if 'spicelevel' in preferences and isinstance(preferences['spicelevel'], list) and len(preferences['spicelevel']) == 1:
selected_spice = preferences['spicelevel'][0]
if selected_spice in spice_map_for_num_pref:
raw_user_num_prefs_dict['Spice_Level'] = spice_map_for_num_pref[selected_spice]
# Scale these raw numerical preferences using the SCALER
# Create a temporary DataFrame for scaling, ensuring all NUMERICAL_COLS are present
    temp_scaling_df = pd.DataFrame(0.0, index=[0], columns=NUMERICAL_COLS)  # Float dtype; every cell is set in the loop below
    for col in NUMERICAL_COLS:
        # If the user didn't specify this preference, default to the column's original
        # minimum (SCALER.data_min_), which MinMax-scales to 0.0, a neutral value in
        # the scaled feature space.
        default_val = 0.0
        if hasattr(SCALER, 'data_min_') and col in FEATURE_DF.columns:  # Scaler is fit and the column exists
            col_idx_in_scaler = NUMERICAL_COLS.index(col)  # col is drawn from NUMERICAL_COLS, so this cannot fail
            if col_idx_in_scaler < len(SCALER.data_min_):
                default_val = SCALER.data_min_[col_idx_in_scaler]  # Original (unscaled) minimum
            else:
                logger.warning(f"Column {col} not found in SCALER's fitted columns during user vector creation. Defaulting to 0.")
        temp_scaling_df.loc[0, col] = raw_user_num_prefs_dict.get(col, default_val)
if hasattr(SCALER, 'n_features_in_') : # Check if scaler has been fit
scaled_user_num_values = SCALER.transform(temp_scaling_df[NUMERICAL_COLS])[0]
for i, col_name in enumerate(NUMERICAL_COLS):
if col_name in raw_user_num_prefs_dict: # Only update user_vector if user specified this preference
user_vector[col_name] = scaled_user_num_values[i]
else:
logger.warning("SCALER is not fit. Cannot scale user's numerical preferences. Using raw values (0-1 range assumed).")
for col_name in NUMERICAL_COLS:
if col_name in raw_user_num_prefs_dict:
# Attempt a rough normalization if scaler is not fit, assuming values are in a reasonable range
# This is a fallback and might not be accurate.
user_vector[col_name] = raw_user_num_prefs_dict[col_name] / 100.0 # Example, needs domain knowledge
# Calculate Cosine Similarities
feature_matrix_filtered = filtered_feature_df.values
user_array = user_vector.values.reshape(1, -1)
# Ensure shapes match if FEATURE_DF columns changed dynamically (should not happen with current setup)
if user_array.shape[1] != feature_matrix_filtered.shape[1]:
current_app.logger.error(
f"Shape mismatch! User vector: {user_array.shape}, Feature matrix: {feature_matrix_filtered.shape}. "
f"User cols: {user_vector.index.tolist()[:5]}, Feature cols: {filtered_feature_df.columns.tolist()[:5]}"
)
# Attempt to align columns as a robust measure, though this indicates a deeper issue if it occurs.
common_cols = filtered_feature_df.columns.intersection(user_vector.index)
aligned_user_vector = pd.Series(0.0, index=filtered_feature_df.columns)
aligned_user_vector[common_cols] = user_vector[common_cols]
user_array = aligned_user_vector.values.reshape(1, -1)
if user_array.shape[1] != feature_matrix_filtered.shape[1]:
current_app.logger.critical(f"Persistent shape mismatch even after alignment. Cannot compute similarity.")
return []
similarities = cosine_similarity(user_array, feature_matrix_filtered)[0]
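    # cosine_similarity(u, v) = (u . v) / (||u|| * ||v||); it is magnitude-invariant,
    # so pizzas whose 0/1 feature flags overlap most with the user's vector rank highest.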
# Get indices sorted by similarity (descending) from the filtered_feature_df
sorted_indices_in_filtered_df = similarities.argsort()[::-1]
# Map these sorted indices back to original DF indices
final_recommendation_indices = valid_indices_for_feature_df[sorted_indices_in_filtered_df]
# Prepare list of recommendations
recommendations_list = []
    # frontend_keys_rec and df_to_frontend_map_rec mirror the definitions in index_route;
    # they are duplicated here deliberately and could be factored into a shared helper.
frontend_keys_rec = [
'id', 'name', 'toppings', 'price', 'slices', 'serving_size', 'rating', 'rating_count',
'description', 'popular_group', 'dietary_category', 'spice_level', 'sauce_type',
'cheese_amount', 'calories', 'allergens', 'prep_time', 'restaurant', 'seasonal',
'bread_type', 'image_url', 'crust_type'
]
df_to_frontend_map_rec = {
'id': None, 'name': 'Pizza_Name', 'toppings': 'Toppings', 'price': 'Price_Rs', 'slices': 'Slices',
'serving_size': 'Serving_Size', 'rating': 'Rating', 'rating_count': 'Rating_Count',
'description': 'Description', 'popular_group': 'Popular_Group',
'dietary_category': 'Dietary_Category', 'spice_level': 'Spice_Level',
'sauce_type': 'Sauce_Type', 'cheese_amount': 'Cheese_Amount',
'calories': 'Calories_per_Slice', 'allergens': 'Allergens',
'prep_time': 'Preparation_Time_min', 'restaurant': 'Restaurant_Chain',
'seasonal': 'Seasonal_Availability', 'bread_type': 'Bread_Type',
'image_url': 'Image_Url', 'crust_type': CRUST_TYPE_COL
}
    for original_idx in final_recommendation_indices:
        # original_idx is a DF index label, so fetch the row with .loc (not .iloc)
        pizza_series = DF.loc[original_idx]
rec_item = {}
for key in frontend_keys_rec:
df_col = df_to_frontend_map_rec.get(key)
if key == 'id':
rec_item[key] = int(original_idx)
elif df_col and df_col in pizza_series:
value = pizza_series[df_col]
if isinstance(value, np.integer): value = int(value)
elif isinstance(value, np.floating): value = float(value)
elif isinstance(value, np.ndarray): value = value.tolist()
rec_item[key] = "" if pd.isna(value) else value
elif key == 'crust_type' and not CRUST_TYPE_COL :
rec_item[key] = "N/A"
else:
rec_item[key] = ""
rec_item['rating_count'] = int(rec_item.get('rating_count', 0) or 0)
rec_item['image_url'] = rec_item.get('image_url') if rec_item.get('image_url') else DEFAULT_IMAGE_URL
for k_final, v_final in rec_item.items(): # Final numpy type check
if isinstance(v_final, np.generic): rec_item[k_final] = v_final.item()
recommendations_list.append(rec_item)
current_app.logger.info(f"Final recommendations count: {len(recommendations_list)}")
return recommendations_list
@app.route('/recommend', methods=['POST'])
def recommend():
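    """POST endpoint: parse the JSON preference payload sent by the frontend,
    normalize it into the `preferences` dict, and return ranked pizzas as JSON."""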
try:
        data = request.get_json(silent=True) or {}  # Tolerate a missing or non-JSON body instead of raising
preferences = {} # Store processed preferences
current_app.logger.info(f"Received recommendation request with data: {data}")
# Numerical/Range preferences from JS
# Keys in `data` should match JS: 'slices', 'rating', 'prep_time', 'price_range'
simple_numerical_prefs_js = ['slices', 'rating', 'prep_time']
for key_js in simple_numerical_prefs_js:
if key_js in data and data[key_js] is not None:
try:
if key_js == 'rating': preferences[key_js] = float(data[key_js])
else: preferences[key_js] = int(data[key_js]) # slices, prep_time
except ValueError:
current_app.logger.warning(f"Could not parse numerical preference '{key_js}': {data[key_js]}")
if 'price_range' in data and data['price_range']:
try:
preferences['price_range'] = [float(p) for p in data['price_range']]
except (ValueError, TypeError):
current_app.logger.warning(f"Could not parse price_range: {data['price_range']}")
# Multi-select categorical preferences from JS
# Keys in `data` should match JS: 'toppings', 'servingsize', 'dietarycategory', etc.
multi_select_prefs_js = [
'toppings', 'servingsize', 'populargroup', 'dietarycategory',
'spicelevel', 'saucetype', 'cheeseamount', 'restaurantchain',
'seasonalavailability', 'breadtype', 'crusttype'
]
for key_js in multi_select_prefs_js:
if key_js in data and isinstance(data[key_js], list):
preferences[key_js] = data[key_js] # Expecting a list (can be empty for "Any")
elif key_js in data: # If not a list, log warning
current_app.logger.warning(f"Preference for '{key_js}' was not a list: {data[key_js]}. Treating as empty (Any).")
preferences[key_js] = [] # Default to empty list if not a list
current_app.logger.info(f"Processed preferences for filtering: {preferences}")
recommendations = get_recommendations(preferences)
current_app.logger.info(f"Returning {len(recommendations)} recommendations after filtering and scoring.")
return jsonify(recommendations)
except Exception as e:
current_app.logger.error(f"Error in /recommend endpoint: {e}", exc_info=True)
return jsonify({"error": "Failed to get recommendations due to a server issue.", "details": str(e)}), 500
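# A quick manual check of the endpoint (preference values are hypothetical),
# assuming the app is running locally on the port configured below:
#
#   curl -X POST http://localhost:7860/recommend \
#        -H 'Content-Type: application/json' \
#        -d '{"toppings": ["Onion"], "rating": 4.0, "spicelevel": ["Medium"]}'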
# --- Main Application Execution ---
# Call preprocess_data() at the module level.
# This ensures it runs once when the application (or each Gunicorn worker) starts.
try:
logger.info("----- Starting data preprocessing at module load... -----")
preprocess_data() # Use default 'pizza.csv'
logger.info("----- Data preprocessing completed successfully at module load. -----")
if DF is None:
logger.critical("CRITICAL AT STARTUP: Global DF is None after preprocess_data(). App will likely fail.")
if FEATURE_DF is None:
logger.critical("CRITICAL AT STARTUP: Global FEATURE_DF is None after preprocess_data(). App will likely fail.")
if SCALER is None: # SCALER should be initialized even if fitting fails
logger.critical("CRITICAL AT STARTUP: Global SCALER is None after preprocess_data(). App will likely fail.")
except FileNotFoundError as e:
logger.critical(f"CRITICAL ERROR AT MODULE LOAD (FileNotFoundError): {e}. Ensure 'pizza.csv' is in the /app directory (or same dir as app.py).")
# In a production Gunicorn setup, the app might still try to start, leading to errors in routes.
# For Hugging Face, it's better to log and let it attempt to run, as exiting might obscure logs.
except Exception as e:
logger.critical(f"Unexpected critical startup error during preprocessing at module load: {e}", exc_info=True)
if __name__ == '__main__':
# This block is primarily for local development using `python app.py`.
# preprocess_data() is already called above when the module is imported by Python interpreter.
logger.info("----- Running Flask app directly (e.g., python app.py) -----")
# Sanity check for local run, though globals should be set by the module-level call.
if DF is None or FEATURE_DF is None or SCALER is None:
logger.warning("One or more global data variables (DF, FEATURE_DF, SCALER) are None before local app.run(). This is unexpected if module-level preprocessing ran.")
# Optionally, re-run preprocessing if critical for local dev and something went wrong with module-level load
# logger.info("Attempting to re-run preprocess_data() for local development.")
# preprocess_data()
app.run(debug=True, host='0.0.0.0', port=7860, use_reloader=False)
# use_reloader=False is generally better when you have global state initialized at module level.
# If True, it might re-initialize globals on each reload, which can be slow.