import os

import numpy as np
import pandas as pd
import streamlit as st

# CSV file (relative to the current working directory) used to derive
# per-feature means for filling in missing inputs.
_SAMPLE_DATA_PATH = "House-Data.csv"

# Columns that are never used as fill-value sources.
_EXCLUDED_MEAN_COLUMNS = ['id', 'price']

# Columns removed from the final frame because they are not model features.
_NON_FEATURE_COLUMNS = ['date', 'id']

# Static fallback values used when an input is missing and no dataset mean
# is available for it. Keys not listed here fall back to 0.
_DEFAULT_FEATURE_VALUES = {
    'bedrooms': 3,
    'bathrooms': 2.0,
    'sqft_living': 1500,
    'sqft_lot': 5000,
    'floors': 1.0,
    'waterfront': 0,
    'view': 0,
    'condition': 3,
    'grade': 7,
    'sqft_above': 1000,
    'sqft_basement': 0,
    'yr_built': 1980,
    'yr_renovated': 0,
    'zipcode': 98000,
    'lat': 47.5,
    'long': -122.0,
    'sqft_living15': 1500,
    'sqft_lot15': 5000,
}


@st.cache_data
def load_sample_data():
    """Load the housing data used for statistics display and default values.

    Returns:
        pd.DataFrame or None: The parsed contents of ``House-Data.csv``, or
        ``None`` when the file does not exist or cannot be read. A read
        failure is reported to the user with ``st.warning``; a missing file
        is silent.
    """
    if not os.path.exists(_SAMPLE_DATA_PATH):
        return None

    try:
        return pd.read_csv(_SAMPLE_DATA_PATH)
    except Exception as e:
        # Best-effort load: the caller works without sample data, so surface
        # the problem in the UI instead of aborting the app.
        st.warning(f"Could not load sample data: {e}")
        return None


def _feature_means(sample_data):
    """Return ``{column: mean}`` for the numeric columns of *sample_data*.

    ``id`` and ``price`` are skipped, and so is any column whose mean is NaN,
    so that callers fall back to the static defaults instead of propagating
    NaN. Returns an empty dict when *sample_data* is ``None``.
    """
    if sample_data is None:
        return {}

    numeric = sample_data.select_dtypes(include=['int64', 'float64'])
    numeric = numeric.drop(columns=_EXCLUDED_MEAN_COLUMNS, errors='ignore')
    return numeric.mean().dropna().to_dict()


def preprocess_inputs(input_dict):
    """Preprocess the input dictionary into the single-row frame used for prediction.

    Each ``None`` value is replaced, in order of preference, by the mean of
    the matching numeric column in the sample data, the static default for
    that key, or 0. The ``date`` and ``id`` columns are then removed and
    every remaining column is cast to ``float``.

    Args:
        input_dict (dict): Dictionary containing the input features, keyed
            by feature name. Values may be ``None`` to request a default.

    Returns:
        pd.DataFrame: One-row dataframe of floats, with columns in the
        iteration order of ``input_dict`` (minus ``date`` and ``id``).

    Raises:
        ValueError, TypeError: If a remaining non-``None`` value cannot be
            converted to ``float``.
    """
    feature_means = _feature_means(load_sample_data())

    filled = {}
    for key, value in input_dict.items():
        if value is not None:
            filled[key] = value
        elif key in feature_means:
            # Prefer the dataset mean when one is available.
            filled[key] = feature_means[key]
        else:
            filled[key] = _DEFAULT_FEATURE_VALUES.get(key, 0)

    input_df = pd.DataFrame([filled])

    # 'date' and 'id' are not needed for prediction; drop them before the
    # float cast so a date string does not make the conversion fail.
    input_df = input_df.drop(columns=_NON_FEATURE_COLUMNS, errors='ignore')

    # The cast applies to every remaining column, not only numeric ones.
    return input_df.astype(float)