import pandas as pd
import numpy as np
import os
import streamlit as st

@st.cache_data
def load_sample_data():
    """
    Read the housing CSV used for statistics display.

    Returns:
        pd.DataFrame | None: Contents of "House-Data.csv" from the current
        working directory, or None when the file does not exist or cannot
        be read (a Streamlit warning is shown in the latter case).
    """
    csv_path = "House-Data.csv"
    try:
        # Missing file is an expected situation, not an error: no warning.
        if not os.path.exists(csv_path):
            return None
        return pd.read_csv(csv_path)
    except Exception as e:
        # Best effort: surface the problem in the UI and carry on without data.
        st.warning(f"Could not load sample data: {e}")
        return None

# Fallback values for missing inputs when the sample dataset provides no
# usable mean for the feature. Unknown features fall back to 0.
_FALLBACK_FEATURE_VALUES = {
    'bedrooms': 3,
    'bathrooms': 2.0,
    'sqft_living': 1500,
    'sqft_lot': 5000,
    'floors': 1.0,
    'waterfront': 0,
    'view': 0,
    'condition': 3,
    'grade': 7,
    'sqft_above': 1000,
    'sqft_basement': 0,
    'yr_built': 1980,
    'yr_renovated': 0,
    'zipcode': 98000,
    'lat': 47.5,
    'long': -122.0,
    'sqft_living15': 1500,
    'sqft_lot15': 5000,
}

# Columns that are removed before the frame is handed to the model.
_NON_FEATURE_COLUMNS = ['date', 'id']


def _dataset_feature_means():
    """Return {column: mean} for the numeric feature columns of the sample data.

    'id' and 'price' are excluded. Columns whose mean is NaN (e.g. entirely
    empty columns) are left out so callers fall back to the hard-coded
    defaults instead of propagating NaN. Returns an empty dict when the
    sample data is unavailable.
    """
    sample_data = load_sample_data()
    if sample_data is None:
        return {}

    numeric = sample_data.select_dtypes(include=['int64', 'float64'])
    means = numeric.drop(columns=['id', 'price'], errors='ignore').mean()
    return means.dropna().to_dict()


def preprocess_inputs(input_dict):
    """
    Preprocess the input dictionary to match the format expected by the model

    Values that are None are replaced by the mean of the same-named numeric
    column in the sample dataset when available, otherwise by a hard-coded
    default for that feature, otherwise by 0. The 'date' and 'id' entries
    are dropped, and every remaining value is cast to float.

    Args:
        input_dict (dict): Dictionary containing the input features

    Returns:
        pd.DataFrame: Single-row dataframe of floats ready for prediction

    Raises:
        ValueError: If a remaining (non-dropped) value cannot be converted
            to float.
    """
    # Only load the dataset and compute means when something must be imputed.
    needs_imputation = any(value is None for value in input_dict.values())
    feature_means = _dataset_feature_means() if needs_imputation else {}

    filled = {}
    for key, value in input_dict.items():
        if value is None:
            # Dataset mean first, then the per-feature default, then 0.
            value = feature_means.get(
                key, _FALLBACK_FEATURE_VALUES.get(key, 0)
            )
        filled[key] = value

    # One row, one column per supplied key (insertion order preserved).
    input_df = pd.DataFrame([filled])

    # 'date' and 'id' are not needed for prediction; ignore them if absent.
    input_df = input_df.drop(columns=_NON_FEATURE_COLUMNS, errors='ignore')

    # Cast every remaining column to float.
    return input_df.astype(float)