File size: 3,157 Bytes
8452900
 
 
 
 
 
 
 
 
 
 
a88ac30
8452900
a88ac30
8452900
 
 
 
a88ac30
8452900
a88ac30
 
8452900
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
import pandas as pd
import numpy as np
import os
import streamlit as st

@st.cache_data
def load_sample_data():
    """
    Read the bundled housing sample used for statistics display.

    The file "House-Data.csv" is looked up in the directory that contains
    this module.

    Returns:
        pd.DataFrame | None: The parsed CSV, or None when the file does not
        exist or anything goes wrong while locating/reading it. In both
        failure cases a Streamlit warning is emitted instead of raising.
    """
    try:
        # Candidate locations, checked in order; the first existing one wins.
        module_dir = os.path.dirname(__file__)
        candidates = [os.path.join(module_dir, "House-Data.csv")]

        csv_path = next((p for p in candidates if os.path.exists(p)), None)
        if csv_path is not None:
            return pd.read_csv(csv_path)

        # Nothing on disk: tell the user, but let the app keep running.
        st.warning("Fichier de données d'exemple non trouvé. Certaines statistiques peuvent ne pas être disponibles.")
        return None
    except Exception as e:
        # Best-effort loader: any failure degrades to "no sample data".
        st.warning(f"Could not load sample data: {e}")
        return None

# Last-resort values for features that are missing from the input and for
# which no usable mean can be derived from the sample data.
_FALLBACK_DEFAULTS = {
    'bedrooms': 3,
    'bathrooms': 2.0,
    'sqft_living': 1500,
    'sqft_lot': 5000,
    'floors': 1.0,
    'waterfront': 0,
    'view': 0,
    'condition': 3,
    'grade': 7,
    'sqft_above': 1000,
    'sqft_basement': 0,
    'yr_built': 1980,
    'yr_renovated': 0,
    'zipcode': 98000,
    'lat': 47.5,
    'long': -122.0,
    'sqft_living15': 1500,
    'sqft_lot15': 5000,
}


def preprocess_inputs(input_dict):
    """
    Turn a raw feature dictionary into a one-row, all-float DataFrame.

    Values that are None are filled in, by priority:
      1. the mean of the matching int64/float64 column of the sample data
         (columns 'id' and 'price' are never used, and a NaN mean is
         treated as unavailable),
      2. the hard-coded fallback for that feature,
      3. 0 for features with neither.

    The 'date' and 'id' columns are dropped if present, and every remaining
    column is cast to float.

    Args:
        input_dict (dict): Mapping of feature name to value; a value of
            None marks the feature as missing.

    Returns:
        pd.DataFrame: Single-row dataframe ready for prediction.

    Raises:
        ValueError, TypeError: Propagated from the float cast when a
            remaining value cannot be converted to float.
    """
    # Always call the loader so its (cached) warnings behave as before.
    sample_data = load_sample_data()

    # Only compute means for the features that actually need one, instead of
    # scanning every numeric column of the sample on every call.
    missing_keys = [key for key, value in input_dict.items() if value is None]
    feature_means = {}
    if missing_keys and sample_data is not None:
        numeric = sample_data.select_dtypes(include=['int64', 'float64'])
        for key in missing_keys:
            if key in ('id', 'price') or key not in numeric.columns:
                continue
            mean = numeric[key].mean()
            # An empty or all-NaN column yields NaN; fall back to the
            # defaults rather than feeding NaN to the model.
            if pd.notna(mean):
                feature_means[key] = mean

    filled = {}
    for key, value in input_dict.items():
        if value is not None:
            filled[key] = value
        elif key in feature_means:
            filled[key] = feature_means[key]
        else:
            filled[key] = _FALLBACK_DEFAULTS.get(key, 0)

    input_df = pd.DataFrame([filled])

    # 'date' and 'id' are not model features; ignore them when absent.
    input_df = input_df.drop(columns=['date', 'id'], errors='ignore')

    # The model expects floats, so cast every remaining column.
    return input_df.astype(float)