module-python-pour-AI-CC / src /data_processing.py
LioD19's picture
Update src/data_processing.py
0f47e95 verified
Raw
History Blame Contribute Delete
3.16 kB
import pandas as pd
import numpy as np
import os
import streamlit as st
@st.cache_data
def load_sample_data():
    """
    Return the housing sample dataset used for statistics display.

    Looks for "House-Data.csv" in the directory containing this module and
    reads it with pandas. The result is cached through ``st.cache_data``.

    Returns:
        pd.DataFrame | None: The parsed CSV, or None when the file is not
        found or any error occurs while locating/reading it (a Streamlit
        warning is shown in both cases).
    """
    try:
        # Candidate locations, checked in order; the first one that exists wins.
        candidates = (
            os.path.join(os.path.dirname(__file__), "House-Data.csv"),
        )
        found = next((p for p in candidates if os.path.exists(p)), None)
        if found is not None:
            return pd.read_csv(found)

        # Nothing on disk: warn, but let the caller carry on without data.
        st.warning("Fichier de données d'exemple non trouvé. Certaines statistiques peuvent ne pas être disponibles.")
        return None
    except Exception as e:
        # Best-effort loader: report the problem instead of crashing the app.
        st.warning(f"Could not load sample data: {e}")
        return None
# Last-resort values for features that are missing from the input and have no
# usable mean in the sample dataset. Defined once rather than rebuilt for
# every missing key.
_FALLBACK_DEFAULTS = {
    'bedrooms': 3,
    'bathrooms': 2.0,
    'sqft_living': 1500,
    'sqft_lot': 5000,
    'floors': 1.0,
    'waterfront': 0,
    'view': 0,
    'condition': 3,
    'grade': 7,
    'sqft_above': 1000,
    'sqft_basement': 0,
    'yr_built': 1980,
    'yr_renovated': 0,
    'zipcode': 98000,
    'lat': 47.5,
    'long': -122.0,
    'sqft_living15': 1500,
    'sqft_lot15': 5000,
}


def preprocess_inputs(input_dict):
    """
    Preprocess the input dictionary to match the format expected by the model.

    Every value that is None is replaced, in order of preference, by:
      1. the mean of the same-named int64/float64 column of the sample
         dataset (columns 'id' and 'price' are never used as a source),
      2. a hard-coded default for that feature,
      3. 0 when the feature has no hard-coded default.

    The 'date' and 'id' keys are then dropped and all remaining values are
    cast to float.

    Args:
        input_dict (dict): Dictionary containing the input features; values
            may be None to request a default.

    Returns:
        pd.DataFrame: Single-row dataframe of floats ready for prediction,
        with one column per remaining key, in the input's key order.

    Raises:
        ValueError or TypeError: Propagated from ``DataFrame.astype(float)``
        when a remaining value cannot be converted to float.
    """
    # Load sample data to get feature means for filling missing values.
    sample_data = load_sample_data()
    feature_means = {}
    if sample_data is not None:
        numeric = sample_data.select_dtypes(include=['int64', 'float64'])
        numeric = numeric.drop(columns=['id', 'price'], errors='ignore')
        # dropna(): a column with no valid values (or an empty dataset) has a
        # NaN mean; discard it so the hard-coded defaults are used instead of
        # feeding NaN to the model.
        feature_means = numeric.mean().dropna().to_dict()

    # Keep provided values; fill None from dataset means, then from defaults.
    filled = {}
    for key, value in input_dict.items():
        if value is not None:
            filled[key] = value
        elif key in feature_means:
            filled[key] = feature_means[key]
        else:
            filled[key] = _FALLBACK_DEFAULTS.get(key, 0)

    # Convert to a single-row dataframe.
    input_df = pd.DataFrame([filled])

    # 'date' and 'id' are not needed for prediction; ignore them if absent.
    input_df = input_df.drop(columns=['date', 'id'], errors='ignore')

    # Cast every remaining column to float.
    return input_df.astype(float)