cc-module-python / data_processing.py
LioD19's picture
Upload 5 files
f4a61c0 verified
Raw
History Blame Contribute Delete
2.93 kB
import pandas as pd
import numpy as np
import os
import streamlit as st
@st.cache_data
def load_sample_data():
    """
    Read the housing sample dataset used for statistics display.

    The file "House-Data.csv" is looked up relative to the current working
    directory. The function is wrapped in ``st.cache_data``.

    Returns:
        pd.DataFrame | None: The parsed CSV, or None when the file does not
        exist or reading it raised (a Streamlit warning is shown in the
        latter case).
    """
    csv_path = "House-Data.csv"
    try:
        # Guard clause: a missing file is not an error, just "no data".
        if not os.path.exists(csv_path):
            return None
        return pd.read_csv(csv_path)
    except Exception as err:
        # Best-effort loader: report the problem in the UI and carry on.
        st.warning(f"Could not load sample data: {err}")
        return None
def preprocess_inputs(input_dict):
    """
    Preprocess the input dictionary to match the format expected by the model.

    Every ``None`` value in ``input_dict`` is replaced, in priority order, by:

    1. the mean of the same-named int64/float64 column of the sample data
       (only when the sample data loaded and that mean is not NaN),
    2. a hard-coded per-feature default,
    3. ``0`` when the feature has no hard-coded default.

    Args:
        input_dict (dict): Dictionary containing the input features

    Returns:
        pd.DataFrame: Single-row dataframe with the 'date' and 'id' columns
        removed (if present) and every remaining column cast to float.

    Note:
        The final cast is applied to all remaining columns, so a value that
        cannot be converted to float makes pandas raise; that error is
        propagated unchanged.
    """
    # Hard-coded fallbacks, built once instead of on every loop iteration.
    defaults = {
        'bedrooms': 3,
        'bathrooms': 2.0,
        'sqft_living': 1500,
        'sqft_lot': 5000,
        'floors': 1.0,
        'waterfront': 0,
        'view': 0,
        'condition': 3,
        'grade': 7,
        'sqft_above': 1000,
        'sqft_basement': 0,
        'yr_built': 1980,
        'yr_renovated': 0,
        'zipcode': 98000,
        'lat': 47.5,
        'long': -122.0,
        'sqft_living15': 1500,
        'sqft_lot15': 5000,
    }

    # Load sample data to get feature means for filling missing values
    sample_data = load_sample_data()
    feature_means = {}
    if sample_data is not None:
        numeric_data = sample_data.select_dtypes(include=['int64', 'float64'])
        for col in numeric_data.columns:
            # 'id' and 'price' are excluded from the means
            if col in ('id', 'price'):
                continue
            mean_value = numeric_data[col].mean()
            # An empty or all-NaN column yields a NaN mean; skip it so the
            # hard-coded default is used instead of feeding NaN to the model.
            if not pd.isna(mean_value):
                feature_means[col] = mean_value

    # Keep provided values; fill missing ones from means, then defaults
    filtered_dict = {}
    for key, value in input_dict.items():
        if value is not None:
            filtered_dict[key] = value
        elif key in feature_means:
            filtered_dict[key] = feature_means[key]
        else:
            filtered_dict[key] = defaults.get(key, 0)

    # Convert to a single-row dataframe
    input_df = pd.DataFrame([filtered_dict])

    # 'date' and 'id' are not needed for prediction; ignore them if absent
    input_df = input_df.drop(columns=['date', 'id'], errors='ignore')

    # Cast every remaining column to float
    return input_df.astype(float)