Spaces:

LioD19
/

module-python-pour-AI-CC

Sleeping

App Files Files Community

module-python-pour-AI-CC / src /data_processing.py

LioD19

Update src/data_processing.py

0f47e95 verified about 1 year ago

Raw

History Blame Contribute Delete

3.16 kB

	import pandas as pd
	import numpy as np
	import os
	import streamlit as st

	@st.cache_data
	def load_sample_data():
	    """
	    Return the sample housing dataset used for statistics display.

	    Looks for "House-Data.csv" in the directory containing this module and
	    reads it with pandas. If the file does not exist, or any exception is
	    raised while locating or reading it, a Streamlit warning is shown and
	    None is returned instead of raising.

	    Returns:
	        The parsed CSV as a DataFrame, or None when it could not be loaded.
	    """
	    try:
	        # Candidate locations, checked in order; the first existing one wins
	        module_dir = os.path.dirname(__file__)
	        candidates = [os.path.join(module_dir, "House-Data.csv")]
	        csv_path = next((p for p in candidates if os.path.exists(p)), None)

	        if csv_path is not None:
	            return pd.read_csv(csv_path)

	        # Nothing found on disk: warn the user but let the app keep running
	        st.warning("Fichier de données d'exemple non trouvé. Certaines statistiques peuvent ne pas être disponibles.")
	        return None
	    except Exception as e:
	        st.warning(f"Could not load sample data: {e}")
	        return None

	def preprocess_inputs(input_dict):
	    """
	    Preprocess the input dictionary to match the format expected by the model.

	    Every entry whose value is None is filled in, by order of preference:
	    the mean of the same-named int64/float64 column of the sample dataset
	    (when the dataset could be loaded and that mean is not NaN), then a
	    hard-coded fallback for the known housing features, then 0. The 'date'
	    and 'id' entries are never part of the result.

	    Args:
	        input_dict (dict): Dictionary containing the input features

	    Returns:
	        pd.DataFrame: Single-row dataframe with every column cast to float,
	        columns in the same order as the keys of input_dict

	    Raises:
	        ValueError or TypeError: propagated from the float cast when a
	        remaining value cannot be converted to float.
	    """
	    # Columns that are never fed to the model
	    excluded_columns = {'date', 'id'}

	    # Last-resort values, used only when no usable dataset mean exists.
	    # Built once here rather than on every loop iteration.
	    fallback_defaults = {
	        'bedrooms': 3,
	        'bathrooms': 2.0,
	        'sqft_living': 1500,
	        'sqft_lot': 5000,
	        'floors': 1.0,
	        'waterfront': 0,
	        'view': 0,
	        'condition': 3,
	        'grade': 7,
	        'sqft_above': 1000,
	        'sqft_basement': 0,
	        'yr_built': 1980,
	        'yr_renovated': 0,
	        'zipcode': 98000,
	        'lat': 47.5,
	        'long': -122.0,
	        'sqft_living15': 1500,
	        'sqft_lot15': 5000,
	    }

	    # Load sample data to get feature means for filling missing values
	    sample_data = load_sample_data()
	    feature_means = {}
	    if sample_data is not None:
	        numeric_columns = sample_data.select_dtypes(
	            include=['int64', 'float64']
	        ).columns
	        for col in numeric_columns:
	            if col in ('id', 'price'):
	                continue
	            col_mean = sample_data[col].mean()
	            # A column with no valid values has a NaN mean; skip it so the
	            # hard-coded fallback is used instead of feeding NaN to the model
	            if pd.notna(col_mean):
	                feature_means[col] = col_mean

	    # Resolve one value per feature, keeping the caller's key order
	    resolved = {}
	    for key, value in input_dict.items():
	        if key in excluded_columns:
	            continue
	        if value is not None:
	            resolved[key] = value
	        elif key in feature_means:
	            resolved[key] = feature_means[key]
	        else:
	            resolved[key] = fallback_defaults.get(key, 0)

	    # Single-row frame; every remaining column is cast to float
	    return pd.DataFrame([resolved]).astype(float)