| | import streamlit as st |
| | import pandas as pd |
| | import numpy as np |
| | from sklearn.model_selection import train_test_split |
| | from sklearn.preprocessing import StandardScaler |
| | from sklearn.linear_model import LinearRegression |
| | from sklearn.ensemble import RandomForestRegressor |
| | from sklearn.metrics import mean_squared_error, r2_score |
| | import matplotlib.pyplot as plt |
| | import seaborn as sns |
| |
|
| | |
def analyze_data(data):
    """Render an exploratory summary of *data* in the Streamlit app.

    Shows per-column missing-value counts, ``describe()`` statistics, and —
    when at least one numeric column exists — a correlation heatmap.

    Args:
        data: pandas DataFrame uploaded by the user.
    """
    st.write("### Data Analysis")
    st.write("**Missing Values:**")
    st.write(data.isnull().sum())
    st.write("**Statistical Summary:**")
    st.write(data.describe())

    numeric_data = data.select_dtypes(include=['number'])
    if not numeric_data.empty:
        st.write("**Correlation Matrix:**")
        # Pass an explicit Figure: st.pyplot(plt) (the module) is deprecated,
        # and the implicit global figure is never released, leaking memory
        # across Streamlit reruns.
        fig, ax = plt.subplots(figsize=(10, 8))
        sns.heatmap(numeric_data.corr(), annot=True, cmap='coolwarm', center=0, ax=ax)
        st.pyplot(fig)
        plt.close(fig)  # free the figure once it has been rendered
| |
|
| |
|
def prepare_data(data):
    """Split *data* into a feature matrix and a target vector.

    By convention the LAST numeric column is treated as the target and all
    preceding numeric columns are the features; non-numeric columns are
    dropped.

    Args:
        data: pandas DataFrame.

    Returns:
        Tuple ``(X, y)``: feature DataFrame and target Series.

    Raises:
        ValueError: if *data* has fewer than two numeric columns, i.e.
            there is no (feature, target) pair to learn from.
    """
    # 'number' matches every numeric dtype (int32, float32, ...), which is
    # both more general than ['int64', 'float64'] and consistent with the
    # selector analyze_data() uses for the correlation matrix.
    numeric_columns = data.select_dtypes(include=['number']).columns
    if len(numeric_columns) < 2:
        raise ValueError(
            "Dataset must contain at least two numeric columns "
            "(one or more features plus a target)."
        )
    X = data[numeric_columns[:-1]]
    y = data[numeric_columns[-1]]
    return X, y
| |
|
| |
|
def preprocess_data(X_train, X_test):
    """Standardize features: fit the scaler on the training split only.

    Fitting exclusively on ``X_train`` avoids leaking test-set statistics
    into the model; the same fitted transform is then applied to both splits.

    Args:
        X_train: training feature matrix.
        X_test: test feature matrix.

    Returns:
        Tuple ``(X_train_scaled, X_test_scaled, scaler)`` where *scaler* is
        the fitted ``StandardScaler`` (useful for transforming new data).
    """
    feature_scaler = StandardScaler()
    train_scaled = feature_scaler.fit_transform(X_train)
    test_scaled = feature_scaler.transform(X_test)
    return train_scaled, test_scaled, feature_scaler
| |
|
| |
|
def train_and_evaluate_models(X_train_scaled, X_test_scaled, y_train, y_test, feature_names):
    """Train baseline regressors and report their metrics in the app.

    Fits a Linear Regression and a Random Forest on the scaled training
    data, writes train/test RMSE and R² for each model, and for the Random
    Forest additionally renders a feature-importance table and bar chart.

    Args:
        X_train_scaled: scaled training feature array.
        X_test_scaled: scaled test feature array.
        y_train: training targets.
        y_test: test targets.
        feature_names: iterable of feature column names (used for the
            Random Forest importance display).

    Returns:
        dict mapping model name to a dict with keys ``'model'``,
        ``'train_rmse'``, ``'test_rmse'``, ``'train_r2'``, ``'test_r2'``.
    """
    models = {
        'Linear Regression': LinearRegression(),
        'Random Forest': RandomForestRegressor(n_estimators=100, random_state=42)
    }

    results = {}

    for name, model in models.items():
        model.fit(X_train_scaled, y_train)
        train_pred = model.predict(X_train_scaled)
        test_pred = model.predict(X_test_scaled)

        # Build the metrics once and reuse the local dict below instead of
        # re-indexing results[name][...] for every st.write call.
        metrics = {
            'model': model,
            'train_rmse': np.sqrt(mean_squared_error(y_train, train_pred)),
            'test_rmse': np.sqrt(mean_squared_error(y_test, test_pred)),
            'train_r2': r2_score(y_train, train_pred),
            'test_r2': r2_score(y_test, test_pred)
        }
        results[name] = metrics

        st.write(f"### {name} Results:")
        st.write(f"**Training RMSE:** {metrics['train_rmse']:.2f}")
        st.write(f"**Test RMSE:** {metrics['test_rmse']:.2f}")
        st.write(f"**Training R²:** {metrics['train_r2']:.3f}")
        st.write(f"**Test R²:** {metrics['test_r2']:.3f}")

        if name == 'Random Forest':
            feature_importance = pd.DataFrame({
                'Feature': feature_names,
                'Importance': model.feature_importances_
            }).sort_values('Importance', ascending=False)
            st.write("**Feature Importance:**")
            st.write(feature_importance)

            # Pass an explicit Figure: st.pyplot(plt) (the module) is
            # deprecated, and leaving the global figure open leaks memory
            # across Streamlit reruns.
            fig, ax = plt.subplots(figsize=(10, 6))
            sns.barplot(x='Importance', y='Feature', data=feature_importance, ax=ax)
            ax.set_title('Feature Importance')
            st.pyplot(fig)
            plt.close(fig)

    return results
| |
|
| |
|
def main():
    """Streamlit entry point: upload a CSV, explore it, and train models."""
    st.title("Housing Price Prediction")

    uploaded_file = st.file_uploader("Upload a CSV file", type=["csv"])
    if not uploaded_file:
        return  # nothing to do until the user uploads a file

    data = pd.read_csv(uploaded_file)
    st.write("## Dataset Overview")
    st.write(data.head())

    analyze_data(data)

    # prepare_data() treats the last numeric column as the target, so we
    # need at least two numeric columns (one feature + the target); without
    # this guard an unsuitable CSV crashes the app with a raw IndexError.
    if len(data.select_dtypes(include=['int64', 'float64']).columns) < 2:
        st.error("The dataset needs at least two numeric columns "
                 "(one or more features plus a target) to train a model.")
        return

    X, y = prepare_data(data)

    # User-selectable hold-out fraction; fixed seed keeps the split stable
    # across Streamlit reruns.
    test_size = st.slider("Test data size:", 0.1, 0.5, 0.2)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=42
    )

    X_train_scaled, X_test_scaled, scaler = preprocess_data(X_train, X_test)

    st.write("## Model Training and Evaluation")
    train_and_evaluate_models(
        X_train_scaled, X_test_scaled, y_train, y_test, X_train.columns
    )
| |
|
| |
|
| | |
# Run the app only when this file is executed directly (not when imported).
if __name__ == "__main__":
    main()
| |
|
| |
|