import streamlit as st
import pandas as pd
import numpy as np
import plotly.express as px
from scipy import stats
import joblib
import base64
from datetime import datetime
import matplotlib.pyplot as plt
import seaborn as sns
from tensorflow import keras
from tensorflow.keras import layers, callbacks
from sklearn.base import is_classifier
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVR, SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.experimental import enable_iterative_imputer  # noqa: F401 (enables IterativeImputer)
from sklearn.impute import IterativeImputer, KNNImputer, SimpleImputer
from sklearn.neural_network import MLPRegressor, MLPClassifier
from sklearn.metrics import (mean_squared_error, r2_score, mean_absolute_error,
                             accuracy_score, precision_score, recall_score, f1_score,
                             classification_report, confusion_matrix)
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from ydata_profiling import ProfileReport
from streamlit_pandas_profiling import st_profile_report
# Enhanced configuration
st.set_page_config(
    page_title="Executive Insights Pro",
    layout="wide",
    page_icon="📈",
    initial_sidebar_state="expanded"
)

# Initial session state setup (at the top of the script)
if 'raw_data' not in st.session_state:
    st.session_state.raw_data = None
if 'cleaned_data' not in st.session_state:
    st.session_state.cleaned_data = None

# Security: set allowed file types
ALLOWED_EXTENSIONS = {'csv', 'xlsx', 'parquet', 'feather'}
MAX_FILE_SIZE_MB = 250  # 250MB limit
def validate_file(file):
    """Comprehensive file validation."""
    if not file:
        return False, "No file uploaded"
    extension = file.name.split('.')[-1].lower()
    if extension not in ALLOWED_EXTENSIONS:
        return False, f"Unsupported file type: {extension}"
    file_size_mb = file.size / (1024 * 1024)
    if file_size_mb > MAX_FILE_SIZE_MB:
        return False, f"File size exceeds {MAX_FILE_SIZE_MB}MB limit"
    return True, ""
def enhanced_quality_report(df):
    """Generate a comprehensive data quality report."""
    report = {
        'basic_stats': {
            'rows': df.shape[0],
            'columns': df.shape[1],
            'missing_values': df.isna().sum().sum(),
            'duplicates': df.duplicated().sum()
        },
        'column_analysis': {},
        'data_health_score': 100  # starting score
    }
    for col in df.columns:
        non_null = df[col].dropna()
        col_report = {
            'type': str(df[col].dtype),
            'unique': df[col].nunique(),
            'missing': df[col].isna().sum(),
            'samples': non_null.sample(min(3, len(non_null))).tolist() if df[col].dtype == 'object' else []
        }
        # Numeric-specific checks
        if pd.api.types.is_numeric_dtype(df[col]):
            col_report.update({
                'mean': df[col].mean(),
                'std': df[col].std(),
                'zeros': (df[col] == 0).sum(),
                'negatives': 0 if pd.api.types.is_unsigned_integer_dtype(df[col]) else (df[col] < 0).sum(),
                'outliers': detect_outliers(df[col])
            })
            if col_report['missing'] > 0 or col_report['outliers'] > 0:
                report['data_health_score'] -= 2  # deduct 2 points per numeric column with issues
        # Categorical-specific checks
        if pd.api.types.is_string_dtype(df[col]):
            col_report.update({
                'top_value': non_null.mode().iloc[0] if not non_null.empty else None,
                'top_freq': non_null.value_counts().iloc[0] / len(df) if not non_null.empty else 0
            })
            if col_report['missing'] > 0:
                report['data_health_score'] -= 1  # deduct 1 point per string column with gaps
        report['column_analysis'][col] = col_report
    report['data_health_score'] = max(report['data_health_score'], 0)
    return report
def detect_outliers(series):
    """Count outliers using the 1.5*IQR rule."""
    q1 = series.quantile(0.25)
    q3 = series.quantile(0.75)
    iqr = q3 - q1
    return ((series < (q1 - 1.5 * iqr)) | (series > (q3 + 1.5 * iqr))).sum()

# Define app_mode for navigation ("Visualization Lab" was missing, which made
# that page unreachable even though its branch exists below)
app_mode = st.sidebar.selectbox(
    "Select Page",
    ["Data Upload", "Smart Cleaning", "Advanced EDA", "Model Training",
     "Visualization Lab", "Insights", "Predictions", "Neural Network Studio"],
    help="Choose the section to navigate to."
)
# --- Data Upload Page ---
if app_mode == "Data Upload":
    st.title("📤 Smart Data Hub")
    st.markdown("""
    **Upload your dataset** (CSV, Excel, Parquet, Feather) for comprehensive analysis.
    Get instant data health insights and quality assessment.
    """)

    # File upload with enhanced UI
    uploaded_file = st.file_uploader(
        "Drag & drop or browse files",
        type=list(ALLOWED_EXTENSIONS),
        help=f"Max file size: {MAX_FILE_SIZE_MB}MB. Supported formats: {', '.join(ALLOWED_EXTENSIONS)}"
    )

    if uploaded_file:
        # Validate file
        is_valid, message = validate_file(uploaded_file)
        if not is_valid:
            st.error(f"Upload error: {message}")
            st.stop()

        # Load data with progress
        with st.spinner(f"Loading {uploaded_file.name}..."):
            try:
                if uploaded_file.name.endswith('.csv'):
                    df = pd.read_csv(uploaded_file, low_memory=False)
                elif uploaded_file.name.endswith('.xlsx'):
                    df = pd.read_excel(uploaded_file)
                elif uploaded_file.name.endswith('.parquet'):
                    df = pd.read_parquet(uploaded_file)
                else:  # '.feather' is the only remaining allowed extension
                    df = pd.read_feather(uploaded_file)
                st.session_state.raw_data = df
                st.success("Dataset loaded successfully!")
            except Exception as e:
                st.error(f"Error loading file: {str(e)}")
                st.stop()
        # Reset models whenever new data is uploaded
        st.session_state.model = None
        st.session_state.preprocessor = None
        # Data Health Dashboard
        st.subheader("📊 Data Health Dashboard")
        report = enhanced_quality_report(df)
        col1, col2, col3, col4 = st.columns(4)
        col1.metric("Total Rows", report['basic_stats']['rows'])
        col2.metric("Total Columns", report['basic_stats']['columns'])
        col3.metric("Missing Values", report['basic_stats']['missing_values'])
        col4.metric("Data Health Score", f"{report['data_health_score']}/100")

        # Column Explorer
        with st.expander("🔍 Deep Column Analysis", expanded=True):
            selected_col = st.selectbox("Select column to inspect", df.columns)
            col_info = report['column_analysis'][selected_col]
            st.write(f"**Type:** {col_info['type']}")
            st.write(f"**Unique Values:** {col_info['unique']}")
            st.write(f"**Missing Values:** {col_info['missing']} ({col_info['missing'] / len(df):.1%})")
            if pd.api.types.is_numeric_dtype(df[selected_col]):
                st.write("**Distribution:**")
                st.plotly_chart(px.histogram(df, x=selected_col, nbins=30), use_container_width=True)
                st.write(f"**Outliers Detected:** {col_info['outliers']}")
            else:
                st.write("**Most Common Values:**")
                top_values = df[selected_col].value_counts().head(5)
                st.bar_chart(top_values)

        # Smart Recommendations
        with st.expander("💡 Cleaning Recommendations"):
            recommendations = []
            if report['basic_stats']['duplicates'] > 0:
                recommendations.append(f"🚨 Remove {report['basic_stats']['duplicates']} duplicate rows")
            if report['basic_stats']['missing_values'] > 0:
                recommendations.append("🔧 Apply advanced imputation strategies")
            for col, data in report['column_analysis'].items():
                if data['missing'] > 0.5 * len(df):
                    recommendations.append(f"⚠️ Consider dropping {col} (>50% missing)")
                if data['unique'] == len(df):
                    recommendations.append(f"🔍 Investigate {col} - potential unique identifier")
            if recommendations:
                st.write("### Recommended Actions")
                for rec in recommendations[:5]:  # show top 5
                    st.write(f"- {rec}")
            else:
                st.success("No critical issues detected - your data looks healthy!")

        # Data Preview
        with st.expander("🔎 Data Preview", expanded=True):
            preview_size = st.slider("Preview rows", 5, 100, 15)
            st.dataframe(df.head(preview_size).style.highlight_null(color='#FF6666'))

        # Advanced Profiling
        if st.button("🚀 Generate Full Data Profile"):
            with st.spinner("Generating comprehensive report..."):
                pr = ProfileReport(df, explorative=True)
                st_profile_report(pr)
elif app_mode == "Smart Cleaning": | |
st.title("🧼 Intelligent Data Cleaning") | |
st.markdown(""" | |
**Automated Data Cleaning** with smart suggestions and advanced transformations. | |
Clean your data with confidence using AI-powered recommendations. | |
""") | |
if 'raw_data' not in st.session_state or st.session_state.raw_data is None: | |
st.warning("Please upload your data in the Data Upload section first.") | |
st.stop() | |
# Initialize versioning | |
if 'data_versions' not in st.session_state: | |
st.session_state.data_versions = [st.session_state.raw_data.copy()] | |
st.session_state.current_version = 0 | |
def update_version(new_df): | |
st.session_state.data_versions = st.session_state.data_versions[:st.session_state.current_version+1] | |
st.session_state.data_versions.append(new_df.copy()) | |
st.session_state.current_version += 1 | |
df = st.session_state.data_versions[st.session_state.current_version].copy() | |
cleaning_actions = st.session_state.get('cleaning_actions', []) | |
# Version Control with Progress Bar | |
with st.expander("⏪ Version Control", expanded=True): | |
st.caption(f"Current Version: {st.session_state.current_version+1}/{len(st.session_state.data_versions)}") | |
progress = (st.session_state.current_version + 1) / len(st.session_state.data_versions) | |
st.progress(progress) | |
col1, col2 = st.columns(2) | |
with col1: | |
if st.button("⏮️ Undo Last Action", disabled=st.session_state.current_version == 0): | |
st.session_state.current_version -= 1 | |
st.experimental_rerun() | |
with col2: | |
if st.button("⏭️ Redo Next Action", disabled=st.session_state.current_version == len(st.session_state.data_versions)-1): | |
st.session_state.current_version += 1 | |
st.experimental_rerun() | |
# Data Health Dashboard with Cards | |
st.subheader("📊 Data Health Dashboard") | |
with st.expander("Show Comprehensive Data Report", expanded=True): | |
from pandas_profiling import ProfileReport | |
pr = ProfileReport(df, explorative=True) | |
st_profile_report(pr) | |
    # Enhanced Health Summary with Cards
    col1, col2, col3, col4 = st.columns(4)
    with col1:
        st.metric("Total Rows", len(df), help="Number of rows in the dataset")
    with col2:
        st.metric("Total Columns", len(df.columns), help="Number of columns in the dataset")
    with col3:
        missing_pct = df.isna().mean().mean()
        st.metric("Missing Values", f"{missing_pct:.1%}", help="Percentage of missing values in the dataset")
    with col4:
        duplicates = df.duplicated().sum()
        st.metric("Duplicates", duplicates, help="Number of duplicate rows in the dataset")

    # Visualizations for Data Health
    st.markdown("### 📈 Data Health Visualizations")
    col1, col2 = st.columns(2)
    with col1:
        missing_counts = df.isna().sum()
        st.plotly_chart(px.bar(missing_counts, title="Missing Values per Column",
                               labels={'index': 'Column', 'value': 'Missing Count'},
                               color=missing_counts, color_continuous_scale="Bluered"))
    with col2:
        dtype_counts = df.dtypes.value_counts()
        st.plotly_chart(px.pie(values=dtype_counts.values, names=dtype_counts.index.astype(str),
                               title="Data Type Distribution", hole=0.3))
    # Cleaning Operations with Tabs
    st.subheader("🔧 Cleaning Operations")
    tab1, tab2, tab3, tab4 = st.tabs(["Missing Values", "Duplicates", "Data Types", "Outliers"])

    # 1. Missing Value Handling
    with tab1:
        st.markdown("### 🕳️ Handle Missing Values")
        missing_cols = df.columns[df.isna().any()].tolist()
        if missing_cols:
            st.write("Columns with missing values:")
            cols = st.multiselect("Select columns to clean", missing_cols, default=missing_cols)
            method = st.radio("Imputation Method", [
                "Drop Missing",
                "Mean/Median/Mode",
                "KNN Imputation",
                "MICE Imputation",
                "Deep Learning Imputation"
            ], horizontal=True)
            if st.button(f"Apply {method}"):
                try:
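                    # The original left the imputation as a placeholder; below is a
                    # hedged, minimal sketch. It assumes `cols` and `method` from the
                    # widgets above, and maps "Deep Learning Imputation" onto
                    # IterativeImputer, since no deep-learning imputer ships with this app.
                    if method == "Drop Missing":
                        df = df.dropna(subset=cols)
                    elif method == "Mean/Median/Mode":
                        for c in cols:
                            if pd.api.types.is_numeric_dtype(df[c]):
                                df[c] = df[c].fillna(df[c].median())
                            else:
                                df[c] = df[c].fillna(df[c].mode().iloc[0])
                    elif method == "KNN Imputation":
                        num_cols = [c for c in cols if pd.api.types.is_numeric_dtype(df[c])]
                        if num_cols:
                            df[num_cols] = KNNImputer(n_neighbors=5).fit_transform(df[num_cols])
                    else:  # "MICE Imputation" and the deep-learning fallback
                        num_cols = [c for c in cols if pd.api.types.is_numeric_dtype(df[c])]
                        if num_cols:
                            df[num_cols] = IterativeImputer(random_state=42).fit_transform(df[num_cols])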
cleaning_actions.append(f"Applied {method} on {cols}") | |
update_version(df) | |
st.success(f"{method} applied successfully! ✅") | |
except Exception as e: | |
st.error(f"Error: {str(e)}") | |
else: | |
st.success("✨ No missing values found!") | |
    # 2. Duplicate Handling
    with tab2:
        st.markdown("### 🔄 Handle Duplicates")
        duplicates = df.duplicated().sum()
        if duplicates > 0:
            dup_flags = df.duplicated().map({True: "Duplicate", False: "Unique"})
            st.plotly_chart(px.histogram(x=dup_flags, title="Duplicate Distribution"))
            dup_strategy = st.radio("Duplicate Strategy", [
                "Remove All Duplicates",
                "Keep First Occurrence",
                "Keep Last Occurrence"
            ])
            if st.button("Handle Duplicates"):
                original_count = len(df)
                df = df.drop_duplicates(keep={
                    "Remove All Duplicates": False,
                    "Keep First Occurrence": 'first',
                    "Keep Last Occurrence": 'last'
                }[dup_strategy])
                cleaning_actions.append(f"Removed {original_count - len(df)} duplicates")
                st.session_state.cleaning_actions = cleaning_actions
                update_version(df)
                st.success(f"Removed {original_count - len(df)} duplicates! ✅")
        else:
            st.success("✨ No duplicates found!")
    # 3. Data Type Conversion
    with tab3:
        st.markdown("### 🔄 Convert Data Types")
        col1, col2 = st.columns(2)
        with col1:
            st.dataframe(df.dtypes.reset_index().rename(columns={0: 'Type', 'index': 'Column'}))
        with col2:
            col_to_convert = st.selectbox("Select column to convert", df.columns)
            new_type = st.selectbox("New Data Type", [
                "String", "Integer", "Float",
                "Boolean", "Datetime", "Category"
            ])
            if st.button("Convert Data Type"):
                try:
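                    # Hedged sketch of the conversion the original left as a
                    # placeholder, mapping the UI labels onto pandas casts.
                    # "Integer" uses the nullable Int64 dtype so missing values survive.
                    if new_type == "Datetime":
                        df[col_to_convert] = pd.to_datetime(df[col_to_convert], errors='coerce')
                    else:
                        cast_map = {"String": str, "Integer": "Int64", "Float": float,
                                    "Boolean": bool, "Category": "category"}
                        df[col_to_convert] = df[col_to_convert].astype(cast_map[new_type])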
cleaning_actions.append(f"Converted {col_to_convert} to {new_type}") | |
update_version(df) | |
st.success("Data type converted successfully! ✅") | |
except Exception as e: | |
st.error(f"Conversion failed: {str(e)}") | |
    # 4. Outlier Handling
    with tab4:
        st.markdown("### 📈 Handle Outliers")
        numeric_cols = df.select_dtypes(include=np.number).columns.tolist()
        if numeric_cols:
            outlier_col = st.selectbox("Select numeric column", numeric_cols)
            st.plotly_chart(px.box(df, y=outlier_col, title="Outlier Distribution"))
            if st.button("Remove Outliers"):
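                # Hedged sketch for the removal step left as a placeholder: drops
                # rows outside the same 1.5*IQR fences used by detect_outliers().
                q1, q3 = df[outlier_col].quantile([0.25, 0.75])
                iqr = q3 - q1
                in_range = df[outlier_col].between(q1 - 1.5 * iqr, q3 + 1.5 * iqr)
                df = df[in_range | df[outlier_col].isna()]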
cleaning_actions.append(f"Removed outliers from {outlier_col}") | |
update_version(df) | |
st.success("Outliers removed successfully! ✅") | |
else: | |
st.info("ℹ️ No numeric columns found for outlier detection") | |
    # Save Cleaned Data with Enhanced Feedback
    if st.button("💾 Save Cleaned Data"):
        st.session_state.cleaned_data = df
        st.balloons()

        # Generate comprehensive report
        pr = ProfileReport(df, title="Cleaned Data Report")
        st_profile_report(pr)

        # Show cleaning log
        st.subheader("📝 Cleaning Log")
        st.table(pd.DataFrame({
            "Step": range(1, len(cleaning_actions) + 1),
            "Action": cleaning_actions
        }))

        # Show dataset comparison
        col1, col2 = st.columns(2)
        with col1:
            st.write("Original Data Shape:", st.session_state.raw_data.shape)
        with col2:
            st.write("Cleaned Data Shape:", df.shape)
        st.success("✅ Cleaned data saved successfully! You can now proceed to analysis.")
elif app_mode == "Advanced EDA": | |
st.title("🔍 Advanced Exploratory Data Analysis") | |
st.markdown(""" | |
**Interactive Data Exploration** with optimized visualizations for fast insights. | |
Uncover patterns and relationships in your data with beautiful, responsive plots. | |
""") | |
if 'cleaned_data' not in st.session_state or st.session_state.cleaned_data is None: | |
st.warning("Please clean your data in the Smart Cleaning section first.") | |
st.stop() | |
df = st.session_state.cleaned_data.copy() | |
# Initialize session state for EDA configuration | |
if 'eda_config' not in st.session_state: | |
st.session_state.eda_config = { | |
'plot_type': "Histogram", | |
'x_col': df.columns[0] if len(df.columns) > 0 else None, | |
'y_col': df.columns[1] if len(df.columns) > 1 else None, | |
'z_col': df.columns[2] if len(df.columns) > 2 else None, | |
'color_col': None, | |
'facet_col': None, | |
'hover_data_cols': [], | |
'color_palette': "Viridis", | |
'filter_col': None, | |
'filter_options': [] | |
} | |
    # Main Layout Columns
    col1, col2 = st.columns([1, 3])

    with col1:
        st.header("📊 Visualization Setup")

        # Plot Type Selection
        plot_types = {
            "Distribution": ["Histogram", "Box Plot", "Violin Plot", "Density Plot"],
            "Relationship": ["Scatter Plot", "Line Plot", "Heatmap", "Pair Plot"],
            "Comparison": ["Bar Chart", "Pie Chart", "Parallel Coordinates"],
            "3D": ["3D Scatter", "3D Surface"]
        }
        selected_category = st.selectbox("Plot Category", list(plot_types.keys()))
        st.session_state.eda_config['plot_type'] = st.selectbox(
            "Plot Type",
            plot_types[selected_category]
        )

        # Dynamic Column Selectors (an X selector appears for every plot type
        # that uses one; the original offered no X axis for scatter/line plots)
        plot_type = st.session_state.eda_config['plot_type']
        if plot_type not in ["Heatmap", "Pair Plot", "Parallel Coordinates"]:
            st.session_state.eda_config['x_col'] = st.selectbox(
                "X Axis",
                df.columns,
                index=df.columns.get_loc(st.session_state.eda_config['x_col'])
                if st.session_state.eda_config['x_col'] in df.columns else 0
            )
        if plot_type in ["Scatter Plot", "Line Plot", "Box Plot", "Violin Plot", "Density Plot",
                         "Bar Chart", "Pie Chart", "3D Scatter", "3D Surface"]:
            st.session_state.eda_config['y_col'] = st.selectbox(
                "Y Axis",
                df.columns,
                index=df.columns.get_loc(st.session_state.eda_config['y_col'])
                if st.session_state.eda_config['y_col'] in df.columns else 0
            )
        if plot_type in ["3D Scatter", "3D Surface"]:
            st.session_state.eda_config['z_col'] = st.selectbox(
                "Z Axis",
                df.columns,
                index=df.columns.get_loc(st.session_state.eda_config['z_col'])
                if st.session_state.eda_config['z_col'] in df.columns else 0
            )

        # Additional Options
        with st.expander("🎨 Customization"):
            st.session_state.eda_config['color_col'] = st.selectbox(
                "Color By",
                [None] + list(df.columns)
            )
            st.session_state.eda_config['facet_col'] = st.selectbox(
                "Facet By",
                [None] + list(df.columns)
            )
            st.session_state.eda_config['hover_data_cols'] = st.multiselect(
                "Hover Data",
                df.columns
            )
            st.session_state.eda_config['color_palette'] = st.selectbox(
                "Color Palette",
                px.colors.named_colorscales()
            )

        # Data Filtering
        with st.expander("🔎 Data Filtering"):
            filter_col = st.selectbox(
                "Filter Column",
                [None] + list(df.columns)
            )
            if filter_col:
                unique_values = df[filter_col].unique()
                selected_values = st.multiselect(
                    f"Select {filter_col} values",
                    unique_values,
                    default=unique_values
                )
                df = df[df[filter_col].isin(selected_values)]
    with col2:
        st.header("📈 Visualization")
        config = st.session_state.eda_config

        def generate_plot(df, plot_type, config):
            """Build the requested Plotly figure from the current configuration."""
            # Note: the palette from named_colorscales() is a continuous scale,
            # so it is only applied where Plotly accepts a continuous scale.
            try:
                if plot_type == "Histogram":
                    return px.histogram(
                        df, x=config['x_col'],
                        color=config['color_col'],
                        nbins=30
                    )
                elif plot_type == "Scatter Plot":
                    return px.scatter(
                        df, x=config['x_col'], y=config['y_col'],
                        color=config['color_col'],
                        hover_data=config['hover_data_cols']
                    )
                elif plot_type == "Box Plot":
                    return px.box(
                        df, x=config['x_col'], y=config['y_col'],
                        color=config['color_col']
                    )
                elif plot_type == "Violin Plot":
                    return px.violin(
                        df, x=config['x_col'], y=config['y_col'],
                        color=config['color_col'],
                        box=True
                    )
                elif plot_type == "Heatmap":
                    numeric_df = df.select_dtypes(include=np.number)
                    corr = numeric_df.corr()
                    return px.imshow(
                        corr,
                        text_auto=True,
                        color_continuous_scale=config['color_palette']
                    )
                elif plot_type == "3D Scatter":
                    return px.scatter_3d(
                        df, x=config['x_col'], y=config['y_col'], z=config['z_col'],
                        color=config['color_col']
                    )
                elif plot_type == "Bar Chart":
                    return px.bar(
                        df, x=config['x_col'], y=config['y_col'],
                        color=config['color_col']
                    )
                elif plot_type == "Pie Chart":
                    return px.pie(
                        df, names=config['x_col'], values=config['y_col']
                    )
                elif plot_type == "Line Plot":
                    return px.line(
                        df, x=config['x_col'], y=config['y_col'],
                        color=config['color_col']
                    )
                elif plot_type == "Pair Plot":
                    numeric_cols = df.select_dtypes(include=np.number).columns
                    return px.scatter_matrix(
                        df, dimensions=numeric_cols,
                        color=config['color_col']
                    )
                elif plot_type == "Parallel Coordinates":
                    numeric_df = df.select_dtypes(include=np.number)
                    return px.parallel_coordinates(
                        numeric_df,
                        color_continuous_scale=config['color_palette']
                    )
                elif plot_type == "Density Plot":
                    return px.density_contour(
                        df, x=config['x_col'], y=config['y_col'],
                        color=config['color_col']
                    )
                else:
                    st.info(f"'{plot_type}' is not implemented yet.")
                    return None
            except Exception as e:
                st.error(f"Plot generation error: {str(e)}")
                return None
        # Generate and display plot
        fig = generate_plot(df, plot_type, config)
        if fig:
            st.plotly_chart(fig, use_container_width=True)

        # Plot Statistics
        with st.expander("📊 Plot Statistics"):
            if plot_type in ["Histogram", "Box Plot", "Violin Plot"]:
                st.write(f"**{config['x_col']} Statistics**")
                st.table(df[config['x_col']].describe())
            if plot_type in ["Scatter Plot", "Line Plot"]:
                if pd.api.types.is_numeric_dtype(df[config['x_col']]) and pd.api.types.is_numeric_dtype(df[config['y_col']]):
                    st.write(f"**Correlation between {config['x_col']} and {config['y_col']}**")
                    corr = df[[config['x_col'], config['y_col']]].corr().iloc[0, 1]
                    st.metric("Pearson Correlation", f"{corr:.2f}")
            if plot_type == "Heatmap":
                st.write("**Correlation Matrix**")
                numeric_df = df.select_dtypes(include=np.number)
                st.dataframe(numeric_df.corr())

    # Data Summary Section
    st.header("📝 Data Summary")
    with st.expander("Show Data Summary"):
        col1, col2 = st.columns(2)
        with col1:
            st.write("**Data Shape**")
            st.write(f"Rows: {df.shape[0]}")
            st.write(f"Columns: {df.shape[1]}")
        with col2:
            st.write("**Data Types**")
            st.dataframe(df.dtypes.reset_index().rename(columns={
                'index': 'Column', 0: 'Type'
            }))
        st.write("**Sample Data**")
        st.dataframe(df.head())
# Model Training Section
elif app_mode == "Model Training":
    st.title("🚂 Model Training Studio")
    st.markdown("""
    **Train and Evaluate Machine Learning Models** with advanced hyperparameter tuning and performance tracking.
    Choose from a wide range of algorithms and configurations.
    """)

    # Initialize session state variables
    if 'model' not in st.session_state:
        st.session_state.model = None
    if 'preprocessor' not in st.session_state:
        st.session_state.preprocessor = None
    if 'X_train_selected' not in st.session_state:
        st.session_state.X_train_selected = None
    if 'X_test_selected' not in st.session_state:
        st.session_state.X_test_selected = None
    if 'y_train' not in st.session_state:
        st.session_state.y_train = None
    if 'y_test' not in st.session_state:
        st.session_state.y_test = None

    if 'cleaned_data' not in st.session_state or st.session_state.cleaned_data is None:
        st.warning("Please clean your data in the Smart Cleaning section first.")
        st.stop()
    df = st.session_state.cleaned_data.copy()

    # Target Variable Selection
    st.subheader("🎯 Target Variable")
    target_column = st.selectbox("Select Target Variable", df.columns, help="Choose the column to predict.")

    # Problem Type Selection
    st.subheader("📝 Problem Type")
    problem_type = st.radio("Select Problem Type", ["Regression", "Classification"], help="Choose the type of machine learning problem.")

    # Feature Selection
    st.subheader("🔧 Feature Selection")
    use_all_features = st.checkbox("Use All Features", value=True, help="Select to use all features for training. Deselect to manually choose features.")
    if use_all_features:
        feature_columns = df.drop(columns=[target_column]).columns.tolist()
    else:
        feature_columns = st.multiselect("Select Feature Columns", df.drop(columns=[target_column]).columns, help="Choose the features you want to use for prediction.")
    # Model Selection
    st.subheader("🤖 Model Selection")
    if problem_type == "Regression":
        model_options = ["Linear Regression", "Decision Tree", "Random Forest", "Gradient Boosting", "SVM", "Neural Network"]
    else:  # Classification
        model_options = ["Logistic Regression", "Decision Tree", "Random Forest", "Gradient Boosting", "SVM", "Neural Network", "KNN", "Naive Bayes"]
    model_name = st.selectbox("Select Model", model_options, help="Choose a model.")

    # Hyperparameter Tuning
    st.subheader("🎛️ Hyperparameter Tuning")
    with st.expander("Configure Hyperparameters", expanded=True):
        if model_name == "Random Forest":
            n_estimators = st.slider("Number of Estimators", 10, 200, 100)
            max_depth = st.slider("Max Depth", 3, 20, 10)
            min_samples_split = st.slider("Min Samples Split", 2, 10, 2)
            min_samples_leaf = st.slider("Min Samples Leaf", 1, 10, 1)
            hyperparams = {
                'n_estimators': n_estimators,
                'max_depth': max_depth,
                'min_samples_split': min_samples_split,
                'min_samples_leaf': min_samples_leaf
            }
        elif model_name == "Gradient Boosting":
            learning_rate = st.slider("Learning Rate", 0.01, 1.0, 0.1)
            n_estimators = st.slider("Number of Estimators", 10, 200, 100)
            max_depth = st.slider("Max Depth", 3, 20, 10)
            hyperparams = {
                'learning_rate': learning_rate,
                'n_estimators': n_estimators,
                'max_depth': max_depth
            }
        elif model_name == "Neural Network":
            hidden_layers = st.slider("Number of Hidden Layers", 1, 5, 2)
            neurons_per_layer = st.slider("Neurons per Layer", 10, 200, 50)
            epochs = st.slider("Epochs", 10, 200, 50)
            batch_size = st.slider("Batch Size", 16, 128, 32)
            hyperparams = {
                'hidden_layers': hidden_layers,
                'neurons_per_layer': neurons_per_layer,
                'epochs': epochs,
                'batch_size': batch_size
            }
        else:
            hyperparams = {}

    # Train-Test Split
    st.subheader("✂️ Train-Test Split")
    test_size = st.slider("Test Size", 0.1, 0.5, 0.2, help="Proportion of the dataset to include in the test split.")
    # Model Training
    if st.button("🚀 Train Model"):
        with st.spinner("Training model..."):
            try:
                X = df[feature_columns]
                y = df[target_column]

                # Check if X is empty
                if X.empty:
                    st.error("No features were selected. Please select feature columns.")
                    st.stop()

                # Train-Test Split
                X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=42)

                # Preprocessing Pipeline
                numeric_features = X.select_dtypes(include=np.number).columns
                categorical_features = X.select_dtypes(exclude=np.number).columns
                numeric_transformer = Pipeline(steps=[
                    ('imputer', SimpleImputer(strategy='median')),
                    ('scaler', StandardScaler())
                ])
                categorical_transformer = Pipeline(steps=[
                    ('imputer', SimpleImputer(strategy='most_frequent')),
                    ('onehot', OneHotEncoder(handle_unknown='ignore'))
                ])
                preprocessor = ColumnTransformer(
                    transformers=[
                        ('num', numeric_transformer, numeric_features),
                        ('cat', categorical_transformer, categorical_features)
                    ])
                X_train_processed = preprocessor.fit_transform(X_train)
                X_test_processed = preprocessor.transform(X_test)

                # Model Training
                if model_name == "Linear Regression":
                    model = LinearRegression()
                elif model_name == "Logistic Regression":
                    model = LogisticRegression(max_iter=1000)
                elif model_name == "Decision Tree":
                    if problem_type == "Regression":
                        model = DecisionTreeRegressor()
                    else:
                        model = DecisionTreeClassifier()
                elif model_name == "Random Forest":
                    if problem_type == "Regression":
                        model = RandomForestRegressor(**hyperparams)
                    else:
                        model = RandomForestClassifier(**hyperparams)
                elif model_name == "Gradient Boosting":
                    if problem_type == "Regression":
                        model = GradientBoostingRegressor(**hyperparams)
                    else:
                        model = GradientBoostingClassifier(**hyperparams)
                elif model_name == "SVM":
                    if problem_type == "Regression":
                        model = SVR()
                    else:
                        model = SVC()
                elif model_name == "Neural Network":
                    if problem_type == "Regression":
                        model = MLPRegressor(
                            hidden_layer_sizes=[hyperparams['neurons_per_layer']] * hyperparams['hidden_layers'],
                            max_iter=hyperparams['epochs'],
                            batch_size=hyperparams['batch_size']
                        )
                    else:
                        model = MLPClassifier(
                            hidden_layer_sizes=[hyperparams['neurons_per_layer']] * hyperparams['hidden_layers'],
                            max_iter=hyperparams['epochs'],
                            batch_size=hyperparams['batch_size']
                        )
                elif model_name == "KNN":
                    model = KNeighborsClassifier()
                elif model_name == "Naive Bayes":
                    model = GaussianNB()

                # Train the model
                model.fit(X_train_processed, y_train)

                # Store model, preprocessor, and the raw feature list
                # (the Predictions page needs the raw column names)
                st.session_state.model = Pipeline(steps=[('preprocessor', preprocessor), ('model', model)])
                st.session_state.preprocessor = preprocessor
                st.session_state.feature_columns = feature_columns

                # Store the test data
                st.session_state.X_train_selected = X_train_processed
                st.session_state.X_test_selected = X_test_processed
                st.session_state.y_train = y_train
                st.session_state.y_test = y_test
                # Model Evaluation
                y_pred = model.predict(X_test_processed)
                if problem_type == "Regression":
                    mse = mean_squared_error(y_test, y_pred)
                    rmse = np.sqrt(mse)
                    mae = mean_absolute_error(y_test, y_pred)
                    r2 = r2_score(y_test, y_pred)
                    st.write(f"Mean Squared Error: {mse:.4f}")
                    st.write(f"Root Mean Squared Error: {rmse:.4f}")
                    st.write(f"Mean Absolute Error: {mae:.4f}")
                    st.write(f"R-squared: {r2:.4f}")
                else:
                    accuracy = accuracy_score(y_test, y_pred)
                    precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
                    recall = recall_score(y_test, y_pred, average='weighted', zero_division=0)
                    f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)
                    st.write(f"Accuracy: {accuracy:.4f}")
                    st.write(f"Precision: {precision:.4f}")
                    st.write(f"Recall: {recall:.4f}")
                    st.write(f"F1 Score: {f1:.4f}")
                    st.write("Classification Report:")
                    st.text(classification_report(y_test, y_pred))

                # Visualization
                st.subheader("📊 Model Performance Visualization")
                if problem_type == "Regression":
                    fig, ax = plt.subplots()
                    ax.scatter(y_test, y_pred)
                    ax.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'k--', lw=2)
                    ax.set_xlabel('Actual')
                    ax.set_ylabel('Predicted')
                    ax.set_title('Actual vs Predicted')
                    st.pyplot(fig)
                else:
                    conf_matrix = confusion_matrix(y_test, y_pred)
                    fig, ax = plt.subplots()
                    sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', ax=ax)
                    ax.set_xlabel('Predicted Labels')
                    ax.set_ylabel('True Labels')
                    ax.set_title('Confusion Matrix')
                    st.pyplot(fig)

                st.success("Model trained successfully!")
            except Exception as e:
                st.error(f"An error occurred during training: {e}")
    if st.session_state.model is not None:
        st.subheader("💾 Save Model")
        model_filename = st.text_input("Enter Model Filename (without extension)", "trained_model")
        if st.button("Save Model"):
            try:
                joblib.dump(st.session_state.model, f"{model_filename}.joblib")
                st.success(f"Model saved as {model_filename}.joblib")
            except Exception as e:
                st.error(f"Error saving model: {e}")
    else:
        st.warning("No trained model available. Train a model first to enable saving.")
# Visualization Lab Section
elif app_mode == "Visualization Lab":
    st.title("🔬 Visualization Lab")
    st.markdown("""
    **Explore and Visualize Your Data** with advanced plotting tools and interactive visualizations.
    Uncover hidden patterns and relationships in your data.
    """)

    if 'cleaned_data' not in st.session_state or st.session_state.cleaned_data is None:
        st.warning("Please clean your data in the Smart Cleaning section first.")
        st.stop()

    df = st.session_state.cleaned_data.copy()

    # Visualization Type Selection
    st.subheader("📊 Choose Visualization Type")
    plot_types = [
        "Histogram", "Scatter Plot", "Box Plot", "Violin Plot",
        "Correlation Heatmap", "Parallel Coordinates", "Andrews Curves",
        "Pair Plot", "Density Contour", "3D Scatter", "Time Series",
        "Sunburst Chart", "Funnel Chart", "Clustering Analysis"
    ]
    plot_type = st.selectbox("Select Visualization Type", plot_types)

    # Dynamic Controls Based on Plot Type (pre-initialized so later branches
    # never hit an undefined axis variable)
    x_col = y_col = z_col = None
    if plot_type not in ["Correlation Heatmap", "Parallel Coordinates", "Pair Plot"]:
        x_col = st.selectbox("X Axis", df.columns)
    if plot_type in ["Scatter Plot", "Box Plot", "Violin Plot", "Time Series", "3D Scatter", "Histogram", "Density Contour"]:
        y_col = st.selectbox("Y Axis", df.columns)
    if plot_type == "3D Scatter":
        z_col = st.selectbox("Z Axis", df.columns)
    color_col = st.selectbox("Color by", [None] + list(df.columns))

    # Advanced Plot Customization
    with st.expander("🎨 Advanced Customization", expanded=False):
        color_palette = st.selectbox("Color Palette", ["Viridis", "Plasma", "Magma", "Cividis", "RdBu", "Rainbow"])
        hover_data_cols = st.multiselect("Hover Data", df.columns)
    # Plot Generation
    try:
        fig = None
        if plot_type == "Histogram":
            fig = px.histogram(
                df, x=x_col, y=y_col,
                nbins=30, template="plotly_dark"
            )
        elif plot_type == "Scatter Plot":
            fig = px.scatter(
                df, x=x_col, y=y_col,
                color=color_col,
                hover_data=hover_data_cols
            )
        elif plot_type == "3D Scatter":
            fig = px.scatter_3d(
                df, x=x_col, y=y_col, z=z_col,
                color=color_col
            )
        elif plot_type == "Correlation Heatmap":
            numeric_df = df.select_dtypes(include=np.number)
            if not numeric_df.empty:
                corr = numeric_df.corr()
                fig = px.imshow(
                    corr, text_auto=True,
                    color_continuous_scale=color_palette
                )
            else:
                st.warning("No numerical columns found for correlation heatmap.")
        elif plot_type == "Box Plot":
            fig = px.box(
                df, x=x_col, y=y_col,
                color=color_col
            )
        elif plot_type == "Violin Plot":
            fig = px.violin(
                df, x=x_col, y=y_col,
                box=True, points="all",
                color=color_col
            )
        elif plot_type == "Time Series":
            df = df.sort_values(by=x_col)
            fig = px.line(
                df, x=x_col, y=y_col,
                color=color_col
            )
elif plot_type == "Scatter Matrix": | |
fig = px.scatter_matrix( | |
df, dimensions=[x_col, y_col], | |
color=color_col | |
) | |
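        elif plot_type == "Parallel Coordinates":
            # Hedged sketches for menu entries that previously had no branch;
            # any remaining unimplemented types fall through to the notice below.
            fig = px.parallel_coordinates(
                df.select_dtypes(include=np.number),
                color_continuous_scale=color_palette
            )
        elif plot_type == "Density Contour":
            fig = px.density_contour(df, x=x_col, y=y_col, color=color_col)
        elif plot_type == "Sunburst Chart":
            fig = px.sunburst(df, path=[x_col])  # segment sizes are row counts
        else:
            st.info(f"'{plot_type}' is not implemented yet.")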
        if fig:
            st.plotly_chart(fig, use_container_width=True)
    except Exception as e:
        st.error(f"An error occurred while generating the plot: {e}")
    # Statistical Analysis Section
    with st.expander("📊 Statistical Analysis", expanded=True):
        analysis_type = st.selectbox("Select Analysis Type", [
            "Descriptive Statistics",
            "Correlation Analysis",
            "Hypothesis Testing",
            "Distribution Fitting"
        ])
        if analysis_type == "Descriptive Statistics":
            st.write(df.describe(include='all'))
        elif analysis_type == "Correlation Analysis":
            numeric_cols = df.select_dtypes(include=np.number).columns
            if len(numeric_cols) >= 2:
                corr_method = st.selectbox("Correlation Method", [
                    "Pearson", "Kendall", "Spearman"
                ])
                corr_matrix = df[numeric_cols].corr(method=corr_method.lower())
                st.write(corr_matrix)
                # st.heatmap does not exist; render the heatmap with Plotly instead
                st.plotly_chart(px.imshow(corr_matrix, text_auto=True,
                                          color_continuous_scale=color_palette))
            else:
                st.warning("Need at least 2 numeric columns for correlation analysis")
elif analysis_type == "Hypothesis Testing": | |
test_type = st.selectbox("Select Test Type", [ | |
"T-test", "Chi-Squared Test", "ANOVA", "Mann-Whitney U" | |
]) | |
if test_type == "T-test": | |
col1 = st.selectbox("Column 1 (Numeric)", df.select_dtypes(include=np.number).columns) | |
col2 = st.selectbox("Column 2 (Categorical)", df.select_dtypes(include='object').columns) | |
if st.button("Run T-test"): | |
groups = df.groupby(col2)[col1].apply(list) | |
if len(groups) == 2: | |
t_stat, p_value = stats.ttest_ind(groups.iloc[0], groups.iloc[1]) | |
st.write(f"T-statistic: {t_stat:.4f}") | |
st.write(f"P-value: {p_value:.4f}") | |
if p_value < 0.05: | |
st.write("Reject the null hypothesis.") | |
else: | |
st.write("Fail to reject the null hypothesis.") | |
else: | |
st.write("Select a categorical column with exactly two categories.") | |
elif analysis_type == "Distribution Fitting": | |
numeric_col = st.selectbox("Select Numeric Column", df.select_dtypes(include=np.number).columns) | |
dist_types = ["Normal", "Log-Normal", "Exponential", "Gamma"] | |
selected_dist = st.selectbox("Select Distribution Type", dist_types) | |
if st.button("Fit Distribution"): | |
from scipy.stats import norm, lognorm, expon, gamma | |
dist_functions = { | |
"Normal": norm, | |
"Log-Normal": lognorm, | |
"Exponential": expon, | |
"Gamma": gamma | |
} | |
params = dist_functions[selected_dist].fit(df[numeric_col].dropna()) | |
st.write(f"Fitted Parameters: {params}") | |
# Data Profiling Section | |
with st.expander("📝 Generate Full Data Profile", expanded=False): | |
if st.button("🚀 Generate Comprehensive Report"): | |
with st.spinner("Generating report..."): | |
pr = ProfileReport(df, explorative=True) | |
st_profile_report(pr) | |
# Insights Section
elif app_mode == "Insights":
    st.title("📊 Model Insights & Explainability")
    st.markdown("""
    **Understand and Interpret Your Model** with advanced explainability tools and visualizations.
    Gain deeper insights into model behavior and predictions.
    """)

    if 'model' not in st.session_state or st.session_state.model is None:
        st.warning("Please train a model in the Model Training section first.")
        st.stop()

    model = st.session_state.model.named_steps['model']  # the trained estimator
    preprocessor = st.session_state.model.named_steps['preprocessor']

    # Model Summary (every estimator has predict(), so the original
    # hasattr(model, 'predict') check always reported "Regression")
    st.subheader("📝 Model Summary")
    st.write(f"**Model Type:** {type(model).__name__}")
    st.write(f"**Problem Type:** {'Classification' if is_classifier(model) else 'Regression'}")
    st.write(f"**Report Generated:** {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    # Feature Importance
    st.subheader("🔍 Feature Importance")
    if hasattr(model, 'feature_importances_'):
        importances = model.feature_importances_
        feature_names = preprocessor.get_feature_names_out()
        importance_df = pd.DataFrame({
            'Feature': feature_names,
            'Importance': importances
        }).sort_values('Importance', ascending=False)
        fig, ax = plt.subplots()
        sns.barplot(x='Importance', y='Feature', data=importance_df.head(10), ax=ax)
        ax.set_title('Top 10 Feature Importances')
        st.pyplot(fig)
    else:
        st.info("Feature importance not available for this model type.")

    # SHAP Values
    st.subheader("📊 SHAP Values")
    if st.checkbox("Calculate SHAP Values (Warning: May be slow for large datasets)"):
        try:
            import shap
            explainer = shap.TreeExplainer(model)  # tree-based models only
            shap_values = explainer.shap_values(st.session_state.X_test_selected)

            # Summary Plot
            st.write("### Summary Plot")
            fig, ax = plt.subplots()
            shap.summary_plot(shap_values, st.session_state.X_test_selected,
                              feature_names=preprocessor.get_feature_names_out(), show=False)
            st.pyplot(fig)

            # Force Plot for Individual Predictions
            st.write("### Individual Prediction Explanation")
            sample_idx = st.slider("Select Sample Index", 0, st.session_state.X_test_selected.shape[0] - 1, 0)
            fig, ax = plt.subplots()
            shap.force_plot(explainer.expected_value, shap_values[sample_idx],
                            st.session_state.X_test_selected[sample_idx],
                            feature_names=preprocessor.get_feature_names_out(),
                            matplotlib=True, show=False)
            st.pyplot(fig)
        except Exception as e:
            st.error(f"SHAP calculation failed: {e}")
    # Partial Dependence Plots
    st.subheader("📈 Partial Dependence Plots")
    feature_names_out = list(preprocessor.get_feature_names_out())
    feature_to_plot = st.selectbox("Select Feature for PDP", feature_names_out)
    if st.button("Generate PDP"):
        from sklearn.inspection import PartialDependenceDisplay
        fig, ax = plt.subplots()
        PartialDependenceDisplay.from_estimator(
            model, st.session_state.X_test_selected,
            features=[feature_names_out.index(feature_to_plot)],  # index into the processed array
            feature_names=feature_names_out,
            ax=ax
        )
        st.pyplot(fig)
    # Model Performance Over Time
    st.subheader("⏳ Model Performance Over Time")
    if st.checkbox("Track Performance Over Time"):
        # Records a single snapshot per run; a persistent history would need
        # to accumulate in st.session_state.
        y_pred = model.predict(st.session_state.X_test_selected)
        if is_classifier(model):
            metric_name, metric_value = 'Accuracy', accuracy_score(st.session_state.y_test, y_pred)
        else:
            metric_name, metric_value = 'MSE', mean_squared_error(st.session_state.y_test, y_pred)
        performance_df = pd.DataFrame({
            'timestamp': [datetime.now()],
            'metric': [metric_name],
            'value': [metric_value]
        })
        st.line_chart(performance_df.set_index('timestamp')['value'])
    # Model Debugging
    st.subheader("🐛 Model Debugging")
    if st.checkbox("Enable Debug Mode"):
        st.write("### Model Parameters")
        st.json(model.get_params() if hasattr(model, 'get_params') else {})
        st.write("### Training Data Summary")
        st.write(f"Number of Samples: {st.session_state.X_train_selected.shape[0]}")
        st.write(f"Number of Features: {st.session_state.X_train_selected.shape[1]}")

    # Export Insights
    st.subheader("💾 Export Insights")
    if st.button("Export Insights as PDF"):
        try:
            from fpdf import FPDF
            pdf = FPDF()
            pdf.add_page()
            pdf.set_font("Arial", size=12)
            pdf.cell(200, 10, txt="Model Insights Report", ln=True, align='C')
            pdf.cell(200, 10, txt=f"Model Type: {type(model).__name__}", ln=True)
            pdf.cell(200, 10, txt=f"Problem Type: {'Classification' if is_classifier(model) else 'Regression'}", ln=True)
            pdf.output("model_insights.pdf")
            st.success("Insights exported successfully!")
        except Exception as e:
            st.error(f"Export failed: {e}")
# Predictions Section
elif app_mode == "Predictions":
    st.title("🔮 Prediction Studio")
    st.markdown("""
    **Make Predictions** with your trained model and explore prediction explanations.
    Generate batch predictions and export results.
    """)

    if 'model' not in st.session_state or st.session_state.model is None:
        st.warning("Please train a model in the Model Training section first.")
        st.stop()

    model = st.session_state.model.named_steps['model']
    preprocessor = st.session_state.model.named_steps['preprocessor']

    # Single Prediction
    st.subheader("🎯 Single Prediction")
    input_data = {}
    # Use the raw feature names stored at training time: the preprocessor's
    # get_feature_names_out() returns transformed names (e.g. 'num__age'),
    # which never match the raw columns.
    feature_names = st.session_state.get('feature_columns', [])
    if not feature_names:
        st.warning("No stored feature list found; retrain a model to enable single predictions.")
    for feature in feature_names:
        if pd.api.types.is_numeric_dtype(st.session_state.cleaned_data[feature]):
            input_data[feature] = st.number_input(f"Enter {feature}", value=float(st.session_state.cleaned_data[feature].mean()))
        else:
            input_data[feature] = st.selectbox(f"Select {feature}", st.session_state.cleaned_data[feature].unique())

    # Declared before the button: widgets created inside a button's if-block
    # reset on the next rerun, so a checkbox there would never take effect
    show_shap = st.checkbox("Show SHAP Explanation")
    if st.button("Make Prediction"):
        try:
            input_df = pd.DataFrame([input_data])
            input_processed = preprocessor.transform(input_df)
            prediction = model.predict(input_processed)[0]
            st.write(f"**Prediction:** {prediction}")
            if hasattr(model, 'predict_proba'):
                probabilities = model.predict_proba(input_processed)[0]
                st.write("**Prediction Probabilities:**")
                st.bar_chart(probabilities)

            # SHAP Explanation
            if show_shap:
                try:
                    import shap
                    explainer = shap.TreeExplainer(model)
                    shap_values = explainer.shap_values(input_processed)
                    st.write("### SHAP Values")
                    fig, ax = plt.subplots()
                    shap.force_plot(explainer.expected_value, shap_values, input_processed,
                                    feature_names=preprocessor.get_feature_names_out(),
                                    matplotlib=True, show=False)
                    st.pyplot(fig)
                except Exception as e:
                    st.error(f"SHAP calculation failed: {e}")
        except Exception as e:
            st.error(f"Prediction failed: {e}")
    # Batch Predictions
    st.subheader("📂 Batch Predictions")
    batch_file = st.file_uploader("Upload CSV for Batch Predictions", type=["csv"])
    if batch_file is not None:
        try:
            batch_df = pd.read_csv(batch_file)
            batch_processed = preprocessor.transform(batch_df)
            batch_predictions = np.asarray(model.predict(batch_processed))
            if batch_predictions.ndim > 1:  # Keras output: (n, 1) or (n, classes)
                batch_predictions = (batch_predictions.ravel() if batch_predictions.shape[1] == 1
                                     else batch_predictions.argmax(axis=1))
            batch_df['Prediction'] = batch_predictions
            if hasattr(model, 'predict_proba'):
                probabilities = model.predict_proba(batch_processed)
                for i in range(probabilities.shape[1]):
                    batch_df[f'Probability_Class_{i}'] = probabilities[:, i]
            st.write("### Predictions Preview")
            st.dataframe(batch_df.head())

            # Download Predictions
            csv = batch_df.to_csv(index=False)
            b64 = base64.b64encode(csv.encode()).decode()
            href = f'<a href="data:file/csv;base64,{b64}" download="predictions.csv">Download Predictions CSV</a>'
            st.markdown(href, unsafe_allow_html=True)
        except Exception as e:
            st.error(f"Batch prediction failed: {e}")
    # Prediction Analysis
    st.subheader("📊 Prediction Analysis")
    if st.checkbox("Analyze Predictions"):
        try:
            y_pred = np.asarray(model.predict(st.session_state.X_test_selected))
            if y_pred.ndim > 1:  # flatten Keras-style output
                y_pred = y_pred.argmax(axis=1) if y_pred.shape[1] > 1 else y_pred.ravel()
            y_test = st.session_state.y_test
            if not is_classifier(model):
                fig, ax = plt.subplots()
                ax.scatter(y_test, y_pred)
                ax.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'k--', lw=2)
                ax.set_xlabel('Actual')
                ax.set_ylabel('Predicted')
                ax.set_title('Actual vs Predicted')
                st.pyplot(fig)
            else:
                conf_matrix = confusion_matrix(y_test, y_pred)
                fig, ax = plt.subplots()
                sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', ax=ax)
                ax.set_xlabel('Predicted Labels')
                ax.set_ylabel('True Labels')
                ax.set_title('Confusion Matrix')
                st.pyplot(fig)
        except Exception as e:
            st.error(f"Prediction analysis failed: {e}")
    # Prediction Export
    st.subheader("💾 Export Predictions")
    if st.button("Export Predictions as PDF"):
        try:
            from fpdf import FPDF
            pdf = FPDF()
            pdf.add_page()
            pdf.set_font("Arial", size=12)
            pdf.cell(200, 10, txt="Predictions Report", ln=True, align='C')
            pdf.cell(200, 10, txt=f"Model Type: {type(model).__name__}", ln=True)
            pdf.cell(200, 10, txt=f"Problem Type: {'Classification' if is_classifier(model) else 'Regression'}", ln=True)
            pdf.output("predictions_report.pdf")
            st.success("Predictions exported successfully!")
        except Exception as e:
            st.error(f"Export failed: {e}")
# Neural Network Studio Section
elif app_mode == "Neural Network Studio":
    st.title("🧠 Neural Network Studio")
    st.markdown("""
    **Build and Train Neural Networks** with advanced configurations and visualizations.
    Explore deep learning models with ease.
    """)

    if 'cleaned_data' not in st.session_state or st.session_state.cleaned_data is None:
        st.warning("Please clean your data in the Smart Cleaning section first.")
        st.stop()

    df = st.session_state.cleaned_data.copy()

    # Target Variable Selection
    st.subheader("🎯 Target Variable")
    target_column = st.selectbox("Select Target Variable", df.columns, help="Choose the column to predict.")

    # Problem Type Selection
    st.subheader("📝 Problem Type")
    problem_type = st.radio("Select Problem Type", ["Regression", "Classification"], help="Choose the type of machine learning problem.")

    # Feature Selection
    st.subheader("🔧 Feature Selection")
    use_all_features = st.checkbox("Use All Features", value=True, help="Select to use all features for training. Deselect to manually choose features.")
    if use_all_features:
        feature_columns = df.drop(columns=[target_column]).columns.tolist()
    else:
        feature_columns = st.multiselect("Select Feature Columns", df.drop(columns=[target_column]).columns, help="Choose the features you want to use for prediction.")

    # Neural Network Configuration
    st.subheader("⚙️ Neural Network Configuration")
    with st.expander("Configure Neural Network", expanded=True):
        col1, col2 = st.columns(2)
        with col1:
            hidden_layers = st.slider("Number of Hidden Layers", 1, 5, 2)
            neurons_per_layer = st.slider("Neurons per Layer", 10, 200, 50)
            activation = st.selectbox("Activation Function",
                                      ["relu", "tanh", "sigmoid", "selu", "swish"])
            dropout_rate = st.slider("Dropout Rate", 0.0, 0.5, 0.2)
            initializer = st.selectbox("Weight Initializer",
                                       ["glorot_uniform", "he_normal", "lecun_uniform"])
        with col2:
            learning_rate = st.slider("Learning Rate", 0.0001, 0.1, 0.001, format="%.4f")
            optimizer_choice = st.selectbox("Optimizer",
                                            ["Adam", "Nadam", "RMSprop", "SGD"])
            batch_norm = st.checkbox("Batch Normalization", value=True)
            regularization = st.checkbox("L2 Regularization")
            epochs = st.slider("Epochs", 10, 200, 50)
            batch_size = st.slider("Batch Size", 16, 128, 32)
    # Train-Test Split
    st.subheader("✂️ Train-Test Split")
    test_size = st.slider("Test Size", 0.1, 0.5, 0.2, help="Proportion of the dataset to include in the test split.")

    # Advanced training options, declared before the button: widgets created
    # inside a button's if-block reset on the next rerun, so the original
    # placement (after compile, mid-training) could never take effect
    with st.expander("Advanced Training Options"):
        early_stopping = st.checkbox("Early Stopping", value=True)
        reduce_lr = st.checkbox("Reduce Learning Rate on Plateau")
        patience = st.slider("Patience Epochs", 5, 20, 10) if early_stopping else 0

    # Model Training
    if st.button("🚀 Train Neural Network"):
        with st.spinner("Training neural network..."):
            try:
                X = df[feature_columns]
                y = df[target_column]

                # Train-Test Split
                X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=42)

                # Integer-encode labels so sparse_categorical_crossentropy
                # works even when the target column holds strings
                if problem_type == "Classification":
                    label_encoder = LabelEncoder()
                    y_train = label_encoder.fit_transform(y_train)
                    y_test = label_encoder.transform(y_test)

                # Preprocessing Pipeline
                numeric_features = X.select_dtypes(include=np.number).columns
                categorical_features = X.select_dtypes(exclude=np.number).columns
                numeric_transformer = Pipeline(steps=[
                    ('imputer', SimpleImputer(strategy='median')),
                    ('scaler', StandardScaler())
                ])
                categorical_transformer = Pipeline(steps=[
                    ('imputer', SimpleImputer(strategy='most_frequent')),
                    ('onehot', OneHotEncoder(handle_unknown='ignore'))
                ])
                preprocessor = ColumnTransformer(
                    transformers=[
                        ('num', numeric_transformer, numeric_features),
                        ('cat', categorical_transformer, categorical_features)
                    ])
                X_train_processed = preprocessor.fit_transform(X_train)
                X_test_processed = preprocessor.transform(X_test)
                # Keras cannot train directly on the scipy sparse matrices
                # that OneHotEncoder may emit, so densify when needed
                if hasattr(X_train_processed, 'toarray'):
                    X_train_processed = X_train_processed.toarray()
                    X_test_processed = X_test_processed.toarray()
                # Build neural network with advanced features
                model = keras.Sequential()
                model.add(layers.Input(shape=(X_train_processed.shape[1],)))
                for _ in range(hidden_layers):
                    # Create configurable layers
                    layer_config = {
                        'units': neurons_per_layer,
                        'activation': activation,
                        'kernel_initializer': initializer
                    }
                    if regularization:
                        layer_config['kernel_regularizer'] = keras.regularizers.l2(0.01)
                    model.add(layers.Dense(**layer_config))
                    if batch_norm:
                        model.add(layers.BatchNormalization())
                    if dropout_rate > 0:
                        model.add(layers.Dropout(dropout_rate))

                # Output layer
                output_activation = 'linear' if problem_type == "Regression" else 'softmax'
                output_units = 1 if problem_type == "Regression" else len(np.unique(y_train))
                model.add(layers.Dense(output_units, activation=output_activation))

                # Configure optimizer
                optimizers = {
                    "Adam": keras.optimizers.Adam(learning_rate=learning_rate),
                    "Nadam": keras.optimizers.Nadam(learning_rate=learning_rate),
                    "RMSprop": keras.optimizers.RMSprop(learning_rate=learning_rate),
                    "SGD": keras.optimizers.SGD(learning_rate=learning_rate, momentum=0.9)
                }
                optimizer = optimizers[optimizer_choice]

                # Compile the model
                model.compile(optimizer=optimizer,
                              loss='mse' if problem_type == "Regression" else 'sparse_categorical_crossentropy',
                              metrics=['mae'] if problem_type == "Regression" else ['accuracy'])
                # Assemble callbacks from the options chosen above
                callbacks_list = []
                if early_stopping:
                    callbacks_list.append(
                        callbacks.EarlyStopping(patience=patience, restore_best_weights=True))
                if reduce_lr:
                    callbacks_list.append(
                        callbacks.ReduceLROnPlateau(factor=0.2, patience=max(patience // 2, 1)))

                # Train the model with callbacks
                history = model.fit(
                    X_train_processed, y_train,
                    epochs=epochs,
                    batch_size=batch_size,
                    validation_split=0.2,
                    callbacks=callbacks_list,
                    verbose=0
                )
                # Store model, preprocessor, and the raw feature list
                st.session_state.model = Pipeline(steps=[('preprocessor', preprocessor), ('model', model)])
                st.session_state.preprocessor = preprocessor
                st.session_state.feature_columns = feature_columns

                # Store the test data
                st.session_state.X_train_selected = X_train_processed
                st.session_state.X_test_selected = X_test_processed
                st.session_state.y_train = y_train
                st.session_state.y_test = y_test

                # Model Evaluation
                y_pred = model.predict(X_test_processed)
                if problem_type == "Classification":
                    y_pred = np.argmax(y_pred, axis=1)  # convert probabilities to class labels
                else:
                    y_pred = y_pred.ravel()  # flatten the (n, 1) regression output

                if problem_type == "Regression":
                    mse = mean_squared_error(y_test, y_pred)
                    rmse = np.sqrt(mse)
                    mae = mean_absolute_error(y_test, y_pred)
                    r2 = r2_score(y_test, y_pred)
                    st.write(f"Mean Squared Error: {mse:.4f}")
                    st.write(f"Root Mean Squared Error: {rmse:.4f}")
                    st.write(f"Mean Absolute Error: {mae:.4f}")
                    st.write(f"R-squared: {r2:.4f}")
                else:
                    accuracy = accuracy_score(y_test, y_pred)
                    precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
                    recall = recall_score(y_test, y_pred, average='weighted', zero_division=0)
                    f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)
                    st.write(f"Accuracy: {accuracy:.4f}")
                    st.write(f"Precision: {precision:.4f}")
                    st.write(f"Recall: {recall:.4f}")
                    st.write(f"F1 Score: {f1:.4f}")
                    st.write("Classification Report:")
                    st.text(classification_report(y_test, y_pred))
                # Visualization with multiple metrics
                st.subheader("📊 Training History")
                fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 4))

                # Plot loss
                ax1.plot(history.history['loss'], label='Train Loss')
                ax1.plot(history.history['val_loss'], label='Validation Loss')
                ax1.set_title('Loss Evolution')
                ax1.set_xlabel('Epoch')
                ax1.set_ylabel('Loss')
                ax1.legend()

                # Plot accuracy/metric
                if problem_type == "Classification":
                    ax2.plot(history.history['accuracy'], label='Train Accuracy')
                    ax2.plot(history.history['val_accuracy'], label='Validation Accuracy')
                    ax2.set_title('Accuracy Evolution')
                    ax2.set_ylabel('Accuracy')
                else:
                    ax2.plot(history.history['mae'], label='Train MAE')
                    ax2.plot(history.history['val_mae'], label='Validation MAE')
                    ax2.set_title('MAE Evolution')
                    ax2.set_ylabel('MAE')
                ax2.set_xlabel('Epoch')
                ax2.legend()
                st.pyplot(fig)

                st.success("Neural network trained successfully!")
            except Exception as e:
                st.error(f"An error occurred during training: {e}")
    # Model Saving
    if st.session_state.get('model') is not None:
        st.subheader("💾 Save Model")
        model_filename = st.text_input("Enter Model Filename (without extension)", "neural_network")
        if st.button("Save Model"):
            try:
                # Save the full Keras model (architecture + weights) in HDF5 format
                st.session_state.model.named_steps['model'].save(f"{model_filename}.h5")
                st.success(f"Model saved as {model_filename}.h5")
            except Exception as e:
                st.error(f"Error saving model: {e}")