| import pandas as pd
|
| import numpy as np
|
| import gradio as gr
|
| from pathlib import Path
|
|
|
| ROUND = 4  # Decimal places used when formatting summary statistics as text.
|
|
|
def load_dataset(file):
    """
    Load a CSV or Excel file into a DataFrame.

    Args:
        file: ``None``, a filesystem path (``str`` or ``Path``), or an object
            whose ``name`` attribute holds the path of the file to read.

    Returns:
        A ``(df, status_message)`` tuple. ``df`` is ``None`` when nothing was
        uploaded, the extension is not supported, or reading failed; the
        message describes the outcome in every case.
    """
    if file is None:
        return None, "No file uploaded."

    try:
        # Accept plain paths as well as upload objects exposing ``.name``.
        if isinstance(file, (str, Path)):
            path = Path(file)
        else:
            path = Path(file.name)

        # Compare case-insensitively so "DATA.CSV" or "Report.XLSX" load too.
        suffix = path.suffix.lower()

        if suffix == ".csv":
            df = pd.read_csv(path)
        elif suffix in (".xlsx", ".xls"):
            df = pd.read_excel(path)
        else:
            return None, "Unsupported file format."

        return df, f"Loaded dataset with {df.shape[0]} rows and {df.shape[1]} columns."

    except Exception as e:
        # Deliberately broad: any failure is turned into a status message
        # for the caller instead of propagating as an exception.
        return None, f"Error loading file: {e}"
|
|
|
|
|
def dataset_summary(df: pd.DataFrame, decimals=None):
    """
    Build a per-variable descriptive-statistics table.

    Args:
        df: Data to summarise, or ``None``.
        decimals: Number of decimal places used when formatting the
            statistics as text. Defaults to the module-level ``ROUND``.

    Returns:
        ``None`` when ``df`` is ``None``; otherwise a DataFrame with one row
        per column of ``df`` and, where available, the columns ``variable``,
        ``count``, ``unique``, ``mean``, ``std``, ``min``, ``25%``, ``50%``,
        ``75%`` and ``max``. Statistics other than ``count`` and ``unique``
        are rendered as fixed-point strings when they are ``int``/``float``
        values; other values are left untouched.
    """
    if df is None:
        return None

    # ``describe`` raises ValueError on a frame without columns, so return an
    # empty summary instead of crashing.
    if df.shape[1] == 0:
        return pd.DataFrame(columns=["variable", "count", "unique"])

    places = ROUND if decimals is None else decimals

    summary = (
        df.describe(include="all")
        .transpose()
        .reset_index()
        .rename(columns={"index": "variable"})
    )

    # ``describe`` leaves ``unique`` empty for numeric columns (or omits it
    # entirely), so fill it for every variable. Rows of ``summary`` follow the
    # column order of ``df``, hence the positional assignment.
    summary["unique"] = df.nunique(dropna=True).values

    desired_order = [
        "variable",
        "count",
        "unique",
        "mean",
        "std",
        "min",
        "25%",
        "50%",
        "75%",
        "max",
    ]
    # Keep only the wanted statistics, in display order; the explicit copy
    # makes sure the column writes below never target a view.
    summary = summary[[c for c in desired_order if c in summary.columns]].copy()

    def _format(value):
        # NaN is a float, so statistics that do not apply to a variable are
        # rendered as the text "nan".
        if isinstance(value, (int, float)):
            return f"{value:.{places}f}"
        return value

    for col in summary.columns:
        if col not in ("variable", "count", "unique"):
            summary[col] = summary[col].apply(_format)

    return summary
|
|
|
|
|
def variable_types(df):
    """
    List the dtype of every column in ``df``.

    Returns:
        ``None`` when ``df`` is ``None``; otherwise a two-column DataFrame
        with ``Variable`` (column label) and ``Type`` (its dtype).
    """
    if df is None:
        return None

    # ``reset_index`` turns the dtype Series into a frame whose columns are
    # labelled "index" and 0; give them readable names.
    dtype_table = df.dtypes.reset_index()
    readable_names = {"index": "Variable", 0: "Type"}
    return dtype_table.rename(columns=readable_names)
|
|
|
|
|
def column_choices_single(cols: list[str]):
    """Return a Gradio update offering ``cols`` as choices with no value selected."""
    settings = {"choices": cols, "value": None}
    return gr.update(**settings)
|
|
|
|
|
def column_choices_multi(cols: list[str]):
    """Return a Gradio update offering ``cols`` as choices with an empty selection list."""
    settings = {"choices": cols, "value": []}
    return gr.update(**settings)
|
|
|
|
|
def category_value_choices(df, col):
    """
    Build a Gradio update listing the distinct values of one column.

    Args:
        df: Loaded DataFrame, or ``None``.
        col: Label of the column whose values should be offered, or ``None``.

    Returns:
        A ``gr.update`` payload. When there is no data, no column, or the
        column is not in ``df``, the update hides the component and clears
        its choices. Otherwise it shows the component with the column's
        sorted, non-missing unique values as choices and nothing selected.
    """
    if df is None or col is None or col not in df.columns:
        return gr.update(visible=False, choices=[], value=[])

    distinct = df[col].dropna().unique().tolist()
    try:
        values = sorted(distinct)
    except TypeError:
        # Columns mixing types (e.g. int and str) cannot be compared
        # directly; fall back to ordering by their text representation.
        values = sorted(distinct, key=str)

    return gr.update(
        visible=True,
        choices=values,
        value=[],
    )
|
|
|
|
|
def infer_column_types(df: pd.DataFrame):
    """
    Split the columns of ``df`` into numeric and categorical names.

    A column is numeric when its dtype is a NumPy number type; every other
    column is treated as categorical.

    Args:
        df: DataFrame to inspect, or ``None``.

    Returns:
        ``(numeric_cols, categorical_cols)``, each a sorted list of column
        labels. Both lists are empty when ``df`` is ``None``.
    """
    # Mirror the other helpers in this module, which tolerate a missing frame.
    if df is None:
        return [], []

    numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
    categorical_cols = df.select_dtypes(exclude=[np.number]).columns.tolist()

    return sorted(numeric_cols), sorted(categorical_cols)
|
|
|
|
|
def apply_category_filters(
    df,
    cat_cols,
    val1,
    val2,
    val3,
):
    """
    Keep only the rows whose categorical values match the selections.

    The first three entries of ``cat_cols`` are paired, in order, with
    ``val1``, ``val2`` and ``val3``; a pair with an empty selection is
    ignored.

    Returns:
        ``(filtered_df, status_message)``. ``filtered_df`` is ``None`` when
        no data is loaded and a copy of ``df`` when no filter is selected.
    """
    selections = (val1, val2, val3)

    if df is None:
        return None, "❌ No data loaded."

    if not (cat_cols and any(selections)):
        return df.copy(), "⚠️ No filters selected. Using full dataset."

    result = df.copy()
    for column, wanted in zip(cat_cols[:3], selections):
        if not wanted:
            continue
        keep = result[column].isin(wanted)
        result = result[keep]

    remaining = len(result)
    return result, f"✅ Filter applied. Rows remaining: {remaining}"
|
|
|
def reclassify_as_categorical(state, column):
    """
    Move ``column`` from the numeric list of ``state`` to its categorical list.

    On success the column is removed from ``state.numeric_cols``, appended to
    ``state.categorical_cols``, and ``state.active_filters`` is reset to an
    empty dict.

    Returns:
        ``(changed, message)`` where ``changed`` is ``True`` only when the
        column was listed as numeric and has been moved.
    """
    if not column or column not in state.numeric_cols:
        return False, f"Column '{column}' is not numeric."

    state.numeric_cols.remove(column)
    state.categorical_cols.append(column)
    # Clear the active filters whenever a column changes kind.
    state.active_filters = {}
    return True, f"Column '{column}' reclassified as categorical."
|
|
|
|
|
def reclassify_as_numeric(state, column):
    """
    Move ``column`` from the categorical list of ``state`` to its numeric list.

    On success the column is removed from ``state.categorical_cols``, appended
    to ``state.numeric_cols``, and ``state.active_filters`` is reset to an
    empty dict.

    Returns:
        ``(changed, message)`` where ``changed`` is ``True`` only when the
        column was listed as categorical and has been moved.
    """
    if not column or column not in state.categorical_cols:
        return False, f"Column '{column}' is not categorical."

    state.categorical_cols.remove(column)
    state.numeric_cols.append(column)
    # Clear the active filters whenever a column changes kind.
    state.active_filters = {}
    return True, f"Column '{column}' reclassified as numeric."
|
|
|