Spaces:
Sleeping
Sleeping
| """ | |
| eda.py | |
| ------ | |
| Exploratory Data Analysis for MNREGA unified dataset. | |
| Automatically adapts to Maharashtra-only or All-India data. | |
| Figures produced: | |
| 01_statewide_trend.png | |
| 02_district_performance_ranking.png | |
| 03_efficiency_ranking.png | |
| 04_covid_impact.png | |
| 05_correlation_heatmap.png | |
| """ | |
| import os | |
| import pandas as pd | |
| import numpy as np | |
| import matplotlib.pyplot as plt | |
| import matplotlib.font_manager as fm | |
| import seaborn as sns | |
| FIGURES_DIR = os.path.join("reports", "figures") | |
| os.makedirs(FIGURES_DIR, exist_ok=True) | |
| sns.set_theme(style="whitegrid", palette="muted") | |
| plt.rcParams.update({"figure.dpi": 120, "font.size": 10}) | |
| # Use a font that supports the rupee symbol if available, else fallback | |
| def _get_font(): | |
| available = [f.name for f in fm.fontManager.ttflist] | |
| for font in ["DejaVu Sans", "FreeSans", "Liberation Sans", "Arial"]: | |
| if font in available: | |
| return font | |
| return None | |
| FONT = _get_font() | |
| if FONT: | |
| plt.rcParams["font.family"] = FONT | |
| def run_eda(df: pd.DataFrame, scope: str = "Maharashtra") -> None: | |
| print(f"\n[eda] Starting EDA β scope: {scope}") | |
| _summary_stats(df) | |
| _plot_trend(df, scope) | |
| _plot_top_bottom_districts(df, scope) | |
| _plot_efficiency_ranking(df, scope) | |
| _plot_covid_impact(df) | |
| _plot_correlation_heatmap(df) | |
| print(f"[eda] All figures saved to: {FIGURES_DIR}/") | |
| # ββ 1. Summary ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
| def _summary_stats(df: pd.DataFrame) -> None: | |
| print(f"\n[eda] {'β'*50}") | |
| print(f"[eda] Rows : {len(df)}") | |
| print(f"[eda] States : {df['state'].nunique()}") | |
| print(f"[eda] Districts : {df['district'].nunique()}") | |
| print(f"[eda] Years : {df['financial_year'].min()} β {df['financial_year'].max()}") | |
| print(f"[eda] Total persondays: {df['person_days_lakhs'].sum():,.1f} lakh") | |
| if "expenditure_lakhs" in df.columns: | |
| print(f"[eda] Total expenditure: Rs. {df['expenditure_lakhs'].sum():,.1f} lakh") | |
| print(f"\n[eda] Person days by year (state-aggregated mean):") | |
| by_year = df.groupby("financial_year")["person_days_lakhs"].mean() | |
| max_val = by_year.max() | |
| for yr, val in by_year.items(): | |
| bar = "β" * int(val / max_val * 28) | |
| print(f" {yr}: {bar} {val:.2f}") | |
| print(f"[eda] {'β'*50}") | |
| # ββ 2. Trend ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
| def _plot_trend(df: pd.DataFrame, scope: str) -> None: | |
| yearly = df.groupby("financial_year").agg( | |
| total_persondays=("person_days_lakhs", "sum"), | |
| ).reset_index() | |
| fig, ax1 = plt.subplots(figsize=(11, 5)) | |
| ax1.bar(yearly["financial_year"], yearly["total_persondays"], | |
| color="#2196F3", alpha=0.75, label="Person Days (lakh)") | |
| ax1.set_ylabel("Total Person Days (lakh)", color="#2196F3") | |
| ax1.tick_params(axis="y", labelcolor="#2196F3") | |
| ax1.set_xlabel("Financial Year") | |
| plt.title(f"MNREGA Trend β {scope} (Person Days)") | |
| fig.tight_layout() | |
| _save("01_statewide_trend.png") | |
| # ββ 3. District rankings ββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
| def _plot_top_bottom_districts(df: pd.DataFrame, scope: str) -> None: | |
| avg = df.groupby("district")["person_days_lakhs"].mean().sort_values(ascending=False) | |
| n = min(10, len(avg) // 2) | |
| top = avg.head(n) | |
| bot = avg.tail(n).sort_values() | |
| fig, axes = plt.subplots(1, 2, figsize=(14, max(5, n * 0.55))) | |
| axes[0].barh(top.index, top.values, color="#4CAF50") | |
| axes[0].set_title(f"Top {n} Districts") | |
| axes[0].set_xlabel("Avg Person Days (lakh)") | |
| axes[0].invert_yaxis() | |
| axes[1].barh(bot.index, bot.values, color="#FF7043") | |
| axes[1].set_title(f"Bottom {n} Districts") | |
| axes[1].set_xlabel("Avg Person Days (lakh)") | |
| axes[1].invert_yaxis() | |
| plt.suptitle(f"MNREGA District Performance β {scope}", fontsize=13) | |
| plt.tight_layout() | |
| _save("02_district_performance_ranking.png") | |
| print(f"\n[eda] Top 5 districts:") | |
| for d, v in avg.head(5).items(): | |
| print(f" {d:35s}: {v:.2f} lakh") | |
| print(f"[eda] Bottom 5 districts:") | |
| for d, v in avg.tail(5).items(): | |
| print(f" {d:35s}: {v:.2f} lakh") | |
| # ββ 4. Efficiency ranking βββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
| def _plot_efficiency_ranking(df: pd.DataFrame, scope: str) -> None: | |
| if "expenditure_per_personday" not in df.columns: | |
| print("[eda] Skipping efficiency ranking β expenditure_per_personday not in V3 features") | |
| return | |
| eff = ( | |
| df.groupby("district")["expenditure_per_personday"] | |
| .mean().sort_values().dropna() | |
| ) | |
| if len(eff) > 30: | |
| eff = pd.concat([eff.head(15), eff.tail(15)]) | |
| fig, ax = plt.subplots(figsize=(10, max(6, len(eff) * 0.3))) | |
| colors = ["#43A047" if v <= eff.median() else "#EF5350" for v in eff.values] | |
| ax.barh(eff.index, eff.values, color=colors) | |
| ax.axvline(eff.median(), color="navy", linestyle="--", | |
| linewidth=1.5, label=f"Median: {eff.median():.1f}") | |
| ax.set_title(f"Cost Efficiency β {scope}\n(Rs. expenditure per lakh persondays β lower is better)") | |
| ax.set_xlabel("Rs. lakh per lakh persondays") | |
| ax.legend() | |
| plt.tight_layout() | |
| _save("03_efficiency_ranking.png") | |
| print(f"\n[eda] Most efficient : {eff.idxmin()} ({eff.min():.1f})") | |
| print(f"[eda] Least efficient: {eff.idxmax()} ({eff.max():.1f})") | |
| # ββ 5. COVID impact βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
| def _plot_covid_impact(df: pd.DataFrame) -> None: | |
| pre = df[df["financial_year"] == 2019].groupby("district")["person_days_lakhs"].mean() | |
| post = df[df["financial_year"] == 2020].groupby("district")["person_days_lakhs"].mean() | |
| common = pre.index.intersection(post.index) | |
| change = ((post[common] - pre[common]) / pre[common] * 100).sort_values(ascending=False) | |
| # Cap at 20 districts for readability | |
| show = pd.concat([change.head(10), change.tail(10)]) if len(change) > 20 else change | |
| fig, ax = plt.subplots(figsize=(10, max(6, len(show) * 0.35))) | |
| colors = ["#388E3C" if v >= 0 else "#D32F2F" for v in show.values] | |
| ax.barh(show.index, show.values, color=colors) | |
| ax.axvline(0, color="black", linewidth=0.8) | |
| ax.set_title("COVID Impact: % Change in Person Days\n(2019-20 to 2020-21)") | |
| ax.set_xlabel("% Change") | |
| plt.tight_layout() | |
| _save("04_covid_impact.png") | |
| print(f"\n[eda] COVID β biggest spike : {change.idxmax()} (+{change.max():.1f}%)") | |
| print(f"[eda] COVID β least impacted : {change.idxmin()} ({change.min():.1f}%)") | |
| # ββ 6. Correlation heatmap ββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
| def _plot_correlation_heatmap(df: pd.DataFrame) -> None: | |
| candidates = [ | |
| "person_days_lakhs", "expenditure_lakhs", "avg_wage_rate", | |
| "expenditure_per_personday", "lag_person_days", "yoy_growth", | |
| "demand_fulfillment_rate", "district_avg_persondays", | |
| "rainfall_mm", "poverty_rate_pct", "scheme_overlap_score", | |
| "budget_utilization_rate" | |
| ] | |
| cols = [c for c in candidates if c in df.columns] | |
| corr = df[cols].corr() | |
| fig, ax = plt.subplots(figsize=(11, 9)) | |
| mask = np.triu(np.ones_like(corr, dtype=bool)) | |
| sns.heatmap(corr, mask=mask, annot=True, fmt=".2f", | |
| cmap="coolwarm", center=0, ax=ax, | |
| linewidths=0.5, annot_kws={"size": 8}) | |
| ax.set_title("Feature Correlation Heatmap") | |
| plt.tight_layout() | |
| _save("05_correlation_heatmap.png") | |
| # ββ Helper ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
| def _save(filename: str) -> None: | |
| path = os.path.join(FIGURES_DIR, filename) | |
| plt.savefig(path, bbox_inches="tight") | |
| plt.close() | |
| print(f"[eda] Saved: {path}") | |