import os, tempfile import numpy as np import pandas as pd import matplotlib.pyplot as plt from pandas.api.types import is_datetime64_any_dtype as is_datetime from sklearn.model_selection import train_test_split from sklearn.metrics import mean_absolute_error, r2_score from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor import gradio as gr # ---------- Helpers ---------- def infer_target_column(df: pd.DataFrame): for c in ["power_usage_kwh", "energy_kwh", "power_kwh", "energy"]: if c in df.columns: return c raise ValueError("Target column not found. Expected one of: " "['power_usage_kwh','energy_kwh','power_kwh','energy'].") def ensure_datetime_naive(df: pd.DataFrame, tz_target: str = "Asia/Dubai"): if "timestamp" not in df.columns: return df # Parse robustly with UTC, then convert to target tz and drop tz ts = pd.to_datetime(df["timestamp"], errors="coerce", utc=True) try: ts = ts.dt.tz_convert(tz_target).dt.tz_localize(None) except Exception: try: ts = ts.dt.tz_localize(None) except Exception: pass df = df.copy() df["timestamp"] = ts return df def feature_engineer(df: pd.DataFrame) -> pd.DataFrame: df = df.copy() df = ensure_datetime_naive(df, tz_target="Asia/Dubai") # Light numeric imputation num_cols = df.select_dtypes(include=[np.number]).columns df[num_cols] = df[num_cols].ffill().bfill() # Time features if "timestamp" in df.columns and is_datetime(df["timestamp"]): df["hour"] = df["timestamp"].dt.hour df["dayofweek"] = df["timestamp"].dt.dayofweek df["is_weekend"] = (df["dayofweek"] >= 5).astype(int) df["month"] = df["timestamp"].dt.month df["dayofyear"] = df["timestamp"].dt.dayofyear df["hour_sin"] = np.sin(2*np.pi*df["hour"]/24) df["hour_cos"] = np.cos(2*np.pi*df["hour"]/24) df["dow_sin"] = np.sin(2*np.pi*df["dayofweek"]/7) df["dow_cos"] = np.cos(2*np.pi*df["dayofweek"]/7) else: for c in ["hour","dayofweek","is_weekend","month","dayofyear","hour_sin","hour_cos","dow_sin","dow_cos"]: if c not in df.columns: df[c] = 0 # Domain features tgt = infer_target_column(df) if "cooling_eff_pct" in df.columns: df["cooling_ineff_pct"] = 100 - df["cooling_eff_pct"] if "server_load_pct" in df.columns: df["energy_per_load"] = df[tgt] / np.maximum(df["server_load_pct"], 1) if "ambient_temp_c" in df.columns and "server_load_pct" in df.columns: df["temp_load_interaction"] = df["ambient_temp_c"] * df["server_load_pct"] # Target lags/rollings df["target_lag1"] = df[tgt].shift(1) df["target_roll3"] = df[tgt].rolling(3, min_periods=1).mean() df["target_roll24"] = df[tgt].rolling(24, min_periods=1).mean() # Fill NaNs from shifts df = df.ffill().bfill() return df def get_model(name: str): return GradientBoostingRegressor(random_state=42) if name == "Gradient Boosting" \ else RandomForestRegressor(n_estimators=300, random_state=42) def feature_target_split(df: pd.DataFrame): y_col = infer_target_column(df) X = df.drop(columns=[c for c in [y_col, "timestamp"] if c in df.columns], errors="ignore") X = X.select_dtypes(include=[np.number]).copy() y = df[y_col].astype(float) return X, y, y_col # ---------- Core pipeline ---------- def run_pipeline(file_path, model_name): title = "⚡ AI-Driven Data Center Energy Optimization Dashboard" try: if not file_path: return (title, "Please upload a CSV file.", None, None, None, None, None, None) df_raw = pd.read_csv(file_path) df = feature_engineer(df_raw) # Guardrail if len(df) < 10: return (title, "Not enough rows to train a model (need >= 10).", None, None, None, None, None, None) X, y, y_col = feature_target_split(df) # Split, train, predict test_size = 0.25 if len(df) >= 25 else 0.2 X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=test_size, random_state=42 ) model = get_model(model_name) model.fit(X_train, y_train) y_pred_all = model.predict(X) y_pred_test = model.predict(X_test) mae = mean_absolute_error(y_test, y_pred_test) r2 = r2_score(y_test, y_pred_test) avg_actual = float(np.mean(y)) avg_pred = float(np.mean(y_pred_all)) # ------ Visualizations ------ ts_plot = None if "timestamp" in df.columns and is_datetime(df["timestamp"]): plot_df = df.copy().sort_values("timestamp") Xp = plot_df.drop(columns=[c for c in [y_col, "timestamp"] if c in plot_df.columns], errors="ignore") Xp = Xp.select_dtypes(include=[np.number]).copy() yp = model.predict(Xp) ts_plot = plt.figure(figsize=(9, 3.6)) plt.plot(plot_df["timestamp"], plot_df[y_col], label="Actual") plt.plot(plot_df["timestamp"], yp, label="Predicted") plt.title("Time Series: Actual vs Predicted") plt.xlabel("Time"); plt.ylabel(y_col) plt.legend(); plt.tight_layout() sc_plot = plt.figure(figsize=(4.6, 3.8)) plt.scatter(y_test, y_pred_test, alpha=0.6) mn = min(y_test.min(), y_pred_test.min()); mx = max(y_test.max(), y_pred_test.max()) plt.plot([mn, mx], [mn, mx], linestyle="--") plt.title("Holdout: Actual vs Predicted") plt.xlabel("Actual"); plt.ylabel("Predicted") plt.tight_layout() res = y_test - y_pred_test resid_plot = plt.figure(figsize=(4.6, 3.6)) plt.hist(res, bins=30) plt.title("Holdout Residuals (Actual − Predicted)") plt.xlabel("Residual"); plt.ylabel("Count") plt.tight_layout() fi_plot = None if hasattr(model, "feature_importances_"): importances = model.feature_importances_ fi = (pd.DataFrame({"feature": X.columns, "importance": importances}) .sort_values("importance", ascending=False).head(12)) fi_plot = plt.figure(figsize=(6.2, 3.8)) plt.barh(fi["feature"][::-1], fi["importance"][::-1]) plt.title("Top Feature Importances") plt.tight_layout() # Save predictions for download out_df = df.copy() out_df[f"{y_col}_pred"] = y_pred_all tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".csv") out_df.to_csv(tmp.name, index=False) # --------- Copy text (explainer + KPIs) --------- explainer = ( "### 🧠 What this app does\n" "This AI-driven dashboard learns the relationship between **server load**, **ambient temperature**, " "**cooling efficiency**, and time features to **predict power usage**. " "Use it to quantify drivers of energy consumption, monitor deviations, and surface optimization levers.\n\n" "### 🔎 Why it matters\n" "- Reduces **OPEX** by forecasting and optimizing energy usage\n" "- Identifies high-impact drivers (feature importance)\n" "- Enables proactive actions (e.g., workload shaping, cooling set-point tuning)\n\n" "### ⚙️ How it works (high-level)\n" "1) Cleans and engineers features (diurnal/weekly cycles, rolling stats, domain signals)\n" "2) Trains a tree ensemble (Gradient Boosting or Random Forest)\n" "3) Evaluates on a holdout split and produces predictions for the entire dataset\n" "4) Visualizes time series, accuracy scatter, residuals, and top feature importance\n" ) kpis = ( f"**Model:** {model_name}\n\n" f"**Target:** {y_col}\n" f"**Avg {y_col} (actual):** {avg_actual:,.2f}\n" f"**Avg {y_col} (predicted):** {avg_pred:,.2f}\n" f"**Rows:** {len(df):,}\n\n" f"**Holdout MAE:** {mae:,.2f} | **R²:** {r2:,.3f}" ) # Sample preview table preview = out_df.head(10) return ( title, explainer, kpis, preview, ts_plot, sc_plot, resid_plot, fi_plot, tmp.name ) except Exception as e: err = f"❌ **Error:** {type(e).__name__}: {e}" return (title, err, None, None, None, None, None, None, None) # ---------- Gradio UI ---------- import gradio gradio.close_all() # avoid port conflicts in Colab with gr.Blocks(title="AI-Driven Data Center Energy Optimization") as demo: gr.Markdown("## ⚡ AI-Driven Data Center Energy Optimization Dashboard") with gr.Row(): fpath = gr.File(label="📁 Upload Dataset (CSV)", file_types=[".csv"], type="filepath") model_name = gr.Dropdown( choices=["Gradient Boosting", "Random Forest"], value="Gradient Boosting", label="🔍 Select Model" ) run_btn = gr.Button("▶️ Run") title_out = gr.Markdown() explainer_out = gr.Markdown() kpi_out = gr.Markdown() table_out = gr.Dataframe(label="📋 Sample (+ Predictions)", wrap=True, row_count=("fixed", 10)) gr.Markdown("### 📈 Visual Insights") ts_plot = gr.Plot(label="Time Series: Actual vs Predicted") sc_plot = gr.Plot(label="Holdout: Actual vs Predicted") resid_plot = gr.Plot(label="Residuals (Histogram)") fi_plot = gr.Plot(label="Top Feature Importances") dl = gr.File(label="📥 Download Data (+ Predictions)") run_btn.click( fn=run_pipeline, inputs=[fpath, model_name], outputs=[title_out, explainer_out, kpi_out, table_out, ts_plot, sc_plot, resid_plot, fi_plot, dl] ) demo.launch(share=True)