Python_Project_2

Sleeping

App Files Files Community

rohan965 committed on Mar 7

Commit

ef3bd7e

verified ·

1 Parent(s): d3e5881

Create app.py

Browse files

Files changed (1) hide show

app.py +448 -0

app.py ADDED Viewed

	@@ -0,0 +1,448 @@

+# app.py
+# ============================================================
+# Hugging Face Docker Space (Gradio) - Hotel Cancellation Project
+# 3 Tabs:
+#   1) Run Pipeline + Execution Logs
+#   2) Results + Visualizations (Python + R)
+#   3) Predict Cancellation Probability (Python RF + R LASSO)
+#
+# Repo must contain:
+#   booking.csv
+#   1_Data_Creation.ipynb
+#   2_Python_Analysis.ipynb
+#   3_R_Analysis.ipynb
+#   requirements.txt
+#   Dockerfile (installs R + IRkernel + needed R packages)
+#
+# Generated by notebooks:
+#   hotel_cancel_model_dataset.csv, features.json, dataset_meta.json, train.csv, test.csv
+#   artifacts/py/... and artifacts/r/...
+# ============================================================
+import json
+import os
+import subprocess
+from pathlib import Path
+from typing import Dict, Any, Tuple, Optional
+import pandas as pd
+import gradio as gr
+import joblib
+# ============================================================
+# 0) Config (YOUR notebook filenames)
+# ============================================================
+BASE_DIR = Path.cwd()
+DATA_NOTEBOOK = "1_Data_Creation.ipynb"
+PY_NOTEBOOK = "2_Python_Analysis.ipynb"
+R_NOTEBOOK = "3_R_Analysis.ipynb"
+RUNS_DIR = BASE_DIR / "runs"
+RUNS_DIR.mkdir(exist_ok=True)
+DATASET_PATH = BASE_DIR / "hotel_cancel_model_dataset.csv"
+FEATURES_PATH = BASE_DIR / "features.json"
+PY_MODEL_PATH = BASE_DIR / "artifacts" / "py" / "models" / "model.joblib"
+R_MODEL_PATH = BASE_DIR / "artifacts" / "r" / "models" / "model.rds"
+R_METRICS_PATH = BASE_DIR / "artifacts" / "r" / "metrics" / "metrics.json"
+# ============================================================
+# 1) Notebook execution helpers
+# ============================================================
+def _run_notebook(nb_name: str, out_name: str) -> str:
+    """
+    Execute a notebook using papermill and return a log string.
+    """
+    nb_in = BASE_DIR / nb_name
+    nb_out = RUNS_DIR / out_name
+    if not nb_in.exists():
+        return f"❌ Notebook not found: {nb_in}\nCheck the filename in app.py."
+    # Choose kernel
+    # - Python notebooks: python3
+    # - R notebook: ir  (installed via IRkernel in Dockerfile)
+    kernel = "python3"
+    if nb_name == R_NOTEBOOK:
+        kernel = os.environ.get("R_KERNEL_NAME", "ir")
+    cmd = ["papermill", str(nb_in), str(nb_out), "-k", kernel]
+    try:
+        proc = subprocess.run(cmd, capture_output=True, text=True, check=False)
+        parts = []
+        parts.append(f"▶ Running: {nb_name}")
+        parts.append(f"▶ Kernel : {kernel}")
+        parts.append(f"▶ Output : {nb_out.name}")
+        parts.append("")
+        if proc.stdout:
+            parts.append("----- STDOUT -----")
+            parts.append(proc.stdout)
+        if proc.stderr:
+            parts.append("----- STDERR -----")
+            parts.append(proc.stderr)
+        parts.append("")
+        parts.append(f"✅ Return code: {proc.returncode}")
+        return "\n".join(parts)
+    except Exception as e:
+        return f"❌ Failed to execute {nb_name}: {repr(e)}"
+def run_data_prep() -> str:
+    return _run_notebook(DATA_NOTEBOOK, "1_Data_Creation_RUN.ipynb")
+def run_python_model() -> str:
+    return _run_notebook(PY_NOTEBOOK, "2_Python_Analysis_RUN.ipynb")
+def run_r_model() -> str:
+    return _run_notebook(R_NOTEBOOK, "3_R_Analysis_RUN.ipynb")
+def run_all() -> str:
+    logs = []
+    logs.append(run_data_prep())
+    logs.append("\n" + "=" * 80 + "\n")
+    logs.append(run_python_model())
+    logs.append("\n" + "=" * 80 + "\n")
+    logs.append(run_r_model())
+    return "\n".join(logs)
+# ============================================================
+# 2) Safe file readers for Results tab
+# ============================================================
+def _safe_read_json(path: Path) -> Optional[Dict[str, Any]]:
+    if not path.exists():
+        return None
+    try:
+        with open(path, "r", encoding="utf-8") as f:
+            return json.load(f)
+    except Exception:
+        return None
+def _safe_read_csv(path: Path, nrows: Optional[int] = None) -> Optional[pd.DataFrame]:
+    if not path.exists():
+        return None
+    try:
+        return pd.read_csv(path, nrows=nrows)
+    except Exception:
+        return None
+def load_results():
+    """
+    Load latest artifacts from artifacts/py and artifacts/r.
+    Returns values in the order used by the Gradio outputs.
+    """
+    # Python artifacts
+    py_metrics = _safe_read_json(BASE_DIR / "artifacts" / "py" / "metrics" / "metrics.json") or {}
+    py_conf = str(BASE_DIR / "artifacts" / "py" / "figures" / "confusion_matrix.png")
+    py_roc = str(BASE_DIR / "artifacts" / "py" / "figures" / "roc_curve.png")
+    py_fi = _safe_read_csv(BASE_DIR / "artifacts" / "py" / "tables" / "feature_importances.csv") or pd.DataFrame()
+    py_pred = _safe_read_csv(BASE_DIR / "artifacts" / "py" / "tables" / "test_predictions.csv", nrows=50) or pd.DataFrame()
+    # R artifacts
+    r_metrics = _safe_read_json(BASE_DIR / "artifacts" / "r" / "metrics" / "metrics.json") or {}
+    r_roc = str(BASE_DIR / "artifacts" / "r" / "figures" / "roc_curve.png")
+    r_coef = _safe_read_csv(BASE_DIR / "artifacts" / "r" / "tables" / "coefficients.csv", nrows=50) or pd.DataFrame()
+    r_pred = _safe_read_csv(BASE_DIR / "artifacts" / "r" / "tables" / "test_predictions.csv", nrows=50) or pd.DataFrame()
+    return py_metrics, r_metrics, py_conf, py_roc, r_roc, py_fi, r_coef, py_pred, r_pred
+# ============================================================
+# 3) Prediction (Python + R)
+# ============================================================
+def _load_schema() -> Dict[str, Any]:
+    if not FEATURES_PATH.exists():
+        raise FileNotFoundError("features.json not found. Run the Data Creation notebook first.")
+    with open(FEATURES_PATH, "r", encoding="utf-8") as f:
+        return json.load(f)
+def _predict_python(py_model, features: Dict[str, Any]) -> float:
+    """
+    Predict cancellation probability using sklearn pipeline (joblib).
+    """
+    schema = _load_schema()
+    cols = schema["features"]
+    X = pd.DataFrame([{c: features[c] for c in cols}])
+    return float(py_model.predict_proba(X)[:, 1][0])
+def _predict_r(features: Dict[str, Any]) -> float:
+    """
+    Predict cancellation probability using saved R glmnet model.
+    Uses Rscript subprocess. Requires R installed in Docker image.
+    """
+    if not R_MODEL_PATH.exists():
+        raise FileNotFoundError("R model not found. Run the R notebook first.")
+    if not DATASET_PATH.exists():
+        raise FileNotFoundError("hotel_cancel_model_dataset.csv not found. Run the Data Creation notebook first.")
+    if not R_METRICS_PATH.exists():
+        raise FileNotFoundError("R metrics not found. Run the R notebook first.")
+    # Write input to temp file
+    tmp_input = BASE_DIR / "tmp_r_input.json"
+    with open(tmp_input, "w", encoding="utf-8") as f:
+        json.dump(features, f)
+    r_script = f"""
+    suppressPackageStartupMessages(library(jsonlite))
+    suppressPackageStartupMessages(library(glmnet))
+    suppressPackageStartupMessages(library(Matrix))
+    dataset_path <- "{DATASET_PATH.as_posix()}"
+    features_path <- "{FEATURES_PATH.as_posix()}"
+    model_path <- "{R_MODEL_PATH.as_posix()}"
+    metrics_path <- "{R_METRICS_PATH.as_posix()}"
+    input_path <- "{tmp_input.as_posix()}"
+    df <- read.csv(dataset_path, stringsAsFactors = FALSE)
+    schema <- fromJSON(features_path)
+    FEATURES <- schema$features
+    metrics <- fromJSON(metrics_path)
+    lambda_1se <- metrics$lambda_1se
+    fit <- readRDS(model_path)
+    inp <- fromJSON(input_path)
+    x_df <- as.data.frame(inp, stringsAsFactors = FALSE)
+    for (c in FEATURES) {{
+      if (is.null(x_df[[c]])) stop(paste("Missing input feature:", c))
+      if (is.character(df[[c]]) || is.character(x_df[[c]])) {{
+        levs <- unique(df[[c]])
+        x_df[[c]] <- factor(x_df[[c]], levels = levs)
+      }}
+    }}
+    f <- as.formula(paste("~", paste(FEATURES, collapse = " + ")))
+    X <- sparse.model.matrix(f, data = x_df)[, -1, drop = FALSE]
+    p <- as.numeric(predict(fit, newx = X, s = lambda_1se, type = "response"))[1]
+    cat(p)
+    """
+    proc = subprocess.run(["Rscript", "-e", r_script], capture_output=True, text=True)
+    # Cleanup temp file
+    try:
+        tmp_input.unlink(missing_ok=True)
+    except Exception:
+        pass
+    if proc.returncode != 0:
+        raise RuntimeError(f"R prediction failed:\n{proc.stderr}")
+    try:
+        return float(proc.stdout.strip())
+    except ValueError:
+        raise RuntimeError(f"Could not parse R output as float.\nSTDOUT:\n{proc.stdout}\nSTDERR:\n{proc.stderr}")
+def predict_both(
+    lead_time: float,
+    average_price: float,
+    total_nights: float,
+    total_guests: float,
+    market_segment_type: str,
+    type_of_meal: str,
+    special_requests: float,
+    price_per_guest: float,
+):
+    """
+    Gradio callback: predict with both models.
+    """
+    features = {
+        "lead_time": float(lead_time),
+        "average_price": float(average_price),
+        "total_nights": float(total_nights),
+        "total_guests": float(total_guests),
+        "market_segment_type": str(market_segment_type),
+        "type_of_meal": str(type_of_meal),
+        "special_requests": float(special_requests),
+        "price_per_guest": float(price_per_guest),
+    }
+    # Python model prediction
+    if not PY_MODEL_PATH.exists():
+        raise FileNotFoundError("Python model not found. Run the Python notebook first.")
+    py_model = joblib.load(PY_MODEL_PATH)
+    py_proba = _predict_python(py_model, features)
+    # R model prediction
+    r_proba = _predict_r(features)
+    py_text = f"Python (Random Forest) cancellation probability: **{py_proba:.3f}**"
+    r_text = f"R (LASSO Logistic Regression) cancellation probability: **{r_proba:.3f}**"
+    comp_df = pd.DataFrame(
+        [
+            {"model": "Python Random Forest", "p_cancel": py_proba},
+            {"model": "R LASSO Logistic Regression", "p_cancel": r_proba},
+        ]
+    )
+    return py_text, r_text, comp_df
+# ============================================================
+# 4) Dropdown choices (from dataset categories)
+# ============================================================
+def get_dropdown_choices():
+    """
+    Populate dropdown choices from the dataset (so categories match training).
+    If dataset isn't available yet, return fallback defaults.
+    """
+    if not DATASET_PATH.exists():
+        return (["Online", "Offline", "Corporate"], ["Meal Plan 1", "Meal Plan 2", "Not Selected"])
+    df = pd.read_csv(DATASET_PATH)
+    market_choices = sorted(df["market_segment_type"].dropna().unique().tolist())
+    meal_choices = sorted(df["type_of_meal"].dropna().unique().tolist())
+    return market_choices, meal_choices
+# ============================================================
+# 5) Build Gradio UI (3 tabs)
+# ============================================================
+# The whole UI is assembled at import time; `demo` is what gets launched.
+with gr.Blocks(title="Hotel Booking Cancellation Prediction") as demo:
+    gr.Markdown(
+        """
+        # 🏨 Hotel Booking Cancellation Prediction
+        This app runs the full pipeline and compares two models:
+        - **Python Random Forest**
+        - **R LASSO Logistic Regression (glmnet)**
+        **Tabs**
+        1) Run Pipeline + Logs
+        2) Results & Visualizations
+        3) Predict Cancellation Probability (both models)
+        """
+    )
+    # -----------------------------
+    # TAB 1: Run Pipeline + Logs
+    # -----------------------------
+    with gr.Tab("1) Run Pipeline"):
+        gr.Markdown("Run each step and inspect the execution logs.")
+        with gr.Row():
+            btn_data = gr.Button("Run 1) Data Creation")
+            btn_py = gr.Button("Run 2) Python Analysis")
+            btn_r = gr.Button("Run 3) R Analysis")
+            btn_all = gr.Button("Run All (1→2→3)")
+        log_box = gr.Textbox(
+            label="Execution Log",
+            lines=22,
+            value="Click a button to run a step. Logs will appear here.",
+        )
+        # All four buttons write their log into the same textbox,
+        # replacing whatever it showed before.
+        btn_data.click(fn=run_data_prep, outputs=log_box)
+        btn_py.click(fn=run_python_model, outputs=log_box)
+        btn_r.click(fn=run_r_model, outputs=log_box)
+        btn_all.click(fn=run_all, outputs=log_box)
+    # -----------------------------
+    # TAB 2: Results & Visualizations
+    # -----------------------------
+    with gr.Tab("2) Results & Visualizations"):
+        gr.Markdown("Loads the latest saved artifacts from **artifacts/py/** and **artifacts/r/**.")
+        btn_refresh = gr.Button("Refresh Results")
+        with gr.Row():
+            py_metrics_view = gr.JSON(label="Python Metrics (metrics.json)")
+            r_metrics_view = gr.JSON(label="R Metrics (metrics.json)")
+        with gr.Row():
+            py_conf_img = gr.Image(label="Python Confusion Matrix", type="filepath")
+            py_roc_img = gr.Image(label="Python ROC Curve", type="filepath")
+            r_roc_img = gr.Image(label="R ROC Curve", type="filepath")
+        with gr.Row():
+            py_fi_table = gr.Dataframe(label="Python Feature Importances (top)", interactive=False)
+            r_coef_table = gr.Dataframe(label="R Coefficients (top)", interactive=False)
+        with gr.Row():
+            py_pred_table = gr.Dataframe(label="Python Test Predictions (top 50)", interactive=False)
+            r_pred_table = gr.Dataframe(label="R Test Predictions (top 50)", interactive=False)
+        # Thin wrapper around load_results(); nothing is loaded until the
+        # refresh button is clicked.
+        def _refresh():
+            return load_results()
+        # The outputs list must stay in the same order as the tuple
+        # returned by load_results().
+        btn_refresh.click(
+            fn=_refresh,
+            outputs=[
+                py_metrics_view, r_metrics_view,
+                py_conf_img, py_roc_img, r_roc_img,
+                py_fi_table, r_coef_table,
+                py_pred_table, r_pred_table,
+            ],
+        )
+    # -----------------------------
+    # TAB 3: Predict
+    # -----------------------------
+    with gr.Tab("3) Predict"):
+        gr.Markdown(
+            "Enter booking details and predict cancellation probability with **both models**.\n"
+            "Dropdown values are taken from the dataset categories."
+        )
+        # NOTE: choices are read once, while the UI is being built, so a
+        # dataset generated later through Tab 1 does not change them here.
+        market_choices, meal_choices = get_dropdown_choices()
+        with gr.Row():
+            lead_time = gr.Number(label="lead_time", value=30)
+            average_price = gr.Number(label="average_price", value=100)
+        with gr.Row():
+            total_nights = gr.Number(label="total_nights", value=3)
+            total_guests = gr.Number(label="total_guests", value=2)
+        with gr.Row():
+            # Each dropdown defaults to its first choice, or to no selection
+            # when the choice list is empty.
+            market_segment_type = gr.Dropdown(
+                label="market_segment_type",
+                choices=market_choices,
+                value=market_choices[0] if market_choices else None,
+            )
+            type_of_meal = gr.Dropdown(
+                label="type_of_meal",
+                choices=meal_choices,
+                value=meal_choices[0] if meal_choices else None,
+            )
+        with gr.Row():
+            special_requests = gr.Number(label="special_requests", value=1)
+            price_per_guest = gr.Number(label="price_per_guest", value=50)
+        btn_predict = gr.Button("Predict Cancellation Probability")
+        py_pred_text = gr.Markdown()
+        r_pred_text = gr.Markdown()
+        comp_table = gr.Dataframe(label="Model Comparison", interactive=False)
+        # The inputs list must follow the positional parameter order of
+        # predict_both, and outputs the order of its returned 3-tuple.
+        btn_predict.click(
+            fn=predict_both,
+            inputs=[
+                lead_time, average_price,
+                total_nights, total_guests,
+                market_segment_type, type_of_meal,
+                special_requests, price_per_guest,
+            ],
+            outputs=[py_pred_text, r_pred_text, comp_table],
+        )
+# ============================================================
+# 6) Launch
+# ============================================================
+if __name__ == "__main__":
+    demo.launch(server_name="0.0.0.0", server_port=7860)