rohan965 committed on
Commit
ef3bd7e
·
verified ·
1 Parent(s): d3e5881

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +448 -0
app.py ADDED
@@ -0,0 +1,448 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # app.py
2
+ # ============================================================
3
+ # Hugging Face Docker Space (Gradio) - Hotel Cancellation Project
4
+ # 3 Tabs:
5
+ # 1) Run Pipeline + Execution Logs
6
+ # 2) Results + Visualizations (Python + R)
7
+ # 3) Predict Cancellation Probability (Python RF + R LASSO)
8
+ #
9
+ # Repo must contain:
10
+ # booking.csv
11
+ # 1_Data_Creation.ipynb
12
+ # 2_Python_Analysis.ipynb
13
+ # 3_R_Analysis.ipynb
14
+ # requirements.txt
15
+ # Dockerfile (installs R + IRkernel + needed R packages)
16
+ #
17
+ # Generated by notebooks:
18
+ # hotel_cancel_model_dataset.csv, features.json, dataset_meta.json, train.csv, test.csv
19
+ # artifacts/py/... and artifacts/r/...
20
+ # ============================================================
21
+
22
+ import json
23
+ import os
24
+ import subprocess
25
+ from pathlib import Path
26
+ from typing import Dict, Any, Tuple, Optional
27
+
28
+ import pandas as pd
29
+ import gradio as gr
30
+ import joblib
31
+
32
+ # ============================================================
33
+ # 0) Config (YOUR notebook filenames)
34
+ # ============================================================
35
+
36
# Every path below is resolved against the process working directory.
BASE_DIR = Path.cwd()

# Notebook filenames, looked up inside BASE_DIR.
DATA_NOTEBOOK = "1_Data_Creation.ipynb"
PY_NOTEBOOK = "2_Python_Analysis.ipynb"
R_NOTEBOOK = "3_R_Analysis.ipynb"

# Executed notebook copies are written here; the directory is created on import.
RUNS_DIR = BASE_DIR.joinpath("runs")
RUNS_DIR.mkdir(exist_ok=True)

# Dataset and feature schema files.
DATASET_PATH = BASE_DIR.joinpath("hotel_cancel_model_dataset.csv")
FEATURES_PATH = BASE_DIR.joinpath("features.json")

# Saved model artifacts and the R metrics file.
PY_MODEL_PATH = BASE_DIR.joinpath("artifacts", "py", "models", "model.joblib")
R_MODEL_PATH = BASE_DIR.joinpath("artifacts", "r", "models", "model.rds")
R_METRICS_PATH = BASE_DIR.joinpath("artifacts", "r", "metrics", "metrics.json")
51
+
52
+ # ============================================================
53
+ # 1) Notebook execution helpers
54
+ # ============================================================
55
+
56
def _run_notebook(nb_name: str, out_name: str) -> str:
    """Execute a notebook with papermill and return a human-readable log.

    Args:
        nb_name: Notebook filename, resolved against ``BASE_DIR``.
        out_name: Filename of the executed copy, written under ``RUNS_DIR``.

    Returns:
        A log string containing the notebook name, kernel, output filename,
        any captured stdout/stderr and the process return code. Problems
        (missing notebook, papermill not launchable, non-zero exit) are
        reported in the returned text rather than raised.
    """
    nb_in = BASE_DIR / nb_name
    nb_out = RUNS_DIR / out_name

    if not nb_in.exists():
        return f"❌ Notebook not found: {nb_in}\nCheck the filename in app.py."

    # Choose kernel
    # - Python notebooks: python3
    # - R notebook: ir by default, overridable through R_KERNEL_NAME
    kernel = "python3"
    if nb_name == R_NOTEBOOK:
        kernel = os.environ.get("R_KERNEL_NAME", "ir")

    cmd = ["papermill", str(nb_in), str(nb_out), "-k", kernel]

    # Only the subprocess call itself can fail here; keep the try minimal.
    try:
        proc = subprocess.run(cmd, capture_output=True, text=True, check=False)
    except Exception as e:
        return f"❌ Failed to execute {nb_name}: {repr(e)}"

    parts = [
        f"▶ Running: {nb_name}",
        f"▶ Kernel : {kernel}",
        f"▶ Output : {nb_out.name}",
        "",
    ]
    if proc.stdout:
        parts.append("----- STDOUT -----")
        parts.append(proc.stdout)
    if proc.stderr:
        parts.append("----- STDERR -----")
        parts.append(proc.stderr)
    parts.append("")
    # check=False means a failing notebook does not raise, so the status
    # marker must reflect the exit code instead of always reporting success.
    status = "✅" if proc.returncode == 0 else "❌"
    parts.append(f"{status} Return code: {proc.returncode}")
    return "\n".join(parts)
93
+
94
+
95
def run_data_prep() -> str:
    """Execute the data-creation notebook and return its execution log."""
    executed_copy = "1_Data_Creation_RUN.ipynb"
    return _run_notebook(DATA_NOTEBOOK, executed_copy)
97
+
98
+
99
def run_python_model() -> str:
    """Execute the Python analysis notebook and return its execution log."""
    executed_copy = "2_Python_Analysis_RUN.ipynb"
    return _run_notebook(PY_NOTEBOOK, executed_copy)
101
+
102
+
103
def run_r_model() -> str:
    """Execute the R analysis notebook and return its execution log."""
    executed_copy = "3_R_Analysis_RUN.ipynb"
    return _run_notebook(R_NOTEBOOK, executed_copy)
105
+
106
+
107
def run_all() -> str:
    """Run data prep, the Python model and the R model in that order.

    Returns:
        The three step logs joined into one string, separated by a ruler
        line of 80 ``=`` characters.
    """
    ruler = "\n" + "=" * 80 + "\n"
    # List elements are evaluated left to right, so the steps run 1 -> 2 -> 3.
    sections = [
        run_data_prep(),
        ruler,
        run_python_model(),
        ruler,
        run_r_model(),
    ]
    return "\n".join(sections)
115
+
116
+ # ============================================================
117
+ # 2) Safe file readers for Results tab
118
+ # ============================================================
119
+
120
+ def _safe_read_json(path: Path) -> Optional[Dict[str, Any]]:
121
+ if not path.exists():
122
+ return None
123
+ try:
124
+ with open(path, "r", encoding="utf-8") as f:
125
+ return json.load(f)
126
+ except Exception:
127
+ return None
128
+
129
+
130
+ def _safe_read_csv(path: Path, nrows: Optional[int] = None) -> Optional[pd.DataFrame]:
131
+ if not path.exists():
132
+ return None
133
+ try:
134
+ return pd.read_csv(path, nrows=nrows)
135
+ except Exception:
136
+ return None
137
+
138
+
139
def load_results():
    """Load the latest artifacts from ``artifacts/py`` and ``artifacts/r``.

    Returns:
        A 9-tuple in the order used by the Gradio outputs:
        ``(py_metrics, r_metrics, py_conf, py_roc, r_roc, py_fi, r_coef,
        py_pred, r_pred)``. Metrics are dicts (empty when unavailable),
        images are file paths as strings (``None`` when the file does not
        exist), and tables are DataFrames (empty when unavailable).
    """

    def _table(path, nrows=None):
        # `df or fallback` would call bool() on a DataFrame, which raises
        # ValueError; compare against None explicitly instead.
        frame = _safe_read_csv(path, nrows=nrows)
        return pd.DataFrame() if frame is None else frame

    def _image(path):
        # Only hand a path to the image component when the file is present.
        return str(path) if path.exists() else None

    py_dir = BASE_DIR / "artifacts" / "py"
    r_dir = BASE_DIR / "artifacts" / "r"

    # Python artifacts
    py_metrics = _safe_read_json(py_dir / "metrics" / "metrics.json") or {}
    py_conf = _image(py_dir / "figures" / "confusion_matrix.png")
    py_roc = _image(py_dir / "figures" / "roc_curve.png")
    py_fi = _table(py_dir / "tables" / "feature_importances.csv")
    py_pred = _table(py_dir / "tables" / "test_predictions.csv", nrows=50)

    # R artifacts
    r_metrics = _safe_read_json(r_dir / "metrics" / "metrics.json") or {}
    r_roc = _image(r_dir / "figures" / "roc_curve.png")
    r_coef = _table(r_dir / "tables" / "coefficients.csv", nrows=50)
    r_pred = _table(r_dir / "tables" / "test_predictions.csv", nrows=50)

    return py_metrics, r_metrics, py_conf, py_roc, r_roc, py_fi, r_coef, py_pred, r_pred
158
+
159
+ # ============================================================
160
+ # 3) Prediction (Python + R)
161
+ # ============================================================
162
+
163
def _load_schema() -> Dict[str, Any]:
    """Return the parsed content of ``FEATURES_PATH``.

    Raises:
        FileNotFoundError: If the features file does not exist.
    """
    if FEATURES_PATH.exists():
        with open(FEATURES_PATH, "r", encoding="utf-8") as handle:
            schema = json.load(handle)
        return schema
    raise FileNotFoundError("features.json not found. Run the Data Creation notebook first.")
168
+
169
+
170
def _predict_python(py_model, features: Dict[str, Any]) -> float:
    """Return the positive-class probability from the loaded Python model.

    Args:
        py_model: Object exposing ``predict_proba`` (loaded by the caller).
        features: Mapping of feature name to value; must contain every name
            listed under ``"features"`` in the schema.

    Returns:
        The second-column probability of the single input row, as a float.
    """
    schema = _load_schema()
    # Build the row in schema order so columns line up with the schema.
    row = {}
    for column in schema["features"]:
        row[column] = features[column]
    frame = pd.DataFrame([row])
    probabilities = py_model.predict_proba(frame)
    return float(probabilities[:, 1][0])
178
+
179
+
180
def _predict_r(features: Dict[str, Any]) -> float:
    """Predict cancellation probability with the saved R glmnet model.

    The features are written to a temporary JSON file and an inline R script
    is run through ``Rscript -e``. The script reads the dataset, the feature
    schema, ``lambda_1se`` from the R metrics file and the saved model,
    builds a sparse model matrix for the single input row and prints the
    predicted probability to stdout.

    Args:
        features: Mapping of feature name to value for one booking.

    Returns:
        The probability printed by the R script, parsed as a float.

    Raises:
        FileNotFoundError: If the R model, dataset or R metrics file is missing.
        RuntimeError: If Rscript exits non-zero or its stdout is not a float.
    """
    # Local import: keeps this block self-contained without touching the
    # file-level import section.
    import tempfile

    if not R_MODEL_PATH.exists():
        raise FileNotFoundError("R model not found. Run the R notebook first.")
    if not DATASET_PATH.exists():
        raise FileNotFoundError("hotel_cancel_model_dataset.csv not found. Run the Data Creation notebook first.")
    if not R_METRICS_PATH.exists():
        raise FileNotFoundError("R metrics not found. Run the R notebook first.")

    # A unique temp file per call: a fixed filename would let concurrent
    # requests overwrite or delete each other's input.
    fd, tmp_name = tempfile.mkstemp(prefix="r_input_", suffix=".json")
    tmp_input = Path(tmp_name)
    try:
        with os.fdopen(fd, "w", encoding="utf-8") as f:
            json.dump(features, f)

        # Doubled braces are literal R braces inside this f-string.
        r_script = f"""
suppressPackageStartupMessages(library(jsonlite))
suppressPackageStartupMessages(library(glmnet))
suppressPackageStartupMessages(library(Matrix))

dataset_path <- "{DATASET_PATH.as_posix()}"
features_path <- "{FEATURES_PATH.as_posix()}"
model_path <- "{R_MODEL_PATH.as_posix()}"
metrics_path <- "{R_METRICS_PATH.as_posix()}"
input_path <- "{tmp_input.as_posix()}"

df <- read.csv(dataset_path, stringsAsFactors = FALSE)
schema <- fromJSON(features_path)
FEATURES <- schema$features

metrics <- fromJSON(metrics_path)
lambda_1se <- metrics$lambda_1se

fit <- readRDS(model_path)
inp <- fromJSON(input_path)
x_df <- as.data.frame(inp, stringsAsFactors = FALSE)

for (c in FEATURES) {{
if (is.null(x_df[[c]])) stop(paste("Missing input feature:", c))
if (is.character(df[[c]]) || is.character(x_df[[c]])) {{
levs <- unique(df[[c]])
x_df[[c]] <- factor(x_df[[c]], levels = levs)
}}
}}

f <- as.formula(paste("~", paste(FEATURES, collapse = " + ")))
X <- sparse.model.matrix(f, data = x_df)[, -1, drop = FALSE]
p <- as.numeric(predict(fit, newx = X, s = lambda_1se, type = "response"))[1]
cat(p)
"""

        proc = subprocess.run(["Rscript", "-e", r_script], capture_output=True, text=True)
    finally:
        # Best-effort cleanup that also runs when Rscript cannot be spawned.
        try:
            tmp_input.unlink(missing_ok=True)
        except OSError:
            pass

    if proc.returncode != 0:
        raise RuntimeError(f"R prediction failed:\n{proc.stderr}")

    try:
        return float(proc.stdout.strip())
    except ValueError as err:
        raise RuntimeError(
            f"Could not parse R output as float.\nSTDOUT:\n{proc.stdout}\nSTDERR:\n{proc.stderr}"
        ) from err
248
+
249
+
250
def predict_both(
    lead_time: float,
    average_price: float,
    total_nights: float,
    total_guests: float,
    market_segment_type: str,
    type_of_meal: str,
    special_requests: float,
    price_per_guest: float,
):
    """Gradio callback: score one booking with the Python and the R model.

    Returns:
        A tuple ``(py_text, r_text, comp_df)``: two Markdown strings with the
        probabilities formatted to three decimals, and a two-row DataFrame
        with columns ``model`` and ``p_cancel``.

    Raises:
        FileNotFoundError: If the Python model file does not exist (or if
            the R prediction step reports a missing file).
    """
    # (feature name, converter, raw UI value), in the original feature order.
    raw_inputs = (
        ("lead_time", float, lead_time),
        ("average_price", float, average_price),
        ("total_nights", float, total_nights),
        ("total_guests", float, total_guests),
        ("market_segment_type", str, market_segment_type),
        ("type_of_meal", str, type_of_meal),
        ("special_requests", float, special_requests),
        ("price_per_guest", float, price_per_guest),
    )
    features = {name: convert(value) for name, convert, value in raw_inputs}

    # Python model prediction
    if not PY_MODEL_PATH.exists():
        raise FileNotFoundError("Python model not found. Run the Python notebook first.")
    py_proba = _predict_python(joblib.load(PY_MODEL_PATH), features)

    # R model prediction
    r_proba = _predict_r(features)

    comp_df = pd.DataFrame(
        {
            "model": ["Python Random Forest", "R LASSO Logistic Regression"],
            "p_cancel": [py_proba, r_proba],
        }
    )

    return (
        f"Python (Random Forest) cancellation probability: **{py_proba:.3f}**",
        f"R (LASSO Logistic Regression) cancellation probability: **{r_proba:.3f}**",
        comp_df,
    )
294
+
295
+ # ============================================================
296
+ # 4) Dropdown choices (from dataset categories)
297
+ # ============================================================
298
+
299
def get_dropdown_choices():
    """Return ``(market_choices, meal_choices)`` for the prediction dropdowns.

    When the dataset file exists, each list holds the sorted, non-null unique
    values of the ``market_segment_type`` / ``type_of_meal`` column. When it
    does not exist, hard-coded fallback lists are returned.
    """
    if DATASET_PATH.exists():
        frame = pd.read_csv(DATASET_PATH)

        def _levels(column):
            # Sorted distinct non-null values of one column.
            return sorted(frame[column].dropna().unique().tolist())

        return _levels("market_segment_type"), _levels("type_of_meal")

    fallback_markets = ["Online", "Offline", "Corporate"]
    fallback_meals = ["Meal Plan 1", "Meal Plan 2", "Not Selected"]
    return (fallback_markets, fallback_meals)
311
+
312
+ # ============================================================
313
+ # 5) Build Gradio UI (3 tabs)
314
+ # ============================================================
315
+
316
# Build the three-tab interface. Components are declared in display order, and
# `demo` is the object launched by the `__main__` guard at the end of the file.
with gr.Blocks(title="Hotel Booking Cancellation Prediction") as demo:
    gr.Markdown(
        """
# 🏨 Hotel Booking Cancellation Prediction
This app runs the full pipeline and compares two models:
- **Python Random Forest**
- **R LASSO Logistic Regression (glmnet)**

**Tabs**
1) Run Pipeline + Logs
2) Results & Visualizations
3) Predict Cancellation Probability (both models)
"""
    )

    # -----------------------------
    # TAB 1: Run Pipeline + Logs
    # -----------------------------
    with gr.Tab("1) Run Pipeline"):
        gr.Markdown("Run each step and inspect the execution logs.")

        # One button per pipeline step, plus one that chains all three.
        with gr.Row():
            btn_data = gr.Button("Run 1) Data Creation")
            btn_py = gr.Button("Run 2) Python Analysis")
            btn_r = gr.Button("Run 3) R Analysis")
            btn_all = gr.Button("Run All (1→2→3)")

        log_box = gr.Textbox(
            label="Execution Log",
            lines=22,
            value="Click a button to run a step. Logs will appear here.",
        )

        # Every runner returns a single log string, which replaces the textbox content.
        btn_data.click(fn=run_data_prep, outputs=log_box)
        btn_py.click(fn=run_python_model, outputs=log_box)
        btn_r.click(fn=run_r_model, outputs=log_box)
        btn_all.click(fn=run_all, outputs=log_box)

    # -----------------------------
    # TAB 2: Results & Visualizations
    # -----------------------------
    with gr.Tab("2) Results & Visualizations"):
        gr.Markdown("Loads the latest saved artifacts from **artifacts/py/** and **artifacts/r/**.")

        btn_refresh = gr.Button("Refresh Results")

        with gr.Row():
            py_metrics_view = gr.JSON(label="Python Metrics (metrics.json)")
            r_metrics_view = gr.JSON(label="R Metrics (metrics.json)")

        with gr.Row():
            py_conf_img = gr.Image(label="Python Confusion Matrix", type="filepath")
            py_roc_img = gr.Image(label="Python ROC Curve", type="filepath")
            r_roc_img = gr.Image(label="R ROC Curve", type="filepath")

        with gr.Row():
            py_fi_table = gr.Dataframe(label="Python Feature Importances (top)", interactive=False)
            r_coef_table = gr.Dataframe(label="R Coefficients (top)", interactive=False)

        with gr.Row():
            py_pred_table = gr.Dataframe(label="Python Test Predictions (top 50)", interactive=False)
            r_pred_table = gr.Dataframe(label="R Test Predictions (top 50)", interactive=False)

        # Thin wrapper around load_results(), used as the click callback.
        def _refresh():
            return load_results()

        # The outputs list must stay in the same order as the tuple
        # returned by load_results().
        btn_refresh.click(
            fn=_refresh,
            outputs=[
                py_metrics_view, r_metrics_view,
                py_conf_img, py_roc_img, r_roc_img,
                py_fi_table, r_coef_table,
                py_pred_table, r_pred_table,
            ],
        )

    # -----------------------------
    # TAB 3: Predict
    # -----------------------------
    with gr.Tab("3) Predict"):
        gr.Markdown(
            "Enter booking details and predict cancellation probability with **both models**.\n"
            "Dropdown values are taken from the dataset categories."
        )

        # Evaluated once, while the UI is being built (not per request).
        market_choices, meal_choices = get_dropdown_choices()

        with gr.Row():
            lead_time = gr.Number(label="lead_time", value=30)
            average_price = gr.Number(label="average_price", value=100)

        with gr.Row():
            total_nights = gr.Number(label="total_nights", value=3)
            total_guests = gr.Number(label="total_guests", value=2)

        with gr.Row():
            # Default to the first choice; None when the choice list is empty.
            market_segment_type = gr.Dropdown(
                label="market_segment_type",
                choices=market_choices,
                value=market_choices[0] if market_choices else None,
            )
            type_of_meal = gr.Dropdown(
                label="type_of_meal",
                choices=meal_choices,
                value=meal_choices[0] if meal_choices else None,
            )

        with gr.Row():
            special_requests = gr.Number(label="special_requests", value=1)
            price_per_guest = gr.Number(label="price_per_guest", value=50)

        btn_predict = gr.Button("Predict Cancellation Probability")

        py_pred_text = gr.Markdown()
        r_pred_text = gr.Markdown()
        comp_table = gr.Dataframe(label="Model Comparison", interactive=False)

        # The inputs list must stay in the same order as the positional
        # parameters of predict_both().
        btn_predict.click(
            fn=predict_both,
            inputs=[
                lead_time, average_price,
                total_nights, total_guests,
                market_segment_type, type_of_meal,
                special_requests, price_per_guest,
            ],
            outputs=[py_pred_text, r_pred_text, comp_table],
        )
443
+
444
+ # ============================================================
445
+ # 6) Launch
446
+ # ============================================================
447
if __name__ == "__main__":
    # Listen on all interfaces, port 7860.
    launch_options = {"server_name": "0.0.0.0", "server_port": 7860}
    demo.launch(**launch_options)