rohan965 committed on
Commit
5f338ab
·
verified ·
1 Parent(s): 3aabff9

Upload 4 files

Browse files
Files changed (4) hide show
  1. 1_Data_Creation.ipynb +288 -0
  2. 2_Python_Analysis.ipynb +396 -0
  3. 3_R_Analysis.ipynb +399 -0
  4. booking.csv +0 -0
1_Data_Creation.ipynb ADDED
@@ -0,0 +1,288 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "nbformat": 4,
3
+ "nbformat_minor": 0,
4
+ "metadata": {
5
+ "colab": {
6
+ "provenance": []
7
+ },
8
+ "kernelspec": {
9
+ "name": "python3",
10
+ "display_name": "Python 3"
11
+ },
12
+ "language_info": {
13
+ "name": "python"
14
+ }
15
+ },
16
+ "cells": [
17
+ {
18
+ "cell_type": "code",
19
+ "execution_count": 1,
20
+ "metadata": {
21
+ "colab": {
22
+ "base_uri": "https://localhost:8080/"
23
+ },
24
+ "id": "OnYkKEUJVrSc",
25
+ "outputId": "cd7f7280-d1b1-479d-94ac-a14fb1615d28"
26
+ },
27
+ "outputs": [
28
+ {
29
+ "output_type": "stream",
30
+ "name": "stdout",
31
+ "text": [
32
+ "Raw shape: (36285, 17)\n",
33
+ "Raw columns: ['Booking_ID', 'number of adults', 'number of children', 'number of weekend nights', 'number of week nights', 'type of meal', 'car parking space', 'room type', 'lead time', 'market segment type', 'repeated', 'P-C', 'P-not-C', 'average price', 'special requests', 'date of reservation', 'booking status']\n",
34
+ "\n",
35
+ "Columns after renaming: ['booking_id', 'number_of_adults', 'number_of_children', 'number_of_weekend_nights', 'number_of_week_nights', 'type_of_meal', 'car_parking_space', 'room_type', 'lead_time', 'market_segment_type', 'repeated', 'p_c', 'p_not_c', 'average_price', 'special_requests', 'date_of_reservation', 'booking_status']\n",
36
+ "\n",
37
+ "Cleaned shape: (36285, 18)\n",
38
+ "Overall cancel rate: 0.3277\n",
39
+ "\n",
40
+ "Split distribution:\n",
41
+ "split\n",
42
+ "train 29028\n",
43
+ "test 7257\n",
44
+ "Name: count, dtype: int64\n",
45
+ "Train cancel rate: 0.3276\n",
46
+ "Test cancel rate : 0.3277\n",
47
+ "\n",
48
+ "✅ Saved: hotel_cancel_model_dataset.csv | shape=(36285, 11)\n",
49
+ "✅ Saved: train.csv, test.csv\n",
50
+ "✅ Saved: features.json\n",
51
+ "✅ Saved: dataset_meta.json\n",
52
+ "\n",
53
+ "Data preparation completed successfully.\n"
54
+ ]
55
+ }
56
+ ],
57
+ "source": [
58
+ "# ============================================================\n",
59
+ "# Hotel Booking Cancellation - Data Preparation Notebook\n",
60
+ "# ============================================================\n",
61
+ "# This notebook:\n",
62
+ "# 1) Loads booking.csv (from HF repo root)\n",
63
+ "# 2) Cleans and standardizes columns\n",
64
+ "# 3) Engineers derived features and selects EXACTLY 8 modeling features\n",
65
+ "# 4) Creates target variable (is_canceled)\n",
66
+ "# 5) Creates fixed stratified train/test split column\n",
67
+ "# 6) Exports dataset + metadata for Python & R notebooks\n",
68
+ "# ============================================================\n",
69
+ "\n",
70
+ "import json\n",
71
+ "import hashlib\n",
72
+ "from pathlib import Path\n",
73
+ "\n",
74
+ "import pandas as pd\n",
75
+ "import numpy as np\n",
76
+ "\n",
77
+ "# ============================================================\n",
78
+ "# 1) Load raw dataset (booking.csv must be in repo root)\n",
79
+ "# ============================================================\n",
80
+ "\n",
81
+ "BASE_DIR = Path.cwd()\n",
82
+ "FILE_PATH = BASE_DIR / \"booking.csv\"\n",
83
+ "\n",
84
+ "if not FILE_PATH.exists():\n",
85
+ " raise FileNotFoundError(\n",
86
+ " f\"booking.csv not found in {BASE_DIR}. \"\n",
87
+ " \"Make sure the file is uploaded to the Hugging Face repo root.\"\n",
88
+ " )\n",
89
+ "\n",
90
+ "df_raw = pd.read_csv(FILE_PATH)\n",
91
+ "\n",
92
+ "print(\"Raw shape:\", df_raw.shape)\n",
93
+ "print(\"Raw columns:\", df_raw.columns.tolist())\n",
94
+ "\n",
95
+ "# ============================================================\n",
96
+ "# 2) Standardize column names to snake_case\n",
97
+ "# ============================================================\n",
98
+ "\n",
99
+ "def to_snake(s: str) -> str:\n",
100
+ " s = str(s).strip().lower()\n",
101
+ " s = s.replace(\"-\", \"_\").replace(\" \", \"_\")\n",
102
+ " s = \"\".join([c if (c.isalnum() or c == \"_\") else \"_\" for c in s])\n",
103
+ " s = \"_\".join([x for x in s.split(\"_\") if x])\n",
104
+ " return s\n",
105
+ "\n",
106
+ "df = df_raw.copy()\n",
107
+ "df.columns = [to_snake(c) for c in df.columns]\n",
108
+ "\n",
109
+ "print(\"\\nColumns after renaming:\", df.columns.tolist())\n",
110
+ "\n",
111
+ "# ============================================================\n",
112
+ "# 3) Basic cleaning\n",
113
+ "# ============================================================\n",
114
+ "\n",
115
+ "# Numeric columns present in this Kaggle dataset\n",
116
+ "numeric_cols = [\n",
117
+ " \"number_of_adults\",\n",
118
+ " \"number_of_children\",\n",
119
+ " \"number_of_weekend_nights\",\n",
120
+ " \"number_of_week_nights\",\n",
121
+ " \"lead_time\",\n",
122
+ " \"average_price\",\n",
123
+ " \"special_requests\",\n",
124
+ " \"car_parking_space\",\n",
125
+ " \"repeated\",\n",
126
+ "]\n",
127
+ "\n",
128
+ "categorical_cols = [\n",
129
+ " \"market_segment_type\",\n",
130
+ " \"type_of_meal\",\n",
131
+ " \"room_type\", # not used in final model but cleaned\n",
132
+ "]\n",
133
+ "\n",
134
+ "# Convert numeric columns\n",
135
+ "for c in numeric_cols:\n",
136
+ " if c in df.columns:\n",
137
+ " df[c] = pd.to_numeric(df[c], errors=\"coerce\")\n",
138
+ "\n",
139
+ "# Clean categorical columns\n",
140
+ "for c in categorical_cols:\n",
141
+ " if c in df.columns:\n",
142
+ " df[c] = (\n",
143
+ " df[c]\n",
144
+ " .astype(str)\n",
145
+ " .str.strip()\n",
146
+ " .replace({\"nan\": \"unknown\", \"none\": \"unknown\", \"\": \"unknown\"})\n",
147
+ " .fillna(\"unknown\")\n",
148
+ " )\n",
149
+ "\n",
150
+ "# Create target variable\n",
151
+ "if \"booking_status\" not in df.columns:\n",
152
+ " raise ValueError(\"Expected column 'booking_status' not found.\")\n",
153
+ "\n",
154
+ "df[\"booking_status\"] = df[\"booking_status\"].astype(str).str.strip()\n",
155
+ "df[\"is_canceled\"] = (df[\"booking_status\"].str.lower() == \"canceled\").astype(int)\n",
156
+ "\n",
157
+ "# Handle missing numeric values using the column median (computed over all rows, before the train/test split)\n",
158
+ "for c in numeric_cols:\n",
159
+ " if c in df.columns:\n",
160
+ " df[c] = df[c].fillna(df[c].median())\n",
161
+ "\n",
162
+ "print(\"\\nCleaned shape:\", df.shape)\n",
163
+ "print(\"Overall cancel rate:\", round(df[\"is_canceled\"].mean(), 4))\n",
164
+ "\n",
165
+ "# ============================================================\n",
166
+ "# 4) Feature Engineering (FINAL 8 FEATURES)\n",
167
+ "# ============================================================\n",
168
+ "\n",
169
+ "# 4.1 Total nights\n",
170
+ "df[\"total_nights\"] = (\n",
171
+ " df[\"number_of_weekend_nights\"] + df[\"number_of_week_nights\"]\n",
172
+ ")\n",
173
+ "\n",
174
+ "# 4.2 Total guests\n",
175
+ "df[\"total_guests\"] = (\n",
176
+ " df[\"number_of_adults\"] + df[\"number_of_children\"]\n",
177
+ ")\n",
178
+ "\n",
179
+ "# 4.3 Price per guest (avoid division by zero)\n",
180
+ "denom = np.where(df[\"total_guests\"] > 0, df[\"total_guests\"], 1)\n",
181
+ "df[\"price_per_guest\"] = df[\"average_price\"] / denom\n",
182
+ "\n",
183
+ "# ============================================================\n",
184
+ "# 5) Select EXACT 8 features\n",
185
+ "# ============================================================\n",
186
+ "\n",
187
+ "FINAL_FEATURES = [\n",
188
+ " \"lead_time\",\n",
189
+ " \"average_price\",\n",
190
+ " \"total_nights\",\n",
191
+ " \"total_guests\",\n",
192
+ " \"market_segment_type\",\n",
193
+ " \"type_of_meal\",\n",
194
+ " \"special_requests\",\n",
195
+ " \"price_per_guest\",\n",
196
+ "]\n",
197
+ "\n",
198
+ "missing = [c for c in FINAL_FEATURES if c not in df.columns]\n",
199
+ "if missing:\n",
200
+ " raise ValueError(f\"Missing required feature columns: {missing}\")\n",
201
+ "\n",
202
+ "id_col = \"booking_id\" if \"booking_id\" in df.columns else None\n",
203
+ "\n",
204
+ "export_cols = ([id_col] if id_col else []) + FINAL_FEATURES + [\"is_canceled\"]\n",
205
+ "df_model = df[export_cols].copy()\n",
206
+ "\n",
207
+ "# ============================================================\n",
208
+ "# 6) Create FIXED stratified train/test split\n",
209
+ "# ============================================================\n",
210
+ "\n",
211
+ "rng = np.random.default_rng(42)\n",
212
+ "df_model[\"split\"] = \"train\"\n",
213
+ "\n",
214
+ "for label in [0, 1]:\n",
215
+ " idx = df_model.index[df_model[\"is_canceled\"] == label].to_numpy()\n",
216
+ " rng.shuffle(idx)\n",
217
+ " test_size = int(round(0.2 * len(idx)))\n",
218
+ " test_idx = idx[:test_size]\n",
219
+ " df_model.loc[test_idx, \"split\"] = \"test\"\n",
220
+ "\n",
221
+ "print(\"\\nSplit distribution:\")\n",
222
+ "print(df_model[\"split\"].value_counts())\n",
223
+ "print(\"Train cancel rate:\", round(df_model[df_model[\"split\"] == \"train\"][\"is_canceled\"].mean(), 4))\n",
224
+ "print(\"Test cancel rate :\", round(df_model[df_model[\"split\"] == \"test\"][\"is_canceled\"].mean(), 4))\n",
225
+ "\n",
226
+ "# ============================================================\n",
227
+ "# 7) Export files for Python + R notebooks + HF app\n",
228
+ "# ============================================================\n",
229
+ "\n",
230
+ "OUT_DATASET = \"hotel_cancel_model_dataset.csv\"\n",
231
+ "df_model.to_csv(OUT_DATASET, index=False, encoding=\"utf-8\")\n",
232
+ "print(f\"\\n✅ Saved: {OUT_DATASET} | shape={df_model.shape}\")\n",
233
+ "\n",
234
+ "# Optional convenience splits\n",
235
+ "df_model[df_model[\"split\"] == \"train\"].to_csv(\"train.csv\", index=False)\n",
236
+ "df_model[df_model[\"split\"] == \"test\"].to_csv(\"test.csv\", index=False)\n",
237
+ "print(\"✅ Saved: train.csv, test.csv\")\n",
238
+ "\n",
239
+ "# ============================================================\n",
240
+ "# 8) Export feature schema (for Python model + app)\n",
241
+ "# ============================================================\n",
242
+ "\n",
243
+ "features_payload = {\n",
244
+ " \"id_col\": id_col,\n",
245
+ " \"target\": \"is_canceled\",\n",
246
+ " \"split_col\": \"split\",\n",
247
+ " \"features\": FINAL_FEATURES,\n",
248
+ " \"categorical_features\": [\"market_segment_type\", \"type_of_meal\"],\n",
249
+ " \"numeric_features\": [\n",
250
+ " \"lead_time\",\n",
251
+ " \"average_price\",\n",
252
+ " \"total_nights\",\n",
253
+ " \"total_guests\",\n",
254
+ " \"special_requests\",\n",
255
+ " \"price_per_guest\",\n",
256
+ " ],\n",
257
+ "}\n",
258
+ "\n",
259
+ "with open(\"features.json\", \"w\", encoding=\"utf-8\") as f:\n",
260
+ " json.dump(features_payload, f, indent=2)\n",
261
+ "\n",
262
+ "print(\"✅ Saved: features.json\")\n",
263
+ "\n",
264
+ "# ============================================================\n",
265
+ "# 9) Dataset fingerprint (reproducibility)\n",
266
+ "# ============================================================\n",
267
+ "\n",
268
+ "with open(OUT_DATASET, \"rb\") as f:\n",
269
+ " md5 = hashlib.md5(f.read()).hexdigest()\n",
270
+ "\n",
271
+ "dataset_meta = {\n",
272
+ " \"dataset_file\": OUT_DATASET,\n",
273
+ " \"md5\": md5,\n",
274
+ " \"rows\": int(df_model.shape[0]),\n",
275
+ " \"cols\": int(df_model.shape[1]),\n",
276
+ " \"cancel_rate_overall\": float(df_model[\"is_canceled\"].mean()),\n",
277
+ "}\n",
278
+ "\n",
279
+ "with open(\"dataset_meta.json\", \"w\", encoding=\"utf-8\") as f:\n",
280
+ " json.dump(dataset_meta, f, indent=2)\n",
281
+ "\n",
282
+ "print(\"✅ Saved: dataset_meta.json\")\n",
283
+ "\n",
284
+ "print(\"\\nData preparation completed successfully.\")"
285
+ ]
286
+ }
287
+ ]
288
+ }
2_Python_Analysis.ipynb ADDED
@@ -0,0 +1,396 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "nbformat": 4,
3
+ "nbformat_minor": 0,
4
+ "metadata": {
5
+ "colab": {
6
+ "provenance": []
7
+ },
8
+ "kernelspec": {
9
+ "name": "python3",
10
+ "display_name": "Python 3"
11
+ },
12
+ "language_info": {
13
+ "name": "python"
14
+ }
15
+ },
16
+ "cells": [
17
+ {
18
+ "cell_type": "code",
19
+ "execution_count": 1,
20
+ "metadata": {
21
+ "colab": {
22
+ "base_uri": "https://localhost:8080/"
23
+ },
24
+ "id": "3F3JK2ZTayYg",
25
+ "outputId": "a724f9a3-5603-4db5-9e38-095323467abe"
26
+ },
27
+ "outputs": [
28
+ {
29
+ "output_type": "stream",
30
+ "name": "stdout",
31
+ "text": [
32
+ "Loaded dataset: /content/hotel_cancel_model_dataset.csv\n",
33
+ "Shape: (36285, 11)\n",
34
+ "Features: ['lead_time', 'average_price', 'total_nights', 'total_guests', 'market_segment_type', 'type_of_meal', 'special_requests', 'price_per_guest']\n",
35
+ "Target: is_canceled\n",
36
+ "Split column: split\n",
37
+ "\n",
38
+ "Split distribution:\n",
39
+ "split\n",
40
+ "train 29028\n",
41
+ "test 7257\n",
42
+ "Name: count, dtype: int64\n",
43
+ "Overall cancel rate: 0.3277\n",
44
+ "\n",
45
+ "Train rows: 29028 | Test rows: 7257\n",
46
+ "Train cancel rate: 0.3276\n",
47
+ "Test cancel rate : 0.3277\n",
48
+ "\n",
49
+ "=== Test Metrics (Random Forest) ===\n",
50
+ "Accuracy: 0.8794\n",
51
+ "Precision: 0.8417\n",
52
+ "Recall: 0.7784\n",
53
+ "F1: 0.8088\n",
54
+ "ROC AUC: 0.9382\n",
55
+ "\n",
56
+ "Confusion Matrix:\n",
57
+ " [[4531 348]\n",
58
+ " [ 527 1851]]\n",
59
+ "✅ Saved: /content/artifacts/py/figures/confusion_matrix.png\n",
60
+ "✅ Saved: /content/artifacts/py/figures/roc_curve.png\n",
61
+ "✅ Saved: /content/artifacts/py/metrics/metrics.json\n",
62
+ "✅ Saved: /content/artifacts/py/tables/test_predictions.csv\n",
63
+ "✅ Saved: /content/artifacts/py/tables/feature_importances.csv\n",
64
+ "✅ Saved: /content/artifacts/py/models/model.joblib\n",
65
+ "✅ Saved: /content/artifacts/py/example_input.json\n",
66
+ "\n",
67
+ "All done. Python artifacts saved to: /content/artifacts/py\n",
68
+ "Next: update the R notebook to use the same split + features and output artifacts/r/...\n"
69
+ ]
70
+ }
71
+ ],
72
+ "source": [
73
+ "# ============================================================\n",
74
+ "# Hotel Booking Cancellation - Python Model Notebook (Random Forest)\n",
75
+ "# ============================================================\n",
76
+ "# This notebook/script:\n",
77
+ "# 1) Loads the modeling dataset created in the Data Preparation step:\n",
78
+ "# - hotel_cancel_model_dataset.csv\n",
79
+ "# - features.json\n",
80
+ "# - (optionally) dataset_meta.json\n",
81
+ "# 2) Uses the fixed \"split\" column to train on train rows and evaluate on test rows\n",
82
+ "# 3) Trains a RandomForest model inside a single sklearn Pipeline:\n",
83
+ "# - Numeric: median imputation\n",
84
+ "# - Categorical: most_frequent imputation + OneHotEncoder\n",
85
+ "# 4) Exports Hugging Face app–ready artifacts to artifacts/py/:\n",
86
+ "# - model.joblib\n",
87
+ "# - metrics.json\n",
88
+ "# - confusion_matrix.png\n",
89
+ "# - roc_curve.png\n",
90
+ "# - feature_importances.csv\n",
91
+ "# - test_predictions.csv\n",
92
+ "# - example_input.json\n",
93
+ "# ============================================================\n",
94
+ "\n",
95
+ "import json\n",
96
+ "from pathlib import Path\n",
97
+ "\n",
98
+ "import numpy as np\n",
99
+ "import pandas as pd\n",
100
+ "import matplotlib.pyplot as plt\n",
101
+ "\n",
102
+ "from sklearn.compose import ColumnTransformer\n",
103
+ "from sklearn.impute import SimpleImputer\n",
104
+ "from sklearn.pipeline import Pipeline\n",
105
+ "from sklearn.preprocessing import OneHotEncoder\n",
106
+ "from sklearn.ensemble import RandomForestClassifier\n",
107
+ "\n",
108
+ "from sklearn.metrics import (\n",
109
+ " accuracy_score,\n",
110
+ " precision_score,\n",
111
+ " recall_score,\n",
112
+ " f1_score,\n",
113
+ " roc_auc_score,\n",
114
+ " roc_curve,\n",
115
+ " confusion_matrix,\n",
116
+ " classification_report\n",
117
+ ")\n",
118
+ "\n",
119
+ "import joblib\n",
120
+ "\n",
121
+ "# ============================================================\n",
122
+ "# 1) Paths / Inputs\n",
123
+ "# ============================================================\n",
124
+ "\n",
125
+ "BASE_DIR = Path.cwd()\n",
126
+ "\n",
127
+ "DATASET_PATH = BASE_DIR / \"hotel_cancel_model_dataset.csv\"\n",
128
+ "FEATURES_PATH = BASE_DIR / \"features.json\"\n",
129
+ "DATASET_META_PATH = BASE_DIR / \"dataset_meta.json\" # optional\n",
130
+ "\n",
131
+ "if not DATASET_PATH.exists():\n",
132
+ " raise FileNotFoundError(f\"Missing file: {DATASET_PATH}. Run the Data Preparation step first.\")\n",
133
+ "if not FEATURES_PATH.exists():\n",
134
+ " raise FileNotFoundError(f\"Missing file: {FEATURES_PATH}. Run the Data Preparation step first.\")\n",
135
+ "\n",
136
+ "# Output folders for Hugging Face app to display\n",
137
+ "ARTIFACTS_DIR = BASE_DIR / \"artifacts\" / \"py\"\n",
138
+ "FIG_DIR = ARTIFACTS_DIR / \"figures\"\n",
139
+ "METRICS_DIR = ARTIFACTS_DIR / \"metrics\"\n",
140
+ "MODELS_DIR = ARTIFACTS_DIR / \"models\"\n",
141
+ "TABLES_DIR = ARTIFACTS_DIR / \"tables\"\n",
142
+ "\n",
143
+ "for d in [FIG_DIR, METRICS_DIR, MODELS_DIR, TABLES_DIR]:\n",
144
+ " d.mkdir(parents=True, exist_ok=True)\n",
145
+ "\n",
146
+ "# ============================================================\n",
147
+ "# 2) Load dataset + schema\n",
148
+ "# ============================================================\n",
149
+ "\n",
150
+ "df = pd.read_csv(DATASET_PATH)\n",
151
+ "\n",
152
+ "with open(FEATURES_PATH, \"r\", encoding=\"utf-8\") as f:\n",
153
+ " schema = json.load(f)\n",
154
+ "\n",
155
+ "FEATURES = schema[\"features\"]\n",
156
+ "TARGET = schema[\"target\"]\n",
157
+ "SPLIT_COL = schema[\"split_col\"]\n",
158
+ "ID_COL = schema.get(\"id_col\", None)\n",
159
+ "\n",
160
+ "cat_features = schema[\"categorical_features\"]\n",
161
+ "num_features = schema[\"numeric_features\"]\n",
162
+ "\n",
163
+ "print(\"Loaded dataset:\", DATASET_PATH)\n",
164
+ "print(\"Shape:\", df.shape)\n",
165
+ "print(\"Features:\", FEATURES)\n",
166
+ "print(\"Target:\", TARGET)\n",
167
+ "print(\"Split column:\", SPLIT_COL)\n",
168
+ "\n",
169
+ "# Basic validation\n",
170
+ "missing_cols = [c for c in FEATURES + [TARGET, SPLIT_COL] if c not in df.columns]\n",
171
+ "if missing_cols:\n",
172
+ " raise ValueError(f\"Dataset is missing required columns: {missing_cols}\")\n",
173
+ "\n",
174
+ "print(\"\\nSplit distribution:\")\n",
175
+ "print(df[SPLIT_COL].value_counts(dropna=False))\n",
176
+ "print(\"Overall cancel rate:\", round(df[TARGET].mean(), 4))\n",
177
+ "\n",
178
+ "# ============================================================\n",
179
+ "# 3) Train/Test split using the fixed split column\n",
180
+ "# ============================================================\n",
181
+ "\n",
182
+ "train_df = df[df[SPLIT_COL] == \"train\"].copy()\n",
183
+ "test_df = df[df[SPLIT_COL] == \"test\"].copy()\n",
184
+ "\n",
185
+ "if train_df.empty or test_df.empty:\n",
186
+ " raise ValueError(\"Train or test split is empty. Check the 'split' column in the dataset.\")\n",
187
+ "\n",
188
+ "X_train = train_df[FEATURES]\n",
189
+ "y_train = train_df[TARGET].astype(int)\n",
190
+ "\n",
191
+ "X_test = test_df[FEATURES]\n",
192
+ "y_test = test_df[TARGET].astype(int)\n",
193
+ "\n",
194
+ "print(\"\\nTrain rows:\", len(train_df), \"| Test rows:\", len(test_df))\n",
195
+ "print(\"Train cancel rate:\", round(y_train.mean(), 4))\n",
196
+ "print(\"Test cancel rate :\", round(y_test.mean(), 4))\n",
197
+ "\n",
198
+ "# ============================================================\n",
199
+ "# 4) Build preprocessing + model pipeline\n",
200
+ "# ============================================================\n",
201
+ "\n",
202
+ "# Numeric preprocessing: fill missing with median\n",
203
+ "numeric_transformer = Pipeline(steps=[\n",
204
+ " (\"imputer\", SimpleImputer(strategy=\"median\")),\n",
205
+ "])\n",
206
+ "\n",
207
+ "# Categorical preprocessing: fill missing with most frequent, then one-hot encode\n",
208
+ "categorical_transformer = Pipeline(steps=[\n",
209
+ " (\"imputer\", SimpleImputer(strategy=\"most_frequent\")),\n",
210
+ " (\"onehot\", OneHotEncoder(handle_unknown=\"ignore\")),\n",
211
+ "])\n",
212
+ "\n",
213
+ "preprocess = ColumnTransformer(\n",
214
+ " transformers=[\n",
215
+ " (\"num\", numeric_transformer, num_features),\n",
216
+ " (\"cat\", categorical_transformer, cat_features),\n",
217
+ " ],\n",
218
+ " remainder=\"drop\"\n",
219
+ ")\n",
220
+ "\n",
221
+ "# Random Forest model (balanced helps with class imbalance)\n",
222
+ "model = RandomForestClassifier(\n",
223
+ " n_estimators=400,\n",
224
+ " random_state=42,\n",
225
+ " class_weight=\"balanced\",\n",
226
+ " n_jobs=-1\n",
227
+ ")\n",
228
+ "\n",
229
+ "clf = Pipeline(steps=[\n",
230
+ " (\"preprocess\", preprocess),\n",
231
+ " (\"model\", model),\n",
232
+ "])\n",
233
+ "\n",
234
+ "# ============================================================\n",
235
+ "# 5) Train\n",
236
+ "# ============================================================\n",
237
+ "\n",
238
+ "clf.fit(X_train, y_train)\n",
239
+ "\n",
240
+ "# ============================================================\n",
241
+ "# 6) Predict + evaluate\n",
242
+ "# ============================================================\n",
243
+ "\n",
244
+ "# Predicted class\n",
245
+ "y_pred = clf.predict(X_test)\n",
246
+ "\n",
247
+ "# Predicted probability for class \"1\" (canceled)\n",
248
+ "y_proba = clf.predict_proba(X_test)[:, 1]\n",
249
+ "\n",
250
+ "acc = accuracy_score(y_test, y_pred)\n",
251
+ "prec = precision_score(y_test, y_pred, zero_division=0)\n",
252
+ "rec = recall_score(y_test, y_pred, zero_division=0)\n",
253
+ "f1 = f1_score(y_test, y_pred, zero_division=0)\n",
254
+ "\n",
255
+ "# AUC requires probabilities\n",
256
+ "auc = roc_auc_score(y_test, y_proba)\n",
257
+ "\n",
258
+ "cm = confusion_matrix(y_test, y_pred)\n",
259
+ "report = classification_report(y_test, y_pred, output_dict=True, zero_division=0)\n",
260
+ "\n",
261
+ "print(\"\\n=== Test Metrics (Random Forest) ===\")\n",
262
+ "print(\"Accuracy:\", round(acc, 4))\n",
263
+ "print(\"Precision:\", round(prec, 4))\n",
264
+ "print(\"Recall:\", round(rec, 4))\n",
265
+ "print(\"F1:\", round(f1, 4))\n",
266
+ "print(\"ROC AUC:\", round(auc, 4))\n",
267
+ "print(\"\\nConfusion Matrix:\\n\", cm)\n",
268
+ "\n",
269
+ "# ============================================================\n",
270
+ "# 7) Save plots for Hugging Face app\n",
271
+ "# ============================================================\n",
272
+ "\n",
273
+ "# 7.1 Confusion matrix plot\n",
274
+ "plt.figure()\n",
275
+ "plt.imshow(cm)\n",
276
+ "plt.title(\"Confusion Matrix (Random Forest)\")\n",
277
+ "plt.xlabel(\"Predicted\")\n",
278
+ "plt.ylabel(\"Actual\")\n",
279
+ "plt.xticks([0, 1], [\"Not canceled (0)\", \"Canceled (1)\"])\n",
280
+ "plt.yticks([0, 1], [\"Not canceled (0)\", \"Canceled (1)\"])\n",
281
+ "\n",
282
+ "# Add numbers on the matrix\n",
283
+ "for (i, j), v in np.ndenumerate(cm):\n",
284
+ " plt.text(j, i, str(v), ha=\"center\", va=\"center\")\n",
285
+ "\n",
286
+ "conf_path = FIG_DIR / \"confusion_matrix.png\"\n",
287
+ "plt.tight_layout()\n",
288
+ "plt.savefig(conf_path, dpi=200)\n",
289
+ "plt.close()\n",
290
+ "print(\"✅ Saved:\", conf_path)\n",
291
+ "\n",
292
+ "# 7.2 ROC curve plot\n",
293
+ "fpr, tpr, thresholds = roc_curve(y_test, y_proba)\n",
294
+ "\n",
295
+ "plt.figure()\n",
296
+ "plt.plot(fpr, tpr)\n",
297
+ "plt.plot([0, 1], [0, 1], linestyle=\"--\")\n",
298
+ "plt.title(\"ROC Curve (Random Forest)\")\n",
299
+ "plt.xlabel(\"False Positive Rate\")\n",
300
+ "plt.ylabel(\"True Positive Rate\")\n",
301
+ "\n",
302
+ "roc_path = FIG_DIR / \"roc_curve.png\"\n",
303
+ "plt.tight_layout()\n",
304
+ "plt.savefig(roc_path, dpi=200)\n",
305
+ "plt.close()\n",
306
+ "print(\"✅ Saved:\", roc_path)\n",
307
+ "\n",
308
+ "# ============================================================\n",
309
+ "# 8) Save metrics + tables for Hugging Face app\n",
310
+ "# ============================================================\n",
311
+ "\n",
312
+ "metrics_payload = {\n",
313
+ " \"model_name\": \"RandomForestClassifier\",\n",
314
+ " \"features_used\": FEATURES,\n",
315
+ " \"n_train\": int(len(train_df)),\n",
316
+ " \"n_test\": int(len(test_df)),\n",
317
+ " \"cancel_rate_train\": float(y_train.mean()),\n",
318
+ " \"cancel_rate_test\": float(y_test.mean()),\n",
319
+ " \"accuracy\": float(acc),\n",
320
+ " \"precision\": float(prec),\n",
321
+ " \"recall\": float(rec),\n",
322
+ " \"f1\": float(f1),\n",
323
+ " \"roc_auc\": float(auc),\n",
324
+ " \"confusion_matrix\": cm.tolist(),\n",
325
+ " \"classification_report\": report, # full dict\n",
326
+ "}\n",
327
+ "\n",
328
+ "metrics_path = METRICS_DIR / \"metrics.json\"\n",
329
+ "with open(metrics_path, \"w\", encoding=\"utf-8\") as f:\n",
330
+ " json.dump(metrics_payload, f, indent=2)\n",
331
+ "print(\"✅ Saved:\", metrics_path)\n",
332
+ "\n",
333
+ "# Save test predictions table (so app can display top risky bookings)\n",
334
+ "pred_df = test_df.copy()\n",
335
+ "pred_df[\"pred_label\"] = y_pred\n",
336
+ "pred_df[\"pred_proba_canceled\"] = y_proba\n",
337
+ "\n",
338
+ "# Keep the id column if present; the row index is not exported (index=False below)\n",
339
+ "cols_to_keep = []\n",
340
+ "if ID_COL and ID_COL in pred_df.columns:\n",
341
+ " cols_to_keep.append(ID_COL)\n",
342
+ "\n",
343
+ "cols_to_keep += FEATURES + [TARGET, \"pred_label\", \"pred_proba_canceled\"]\n",
344
+ "\n",
345
+ "pred_out = pred_df[cols_to_keep].sort_values(\"pred_proba_canceled\", ascending=False)\n",
346
+ "pred_path = TABLES_DIR / \"test_predictions.csv\"\n",
347
+ "pred_out.to_csv(pred_path, index=False)\n",
348
+ "print(\"✅ Saved:\", pred_path)\n",
349
+ "\n",
350
+ "# ============================================================\n",
351
+ "# 9) Feature importances (mapped back to one-hot feature names)\n",
352
+ "# ============================================================\n",
353
+ "\n",
354
+ "# Extract feature names after preprocessing\n",
355
+ "preprocessor = clf.named_steps[\"preprocess\"]\n",
356
+ "ohe = preprocessor.named_transformers_[\"cat\"].named_steps[\"onehot\"]\n",
357
+ "\n",
358
+ "cat_feature_names = list(ohe.get_feature_names_out(cat_features))\n",
359
+ "all_feature_names = num_features + cat_feature_names\n",
360
+ "\n",
361
+ "rf = clf.named_steps[\"model\"]\n",
362
+ "importances = rf.feature_importances_\n",
363
+ "\n",
364
+ "fi = pd.DataFrame({\n",
365
+ " \"feature\": all_feature_names,\n",
366
+ " \"importance\": importances\n",
367
+ "}).sort_values(\"importance\", ascending=False)\n",
368
+ "\n",
369
+ "fi_path = TABLES_DIR / \"feature_importances.csv\"\n",
370
+ "fi.to_csv(fi_path, index=False)\n",
371
+ "print(\"✅ Saved:\", fi_path)\n",
372
+ "\n",
373
+ "# ============================================================\n",
374
+ "# 10) Save trained model for app inference\n",
375
+ "# ============================================================\n",
376
+ "\n",
377
+ "model_path = MODELS_DIR / \"model.joblib\"\n",
378
+ "joblib.dump(clf, model_path)\n",
379
+ "print(\"✅ Saved:\", model_path)\n",
380
+ "\n",
381
+ "# ============================================================\n",
382
+ "# 11) Save an example input row for the app's \"Predict\" tab\n",
383
+ "# ============================================================\n",
384
+ "\n",
385
+ "example_row = train_df[FEATURES].iloc[0].to_dict()\n",
386
+ "example_path = ARTIFACTS_DIR / \"example_input.json\"\n",
387
+ "with open(example_path, \"w\", encoding=\"utf-8\") as f:\n",
388
+ " json.dump(example_row, f, indent=2)\n",
389
+ "print(\"✅ Saved:\", example_path)\n",
390
+ "\n",
391
+ "print(\"\\nAll done. Python artifacts saved to:\", ARTIFACTS_DIR)\n",
392
+ "print(\"Next: update the R notebook to use the same split + features and output artifacts/r/...\")"
393
+ ]
394
+ }
395
+ ]
396
+ }
3_R_Analysis.ipynb ADDED
@@ -0,0 +1,399 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "nbformat": 4,
3
+ "nbformat_minor": 0,
4
+ "metadata": {
5
+ "colab": {
6
+ "provenance": []
7
+ },
8
+ "kernelspec": {
9
+ "name": "ir",
10
+ "display_name": "R"
11
+ },
12
+ "language_info": {
13
+ "name": "R"
14
+ }
15
+ },
16
+ "cells": [
17
+ {
18
+ "cell_type": "code",
19
+ "execution_count": 1,
20
+ "metadata": {
21
+ "id": "PjBjk0dxe_u7",
22
+ "colab": {
23
+ "base_uri": "https://localhost:8080/",
24
+ "height": 1000
25
+ },
26
+ "outputId": "26aa869d-f8e4-45fe-a7f5-75e58416d422"
27
+ },
28
+ "outputs": [
29
+ {
30
+ "output_type": "stream",
31
+ "name": "stderr",
32
+ "text": [
33
+ "Installing packages into ‘/usr/local/lib/R/site-library’\n",
34
+ "(as ‘lib’ is unspecified)\n",
35
+ "\n",
36
+ "also installing the dependencies ‘iterators’, ‘foreach’, ‘shape’, ‘RcppEigen’\n",
37
+ "\n",
38
+ "\n",
39
+ "Loading required package: Matrix\n",
40
+ "\n",
41
+ "Loaded glmnet 4.1-10\n",
42
+ "\n",
43
+ "Type 'citation(\"pROC\")' for a citation.\n",
44
+ "\n",
45
+ "\n",
46
+ "Attaching package: ‘pROC’\n",
47
+ "\n",
48
+ "\n",
49
+ "The following objects are masked from ‘package:stats’:\n",
50
+ "\n",
51
+ " cov, smooth, var\n",
52
+ "\n",
53
+ "\n"
54
+ ]
55
+ },
56
+ {
57
+ "output_type": "stream",
58
+ "name": "stdout",
59
+ "text": [
60
+ "Loaded dataset: /content/hotel_cancel_model_dataset.csv \n",
61
+ "Shape: 36285 rows x 11 cols\n",
62
+ "Target: is_canceled \n",
63
+ "Split column: split \n",
64
+ "Features: lead_time, average_price, total_nights, total_guests, market_segment_type, type_of_meal, special_requests, price_per_guest \n",
65
+ "\n",
66
+ "Split distribution:\n",
67
+ "\n",
68
+ " test train \n",
69
+ " 7257 29028 \n",
70
+ "\n",
71
+ "Overall cancel rate: 0.3276561 \n",
72
+ "\n",
73
+ "Train rows: 29028 | Test rows: 7257 \n",
74
+ "Train cancel rate: 0.3276492 \n",
75
+ "Test cancel rate : 0.3276836 \n",
76
+ "\n",
77
+ "Chosen lambda (1se): 0.01835658 \n",
78
+ "\n",
79
+ "=== Test Metrics (LASSO Logistic Regression) ===\n",
80
+ "Accuracy: 0.7915 \n",
81
+ "Precision: 0.7658 \n",
82
+ "Recall: 0.524 \n",
83
+ "F1: 0.6222 \n",
84
+ "ROC AUC: 0.8511 \n",
85
+ "\n",
86
+ "Confusion Matrix:\n",
87
+ " Pred 0 Pred 1\n",
88
+ "Actual 0 4498 381\n",
89
+ "Actual 1 1132 1246\n"
90
+ ]
91
+ },
92
+ {
93
+ "output_type": "display_data",
94
+ "data": {
95
+ "text/html": [
96
+ "<strong>agg_record_904241799:</strong> 2"
97
+ ],
98
+ "text/markdown": "**agg_record_904241799:** 2",
99
+ "text/latex": "\\textbf{agg\\textbackslash{}\\_record\\textbackslash{}\\_904241799:} 2",
100
+ "text/plain": [
101
+ "agg_record_904241799 \n",
102
+ " 2 "
103
+ ]
104
+ },
105
+ "metadata": {}
106
+ },
107
+ {
108
+ "output_type": "stream",
109
+ "name": "stdout",
110
+ "text": [
111
+ "✅ Saved: /content/artifacts/r/figures/roc_curve.png \n",
112
+ "✅ Saved: /content/artifacts/r/metrics/metrics.json \n",
113
+ "✅ Saved: /content/artifacts/r/tables/test_predictions.csv \n",
114
+ "✅ Saved: /content/artifacts/r/tables/coefficients.csv \n",
115
+ "✅ Saved: /content/artifacts/r/models/model.rds \n",
116
+ "✅ Saved: /content/artifacts/r/example_input.json \n",
117
+ "\n",
118
+ "All done. R artifacts saved to: /content/artifacts/r \n",
119
+ "Next: build app.py to run notebooks and display artifacts/py and artifacts/r side-by-side.\n"
120
+ ]
121
+ }
122
+ ],
123
+ "source": [
124
+ "# ============================================================\n",
125
+ "# Hotel Booking Cancellation - R Model Notebook (Logistic Regression via glmnet LASSO)\n",
126
+ "# ============================================================\n",
127
+ "# This notebook/script:\n",
128
+ "# 1) Loads the modeling dataset created in Data Preparation:\n",
129
+ "# - hotel_cancel_model_dataset.csv\n",
130
+ "# - features.json\n",
131
+ "# - (optionally) dataset_meta.json (note: not currently read by this script)\n",
132
+ "# 2) Uses the fixed \"split\" column to train on train rows and evaluate on test rows\n",
133
+ "# 3) Trains a LASSO Logistic Regression model (glmnet, alpha=1) with CV\n",
134
+ "# 4) Exports Hugging Face app–ready artifacts under artifacts/r/ (most go into the figures/, metrics/, models/ and tables/ subfolders):\n",
135
+ "# - model.rds\n",
136
+ "# - metrics.json\n",
137
+ "# - roc_curve.png\n",
138
+ "# - coefficients.csv\n",
139
+ "# - test_predictions.csv\n",
140
+ "# - example_input.json (for app testing)\n",
141
+ "# ============================================================\n",
142
+ "\n",
143
+ "# ============================================================\n",
144
+ "# 0) Libraries (install if needed)\n",
145
+ "# ============================================================\n",
146
+ "required_pkgs <- c(\"jsonlite\", \"glmnet\", \"Matrix\", \"pROC\")\n",
147
+ "to_install <- required_pkgs[!required_pkgs %in% rownames(installed.packages())]\n",
148
+ "if (length(to_install) > 0) {\n",
149
+ " install.packages(to_install, repos = \"https://cloud.r-project.org\")\n",
150
+ "}\n",
151
+ "\n",
152
+ "library(jsonlite)\n",
153
+ "library(glmnet)\n",
154
+ "library(Matrix)\n",
155
+ "library(pROC)\n",
156
+ "\n",
157
+ "# ============================================================\n",
158
+ "# 1) Paths / Inputs\n",
159
+ "# ============================================================\n",
160
+ "BASE_DIR <- getwd()\n",
161
+ "\n",
162
+ "DATASET_PATH <- file.path(BASE_DIR, \"hotel_cancel_model_dataset.csv\")\n",
163
+ "FEATURES_PATH <- file.path(BASE_DIR, \"features.json\")\n",
164
+ "\n",
165
+ "if (!file.exists(DATASET_PATH)) stop(paste(\"Missing file:\", DATASET_PATH, \"Run Data Preparation first.\"))\n",
166
+ "if (!file.exists(FEATURES_PATH)) stop(paste(\"Missing file:\", FEATURES_PATH, \"Run Data Preparation first.\"))\n",
167
+ "\n",
168
+ "# Output folders for Hugging Face app to display\n",
169
+ "ARTIFACTS_DIR <- file.path(BASE_DIR, \"artifacts\", \"r\")\n",
170
+ "FIG_DIR <- file.path(ARTIFACTS_DIR, \"figures\")\n",
171
+ "METRICS_DIR <- file.path(ARTIFACTS_DIR, \"metrics\")\n",
172
+ "MODELS_DIR <- file.path(ARTIFACTS_DIR, \"models\")\n",
173
+ "TABLES_DIR <- file.path(ARTIFACTS_DIR, \"tables\")\n",
174
+ "\n",
175
+ "dir.create(FIG_DIR, recursive = TRUE, showWarnings = FALSE)\n",
176
+ "dir.create(METRICS_DIR, recursive = TRUE, showWarnings = FALSE)\n",
177
+ "dir.create(MODELS_DIR, recursive = TRUE, showWarnings = FALSE)\n",
178
+ "dir.create(TABLES_DIR, recursive = TRUE, showWarnings = FALSE)\n",
179
+ "\n",
180
+ "# ============================================================\n",
181
+ "# 2) Load dataset + schema (features.json)\n",
182
+ "# ============================================================\n",
183
+ "df <- read.csv(DATASET_PATH, stringsAsFactors = FALSE)\n",
184
+ "\n",
185
+ "schema <- fromJSON(FEATURES_PATH)\n",
186
+ "FEATURES <- schema$features\n",
187
+ "TARGET <- schema$target\n",
188
+ "SPLIT_COL <- schema$split_col\n",
189
+ "ID_COL <- schema$id_col\n",
190
+ "\n",
191
+ "cat(\"Loaded dataset:\", DATASET_PATH, \"\\n\")\n",
192
+ "cat(\"Shape:\", nrow(df), \"rows x\", ncol(df), \"cols\\n\")\n",
193
+ "cat(\"Target:\", TARGET, \"\\n\")\n",
194
+ "cat(\"Split column:\", SPLIT_COL, \"\\n\")\n",
195
+ "cat(\"Features:\", paste(FEATURES, collapse = \", \"), \"\\n\\n\")\n",
196
+ "\n",
197
+ "# Basic validation\n",
198
+ "missing_cols <- setdiff(c(FEATURES, TARGET, SPLIT_COL), colnames(df))\n",
199
+ "if (length(missing_cols) > 0) {\n",
200
+ " stop(paste(\"Dataset is missing required columns:\", paste(missing_cols, collapse = \", \")))\n",
201
+ "}\n",
202
+ "\n",
203
+ "cat(\"Split distribution:\\n\")\n",
204
+ "print(table(df[[SPLIT_COL]]))\n",
205
+ "cat(\"\\nOverall cancel rate:\", mean(df[[TARGET]]), \"\\n\\n\")\n",
206
+ "\n",
207
+ "# ============================================================\n",
208
+ "# 3) Train/Test split using the fixed split column\n",
209
+ "# ============================================================\n",
210
+ "train_df <- subset(df, df[[SPLIT_COL]] == \"train\")\n",
211
+ "test_df <- subset(df, df[[SPLIT_COL]] == \"test\")\n",
212
+ "\n",
213
+ "if (nrow(train_df) == 0 || nrow(test_df) == 0) {\n",
214
+ " stop(\"Train or test split is empty. Check the 'split' column in the dataset.\")\n",
215
+ "}\n",
216
+ "\n",
217
+ "y_train <- as.integer(train_df[[TARGET]])\n",
218
+ "y_test <- as.integer(test_df[[TARGET]])\n",
219
+ "\n",
220
+ "cat(\"Train rows:\", nrow(train_df), \" | Test rows:\", nrow(test_df), \"\\n\")\n",
221
+ "cat(\"Train cancel rate:\", mean(y_train), \"\\n\")\n",
222
+ "cat(\"Test cancel rate :\", mean(y_test), \"\\n\\n\")\n",
223
+ "\n",
224
+ "# ============================================================\n",
225
+ "# 4) Build design matrices (one-hot encoding) for glmnet\n",
226
+ "# ============================================================\n",
227
+ "# We create a formula: ~ feature1 + feature2 + ...\n",
228
+ "# and use sparse.model.matrix to one-hot encode categoricals.\n",
229
+ "# IMPORTANT: Use exactly the features from features.json.\n",
230
+ "\n",
231
+ "formula_str <- paste(\"~\", paste(FEATURES, collapse = \" + \"))\n",
232
+ "f <- as.formula(formula_str)\n",
233
+ "\n",
234
+ "# glmnet expects numeric matrix; sparse.model.matrix handles factors and produces sparse matrix\n",
235
+ "# Convert characters to factors for correct one-hot encoding\n",
236
+ "train_x_df <- train_df[, FEATURES, drop = FALSE]\n",
237
+ "test_x_df <- test_df[, FEATURES, drop = FALSE]\n",
238
+ "\n",
239
+ "# Ensure consistent factor levels between train and test:\n",
240
+ "# - Convert character columns to factor using combined data levels\n",
241
+ "for (col in FEATURES) {\n",
242
+ " if (is.character(train_x_df[[col]]) || is.character(test_x_df[[col]])) {\n",
243
+ " all_levels <- unique(c(train_x_df[[col]], test_x_df[[col]]))\n",
244
+ " train_x_df[[col]] <- factor(train_x_df[[col]], levels = all_levels)\n",
245
+ " test_x_df[[col]] <- factor(test_x_df[[col]], levels = all_levels)\n",
246
+ " }\n",
247
+ "}\n",
248
+ "\n",
249
+ "X_train <- sparse.model.matrix(f, data = train_x_df)[, -1, drop = FALSE] # drop intercept column\n",
250
+ "X_test <- sparse.model.matrix(f, data = test_x_df)[, -1, drop = FALSE]\n",
251
+ "\n",
252
+ "# ============================================================\n",
253
+ "# 5) Train LASSO Logistic Regression with cross-validation\n",
254
+ "# ============================================================\n",
255
+ "set.seed(42)\n",
256
+ "\n",
257
+ "# alpha = 1 => LASSO, family = \"binomial\" => logistic regression\n",
258
+ "cv_fit <- cv.glmnet(\n",
259
+ " x = X_train,\n",
260
+ " y = y_train,\n",
261
+ " family = \"binomial\",\n",
262
+ " alpha = 1,\n",
263
+ " nfolds = 5,\n",
264
+ " type.measure = \"auc\" # optimize CV for AUC (good for imbalanced classification)\n",
265
+ ")\n",
266
+ "\n",
267
+ "best_lambda <- cv_fit$lambda.1se # more regularized (safer); alternatively lambda.min\n",
268
+ "cat(\"Chosen lambda (1se):\", best_lambda, \"\\n\\n\")\n",
269
+ "\n",
270
+ "# ============================================================\n",
271
+ "# 6) Predict + evaluate on test set\n",
272
+ "# ============================================================\n",
273
+ "# Predicted probabilities for class 1 (canceled)\n",
274
+ "proba_test <- as.numeric(predict(cv_fit, newx = X_test, s = best_lambda, type = \"response\"))\n",
275
+ "\n",
276
+ "# Convert to class labels (threshold 0.5)\n",
277
+ "pred_test <- ifelse(proba_test >= 0.5, 1L, 0L)\n",
278
+ "\n",
279
+ "# Metrics\n",
280
+ "accuracy <- mean(pred_test == y_test)\n",
281
+ "\n",
282
+ "# Precision / Recall / F1 (manual)\n",
283
+ "tp <- sum(pred_test == 1 & y_test == 1)\n",
284
+ "tn <- sum(pred_test == 0 & y_test == 0)\n",
285
+ "fp <- sum(pred_test == 1 & y_test == 0)\n",
286
+ "fn <- sum(pred_test == 0 & y_test == 1)\n",
287
+ "\n",
288
+ "precision <- ifelse((tp + fp) > 0, tp / (tp + fp), 0)\n",
289
+ "recall <- ifelse((tp + fn) > 0, tp / (tp + fn), 0)\n",
290
+ "f1 <- ifelse((precision + recall) > 0, 2 * precision * recall / (precision + recall), 0)\n",
291
+ "\n",
292
+ "# ROC AUC\n",
293
+ "roc_obj <- pROC::roc(response = y_test, predictor = proba_test, quiet = TRUE)\n",
294
+ "auc <- as.numeric(pROC::auc(roc_obj))\n",
295
+ "\n",
296
+ "conf_mat <- matrix(c(tn, fp, fn, tp), nrow = 2, byrow = TRUE)\n",
297
+ "colnames(conf_mat) <- c(\"Pred 0\", \"Pred 1\")\n",
298
+ "rownames(conf_mat) <- c(\"Actual 0\", \"Actual 1\")\n",
299
+ "\n",
300
+ "cat(\"=== Test Metrics (LASSO Logistic Regression) ===\\n\")\n",
301
+ "cat(\"Accuracy:\", round(accuracy, 4), \"\\n\")\n",
302
+ "cat(\"Precision:\", round(precision, 4), \"\\n\")\n",
303
+ "cat(\"Recall:\", round(recall, 4), \"\\n\")\n",
304
+ "cat(\"F1:\", round(f1, 4), \"\\n\")\n",
305
+ "cat(\"ROC AUC:\", round(auc, 4), \"\\n\\n\")\n",
306
+ "cat(\"Confusion Matrix:\\n\")\n",
307
+ "print(conf_mat)\n",
308
+ "\n",
309
+ "# ============================================================\n",
310
+ "# 7) Save ROC curve plot (for Hugging Face app)\n",
311
+ "# ============================================================\n",
312
+ "roc_path <- file.path(FIG_DIR, \"roc_curve.png\")\n",
313
+ "png(filename = roc_path, width = 900, height = 700)\n",
314
+ "plot(roc_obj, main = \"ROC Curve (LASSO Logistic Regression)\", col = \"blue\", lwd = 2)\n",
315
+ "abline(a = 0, b = 1, lty = 2, col = \"gray40\")\n",
316
+ "dev.off()\n",
317
+ "cat(\"✅ Saved:\", roc_path, \"\\n\")\n",
318
+ "\n",
319
+ "# ============================================================\n",
320
+ "# 8) Save metrics.json (for Hugging Face app)\n",
321
+ "# ============================================================\n",
322
+ "metrics_payload <- list(\n",
323
+ " model_name = \"glmnet_lasso_logistic\",\n",
324
+ " features_used = FEATURES,\n",
325
+ " lambda_1se = best_lambda,\n",
326
+ " n_train = nrow(train_df),\n",
327
+ " n_test = nrow(test_df),\n",
328
+ " cancel_rate_train = mean(y_train),\n",
329
+ " cancel_rate_test = mean(y_test),\n",
330
+ " accuracy = accuracy,\n",
331
+ " precision = precision,\n",
332
+ " recall = recall,\n",
333
+ " f1 = f1,\n",
334
+ " roc_auc = auc,\n",
335
+ " confusion_matrix = list(\n",
336
+ " tn = tn, fp = fp, fn = fn, tp = tp\n",
337
+ " )\n",
338
+ ")\n",
339
+ "\n",
340
+ "metrics_path <- file.path(METRICS_DIR, \"metrics.json\")\n",
341
+ "writeLines(jsonlite::toJSON(metrics_payload, pretty = TRUE, auto_unbox = TRUE), con = metrics_path)\n",
342
+ "cat(\"✅ Saved:\", metrics_path, \"\\n\")\n",
343
+ "\n",
344
+ "# ============================================================\n",
345
+ "# 9) Save test predictions table (for app display)\n",
346
+ "# ============================================================\n",
347
+ "pred_out <- test_df[, c(if (!is.null(ID_COL) && ID_COL %in% colnames(test_df)) ID_COL else NULL, FEATURES, TARGET), drop = FALSE]\n",
348
+ "pred_out$pred_label <- pred_test\n",
349
+ "pred_out$pred_proba_canceled <- proba_test\n",
350
+ "\n",
351
+ "# Sort by highest risk\n",
352
+ "pred_out <- pred_out[order(-pred_out$pred_proba_canceled), ]\n",
353
+ "\n",
354
+ "pred_path <- file.path(TABLES_DIR, \"test_predictions.csv\")\n",
355
+ "write.csv(pred_out, pred_path, row.names = FALSE)\n",
356
+ "cat(\"✅ Saved:\", pred_path, \"\\n\")\n",
357
+ "\n",
358
+ "# ============================================================\n",
359
+ "# 10) Save coefficients (feature importance for logistic regression)\n",
360
+ "# ============================================================\n",
361
+ "# Coefficients at selected lambda\n",
362
+ "coef_mat <- as.matrix(coef(cv_fit, s = best_lambda))\n",
363
+ "coef_df <- data.frame(\n",
364
+ " feature = rownames(coef_mat),\n",
365
+ " coefficient = as.numeric(coef_mat[, 1]),\n",
366
+ " stringsAsFactors = FALSE\n",
367
+ ")\n",
368
+ "\n",
369
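+ "# Remove intercept and sort by absolute magnitude (note: glmnet reports coefficients on the original feature scale, so magnitudes are not directly comparable across features with different units)\n",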
370
+ "coef_df <- subset(coef_df, feature != \"(Intercept)\")\n",
371
+ "coef_df$abs_coeff <- abs(coef_df$coefficient)\n",
372
+ "coef_df <- coef_df[order(-coef_df$abs_coeff), ]\n",
373
+ "coef_df$abs_coeff <- NULL\n",
374
+ "\n",
375
+ "coef_path <- file.path(TABLES_DIR, \"coefficients.csv\")\n",
376
+ "write.csv(coef_df, coef_path, row.names = FALSE)\n",
377
+ "cat(\"✅ Saved:\", coef_path, \"\\n\")\n",
378
+ "\n",
379
+ "# ============================================================\n",
380
+ "# 11) Save trained model (RDS) for potential inference\n",
381
+ "# ============================================================\n",
382
+ "model_path <- file.path(MODELS_DIR, \"model.rds\")\n",
383
+ "saveRDS(cv_fit, model_path)\n",
384
+ "cat(\"✅ Saved:\", model_path, \"\\n\")\n",
385
+ "\n",
386
+ "# ============================================================\n",
387
+ "# 12) Save an example input row for the app's \"Predict\" tab\n",
388
+ "# ============================================================\n",
389
+ "example_row <- as.list(train_df[1, FEATURES, drop = FALSE])\n",
390
+ "example_path <- file.path(ARTIFACTS_DIR, \"example_input.json\")\n",
391
+ "writeLines(jsonlite::toJSON(example_row, pretty = TRUE, auto_unbox = TRUE), con = example_path)\n",
392
+ "cat(\"✅ Saved:\", example_path, \"\\n\\n\")\n",
393
+ "\n",
394
+ "cat(\"All done. R artifacts saved to:\", ARTIFACTS_DIR, \"\\n\")\n",
395
+ "cat(\"Next: build app.py to run notebooks and display artifacts/py and artifacts/r side-by-side.\\n\")"
396
+ ]
397
+ }
398
+ ]
399
+ }
booking.csv ADDED
The diff for this file is too large to render. See raw diff