Spaces:
Sleeping
Sleeping
Upload 4 files
Browse files
- 1_Data_Creation.ipynb +288 -0
- 2_Python_Analysis.ipynb +396 -0
- 3_R_Analysis.ipynb +399 -0
- booking.csv +0 -0
1_Data_Creation.ipynb
ADDED
|
@@ -0,0 +1,288 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"nbformat": 4,
|
| 3 |
+
"nbformat_minor": 0,
|
| 4 |
+
"metadata": {
|
| 5 |
+
"colab": {
|
| 6 |
+
"provenance": []
|
| 7 |
+
},
|
| 8 |
+
"kernelspec": {
|
| 9 |
+
"name": "python3",
|
| 10 |
+
"display_name": "Python 3"
|
| 11 |
+
},
|
| 12 |
+
"language_info": {
|
| 13 |
+
"name": "python"
|
| 14 |
+
}
|
| 15 |
+
},
|
| 16 |
+
"cells": [
|
| 17 |
+
{
|
| 18 |
+
"cell_type": "code",
|
| 19 |
+
"execution_count": 1,
|
| 20 |
+
"metadata": {
|
| 21 |
+
"colab": {
|
| 22 |
+
"base_uri": "https://localhost:8080/"
|
| 23 |
+
},
|
| 24 |
+
"id": "OnYkKEUJVrSc",
|
| 25 |
+
"outputId": "cd7f7280-d1b1-479d-94ac-a14fb1615d28"
|
| 26 |
+
},
|
| 27 |
+
"outputs": [
|
| 28 |
+
{
|
| 29 |
+
"output_type": "stream",
|
| 30 |
+
"name": "stdout",
|
| 31 |
+
"text": [
|
| 32 |
+
"Raw shape: (36285, 17)\n",
|
| 33 |
+
"Raw columns: ['Booking_ID', 'number of adults', 'number of children', 'number of weekend nights', 'number of week nights', 'type of meal', 'car parking space', 'room type', 'lead time', 'market segment type', 'repeated', 'P-C', 'P-not-C', 'average price', 'special requests', 'date of reservation', 'booking status']\n",
|
| 34 |
+
"\n",
|
| 35 |
+
"Columns after renaming: ['booking_id', 'number_of_adults', 'number_of_children', 'number_of_weekend_nights', 'number_of_week_nights', 'type_of_meal', 'car_parking_space', 'room_type', 'lead_time', 'market_segment_type', 'repeated', 'p_c', 'p_not_c', 'average_price', 'special_requests', 'date_of_reservation', 'booking_status']\n",
|
| 36 |
+
"\n",
|
| 37 |
+
"Cleaned shape: (36285, 18)\n",
|
| 38 |
+
"Overall cancel rate: 0.3277\n",
|
| 39 |
+
"\n",
|
| 40 |
+
"Split distribution:\n",
|
| 41 |
+
"split\n",
|
| 42 |
+
"train 29028\n",
|
| 43 |
+
"test 7257\n",
|
| 44 |
+
"Name: count, dtype: int64\n",
|
| 45 |
+
"Train cancel rate: 0.3276\n",
|
| 46 |
+
"Test cancel rate : 0.3277\n",
|
| 47 |
+
"\n",
|
| 48 |
+
"✅ Saved: hotel_cancel_model_dataset.csv | shape=(36285, 11)\n",
|
| 49 |
+
"✅ Saved: train.csv, test.csv\n",
|
| 50 |
+
"✅ Saved: features.json\n",
|
| 51 |
+
"✅ Saved: dataset_meta.json\n",
|
| 52 |
+
"\n",
|
| 53 |
+
"Data preparation completed successfully.\n"
|
| 54 |
+
]
|
| 55 |
+
}
|
| 56 |
+
],
|
| 57 |
+
"source": [
|
| 58 |
+
"# ============================================================\n",
|
| 59 |
+
"# Hotel Booking Cancellation - Data Preparation Notebook\n",
|
| 60 |
+
"# ============================================================\n",
|
| 61 |
+
"# This notebook:\n",
|
| 62 |
+
"# 1) Loads booking.csv (from HF repo root)\n",
|
| 63 |
+
"# 2) Cleans and standardizes columns\n",
|
| 64 |
+
"# 3) Engineers derived features and selects EXACTLY 8 modeling features\n",
|
| 65 |
+
"# 4) Creates target variable (is_canceled)\n",
|
| 66 |
+
"# 5) Creates fixed stratified train/test split column\n",
|
| 67 |
+
"# 6) Exports dataset + metadata for Python & R notebooks\n",
|
| 68 |
+
"# ============================================================\n",
|
| 69 |
+
"\n",
|
| 70 |
+
"import json\n",
|
| 71 |
+
"import hashlib\n",
|
| 72 |
+
"from pathlib import Path\n",
|
| 73 |
+
"\n",
|
| 74 |
+
"import pandas as pd\n",
|
| 75 |
+
"import numpy as np\n",
|
| 76 |
+
"\n",
|
| 77 |
+
"# ============================================================\n",
|
| 78 |
+
"# 1) Load raw dataset (booking.csv must be in repo root)\n",
|
| 79 |
+
"# ============================================================\n",
|
| 80 |
+
"\n",
|
| 81 |
+
"BASE_DIR = Path.cwd()\n",
|
| 82 |
+
"FILE_PATH = BASE_DIR / \"booking.csv\"\n",
|
| 83 |
+
"\n",
|
| 84 |
+
"if not FILE_PATH.exists():\n",
|
| 85 |
+
" raise FileNotFoundError(\n",
|
| 86 |
+
" f\"booking.csv not found in {BASE_DIR}. \"\n",
|
| 87 |
+
" \"Make sure the file is uploaded to the Hugging Face repo root.\"\n",
|
| 88 |
+
" )\n",
|
| 89 |
+
"\n",
|
| 90 |
+
"df_raw = pd.read_csv(FILE_PATH)\n",
|
| 91 |
+
"\n",
|
| 92 |
+
"print(\"Raw shape:\", df_raw.shape)\n",
|
| 93 |
+
"print(\"Raw columns:\", df_raw.columns.tolist())\n",
|
| 94 |
+
"\n",
|
| 95 |
+
"# ============================================================\n",
|
| 96 |
+
"# 2) Standardize column names to snake_case\n",
|
| 97 |
+
"# ============================================================\n",
|
| 98 |
+
"\n",
|
| 99 |
+
"def to_snake(s: str) -> str:\n",
|
| 100 |
+
" s = str(s).strip().lower()\n",
|
| 101 |
+
" s = s.replace(\"-\", \"_\").replace(\" \", \"_\")\n",
|
| 102 |
+
" s = \"\".join([c if (c.isalnum() or c == \"_\") else \"_\" for c in s])\n",
|
| 103 |
+
" s = \"_\".join([x for x in s.split(\"_\") if x])\n",
|
| 104 |
+
" return s\n",
|
| 105 |
+
"\n",
|
| 106 |
+
"df = df_raw.copy()\n",
|
| 107 |
+
"df.columns = [to_snake(c) for c in df.columns]\n",
|
| 108 |
+
"\n",
|
| 109 |
+
"print(\"\\nColumns after renaming:\", df.columns.tolist())\n",
|
| 110 |
+
"\n",
|
| 111 |
+
"# ============================================================\n",
|
| 112 |
+
"# 3) Basic cleaning\n",
|
| 113 |
+
"# ============================================================\n",
|
| 114 |
+
"\n",
|
| 115 |
+
"# Numeric columns present in this Kaggle dataset\n",
|
| 116 |
+
"numeric_cols = [\n",
|
| 117 |
+
" \"number_of_adults\",\n",
|
| 118 |
+
" \"number_of_children\",\n",
|
| 119 |
+
" \"number_of_weekend_nights\",\n",
|
| 120 |
+
" \"number_of_week_nights\",\n",
|
| 121 |
+
" \"lead_time\",\n",
|
| 122 |
+
" \"average_price\",\n",
|
| 123 |
+
" \"special_requests\",\n",
|
| 124 |
+
" \"car_parking_space\",\n",
|
| 125 |
+
" \"repeated\",\n",
|
| 126 |
+
"]\n",
|
| 127 |
+
"\n",
|
| 128 |
+
"categorical_cols = [\n",
|
| 129 |
+
" \"market_segment_type\",\n",
|
| 130 |
+
" \"type_of_meal\",\n",
|
| 131 |
+
" \"room_type\", # not used in final model but cleaned\n",
|
| 132 |
+
"]\n",
|
| 133 |
+
"\n",
|
| 134 |
+
"# Convert numeric columns\n",
|
| 135 |
+
"for c in numeric_cols:\n",
|
| 136 |
+
" if c in df.columns:\n",
|
| 137 |
+
" df[c] = pd.to_numeric(df[c], errors=\"coerce\")\n",
|
| 138 |
+
"\n",
|
| 139 |
+
"# Clean categorical columns\n",
|
| 140 |
+
"for c in categorical_cols:\n",
|
| 141 |
+
" if c in df.columns:\n",
|
| 142 |
+
" df[c] = (\n",
|
| 143 |
+
" df[c]\n",
|
| 144 |
+
" .astype(str)\n",
|
| 145 |
+
" .str.strip()\n",
|
| 146 |
+
" .replace({\"nan\": \"unknown\", \"none\": \"unknown\", \"\": \"unknown\"})\n",
|
| 147 |
+
" .fillna(\"unknown\")\n",
|
| 148 |
+
" )\n",
|
| 149 |
+
"\n",
|
| 150 |
+
"# Create target variable\n",
|
| 151 |
+
"if \"booking_status\" not in df.columns:\n",
|
| 152 |
+
" raise ValueError(\"Expected column 'booking_status' not found.\")\n",
|
| 153 |
+
"\n",
|
| 154 |
+
"df[\"booking_status\"] = df[\"booking_status\"].astype(str).str.strip()\n",
|
| 155 |
+
"df[\"is_canceled\"] = (df[\"booking_status\"].str.lower() == \"canceled\").astype(int)\n",
|
| 156 |
+
"\n",
|
| 157 |
+
"# Handle missing numeric values using median\n",
|
| 158 |
+
"for c in numeric_cols:\n",
|
| 159 |
+
" if c in df.columns:\n",
|
| 160 |
+
" df[c] = df[c].fillna(df[c].median())\n",
|
| 161 |
+
"\n",
|
| 162 |
+
"print(\"\\nCleaned shape:\", df.shape)\n",
|
| 163 |
+
"print(\"Overall cancel rate:\", round(df[\"is_canceled\"].mean(), 4))\n",
|
| 164 |
+
"\n",
|
| 165 |
+
"# ============================================================\n",
|
| 166 |
+
"# 4) Feature Engineering (FINAL 8 FEATURES)\n",
|
| 167 |
+
"# ============================================================\n",
|
| 168 |
+
"\n",
|
| 169 |
+
"# 4.1 Total nights\n",
|
| 170 |
+
"df[\"total_nights\"] = (\n",
|
| 171 |
+
" df[\"number_of_weekend_nights\"] + df[\"number_of_week_nights\"]\n",
|
| 172 |
+
")\n",
|
| 173 |
+
"\n",
|
| 174 |
+
"# 4.2 Total guests\n",
|
| 175 |
+
"df[\"total_guests\"] = (\n",
|
| 176 |
+
" df[\"number_of_adults\"] + df[\"number_of_children\"]\n",
|
| 177 |
+
")\n",
|
| 178 |
+
"\n",
|
| 179 |
+
"# 4.3 Price per guest (avoid division by zero)\n",
|
| 180 |
+
"denom = np.where(df[\"total_guests\"] > 0, df[\"total_guests\"], 1)\n",
|
| 181 |
+
"df[\"price_per_guest\"] = df[\"average_price\"] / denom\n",
|
| 182 |
+
"\n",
|
| 183 |
+
"# ============================================================\n",
|
| 184 |
+
"# 5) Select EXACT 8 features\n",
|
| 185 |
+
"# ============================================================\n",
|
| 186 |
+
"\n",
|
| 187 |
+
"FINAL_FEATURES = [\n",
|
| 188 |
+
" \"lead_time\",\n",
|
| 189 |
+
" \"average_price\",\n",
|
| 190 |
+
" \"total_nights\",\n",
|
| 191 |
+
" \"total_guests\",\n",
|
| 192 |
+
" \"market_segment_type\",\n",
|
| 193 |
+
" \"type_of_meal\",\n",
|
| 194 |
+
" \"special_requests\",\n",
|
| 195 |
+
" \"price_per_guest\",\n",
|
| 196 |
+
"]\n",
|
| 197 |
+
"\n",
|
| 198 |
+
"missing = [c for c in FINAL_FEATURES if c not in df.columns]\n",
|
| 199 |
+
"if missing:\n",
|
| 200 |
+
" raise ValueError(f\"Missing required feature columns: {missing}\")\n",
|
| 201 |
+
"\n",
|
| 202 |
+
"id_col = \"booking_id\" if \"booking_id\" in df.columns else None\n",
|
| 203 |
+
"\n",
|
| 204 |
+
"export_cols = ([id_col] if id_col else []) + FINAL_FEATURES + [\"is_canceled\"]\n",
|
| 205 |
+
"df_model = df[export_cols].copy()\n",
|
| 206 |
+
"\n",
|
| 207 |
+
"# ============================================================\n",
|
| 208 |
+
"# 6) Create FIXED stratified train/test split\n",
|
| 209 |
+
"# ============================================================\n",
|
| 210 |
+
"\n",
|
| 211 |
+
"rng = np.random.default_rng(42)\n",
|
| 212 |
+
"df_model[\"split\"] = \"train\"\n",
|
| 213 |
+
"\n",
|
| 214 |
+
"for label in [0, 1]:\n",
|
| 215 |
+
" idx = df_model.index[df_model[\"is_canceled\"] == label].to_numpy()\n",
|
| 216 |
+
" rng.shuffle(idx)\n",
|
| 217 |
+
" test_size = int(round(0.2 * len(idx)))\n",
|
| 218 |
+
" test_idx = idx[:test_size]\n",
|
| 219 |
+
" df_model.loc[test_idx, \"split\"] = \"test\"\n",
|
| 220 |
+
"\n",
|
| 221 |
+
"print(\"\\nSplit distribution:\")\n",
|
| 222 |
+
"print(df_model[\"split\"].value_counts())\n",
|
| 223 |
+
"print(\"Train cancel rate:\", round(df_model[df_model[\"split\"] == \"train\"][\"is_canceled\"].mean(), 4))\n",
|
| 224 |
+
"print(\"Test cancel rate :\", round(df_model[df_model[\"split\"] == \"test\"][\"is_canceled\"].mean(), 4))\n",
|
| 225 |
+
"\n",
|
| 226 |
+
"# ============================================================\n",
|
| 227 |
+
"# 7) Export files for Python + R notebooks + HF app\n",
|
| 228 |
+
"# ============================================================\n",
|
| 229 |
+
"\n",
|
| 230 |
+
"OUT_DATASET = \"hotel_cancel_model_dataset.csv\"\n",
|
| 231 |
+
"df_model.to_csv(OUT_DATASET, index=False, encoding=\"utf-8\")\n",
|
| 232 |
+
"print(f\"\\n✅ Saved: {OUT_DATASET} | shape={df_model.shape}\")\n",
|
| 233 |
+
"\n",
|
| 234 |
+
"# Optional convenience splits\n",
|
| 235 |
+
"df_model[df_model[\"split\"] == \"train\"].to_csv(\"train.csv\", index=False)\n",
|
| 236 |
+
"df_model[df_model[\"split\"] == \"test\"].to_csv(\"test.csv\", index=False)\n",
|
| 237 |
+
"print(\"✅ Saved: train.csv, test.csv\")\n",
|
| 238 |
+
"\n",
|
| 239 |
+
"# ============================================================\n",
|
| 240 |
+
"# 8) Export feature schema (for Python model + app)\n",
|
| 241 |
+
"# ============================================================\n",
|
| 242 |
+
"\n",
|
| 243 |
+
"features_payload = {\n",
|
| 244 |
+
" \"id_col\": id_col,\n",
|
| 245 |
+
" \"target\": \"is_canceled\",\n",
|
| 246 |
+
" \"split_col\": \"split\",\n",
|
| 247 |
+
" \"features\": FINAL_FEATURES,\n",
|
| 248 |
+
" \"categorical_features\": [\"market_segment_type\", \"type_of_meal\"],\n",
|
| 249 |
+
" \"numeric_features\": [\n",
|
| 250 |
+
" \"lead_time\",\n",
|
| 251 |
+
" \"average_price\",\n",
|
| 252 |
+
" \"total_nights\",\n",
|
| 253 |
+
" \"total_guests\",\n",
|
| 254 |
+
" \"special_requests\",\n",
|
| 255 |
+
" \"price_per_guest\",\n",
|
| 256 |
+
" ],\n",
|
| 257 |
+
"}\n",
|
| 258 |
+
"\n",
|
| 259 |
+
"with open(\"features.json\", \"w\", encoding=\"utf-8\") as f:\n",
|
| 260 |
+
" json.dump(features_payload, f, indent=2)\n",
|
| 261 |
+
"\n",
|
| 262 |
+
"print(\"✅ Saved: features.json\")\n",
|
| 263 |
+
"\n",
|
| 264 |
+
"# ============================================================\n",
|
| 265 |
+
"# 9) Dataset fingerprint (reproducibility)\n",
|
| 266 |
+
"# ============================================================\n",
|
| 267 |
+
"\n",
|
| 268 |
+
"with open(OUT_DATASET, \"rb\") as f:\n",
|
| 269 |
+
" md5 = hashlib.md5(f.read()).hexdigest()\n",
|
| 270 |
+
"\n",
|
| 271 |
+
"dataset_meta = {\n",
|
| 272 |
+
" \"dataset_file\": OUT_DATASET,\n",
|
| 273 |
+
" \"md5\": md5,\n",
|
| 274 |
+
" \"rows\": int(df_model.shape[0]),\n",
|
| 275 |
+
" \"cols\": int(df_model.shape[1]),\n",
|
| 276 |
+
" \"cancel_rate_overall\": float(df_model[\"is_canceled\"].mean()),\n",
|
| 277 |
+
"}\n",
|
| 278 |
+
"\n",
|
| 279 |
+
"with open(\"dataset_meta.json\", \"w\", encoding=\"utf-8\") as f:\n",
|
| 280 |
+
" json.dump(dataset_meta, f, indent=2)\n",
|
| 281 |
+
"\n",
|
| 282 |
+
"print(\"✅ Saved: dataset_meta.json\")\n",
|
| 283 |
+
"\n",
|
| 284 |
+
"print(\"\\nData preparation completed successfully.\")"
|
| 285 |
+
]
|
| 286 |
+
}
|
| 287 |
+
]
|
| 288 |
+
}
|
2_Python_Analysis.ipynb
ADDED
|
@@ -0,0 +1,396 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"nbformat": 4,
|
| 3 |
+
"nbformat_minor": 0,
|
| 4 |
+
"metadata": {
|
| 5 |
+
"colab": {
|
| 6 |
+
"provenance": []
|
| 7 |
+
},
|
| 8 |
+
"kernelspec": {
|
| 9 |
+
"name": "python3",
|
| 10 |
+
"display_name": "Python 3"
|
| 11 |
+
},
|
| 12 |
+
"language_info": {
|
| 13 |
+
"name": "python"
|
| 14 |
+
}
|
| 15 |
+
},
|
| 16 |
+
"cells": [
|
| 17 |
+
{
|
| 18 |
+
"cell_type": "code",
|
| 19 |
+
"execution_count": 1,
|
| 20 |
+
"metadata": {
|
| 21 |
+
"colab": {
|
| 22 |
+
"base_uri": "https://localhost:8080/"
|
| 23 |
+
},
|
| 24 |
+
"id": "3F3JK2ZTayYg",
|
| 25 |
+
"outputId": "a724f9a3-5603-4db5-9e38-095323467abe"
|
| 26 |
+
},
|
| 27 |
+
"outputs": [
|
| 28 |
+
{
|
| 29 |
+
"output_type": "stream",
|
| 30 |
+
"name": "stdout",
|
| 31 |
+
"text": [
|
| 32 |
+
"Loaded dataset: /content/hotel_cancel_model_dataset.csv\n",
|
| 33 |
+
"Shape: (36285, 11)\n",
|
| 34 |
+
"Features: ['lead_time', 'average_price', 'total_nights', 'total_guests', 'market_segment_type', 'type_of_meal', 'special_requests', 'price_per_guest']\n",
|
| 35 |
+
"Target: is_canceled\n",
|
| 36 |
+
"Split column: split\n",
|
| 37 |
+
"\n",
|
| 38 |
+
"Split distribution:\n",
|
| 39 |
+
"split\n",
|
| 40 |
+
"train 29028\n",
|
| 41 |
+
"test 7257\n",
|
| 42 |
+
"Name: count, dtype: int64\n",
|
| 43 |
+
"Overall cancel rate: 0.3277\n",
|
| 44 |
+
"\n",
|
| 45 |
+
"Train rows: 29028 | Test rows: 7257\n",
|
| 46 |
+
"Train cancel rate: 0.3276\n",
|
| 47 |
+
"Test cancel rate : 0.3277\n",
|
| 48 |
+
"\n",
|
| 49 |
+
"=== Test Metrics (Random Forest) ===\n",
|
| 50 |
+
"Accuracy: 0.8794\n",
|
| 51 |
+
"Precision: 0.8417\n",
|
| 52 |
+
"Recall: 0.7784\n",
|
| 53 |
+
"F1: 0.8088\n",
|
| 54 |
+
"ROC AUC: 0.9382\n",
|
| 55 |
+
"\n",
|
| 56 |
+
"Confusion Matrix:\n",
|
| 57 |
+
" [[4531 348]\n",
|
| 58 |
+
" [ 527 1851]]\n",
|
| 59 |
+
"✅ Saved: /content/artifacts/py/figures/confusion_matrix.png\n",
|
| 60 |
+
"✅ Saved: /content/artifacts/py/figures/roc_curve.png\n",
|
| 61 |
+
"✅ Saved: /content/artifacts/py/metrics/metrics.json\n",
|
| 62 |
+
"✅ Saved: /content/artifacts/py/tables/test_predictions.csv\n",
|
| 63 |
+
"✅ Saved: /content/artifacts/py/tables/feature_importances.csv\n",
|
| 64 |
+
"✅ Saved: /content/artifacts/py/models/model.joblib\n",
|
| 65 |
+
"✅ Saved: /content/artifacts/py/example_input.json\n",
|
| 66 |
+
"\n",
|
| 67 |
+
"All done. Python artifacts saved to: /content/artifacts/py\n",
|
| 68 |
+
"Next: update the R notebook to use the same split + features and output artifacts/r/...\n"
|
| 69 |
+
]
|
| 70 |
+
}
|
| 71 |
+
],
|
| 72 |
+
"source": [
|
| 73 |
+
"# ============================================================\n",
|
| 74 |
+
"# Hotel Booking Cancellation - Python Model Notebook (Random Forest)\n",
|
| 75 |
+
"# ============================================================\n",
|
| 76 |
+
"# This notebook/script:\n",
|
| 77 |
+
"# 1) Loads the modeling dataset created in the Data Preparation step:\n",
|
| 78 |
+
"# - hotel_cancel_model_dataset.csv\n",
|
| 79 |
+
"# - features.json\n",
|
| 80 |
+
"# - dataset_meta.json (optional; its path is defined below but the file is not read)\n",
|
| 81 |
+
"# 2) Uses the fixed \"split\" column to train on train rows and evaluate on test rows\n",
|
| 82 |
+
"# 3) Trains a RandomForest model inside a single sklearn Pipeline:\n",
|
| 83 |
+
"# - Numeric: median imputation\n",
|
| 84 |
+
"# - Categorical: most_frequent imputation + OneHotEncoder\n",
|
| 85 |
+
"# 4) Exports Hugging Face app–ready artifacts to artifacts/py/:\n",
|
| 86 |
+
"# - model.joblib\n",
|
| 87 |
+
"# - metrics.json\n",
|
| 88 |
+
"# - confusion_matrix.png\n",
|
| 89 |
+
"# - roc_curve.png\n",
|
| 90 |
+
"# - feature_importances.csv\n",
|
| 91 |
+
"# - test_predictions.csv\n",
|
| 92 |
+
"# - example_input.json\n",
|
| 93 |
+
"# ============================================================\n",
|
| 94 |
+
"\n",
|
| 95 |
+
"import json\n",
|
| 96 |
+
"from pathlib import Path\n",
|
| 97 |
+
"\n",
|
| 98 |
+
"import numpy as np\n",
|
| 99 |
+
"import pandas as pd\n",
|
| 100 |
+
"import matplotlib.pyplot as plt\n",
|
| 101 |
+
"\n",
|
| 102 |
+
"from sklearn.compose import ColumnTransformer\n",
|
| 103 |
+
"from sklearn.impute import SimpleImputer\n",
|
| 104 |
+
"from sklearn.pipeline import Pipeline\n",
|
| 105 |
+
"from sklearn.preprocessing import OneHotEncoder\n",
|
| 106 |
+
"from sklearn.ensemble import RandomForestClassifier\n",
|
| 107 |
+
"\n",
|
| 108 |
+
"from sklearn.metrics import (\n",
|
| 109 |
+
" accuracy_score,\n",
|
| 110 |
+
" precision_score,\n",
|
| 111 |
+
" recall_score,\n",
|
| 112 |
+
" f1_score,\n",
|
| 113 |
+
" roc_auc_score,\n",
|
| 114 |
+
" roc_curve,\n",
|
| 115 |
+
" confusion_matrix,\n",
|
| 116 |
+
" classification_report\n",
|
| 117 |
+
")\n",
|
| 118 |
+
"\n",
|
| 119 |
+
"import joblib\n",
|
| 120 |
+
"\n",
|
| 121 |
+
"# ============================================================\n",
|
| 122 |
+
"# 1) Paths / Inputs\n",
|
| 123 |
+
"# ============================================================\n",
|
| 124 |
+
"\n",
|
| 125 |
+
"BASE_DIR = Path.cwd()\n",
|
| 126 |
+
"\n",
|
| 127 |
+
"DATASET_PATH = BASE_DIR / \"hotel_cancel_model_dataset.csv\"\n",
|
| 128 |
+
"FEATURES_PATH = BASE_DIR / \"features.json\"\n",
|
| 129 |
+
"DATASET_META_PATH = BASE_DIR / \"dataset_meta.json\" # optional\n",
|
| 130 |
+
"\n",
|
| 131 |
+
"if not DATASET_PATH.exists():\n",
|
| 132 |
+
" raise FileNotFoundError(f\"Missing file: {DATASET_PATH}. Run the Data Preparation step first.\")\n",
|
| 133 |
+
"if not FEATURES_PATH.exists():\n",
|
| 134 |
+
" raise FileNotFoundError(f\"Missing file: {FEATURES_PATH}. Run the Data Preparation step first.\")\n",
|
| 135 |
+
"\n",
|
| 136 |
+
"# Output folders for Hugging Face app to display\n",
|
| 137 |
+
"ARTIFACTS_DIR = BASE_DIR / \"artifacts\" / \"py\"\n",
|
| 138 |
+
"FIG_DIR = ARTIFACTS_DIR / \"figures\"\n",
|
| 139 |
+
"METRICS_DIR = ARTIFACTS_DIR / \"metrics\"\n",
|
| 140 |
+
"MODELS_DIR = ARTIFACTS_DIR / \"models\"\n",
|
| 141 |
+
"TABLES_DIR = ARTIFACTS_DIR / \"tables\"\n",
|
| 142 |
+
"\n",
|
| 143 |
+
"for d in [FIG_DIR, METRICS_DIR, MODELS_DIR, TABLES_DIR]:\n",
|
| 144 |
+
" d.mkdir(parents=True, exist_ok=True)\n",
|
| 145 |
+
"\n",
|
| 146 |
+
"# ============================================================\n",
|
| 147 |
+
"# 2) Load dataset + schema\n",
|
| 148 |
+
"# ============================================================\n",
|
| 149 |
+
"\n",
|
| 150 |
+
"df = pd.read_csv(DATASET_PATH)\n",
|
| 151 |
+
"\n",
|
| 152 |
+
"with open(FEATURES_PATH, \"r\", encoding=\"utf-8\") as f:\n",
|
| 153 |
+
" schema = json.load(f)\n",
|
| 154 |
+
"\n",
|
| 155 |
+
"FEATURES = schema[\"features\"]\n",
|
| 156 |
+
"TARGET = schema[\"target\"]\n",
|
| 157 |
+
"SPLIT_COL = schema[\"split_col\"]\n",
|
| 158 |
+
"ID_COL = schema.get(\"id_col\", None)\n",
|
| 159 |
+
"\n",
|
| 160 |
+
"cat_features = schema[\"categorical_features\"]\n",
|
| 161 |
+
"num_features = schema[\"numeric_features\"]\n",
|
| 162 |
+
"\n",
|
| 163 |
+
"print(\"Loaded dataset:\", DATASET_PATH)\n",
|
| 164 |
+
"print(\"Shape:\", df.shape)\n",
|
| 165 |
+
"print(\"Features:\", FEATURES)\n",
|
| 166 |
+
"print(\"Target:\", TARGET)\n",
|
| 167 |
+
"print(\"Split column:\", SPLIT_COL)\n",
|
| 168 |
+
"\n",
|
| 169 |
+
"# Basic validation\n",
|
| 170 |
+
"missing_cols = [c for c in FEATURES + [TARGET, SPLIT_COL] if c not in df.columns]\n",
|
| 171 |
+
"if missing_cols:\n",
|
| 172 |
+
" raise ValueError(f\"Dataset is missing required columns: {missing_cols}\")\n",
|
| 173 |
+
"\n",
|
| 174 |
+
"print(\"\\nSplit distribution:\")\n",
|
| 175 |
+
"print(df[SPLIT_COL].value_counts(dropna=False))\n",
|
| 176 |
+
"print(\"Overall cancel rate:\", round(df[TARGET].mean(), 4))\n",
|
| 177 |
+
"\n",
|
| 178 |
+
"# ============================================================\n",
|
| 179 |
+
"# 3) Train/Test split using the fixed split column\n",
|
| 180 |
+
"# ============================================================\n",
|
| 181 |
+
"\n",
|
| 182 |
+
"train_df = df[df[SPLIT_COL] == \"train\"].copy()\n",
|
| 183 |
+
"test_df = df[df[SPLIT_COL] == \"test\"].copy()\n",
|
| 184 |
+
"\n",
|
| 185 |
+
"if train_df.empty or test_df.empty:\n",
|
| 186 |
+
" raise ValueError(\"Train or test split is empty. Check the 'split' column in the dataset.\")\n",
|
| 187 |
+
"\n",
|
| 188 |
+
"X_train = train_df[FEATURES]\n",
|
| 189 |
+
"y_train = train_df[TARGET].astype(int)\n",
|
| 190 |
+
"\n",
|
| 191 |
+
"X_test = test_df[FEATURES]\n",
|
| 192 |
+
"y_test = test_df[TARGET].astype(int)\n",
|
| 193 |
+
"\n",
|
| 194 |
+
"print(\"\\nTrain rows:\", len(train_df), \"| Test rows:\", len(test_df))\n",
|
| 195 |
+
"print(\"Train cancel rate:\", round(y_train.mean(), 4))\n",
|
| 196 |
+
"print(\"Test cancel rate :\", round(y_test.mean(), 4))\n",
|
| 197 |
+
"\n",
|
| 198 |
+
"# ============================================================\n",
|
| 199 |
+
"# 4) Build preprocessing + model pipeline\n",
|
| 200 |
+
"# ============================================================\n",
|
| 201 |
+
"\n",
|
| 202 |
+
"# Numeric preprocessing: fill missing with median\n",
|
| 203 |
+
"numeric_transformer = Pipeline(steps=[\n",
|
| 204 |
+
" (\"imputer\", SimpleImputer(strategy=\"median\")),\n",
|
| 205 |
+
"])\n",
|
| 206 |
+
"\n",
|
| 207 |
+
"# Categorical preprocessing: fill missing with most frequent, then one-hot encode\n",
|
| 208 |
+
"categorical_transformer = Pipeline(steps=[\n",
|
| 209 |
+
" (\"imputer\", SimpleImputer(strategy=\"most_frequent\")),\n",
|
| 210 |
+
" (\"onehot\", OneHotEncoder(handle_unknown=\"ignore\")),\n",
|
| 211 |
+
"])\n",
|
| 212 |
+
"\n",
|
| 213 |
+
"preprocess = ColumnTransformer(\n",
|
| 214 |
+
" transformers=[\n",
|
| 215 |
+
" (\"num\", numeric_transformer, num_features),\n",
|
| 216 |
+
" (\"cat\", categorical_transformer, cat_features),\n",
|
| 217 |
+
" ],\n",
|
| 218 |
+
" remainder=\"drop\"\n",
|
| 219 |
+
")\n",
|
| 220 |
+
"\n",
|
| 221 |
+
"# Random Forest model (class_weight='balanced' reweights classes to counter class imbalance)\n",
|
| 222 |
+
"model = RandomForestClassifier(\n",
|
| 223 |
+
" n_estimators=400,\n",
|
| 224 |
+
" random_state=42,\n",
|
| 225 |
+
" class_weight=\"balanced\",\n",
|
| 226 |
+
" n_jobs=-1\n",
|
| 227 |
+
")\n",
|
| 228 |
+
"\n",
|
| 229 |
+
"clf = Pipeline(steps=[\n",
|
| 230 |
+
" (\"preprocess\", preprocess),\n",
|
| 231 |
+
" (\"model\", model),\n",
|
| 232 |
+
"])\n",
|
| 233 |
+
"\n",
|
| 234 |
+
"# ============================================================\n",
|
| 235 |
+
"# 5) Train\n",
|
| 236 |
+
"# ============================================================\n",
|
| 237 |
+
"\n",
|
| 238 |
+
"clf.fit(X_train, y_train)\n",
|
| 239 |
+
"\n",
|
| 240 |
+
"# ============================================================\n",
|
| 241 |
+
"# 6) Predict + evaluate\n",
|
| 242 |
+
"# ============================================================\n",
|
| 243 |
+
"\n",
|
| 244 |
+
"# Predicted class\n",
|
| 245 |
+
"y_pred = clf.predict(X_test)\n",
|
| 246 |
+
"\n",
|
| 247 |
+
"# Predicted probability for class \"1\" (canceled)\n",
|
| 248 |
+
"y_proba = clf.predict_proba(X_test)[:, 1]\n",
|
| 249 |
+
"\n",
|
| 250 |
+
"acc = accuracy_score(y_test, y_pred)\n",
|
| 251 |
+
"prec = precision_score(y_test, y_pred, zero_division=0)\n",
|
| 252 |
+
"rec = recall_score(y_test, y_pred, zero_division=0)\n",
|
| 253 |
+
"f1 = f1_score(y_test, y_pred, zero_division=0)\n",
|
| 254 |
+
"\n",
|
| 255 |
+
"# AUC requires probabilities\n",
|
| 256 |
+
"auc = roc_auc_score(y_test, y_proba)\n",
|
| 257 |
+
"\n",
|
| 258 |
+
"cm = confusion_matrix(y_test, y_pred)\n",
|
| 259 |
+
"report = classification_report(y_test, y_pred, output_dict=True, zero_division=0)\n",
|
| 260 |
+
"\n",
|
| 261 |
+
"print(\"\\n=== Test Metrics (Random Forest) ===\")\n",
|
| 262 |
+
"print(\"Accuracy:\", round(acc, 4))\n",
|
| 263 |
+
"print(\"Precision:\", round(prec, 4))\n",
|
| 264 |
+
"print(\"Recall:\", round(rec, 4))\n",
|
| 265 |
+
"print(\"F1:\", round(f1, 4))\n",
|
| 266 |
+
"print(\"ROC AUC:\", round(auc, 4))\n",
|
| 267 |
+
"print(\"\\nConfusion Matrix:\\n\", cm)\n",
|
| 268 |
+
"\n",
|
| 269 |
+
"# ============================================================\n",
|
| 270 |
+
"# 7) Save plots for Hugging Face app\n",
|
| 271 |
+
"# ============================================================\n",
|
| 272 |
+
"\n",
|
| 273 |
+
"# 7.1 Confusion matrix plot\n",
|
| 274 |
+
"plt.figure()\n",
|
| 275 |
+
"plt.imshow(cm)\n",
|
| 276 |
+
"plt.title(\"Confusion Matrix (Random Forest)\")\n",
|
| 277 |
+
"plt.xlabel(\"Predicted\")\n",
|
| 278 |
+
"plt.ylabel(\"Actual\")\n",
|
| 279 |
+
"plt.xticks([0, 1], [\"Not canceled (0)\", \"Canceled (1)\"])\n",
|
| 280 |
+
"plt.yticks([0, 1], [\"Not canceled (0)\", \"Canceled (1)\"])\n",
|
| 281 |
+
"\n",
|
| 282 |
+
"# Add numbers on the matrix\n",
|
| 283 |
+
"for (i, j), v in np.ndenumerate(cm):\n",
|
| 284 |
+
" plt.text(j, i, str(v), ha=\"center\", va=\"center\")\n",
|
| 285 |
+
"\n",
|
| 286 |
+
"conf_path = FIG_DIR / \"confusion_matrix.png\"\n",
|
| 287 |
+
"plt.tight_layout()\n",
|
| 288 |
+
"plt.savefig(conf_path, dpi=200)\n",
|
| 289 |
+
"plt.close()\n",
|
| 290 |
+
"print(\"✅ Saved:\", conf_path)\n",
|
| 291 |
+
"\n",
|
| 292 |
+
"# 7.2 ROC curve plot\n",
|
| 293 |
+
"fpr, tpr, thresholds = roc_curve(y_test, y_proba)\n",
|
| 294 |
+
"\n",
|
| 295 |
+
"plt.figure()\n",
|
| 296 |
+
"plt.plot(fpr, tpr)\n",
|
| 297 |
+
"plt.plot([0, 1], [0, 1], linestyle=\"--\")\n",
|
| 298 |
+
"plt.title(\"ROC Curve (Random Forest)\")\n",
|
| 299 |
+
"plt.xlabel(\"False Positive Rate\")\n",
|
| 300 |
+
"plt.ylabel(\"True Positive Rate\")\n",
|
| 301 |
+
"\n",
|
| 302 |
+
"roc_path = FIG_DIR / \"roc_curve.png\"\n",
|
| 303 |
+
"plt.tight_layout()\n",
|
| 304 |
+
"plt.savefig(roc_path, dpi=200)\n",
|
| 305 |
+
"plt.close()\n",
|
| 306 |
+
"print(\"✅ Saved:\", roc_path)\n",
|
| 307 |
+
"\n",
|
| 308 |
+
"# ============================================================\n",
|
| 309 |
+
"# 8) Save metrics + tables for Hugging Face app\n",
|
| 310 |
+
"# ============================================================\n",
|
| 311 |
+
"\n",
|
| 312 |
+
"metrics_payload = {\n",
|
| 313 |
+
" \"model_name\": \"RandomForestClassifier\",\n",
|
| 314 |
+
" \"features_used\": FEATURES,\n",
|
| 315 |
+
" \"n_train\": int(len(train_df)),\n",
|
| 316 |
+
" \"n_test\": int(len(test_df)),\n",
|
| 317 |
+
" \"cancel_rate_train\": float(y_train.mean()),\n",
|
| 318 |
+
" \"cancel_rate_test\": float(y_test.mean()),\n",
|
| 319 |
+
" \"accuracy\": float(acc),\n",
|
| 320 |
+
" \"precision\": float(prec),\n",
|
| 321 |
+
" \"recall\": float(rec),\n",
|
| 322 |
+
" \"f1\": float(f1),\n",
|
| 323 |
+
" \"roc_auc\": float(auc),\n",
|
| 324 |
+
" \"confusion_matrix\": cm.tolist(),\n",
|
| 325 |
+
" \"classification_report\": report, # full dict\n",
|
| 326 |
+
"}\n",
|
| 327 |
+
"\n",
|
| 328 |
+
"metrics_path = METRICS_DIR / \"metrics.json\"\n",
|
| 329 |
+
"with open(metrics_path, \"w\", encoding=\"utf-8\") as f:\n",
|
| 330 |
+
" json.dump(metrics_payload, f, indent=2)\n",
|
| 331 |
+
"print(\"✅ Saved:\", metrics_path)\n",
|
| 332 |
+
"\n",
|
| 333 |
+
"# Save test predictions table (so app can display top risky bookings)\n",
|
| 334 |
+
"pred_df = test_df.copy()\n",
|
| 335 |
+
"pred_df[\"pred_label\"] = y_pred\n",
|
| 336 |
+
"pred_df[\"pred_proba_canceled\"] = y_proba\n",
|
| 337 |
+
"\n",
|
| 338 |
+
"# Keep the id column if present; the row index is not exported (to_csv uses index=False)\n",
|
| 339 |
+
"cols_to_keep = []\n",
|
| 340 |
+
"if ID_COL and ID_COL in pred_df.columns:\n",
|
| 341 |
+
" cols_to_keep.append(ID_COL)\n",
|
| 342 |
+
"\n",
|
| 343 |
+
"cols_to_keep += FEATURES + [TARGET, \"pred_label\", \"pred_proba_canceled\"]\n",
|
| 344 |
+
"\n",
|
| 345 |
+
"pred_out = pred_df[cols_to_keep].sort_values(\"pred_proba_canceled\", ascending=False)\n",
|
| 346 |
+
"pred_path = TABLES_DIR / \"test_predictions.csv\"\n",
|
| 347 |
+
"pred_out.to_csv(pred_path, index=False)\n",
|
| 348 |
+
"print(\"✅ Saved:\", pred_path)\n",
|
| 349 |
+
"\n",
|
| 350 |
+
"# ============================================================\n",
|
| 351 |
+
"# 9) Feature importances (mapped back to one-hot feature names)\n",
|
| 352 |
+
"# ============================================================\n",
|
| 353 |
+
"\n",
|
| 354 |
+
"# Extract feature names after preprocessing\n",
|
| 355 |
+
"preprocessor = clf.named_steps[\"preprocess\"]\n",
|
| 356 |
+
"ohe = preprocessor.named_transformers_[\"cat\"].named_steps[\"onehot\"]\n",
|
| 357 |
+
"\n",
|
| 358 |
+
"cat_feature_names = list(ohe.get_feature_names_out(cat_features))\n",
|
| 359 |
+
"all_feature_names = num_features + cat_feature_names\n",
|
| 360 |
+
"\n",
|
| 361 |
+
"rf = clf.named_steps[\"model\"]\n",
|
| 362 |
+
"importances = rf.feature_importances_\n",
|
| 363 |
+
"\n",
|
| 364 |
+
"fi = pd.DataFrame({\n",
|
| 365 |
+
" \"feature\": all_feature_names,\n",
|
| 366 |
+
" \"importance\": importances\n",
|
| 367 |
+
"}).sort_values(\"importance\", ascending=False)\n",
|
| 368 |
+
"\n",
|
| 369 |
+
"fi_path = TABLES_DIR / \"feature_importances.csv\"\n",
|
| 370 |
+
"fi.to_csv(fi_path, index=False)\n",
|
| 371 |
+
"print(\"✅ Saved:\", fi_path)\n",
|
| 372 |
+
"\n",
|
| 373 |
+
"# ============================================================\n",
|
| 374 |
+
"# 10) Save trained model for app inference\n",
|
| 375 |
+
"# ============================================================\n",
|
| 376 |
+
"\n",
|
| 377 |
+
"model_path = MODELS_DIR / \"model.joblib\"\n",
|
| 378 |
+
"joblib.dump(clf, model_path)\n",
|
| 379 |
+
"print(\"✅ Saved:\", model_path)\n",
|
| 380 |
+
"\n",
|
| 381 |
+
"# ============================================================\n",
|
| 382 |
+
"# 11) Save an example input row for the app's \"Predict\" tab\n",
|
| 383 |
+
"# ============================================================\n",
|
| 384 |
+
"\n",
|
| 385 |
+
"example_row = train_df[FEATURES].iloc[0].to_dict()\n",
|
| 386 |
+
"example_path = ARTIFACTS_DIR / \"example_input.json\"\n",
|
| 387 |
+
"with open(example_path, \"w\", encoding=\"utf-8\") as f:\n",
|
| 388 |
+
" json.dump(example_row, f, indent=2)\n",
|
| 389 |
+
"print(\"✅ Saved:\", example_path)\n",
|
| 390 |
+
"\n",
|
| 391 |
+
"print(\"\\nAll done. Python artifacts saved to:\", ARTIFACTS_DIR)\n",
|
| 392 |
+
"print(\"Next: update the R notebook to use the same split + features and output artifacts/r/...\")"
|
| 393 |
+
]
|
| 394 |
+
}
|
| 395 |
+
]
|
| 396 |
+
}
|
3_R_Analysis.ipynb
ADDED
|
@@ -0,0 +1,399 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"nbformat": 4,
|
| 3 |
+
"nbformat_minor": 0,
|
| 4 |
+
"metadata": {
|
| 5 |
+
"colab": {
|
| 6 |
+
"provenance": []
|
| 7 |
+
},
|
| 8 |
+
"kernelspec": {
|
| 9 |
+
"name": "ir",
|
| 10 |
+
"display_name": "R"
|
| 11 |
+
},
|
| 12 |
+
"language_info": {
|
| 13 |
+
"name": "R"
|
| 14 |
+
}
|
| 15 |
+
},
|
| 16 |
+
"cells": [
|
| 17 |
+
{
|
| 18 |
+
"cell_type": "code",
|
| 19 |
+
"execution_count": 1,
|
| 20 |
+
"metadata": {
|
| 21 |
+
"id": "PjBjk0dxe_u7",
|
| 22 |
+
"colab": {
|
| 23 |
+
"base_uri": "https://localhost:8080/",
|
| 24 |
+
"height": 1000
|
| 25 |
+
},
|
| 26 |
+
"outputId": "26aa869d-f8e4-45fe-a7f5-75e58416d422"
|
| 27 |
+
},
|
| 28 |
+
"outputs": [
|
| 29 |
+
{
|
| 30 |
+
"output_type": "stream",
|
| 31 |
+
"name": "stderr",
|
| 32 |
+
"text": [
|
| 33 |
+
"Installing packages into ‘/usr/local/lib/R/site-library’\n",
|
| 34 |
+
"(as ‘lib’ is unspecified)\n",
|
| 35 |
+
"\n",
|
| 36 |
+
"also installing the dependencies ‘iterators’, ‘foreach’, ‘shape’, ‘RcppEigen’\n",
|
| 37 |
+
"\n",
|
| 38 |
+
"\n",
|
| 39 |
+
"Loading required package: Matrix\n",
|
| 40 |
+
"\n",
|
| 41 |
+
"Loaded glmnet 4.1-10\n",
|
| 42 |
+
"\n",
|
| 43 |
+
"Type 'citation(\"pROC\")' for a citation.\n",
|
| 44 |
+
"\n",
|
| 45 |
+
"\n",
|
| 46 |
+
"Attaching package: ‘pROC’\n",
|
| 47 |
+
"\n",
|
| 48 |
+
"\n",
|
| 49 |
+
"The following objects are masked from ‘package:stats’:\n",
|
| 50 |
+
"\n",
|
| 51 |
+
" cov, smooth, var\n",
|
| 52 |
+
"\n",
|
| 53 |
+
"\n"
|
| 54 |
+
]
|
| 55 |
+
},
|
| 56 |
+
{
|
| 57 |
+
"output_type": "stream",
|
| 58 |
+
"name": "stdout",
|
| 59 |
+
"text": [
|
| 60 |
+
"Loaded dataset: /content/hotel_cancel_model_dataset.csv \n",
|
| 61 |
+
"Shape: 36285 rows x 11 cols\n",
|
| 62 |
+
"Target: is_canceled \n",
|
| 63 |
+
"Split column: split \n",
|
| 64 |
+
"Features: lead_time, average_price, total_nights, total_guests, market_segment_type, type_of_meal, special_requests, price_per_guest \n",
|
| 65 |
+
"\n",
|
| 66 |
+
"Split distribution:\n",
|
| 67 |
+
"\n",
|
| 68 |
+
" test train \n",
|
| 69 |
+
" 7257 29028 \n",
|
| 70 |
+
"\n",
|
| 71 |
+
"Overall cancel rate: 0.3276561 \n",
|
| 72 |
+
"\n",
|
| 73 |
+
"Train rows: 29028 | Test rows: 7257 \n",
|
| 74 |
+
"Train cancel rate: 0.3276492 \n",
|
| 75 |
+
"Test cancel rate : 0.3276836 \n",
|
| 76 |
+
"\n",
|
| 77 |
+
"Chosen lambda (1se): 0.01835658 \n",
|
| 78 |
+
"\n",
|
| 79 |
+
"=== Test Metrics (LASSO Logistic Regression) ===\n",
|
| 80 |
+
"Accuracy: 0.7915 \n",
|
| 81 |
+
"Precision: 0.7658 \n",
|
| 82 |
+
"Recall: 0.524 \n",
|
| 83 |
+
"F1: 0.6222 \n",
|
| 84 |
+
"ROC AUC: 0.8511 \n",
|
| 85 |
+
"\n",
|
| 86 |
+
"Confusion Matrix:\n",
|
| 87 |
+
" Pred 0 Pred 1\n",
|
| 88 |
+
"Actual 0 4498 381\n",
|
| 89 |
+
"Actual 1 1132 1246\n"
|
| 90 |
+
]
|
| 91 |
+
},
|
| 92 |
+
{
|
| 93 |
+
"output_type": "display_data",
|
| 94 |
+
"data": {
|
| 95 |
+
"text/html": [
|
| 96 |
+
"<strong>agg_record_904241799:</strong> 2"
|
| 97 |
+
],
|
| 98 |
+
"text/markdown": "**agg_record_904241799:** 2",
|
| 99 |
+
"text/latex": "\\textbf{agg\\textbackslash{}\\_record\\textbackslash{}\\_904241799:} 2",
|
| 100 |
+
"text/plain": [
|
| 101 |
+
"agg_record_904241799 \n",
|
| 102 |
+
" 2 "
|
| 103 |
+
]
|
| 104 |
+
},
|
| 105 |
+
"metadata": {}
|
| 106 |
+
},
|
| 107 |
+
{
|
| 108 |
+
"output_type": "stream",
|
| 109 |
+
"name": "stdout",
|
| 110 |
+
"text": [
|
| 111 |
+
"✅ Saved: /content/artifacts/r/figures/roc_curve.png \n",
|
| 112 |
+
"✅ Saved: /content/artifacts/r/metrics/metrics.json \n",
|
| 113 |
+
"✅ Saved: /content/artifacts/r/tables/test_predictions.csv \n",
|
| 114 |
+
"✅ Saved: /content/artifacts/r/tables/coefficients.csv \n",
|
| 115 |
+
"✅ Saved: /content/artifacts/r/models/model.rds \n",
|
| 116 |
+
"✅ Saved: /content/artifacts/r/example_input.json \n",
|
| 117 |
+
"\n",
|
| 118 |
+
"All done. R artifacts saved to: /content/artifacts/r \n",
|
| 119 |
+
"Next: build app.py to run notebooks and display artifacts/py and artifacts/r side-by-side.\n"
|
| 120 |
+
]
|
| 121 |
+
}
|
| 122 |
+
],
|
| 123 |
+
"source": [
|
| 124 |
+
"# ============================================================\n",
|
| 125 |
+
"# Hotel Booking Cancellation - R Model Notebook (Logistic Regression via glmnet LASSO)\n",
|
| 126 |
+
"# ============================================================\n",
|
| 127 |
+
"# This notebook/script:\n",
|
| 128 |
+
"# 1) Loads the modeling dataset created in Data Preparation:\n",
|
| 129 |
+
"# - hotel_cancel_model_dataset.csv\n",
|
| 130 |
+
"# - features.json\n",
|
| 131 |
+
"# - dataset_meta.json (optional companion file; not read by this script)\n",
|
| 132 |
+
"# 2) Uses the fixed \"split\" column to train on train rows and evaluate on test rows\n",
|
| 133 |
+
"# 3) Trains a LASSO Logistic Regression model (glmnet, alpha=1) with CV\n",
|
| 134 |
+
"# 4) Exports Hugging Face app–ready artifacts to artifacts/r/:\n",
|
| 135 |
+
"# - model.rds\n",
|
| 136 |
+
"# - metrics.json\n",
|
| 137 |
+
"# - roc_curve.png\n",
|
| 138 |
+
"# - coefficients.csv\n",
|
| 139 |
+
"# - test_predictions.csv\n",
|
| 140 |
+
"# - example_input.json (for app testing)\n",
|
| 141 |
+
"# ============================================================\n",
|
| 142 |
+
"\n",
|
| 143 |
+
"# ============================================================\n",
|
| 144 |
+
"# 0) Libraries (install if needed)\n",
|
| 145 |
+
"# ============================================================\n",
|
| 146 |
+
"required_pkgs <- c(\"jsonlite\", \"glmnet\", \"Matrix\", \"pROC\")\n",
|
| 147 |
+
"to_install <- required_pkgs[!required_pkgs %in% rownames(installed.packages())]\n",
|
| 148 |
+
"if (length(to_install) > 0) {\n",
|
| 149 |
+
" install.packages(to_install, repos = \"https://cloud.r-project.org\")\n",
|
| 150 |
+
"}\n",
|
| 151 |
+
"\n",
|
| 152 |
+
"library(jsonlite)\n",
|
| 153 |
+
"library(glmnet)\n",
|
| 154 |
+
"library(Matrix)\n",
|
| 155 |
+
"library(pROC)\n",
|
| 156 |
+
"\n",
|
| 157 |
+
"# ============================================================\n",
|
| 158 |
+
"# 1) Paths / Inputs\n",
|
| 159 |
+
"# ============================================================\n",
|
| 160 |
+
"BASE_DIR <- getwd()\n",
|
| 161 |
+
"\n",
|
| 162 |
+
"DATASET_PATH <- file.path(BASE_DIR, \"hotel_cancel_model_dataset.csv\")\n",
|
| 163 |
+
"FEATURES_PATH <- file.path(BASE_DIR, \"features.json\")\n",
|
| 164 |
+
"\n",
|
| 165 |
+
"if (!file.exists(DATASET_PATH)) stop(paste(\"Missing file:\", DATASET_PATH, \"Run Data Preparation first.\"))\n",
|
| 166 |
+
"if (!file.exists(FEATURES_PATH)) stop(paste(\"Missing file:\", FEATURES_PATH, \"Run Data Preparation first.\"))\n",
|
| 167 |
+
"\n",
|
| 168 |
+
"# Output folders for Hugging Face app to display\n",
|
| 169 |
+
"ARTIFACTS_DIR <- file.path(BASE_DIR, \"artifacts\", \"r\")\n",
|
| 170 |
+
"FIG_DIR <- file.path(ARTIFACTS_DIR, \"figures\")\n",
|
| 171 |
+
"METRICS_DIR <- file.path(ARTIFACTS_DIR, \"metrics\")\n",
|
| 172 |
+
"MODELS_DIR <- file.path(ARTIFACTS_DIR, \"models\")\n",
|
| 173 |
+
"TABLES_DIR <- file.path(ARTIFACTS_DIR, \"tables\")\n",
|
| 174 |
+
"\n",
|
| 175 |
+
"dir.create(FIG_DIR, recursive = TRUE, showWarnings = FALSE)\n",
|
| 176 |
+
"dir.create(METRICS_DIR, recursive = TRUE, showWarnings = FALSE)\n",
|
| 177 |
+
"dir.create(MODELS_DIR, recursive = TRUE, showWarnings = FALSE)\n",
|
| 178 |
+
"dir.create(TABLES_DIR, recursive = TRUE, showWarnings = FALSE)\n",
|
| 179 |
+
"\n",
|
| 180 |
+
"# ============================================================\n",
|
| 181 |
+
"# 2) Load dataset + schema (features.json)\n",
|
| 182 |
+
"# ============================================================\n",
|
| 183 |
+
"df <- read.csv(DATASET_PATH, stringsAsFactors = FALSE)\n",
|
| 184 |
+
"\n",
|
| 185 |
+
"schema <- fromJSON(FEATURES_PATH)\n",
|
| 186 |
+
"FEATURES <- schema$features\n",
|
| 187 |
+
"TARGET <- schema$target\n",
|
| 188 |
+
"SPLIT_COL <- schema$split_col\n",
|
| 189 |
+
"ID_COL <- schema$id_col\n",
|
| 190 |
+
"\n",
|
| 191 |
+
"cat(\"Loaded dataset:\", DATASET_PATH, \"\\n\")\n",
|
| 192 |
+
"cat(\"Shape:\", nrow(df), \"rows x\", ncol(df), \"cols\\n\")\n",
|
| 193 |
+
"cat(\"Target:\", TARGET, \"\\n\")\n",
|
| 194 |
+
"cat(\"Split column:\", SPLIT_COL, \"\\n\")\n",
|
| 195 |
+
"cat(\"Features:\", paste(FEATURES, collapse = \", \"), \"\\n\\n\")\n",
|
| 196 |
+
"\n",
|
| 197 |
+
"# Basic validation\n",
|
| 198 |
+
"missing_cols <- setdiff(c(FEATURES, TARGET, SPLIT_COL), colnames(df))\n",
|
| 199 |
+
"if (length(missing_cols) > 0) {\n",
|
| 200 |
+
" stop(paste(\"Dataset is missing required columns:\", paste(missing_cols, collapse = \", \")))\n",
|
| 201 |
+
"}\n",
|
| 202 |
+
"\n",
|
| 203 |
+
"cat(\"Split distribution:\\n\")\n",
|
| 204 |
+
"print(table(df[[SPLIT_COL]]))\n",
|
| 205 |
+
"cat(\"\\nOverall cancel rate:\", mean(df[[TARGET]]), \"\\n\\n\")\n",
|
| 206 |
+
"\n",
|
| 207 |
+
"# ============================================================\n",
|
| 208 |
+
"# 3) Train/Test split using the fixed split column\n",
|
| 209 |
+
"# ============================================================\n",
|
| 210 |
+
"train_df <- subset(df, df[[SPLIT_COL]] == \"train\")\n",
|
| 211 |
+
"test_df <- subset(df, df[[SPLIT_COL]] == \"test\")\n",
|
| 212 |
+
"\n",
|
| 213 |
+
"if (nrow(train_df) == 0 || nrow(test_df) == 0) {\n",
|
| 214 |
+
" stop(\"Train or test split is empty. Check the 'split' column in the dataset.\")\n",
|
| 215 |
+
"}\n",
|
| 216 |
+
"\n",
|
| 217 |
+
"y_train <- as.integer(train_df[[TARGET]])\n",
|
| 218 |
+
"y_test <- as.integer(test_df[[TARGET]])\n",
|
| 219 |
+
"\n",
|
| 220 |
+
"cat(\"Train rows:\", nrow(train_df), \" | Test rows:\", nrow(test_df), \"\\n\")\n",
|
| 221 |
+
"cat(\"Train cancel rate:\", mean(y_train), \"\\n\")\n",
|
| 222 |
+
"cat(\"Test cancel rate :\", mean(y_test), \"\\n\\n\")\n",
|
| 223 |
+
"\n",
|
| 224 |
+
"# ============================================================\n",
|
| 225 |
+
"# 4) Build design matrices (one-hot encoding) for glmnet\n",
|
| 226 |
+
"# ============================================================\n",
|
| 227 |
+
"# We create a formula: ~ feature1 + feature2 + ...\n",
|
| 228 |
+
"# and use sparse.model.matrix to one-hot encode categoricals.\n",
|
| 229 |
+
"# IMPORTANT: Use exactly the features from features.json.\n",
|
| 230 |
+
"\n",
|
| 231 |
+
"formula_str <- paste(\"~\", paste(FEATURES, collapse = \" + \"))\n",
|
| 232 |
+
"f <- as.formula(formula_str)\n",
|
| 233 |
+
"\n",
|
| 234 |
+
"# glmnet expects numeric matrix; sparse.model.matrix handles factors and produces sparse matrix\n",
|
| 235 |
+
"# Take feature-only copies first; character columns are converted to factors in the loop below\n",
|
| 236 |
+
"train_x_df <- train_df[, FEATURES, drop = FALSE]\n",
|
| 237 |
+
"test_x_df <- test_df[, FEATURES, drop = FALSE]\n",
|
| 238 |
+
"\n",
|
| 239 |
+
"# Ensure consistent factor levels between train and test:\n",
|
| 240 |
+
"# - Convert character columns to factor using combined data levels\n",
|
| 241 |
+
"for (col in FEATURES) {\n",
|
| 242 |
+
" if (is.character(train_x_df[[col]]) || is.character(test_x_df[[col]])) {\n",
|
| 243 |
+
" all_levels <- unique(c(train_x_df[[col]], test_x_df[[col]]))\n",
|
| 244 |
+
" train_x_df[[col]] <- factor(train_x_df[[col]], levels = all_levels)\n",
|
| 245 |
+
" test_x_df[[col]] <- factor(test_x_df[[col]], levels = all_levels)\n",
|
| 246 |
+
" }\n",
|
| 247 |
+
"}\n",
|
| 248 |
+
"\n",
|
| 249 |
+
"X_train <- sparse.model.matrix(f, data = train_x_df)[, -1, drop = FALSE] # drop intercept column\n",
|
| 250 |
+
"X_test <- sparse.model.matrix(f, data = test_x_df)[, -1, drop = FALSE]\n",
|
| 251 |
+
"\n",
|
| 252 |
+
"# ============================================================\n",
|
| 253 |
+
"# 5) Train LASSO Logistic Regression with cross-validation\n",
|
| 254 |
+
"# ============================================================\n",
|
| 255 |
+
"set.seed(42)\n",
|
| 256 |
+
"\n",
|
| 257 |
+
"# alpha = 1 => LASSO, family = \"binomial\" => logistic regression\n",
|
| 258 |
+
"cv_fit <- cv.glmnet(\n",
|
| 259 |
+
" x = X_train,\n",
|
| 260 |
+
" y = y_train,\n",
|
| 261 |
+
" family = \"binomial\",\n",
|
| 262 |
+
" alpha = 1,\n",
|
| 263 |
+
" nfolds = 5,\n",
|
| 264 |
+
" type.measure = \"auc\" # optimize CV for AUC (good for imbalanced classification)\n",
|
| 265 |
+
")\n",
|
| 266 |
+
"\n",
|
| 267 |
+
"best_lambda <- cv_fit$lambda.1se # more regularized (safer); alternatively lambda.min\n",
|
| 268 |
+
"cat(\"Chosen lambda (1se):\", best_lambda, \"\\n\\n\")\n",
|
| 269 |
+
"\n",
|
| 270 |
+
"# ============================================================\n",
|
| 271 |
+
"# 6) Predict + evaluate on test set\n",
|
| 272 |
+
"# ============================================================\n",
|
| 273 |
+
"# Predicted probabilities for class 1 (canceled)\n",
|
| 274 |
+
"proba_test <- as.numeric(predict(cv_fit, newx = X_test, s = best_lambda, type = \"response\"))\n",
|
| 275 |
+
"\n",
|
| 276 |
+
"# Convert to class labels (threshold 0.5)\n",
|
| 277 |
+
"pred_test <- ifelse(proba_test >= 0.5, 1L, 0L)\n",
|
| 278 |
+
"\n",
|
| 279 |
+
"# Metrics\n",
|
| 280 |
+
"accuracy <- mean(pred_test == y_test)\n",
|
| 281 |
+
"\n",
|
| 282 |
+
"# Precision / Recall / F1 (manual)\n",
|
| 283 |
+
"tp <- sum(pred_test == 1 & y_test == 1)\n",
|
| 284 |
+
"tn <- sum(pred_test == 0 & y_test == 0)\n",
|
| 285 |
+
"fp <- sum(pred_test == 1 & y_test == 0)\n",
|
| 286 |
+
"fn <- sum(pred_test == 0 & y_test == 1)\n",
|
| 287 |
+
"\n",
|
| 288 |
+
"precision <- ifelse((tp + fp) > 0, tp / (tp + fp), 0)\n",
|
| 289 |
+
"recall <- ifelse((tp + fn) > 0, tp / (tp + fn), 0)\n",
|
| 290 |
+
"f1 <- ifelse((precision + recall) > 0, 2 * precision * recall / (precision + recall), 0)\n",
|
| 291 |
+
"\n",
|
| 292 |
+
"# ROC AUC\n",
|
| 293 |
+
"# Pin the class order (0 = control, 1 = case) and the comparison direction.\n",
"# Without these, pROC auto-selects the direction, so a model that ranks\n",
"# inversely would still report an AUC above 0.5 instead of below it.\n",
"roc_obj <- pROC::roc(\n",
" response = y_test, predictor = proba_test,\n",
" levels = c(0, 1), direction = \"<\", quiet = TRUE\n",
")\n",
|
| 294 |
+
"auc <- as.numeric(pROC::auc(roc_obj))\n",
|
| 295 |
+
"\n",
|
| 296 |
+
"conf_mat <- matrix(c(tn, fp, fn, tp), nrow = 2, byrow = TRUE)\n",
|
| 297 |
+
"colnames(conf_mat) <- c(\"Pred 0\", \"Pred 1\")\n",
|
| 298 |
+
"rownames(conf_mat) <- c(\"Actual 0\", \"Actual 1\")\n",
|
| 299 |
+
"\n",
|
| 300 |
+
"cat(\"=== Test Metrics (LASSO Logistic Regression) ===\\n\")\n",
|
| 301 |
+
"cat(\"Accuracy:\", round(accuracy, 4), \"\\n\")\n",
|
| 302 |
+
"cat(\"Precision:\", round(precision, 4), \"\\n\")\n",
|
| 303 |
+
"cat(\"Recall:\", round(recall, 4), \"\\n\")\n",
|
| 304 |
+
"cat(\"F1:\", round(f1, 4), \"\\n\")\n",
|
| 305 |
+
"cat(\"ROC AUC:\", round(auc, 4), \"\\n\\n\")\n",
|
| 306 |
+
"cat(\"Confusion Matrix:\\n\")\n",
|
| 307 |
+
"print(conf_mat)\n",
|
| 308 |
+
"\n",
|
| 309 |
+
"# ============================================================\n",
|
| 310 |
+
"# 7) Save ROC curve plot (for Hugging Face app)\n",
|
| 311 |
+
"# ============================================================\n",
|
| 312 |
+
"roc_path <- file.path(FIG_DIR, \"roc_curve.png\")\n",
|
| 313 |
+
"png(filename = roc_path, width = 900, height = 700)\n",
|
| 314 |
+
"plot(roc_obj, main = \"ROC Curve (LASSO Logistic Regression)\", col = \"blue\", lwd = 2)\n",
|
| 315 |
+
"abline(a = 0, b = 1, lty = 2, col = \"gray40\")\n",
|
| 316 |
+
"# dev.off() returns the device that becomes active after closing; at top level\n",
"# that value is auto-printed and shows up as a stray notebook output, so hide it.\n",
"invisible(dev.off())\n",
|
| 317 |
+
"cat(\"✅ Saved:\", roc_path, \"\\n\")\n",
|
| 318 |
+
"\n",
|
| 319 |
+
"# ============================================================\n",
|
| 320 |
+
"# 8) Save metrics.json (for Hugging Face app)\n",
|
| 321 |
+
"# ============================================================\n",
|
| 322 |
+
"metrics_payload <- list(\n",
|
| 323 |
+
" model_name = \"glmnet_lasso_logistic\",\n",
|
| 324 |
+
" features_used = FEATURES,\n",
|
| 325 |
+
" lambda_1se = best_lambda,\n",
|
| 326 |
+
" n_train = nrow(train_df),\n",
|
| 327 |
+
" n_test = nrow(test_df),\n",
|
| 328 |
+
" cancel_rate_train = mean(y_train),\n",
|
| 329 |
+
" cancel_rate_test = mean(y_test),\n",
|
| 330 |
+
" accuracy = accuracy,\n",
|
| 331 |
+
" precision = precision,\n",
|
| 332 |
+
" recall = recall,\n",
|
| 333 |
+
" f1 = f1,\n",
|
| 334 |
+
" roc_auc = auc,\n",
|
| 335 |
+
" confusion_matrix = list(\n",
|
| 336 |
+
" tn = tn, fp = fp, fn = fn, tp = tp\n",
|
| 337 |
+
" )\n",
|
| 338 |
+
")\n",
|
| 339 |
+
"\n",
|
| 340 |
+
"metrics_path <- file.path(METRICS_DIR, \"metrics.json\")\n",
|
| 341 |
+
"writeLines(jsonlite::toJSON(metrics_payload, pretty = TRUE, auto_unbox = TRUE), con = metrics_path)\n",
|
| 342 |
+
"cat(\"✅ Saved:\", metrics_path, \"\\n\")\n",
|
| 343 |
+
"\n",
|
| 344 |
+
"# ============================================================\n",
|
| 345 |
+
"# 9) Save test predictions table (for app display)\n",
|
| 346 |
+
"# ============================================================\n",
|
| 347 |
+
"pred_out <- test_df[, c(if (!is.null(ID_COL) && ID_COL %in% colnames(test_df)) ID_COL else NULL, FEATURES, TARGET), drop = FALSE]\n",
|
| 348 |
+
"pred_out$pred_label <- pred_test\n",
|
| 349 |
+
"pred_out$pred_proba_canceled <- proba_test\n",
|
| 350 |
+
"\n",
|
| 351 |
+
"# Sort by highest risk\n",
|
| 352 |
+
"pred_out <- pred_out[order(-pred_out$pred_proba_canceled), ]\n",
|
| 353 |
+
"\n",
|
| 354 |
+
"pred_path <- file.path(TABLES_DIR, \"test_predictions.csv\")\n",
|
| 355 |
+
"write.csv(pred_out, pred_path, row.names = FALSE)\n",
|
| 356 |
+
"cat(\"✅ Saved:\", pred_path, \"\\n\")\n",
|
| 357 |
+
"\n",
|
| 358 |
+
"# ============================================================\n",
|
| 359 |
+
"# 10) Save coefficients (feature importance for logistic regression)\n",
|
| 360 |
+
"# ============================================================\n",
|
| 361 |
+
"# Coefficients at selected lambda\n",
|
| 362 |
+
"coef_mat <- as.matrix(coef(cv_fit, s = best_lambda))\n",
|
| 363 |
+
"coef_df <- data.frame(\n",
|
| 364 |
+
" feature = rownames(coef_mat),\n",
|
| 365 |
+
" coefficient = as.numeric(coef_mat[, 1]),\n",
|
| 366 |
+
" stringsAsFactors = FALSE\n",
|
| 367 |
+
")\n",
|
| 368 |
+
"\n",
|
| 369 |
+
"# Remove intercept and sort by absolute magnitude\n",
|
| 370 |
+
"coef_df <- subset(coef_df, feature != \"(Intercept)\")\n",
|
| 371 |
+
"coef_df$abs_coeff <- abs(coef_df$coefficient)\n",
|
| 372 |
+
"coef_df <- coef_df[order(-coef_df$abs_coeff), ]\n",
|
| 373 |
+
"coef_df$abs_coeff <- NULL\n",
|
| 374 |
+
"\n",
|
| 375 |
+
"coef_path <- file.path(TABLES_DIR, \"coefficients.csv\")\n",
|
| 376 |
+
"write.csv(coef_df, coef_path, row.names = FALSE)\n",
|
| 377 |
+
"cat(\"✅ Saved:\", coef_path, \"\\n\")\n",
|
| 378 |
+
"\n",
|
| 379 |
+
"# ============================================================\n",
|
| 380 |
+
"# 11) Save trained model (RDS) for potential inference\n",
|
| 381 |
+
"# ============================================================\n",
|
| 382 |
+
"model_path <- file.path(MODELS_DIR, \"model.rds\")\n",
|
| 383 |
+
"saveRDS(cv_fit, model_path)\n",
|
| 384 |
+
"cat(\"✅ Saved:\", model_path, \"\\n\")\n",
|
| 385 |
+
"\n",
|
| 386 |
+
"# ============================================================\n",
|
| 387 |
+
"# 12) Save an example input row for the app's \"Predict\" tab\n",
|
| 388 |
+
"# ============================================================\n",
|
| 389 |
+
"example_row <- as.list(train_df[1, FEATURES, drop = FALSE])\n",
|
| 390 |
+
"example_path <- file.path(ARTIFACTS_DIR, \"example_input.json\")\n",
|
| 391 |
+
"writeLines(jsonlite::toJSON(example_row, pretty = TRUE, auto_unbox = TRUE), con = example_path)\n",
|
| 392 |
+
"cat(\"✅ Saved:\", example_path, \"\\n\\n\")\n",
|
| 393 |
+
"\n",
|
| 394 |
+
"cat(\"All done. R artifacts saved to:\", ARTIFACTS_DIR, \"\\n\")\n",
|
| 395 |
+
"cat(\"Next: build app.py to run notebooks and display artifacts/py and artifacts/r side-by-side.\\n\")"
|
| 396 |
+
]
|
| 397 |
+
}
|
| 398 |
+
]
|
| 399 |
+
}
|
booking.csv
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|