Python_Project_2

Sleeping

App Files Files Community

rohan965 commited on 22 days ago

Commit

5f338ab

verified ·

1 Parent(s): 3aabff9

Upload 4 files

Browse files

Files changed (4) hide show

1_Data_Creation.ipynb +288 -0
2_Python_Analysis.ipynb +396 -0
3_R_Analysis.ipynb +399 -0
booking.csv +0 -0

1_Data_Creation.ipynb ADDED Viewed

	@@ -0,0 +1,288 @@

+{
+  "nbformat": 4,
+  "nbformat_minor": 0,
+  "metadata": {
+    "colab": {
+      "provenance": []
+    },
+    "kernelspec": {
+      "name": "python3",
+      "display_name": "Python 3"
+    },
+    "language_info": {
+      "name": "python"
+    }
+  },
+  "cells": [
+    {
+      "cell_type": "code",
+      "execution_count": 1,
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "OnYkKEUJVrSc",
+        "outputId": "cd7f7280-d1b1-479d-94ac-a14fb1615d28"
+      },
+      "outputs": [
+        {
+          "output_type": "stream",
+          "name": "stdout",
+          "text": [
+            "Raw shape: (36285, 17)\n",
+            "Raw columns: ['Booking_ID', 'number of adults', 'number of children', 'number of weekend nights', 'number of week nights', 'type of meal', 'car parking space', 'room type', 'lead time', 'market segment type', 'repeated', 'P-C', 'P-not-C', 'average price', 'special requests', 'date of reservation', 'booking status']\n",
+            "\n",
+            "Columns after renaming: ['booking_id', 'number_of_adults', 'number_of_children', 'number_of_weekend_nights', 'number_of_week_nights', 'type_of_meal', 'car_parking_space', 'room_type', 'lead_time', 'market_segment_type', 'repeated', 'p_c', 'p_not_c', 'average_price', 'special_requests', 'date_of_reservation', 'booking_status']\n",
+            "\n",
+            "Cleaned shape: (36285, 18)\n",
+            "Overall cancel rate: 0.3277\n",
+            "\n",
+            "Split distribution:\n",
+            "split\n",
+            "train    29028\n",
+            "test      7257\n",
+            "Name: count, dtype: int64\n",
+            "Train cancel rate: 0.3276\n",
+            "Test cancel rate : 0.3277\n",
+            "\n",
+            "✅ Saved: hotel_cancel_model_dataset.csv | shape=(36285, 11)\n",
+            "✅ Saved: train.csv, test.csv\n",
+            "✅ Saved: features.json\n",
+            "✅ Saved: dataset_meta.json\n",
+            "\n",
+            "Data preparation completed successfully.\n"
+          ]
+        }
+      ],
+      "source": [
+        "# ============================================================\n",
+        "# Hotel Booking Cancellation - Data Preparation Notebook\n",
+        "# ============================================================\n",
+        "# This notebook:\n",
+        "#   1) Loads booking.csv (from HF repo root)\n",
+        "#   2) Cleans and standardizes columns\n",
+        "#   3) Engineers EXACTLY 8 modeling features\n",
+        "#   4) Creates target variable (is_canceled)\n",
+        "#   5) Creates fixed stratified train/test split column\n",
+        "#   6) Exports dataset + metadata for Python & R notebooks\n",
+        "# ============================================================\n",
+        "\n",
+        "import json\n",
+        "import hashlib\n",
+        "from pathlib import Path\n",
+        "\n",
+        "import pandas as pd\n",
+        "import numpy as np\n",
+        "\n",
+        "# ============================================================\n",
+        "# 1) Load raw dataset (booking.csv must be in repo root)\n",
+        "# ============================================================\n",
+        "\n",
+        "# Locate the raw CSV next to the notebook (current working directory).\n",
+        "BASE_DIR = Path.cwd()\n",
+        "FILE_PATH = BASE_DIR.joinpath(\"booking.csv\")\n",
+        "\n",
+        "# Stop early with an actionable message when the input file is absent.\n",
+        "if not FILE_PATH.exists():\n",
+        "    missing_file_msg = (\n",
+        "        f\"booking.csv not found in {BASE_DIR}. \"\n",
+        "        \"Make sure the file is uploaded to the Hugging Face repo root.\"\n",
+        "    )\n",
+        "    raise FileNotFoundError(missing_file_msg)\n",
+        "\n",
+        "df_raw = pd.read_csv(FILE_PATH)\n",
+        "\n",
+        "# Report the raw dimensions and header as a quick sanity check.\n",
+        "print(\"Raw shape:\", df_raw.shape)\n",
+        "print(\"Raw columns:\", list(df_raw.columns))\n",
+        "\n",
+        "# ============================================================\n",
+        "# 2) Standardize column names to snake_case\n",
+        "# ============================================================\n",
+        "\n",
+        "def to_snake(s: str) -> str:\n",
+        "    s = str(s).strip().lower()\n",
+        "    s = s.replace(\"-\", \"_\").replace(\" \", \"_\")\n",
+        "    s = \"\".join([c if (c.isalnum() or c == \"_\") else \"_\" for c in s])\n",
+        "    s = \"_\".join([x for x in s.split(\"_\") if x])\n",
+        "    return s\n",
+        "\n",
+        "# Build a renamed copy so df_raw keeps the original headers.\n",
+        "df = df_raw.rename(columns=to_snake)\n",
+        "\n",
+        "renamed_columns = list(df.columns)\n",
+        "print(\"\\nColumns after renaming:\", renamed_columns)\n",
+        "\n",
+        "# ============================================================\n",
+        "# 3) Basic cleaning\n",
+        "# ============================================================\n",
+        "\n",
+        "# Columns coerced to numbers (names as they appear after snake_case renaming)\n",
+        "numeric_cols = [\n",
+        "    \"number_of_adults\",\n",
+        "    \"number_of_children\",\n",
+        "    \"number_of_weekend_nights\",\n",
+        "    \"number_of_week_nights\",\n",
+        "    \"lead_time\",\n",
+        "    \"average_price\",\n",
+        "    \"special_requests\",\n",
+        "    \"car_parking_space\",\n",
+        "    \"repeated\",\n",
+        "]\n",
+        "\n",
+        "# Text columns that receive whitespace / missing-value cleanup\n",
+        "categorical_cols = [\n",
+        "    \"market_segment_type\",\n",
+        "    \"type_of_meal\",\n",
+        "    \"room_type\",  # not used in final model but cleaned\n",
+        "]\n",
+        "\n",
+        "# Coerce the numeric columns that are present; unparseable entries become NaN\n",
+        "numeric_present = [name for name in numeric_cols if name in df.columns]\n",
+        "for name in numeric_present:\n",
+        "    df[name] = pd.to_numeric(df[name], errors=\"coerce\")\n",
+        "\n",
+        "# Clean categorical columns\n",
+        "# Missing entries are detected both before and after the string cast: astype(str)\n",
+        "# turns NaN/None into the literal text \"nan\"/\"None\", so placeholders are matched\n",
+        "# case-insensitively instead of relying on exact lowercase spellings.\n",
+        "for c in categorical_cols:\n",
+        "    if c in df.columns:\n",
+        "        was_missing = df[c].isna()\n",
+        "        text = df[c].astype(str).str.strip()\n",
+        "        is_placeholder = text.str.lower().isin([\"nan\", \"none\", \"\"])\n",
+        "        # Real category labels keep their original casing; only gaps are relabelled.\n",
+        "        df[c] = text.mask(was_missing | is_placeholder, \"unknown\")\n",
+        "\n",
+        "# Create target variable\n",
+        "if \"booking_status\" not in df.columns:\n",
+        "    raise ValueError(\"Expected column 'booking_status' not found.\")\n",
+        "\n",
+        "# is_canceled is 1 when the trimmed, lower-cased status is exactly \"canceled\", else 0\n",
+        "status_text = df[\"booking_status\"].astype(str).str.strip()\n",
+        "df[\"booking_status\"] = status_text\n",
+        "df[\"is_canceled\"] = status_text.str.lower().eq(\"canceled\").astype(int)\n",
+        "\n",
+        "# Handle missing numeric values using each column's median\n",
+        "# NOTE(review): medians are taken over all rows, before the train/test split is assigned.\n",
+        "for c in numeric_cols:\n",
+        "    if c not in df.columns:\n",
+        "        continue\n",
+        "    column_median = df[c].median()\n",
+        "    df[c] = df[c].fillna(column_median)\n",
+        "\n",
+        "cancel_rate = df[\"is_canceled\"].mean()\n",
+        "print(\"\\nCleaned shape:\", df.shape)\n",
+        "print(\"Overall cancel rate:\", round(cancel_rate, 4))\n",
+        "\n",
+        "# ============================================================\n",
+        "# 4) Feature Engineering (FINAL 8 FEATURES)\n",
+        "# ============================================================\n",
+        "\n",
+        "# 4.1 Total nights = weekend nights + week nights\n",
+        "df[\"total_nights\"] = df[\"number_of_weekend_nights\"].add(df[\"number_of_week_nights\"])\n",
+        "\n",
+        "# 4.2 Total guests = adults + children\n",
+        "df[\"total_guests\"] = df[\"number_of_adults\"].add(df[\"number_of_children\"])\n",
+        "\n",
+        "# 4.3 Price per guest; rows without a positive guest count are divided by 1 instead\n",
+        "denom = df[\"total_guests\"].where(df[\"total_guests\"] > 0, 1)\n",
+        "df[\"price_per_guest\"] = df[\"average_price\"] / denom\n",
+        "\n",
+        "# ============================================================\n",
+        "# 5) Select EXACT 8 features\n",
+        "# ============================================================\n",
+        "\n",
+        "FINAL_FEATURES = [\n",
+        "    \"lead_time\",\n",
+        "    \"average_price\",\n",
+        "    \"total_nights\",\n",
+        "    \"total_guests\",\n",
+        "    \"market_segment_type\",\n",
+        "    \"type_of_meal\",\n",
+        "    \"special_requests\",\n",
+        "    \"price_per_guest\",\n",
+        "]\n",
+        "\n",
+        "# Every modeling feature must exist before the export frame is assembled\n",
+        "missing = [name for name in FINAL_FEATURES if name not in df.columns]\n",
+        "if len(missing) > 0:\n",
+        "    raise ValueError(f\"Missing required feature columns: {missing}\")\n",
+        "\n",
+        "# The booking id is optional: it leads the export only when the column is present\n",
+        "if \"booking_id\" in df.columns:\n",
+        "    id_col = \"booking_id\"\n",
+        "    export_cols = [id_col, *FINAL_FEATURES, \"is_canceled\"]\n",
+        "else:\n",
+        "    id_col = None\n",
+        "    export_cols = [*FINAL_FEATURES, \"is_canceled\"]\n",
+        "\n",
+        "df_model = df.loc[:, export_cols].copy()\n",
+        "\n",
+        "# ============================================================\n",
+        "# 6) Create FIXED stratified train/test split\n",
+        "# ============================================================\n",
+        "\n",
+        "rng = np.random.default_rng(42)\n",
+        "df_model[\"split\"] = \"train\"\n",
+        "\n",
+        "# Per class: shuffle that class's row labels and mark the first 20% as test\n",
+        "for label in (0, 1):\n",
+        "    label_idx = df_model.index[df_model[\"is_canceled\"].eq(label)].to_numpy()\n",
+        "    rng.shuffle(label_idx)\n",
+        "    n_test = int(round(0.2 * len(label_idx)))\n",
+        "    df_model.loc[label_idx[:n_test], \"split\"] = \"test\"\n",
+        "\n",
+        "is_train_row = df_model[\"split\"] == \"train\"\n",
+        "is_test_row = df_model[\"split\"] == \"test\"\n",
+        "\n",
+        "print(\"\\nSplit distribution:\")\n",
+        "print(df_model[\"split\"].value_counts())\n",
+        "print(\"Train cancel rate:\", round(df_model.loc[is_train_row, \"is_canceled\"].mean(), 4))\n",
+        "print(\"Test cancel rate :\", round(df_model.loc[is_test_row, \"is_canceled\"].mean(), 4))\n",
+        "\n",
+        "# ============================================================\n",
+        "# 7) Export files for Python + R notebooks + HF app\n",
+        "# ============================================================\n",
+        "\n",
+        "OUT_DATASET = \"hotel_cancel_model_dataset.csv\"\n",
+        "df_model.to_csv(OUT_DATASET, index=False, encoding=\"utf-8\")\n",
+        "print(f\"\\n✅ Saved: {OUT_DATASET} | shape={df_model.shape}\")\n",
+        "\n",
+        "# Optional convenience splits: one CSV per value of the split column\n",
+        "for split_name in (\"train\", \"test\"):\n",
+        "    split_rows = df_model.loc[df_model[\"split\"] == split_name]\n",
+        "    split_rows.to_csv(f\"{split_name}.csv\", index=False)\n",
+        "print(\"✅ Saved: train.csv, test.csv\")\n",
+        "\n",
+        "# ============================================================\n",
+        "# 8) Export feature schema (for Python model + app)\n",
+        "# ============================================================\n",
+        "\n",
+        "# Column roles consumed by the downstream notebooks\n",
+        "schema_categorical = [\"market_segment_type\", \"type_of_meal\"]\n",
+        "schema_numeric = [\n",
+        "    \"lead_time\",\n",
+        "    \"average_price\",\n",
+        "    \"total_nights\",\n",
+        "    \"total_guests\",\n",
+        "    \"special_requests\",\n",
+        "    \"price_per_guest\",\n",
+        "]\n",
+        "\n",
+        "features_payload = {\n",
+        "    \"id_col\": id_col,\n",
+        "    \"target\": \"is_canceled\",\n",
+        "    \"split_col\": \"split\",\n",
+        "    \"features\": FINAL_FEATURES,\n",
+        "    \"categorical_features\": schema_categorical,\n",
+        "    \"numeric_features\": schema_numeric,\n",
+        "}\n",
+        "\n",
+        "# Serialize first, then write the text in one step\n",
+        "features_json = json.dumps(features_payload, indent=2)\n",
+        "Path(\"features.json\").write_text(features_json, encoding=\"utf-8\")\n",
+        "\n",
+        "print(\"✅ Saved: features.json\")\n",
+        "\n",
+        "# ============================================================\n",
+        "# 9) Dataset fingerprint (reproducibility)\n",
+        "# ============================================================\n",
+        "\n",
+        "# Hash the exported CSV bytes so a changed dataset can be detected later\n",
+        "md5 = hashlib.md5(Path(OUT_DATASET).read_bytes()).hexdigest()\n",
+        "\n",
+        "n_rows, n_cols = df_model.shape\n",
+        "dataset_meta = {\n",
+        "    \"dataset_file\": OUT_DATASET,\n",
+        "    \"md5\": md5,\n",
+        "    \"rows\": int(n_rows),\n",
+        "    \"cols\": int(n_cols),\n",
+        "    \"cancel_rate_overall\": float(df_model[\"is_canceled\"].mean()),\n",
+        "}\n",
+        "\n",
+        "meta_json = json.dumps(dataset_meta, indent=2)\n",
+        "Path(\"dataset_meta.json\").write_text(meta_json, encoding=\"utf-8\")\n",
+        "\n",
+        "print(\"✅ Saved: dataset_meta.json\")\n",
+        "\n",
+        "print(\"\\nData preparation completed successfully.\")"
+      ]
+    }
+  ]
+}

2_Python_Analysis.ipynb ADDED Viewed

	@@ -0,0 +1,396 @@

+{
+  "nbformat": 4,
+  "nbformat_minor": 0,
+  "metadata": {
+    "colab": {
+      "provenance": []
+    },
+    "kernelspec": {
+      "name": "python3",
+      "display_name": "Python 3"
+    },
+    "language_info": {
+      "name": "python"
+    }
+  },
+  "cells": [
+    {
+      "cell_type": "code",
+      "execution_count": 1,
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "3F3JK2ZTayYg",
+        "outputId": "a724f9a3-5603-4db5-9e38-095323467abe"
+      },
+      "outputs": [
+        {
+          "output_type": "stream",
+          "name": "stdout",
+          "text": [
+            "Loaded dataset: /content/hotel_cancel_model_dataset.csv\n",
+            "Shape: (36285, 11)\n",
+            "Features: ['lead_time', 'average_price', 'total_nights', 'total_guests', 'market_segment_type', 'type_of_meal', 'special_requests', 'price_per_guest']\n",
+            "Target: is_canceled\n",
+            "Split column: split\n",
+            "\n",
+            "Split distribution:\n",
+            "split\n",
+            "train    29028\n",
+            "test      7257\n",
+            "Name: count, dtype: int64\n",
+            "Overall cancel rate: 0.3277\n",
+            "\n",
+            "Train rows: 29028 | Test rows: 7257\n",
+            "Train cancel rate: 0.3276\n",
+            "Test cancel rate : 0.3277\n",
+            "\n",
+            "=== Test Metrics (Random Forest) ===\n",
+            "Accuracy: 0.8794\n",
+            "Precision: 0.8417\n",
+            "Recall: 0.7784\n",
+            "F1: 0.8088\n",
+            "ROC AUC: 0.9382\n",
+            "\n",
+            "Confusion Matrix:\n",
+            " [[4531  348]\n",
+            " [ 527 1851]]\n",
+            "✅ Saved: /content/artifacts/py/figures/confusion_matrix.png\n",
+            "✅ Saved: /content/artifacts/py/figures/roc_curve.png\n",
+            "✅ Saved: /content/artifacts/py/metrics/metrics.json\n",
+            "✅ Saved: /content/artifacts/py/tables/test_predictions.csv\n",
+            "✅ Saved: /content/artifacts/py/tables/feature_importances.csv\n",
+            "✅ Saved: /content/artifacts/py/models/model.joblib\n",
+            "✅ Saved: /content/artifacts/py/example_input.json\n",
+            "\n",
+            "All done. Python artifacts saved to: /content/artifacts/py\n",
+            "Next: update the R notebook to use the same split + features and output artifacts/r/...\n"
+          ]
+        }
+      ],
+      "source": [
+        "# ============================================================\n",
+        "# Hotel Booking Cancellation - Python Model Notebook (Random Forest)\n",
+        "# ============================================================\n",
+        "# This notebook/script:\n",
+        "#   1) Loads the modeling dataset created in the Data Preparation step:\n",
+        "#        - hotel_cancel_model_dataset.csv\n",
+        "#        - features.json\n",
+        "#        - (optionally) dataset_meta.json\n",
+        "#   2) Uses the fixed \"split\" column to train on train rows and evaluate on test rows\n",
+        "#   3) Trains a RandomForest model inside a single sklearn Pipeline:\n",
+        "#        - Numeric: median imputation\n",
+        "#        - Categorical: most_frequent imputation + OneHotEncode\n",
+        "#   4) Exports Hugging Face app–ready artifacts to artifacts/py/:\n",
+        "#        - model.joblib\n",
+        "#        - metrics.json\n",
+        "#        - confusion_matrix.png\n",
+        "#        - roc_curve.png\n",
+        "#        - feature_importances.csv\n",
+        "#        - test_predictions.csv\n",
+        "#        - example_input.json\n",
+        "# ============================================================\n",
+        "\n",
+        "import json\n",
+        "from pathlib import Path\n",
+        "\n",
+        "import numpy as np\n",
+        "import pandas as pd\n",
+        "import matplotlib.pyplot as plt\n",
+        "\n",
+        "from sklearn.compose import ColumnTransformer\n",
+        "from sklearn.impute import SimpleImputer\n",
+        "from sklearn.pipeline import Pipeline\n",
+        "from sklearn.preprocessing import OneHotEncoder\n",
+        "from sklearn.ensemble import RandomForestClassifier\n",
+        "\n",
+        "from sklearn.metrics import (\n",
+        "    accuracy_score,\n",
+        "    precision_score,\n",
+        "    recall_score,\n",
+        "    f1_score,\n",
+        "    roc_auc_score,\n",
+        "    roc_curve,\n",
+        "    confusion_matrix,\n",
+        "    classification_report\n",
+        ")\n",
+        "\n",
+        "import joblib\n",
+        "\n",
+        "# ============================================================\n",
+        "# 1) Paths / Inputs\n",
+        "# ============================================================\n",
+        "\n",
+        "BASE_DIR = Path.cwd()\n",
+        "\n",
+        "DATASET_PATH = BASE_DIR.joinpath(\"hotel_cancel_model_dataset.csv\")\n",
+        "FEATURES_PATH = BASE_DIR.joinpath(\"features.json\")\n",
+        "DATASET_META_PATH = BASE_DIR.joinpath(\"dataset_meta.json\")  # optional\n",
+        "\n",
+        "# Both inputs are required; abort on the first one that is missing\n",
+        "for required_path in (DATASET_PATH, FEATURES_PATH):\n",
+        "    if not required_path.exists():\n",
+        "        raise FileNotFoundError(f\"Missing file: {required_path}. Run the Data Preparation step first.\")\n",
+        "\n",
+        "# Output folders for Hugging Face app to display\n",
+        "ARTIFACTS_DIR = BASE_DIR / \"artifacts\" / \"py\"\n",
+        "FIG_DIR, METRICS_DIR, MODELS_DIR, TABLES_DIR = (\n",
+        "    ARTIFACTS_DIR / sub_dir for sub_dir in (\"figures\", \"metrics\", \"models\", \"tables\")\n",
+        ")\n",
+        "\n",
+        "for out_dir in (FIG_DIR, METRICS_DIR, MODELS_DIR, TABLES_DIR):\n",
+        "    out_dir.mkdir(parents=True, exist_ok=True)\n",
+        "\n",
+        "# ============================================================\n",
+        "# 2) Load dataset + schema\n",
+        "# ============================================================\n",
+        "\n",
+        "df = pd.read_csv(DATASET_PATH)\n",
+        "\n",
+        "# features.json supplies the column roles used throughout this notebook\n",
+        "schema = json.loads(FEATURES_PATH.read_text(encoding=\"utf-8\"))\n",
+        "\n",
+        "FEATURES = schema[\"features\"]\n",
+        "TARGET = schema[\"target\"]\n",
+        "SPLIT_COL = schema[\"split_col\"]\n",
+        "ID_COL = schema.get(\"id_col\")\n",
+        "\n",
+        "cat_features = schema[\"categorical_features\"]\n",
+        "num_features = schema[\"numeric_features\"]\n",
+        "\n",
+        "for caption, value in (\n",
+        "    (\"Loaded dataset:\", DATASET_PATH),\n",
+        "    (\"Shape:\", df.shape),\n",
+        "    (\"Features:\", FEATURES),\n",
+        "    (\"Target:\", TARGET),\n",
+        "    (\"Split column:\", SPLIT_COL),\n",
+        "):\n",
+        "    print(caption, value)\n",
+        "\n",
+        "# Basic validation: features, target and split column must all be present\n",
+        "required_cols = [*FEATURES, TARGET, SPLIT_COL]\n",
+        "missing_cols = [name for name in required_cols if name not in df.columns]\n",
+        "if len(missing_cols) > 0:\n",
+        "    raise ValueError(f\"Dataset is missing required columns: {missing_cols}\")\n",
+        "\n",
+        "print(\"\\nSplit distribution:\")\n",
+        "print(df[SPLIT_COL].value_counts(dropna=False))\n",
+        "print(\"Overall cancel rate:\", round(df[TARGET].mean(), 4))\n",
+        "\n",
+        "# ============================================================\n",
+        "# 3) Train/Test split using the fixed split column\n",
+        "# ============================================================\n",
+        "\n",
+        "# Row masks taken from the precomputed split column\n",
+        "is_train = df[SPLIT_COL].eq(\"train\")\n",
+        "is_test = df[SPLIT_COL].eq(\"test\")\n",
+        "\n",
+        "train_df = df.loc[is_train].copy()\n",
+        "test_df = df.loc[is_test].copy()\n",
+        "\n",
+        "if train_df.empty or test_df.empty:\n",
+        "    raise ValueError(\"Train or test split is empty. Check the 'split' column in the dataset.\")\n",
+        "\n",
+        "X_train, y_train = train_df[FEATURES], train_df[TARGET].astype(int)\n",
+        "X_test, y_test = test_df[FEATURES], test_df[TARGET].astype(int)\n",
+        "\n",
+        "print(\"\\nTrain rows:\", len(train_df), \"| Test rows:\", len(test_df))\n",
+        "print(\"Train cancel rate:\", round(y_train.mean(), 4))\n",
+        "print(\"Test cancel rate :\", round(y_test.mean(), 4))\n",
+        "\n",
+        "# ============================================================\n",
+        "# 4) Build preprocessing + model pipeline\n",
+        "# ============================================================\n",
+        "\n",
+        "# Numeric preprocessing: fill missing with median\n",
+        "numeric_transformer = Pipeline([(\"imputer\", SimpleImputer(strategy=\"median\"))])\n",
+        "\n",
+        "# Categorical preprocessing: fill missing with most frequent, then one-hot encode\n",
+        "categorical_steps = [\n",
+        "    (\"imputer\", SimpleImputer(strategy=\"most_frequent\")),\n",
+        "    (\"onehot\", OneHotEncoder(handle_unknown=\"ignore\")),\n",
+        "]\n",
+        "categorical_transformer = Pipeline(categorical_steps)\n",
+        "\n",
+        "# Route each column group to its transformer; columns not listed are dropped\n",
+        "preprocess = ColumnTransformer(\n",
+        "    [\n",
+        "        (\"num\", numeric_transformer, num_features),\n",
+        "        (\"cat\", categorical_transformer, cat_features),\n",
+        "    ],\n",
+        "    remainder=\"drop\",\n",
+        ")\n",
+        "\n",
+        "# Random Forest model (balanced helps with class imbalance)\n",
+        "rf_params = {\n",
+        "    \"n_estimators\": 400,\n",
+        "    \"random_state\": 42,\n",
+        "    \"class_weight\": \"balanced\",\n",
+        "    \"n_jobs\": -1,\n",
+        "}\n",
+        "model = RandomForestClassifier(**rf_params)\n",
+        "\n",
+        "# Preprocessing and estimator live in one pipeline\n",
+        "clf = Pipeline([(\"preprocess\", preprocess), (\"model\", model)])\n",
+        "\n",
+        "# ============================================================\n",
+        "# 5) Train\n",
+        "# ============================================================\n",
+        "\n",
+        "clf.fit(X_train, y_train)\n",
+        "\n",
+        "# ============================================================\n",
+        "# 6) Predict + evaluate\n",
+        "# ============================================================\n",
+        "\n",
+        "# Hard labels, plus the second predict_proba column as the \"canceled\" score\n",
+        "y_pred = clf.predict(X_test)\n",
+        "y_proba = clf.predict_proba(X_test)[:, 1]\n",
+        "\n",
+        "# Label-based metrics\n",
+        "acc = accuracy_score(y_test, y_pred)\n",
+        "prec, rec, f1 = (\n",
+        "    scorer(y_test, y_pred, zero_division=0)\n",
+        "    for scorer in (precision_score, recall_score, f1_score)\n",
+        ")\n",
+        "\n",
+        "# AUC requires probabilities\n",
+        "auc = roc_auc_score(y_test, y_proba)\n",
+        "\n",
+        "cm = confusion_matrix(y_test, y_pred)\n",
+        "report = classification_report(y_test, y_pred, output_dict=True, zero_division=0)\n",
+        "\n",
+        "print(\"\\n=== Test Metrics (Random Forest) ===\")\n",
+        "for metric_caption, metric_value in (\n",
+        "    (\"Accuracy:\", acc),\n",
+        "    (\"Precision:\", prec),\n",
+        "    (\"Recall:\", rec),\n",
+        "    (\"F1:\", f1),\n",
+        "    (\"ROC AUC:\", auc),\n",
+        "):\n",
+        "    print(metric_caption, round(metric_value, 4))\n",
+        "print(\"\\nConfusion Matrix:\\n\", cm)\n",
+        "\n",
+        "# ============================================================\n",
+        "# 7) Save plots for Hugging Face app\n",
+        "# ============================================================\n",
+        "\n",
+        "# 7.1 Confusion matrix plot (explicit figure/axes instead of pyplot global state)\n",
+        "fig_cm, ax_cm = plt.subplots()\n",
+        "ax_cm.imshow(cm)\n",
+        "ax_cm.set_title(\"Confusion Matrix (Random Forest)\")\n",
+        "ax_cm.set_xlabel(\"Predicted\")\n",
+        "ax_cm.set_ylabel(\"Actual\")\n",
+        "class_labels = [\"Not canceled (0)\", \"Canceled (1)\"]\n",
+        "ax_cm.set_xticks([0, 1])\n",
+        "ax_cm.set_xticklabels(class_labels)\n",
+        "ax_cm.set_yticks([0, 1])\n",
+        "ax_cm.set_yticklabels(class_labels)\n",
+        "\n",
+        "# Add numbers on the matrix. With matplotlib's default colormap low counts render\n",
+        "# dark and high counts bright, so the text colour is chosen per cell to stay legible.\n",
+        "contrast_threshold = (cm.max() + cm.min()) / 2.0\n",
+        "for (i, j), v in np.ndenumerate(cm):\n",
+        "    text_color = \"black\" if v > contrast_threshold else \"white\"\n",
+        "    ax_cm.text(j, i, str(v), ha=\"center\", va=\"center\", color=text_color)\n",
+        "\n",
+        "conf_path = FIG_DIR / \"confusion_matrix.png\"\n",
+        "fig_cm.tight_layout()\n",
+        "fig_cm.savefig(conf_path, dpi=200)\n",
+        "plt.close(fig_cm)  # release the figure once it is on disk\n",
+        "print(\"✅ Saved:\", conf_path)\n",
+        "\n",
+        "# 7.2 ROC curve plot\n",
+        "fpr, tpr, thresholds = roc_curve(y_test, y_proba)\n",
+        "\n",
+        "fig_roc, ax_roc = plt.subplots()\n",
+        "ax_roc.plot(fpr, tpr)\n",
+        "# Dashed diagonal as the chance-level reference line\n",
+        "ax_roc.plot([0, 1], [0, 1], linestyle=\"--\")\n",
+        "ax_roc.set_title(\"ROC Curve (Random Forest)\")\n",
+        "ax_roc.set_xlabel(\"False Positive Rate\")\n",
+        "ax_roc.set_ylabel(\"True Positive Rate\")\n",
+        "\n",
+        "roc_path = FIG_DIR / \"roc_curve.png\"\n",
+        "fig_roc.tight_layout()\n",
+        "fig_roc.savefig(roc_path, dpi=200)\n",
+        "plt.close(fig_roc)\n",
+        "print(\"✅ Saved:\", roc_path)\n",
+        "\n",
+        "# ============================================================\n",
+        "# 8) Save metrics + tables for Hugging Face app\n",
+        "# ============================================================\n",
+        "\n",
+        "# Scalars are cast to built-in types so the payload is JSON-serializable\n",
+        "metrics_payload = {\n",
+        "    \"model_name\": \"RandomForestClassifier\",\n",
+        "    \"features_used\": FEATURES,\n",
+        "    \"n_train\": int(len(train_df)),\n",
+        "    \"n_test\": int(len(test_df)),\n",
+        "    \"cancel_rate_train\": float(y_train.mean()),\n",
+        "    \"cancel_rate_test\": float(y_test.mean()),\n",
+        "    \"accuracy\": float(acc),\n",
+        "    \"precision\": float(prec),\n",
+        "    \"recall\": float(rec),\n",
+        "    \"f1\": float(f1),\n",
+        "    \"roc_auc\": float(auc),\n",
+        "    \"confusion_matrix\": cm.tolist(),\n",
+        "    \"classification_report\": report,  # full dict\n",
+        "}\n",
+        "\n",
+        "metrics_path = METRICS_DIR / \"metrics.json\"\n",
+        "metrics_path.write_text(json.dumps(metrics_payload, indent=2), encoding=\"utf-8\")\n",
+        "print(\"✅ Saved:\", metrics_path)\n",
+        "\n",
+        "# Save test predictions table (so app can display top risky bookings)\n",
+        "pred_df = test_df.assign(pred_label=y_pred, pred_proba_canceled=y_proba)\n",
+        "\n",
+        "# Lead with the id column when the schema names one that exists in the table\n",
+        "has_id = bool(ID_COL) and ID_COL in pred_df.columns\n",
+        "cols_to_keep = [ID_COL] if has_id else []\n",
+        "cols_to_keep.extend(FEATURES)\n",
+        "cols_to_keep.extend([TARGET, \"pred_label\", \"pred_proba_canceled\"])\n",
+        "\n",
+        "# Highest predicted cancellation probability first\n",
+        "pred_out = pred_df[cols_to_keep].sort_values(\"pred_proba_canceled\", ascending=False)\n",
+        "pred_path = TABLES_DIR / \"test_predictions.csv\"\n",
+        "pred_out.to_csv(pred_path, index=False)\n",
+        "print(\"✅ Saved:\", pred_path)\n",
+        "\n",
+        "# ============================================================\n",
+        "# 9) Feature importances (mapped back to one-hot feature names)\n",
+        "# ============================================================\n",
+        "\n",
+        "# Pull the fitted one-hot encoder out of the pipeline to recover its output names\n",
+        "preprocessor = clf.named_steps[\"preprocess\"]\n",
+        "cat_pipeline = preprocessor.named_transformers_[\"cat\"]\n",
+        "ohe = cat_pipeline.named_steps[\"onehot\"]\n",
+        "\n",
+        "# Names follow the ColumnTransformer layout above: numeric block, then one-hot block\n",
+        "cat_feature_names = list(ohe.get_feature_names_out(cat_features))\n",
+        "all_feature_names = [*num_features, *cat_feature_names]\n",
+        "\n",
+        "rf = clf.named_steps[\"model\"]\n",
+        "importances = rf.feature_importances_\n",
+        "\n",
+        "fi_unsorted = pd.DataFrame({\"feature\": all_feature_names, \"importance\": importances})\n",
+        "fi = fi_unsorted.sort_values(\"importance\", ascending=False)\n",
+        "\n",
+        "fi_path = TABLES_DIR / \"feature_importances.csv\"\n",
+        "fi.to_csv(fi_path, index=False)\n",
+        "print(\"✅ Saved:\", fi_path)\n",
+        "\n",
+        "# ============================================================\n",
+        "# 10) Save trained model for app inference\n",
+        "# ============================================================\n",
+        "\n",
+        "# Persist the whole fitted pipeline (preprocessing + model) in one file\n",
+        "model_path = MODELS_DIR / \"model.joblib\"\n",
+        "joblib.dump(clf, model_path)\n",
+        "print(\"✅ Saved:\", model_path)\n",
+        "\n",
+        "# ============================================================\n",
+        "# 11) Save an example input row for the app's \"Predict\" tab\n",
+        "# ============================================================\n",
+        "\n",
+        "# First training row, feature columns only, as a plain dict\n",
+        "example_row = train_df.loc[:, FEATURES].iloc[0].to_dict()\n",
+        "example_path = ARTIFACTS_DIR / \"example_input.json\"\n",
+        "example_path.write_text(json.dumps(example_row, indent=2), encoding=\"utf-8\")\n",
+        "print(\"✅ Saved:\", example_path)\n",
+        "\n",
+        "print(\"\\nAll done. Python artifacts saved to:\", ARTIFACTS_DIR)\n",
+        "print(\"Next: update the R notebook to use the same split + features and output artifacts/r/...\")"
+      ]
+    }
+  ]
+}

3_R_Analysis.ipynb ADDED Viewed

	@@ -0,0 +1,399 @@

+{
+  "nbformat": 4,
+  "nbformat_minor": 0,
+  "metadata": {
+    "colab": {
+      "provenance": []
+    },
+    "kernelspec": {
+      "name": "ir",
+      "display_name": "R"
+    },
+    "language_info": {
+      "name": "R"
+    }
+  },
+  "cells": [
+    {
+      "cell_type": "code",
+      "execution_count": 1,
+      "metadata": {
+        "id": "PjBjk0dxe_u7",
+        "colab": {
+          "base_uri": "https://localhost:8080/",
+          "height": 1000
+        },
+        "outputId": "26aa869d-f8e4-45fe-a7f5-75e58416d422"
+      },
+      "outputs": [
+        {
+          "output_type": "stream",
+          "name": "stderr",
+          "text": [
+            "Installing packages into ‘/usr/local/lib/R/site-library’\n",
+            "(as ‘lib’ is unspecified)\n",
+            "\n",
+            "also installing the dependencies ‘iterators’, ‘foreach’, ‘shape’, ‘RcppEigen’\n",
+            "\n",
+            "\n",
+            "Loading required package: Matrix\n",
+            "\n",
+            "Loaded glmnet 4.1-10\n",
+            "\n",
+            "Type 'citation(\"pROC\")' for a citation.\n",
+            "\n",
+            "\n",
+            "Attaching package: ‘pROC’\n",
+            "\n",
+            "\n",
+            "The following objects are masked from ‘package:stats’:\n",
+            "\n",
+            "    cov, smooth, var\n",
+            "\n",
+            "\n"
+          ]
+        },
+        {
+          "output_type": "stream",
+          "name": "stdout",
+          "text": [
+            "Loaded dataset: /content/hotel_cancel_model_dataset.csv \n",
+            "Shape: 36285 rows x 11 cols\n",
+            "Target: is_canceled \n",
+            "Split column: split \n",
+            "Features: lead_time, average_price, total_nights, total_guests, market_segment_type, type_of_meal, special_requests, price_per_guest \n",
+            "\n",
+            "Split distribution:\n",
+            "\n",
+            " test train \n",
+            " 7257 29028 \n",
+            "\n",
+            "Overall cancel rate: 0.3276561 \n",
+            "\n",
+            "Train rows: 29028  | Test rows: 7257 \n",
+            "Train cancel rate: 0.3276492 \n",
+            "Test cancel rate : 0.3276836 \n",
+            "\n",
+            "Chosen lambda (1se): 0.01835658 \n",
+            "\n",
+            "=== Test Metrics (LASSO Logistic Regression) ===\n",
+            "Accuracy: 0.7915 \n",
+            "Precision: 0.7658 \n",
+            "Recall: 0.524 \n",
+            "F1: 0.6222 \n",
+            "ROC AUC: 0.8511 \n",
+            "\n",
+            "Confusion Matrix:\n",
+            "         Pred 0 Pred 1\n",
+            "Actual 0   4498    381\n",
+            "Actual 1   1132   1246\n"
+          ]
+        },
+        {
+          "output_type": "display_data",
+          "data": {
+            "text/html": [
+              "<strong>agg_record_904241799:</strong> 2"
+            ],
+            "text/markdown": "**agg_record_904241799:** 2",
+            "text/latex": "\\textbf{agg\\textbackslash{}\\_record\\textbackslash{}\\_904241799:} 2",
+            "text/plain": [
+              "agg_record_904241799 \n",
+              "                   2 "
+            ]
+          },
+          "metadata": {}
+        },
+        {
+          "output_type": "stream",
+          "name": "stdout",
+          "text": [
+            "✅ Saved: /content/artifacts/r/figures/roc_curve.png \n",
+            "✅ Saved: /content/artifacts/r/metrics/metrics.json \n",
+            "✅ Saved: /content/artifacts/r/tables/test_predictions.csv \n",
+            "✅ Saved: /content/artifacts/r/tables/coefficients.csv \n",
+            "✅ Saved: /content/artifacts/r/models/model.rds \n",
+            "✅ Saved: /content/artifacts/r/example_input.json \n",
+            "\n",
+            "All done. R artifacts saved to: /content/artifacts/r \n",
+            "Next: build app.py to run notebooks and display artifacts/py and artifacts/r side-by-side.\n"
+          ]
+        }
+      ],
+      "source": [
+        "# ============================================================\n",
+        "# Hotel Booking Cancellation - R Model Notebook (Logistic Regression via glmnet LASSO)\n",
+        "# ============================================================\n",
+        "# This notebook/script:\n",
+        "#   1) Loads the modeling dataset created in Data Preparation:\n",
+        "#        - hotel_cancel_model_dataset.csv\n",
+        "#        - features.json\n",
+        "#        - (optionally) dataset_meta.json\n",
+        "#   2) Uses the fixed \"split\" column to train on train rows and evaluate on test rows\n",
+        "#   3) Trains a LASSO Logistic Regression model (glmnet, alpha=1) with CV\n",
+        "#   4) Exports Hugging Face app–ready artifacts to artifacts/r/:\n",
+        "#        - model.rds\n",
+        "#        - metrics.json\n",
+        "#        - roc_curve.png\n",
+        "#        - coefficients.csv\n",
+        "#        - test_predictions.csv\n",
+        "#        - example_input.json (for app testing)\n",
+        "# ============================================================\n",
+        "\n",
+        "# ============================================================\n",
+        "# 0) Libraries (install if needed)\n",
+        "# ============================================================\n",
+        "# Everything this script attaches; whatever is absent from the local\n",
+        "# library is installed from CRAN before the library() calls run.\n",
+        "required_pkgs <- c(\"jsonlite\", \"glmnet\", \"Matrix\", \"pROC\")\n",
+        "to_install <- setdiff(required_pkgs, rownames(installed.packages()))\n",
+        "if (length(to_install)) {\n",
+        "  install.packages(pkgs = to_install, repos = \"https://cloud.r-project.org\")\n",
+        "}\n",
+        "\n",
+        "library(jsonlite)  # fromJSON / toJSON\n",
+        "library(glmnet)    # cv.glmnet\n",
+        "library(Matrix)    # sparse.model.matrix\n",
+        "library(pROC)      # roc / auc\n",
+        "\n",
+        "# ============================================================\n",
+        "# 1) Paths / Inputs\n",
+        "# ============================================================\n",
+        "# Every path below is resolved against the working directory at run time.\n",
+        "BASE_DIR <- getwd()\n",
+        "\n",
+        "DATASET_PATH <- file.path(BASE_DIR, \"hotel_cancel_model_dataset.csv\")\n",
+        "FEATURES_PATH <- file.path(BASE_DIR, \"features.json\")\n",
+        "\n",
+        "# Stop immediately when either input file is absent (dataset is checked first)\n",
+        "for (input_path in c(DATASET_PATH, FEATURES_PATH)) {\n",
+        "  if (!file.exists(input_path)) {\n",
+        "    stop(paste(\"Missing file:\", input_path, \"Run Data Preparation first.\"))\n",
+        "  }\n",
+        "}\n",
+        "\n",
+        "# Output folders for Hugging Face app to display\n",
+        "ARTIFACTS_DIR <- file.path(BASE_DIR, \"artifacts\", \"r\")\n",
+        "FIG_DIR <- file.path(ARTIFACTS_DIR, \"figures\")\n",
+        "METRICS_DIR <- file.path(ARTIFACTS_DIR, \"metrics\")\n",
+        "MODELS_DIR <- file.path(ARTIFACTS_DIR, \"models\")\n",
+        "TABLES_DIR <- file.path(ARTIFACTS_DIR, \"tables\")\n",
+        "\n",
+        "# recursive = TRUE also creates ARTIFACTS_DIR itself; existing folders are left alone silently\n",
+        "for (out_dir in c(FIG_DIR, METRICS_DIR, MODELS_DIR, TABLES_DIR)) {\n",
+        "  dir.create(out_dir, recursive = TRUE, showWarnings = FALSE)\n",
+        "}\n",
+        "\n",
+        "# ============================================================\n",
+        "# 2) Load dataset + schema (features.json)\n",
+        "# ============================================================\n",
+        "# Reads the modeling CSV and the schema that names the feature, target,\n",
+        "# split and (optional) id columns, then validates both before any modeling.\n",
+        "df <- read.csv(DATASET_PATH, stringsAsFactors = FALSE)\n",
+        "\n",
+        "schema <- fromJSON(FEATURES_PATH)\n",
+        "FEATURES <- schema$features\n",
+        "TARGET <- schema$target\n",
+        "SPLIT_COL <- schema$split_col\n",
+        "ID_COL <- schema$id_col  # optional: stays NULL when features.json has no id_col\n",
+        "\n",
+        "# A key that is absent (or null/empty) in features.json yields a zero-length value here.\n",
+        "# It would add nothing to the column check further down and so pass it unnoticed,\n",
+        "# so fail now with a message that names the missing key(s).\n",
+        "schema_values <- list(features = FEATURES, target = TARGET, split_col = SPLIT_COL)\n",
+        "missing_keys <- names(schema_values)[lengths(schema_values) == 0]\n",
+        "if (length(missing_keys) > 0) {\n",
+        "  stop(paste(\"features.json is missing required keys:\", paste(missing_keys, collapse = \", \")))\n",
+        "}\n",
+        "\n",
+        "cat(\"Loaded dataset:\", DATASET_PATH, \"\\n\")\n",
+        "cat(\"Shape:\", nrow(df), \"rows x\", ncol(df), \"cols\\n\")\n",
+        "cat(\"Target:\", TARGET, \"\\n\")\n",
+        "cat(\"Split column:\", SPLIT_COL, \"\\n\")\n",
+        "cat(\"Features:\", paste(FEATURES, collapse = \", \"), \"\\n\\n\")\n",
+        "\n",
+        "# Basic validation: every column named by the schema must exist in the CSV\n",
+        "missing_cols <- setdiff(c(FEATURES, TARGET, SPLIT_COL), colnames(df))\n",
+        "if (length(missing_cols) > 0) {\n",
+        "  stop(paste(\"Dataset is missing required columns:\", paste(missing_cols, collapse = \", \")))\n",
+        "}\n",
+        "\n",
+        "cat(\"Split distribution:\\n\")\n",
+        "print(table(df[[SPLIT_COL]]))\n",
+        "cat(\"\\nOverall cancel rate:\", mean(df[[TARGET]]), \"\\n\\n\")\n",
+        "\n",
+        "# ============================================================\n",
+        "# 3) Train/Test split using the fixed split column\n",
+        "# ============================================================\n",
+        "# Rows are assigned purely by their precomputed split label; nothing is re-sampled here.\n",
+        "# which() skips rows whose label is NA, as subset() would.\n",
+        "split_labels <- df[[SPLIT_COL]]\n",
+        "train_df <- df[which(split_labels == \"train\"), , drop = FALSE]\n",
+        "test_df  <- df[which(split_labels == \"test\"), , drop = FALSE]\n",
+        "\n",
+        "if (min(nrow(train_df), nrow(test_df)) == 0) {\n",
+        "  stop(\"Train or test split is empty. Check the 'split' column in the dataset.\")\n",
+        "}\n",
+        "\n",
+        "# Labels as integer vectors for glmnet and the metric calculations\n",
+        "y_train <- as.integer(train_df[[TARGET]])\n",
+        "y_test  <- as.integer(test_df[[TARGET]])\n",
+        "\n",
+        "cat(\"Train rows:\", nrow(train_df), \" | Test rows:\", nrow(test_df), \"\\n\")\n",
+        "cat(\"Train cancel rate:\", mean(y_train), \"\\n\")\n",
+        "cat(\"Test cancel rate :\", mean(y_test), \"\\n\\n\")\n",
+        "\n",
+        "# ============================================================\n",
+        "# 4) Build design matrices (one-hot encoding) for glmnet\n",
+        "# ============================================================\n",
+        "# We create a formula: ~ feature1 + feature2 + ...\n",
+        "# and use sparse.model.matrix to one-hot encode categoricals.\n",
+        "# IMPORTANT: Use exactly the features from features.json.\n",
+        "\n",
+        "formula_str <- paste(\"~\", paste(FEATURES, collapse = \" + \"))\n",
+        "f <- as.formula(formula_str)\n",
+        "\n",
+        "# glmnet expects numeric matrix; sparse.model.matrix handles factors and produces sparse matrix\n",
+        "# Convert characters to factors for correct one-hot encoding\n",
+        "train_x_df <- train_df[, FEATURES, drop = FALSE]\n",
+        "test_x_df  <- test_df[, FEATURES, drop = FALSE]\n",
+        "\n",
+        "# Ensure consistent factor levels between train and test:\n",
+        "# - Convert character columns to factor using combined data levels\n",
+        "#   (so both matrices end up with the same dummy columns in the same order)\n",
+        "for (col in FEATURES) {\n",
+        "  if (is.character(train_x_df[[col]]) || is.character(test_x_df[[col]])) {\n",
+        "    all_levels <- unique(c(train_x_df[[col]], test_x_df[[col]]))\n",
+        "    train_x_df[[col]] <- factor(train_x_df[[col]], levels = all_levels)\n",
+        "    test_x_df[[col]]  <- factor(test_x_df[[col]],  levels = all_levels)\n",
+        "  }\n",
+        "}\n",
+        "\n",
+        "X_train <- sparse.model.matrix(f, data = train_x_df)[, -1, drop = FALSE]  # drop intercept column\n",
+        "X_test  <- sparse.model.matrix(f, data = test_x_df)[, -1, drop = FALSE]\n",
+        "\n",
+        "# Under R's default na.action, rows with a missing feature value are dropped while the\n",
+        "# model frame is built. That would leave the matrices shorter than y_train / y_test and\n",
+        "# misalign predictions with labels, so stop here instead of producing wrong metrics.\n",
+        "if (nrow(X_train) != length(y_train) || nrow(X_test) != length(y_test)) {\n",
+        "  stop(\"Design matrix rows do not match the labels. Check the feature columns for missing values.\")\n",
+        "}\n",
+        "\n",
+        "# ============================================================\n",
+        "# 5) Train LASSO Logistic Regression with cross-validation\n",
+        "# ============================================================\n",
+        "# Seed the RNG right before the cross-validated fit\n",
+        "set.seed(42)\n",
+        "\n",
+        "# family = \"binomial\" => logistic regression, alpha = 1 => LASSO\n",
+        "cv_fit <- cv.glmnet(\n",
+        "  X_train,\n",
+        "  y_train,\n",
+        "  alpha = 1,\n",
+        "  family = \"binomial\",\n",
+        "  type.measure = \"auc\",  # optimize CV for AUC (good for imbalanced classification)\n",
+        "  nfolds = 5\n",
+        ")\n",
+        "\n",
+        "# lambda.1se is the more regularized (safer) choice; lambda.min is the alternative\n",
+        "best_lambda <- cv_fit[[\"lambda.1se\"]]\n",
+        "cat(\"Chosen lambda (1se):\", best_lambda, \"\\n\\n\")\n",
+        "\n",
+        "# ============================================================\n",
+        "# 6) Predict + evaluate on test set\n",
+        "# ============================================================\n",
+        "# Predicted probabilities for class 1 (canceled)\n",
+        "proba_test <- as.numeric(predict(cv_fit, newx = X_test, s = best_lambda, type = \"response\"))\n",
+        "\n",
+        "# Convert to class labels (threshold 0.5)\n",
+        "pred_test <- ifelse(proba_test >= 0.5, 1L, 0L)\n",
+        "\n",
+        "# Metrics\n",
+        "accuracy <- mean(pred_test == y_test)\n",
+        "\n",
+        "# Precision / Recall / F1 (manual)\n",
+        "tp <- sum(pred_test == 1 & y_test == 1)\n",
+        "tn <- sum(pred_test == 0 & y_test == 0)\n",
+        "fp <- sum(pred_test == 1 & y_test == 0)\n",
+        "fn <- sum(pred_test == 0 & y_test == 1)\n",
+        "\n",
+        "# Each ratio falls back to 0 when its denominator is 0 (scalar if/else, not vectorized ifelse)\n",
+        "precision <- if ((tp + fp) > 0) tp / (tp + fp) else 0\n",
+        "recall    <- if ((tp + fn) > 0) tp / (tp + fn) else 0\n",
+        "f1        <- if ((precision + recall) > 0) 2 * precision * recall / (precision + recall) else 0\n",
+        "\n",
+        "# ROC AUC\n",
+        "# levels and direction are pinned: with the default direction = \"auto\", pROC picks whichever\n",
+        "# direction gives the higher AUC (quiet = TRUE hides the notice), which can hide a model that\n",
+        "# ranks the classes the wrong way round.\n",
+        "roc_obj <- pROC::roc(\n",
+        "  response = y_test,\n",
+        "  predictor = proba_test,\n",
+        "  levels = c(0, 1),  # controls = 0 (not canceled), cases = 1 (canceled)\n",
+        "  direction = \"<\",   # controls are expected to score lower than cases\n",
+        "  quiet = TRUE\n",
+        ")\n",
+        "auc <- as.numeric(pROC::auc(roc_obj))\n",
+        "\n",
+        "# Rows = actual class, columns = predicted class\n",
+        "conf_mat <- matrix(c(tn, fp, fn, tp), nrow = 2, byrow = TRUE)\n",
+        "colnames(conf_mat) <- c(\"Pred 0\", \"Pred 1\")\n",
+        "rownames(conf_mat) <- c(\"Actual 0\", \"Actual 1\")\n",
+        "\n",
+        "cat(\"=== Test Metrics (LASSO Logistic Regression) ===\\n\")\n",
+        "cat(\"Accuracy:\", round(accuracy, 4), \"\\n\")\n",
+        "cat(\"Precision:\", round(precision, 4), \"\\n\")\n",
+        "cat(\"Recall:\", round(recall, 4), \"\\n\")\n",
+        "cat(\"F1:\", round(f1, 4), \"\\n\")\n",
+        "cat(\"ROC AUC:\", round(auc, 4), \"\\n\\n\")\n",
+        "cat(\"Confusion Matrix:\\n\")\n",
+        "print(conf_mat)\n",
+        "\n",
+        "# ============================================================\n",
+        "# 7) Save ROC curve plot (for Hugging Face app)\n",
+        "# ============================================================\n",
+        "roc_path <- file.path(FIG_DIR, \"roc_curve.png\")\n",
+        "png(filename = roc_path, width = 900, height = 700)\n",
+        "# finally = dev.off() closes the PNG device even when plotting fails; invisible() keeps\n",
+        "# dev.off()'s return value (the now-current device) from being auto-printed as cell output.\n",
+        "invisible(tryCatch(\n",
+        "  {\n",
+        "    plot(roc_obj, main = \"ROC Curve (LASSO Logistic Regression)\", col = \"blue\", lwd = 2)\n",
+        "    abline(a = 0, b = 1, lty = 2, col = \"gray40\")\n",
+        "  },\n",
+        "  finally = dev.off()\n",
+        "))\n",
+        "cat(\"✅ Saved:\", roc_path, \"\\n\")\n",
+        "\n",
+        "# ============================================================\n",
+        "# 8) Save metrics.json (for Hugging Face app)\n",
+        "# ============================================================\n",
+        "# Collects the model description, split sizes, test metrics and confusion-matrix counts.\n",
+        "metrics_payload <- list(\n",
+        "  model_name = \"glmnet_lasso_logistic\",\n",
+        "  features_used = I(FEATURES),  # I() keeps this a JSON array even with a single feature\n",
+        "  lambda_1se = best_lambda,\n",
+        "  n_train = nrow(train_df),\n",
+        "  n_test = nrow(test_df),\n",
+        "  cancel_rate_train = mean(y_train),\n",
+        "  cancel_rate_test = mean(y_test),\n",
+        "  accuracy = accuracy,\n",
+        "  precision = precision,\n",
+        "  recall = recall,\n",
+        "  f1 = f1,\n",
+        "  roc_auc = auc,\n",
+        "  confusion_matrix = list(\n",
+        "    tn = tn, fp = fp, fn = fn, tp = tp\n",
+        "  )\n",
+        ")\n",
+        "\n",
+        "metrics_path <- file.path(METRICS_DIR, \"metrics.json\")\n",
+        "# digits = NA writes numbers at full precision; toJSON's default (digits = 4) would round\n",
+        "# lambda and every metric to 4 decimal places on disk.\n",
+        "writeLines(\n",
+        "  jsonlite::toJSON(metrics_payload, pretty = TRUE, auto_unbox = TRUE, digits = NA),\n",
+        "  con = metrics_path\n",
+        ")\n",
+        "cat(\"✅ Saved:\", metrics_path, \"\\n\")\n",
+        "\n",
+        "# ============================================================\n",
+        "# 9) Save test predictions table (for app display)\n",
+        "# ============================================================\n",
+        "# Lead with the id column only when the schema names one that exists in the test rows\n",
+        "id_cols <- if (!is.null(ID_COL) && ID_COL %in% colnames(test_df)) ID_COL else NULL\n",
+        "pred_out <- test_df[, c(id_cols, FEATURES, TARGET), drop = FALSE]\n",
+        "pred_out[[\"pred_label\"]] <- pred_test\n",
+        "pred_out[[\"pred_proba_canceled\"]] <- proba_test\n",
+        "\n",
+        "# Highest predicted cancellation probability first\n",
+        "risk_order <- order(-pred_out[[\"pred_proba_canceled\"]])\n",
+        "pred_out <- pred_out[risk_order, ]\n",
+        "\n",
+        "pred_path <- file.path(TABLES_DIR, \"test_predictions.csv\")\n",
+        "write.csv(x = pred_out, file = pred_path, row.names = FALSE)\n",
+        "cat(\"✅ Saved:\", pred_path, \"\\n\")\n",
+        "\n",
+        "# ============================================================\n",
+        "# 10) Save coefficients (feature importance for logistic regression)\n",
+        "# ============================================================\n",
+        "# One row per model term, taken at the selected lambda\n",
+        "coef_mat <- as.matrix(coef(cv_fit, s = best_lambda))\n",
+        "coef_df <- data.frame(\n",
+        "  feature = rownames(coef_mat),\n",
+        "  coefficient = as.numeric(coef_mat[, 1]),\n",
+        "  stringsAsFactors = FALSE\n",
+        ")\n",
+        "\n",
+        "# Drop the intercept row, then rank the rest by absolute magnitude (largest first)\n",
+        "coef_df <- coef_df[which(coef_df$feature != \"(Intercept)\"), , drop = FALSE]\n",
+        "coef_df <- coef_df[order(-abs(coef_df$coefficient)), , drop = FALSE]\n",
+        "\n",
+        "coef_path <- file.path(TABLES_DIR, \"coefficients.csv\")\n",
+        "write.csv(x = coef_df, file = coef_path, row.names = FALSE)\n",
+        "cat(\"✅ Saved:\", coef_path, \"\\n\")\n",
+        "\n",
+        "# ============================================================\n",
+        "# 11) Save trained model (RDS) for potential inference\n",
+        "# ============================================================\n",
+        "# The complete cross-validation fit (cv_fit) is serialized as-is.\n",
+        "# NOTE(review): the factor levels used to build the design matrices are not written\n",
+        "# anywhere by this script, so a consumer of model.rds has to rebuild matching columns.\n",
+        "model_path <- file.path(MODELS_DIR, \"model.rds\")\n",
+        "saveRDS(object = cv_fit, file = model_path)\n",
+        "cat(\"✅ Saved:\", model_path, \"\\n\")\n",
+        "\n",
+        "# ============================================================\n",
+        "# 12) Save an example input row for the app's \"Predict\" tab\n",
+        "# ============================================================\n",
+        "# First training row, feature columns only, as a named list (one entry per feature)\n",
+        "example_row <- as.list(train_df[1, FEATURES, drop = FALSE])\n",
+        "example_path <- file.path(ARTIFACTS_DIR, \"example_input.json\")\n",
+        "# digits = NA keeps the feature values exactly as in the dataset; toJSON's default\n",
+        "# (digits = 4) would round them to 4 decimal places.\n",
+        "writeLines(\n",
+        "  jsonlite::toJSON(example_row, pretty = TRUE, auto_unbox = TRUE, digits = NA),\n",
+        "  con = example_path\n",
+        ")\n",
+        "cat(\"✅ Saved:\", example_path, \"\\n\\n\")\n",
+        "\n",
+        "cat(\"All done. R artifacts saved to:\", ARTIFACTS_DIR, \"\\n\")\n",
+        "cat(\"Next: build app.py to run notebooks and display artifacts/py and artifacts/r side-by-side.\\n\")"
+      ]
+    }
+  ]
+}

booking.csv ADDED Viewed

The diff for this file is too large to render. See raw diff