{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Test MSE:  1.9502589114484195\n",
      "0.9252566690286127\n"
     ]
    }
   ],
   "source": [
    "import pandas as pd\n",
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn.ensemble import HistGradientBoostingRegressor\n",
    "from sklearn.metrics import mean_squared_error,r2_score\n",
    "import pickle\n",
    "\n",
    "# Load the dataset from CSV\n",
    "selected_columns = ['Process type', 'Part Od', 'Part ID', 'Part Width', 'Finish Wt', 'Input Weight']\n",
    "data = pd.read_csv('/Users/abhijay/Desktop/EY_INTERNSHIP/Data_4.csv', usecols=selected_columns)\n",
    "\n",
    "# Assuming 'Input Weight' is the target variable, and other columns are features\n",
    "X = data.drop(columns=['Input Weight'])\n",
    "y = data['Input Weight']\n",
    "\n",
    "# Split the data into training and testing sets (80% train, 20% test)\n",
    "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)\n",
    "\n",
    "from sklearn.model_selection import RandomizedSearchCV\n",
    "from scipy.stats import uniform, randint\n",
    "\n",
    "# Define the parameter grid\n",
    "param_dist = {\n",
    "    'learning_rate': uniform(0.01, 0.2),\n",
    "    'max_iter': randint(100, 1000),\n",
    "    'max_leaf_nodes': randint(10, 50),\n",
    "    'max_depth': randint(3, 10),\n",
    "    'min_samples_leaf': randint(1, 20),\n",
    "    'max_bins': randint(50, 256),\n",
    "    'l2_regularization': uniform(0, 1),\n",
    "    'early_stopping': [True, False]\n",
    "}\n",
    "\n",
    "# Instantiate the model\n",
    "model = HistGradientBoostingRegressor()\n",
    "\n",
    "# Instantiate the RandomizedSearchCV object\n",
    "random_search = RandomizedSearchCV(model, param_distributions=param_dist, n_iter=100, cv=5, random_state=42, n_jobs=-1)\n",
    "\n",
    "# Fit the random search model\n",
    "random_search.fit(X_train, y_train)\n",
    "\n",
    "# Get the best estimator\n",
    "best_model = random_search.best_estimator_\n",
    "\n",
    "# Use the best model to predict on the test set\n",
    "y_pred = best_model.predict(X_test)\n",
    "\n",
    "\n",
    "\n",
    "\n",
    "from sklearn.metrics import mean_squared_error\n",
    "mse_test = mean_squared_error(y_test, y_pred)\n",
    "r_square=r2_score(y_test, y_pred)\n",
    "print(\"Test MSE: \", mse_test)\n",
    "print(r_square)\n",
    "\n",
    "\n",
    "with open('best_model.pkl', 'wb') as model_file:\n",
    "    pickle.dump(best_model, model_file)\n"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "base",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}