{ "nbformat": 4, "nbformat_minor": 0, "metadata": { "colab": { "provenance": [] }, "kernelspec": { "name": "python3", "display_name": "Python 3" }, "language_info": { "name": "python" } }, "cells": [ { "cell_type": "markdown", "source": [ "# Importing The data and preprocessing" ], "metadata": { "id": "o01mOtABchVv" } }, { "cell_type": "code", "execution_count": 1, "metadata": { "id": "7vSnAq8auv2a" }, "outputs": [], "source": [ "import numpy as np\n", "import pandas as pd\n", "\n", "import matplotlib.pyplot as plt\n", "import seaborn as sns\n" ] }, { "cell_type": "code", "source": [ "from google.colab import drive\n", "drive.mount('/content/drive')" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "OS_yd77xmvau", "outputId": "5829d46d-c92c-48be-e610-56c311bb9b84" }, "execution_count": 2, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Mounted at /content/drive\n" ] } ] }, { "cell_type": "code", "source": [ "data=pd.read_excel('/content/drive/MyDrive/Dataset/Dataset.xlsx')\n", "data.head()" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 617 }, "id": "5oWpS54uh6rS", "outputId": "c3b9eff7-2587-43f7-afdf-ca2cf1d9dc5e" }, "execution_count": 3, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ " Airline Date_of_Journey Source Destination Route \\\n", "0 IndiGo 24/03/2019 Banglore New Delhi BLR → DEL \n", "1 Air India 1/05/2019 Kolkata Banglore CCU → IXR → BBI → BLR \n", "2 Jet Airways 9/06/2019 Delhi Cochin DEL → LKO → BOM → COK \n", "3 IndiGo 12/05/2019 Kolkata Banglore CCU → NAG → BLR \n", "4 IndiGo 01/03/2019 Banglore New Delhi BLR → NAG → DEL \n", "\n", " Dep_Time Arrival_Time Duration Total_Stops Additional_Info Price \n", "0 22:20 01:10 22 Mar 2h 50m non-stop No info 3897 \n", "1 05:50 13:15 7h 25m 2 stops No info 7662 \n", "2 09:25 04:25 10 Jun 19h 2 stops No info 13882 \n", "3 18:05 23:30 5h 25m 1 stop No info 6218 \n", "4 16:50 21:35 4h 45m 1 stop No info 13302 " ], "text/html": [ "\n", "\n", "
\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
AirlineDate_of_JourneySourceDestinationRouteDep_TimeArrival_TimeDurationTotal_StopsAdditional_InfoPrice
0IndiGo24/03/2019BangloreNew DelhiBLR → DEL22:2001:10 22 Mar2h 50mnon-stopNo info3897
1Air India1/05/2019KolkataBangloreCCU → IXR → BBI → BLR05:5013:157h 25m2 stopsNo info7662
2Jet Airways9/06/2019DelhiCochinDEL → LKO → BOM → COK09:2504:25 10 Jun19h2 stopsNo info13882
3IndiGo12/05/2019KolkataBangloreCCU → NAG → BLR18:0523:305h 25m1 stopNo info6218
4IndiGo01/03/2019BangloreNew DelhiBLR → NAG → DEL16:5021:354h 45m1 stopNo info13302
\n", "
\n", " \n", "\n", "\n", "\n", "
\n", " \n", "
\n", "\n", "\n", "\n", " \n", "\n", " \n", " \n", "\n", " \n", "
\n", "
\n" ] }, "metadata": {}, "execution_count": 3 } ] }, { "cell_type": "code", "source": [ "data.info()" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "AxYcGBitiOhK", "outputId": "3f1ec570-385a-47f9-ec71-0fae8e83d462" }, "execution_count": 4, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "\n", "RangeIndex: 10683 entries, 0 to 10682\n", "Data columns (total 11 columns):\n", " # Column Non-Null Count Dtype \n", "--- ------ -------------- ----- \n", " 0 Airline 10683 non-null object\n", " 1 Date_of_Journey 10683 non-null object\n", " 2 Source 10683 non-null object\n", " 3 Destination 10683 non-null object\n", " 4 Route 10682 non-null object\n", " 5 Dep_Time 10683 non-null object\n", " 6 Arrival_Time 10683 non-null object\n", " 7 Duration 10683 non-null object\n", " 8 Total_Stops 10682 non-null object\n", " 9 Additional_Info 10683 non-null object\n", " 10 Price 10683 non-null int64 \n", "dtypes: int64(1), object(10)\n", "memory usage: 918.2+ KB\n" ] } ] }, { "cell_type": "markdown", "source": [ "There are 9 features and 1 target variable" ], "metadata": { "id": "LBzdTbn9iWLS" } }, { "cell_type": "code", "source": [ "#checking for null values if any\n", "data.isnull().sum()" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "uo4I7LuYicK3", "outputId": "6652d6f0-a5cb-4095-e922-6437cd380d3c" }, "execution_count": 5, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "Airline 0\n", "Date_of_Journey 0\n", "Source 0\n", "Destination 0\n", "Route 1\n", "Dep_Time 0\n", "Arrival_Time 0\n", "Duration 0\n", "Total_Stops 1\n", "Additional_Info 0\n", "Price 0\n", "dtype: int64" ] }, "metadata": {}, "execution_count": 5 } ] }, { "cell_type": "markdown", "source": [ "These empty values are quite low so dropping them" ], "metadata": { "id": "DzvpZwSPippy" } }, { "cell_type": "code", "source": [ "data.dropna(inplace = True)" ], "metadata": { "id": "qtlxM9lQim3f" }, "execution_count": 6, "outputs": [] }, { "cell_type": "markdown", "source": [ "Dropping duplicates rows if any" ], "metadata": { "id": "Mp_kwLGijKG1" } }, { "cell_type": "code", "source": [ "data.drop_duplicates(inplace=True)" ], "metadata": { "id": "ngYaw3Ghi4e_" }, "execution_count": 7, "outputs": [] }, { "cell_type": "markdown", "source": [ "Visualization" ], "metadata": { "id": "3qbV8qRBjcmP" } }, { "cell_type": "markdown", "source": [ "Converting the date and times" ], "metadata": { "id": "AVSeONgRu87n" } }, { "cell_type": "code", "source": [ "data[\"Journey_day\"] = pd.to_datetime(data.Date_of_Journey, format=\"%d/%m/%Y\").dt.day" ], "metadata": { "id": "ppBzqcpaku-f" }, "execution_count": 8, "outputs": [] }, { "cell_type": "code", "source": [ "data[\"Journey_month\"] = pd.to_datetime(data[\"Date_of_Journey\"], format = \"%d/%m/%Y\").dt.month" ], "metadata": { "id": "Vky6Dh1Rup7c" }, "execution_count": 9, "outputs": [] }, { "cell_type": "code", "source": [ "data.head()" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 617 }, "id": "NUk0waDwu4K0", "outputId": "9c31c825-badf-4444-8491-a5cd0b8b0526" }, "execution_count": 10, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ " Airline Date_of_Journey Source Destination Route \\\n", "0 IndiGo 24/03/2019 Banglore New Delhi BLR → DEL \n", "1 Air India 1/05/2019 Kolkata Banglore CCU → IXR → BBI → BLR \n", "2 Jet Airways 9/06/2019 Delhi Cochin DEL → LKO → BOM → COK \n", "3 IndiGo 12/05/2019 Kolkata Banglore CCU → NAG → BLR \n", "4 IndiGo 01/03/2019 Banglore New Delhi BLR → NAG → DEL \n", "\n", " Dep_Time Arrival_Time Duration Total_Stops Additional_Info Price \\\n", "0 22:20 01:10 22 Mar 2h 50m non-stop No info 3897 \n", "1 05:50 13:15 7h 25m 2 stops No info 7662 \n", "2 09:25 04:25 10 Jun 19h 2 stops No info 13882 \n", "3 18:05 23:30 5h 25m 1 stop No info 6218 \n", "4 16:50 21:35 4h 45m 1 stop No info 13302 \n", "\n", " Journey_day Journey_month \n", "0 24 3 \n", "1 1 5 \n", "2 9 6 \n", "3 12 5 \n", "4 1 3 " ], "text/html": [ "\n", "\n", "
\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
AirlineDate_of_JourneySourceDestinationRouteDep_TimeArrival_TimeDurationTotal_StopsAdditional_InfoPriceJourney_dayJourney_month
0IndiGo24/03/2019BangloreNew DelhiBLR → DEL22:2001:10 22 Mar2h 50mnon-stopNo info3897243
1Air India1/05/2019KolkataBangloreCCU → IXR → BBI → BLR05:5013:157h 25m2 stopsNo info766215
2Jet Airways9/06/2019DelhiCochinDEL → LKO → BOM → COK09:2504:25 10 Jun19h2 stopsNo info1388296
3IndiGo12/05/2019KolkataBangloreCCU → NAG → BLR18:0523:305h 25m1 stopNo info6218125
4IndiGo01/03/2019BangloreNew DelhiBLR → NAG → DEL16:5021:354h 45m1 stopNo info1330213
\n", "
\n", " \n", "\n", "\n", "\n", "
\n", " \n", "
\n", "\n", "\n", "\n", " \n", "\n", " \n", " \n", "\n", " \n", "
\n", "
\n" ] }, "metadata": {}, "execution_count": 10 } ] }, { "cell_type": "code", "source": [ "#DAte 0f journey is now of no use so dropping it\n", "data.drop([\"Date_of_Journey\"], axis = 1, inplace = True)" ], "metadata": { "id": "oOguMzM-vB7j" }, "execution_count": 11, "outputs": [] }, { "cell_type": "code", "source": [ "#converting the dep time into hours and minutes\n", "data['Dep_hour']=pd.to_datetime(data[\"Dep_Time\"]).dt.hour\n", "#extracting the minutes\n", "data['Dep_min']=pd.to_datetime(data[\"Dep_Time\"]).dt.minute\n", "\n", "#Now we can drop the date time and it is not of use\n", "data.drop([\"Dep_Time\"], axis = 1, inplace = True)" ], "metadata": { "id": "iOOO2UravW6A" }, "execution_count": 12, "outputs": [] }, { "cell_type": "code", "source": [ "data.head()" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 617 }, "id": "cG2GyHzjwodK", "outputId": "301bb964-ef22-447b-9fca-05309ecd46a2" }, "execution_count": 13, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ " Airline Source Destination Route Arrival_Time \\\n", "0 IndiGo Banglore New Delhi BLR → DEL 01:10 22 Mar \n", "1 Air India Kolkata Banglore CCU → IXR → BBI → BLR 13:15 \n", "2 Jet Airways Delhi Cochin DEL → LKO → BOM → COK 04:25 10 Jun \n", "3 IndiGo Kolkata Banglore CCU → NAG → BLR 23:30 \n", "4 IndiGo Banglore New Delhi BLR → NAG → DEL 21:35 \n", "\n", " Duration Total_Stops Additional_Info Price Journey_day Journey_month \\\n", "0 2h 50m non-stop No info 3897 24 3 \n", "1 7h 25m 2 stops No info 7662 1 5 \n", "2 19h 2 stops No info 13882 9 6 \n", "3 5h 25m 1 stop No info 6218 12 5 \n", "4 4h 45m 1 stop No info 13302 1 3 \n", "\n", " Dep_hour Dep_min \n", "0 22 20 \n", "1 5 50 \n", "2 9 25 \n", "3 18 5 \n", "4 16 50 " ], "text/html": [ "\n", "\n", "
\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
AirlineSourceDestinationRouteArrival_TimeDurationTotal_StopsAdditional_InfoPriceJourney_dayJourney_monthDep_hourDep_min
0IndiGoBangloreNew DelhiBLR → DEL01:10 22 Mar2h 50mnon-stopNo info38972432220
1Air IndiaKolkataBangloreCCU → IXR → BBI → BLR13:157h 25m2 stopsNo info766215550
2Jet AirwaysDelhiCochinDEL → LKO → BOM → COK04:25 10 Jun19h2 stopsNo info1388296925
3IndiGoKolkataBangloreCCU → NAG → BLR23:305h 25m1 stopNo info6218125185
4IndiGoBangloreNew DelhiBLR → NAG → DEL21:354h 45m1 stopNo info13302131650
\n", "
\n", " \n", "\n", "\n", "\n", "
\n", " \n", "
\n", "\n", "\n", "\n", " \n", "\n", " \n", " \n", "\n", " \n", "
\n", "
\n" ] }, "metadata": {}, "execution_count": 13 } ] }, { "cell_type": "code", "source": [ "# Arrival time is when the plane pulls up to the gate.\n", "# Similar to Date_of_Journey we can extract values from Arrival_Time\n", "\n", "# Extracting Hours\n", "data[\"Arrival_hour\"] = pd.to_datetime(data.Arrival_Time).dt.hour\n", "\n", "# Extracting Minutes\n", "data[\"Arrival_min\"] = pd.to_datetime(data.Arrival_Time).dt.minute\n", "\n", "# Now we can drop Arrival_Time as it is of no use\n", "data.drop([\"Arrival_Time\"], axis = 1, inplace = True)" ], "metadata": { "id": "RCErZ0iywqnc" }, "execution_count": 14, "outputs": [] }, { "cell_type": "code", "source": [ "data.head()" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 617 }, "id": "Utiri2DpxN7n", "outputId": "f5c101a6-99a0-403b-e058-87d05be900fd" }, "execution_count": 15, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ " Airline Source Destination Route Duration \\\n", "0 IndiGo Banglore New Delhi BLR → DEL 2h 50m \n", "1 Air India Kolkata Banglore CCU → IXR → BBI → BLR 7h 25m \n", "2 Jet Airways Delhi Cochin DEL → LKO → BOM → COK 19h \n", "3 IndiGo Kolkata Banglore CCU → NAG → BLR 5h 25m \n", "4 IndiGo Banglore New Delhi BLR → NAG → DEL 4h 45m \n", "\n", " Total_Stops Additional_Info Price Journey_day Journey_month Dep_hour \\\n", "0 non-stop No info 3897 24 3 22 \n", "1 2 stops No info 7662 1 5 5 \n", "2 2 stops No info 13882 9 6 9 \n", "3 1 stop No info 6218 12 5 18 \n", "4 1 stop No info 13302 1 3 16 \n", "\n", " Dep_min Arrival_hour Arrival_min \n", "0 20 1 10 \n", "1 50 13 15 \n", "2 25 4 25 \n", "3 5 23 30 \n", "4 50 21 35 " ], "text/html": [ "\n", "\n", "
\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
AirlineSourceDestinationRouteDurationTotal_StopsAdditional_InfoPriceJourney_dayJourney_monthDep_hourDep_minArrival_hourArrival_min
0IndiGoBangloreNew DelhiBLR → DEL2h 50mnon-stopNo info38972432220110
1Air IndiaKolkataBangloreCCU → IXR → BBI → BLR7h 25m2 stopsNo info7662155501315
2Jet AirwaysDelhiCochinDEL → LKO → BOM → COK19h2 stopsNo info1388296925425
3IndiGoKolkataBangloreCCU → NAG → BLR5h 25m1 stopNo info62181251852330
4IndiGoBangloreNew DelhiBLR → NAG → DEL4h 45m1 stopNo info133021316502135
\n", "
\n", " \n", "\n", "\n", "\n", "
\n", " \n", "
\n", "\n", "\n", "\n", " \n", "\n", " \n", " \n", "\n", " \n", "
\n", "
\n" ] }, "metadata": {}, "execution_count": 15 } ] }, { "cell_type": "code", "source": [ "# Assigning and converting Duration column into list\n", "duration = list(data[\"Duration\"])" ], "metadata": { "id": "v68C1uRvxQVy" }, "execution_count": 16, "outputs": [] }, { "cell_type": "code", "source": [ "#taking the durations\n", "for i in range(len(duration)):\n", " if len(duration[i].split()) != 2:\n", " if \"h\" in duration[i]:\n", " duration[i] = duration[i].strip() + \" 0m\"\n", " else:\n", " duration[i] = \"0h \" + duration[i]\n", "\n", "duration_hours = []\n", "duration_mins = []\n", "for i in range(len(duration)):\n", " duration_hours.append(int(duration[i].split(sep = \"h\")[0]))\n", " duration_mins.append(int(duration[i].split(sep = \"m\")[0].split()[-1]))" ], "metadata": { "id": "IlREQHKcxpX1" }, "execution_count": 17, "outputs": [] }, { "cell_type": "code", "source": [ "data[\"Duration_hours\"] = duration_hours\n", "data[\"Duration_mins\"] = duration_mins" ], "metadata": { "id": "w_SyRaE-x6js" }, "execution_count": 18, "outputs": [] }, { "cell_type": "code", "source": [ "data.drop([\"Duration\"], axis = 1, inplace = True)" ], "metadata": { "id": "7bm_D-L4yARV" }, "execution_count": 19, "outputs": [] }, { "cell_type": "code", "source": [ "data.head()" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 617 }, "id": "552zYEn4yVTx", "outputId": "95bca8bd-efa9-475f-c368-ea5c85b176be" }, "execution_count": 20, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ " Airline Source Destination Route Total_Stops \\\n", "0 IndiGo Banglore New Delhi BLR → DEL non-stop \n", "1 Air India Kolkata Banglore CCU → IXR → BBI → BLR 2 stops \n", "2 Jet Airways Delhi Cochin DEL → LKO → BOM → COK 2 stops \n", "3 IndiGo Kolkata Banglore CCU → NAG → BLR 1 stop \n", "4 IndiGo Banglore New Delhi BLR → NAG → DEL 1 stop \n", "\n", " Additional_Info Price Journey_day Journey_month Dep_hour Dep_min \\\n", "0 No info 3897 24 3 22 20 \n", "1 No info 7662 1 5 5 50 \n", "2 No info 13882 9 6 9 25 \n", "3 No info 6218 12 5 18 5 \n", "4 No info 13302 1 3 16 50 \n", "\n", " Arrival_hour Arrival_min Duration_hours Duration_mins \n", "0 1 10 2 50 \n", "1 13 15 7 25 \n", "2 4 25 19 0 \n", "3 23 30 5 25 \n", "4 21 35 4 45 " ], "text/html": [ "\n", "\n", "
\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
AirlineSourceDestinationRouteTotal_StopsAdditional_InfoPriceJourney_dayJourney_monthDep_hourDep_minArrival_hourArrival_minDuration_hoursDuration_mins
0IndiGoBangloreNew DelhiBLR → DELnon-stopNo info38972432220110250
1Air IndiaKolkataBangloreCCU → IXR → BBI → BLR2 stopsNo info7662155501315725
2Jet AirwaysDelhiCochinDEL → LKO → BOM → COK2 stopsNo info1388296925425190
3IndiGoKolkataBangloreCCU → NAG → BLR1 stopNo info62181251852330525
4IndiGoBangloreNew DelhiBLR → NAG → DEL1 stopNo info133021316502135445
\n", "
\n", " \n", "\n", "\n", "\n", "
\n", " \n", "
\n", "\n", "\n", "\n", " \n", "\n", " \n", " \n", "\n", " \n", "
\n", "
\n" ] }, "metadata": {}, "execution_count": 20 } ] }, { "cell_type": "markdown", "source": [ "# Handling the categorical data" ], "metadata": { "id": "1792aAnuyjTW" } }, { "cell_type": "code", "source": [ "data[[\"Source\"]].value_counts()" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "RYWKDHc0zZSm", "outputId": "20e148f5-b0f6-4476-c540-f3d382278351" }, "execution_count": 21, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "Source \n", "Delhi 4345\n", "Kolkata 2860\n", "Banglore 2179\n", "Mumbai 697\n", "Chennai 381\n", "dtype: int64" ] }, "metadata": {}, "execution_count": 21 } ] }, { "cell_type": "code", "source": [ "# Plotting Violin plot for Price vs Source\n", "sns.catplot(y = \"Price\", x = \"Source\", data = data.sort_values(\"Price\", ascending = False), kind=\"violin\", height = 4, aspect = 3)\n", "plt.show()" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 351 }, "id": "c6NzHmjSjgQx", "outputId": "3d7107ec-3a20-4ef2-e357-83687edee6a5" }, "execution_count": 22, "outputs": [ { "output_type": "display_data", "data": { "text/plain": [ "
" ], "image/png": "\n" }, "metadata": {} } ] }, { "cell_type": "code", "source": [ "# Plotting Price vs Airline plot\n", "sns.catplot(y = \"Price\", x = \"Airline\", data = data.sort_values(\"Price\", ascending = False), kind=\"boxen\", height = 8, aspect = 3)\n", "plt.show()" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 353 }, "id": "zeb_NtFTjI8q", "outputId": "00f946ee-2834-49a7-d9ea-fa59a3790f72" }, "execution_count": 23, "outputs": [ { "output_type": "display_data", "data": { "text/plain": [ "
" ], "image/png": "\n" }, "metadata": {} } ] }, { "cell_type": "markdown", "source": [ "Airline is a Nominal categorical data as we can't assign them any order" ], "metadata": { "id": "Qf0L_zdDzIe_" } }, { "cell_type": "code", "source": [ "Airline = pd.get_dummies(data[[\"Airline\"]], drop_first= True)\n", "Airline.head()" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 305 }, "id": "2xElhraEyd0k", "outputId": "b3e03fa9-632e-4f13-aec8-913654e02ac0" }, "execution_count": 24, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ " Airline_Air India Airline_GoAir Airline_IndiGo Airline_Jet Airways \\\n", "0 0 0 1 0 \n", "1 1 0 0 0 \n", "2 0 0 0 1 \n", "3 0 0 1 0 \n", "4 0 0 1 0 \n", "\n", " Airline_Jet Airways Business Airline_Multiple carriers \\\n", "0 0 0 \n", "1 0 0 \n", "2 0 0 \n", "3 0 0 \n", "4 0 0 \n", "\n", " Airline_Multiple carriers Premium economy Airline_SpiceJet \\\n", "0 0 0 \n", "1 0 0 \n", "2 0 0 \n", "3 0 0 \n", "4 0 0 \n", "\n", " Airline_Trujet Airline_Vistara Airline_Vistara Premium economy \n", "0 0 0 0 \n", "1 0 0 0 \n", "2 0 0 0 \n", "3 0 0 0 \n", "4 0 0 0 " ], "text/html": [ "\n", "\n", "
\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Airline_Air IndiaAirline_GoAirAirline_IndiGoAirline_Jet AirwaysAirline_Jet Airways BusinessAirline_Multiple carriersAirline_Multiple carriers Premium economyAirline_SpiceJetAirline_TrujetAirline_VistaraAirline_Vistara Premium economy
000100000000
110000000000
200010000000
300100000000
400100000000
\n", "
\n", " \n", "\n", "\n", "\n", "
\n", " \n", "
\n", "\n", "\n", "\n", " \n", "\n", " \n", " \n", "\n", " \n", "
\n", "
\n" ] }, "metadata": {}, "execution_count": 24 } ] }, { "cell_type": "markdown", "source": [ "Source is a Nominal categorical data as we can't assign them any order" ], "metadata": { "id": "Ylh9Wa6DzvbW" } }, { "cell_type": "code", "source": [ "Destination = pd.get_dummies(data[[\"Destination\"]], drop_first = True)\n", "Destination.head()" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 206 }, "id": "pfPKhKlMz9ZD", "outputId": "84be5402-0c20-4d3f-a634-e2983aebaca8" }, "execution_count": 25, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ " Destination_Cochin Destination_Delhi Destination_Hyderabad \\\n", "0 0 0 0 \n", "1 0 0 0 \n", "2 1 0 0 \n", "3 0 0 0 \n", "4 0 0 0 \n", "\n", " Destination_Kolkata Destination_New Delhi \n", "0 0 1 \n", "1 0 0 \n", "2 0 0 \n", "3 0 0 \n", "4 0 1 " ], "text/html": [ "\n", "\n", "
\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Destination_CochinDestination_DelhiDestination_HyderabadDestination_KolkataDestination_New Delhi
000001
100000
210000
300000
400001
\n", "
\n", " \n", "\n", "\n", "\n", "
\n", " \n", "
\n", "\n", "\n", "\n", " \n", "\n", " \n", " \n", "\n", " \n", "
\n", "
\n" ] }, "metadata": {}, "execution_count": 25 } ] }, { "cell_type": "code", "source": [ "Source = pd.get_dummies(data[[\"Source\"]], drop_first= True)\n", "\n", "Source.head()" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 206 }, "id": "d1m_vz9gzkAF", "outputId": "775ecc7a-e164-4540-ba0e-5ed2e8c2082a" }, "execution_count": 26, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ " Source_Chennai Source_Delhi Source_Kolkata Source_Mumbai\n", "0 0 0 0 0\n", "1 0 0 1 0\n", "2 0 1 0 0\n", "3 0 0 1 0\n", "4 0 0 0 0" ], "text/html": [ "\n", "\n", "
\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Source_ChennaiSource_DelhiSource_KolkataSource_Mumbai
00000
10010
20100
30010
40000
\n", "
\n", " \n", "\n", "\n", "\n", "
\n", " \n", "
\n", "\n", "\n", "\n", " \n", "\n", " \n", " \n", "\n", " \n", "
\n", "
\n" ] }, "metadata": {}, "execution_count": 26 } ] }, { "cell_type": "code", "source": [ "data.head()" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 617 }, "id": "i0K1g-Si050x", "outputId": "4502f938-b39d-474e-bd49-fe83cec5afc4" }, "execution_count": 27, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ " Airline Source Destination Route Total_Stops \\\n", "0 IndiGo Banglore New Delhi BLR → DEL non-stop \n", "1 Air India Kolkata Banglore CCU → IXR → BBI → BLR 2 stops \n", "2 Jet Airways Delhi Cochin DEL → LKO → BOM → COK 2 stops \n", "3 IndiGo Kolkata Banglore CCU → NAG → BLR 1 stop \n", "4 IndiGo Banglore New Delhi BLR → NAG → DEL 1 stop \n", "\n", " Additional_Info Price Journey_day Journey_month Dep_hour Dep_min \\\n", "0 No info 3897 24 3 22 20 \n", "1 No info 7662 1 5 5 50 \n", "2 No info 13882 9 6 9 25 \n", "3 No info 6218 12 5 18 5 \n", "4 No info 13302 1 3 16 50 \n", "\n", " Arrival_hour Arrival_min Duration_hours Duration_mins \n", "0 1 10 2 50 \n", "1 13 15 7 25 \n", "2 4 25 19 0 \n", "3 23 30 5 25 \n", "4 21 35 4 45 " ], "text/html": [ "\n", "\n", "
\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
AirlineSourceDestinationRouteTotal_StopsAdditional_InfoPriceJourney_dayJourney_monthDep_hourDep_minArrival_hourArrival_minDuration_hoursDuration_mins
0IndiGoBangloreNew DelhiBLR → DELnon-stopNo info38972432220110250
1Air IndiaKolkataBangloreCCU → IXR → BBI → BLR2 stopsNo info7662155501315725
2Jet AirwaysDelhiCochinDEL → LKO → BOM → COK2 stopsNo info1388296925425190
3IndiGoKolkataBangloreCCU → NAG → BLR1 stopNo info62181251852330525
4IndiGoBangloreNew DelhiBLR → NAG → DEL1 stopNo info133021316502135445
\n", "
\n", " \n", "\n", "\n", "\n", "
\n", " \n", "
\n", "\n", "\n", "\n", " \n", "\n", " \n", " \n", "\n", " \n", "
\n", "
\n" ] }, "metadata": {}, "execution_count": 27 } ] }, { "cell_type": "markdown", "source": [ "Source is a Nominal categorical data as we can't assign them any order" ], "metadata": { "id": "e7TfU-Ex0GXu" } }, { "cell_type": "code", "source": [ "# Additional_Info contains almost 80% no_info\n", "# Route and Total_Stops are related to each other\n", "\n", "data.drop([\"Route\", \"Additional_Info\"],axis = 1,inplace = True)" ], "metadata": { "id": "6K4HZyDQ0VUD" }, "execution_count": 28, "outputs": [] }, { "cell_type": "code", "source": [ "data.replace({\"non-stop\": 0, \"1 stop\": 1, \"2 stops\": 2, \"3 stops\": 3, \"4 stops\": 4},inplace = True)" ], "metadata": { "id": "NKMlbZ8Q0dp-" }, "execution_count": 29, "outputs": [] }, { "cell_type": "code", "source": [ "#Now adding encoded columns to the dataframe\n", "data_encoded = pd.concat([data, Airline, Source, Destination], axis = 1)" ], "metadata": { "id": "1dxtdYdz09zL" }, "execution_count": 30, "outputs": [] }, { "cell_type": "code", "source": [ "data_encoded.drop([\"Airline\", \"Source\", \"Destination\"], axis = 1, inplace = True)" ], "metadata": { "id": "UzIoo0OL5f-7" }, "execution_count": 31, "outputs": [] }, { "cell_type": "code", "source": [ "data_encoded.head()" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 317 }, "id": "cOA8Xw7h1XBm", "outputId": "9e883f83-fc0f-4815-bc28-6805f883e9de" }, "execution_count": 32, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ " Total_Stops Price Journey_day Journey_month Dep_hour Dep_min \\\n", "0 0 3897 24 3 22 20 \n", "1 2 7662 1 5 5 50 \n", "2 2 13882 9 6 9 25 \n", "3 1 6218 12 5 18 5 \n", "4 1 13302 1 3 16 50 \n", "\n", " Arrival_hour Arrival_min Duration_hours Duration_mins ... \\\n", "0 1 10 2 50 ... \n", "1 13 15 7 25 ... \n", "2 4 25 19 0 ... \n", "3 23 30 5 25 ... \n", "4 21 35 4 45 ... \n", "\n", " Airline_Vistara Premium economy Source_Chennai Source_Delhi \\\n", "0 0 0 0 \n", "1 0 0 0 \n", "2 0 0 1 \n", "3 0 0 0 \n", "4 0 0 0 \n", "\n", " Source_Kolkata Source_Mumbai Destination_Cochin Destination_Delhi \\\n", "0 0 0 0 0 \n", "1 1 0 0 0 \n", "2 0 0 1 0 \n", "3 1 0 0 0 \n", "4 0 0 0 0 \n", "\n", " Destination_Hyderabad Destination_Kolkata Destination_New Delhi \n", "0 0 0 1 \n", "1 0 0 0 \n", "2 0 0 0 \n", "3 0 0 0 \n", "4 0 0 1 \n", "\n", "[5 rows x 30 columns]" ], "text/html": [ "\n", "\n", "
\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Total_StopsPriceJourney_dayJourney_monthDep_hourDep_minArrival_hourArrival_minDuration_hoursDuration_mins...Airline_Vistara Premium economySource_ChennaiSource_DelhiSource_KolkataSource_MumbaiDestination_CochinDestination_DelhiDestination_HyderabadDestination_KolkataDestination_New Delhi
0038972432220110250...0000000001
127662155501315725...0001000000
221388296925425190...0010010000
3162181251852330525...0001000000
41133021316502135445...0000000001
\n", "

5 rows × 30 columns

\n", "
\n", " \n", "\n", "\n", "\n", "
\n", " \n", "
\n", "\n", "\n", "\n", " \n", "\n", " \n", " \n", "\n", " \n", "
\n", "
\n" ] }, "metadata": {}, "execution_count": 32 } ] }, { "cell_type": "markdown", "source": [ "There are 33 columns total so training will take much time so selecting some features" ], "metadata": { "id": "600IAmjT2gds" } }, { "cell_type": "code", "source": [ "X = data_encoded.loc[:, ['Total_Stops', 'Journey_day', 'Journey_month', 'Dep_hour',\n", " 'Dep_min', 'Arrival_hour', 'Arrival_min', 'Duration_hours',\n", " 'Duration_mins', 'Airline_Air India', 'Airline_GoAir', 'Airline_IndiGo',\n", " 'Airline_Jet Airways', 'Airline_Jet Airways Business',\n", " 'Airline_Multiple carriers',\n", " 'Airline_Multiple carriers Premium economy', 'Airline_SpiceJet',\n", " 'Airline_Trujet', 'Airline_Vistara', 'Airline_Vistara Premium economy',\n", " 'Source_Chennai', 'Source_Delhi', 'Source_Kolkata', 'Source_Mumbai',\n", " 'Destination_Cochin', 'Destination_Delhi', 'Destination_Hyderabad',\n", " 'Destination_Kolkata', 'Destination_New Delhi']]\n", "X.head()" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 317 }, "id": "2tQ514tN1Z1B", "outputId": "a263007a-584f-4359-b066-83cc81aaca4c" }, "execution_count": 33, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ " Total_Stops Journey_day Journey_month Dep_hour Dep_min Arrival_hour \\\n", "0 0 24 3 22 20 1 \n", "1 2 1 5 5 50 13 \n", "2 2 9 6 9 25 4 \n", "3 1 12 5 18 5 23 \n", "4 1 1 3 16 50 21 \n", "\n", " Arrival_min Duration_hours Duration_mins Airline_Air India ... \\\n", "0 10 2 50 0 ... \n", "1 15 7 25 1 ... \n", "2 25 19 0 0 ... \n", "3 30 5 25 0 ... \n", "4 35 4 45 0 ... \n", "\n", " Airline_Vistara Premium economy Source_Chennai Source_Delhi \\\n", "0 0 0 0 \n", "1 0 0 0 \n", "2 0 0 1 \n", "3 0 0 0 \n", "4 0 0 0 \n", "\n", " Source_Kolkata Source_Mumbai Destination_Cochin Destination_Delhi \\\n", "0 0 0 0 0 \n", "1 1 0 0 0 \n", "2 0 0 1 0 \n", "3 1 0 0 0 \n", "4 0 0 0 0 \n", "\n", " Destination_Hyderabad Destination_Kolkata Destination_New Delhi \n", "0 0 0 1 \n", "1 0 0 0 \n", "2 0 0 0 \n", "3 0 0 0 \n", "4 0 0 1 \n", "\n", "[5 rows x 29 columns]" ], "text/html": [ "\n", "\n", "
\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Total_StopsJourney_dayJourney_monthDep_hourDep_minArrival_hourArrival_minDuration_hoursDuration_minsAirline_Air India...Airline_Vistara Premium economySource_ChennaiSource_DelhiSource_KolkataSource_MumbaiDestination_CochinDestination_DelhiDestination_HyderabadDestination_KolkataDestination_New Delhi
0024322201102500...0000000001
121555013157251...0001000000
22969254251900...0010010000
3112518523305250...0001000000
4113165021354450...0000000001
\n", "

5 rows × 29 columns

\n", "
\n", " \n", "\n", "\n", "\n", "
\n", " \n", "
\n", "\n", "\n", "\n", " \n", "\n", " \n", " \n", "\n", " \n", "
\n", "
\n" ] }, "metadata": {}, "execution_count": 33 } ] }, { "cell_type": "markdown", "source": [ "# Implementing the model" ], "metadata": { "id": "dvzlsYIa4I5X" } }, { "cell_type": "markdown", "source": [ "## Decision Trees" ], "metadata": { "id": "lU3jMty0YHil" } }, { "cell_type": "code", "source": [ "y = data_encoded.iloc[:, 1]\n", "y.head()" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "PD85rRw_nhwA", "outputId": "b85e6623-336d-4a42-b902-2a10de6df0a9" }, "execution_count": 34, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "0 3897\n", "1 7662\n", "2 13882\n", "3 6218\n", "4 13302\n", "Name: Price, dtype: int64" ] }, "metadata": {}, "execution_count": 34 } ] }, { "cell_type": "code", "source": [ "from sklearn.model_selection import train_test_split\n", "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)" ], "metadata": { "id": "SIqacpuJncsJ" }, "execution_count": 35, "outputs": [] }, { "cell_type": "code", "source": [ "from sklearn.tree import DecisionTreeRegressor\n", "dtr=DecisionTreeRegressor()\n", "dtr.fit(X_train,y_train)" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 75 }, "id": "UeA4wMJZYJv-", "outputId": "fa2c28e5-b139-41a0-9712-ba4b3fc7d934" }, "execution_count": 36, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "DecisionTreeRegressor()" ], "text/html": [ "
DecisionTreeRegressor()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" ] }, "metadata": {}, "execution_count": 36 } ] }, { "cell_type": "code", "source": [ "y_dtc_pred =dtr.predict(X_test)" ], "metadata": { "id": "pBKYvSEgYanB" }, "execution_count": 37, "outputs": [] }, { "cell_type": "code", "source": [ "sns.distplot(y_dtc_pred)\n", "plt.show()" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 626 }, "id": "qhXiiz-xiR1m", "outputId": "8f9a6995-cc4e-4ae2-9029-cc9386caeefd" }, "execution_count": 38, "outputs": [ { "output_type": "stream", "name": "stderr", "text": [ ":1: UserWarning: \n", "\n", "`distplot` is a deprecated function and will be removed in seaborn v0.14.0.\n", "\n", "Please adapt your code to use either `displot` (a figure-level function with\n", "similar flexibility) or `histplot` (an axes-level function for histograms).\n", "\n", "For a guide to updating your code to use the new functions, please see\n", "https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751\n", "\n", " sns.distplot(y_dtc_pred)\n" ] }, { "output_type": "display_data", "data": { "text/plain": [ "
" ], "image/png": "\n" }, "metadata": {} } ] }, { "cell_type": "code", "source": [ "plt.scatter(y_test, y_dtc_pred, alpha = 0.5)\n", "plt.xlabel(\"y_test\")\n", "plt.ylabel(\"y_pred\")\n", "plt.show()" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 449 }, "id": "SMhKm0jciTan", "outputId": "463fd59c-b572-47b4-c6c1-c2dc63b9d307" }, "execution_count": 39, "outputs": [ { "output_type": "display_data", "data": { "text/plain": [ "
" ], "image/png": "\n" }, "metadata": {} } ] }, { "cell_type": "code", "source": [ "from sklearn import metrics\n", "metrics.r2_score(y_test, y_dtc_pred)" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "_JldEFL2Yl68", "outputId": "bc8b8cf1-6018-4421-ad7b-76e1a06a9792" }, "execution_count": 40, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "0.708256405913855" ] }, "metadata": {}, "execution_count": 40 } ] }, { "cell_type": "markdown", "source": [ "### HyperParameter Tuning" ], "metadata": { "id": "SdcVyCO3Yotw" } }, { "cell_type": "code", "source": [ "max_depth=[5,10,15,20]\n", "ccp_alpha=[0.001,0.05,0.1]\n", "max_features=[5,10,15,20]" ], "metadata": { "id": "YQmGSv4WYr6H" }, "execution_count": 41, "outputs": [] }, { "cell_type": "code", "source": [ "decision_grid = {'max_depth':max_depth,\n", " 'ccp_alpha':ccp_alpha,\n", " 'max_features':max_features}" ], "metadata": { "id": "TMr9h9DDY6os" }, "execution_count": 42, "outputs": [] }, { "cell_type": "code", "source": [ "from sklearn.model_selection import RandomizedSearchCV" ], "metadata": { "id": "beQvcwV9n6s_" }, "execution_count": 43, "outputs": [] }, { "cell_type": "code", "source": [ "decision_random = RandomizedSearchCV(estimator = dtr, param_distributions = decision_grid,scoring='neg_mean_squared_error', n_iter = 10, cv = 5, verbose=2, random_state=42)" ], "metadata": { "id": "Cov1Cp0iZKZT" }, "execution_count": 44, "outputs": [] }, { "cell_type": "code", "source": [ "decision_random.fit(X_train,y_train)" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 1000 }, "id": "-7Mcth-4ZZwB", "outputId": "81cc85b5-edeb-425c-dbc8-cea7dad0f98e" }, "execution_count": 45, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Fitting 5 folds for each of 10 candidates, totalling 50 fits\n", "[CV] END ......ccp_alpha=0.05, max_depth=15, max_features=20; total time= 0.0s\n", "[CV] END ......ccp_alpha=0.05, max_depth=15, max_features=20; total time= 0.0s\n", "[CV] END ......ccp_alpha=0.05, max_depth=15, max_features=20; total time= 0.0s\n", "[CV] END ......ccp_alpha=0.05, max_depth=15, max_features=20; total time= 0.0s\n", "[CV] END ......ccp_alpha=0.05, max_depth=15, max_features=20; total time= 0.0s\n", "[CV] END ........ccp_alpha=0.1, max_depth=15, max_features=5; total time= 0.0s\n", "[CV] END ........ccp_alpha=0.1, max_depth=15, max_features=5; total time= 0.0s\n", "[CV] END ........ccp_alpha=0.1, max_depth=15, max_features=5; total time= 0.0s\n", "[CV] END ........ccp_alpha=0.1, max_depth=15, max_features=5; total time= 0.0s\n", "[CV] END ........ccp_alpha=0.1, max_depth=15, max_features=5; total time= 0.0s\n", "[CV] END ......ccp_alpha=0.05, max_depth=15, max_features=15; total time= 0.0s\n", "[CV] END ......ccp_alpha=0.05, max_depth=15, max_features=15; total time= 0.0s\n", "[CV] END ......ccp_alpha=0.05, max_depth=15, max_features=15; total time= 0.0s\n", "[CV] END ......ccp_alpha=0.05, max_depth=15, max_features=15; total time= 0.0s\n", "[CV] END ......ccp_alpha=0.05, max_depth=15, max_features=15; total time= 0.0s\n", "[CV] END .......ccp_alpha=0.1, max_depth=15, max_features=20; total time= 0.0s\n", "[CV] END .......ccp_alpha=0.1, max_depth=15, max_features=20; total time= 0.0s\n", "[CV] END .......ccp_alpha=0.1, max_depth=15, max_features=20; total time= 0.0s\n", "[CV] END .......ccp_alpha=0.1, max_depth=15, max_features=20; total time= 0.0s\n", "[CV] END .......ccp_alpha=0.1, max_depth=15, max_features=20; total time= 0.0s\n", "[CV] END .......ccp_alpha=0.05, max_depth=15, max_features=5; total time= 0.0s\n", "[CV] END .......ccp_alpha=0.05, max_depth=15, max_features=5; total time= 0.0s\n", "[CV] END .......ccp_alpha=0.05, max_depth=15, max_features=5; total time= 0.0s\n", "[CV] END .......ccp_alpha=0.05, max_depth=15, max_features=5; total time= 0.0s\n", "[CV] END .......ccp_alpha=0.05, max_depth=15, max_features=5; total time= 0.0s\n", "[CV] END .......ccp_alpha=0.1, max_depth=10, max_features=10; total time= 0.0s\n", "[CV] END .......ccp_alpha=0.1, max_depth=10, max_features=10; total time= 0.0s\n", "[CV] END .......ccp_alpha=0.1, max_depth=10, max_features=10; total time= 0.0s\n", "[CV] END .......ccp_alpha=0.1, max_depth=10, max_features=10; total time= 0.0s\n", "[CV] END .......ccp_alpha=0.1, max_depth=10, max_features=10; total time= 0.0s\n", "[CV] END ......ccp_alpha=0.001, max_depth=20, max_features=5; total time= 0.0s\n", "[CV] END ......ccp_alpha=0.001, max_depth=20, max_features=5; total time= 0.0s\n", "[CV] END ......ccp_alpha=0.001, max_depth=20, max_features=5; total time= 0.0s\n", "[CV] END ......ccp_alpha=0.001, max_depth=20, max_features=5; total time= 0.0s\n", "[CV] END ......ccp_alpha=0.001, max_depth=20, max_features=5; total time= 0.0s\n", "[CV] END .......ccp_alpha=0.05, max_depth=5, max_features=20; total time= 0.0s\n", "[CV] END .......ccp_alpha=0.05, max_depth=5, max_features=20; total time= 0.0s\n", "[CV] END .......ccp_alpha=0.05, max_depth=5, max_features=20; total time= 0.0s\n", "[CV] END .......ccp_alpha=0.05, max_depth=5, max_features=20; total time= 0.0s\n", "[CV] END .......ccp_alpha=0.05, max_depth=5, max_features=20; total time= 0.0s\n", "[CV] END ......ccp_alpha=0.001, max_depth=10, max_features=5; total time= 0.0s\n", "[CV] END ......ccp_alpha=0.001, max_depth=10, max_features=5; total time= 0.0s\n", "[CV] END ......ccp_alpha=0.001, max_depth=10, max_features=5; total time= 0.0s\n", "[CV] END ......ccp_alpha=0.001, max_depth=10, max_features=5; total time= 0.0s\n", "[CV] END ......ccp_alpha=0.001, max_depth=10, max_features=5; total time= 0.0s\n", "[CV] END ......ccp_alpha=0.05, max_depth=15, max_features=10; total time= 0.0s\n", "[CV] END ......ccp_alpha=0.05, max_depth=15, max_features=10; total time= 0.0s\n", "[CV] END ......ccp_alpha=0.05, max_depth=15, max_features=10; total time= 0.0s\n", "[CV] END ......ccp_alpha=0.05, max_depth=15, max_features=10; total time= 0.0s\n", "[CV] END ......ccp_alpha=0.05, max_depth=15, max_features=10; total time= 0.0s\n" ] }, { "output_type": "execute_result", "data": { "text/plain": [ "RandomizedSearchCV(cv=5, estimator=DecisionTreeRegressor(),\n", " param_distributions={'ccp_alpha': [0.001, 0.05, 0.1],\n", " 'max_depth': [5, 10, 15, 20],\n", " 'max_features': [5, 10, 15, 20]},\n", " random_state=42, scoring='neg_mean_squared_error',\n", " verbose=2)" ], "text/html": [ "
RandomizedSearchCV(cv=5, estimator=DecisionTreeRegressor(),\n",
              "                   param_distributions={'ccp_alpha': [0.001, 0.05, 0.1],\n",
              "                                        'max_depth': [5, 10, 15, 20],\n",
              "                                        'max_features': [5, 10, 15, 20]},\n",
              "                   random_state=42, scoring='neg_mean_squared_error',\n",
              "                   verbose=2)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" ] }, "metadata": {}, "execution_count": 45 } ] }, { "cell_type": "code", "source": [ "decision_random.best_params_" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "fvjHGfBpZm97", "outputId": "56a6aa86-b966-4213-f794-16f9c5316032" }, "execution_count": 46, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "{'max_features': 20, 'max_depth': 15, 'ccp_alpha': 0.1}" ] }, "metadata": {}, "execution_count": 46 } ] }, { "cell_type": "code", "source": [ "dtr_best=DecisionTreeRegressor(ccp_alpha= 0.1, max_depth= 15, max_features= 20)\n", "dtr_best.fit(X_train,y_train)" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 75 }, "id": "HGK59xvSZd2w", "outputId": "4d41ac5d-aaae-4b43-8798-eb67cab1d724" }, "execution_count": 47, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "DecisionTreeRegressor(ccp_alpha=0.1, max_depth=15, max_features=20)" ], "text/html": [ "
DecisionTreeRegressor(ccp_alpha=0.1, max_depth=15, max_features=20)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" ] }, "metadata": {}, "execution_count": 47 } ] }, { "cell_type": "code", "source": [ "y_dtr_prediction = dtr_best.predict(X_test)" ], "metadata": { "id": "LMkVlcOraAFc" }, "execution_count": 48, "outputs": [] }, { "cell_type": "code", "source": [ "metrics.r2_score(y_test, y_dtr_prediction)" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "uLFeksmRaLY3", "outputId": "6c2d1ad2-c433-4198-9f41-026d52731e10" }, "execution_count": 49, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "0.7403542706530073" ] }, "metadata": {}, "execution_count": 49 } ] }, { "cell_type": "markdown", "source": [ "## Random Forest" ], "metadata": { "id": "MDJcd_IxDhnA" } }, { "cell_type": "code", "source": [ "from sklearn.ensemble import RandomForestRegressor\n", "reg_rf = RandomForestRegressor()\n", "reg_rf.fit(X_train, y_train)" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 75 }, "id": "VTsc0_i05MZe", "outputId": "2ea944d6-8832-4717-a964-c919254443d8" }, "execution_count": 50, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "RandomForestRegressor()" ], "text/html": [ "
RandomForestRegressor()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" ] }, "metadata": {}, "execution_count": 50 } ] }, { "cell_type": "code", "source": [ "y_pred = reg_rf.predict(X_test)" ], "metadata": { "id": "-lPqg4dR5NCc" }, "execution_count": 51, "outputs": [] }, { "cell_type": "code", "source": [ "reg_rf.score(X_train, y_train)" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "M0UVMbNL502p", "outputId": "c9269ebd-a4a0-497f-99c6-5cb4c24c8891" }, "execution_count": 52, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "0.9532995307064535" ] }, "metadata": {}, "execution_count": 52 } ] }, { "cell_type": "code", "source": [ "reg_rf.score(X_test,y_test)" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "p1avttHX54FI", "outputId": "63a2d9c5-4b17-4d5a-a109-94df584f6509" }, "execution_count": 53, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "0.8117505754702412" ] }, "metadata": {}, "execution_count": 53 } ] }, { "cell_type": "code", "source": [ "sns.distplot(y_test-y_pred)\n", "plt.show()" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 645 }, "id": "_o5FcrE857VH", "outputId": "5fccd7ec-ab9f-4a02-dcae-effb4ec81115" }, "execution_count": 54, "outputs": [ { "output_type": "stream", "name": "stderr", "text": [ ":1: UserWarning: \n", "\n", "`distplot` is a deprecated function and will be removed in seaborn v0.14.0.\n", "\n", "Please adapt your code to use either `displot` (a figure-level function with\n", "similar flexibility) or `histplot` (an axes-level function for histograms).\n", "\n", "For a guide to updating your code to use the new functions, please see\n", "https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751\n", "\n", " sns.distplot(y_test-y_pred)\n" ] }, { "output_type": "display_data", "data": { "text/plain": [ "
" ], "image/png": "\n" }, "metadata": {} } ] }, { "cell_type": "code", "source": [ "plt.scatter(y_test, y_pred, alpha = 0.5)\n", "plt.xlabel(\"y_test\")\n", "plt.ylabel(\"y_pred\")\n", "plt.show()" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 449 }, "id": "6fP7XmL96VCE", "outputId": "c2e76b5a-5bc5-4d61-f5a5-4f3d82192f5f" }, "execution_count": 55, "outputs": [ { "output_type": "display_data", "data": { "text/plain": [ "
" ], "image/png": "\n" }, "metadata": {} } ] }, { "cell_type": "code", "source": [ "from sklearn import metrics" ], "metadata": { "id": "Bp9P9cSG6YR_" }, "execution_count": 56, "outputs": [] }, { "cell_type": "code", "source": [ "print('Mean Absolute ERROR:', metrics.mean_absolute_error(y_test, y_pred))\n", "print('Mean Square Error:', metrics.mean_squared_error(y_test, y_pred))\n", "print('RMSE:', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "OV6YUFWO6bW5", "outputId": "580d6634-7b3e-4b95-b623-60888ba78290" }, "execution_count": 57, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Mean Absolute ERROR: 1187.2347040835286\n", "Mean Square Error: 3925052.529427591\n", "RMSE: 1981.1745328031022\n" ] } ] }, { "cell_type": "code", "source": [ "2090.5509/(max(y)-min(y))" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "dRgFx0UN6c95", "outputId": "6d5d22b9-2530-409d-ec0d-54161598924e" }, "execution_count": 58, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "0.026887077025966846" ] }, "metadata": {}, "execution_count": 58 } ] }, { "cell_type": "code", "source": [ "metrics.r2_score(y_test, y_pred)" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "RpKnNeSA6i1z", "outputId": "bf1c9bdb-0781-468e-bb80-a4a433c9d15e" }, "execution_count": 59, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "0.8117505754702412" ] }, "metadata": {}, "execution_count": 59 } ] }, { "cell_type": "markdown", "source": [ "### HyperParamter Tuning" ], "metadata": { "id": "xH84BzBk6phJ" } }, { "cell_type": "code", "source": [ "from sklearn.model_selection import RandomizedSearchCV" ], "metadata": { "id": "7Uf8x4lz6krL" }, "execution_count": 60, "outputs": [] }, { "cell_type": "code", "source": [ "n_estimators = [int(x) for x in np.linspace(start = 100, stop = 1200, num = 12)]\n", "# Number of features to consider at every split\n", "max_features = ['auto', 'sqrt']\n", "# Maximum number of levels in tree\n", "max_depth = [int(x) for x in np.linspace(5, 30, num = 6)]\n", "# Minimum number of samples required to split a node\n", "min_samples_split = [2, 5, 10, 15, 100]\n", "# Minimum number of samples required at each leaf node\n", "min_samples_leaf = [1, 2, 5, 10]" ], "metadata": { "id": "6qYNhXUp6wMa" }, "execution_count": 61, "outputs": [] }, { "cell_type": "code", "source": [ "random_grid = {'n_estimators': n_estimators,\n", " 'max_features': max_features,\n", " 'max_depth': max_depth,\n", " 'min_samples_split': min_samples_split,\n", " 'min_samples_leaf': min_samples_leaf}" ], "metadata": { "id": "8Mz6Z5dO6ybX" }, "execution_count": 62, "outputs": [] }, { "cell_type": "code", "source": [ "rf_random = RandomizedSearchCV(estimator = reg_rf, param_distributions = random_grid,scoring='neg_mean_squared_error', n_iter = 10, cv = 5, verbose=2, random_state=42)" ], "metadata": { "id": "XR05V5q660jC" }, "execution_count": 63, "outputs": [] }, { "cell_type": "code", "source": [ "rf_random.fit(X_train,y_train)" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 1000 }, "id": "mZ7J3sct6-uq", "outputId": "40ea3ced-9a32-4cd6-d47f-57954e598db6" }, "execution_count": 64, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Fitting 5 folds for each of 10 candidates, totalling 50 fits\n", "[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=5, min_samples_split=5, n_estimators=900; total time= 4.2s\n", "[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=5, min_samples_split=5, n_estimators=900; total time= 4.1s\n", "[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=5, min_samples_split=5, n_estimators=900; total time= 5.7s\n", "[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=5, min_samples_split=5, n_estimators=900; total time= 4.3s\n", "[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=5, min_samples_split=5, n_estimators=900; total time= 4.2s\n", "[CV] END max_depth=15, max_features=sqrt, min_samples_leaf=2, min_samples_split=10, n_estimators=1100; total time= 7.8s\n", "[CV] END max_depth=15, max_features=sqrt, min_samples_leaf=2, min_samples_split=10, n_estimators=1100; total time= 7.0s\n", "[CV] END max_depth=15, max_features=sqrt, min_samples_leaf=2, min_samples_split=10, n_estimators=1100; total time= 9.4s\n", "[CV] END max_depth=15, max_features=sqrt, min_samples_leaf=2, min_samples_split=10, n_estimators=1100; total time= 6.4s\n", "[CV] END max_depth=15, max_features=sqrt, min_samples_leaf=2, min_samples_split=10, n_estimators=1100; total time= 7.9s\n" ] }, { "output_type": "stream", "name": "stderr", "text": [ "/usr/local/lib/python3.10/dist-packages/sklearn/ensemble/_forest.py:413: FutureWarning: `max_features='auto'` has been deprecated in 1.1 and will be removed in 1.3. To keep the past behaviour, explicitly set `max_features=1.0` or remove this parameter as it is also the default value for RandomForestRegressors and ExtraTreesRegressors.\n", " warn(\n" ] }, { "output_type": "stream", "name": "stdout", "text": [ "[CV] END max_depth=15, max_features=auto, min_samples_leaf=5, min_samples_split=100, n_estimators=300; total time= 3.7s\n" ] }, { "output_type": "stream", "name": "stderr", "text": [ "/usr/local/lib/python3.10/dist-packages/sklearn/ensemble/_forest.py:413: FutureWarning: `max_features='auto'` has been deprecated in 1.1 and will be removed in 1.3. To keep the past behaviour, explicitly set `max_features=1.0` or remove this parameter as it is also the default value for RandomForestRegressors and ExtraTreesRegressors.\n", " warn(\n" ] }, { "output_type": "stream", "name": "stdout", "text": [ "[CV] END max_depth=15, max_features=auto, min_samples_leaf=5, min_samples_split=100, n_estimators=300; total time= 4.4s\n" ] }, { "output_type": "stream", "name": "stderr", "text": [ "/usr/local/lib/python3.10/dist-packages/sklearn/ensemble/_forest.py:413: FutureWarning: `max_features='auto'` has been deprecated in 1.1 and will be removed in 1.3. To keep the past behaviour, explicitly set `max_features=1.0` or remove this parameter as it is also the default value for RandomForestRegressors and ExtraTreesRegressors.\n", " warn(\n" ] }, { "output_type": "stream", "name": "stdout", "text": [ "[CV] END max_depth=15, max_features=auto, min_samples_leaf=5, min_samples_split=100, n_estimators=300; total time= 4.8s\n" ] }, { "output_type": "stream", "name": "stderr", "text": [ "/usr/local/lib/python3.10/dist-packages/sklearn/ensemble/_forest.py:413: FutureWarning: `max_features='auto'` has been deprecated in 1.1 and will be removed in 1.3. To keep the past behaviour, explicitly set `max_features=1.0` or remove this parameter as it is also the default value for RandomForestRegressors and ExtraTreesRegressors.\n", " warn(\n" ] }, { "output_type": "stream", "name": "stdout", "text": [ "[CV] END max_depth=15, max_features=auto, min_samples_leaf=5, min_samples_split=100, n_estimators=300; total time= 3.9s\n" ] }, { "output_type": "stream", "name": "stderr", "text": [ "/usr/local/lib/python3.10/dist-packages/sklearn/ensemble/_forest.py:413: FutureWarning: `max_features='auto'` has been deprecated in 1.1 and will be removed in 1.3. To keep the past behaviour, explicitly set `max_features=1.0` or remove this parameter as it is also the default value for RandomForestRegressors and ExtraTreesRegressors.\n", " warn(\n" ] }, { "output_type": "stream", "name": "stdout", "text": [ "[CV] END max_depth=15, max_features=auto, min_samples_leaf=5, min_samples_split=100, n_estimators=300; total time= 3.9s\n" ] }, { "output_type": "stream", "name": "stderr", "text": [ "/usr/local/lib/python3.10/dist-packages/sklearn/ensemble/_forest.py:413: FutureWarning: `max_features='auto'` has been deprecated in 1.1 and will be removed in 1.3. To keep the past behaviour, explicitly set `max_features=1.0` or remove this parameter as it is also the default value for RandomForestRegressors and ExtraTreesRegressors.\n", " warn(\n" ] }, { "output_type": "stream", "name": "stdout", "text": [ "[CV] END max_depth=15, max_features=auto, min_samples_leaf=5, min_samples_split=5, n_estimators=400; total time= 8.3s\n" ] }, { "output_type": "stream", "name": "stderr", "text": [ "/usr/local/lib/python3.10/dist-packages/sklearn/ensemble/_forest.py:413: FutureWarning: `max_features='auto'` has been deprecated in 1.1 and will be removed in 1.3. To keep the past behaviour, explicitly set `max_features=1.0` or remove this parameter as it is also the default value for RandomForestRegressors and ExtraTreesRegressors.\n", " warn(\n" ] }, { "output_type": "stream", "name": "stdout", "text": [ "[CV] END max_depth=15, max_features=auto, min_samples_leaf=5, min_samples_split=5, n_estimators=400; total time= 7.3s\n" ] }, { "output_type": "stream", "name": "stderr", "text": [ "/usr/local/lib/python3.10/dist-packages/sklearn/ensemble/_forest.py:413: FutureWarning: `max_features='auto'` has been deprecated in 1.1 and will be removed in 1.3. To keep the past behaviour, explicitly set `max_features=1.0` or remove this parameter as it is also the default value for RandomForestRegressors and ExtraTreesRegressors.\n", " warn(\n" ] }, { "output_type": "stream", "name": "stdout", "text": [ "[CV] END max_depth=15, max_features=auto, min_samples_leaf=5, min_samples_split=5, n_estimators=400; total time= 8.1s\n" ] }, { "output_type": "stream", "name": "stderr", "text": [ "/usr/local/lib/python3.10/dist-packages/sklearn/ensemble/_forest.py:413: FutureWarning: `max_features='auto'` has been deprecated in 1.1 and will be removed in 1.3. To keep the past behaviour, explicitly set `max_features=1.0` or remove this parameter as it is also the default value for RandomForestRegressors and ExtraTreesRegressors.\n", " warn(\n" ] }, { "output_type": "stream", "name": "stdout", "text": [ "[CV] END max_depth=15, max_features=auto, min_samples_leaf=5, min_samples_split=5, n_estimators=400; total time= 7.8s\n" ] }, { "output_type": "stream", "name": "stderr", "text": [ "/usr/local/lib/python3.10/dist-packages/sklearn/ensemble/_forest.py:413: FutureWarning: `max_features='auto'` has been deprecated in 1.1 and will be removed in 1.3. To keep the past behaviour, explicitly set `max_features=1.0` or remove this parameter as it is also the default value for RandomForestRegressors and ExtraTreesRegressors.\n", " warn(\n" ] }, { "output_type": "stream", "name": "stdout", "text": [ "[CV] END max_depth=15, max_features=auto, min_samples_leaf=5, min_samples_split=5, n_estimators=400; total time= 7.7s\n" ] }, { "output_type": "stream", "name": "stderr", "text": [ "/usr/local/lib/python3.10/dist-packages/sklearn/ensemble/_forest.py:413: FutureWarning: `max_features='auto'` has been deprecated in 1.1 and will be removed in 1.3. To keep the past behaviour, explicitly set `max_features=1.0` or remove this parameter as it is also the default value for RandomForestRegressors and ExtraTreesRegressors.\n", " warn(\n" ] }, { "output_type": "stream", "name": "stdout", "text": [ "[CV] END max_depth=20, max_features=auto, min_samples_leaf=10, min_samples_split=5, n_estimators=700; total time= 12.1s\n" ] }, { "output_type": "stream", "name": "stderr", "text": [ "/usr/local/lib/python3.10/dist-packages/sklearn/ensemble/_forest.py:413: FutureWarning: `max_features='auto'` has been deprecated in 1.1 and will be removed in 1.3. To keep the past behaviour, explicitly set `max_features=1.0` or remove this parameter as it is also the default value for RandomForestRegressors and ExtraTreesRegressors.\n", " warn(\n" ] }, { "output_type": "stream", "name": "stdout", "text": [ "[CV] END max_depth=20, max_features=auto, min_samples_leaf=10, min_samples_split=5, n_estimators=700; total time= 12.2s\n" ] }, { "output_type": "stream", "name": "stderr", "text": [ "/usr/local/lib/python3.10/dist-packages/sklearn/ensemble/_forest.py:413: FutureWarning: `max_features='auto'` has been deprecated in 1.1 and will be removed in 1.3. To keep the past behaviour, explicitly set `max_features=1.0` or remove this parameter as it is also the default value for RandomForestRegressors and ExtraTreesRegressors.\n", " warn(\n" ] }, { "output_type": "stream", "name": "stdout", "text": [ "[CV] END max_depth=20, max_features=auto, min_samples_leaf=10, min_samples_split=5, n_estimators=700; total time= 11.8s\n" ] }, { "output_type": "stream", "name": "stderr", "text": [ "/usr/local/lib/python3.10/dist-packages/sklearn/ensemble/_forest.py:413: FutureWarning: `max_features='auto'` has been deprecated in 1.1 and will be removed in 1.3. To keep the past behaviour, explicitly set `max_features=1.0` or remove this parameter as it is also the default value for RandomForestRegressors and ExtraTreesRegressors.\n", " warn(\n" ] }, { "output_type": "stream", "name": "stdout", "text": [ "[CV] END max_depth=20, max_features=auto, min_samples_leaf=10, min_samples_split=5, n_estimators=700; total time= 11.3s\n" ] }, { "output_type": "stream", "name": "stderr", "text": [ "/usr/local/lib/python3.10/dist-packages/sklearn/ensemble/_forest.py:413: FutureWarning: `max_features='auto'` has been deprecated in 1.1 and will be removed in 1.3. To keep the past behaviour, explicitly set `max_features=1.0` or remove this parameter as it is also the default value for RandomForestRegressors and ExtraTreesRegressors.\n", " warn(\n" ] }, { "output_type": "stream", "name": "stdout", "text": [ "[CV] END max_depth=20, max_features=auto, min_samples_leaf=10, min_samples_split=5, n_estimators=700; total time= 12.3s\n", "[CV] END max_depth=25, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=1000; total time= 11.4s\n", "[CV] END max_depth=25, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=1000; total time= 13.1s\n", "[CV] END max_depth=25, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=1000; total time= 11.1s\n", "[CV] END max_depth=25, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=1000; total time= 9.9s\n", "[CV] END max_depth=25, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=1000; total time= 11.0s\n", "[CV] END max_depth=5, max_features=sqrt, min_samples_leaf=10, min_samples_split=15, n_estimators=1100; total time= 3.5s\n", "[CV] END max_depth=5, max_features=sqrt, min_samples_leaf=10, min_samples_split=15, n_estimators=1100; total time= 4.8s\n", "[CV] END max_depth=5, max_features=sqrt, min_samples_leaf=10, min_samples_split=15, n_estimators=1100; total time= 3.4s\n", "[CV] END max_depth=5, max_features=sqrt, min_samples_leaf=10, min_samples_split=15, n_estimators=1100; total time= 3.5s\n", "[CV] END max_depth=5, max_features=sqrt, min_samples_leaf=10, min_samples_split=15, n_estimators=1100; total time= 4.0s\n", "[CV] END max_depth=15, max_features=sqrt, min_samples_leaf=1, min_samples_split=15, n_estimators=300; total time= 2.7s\n", "[CV] END max_depth=15, max_features=sqrt, min_samples_leaf=1, min_samples_split=15, n_estimators=300; total time= 1.7s\n", "[CV] END max_depth=15, max_features=sqrt, min_samples_leaf=1, min_samples_split=15, n_estimators=300; total time= 1.6s\n", "[CV] END max_depth=15, max_features=sqrt, min_samples_leaf=1, min_samples_split=15, n_estimators=300; total time= 1.6s\n", "[CV] END max_depth=15, max_features=sqrt, min_samples_leaf=1, min_samples_split=15, n_estimators=300; total time= 1.7s\n", "[CV] END max_depth=5, max_features=sqrt, min_samples_leaf=2, min_samples_split=10, n_estimators=700; total time= 2.1s\n", "[CV] END max_depth=5, max_features=sqrt, min_samples_leaf=2, min_samples_split=10, n_estimators=700; total time= 2.7s\n", "[CV] END max_depth=5, max_features=sqrt, min_samples_leaf=2, min_samples_split=10, n_estimators=700; total time= 3.3s\n", "[CV] END max_depth=5, max_features=sqrt, min_samples_leaf=2, min_samples_split=10, n_estimators=700; total time= 2.1s\n", "[CV] END max_depth=5, max_features=sqrt, min_samples_leaf=2, min_samples_split=10, n_estimators=700; total time= 2.3s\n" ] }, { "output_type": "stream", "name": "stderr", "text": [ "/usr/local/lib/python3.10/dist-packages/sklearn/ensemble/_forest.py:413: FutureWarning: `max_features='auto'` has been deprecated in 1.1 and will be removed in 1.3. To keep the past behaviour, explicitly set `max_features=1.0` or remove this parameter as it is also the default value for RandomForestRegressors and ExtraTreesRegressors.\n", " warn(\n" ] }, { "output_type": "stream", "name": "stdout", "text": [ "[CV] END max_depth=20, max_features=auto, min_samples_leaf=1, min_samples_split=15, n_estimators=700; total time= 14.5s\n" ] }, { "output_type": "stream", "name": "stderr", "text": [ "/usr/local/lib/python3.10/dist-packages/sklearn/ensemble/_forest.py:413: FutureWarning: `max_features='auto'` has been deprecated in 1.1 and will be removed in 1.3. To keep the past behaviour, explicitly set `max_features=1.0` or remove this parameter as it is also the default value for RandomForestRegressors and ExtraTreesRegressors.\n", " warn(\n" ] }, { "output_type": "stream", "name": "stdout", "text": [ "[CV] END max_depth=20, max_features=auto, min_samples_leaf=1, min_samples_split=15, n_estimators=700; total time= 15.0s\n" ] }, { "output_type": "stream", "name": "stderr", "text": [ "/usr/local/lib/python3.10/dist-packages/sklearn/ensemble/_forest.py:413: FutureWarning: `max_features='auto'` has been deprecated in 1.1 and will be removed in 1.3. To keep the past behaviour, explicitly set `max_features=1.0` or remove this parameter as it is also the default value for RandomForestRegressors and ExtraTreesRegressors.\n", " warn(\n" ] }, { "output_type": "stream", "name": "stdout", "text": [ "[CV] END max_depth=20, max_features=auto, min_samples_leaf=1, min_samples_split=15, n_estimators=700; total time= 14.5s\n" ] }, { "output_type": "stream", "name": "stderr", "text": [ "/usr/local/lib/python3.10/dist-packages/sklearn/ensemble/_forest.py:413: FutureWarning: `max_features='auto'` has been deprecated in 1.1 and will be removed in 1.3. To keep the past behaviour, explicitly set `max_features=1.0` or remove this parameter as it is also the default value for RandomForestRegressors and ExtraTreesRegressors.\n", " warn(\n" ] }, { "output_type": "stream", "name": "stdout", "text": [ "[CV] END max_depth=20, max_features=auto, min_samples_leaf=1, min_samples_split=15, n_estimators=700; total time= 14.6s\n" ] }, { "output_type": "stream", "name": "stderr", "text": [ "/usr/local/lib/python3.10/dist-packages/sklearn/ensemble/_forest.py:413: FutureWarning: `max_features='auto'` has been deprecated in 1.1 and will be removed in 1.3. To keep the past behaviour, explicitly set `max_features=1.0` or remove this parameter as it is also the default value for RandomForestRegressors and ExtraTreesRegressors.\n", " warn(\n" ] }, { "output_type": "stream", "name": "stdout", "text": [ "[CV] END max_depth=20, max_features=auto, min_samples_leaf=1, min_samples_split=15, n_estimators=700; total time= 15.3s\n" ] }, { "output_type": "stream", "name": "stderr", "text": [ "/usr/local/lib/python3.10/dist-packages/sklearn/ensemble/_forest.py:413: FutureWarning: `max_features='auto'` has been deprecated in 1.1 and will be removed in 1.3. To keep the past behaviour, explicitly set `max_features=1.0` or remove this parameter as it is also the default value for RandomForestRegressors and ExtraTreesRegressors.\n", " warn(\n" ] }, { "output_type": "execute_result", "data": { "text/plain": [ "RandomizedSearchCV(cv=5, estimator=RandomForestRegressor(),\n", " param_distributions={'max_depth': [5, 10, 15, 20, 25, 30],\n", " 'max_features': ['auto', 'sqrt'],\n", " 'min_samples_leaf': [1, 2, 5, 10],\n", " 'min_samples_split': [2, 5, 10, 15,\n", " 100],\n", " 'n_estimators': [100, 200, 300, 400,\n", " 500, 600, 700, 800,\n", " 900, 1000, 1100,\n", " 1200]},\n", " random_state=42, scoring='neg_mean_squared_error',\n", " verbose=2)" ], "text/html": [ "
RandomizedSearchCV(cv=5, estimator=RandomForestRegressor(),\n",
              "                   param_distributions={'max_depth': [5, 10, 15, 20, 25, 30],\n",
              "                                        'max_features': ['auto', 'sqrt'],\n",
              "                                        'min_samples_leaf': [1, 2, 5, 10],\n",
              "                                        'min_samples_split': [2, 5, 10, 15,\n",
              "                                                              100],\n",
              "                                        'n_estimators': [100, 200, 300, 400,\n",
              "                                                         500, 600, 700, 800,\n",
              "                                                         900, 1000, 1100,\n",
              "                                                         1200]},\n",
              "                   random_state=42, scoring='neg_mean_squared_error',\n",
              "                   verbose=2)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" ] }, "metadata": {}, "execution_count": 64 } ] }, { "cell_type": "code", "source": [ "rf_random.best_params_" ], "metadata": { "id": "tA_LTkOf7AYr", "colab": { "base_uri": "https://localhost:8080/" }, "outputId": "b9df903a-8725-4d3b-9fb3-939c28d7f006" }, "execution_count": 65, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "{'n_estimators': 700,\n", " 'min_samples_split': 15,\n", " 'min_samples_leaf': 1,\n", " 'max_features': 'auto',\n", " 'max_depth': 20}" ] }, "metadata": {}, "execution_count": 65 } ] }, { "cell_type": "code", "source": [ "rf_best=RandomForestRegressor(max_depth= 20,\n", " max_features= 'auto',\n", " min_samples_leaf= 1,\n", " min_samples_split= 15,\n", " n_estimators= 700)" ], "metadata": { "id": "K4NvR2aJB7EZ" }, "execution_count": 66, "outputs": [] }, { "cell_type": "code", "source": [ "rf_best.fit(X_train,y_train)" ], "metadata": { "id": "CAnNo1BJCoxn", "colab": { "base_uri": "https://localhost:8080/", "height": 148 }, "outputId": "465a69c8-8841-49db-ea4a-9468620ca788" }, "execution_count": 67, "outputs": [ { "output_type": "stream", "name": "stderr", "text": [ "/usr/local/lib/python3.10/dist-packages/sklearn/ensemble/_forest.py:413: FutureWarning: `max_features='auto'` has been deprecated in 1.1 and will be removed in 1.3. To keep the past behaviour, explicitly set `max_features=1.0` or remove this parameter as it is also the default value for RandomForestRegressors and ExtraTreesRegressors.\n", " warn(\n" ] }, { "output_type": "execute_result", "data": { "text/plain": [ "RandomForestRegressor(max_depth=20, max_features='auto', min_samples_split=15,\n", " n_estimators=700)" ], "text/html": [ "
RandomForestRegressor(max_depth=20, max_features='auto', min_samples_split=15,\n",
              "                      n_estimators=700)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" ] }, "metadata": {}, "execution_count": 67 } ] }, { "cell_type": "code", "source": [ "y_prediction = rf_best.predict(X_test)" ], "metadata": { "id": "65JFsMZVDGxL" }, "execution_count": 68, "outputs": [] }, { "cell_type": "code", "source": [ "metrics.r2_score(y_test, y_prediction)" ], "metadata": { "id": "6FtJipk1DKSN", "colab": { "base_uri": "https://localhost:8080/" }, "outputId": "d79b2e83-886a-456a-8500-79044b75b609" }, "execution_count": 69, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "0.8319550259102328" ] }, "metadata": {}, "execution_count": 69 } ] }, { "cell_type": "markdown", "source": [ "## XGboost" ], "metadata": { "id": "CTmS2BEaD4YF" } }, { "cell_type": "code", "source": [ "import xgboost as xg" ], "metadata": { "id": "YSNQmzyRDRPP" }, "execution_count": 70, "outputs": [] }, { "cell_type": "code", "source": [ "xgb_r = xg.XGBRegressor(n_estimators = 10, seed = 123)" ], "metadata": { "id": "xYY231qMEJdN" }, "execution_count": 71, "outputs": [] }, { "cell_type": "code", "source": [ "xgb_r.fit(X_train, y_train)" ], "metadata": { "id": "Vrz9CGfsEmA7", "colab": { "base_uri": "https://localhost:8080/", "height": 248 }, "outputId": "d34cdf86-7548-4a09-9f1c-dffd2c6357f9" }, "execution_count": 72, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "XGBRegressor(base_score=None, booster=None, callbacks=None,\n", " colsample_bylevel=None, colsample_bynode=None,\n", " colsample_bytree=None, early_stopping_rounds=None,\n", " enable_categorical=False, eval_metric=None, feature_types=None,\n", " gamma=None, gpu_id=None, grow_policy=None, importance_type=None,\n", " interaction_constraints=None, learning_rate=None, max_bin=None,\n", " max_cat_threshold=None, max_cat_to_onehot=None,\n", " max_delta_step=None, max_depth=None, max_leaves=None,\n", " min_child_weight=None, missing=nan, monotone_constraints=None,\n", " n_estimators=10, n_jobs=None, num_parallel_tree=None,\n", " predictor=None, random_state=None, ...)" ], "text/html": [ "
XGBRegressor(base_score=None, booster=None, callbacks=None,\n",
              "             colsample_bylevel=None, colsample_bynode=None,\n",
              "             colsample_bytree=None, early_stopping_rounds=None,\n",
              "             enable_categorical=False, eval_metric=None, feature_types=None,\n",
              "             gamma=None, gpu_id=None, grow_policy=None, importance_type=None,\n",
              "             interaction_constraints=None, learning_rate=None, max_bin=None,\n",
              "             max_cat_threshold=None, max_cat_to_onehot=None,\n",
              "             max_delta_step=None, max_depth=None, max_leaves=None,\n",
              "             min_child_weight=None, missing=nan, monotone_constraints=None,\n",
              "             n_estimators=10, n_jobs=None, num_parallel_tree=None,\n",
              "             predictor=None, random_state=None, ...)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" ] }, "metadata": {}, "execution_count": 72 } ] }, { "cell_type": "code", "source": [ "pred_xgb=xgb_r.predict(X_test)" ], "metadata": { "id": "oKvbjzC4Exf2" }, "execution_count": 73, "outputs": [] }, { "cell_type": "code", "source": [ "sns.distplot(y_test-pred_xgb)\n", "plt.show()" ], "metadata": { "id": "FCPBEUVZkzbW", "colab": { "base_uri": "https://localhost:8080/", "height": 645 }, "outputId": "4723bd9c-546d-4571-fcd1-e23c0d33ca1f" }, "execution_count": 74, "outputs": [ { "output_type": "stream", "name": "stderr", "text": [ ":1: UserWarning: \n", "\n", "`distplot` is a deprecated function and will be removed in seaborn v0.14.0.\n", "\n", "Please adapt your code to use either `displot` (a figure-level function with\n", "similar flexibility) or `histplot` (an axes-level function for histograms).\n", "\n", "For a guide to updating your code to use the new functions, please see\n", "https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751\n", "\n", " sns.distplot(y_test-pred_xgb)\n" ] }, { "output_type": "display_data", "data": { "text/plain": [ "
" ], "image/png": "\n" }, "metadata": {} } ] }, { "cell_type": "code", "source": [ "plt.scatter(y_test, pred_xgb, alpha = 0.5)\n", "plt.xlabel(\"y_test\")\n", "plt.ylabel(\"y_pred\")\n", "plt.show()" ], "metadata": { "id": "L-y9SJzhk-gq", "colab": { "base_uri": "https://localhost:8080/", "height": 449 }, "outputId": "3b5bf0ac-97ac-4aca-d6bb-4ccee89fca76" }, "execution_count": 75, "outputs": [ { "output_type": "display_data", "data": { "text/plain": [ "
" ], "image/png": "\n" }, "metadata": {} } ] }, { "cell_type": "code", "source": [ "print('MAE:', metrics.mean_absolute_error(y_test, y_pred))\n", "print('MSE:', metrics.mean_squared_error(y_test, y_pred))\n", "print('RMSE:', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))" ], "metadata": { "id": "A_ZVWOzwEoh1", "colab": { "base_uri": "https://localhost:8080/" }, "outputId": "0d9ea5a0-a715-4ef2-f60d-749fde8a3e04" }, "execution_count": 76, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "MAE: 1187.2347040835286\n", "MSE: 3925052.529427591\n", "RMSE: 1981.1745328031022\n" ] } ] }, { "cell_type": "markdown", "source": [ "### HyperParameter Tuning" ], "metadata": { "id": "Ht9JT7jJE7wS" } }, { "cell_type": "code", "source": [ "max_depth=[5,10,15,20]\n", "n_estimators=[100,500,1000]\n", "n_jobs=[2,5,8,10]\n", "learning_rate=[0.005,0.01,0.05,0.1,1]" ], "metadata": { "id": "7mn6sF38E2E5" }, "execution_count": 77, "outputs": [] }, { "cell_type": "code", "source": [ "XGB_grid = {'max_depth': max_depth,\n", " 'n_estimators': n_estimators,\n", " 'max_depth': max_depth,\n", " 'learning_rate':learning_rate}" ], "metadata": { "id": "iMYYTVyaFrZM" }, "execution_count": 78, "outputs": [] }, { "cell_type": "code", "source": [ "XGB_random = RandomizedSearchCV(estimator = xgb_r, param_distributions = XGB_grid,scoring='neg_mean_squared_error', n_iter = 10, cv = 5, verbose=2, random_state=42)" ], "metadata": { "id": "YMcxKxsOF5EM" }, "execution_count": 79, "outputs": [] }, { "cell_type": "code", "source": [ "XGB_random.fit(X_train,y_train)" ], "metadata": { "id": "ooX8BQ0DGG2v", "colab": { "base_uri": "https://localhost:8080/", "height": 1000 }, "outputId": "5b6ea824-ce25-4a12-c3cb-bb7579284e37" }, "execution_count": 80, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Fitting 5 folds for each of 10 candidates, totalling 50 fits\n", "[CV] END .learning_rate=0.005, max_depth=5, n_estimators=100; total time= 0.9s\n", "[CV] END .learning_rate=0.005, max_depth=5, n_estimators=100; total time= 0.9s\n", "[CV] END .learning_rate=0.005, max_depth=5, n_estimators=100; total time= 0.8s\n", "[CV] END .learning_rate=0.005, max_depth=5, n_estimators=100; total time= 0.8s\n", "[CV] END .learning_rate=0.005, max_depth=5, n_estimators=100; total time= 0.8s\n", "[CV] END learning_rate=0.005, max_depth=10, n_estimators=1000; total time= 24.0s\n", "[CV] END learning_rate=0.005, max_depth=10, n_estimators=1000; total time= 26.4s\n", "[CV] END learning_rate=0.005, max_depth=10, n_estimators=1000; total time= 23.7s\n", "[CV] END learning_rate=0.005, max_depth=10, n_estimators=1000; total time= 21.5s\n", "[CV] END learning_rate=0.005, max_depth=10, n_estimators=1000; total time= 23.6s\n", "[CV] END ...learning_rate=0.1, max_depth=5, n_estimators=100; total time= 0.8s\n", "[CV] END ...learning_rate=0.1, max_depth=5, n_estimators=100; total time= 0.8s\n", "[CV] END ...learning_rate=0.1, max_depth=5, n_estimators=100; total time= 0.8s\n", "[CV] END ...learning_rate=0.1, max_depth=5, n_estimators=100; total time= 0.8s\n", "[CV] END ...learning_rate=0.1, max_depth=5, n_estimators=100; total time= 3.6s\n", "[CV] END ..learning_rate=0.1, max_depth=20, n_estimators=100; total time= 4.7s\n", "[CV] END ..learning_rate=0.1, max_depth=20, n_estimators=100; total time= 4.8s\n", "[CV] END ..learning_rate=0.1, max_depth=20, n_estimators=100; total time= 7.3s\n", "[CV] END ..learning_rate=0.1, max_depth=20, n_estimators=100; total time= 4.7s\n", "[CV] END ..learning_rate=0.1, max_depth=20, n_estimators=100; total time= 7.4s\n", "[CV] END ..learning_rate=0.01, max_depth=5, n_estimators=500; total time= 3.9s\n", "[CV] END ..learning_rate=0.01, max_depth=5, n_estimators=500; total time= 6.7s\n", "[CV] END ..learning_rate=0.01, max_depth=5, n_estimators=500; total time= 3.9s\n", "[CV] END ..learning_rate=0.01, max_depth=5, n_estimators=500; total time= 3.9s\n", "[CV] END ..learning_rate=0.01, max_depth=5, n_estimators=500; total time= 6.6s\n", "[CV] END ....learning_rate=1, max_depth=15, n_estimators=100; total time= 1.9s\n", "[CV] END ....learning_rate=1, max_depth=15, n_estimators=100; total time= 1.6s\n", "[CV] END ....learning_rate=1, max_depth=15, n_estimators=100; total time= 1.6s\n", "[CV] END ....learning_rate=1, max_depth=15, n_estimators=100; total time= 2.0s\n", "[CV] END ....learning_rate=1, max_depth=15, n_estimators=100; total time= 4.8s\n", "[CV] END .learning_rate=0.05, max_depth=20, n_estimators=100; total time= 4.0s\n", "[CV] END .learning_rate=0.05, max_depth=20, n_estimators=100; total time= 4.0s\n", "[CV] END .learning_rate=0.05, max_depth=20, n_estimators=100; total time= 6.7s\n", "[CV] END .learning_rate=0.05, max_depth=20, n_estimators=100; total time= 4.0s\n", "[CV] END .learning_rate=0.05, max_depth=20, n_estimators=100; total time= 4.0s\n", "[CV] END .....learning_rate=1, max_depth=5, n_estimators=100; total time= 3.6s\n", "[CV] END .....learning_rate=1, max_depth=5, n_estimators=100; total time= 0.8s\n", "[CV] END .....learning_rate=1, max_depth=5, n_estimators=100; total time= 0.8s\n", "[CV] END .....learning_rate=1, max_depth=5, n_estimators=100; total time= 0.8s\n", "[CV] END .....learning_rate=1, max_depth=5, n_estimators=100; total time= 0.8s\n", "[CV] END ..learning_rate=0.01, max_depth=5, n_estimators=100; total time= 0.8s\n", "[CV] END ..learning_rate=0.01, max_depth=5, n_estimators=100; total time= 0.8s\n", "[CV] END ..learning_rate=0.01, max_depth=5, n_estimators=100; total time= 0.8s\n", "[CV] END ..learning_rate=0.01, max_depth=5, n_estimators=100; total time= 0.8s\n", "[CV] END ..learning_rate=0.01, max_depth=5, n_estimators=100; total time= 0.8s\n", "[CV] END ....learning_rate=1, max_depth=20, n_estimators=100; total time= 1.6s\n", "[CV] END ....learning_rate=1, max_depth=20, n_estimators=100; total time= 4.3s\n", "[CV] END ....learning_rate=1, max_depth=20, n_estimators=100; total time= 1.9s\n", "[CV] END ....learning_rate=1, max_depth=20, n_estimators=100; total time= 1.7s\n", "[CV] END ....learning_rate=1, max_depth=20, n_estimators=100; total time= 2.0s\n" ] }, { "output_type": "execute_result", "data": { "text/plain": [ "RandomizedSearchCV(cv=5,\n", " estimator=XGBRegressor(base_score=None, booster=None,\n", " callbacks=None,\n", " colsample_bylevel=None,\n", " colsample_bynode=None,\n", " colsample_bytree=None,\n", " early_stopping_rounds=None,\n", " enable_categorical=False,\n", " eval_metric=None, feature_types=None,\n", " gamma=None, gpu_id=None,\n", " grow_policy=None,\n", " importance_type=None,\n", " interaction_constraints=None,\n", " learning_rate=...\n", " max_delta_step=None, max_depth=None,\n", " max_leaves=None,\n", " min_child_weight=None, missing=nan,\n", " monotone_constraints=None,\n", " n_estimators=10, n_jobs=None,\n", " num_parallel_tree=None,\n", " predictor=None, random_state=None, ...),\n", " param_distributions={'learning_rate': [0.005, 0.01, 0.05,\n", " 0.1, 1],\n", " 'max_depth': [5, 10, 15, 20],\n", " 'n_estimators': [100, 500, 1000]},\n", " random_state=42, scoring='neg_mean_squared_error',\n", " verbose=2)" ], "text/html": [ "
RandomizedSearchCV(cv=5,\n",
              "                   estimator=XGBRegressor(base_score=None, booster=None,\n",
              "                                          callbacks=None,\n",
              "                                          colsample_bylevel=None,\n",
              "                                          colsample_bynode=None,\n",
              "                                          colsample_bytree=None,\n",
              "                                          early_stopping_rounds=None,\n",
              "                                          enable_categorical=False,\n",
              "                                          eval_metric=None, feature_types=None,\n",
              "                                          gamma=None, gpu_id=None,\n",
              "                                          grow_policy=None,\n",
              "                                          importance_type=None,\n",
              "                                          interaction_constraints=None,\n",
              "                                          learning_rate=...\n",
              "                                          max_delta_step=None, max_depth=None,\n",
              "                                          max_leaves=None,\n",
              "                                          min_child_weight=None, missing=nan,\n",
              "                                          monotone_constraints=None,\n",
              "                                          n_estimators=10, n_jobs=None,\n",
              "                                          num_parallel_tree=None,\n",
              "                                          predictor=None, random_state=None, ...),\n",
              "                   param_distributions={'learning_rate': [0.005, 0.01, 0.05,\n",
              "                                                          0.1, 1],\n",
              "                                        'max_depth': [5, 10, 15, 20],\n",
              "                                        'n_estimators': [100, 500, 1000]},\n",
              "                   random_state=42, scoring='neg_mean_squared_error',\n",
              "                   verbose=2)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" ] }, "metadata": {}, "execution_count": 80 } ] }, { "cell_type": "code", "source": [ "XGB_random.best_params_" ], "metadata": { "id": "yJpKjfXQGeB1", "colab": { "base_uri": "https://localhost:8080/" }, "outputId": "15fb3883-9b9c-48be-961e-659fca50631a" }, "execution_count": 81, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "{'n_estimators': 1000, 'max_depth': 10, 'learning_rate': 0.005}" ] }, "metadata": {}, "execution_count": 81 } ] }, { "cell_type": "code", "source": [ "xgb_best=xg.XGBRegressor(learning_rate= 0.005, max_depth= 10, n_estimators= 1000,state=42,objectvie='reg:squarederror')" ], "metadata": { "id": "-sDneAxnHSxR" }, "execution_count": 82, "outputs": [] }, { "cell_type": "code", "source": [ "xgb_best.fit(X_train,y_train)" ], "metadata": { "id": "WNszDfEgH4Ej", "colab": { "base_uri": "https://localhost:8080/", "height": 302 }, "outputId": "3d1863ea-b66e-4956-d22c-df5943fdeb31" }, "execution_count": 83, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "[07:04:11] WARNING: ../src/learner.cc:767: \n", "Parameters: { \"objectvie\", \"state\" } are not used.\n", "\n" ] }, { "output_type": "execute_result", "data": { "text/plain": [ "XGBRegressor(base_score=None, booster=None, callbacks=None,\n", " colsample_bylevel=None, colsample_bynode=None,\n", " colsample_bytree=None, early_stopping_rounds=None,\n", " enable_categorical=False, eval_metric=None, feature_types=None,\n", " gamma=None, gpu_id=None, grow_policy=None, importance_type=None,\n", " interaction_constraints=None, learning_rate=0.005, max_bin=None,\n", " max_cat_threshold=None, max_cat_to_onehot=None,\n", " max_delta_step=None, max_depth=10, max_leaves=None,\n", " min_child_weight=None, missing=nan, monotone_constraints=None,\n", " n_estimators=1000, n_jobs=None, num_parallel_tree=None,\n", " objectvie='reg:squarederror', predictor=None, ...)" ], "text/html": [ "
XGBRegressor(base_score=None, booster=None, callbacks=None,\n",
              "             colsample_bylevel=None, colsample_bynode=None,\n",
              "             colsample_bytree=None, early_stopping_rounds=None,\n",
              "             enable_categorical=False, eval_metric=None, feature_types=None,\n",
              "             gamma=None, gpu_id=None, grow_policy=None, importance_type=None,\n",
              "             interaction_constraints=None, learning_rate=0.005, max_bin=None,\n",
              "             max_cat_threshold=None, max_cat_to_onehot=None,\n",
              "             max_delta_step=None, max_depth=10, max_leaves=None,\n",
              "             min_child_weight=None, missing=nan, monotone_constraints=None,\n",
              "             n_estimators=1000, n_jobs=None, num_parallel_tree=None,\n",
              "             objectvie='reg:squarederror', predictor=None, ...)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" ] }, "metadata": {}, "execution_count": 83 } ] }, { "cell_type": "code", "source": [ "y_prediction_xgb = xgb_best.predict(X_test)" ], "metadata": { "id": "cxM78lGNHblm" }, "execution_count": 84, "outputs": [] }, { "cell_type": "code", "source": [ "metrics.r2_score(y_test, y_prediction_xgb)" ], "metadata": { "id": "J8tABRW-H0Fq", "colab": { "base_uri": "https://localhost:8080/" }, "outputId": "168c4b22-11a9-4d95-a2ef-ecf8dc571bc3" }, "execution_count": 85, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "0.8401304475145384" ] }, "metadata": {}, "execution_count": 85 } ] }, { "cell_type": "markdown", "source": [ "## Linear Regression" ], "metadata": { "id": "MvI2_OMkWaHg" } }, { "cell_type": "code", "source": [ "from sklearn.linear_model import LinearRegression" ], "metadata": { "id": "xdpGBg1KWIKd" }, "execution_count": 86, "outputs": [] }, { "cell_type": "code", "source": [ "lr=LinearRegression()\n", "lr.fit(X_train,y_train)" ], "metadata": { "id": "xASxHV4lXf83", "colab": { "base_uri": "https://localhost:8080/", "height": 75 }, "outputId": "f7df8f6b-334e-441a-97d2-73d7615228be" }, "execution_count": 87, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "LinearRegression()" ], "text/html": [ "
LinearRegression()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" ] }, "metadata": {}, "execution_count": 87 } ] }, { "cell_type": "code", "source": [ "lr_pred=lr.predict(X_test)" ], "metadata": { "id": "Mux4xF-vXnYF" }, "execution_count": 88, "outputs": [] }, { "cell_type": "code", "source": [ "sns.distplot(lr_pred)\n", "plt.show()" ], "metadata": { "id": "P5N7DZaeiejW", "colab": { "base_uri": "https://localhost:8080/", "height": 626 }, "outputId": "86b09be7-e921-44f1-bfb0-f15e6d268356" }, "execution_count": 89, "outputs": [ { "output_type": "stream", "name": "stderr", "text": [ ":1: UserWarning: \n", "\n", "`distplot` is a deprecated function and will be removed in seaborn v0.14.0.\n", "\n", "Please adapt your code to use either `displot` (a figure-level function with\n", "similar flexibility) or `histplot` (an axes-level function for histograms).\n", "\n", "For a guide to updating your code to use the new functions, please see\n", "https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751\n", "\n", " sns.distplot(lr_pred)\n" ] }, { "output_type": "display_data", "data": { "text/plain": [ "
" ], "image/png": "\n" }, "metadata": {} } ] }, { "cell_type": "code", "source": [ "plt.scatter(y_test,lr_pred, alpha = 0.5)\n", "plt.xlabel(\"y_test\")\n", "plt.ylabel(\"y_pred\")\n", "plt.show()" ], "metadata": { "id": "53lHUweOidoi", "colab": { "base_uri": "https://localhost:8080/", "height": 449 }, "outputId": "836634d7-d151-4018-9714-c842b882137e" }, "execution_count": 90, "outputs": [ { "output_type": "display_data", "data": { "text/plain": [ "
" ], "image/png": "\n" }, "metadata": {} } ] }, { "cell_type": "code", "source": [ "metrics.r2_score(y_test, lr_pred)" ], "metadata": { "id": "yL_Zc2h5XxtJ", "colab": { "base_uri": "https://localhost:8080/" }, "outputId": "846e10c8-19f2-4e9f-d59f-3d0ab7fa5204" }, "execution_count": 91, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "0.5837544362210152" ] }, "metadata": {}, "execution_count": 91 } ] }, { "cell_type": "code", "source": [ "import joblib\n", "joblib.dump(xgb_r, 'flight_price_model.pkl')" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "PjHGoNM9PSM5", "outputId": "7906485d-0ddb-4f00-ae3d-23c2180fbad4" }, "execution_count": 93, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "['flight_price_model.pkl']" ] }, "metadata": {}, "execution_count": 93 } ] } ] }